import importlib import re from bs4 import BeautifulSoup import ofscraper.utils.config.data as data html_parser = "lxml" if importlib.util.find_spec("lxml") else "html.parser" class base: def __init__(self): None def text_trunicate(self, text): text = str(text) if text == None: return "None" if len(text) == 0: return text length = int(data.get_textlength(mediatype="Text")) if length == 0: return text elif data.get_textType(mediatype="Text") == "letter": return f"{''.join(list(text)[:length])}" else: # split and reduce wordarray = list(filter(lambda x: len(x) != 0, re.split("( )", text))) splitArray = wordarray[: length + 1] text = f"{''.join(splitArray)}" text = re.sub(" +$", "", text) return text def file_cleanup(self, text, mediatype=None): text = str(text) text = re.sub('<[^>]*>', "", text) text = re.sub('[\n<>:"/\|?*:;]+', "", text) text = re.sub("-+", "_", text) text = re.sub(" +", " ", text) text = re.sub(" ", data.get_spacereplacer(mediatype=mediatype), text) return text def db_cleanup(self, string): string = re.sub("<[^>]*>", "", string) string = " ".join(string.split()) string = BeautifulSoup(string, html_parser).get_text() return string