"""Text preprocessing pipeline: accents, number expansion, transliteration."""

import re

from num2words import num2words
from ruaccent import RUAccent
from transliterate import translit

from config import BASE


class Accents:
    """Callable wrapper around a loaded ``RUAccent`` accentizer.

    The accentizer is created and loaded once, in the constructor, and is then
    reused for every call.
    """

    def __init__(self):
        self.accentizer = RUAccent()
        # Options handed to ``RUAccent.load``; the working directory lives
        # under the project-level ``BASE`` path.
        load_options = {
            'omograph_model_size': 'turbo',
            'use_dictionary': True,
            'workdir': str(BASE / 'preprocessing_accents'),
        }
        self.accentizer.load(**load_options)

    def __call__(self, text: str) -> str:
        """Return ``text`` as processed by ``RUAccent.process_all``.

        Presumably this places stress marks in Russian text -- confirm against
        the ruaccent documentation.
        """
        return self.accentizer.process_all(text)


# Shared instance: the accentizer is loaded at import time of this module.
preprocess_accents = Accents()


def preprocess_nums(text: str) -> str:
    """Replace every run of digits in ``text`` with its ``num2words`` form.

    Each maximal digit run is converted on its own with ``lang='ru'``, so
    separators between digit groups (for example the dot in ``3.14``) are left
    untouched and the groups on either side are spelled out independently.
    """

    def spell_out(digits_match):
        # num2words receives the matched digits as a string, not an int.
        return num2words(digits_match.group(), lang='ru')

    return re.sub(r'\d+', spell_out, text)


def preprocess_translit(text: str) -> str:
    """Return ``text`` passed through ``translit`` with language code ``'ru'``."""
    return translit(text, 'ru')


def preprocess(text: str) -> str:
    """Run the full pipeline: accents, then numbers, then transliteration.

    NOTE(review): accents are applied first, so words produced later by number
    expansion or transliteration are not passed through the accentizer --
    confirm this ordering is intended.
    """
    text = preprocess_accents(text)
    text = preprocess_nums(text)
    return preprocess_translit(text)