44 lines
1015 B
Python
44 lines
1015 B
Python
import re
|
|
|
|
from num2words import num2words
|
|
from ruaccent import RUAccent
|
|
from transliterate import translit
|
|
|
|
from config import BASE
|
|
|
|
|
|
class Accents:
    """Callable wrapper around a loaded ``RUAccent`` accentizer.

    An instance can be used like a plain ``str -> str`` function: calling it
    forwards the text to the accentizer's ``process_all`` method.
    """

    def __init__(self):
        """Create the accentizer and load its models.

        The accentizer is loaded with the ``'turbo'`` omograph model size,
        dictionary usage enabled, and ``BASE / 'preprocessing_accents'`` as
        its working directory.
        """
        accentizer = RUAccent()
        self.accentizer = accentizer
        # RUAccent is handed the working directory as a plain string.
        models_dir = str(BASE / 'preprocessing_accents')
        accentizer.load(
            omograph_model_size='turbo',
            use_dictionary=True,
            workdir=models_dir,
        )

    def __call__(self, text: str) -> str:
        """Return ``text`` as processed by the accentizer's ``process_all``.

        NOTE(review): presumably the result is the input with stress marks
        added -- confirm against the RUAccent documentation.
        """
        processed = self.accentizer.process_all(text)
        return processed
|
|
|
|
|
|
# Shared Accents instance, constructed once when this module is imported.
# It is callable, so preprocess() uses it like an ordinary function.
preprocess_accents = Accents()
|
|
|
|
|
|
def preprocess_nums(text: str) -> str:
    """Replace every run of digits in ``text`` with words.

    Each match of ``\\d+`` is passed to ``num2words`` with ``lang='ru'`` and
    substituted by whatever it returns; all other characters are left as-is.

    Args:
        text: Input string, possibly containing digit sequences.

    Returns:
        The string with each digit run replaced by its ``num2words`` output.
    """

    def spell_out(digits_match):
        # The matched digits are forwarded to num2words as a string, unchanged.
        return num2words(digits_match.group(), lang='ru')

    digit_run = re.compile(r'\d+')
    return digit_run.sub(spell_out, text)
|
|
|
|
|
|
def preprocess_translit(text: str) -> str:
    """Return ``text`` passed through ``translit`` with language code ``'ru'``.

    NOTE(review): presumably this converts Latin-script fragments to
    Cyrillic -- confirm against the transliterate documentation.
    """
    transliterated = translit(text, 'ru')
    return transliterated
|
|
|
|
|
|
def preprocess(text: str) -> str:
    """Run ``text`` through the full preprocessing chain.

    The stages are applied in this order, each receiving the previous
    stage's output:

    1. ``preprocess_accents``
    2. ``preprocess_nums``
    3. ``preprocess_translit``

    Args:
        text: Raw input string.

    Returns:
        The string produced by the last stage.
    """
    # NOTE(review): accents are applied before digits and Latin text are
    # converted, so the words produced by the later stages never pass through
    # the accentizer -- confirm this ordering is intentional.
    return preprocess_translit(
        preprocess_nums(
            preprocess_accents(text)
        )
    )
|