ttsserver/preprocessing.py
2024-05-29 02:39:47 +03:00

44 lines
1015 B
Python

import re
from num2words import num2words
from ruaccent import RUAccent
from transliterate import translit
from config import BASE
class Accents:
    """Callable wrapper around a ``RUAccent`` accentizer.

    The accentizer is created and loaded once, when the wrapper is
    constructed; calling the instance runs ``process_all`` on the text.
    """

    def __init__(self):
        """Create the ``RUAccent`` instance and load its resources."""
        # Resources are looked up under BASE / 'preprocessing_accents'.
        accents_dir = str(BASE / 'preprocessing_accents')

        accentizer = RUAccent()
        accentizer.load(
            omograph_model_size='turbo',
            use_dictionary=True,
            workdir=accents_dir,
        )
        self.accentizer = accentizer

    def __call__(self, text: str) -> str:
        """Return ``text`` as processed by ``RUAccent.process_all``."""
        processed = self.accentizer.process_all(text)
        return processed
# Shared module-level instance; constructing it runs Accents.__init__, so the
# RUAccent resources are loaded when this module is imported.
preprocess_accents = Accents()
def preprocess_nums(text: str) -> str:
    """Replace every run of digits in ``text`` with its Russian wording.

    Each maximal ``\\d+`` match is handed to ``num2words`` (as the matched
    string, with ``lang='ru'``) and substituted by the returned words.
    Only digit runs are matched, so separators such as ``.`` or ``,`` stay
    in place and the digit groups around them are converted independently.

    Args:
        text: Input text, possibly containing digits.

    Returns:
        The text with all digit runs replaced; text without digits is
        returned unchanged.
    """
    digit_run = re.compile(r'\d+')

    def spell_out(found: re.Match) -> str:
        # The matched digits are passed through as a string, not an int.
        digits = found.group()
        return num2words(digits, lang='ru')

    return digit_run.sub(spell_out, text)
def preprocess_translit(text: str) -> str:
    """Return ``text`` passed through ``translit`` with language code ``'ru'``."""
    transliterated = translit(text, 'ru')
    return transliterated
def preprocess(text: str) -> str:
    """Run the full preprocessing chain on ``text``.

    The steps are applied in a fixed order, each receiving the output of
    the previous one: ``preprocess_accents``, then ``preprocess_nums``,
    then ``preprocess_translit``.

    NOTE(review): the accent step runs first, so words produced later by
    number expansion or transliteration never pass through it -- confirm
    this ordering is intended.
    """
    accented = preprocess_accents(text)
    numbers_spelled = preprocess_nums(accented)
    return preprocess_translit(numbers_spelled)