# ttsserver/preprocessing.py

import re

from num2words import num2words
from ruaccent import RUAccent
from transliterate import translit

from config import BASE


class Accents:
    """Callable wrapper around a configured ``RUAccent`` instance.

    The accentizer is created and loaded once, in the constructor; calling
    the object afterwards passes a string through
    ``RUAccent.process_all`` and returns the result.
    """

    def __init__(self):
        """Create the ``RUAccent`` engine and load its resources.

        Loading uses the ``'turbo'`` omograph model size, enables the
        dictionary, and uses ``BASE / 'preprocessing_accents'`` as the
        working directory.
        """
        engine = RUAccent()
        # RUAccent.load receives the workdir as a plain string, not a Path.
        workdir = str(BASE / 'preprocessing_accents')
        engine.load(
            omograph_model_size='turbo',
            use_dictionary=True,
            workdir=workdir,
        )
        self.accentizer = engine

    def __call__(self, text: str) -> str:
        """Return ``text`` as processed by the loaded accentizer."""
        processed = self.accentizer.process_all(text)
        return processed


# Shared module-level accentizer. It is instantiated at import time, so
# Accents.__init__ (and therefore RUAccent.load) runs when this module is
# first imported. Used as the first stage of preprocess().
preprocess_accents = Accents()


def preprocess_nums(text: str) -> str:
    """Spell out every run of digits in ``text`` as Russian words.

    Each maximal ``\\d+`` match is passed (as a string) to ``num2words``
    with ``lang='ru'`` and replaced by the returned words. Digit runs are
    converted independently, so separators between them (e.g. the dot in
    ``3.14``) are left in place. Text without digits is returned unchanged.
    """
    return re.sub(
        r'\d+',
        lambda digits: num2words(digits.group(), lang='ru'),
        text,
    )


def preprocess_translit(text: str) -> str:
    """Return ``text`` transliterated with ``translit`` for language ``'ru'``."""
    transliterated = translit(text, 'ru')
    return transliterated


def preprocess(text: str) -> str:
    """Run ``text`` through the full preprocessing pipeline.

    Stages are applied in order, each receiving the previous stage's output:
    accentuation, number spelling, then transliteration.
    """
    # NOTE(review): accents are applied before numbers are expanded and
    # Latin text is transliterated, so the accentizer never sees the words
    # those later stages produce — confirm this order is intentional.
    pipeline = (preprocess_accents, preprocess_nums, preprocess_translit)
    for stage in pipeline:
        text = stage(text)
    return text