diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9fb3340 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,32 @@ +FROM python:3.10-slim as builder + +WORKDIR /app + +ENV PYTHONDONTWRITEBYTECODE 1 +ENV PYTHONUNBUFFERED 1 + +RUN apt update && apt install git gcc libc6-dev -y --no-install-recommends && apt clean && rm -rf /var/lib/apt/lists/* +COPY requirements.txt . +RUN --mount=type=cache,target=/root/.cache/pip pip install Cython && \ + pip wheel --no-deps --wheel-dir /app/wheels -r requirements.txt && \ + + + +FROM python:3.10-slim + +ENV PYTHONDONTWRITEBYTECODE 1 +ENV PYTHONUNBUFFERED 1 + +RUN useradd -ms /bin/bash user && rm -rf /var/lib/apt/lists/* +USER user + +WORKDIR /app + +COPY --from=builder /app/wheels /wheels +COPY --from=builder /app/requirements.txt . + +RUN pip install --no-cache /wheels/* + +COPY . . + +CMD ["python3", "/app/main.py"] diff --git a/EngineABC.py b/EngineABC.py index 0d778c6..4bfd2b0 100644 --- a/EngineABC.py +++ b/EngineABC.py @@ -1,7 +1,6 @@ -from typing import Literal from abc import abstractmethod, ABC -from dataclasses import dataclass from pathlib import Path +from typing import Literal from pydantic import BaseModel @@ -29,5 +28,3 @@ class EngineABC(ABC): @abstractmethod def synth(self, text: str, model: str, *args, **kwargs) -> bytes: ... - -: \ No newline at end of file diff --git a/EnginesController.py b/EnginesController.py index b4334bc..7e6cf5f 100644 --- a/EnginesController.py +++ b/EnginesController.py @@ -1,10 +1,12 @@ -from loguru import logger import importlib -from EngineABC import EngineABC, ModelDescription from pathlib import Path + +from loguru import logger + +import config +from EngineABC import EngineABC, ModelDescription from analytics import measure from preprocessing import preprocess -import config class EnginesController: diff --git a/analytics.py b/analytics.py index 9c62be9..b187eab 100644 --- a/analytics.py +++ b/analytics.py @@ -35,4 +35,4 @@ class Measure: self.name = name def record(self) -> None: - print(f'{self.name}: {(time.time() - self.start) * 100} ms') \ No newline at end of file + print(f'{self.name}: {(time.time() - self.start) * 100} ms') diff --git a/engines/silero/__init__.py b/engines/silero/__init__.py index 7350246..4a05bf1 100644 --- a/engines/silero/__init__.py +++ b/engines/silero/__init__.py @@ -1,8 +1,11 @@ -from EngineABC import EngineABC, ModelDescription +import typing + import torch.package from loguru import logger + +from EngineABC import EngineABC, ModelDescription from to_wav import tensor2wav -import typing + if typing.TYPE_CHECKING: from .multi_v2_package import TTSModelMulti_v2 from pathlib import Path @@ -39,7 +42,20 @@ class Silero(EngineABC): ModelDescription( engine=self.__class__.__name__, name=speaker_name, - arguments=dict() + arguments=dict(), + description=desc ) - for speaker_name in self.model.speaker_to_id.keys() + for speaker_name, desc in { + 'aidar': 'ru', + 'baya': 'ru', + 'kseniya': 'ru', + 'irina': 'ru', + 'ruslan': 'ru', + 'natasha': 'ru', + 'thorsten': 'de', + 'tux': 'es', + 'gilles': 'fr', + 'lj': 'en', + 'dilyara': 'tt' + } ) diff --git a/engines/silero/multi_v2_package.py b/engines/silero/multi_v2_package.py index b12a824..bfdb625 100644 --- a/engines/silero/multi_v2_package.py +++ b/engines/silero/multi_v2_package.py @@ -1,8 +1,10 @@ -import re -import wave -import torch -import warnings import contextlib +import re +import warnings +import wave + +import torch + # for type hints only @@ -46,21 +48,22 @@ class TTSModelMulti_v2: text = [text] symbol_to_id = {s: i for i, s in enumerate(symbols)} if len(text) == 1: - return self.prepare_text_input(text[0], symbols, symbol_to_id).unsqueeze(0), torch.LongTensor(speakers), torch.LongTensor([0]) + return self.prepare_text_input(text[0], symbols, symbol_to_id).unsqueeze(0), torch.LongTensor( + speakers), torch.LongTensor([0]) text_tensors = [] for string in text: string_tensor = self.prepare_text_input(string, symbols, symbol_to_id) text_tensors.append(string_tensor) input_lengths, ids_sorted_decreasing = torch.sort( - torch.LongTensor([len(t) for t in text_tensors]), - dim=0, descending=True) + torch.LongTensor([len(t) for t in text_tensors]), + dim=0, descending=True) max_input_len = input_lengths[0] batch_size = len(text_tensors) text_padded = torch.ones(batch_size, max_input_len, dtype=torch.int32) if len(speakers) == 1: - speakers = speakers*batch_size + speakers = speakers * batch_size speaker_ids = torch.LongTensor(batch_size).zero_() for i, idx in enumerate(ids_sorted_decreasing): diff --git a/engines/teratts/TeraTTS/infer_onnx.py b/engines/teratts/TeraTTS/infer_onnx.py index fbee725..6226a7c 100644 --- a/engines/teratts/TeraTTS/infer_onnx.py +++ b/engines/teratts/TeraTTS/infer_onnx.py @@ -1,12 +1,15 @@ import os -import onnxruntime + import numpy as np +import onnxruntime from huggingface_hub import snapshot_download + from .tokenizer import TokenizerG2P class TTS: - def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 1.0, tokenizer_load_dict=True) -> None: + def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 1.0, + tokenizer_load_dict=True) -> None: if not os.path.exists(save_path): os.mkdir(save_path) diff --git a/engines/teratts/TeraTTS/tokenizer/__init__.py b/engines/teratts/TeraTTS/tokenizer/__init__.py index b34d838..504e2bc 100644 --- a/engines/teratts/TeraTTS/tokenizer/__init__.py +++ b/engines/teratts/TeraTTS/tokenizer/__init__.py @@ -1 +1 @@ -from .g2p import Tokenizer as TokenizerG2P \ No newline at end of file +from .g2p import Tokenizer as TokenizerG2P diff --git a/engines/teratts/TeraTTS/tokenizer/g2p/__init__.py b/engines/teratts/TeraTTS/tokenizer/g2p/__init__.py index cda8600..bec1b7e 100644 --- a/engines/teratts/TeraTTS/tokenizer/g2p/__init__.py +++ b/engines/teratts/TeraTTS/tokenizer/g2p/__init__.py @@ -1 +1 @@ -from .tokenizer import Tokenizer \ No newline at end of file +from .tokenizer import Tokenizer diff --git a/engines/teratts/TeraTTS/tokenizer/g2p/g2p.py b/engines/teratts/TeraTTS/tokenizer/g2p/g2p.py index be52206..2c4313a 100644 --- a/engines/teratts/TeraTTS/tokenizer/g2p/g2p.py +++ b/engines/teratts/TeraTTS/tokenizer/g2p/g2p.py @@ -1,59 +1,60 @@ - -softletters=set(u"яёюиье") -startsyl=set(u"#ъьаяоёуюэеиы-") +softletters = set(u"яёюиье") +startsyl = set(u"#ъьаяоёуюэеиы-") others = set(["#", "+", "-", u"ь", u"ъ"]) softhard_cons = { - u"б" : u"b", - u"в" : u"v", - u"г" : u"g", - u"Г" : u"g", - u"д" : u"d", - u"з" : u"z", - u"к" : u"k", - u"л" : u"l", - u"м" : u"m", - u"н" : u"n", - u"п" : u"p", - u"р" : u"r", - u"с" : u"s", - u"т" : u"t", - u"ф" : u"f", - u"х" : u"h" + u"б": u"b", + u"в": u"v", + u"г": u"g", + u"Г": u"g", + u"д": u"d", + u"з": u"z", + u"к": u"k", + u"л": u"l", + u"м": u"m", + u"н": u"n", + u"п": u"p", + u"р": u"r", + u"с": u"s", + u"т": u"t", + u"ф": u"f", + u"х": u"h" } other_cons = { - u"ж" : u"zh", - u"ц" : u"c", - u"ч" : u"ch", - u"ш" : u"sh", - u"щ" : u"sch", - u"й" : u"j" + u"ж": u"zh", + u"ц": u"c", + u"ч": u"ch", + u"ш": u"sh", + u"щ": u"sch", + u"й": u"j" } vowels = { - u"а" : u"a", - u"я" : u"a", - u"у" : u"u", - u"ю" : u"u", - u"о" : u"o", - u"ё" : u"o", - u"э" : u"e", - u"е" : u"e", - u"и" : u"i", - u"ы" : u"y", -} + u"а": u"a", + u"я": u"a", + u"у": u"u", + u"ю": u"u", + u"о": u"o", + u"ё": u"o", + u"э": u"e", + u"е": u"e", + u"и": u"i", + u"ы": u"y", +} + def pallatize(phones): for i, phone in enumerate(phones[:-1]): if phone[0] in softhard_cons: - if phones[i+1][0] in softletters: + if phones[i + 1][0] in softletters: phones[i] = (softhard_cons[phone[0]] + "j", 0) else: phones[i] = (softhard_cons[phone[0]], 0) if phone[0] in other_cons: phones[i] = (other_cons[phone[0]], 0) + def convert_vowels(phones): new_phones = [] prev = "" @@ -69,10 +70,10 @@ def convert_vowels(phones): return new_phones + def convert(stressword): phones = ("#" + stressword + "#") - # Assign stress marks stress_phones = [] stress = 0 diff --git a/engines/teratts/TeraTTS/tokenizer/g2p/tokenizer.py b/engines/teratts/TeraTTS/tokenizer/g2p/tokenizer.py index 5ab5b56..21b8994 100644 --- a/engines/teratts/TeraTTS/tokenizer/g2p/tokenizer.py +++ b/engines/teratts/TeraTTS/tokenizer/g2p/tokenizer.py @@ -1,19 +1,21 @@ -import re -from .g2p import * #noqa import json import os +import re + +from .g2p import * # noqa + class Tokenizer(): def __init__(self, data_path: str, load_dict=True) -> None: '''data_path - path to data dir; load_dict - load dict, if you use accent model like ruaccent you dont need its''' self.dic = {} if load_dict: - for line in open(os.path.join(data_path, "dictionary.txt")): #noqa + for line in open(os.path.join(data_path, "dictionary.txt")): # noqa items = line.split() self.dic[items[0]] = " ".join(items[1:]) - self.config = json.load(open(os.path.join(data_path, "config.json"))) #noqa - + self.config = json.load(open(os.path.join(data_path, "config.json"))) # noqa + def g2p(self, text): text = re.sub("—", "-", text) text = re.sub("([!'(),-.:;?])", r' \1 ', text) @@ -25,13 +27,13 @@ class Tokenizer(): continue word = word.lower() - if len(phonemes) > 0: + if len(phonemes) > 0: phonemes.append(' ') if word in self.dic: phonemes.extend(self.dic[word].split()) else: - phonemes.extend(convert(word).split()) #noqa + phonemes.extend(convert(word).split()) # noqa phoneme_id_map = self.config["phoneme_id_map"] phoneme_ids = [] @@ -44,7 +46,7 @@ class Tokenizer(): phoneme_ids.extend(phoneme_id_map["$"]) return phoneme_ids, phonemes - + def _get_seq(self, text: str) -> list[int]: seq = self.g2p(text)[0] - return seq \ No newline at end of file + return seq diff --git a/engines/teratts/__init__.py b/engines/teratts/__init__.py index 1292832..114c77e 100644 --- a/engines/teratts/__init__.py +++ b/engines/teratts/__init__.py @@ -1,7 +1,9 @@ -from to_wav import ndarray2wav from pathlib import Path + from loguru import logger + from EngineABC import EngineABC, ModelDescription, Argument +from to_wav import ndarray2wav from .TeraTTS import TTS @@ -14,7 +16,9 @@ class TeraTTSEngine(EngineABC): arguments={ 'lenght_scale': Argument( type='float', - description="'length_scale' можно использовать для замедления аудио для лучшего звучания, по умолчанию 1.1")}) + description="'length_scale' можно использовать для замедления аудио для лучшего звучания, по умолчанию 1.1")}, + description='Вроде ru' + ) for model_name in self.speakers.keys() ) @@ -24,7 +28,8 @@ class TeraTTSEngine(EngineABC): for speaker_name in ('natasha-g2p-vits', 'glados2-g2p-vits', 'glados-g2p-vits', 'girl_nice-g2p-vits'): logger.debug(f"Loading speaker: {speaker_name}") - self.speakers[speaker_name] = TTS(f"TeraTTS/{speaker_name}", add_time_to_end=1.0, save_path=str(save_path / 'tts')) + self.speakers[speaker_name] = TTS(f"TeraTTS/{speaker_name}", add_time_to_end=1.0, + save_path=str(save_path / 'tts')) def synth(self, text: str, model: str, **kwargs) -> bytes: tts = self.speakers[model] diff --git a/main.py b/main.py index e216b26..e320040 100644 --- a/main.py +++ b/main.py @@ -1,12 +1,11 @@ from contextlib import asynccontextmanager + from fastapi import FastAPI, Response -from pprint import pprint -from loguru import logger +from fastapi.concurrency import run_in_threadpool from pydantic import BaseModel -from EnginesController import EnginesController -from fastapi.concurrency import run_in_threadpool from EngineABC import ModelDescription +from EnginesController import EnginesController def play_bytes(bytes_sound: bytes) -> None: diff --git a/preprocessing.py b/preprocessing.py index bb8a4b7..0c61139 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -1,7 +1,9 @@ import re + from num2words import num2words -from transliterate import translit from ruaccent import RUAccent +from transliterate import translit + from config import BASE diff --git a/to_wav.py b/to_wav.py index bb4d661..e97bdcb 100644 --- a/to_wav.py +++ b/to_wav.py @@ -1,10 +1,11 @@ -import numpy -import io import contextlib +import io import wave -import torch from collections.abc import Iterable +import numpy +import torch + def frames2wav(resulting_array: Iterable[int], sample_rate: int) -> bytes: res_io_stream = io.BytesIO()