This commit is contained in:
norohind 2024-05-29 01:44:19 +03:00
parent f123cc1f86
commit a68051e1bb
Signed by: norohind
GPG Key ID: 01C3BECC26FB59E1
15 changed files with 145 additions and 82 deletions

32
Dockerfile Normal file
View File

@ -0,0 +1,32 @@
# Two-stage build: the "builder" stage compiles wheels for requirements.txt,
# the final stage installs those wheels and runs the app as a non-root user.
# NOTE: `RUN --mount=type=cache` requires BuildKit.
FROM python:3.10-slim AS builder
WORKDIR /app
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# Toolchain needed to build wheels; apt-get is the script-stable CLI.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git gcc libc6-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
# The last command must not end with `&& \`: a dangling continuation would
# pull the following FROM line into this RUN instruction.
RUN --mount=type=cache,target=/root/.cache/pip pip install Cython && \
    pip wheel --no-deps --wheel-dir /app/wheels -r requirements.txt

FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
RUN useradd -ms /bin/bash user && rm -rf /var/lib/apt/lists/*
USER user
WORKDIR /app
COPY --from=builder /app/wheels /wheels
COPY --from=builder /app/requirements.txt .
RUN pip install --no-cache-dir /wheels/*
COPY . .
CMD ["python3", "/app/main.py"]

View File

@ -1,7 +1,6 @@
from typing import Literal
from abc import abstractmethod, ABC
from dataclasses import dataclass
from pathlib import Path
from typing import Literal
from pydantic import BaseModel
@ -29,5 +28,3 @@ class EngineABC(ABC):
@abstractmethod
def synth(self, text: str, model: str, *args, **kwargs) -> bytes:
...
:

View File

@ -1,10 +1,12 @@
from loguru import logger
import importlib
from EngineABC import EngineABC, ModelDescription
from pathlib import Path
from loguru import logger
import config
from EngineABC import EngineABC, ModelDescription
from analytics import measure
from preprocessing import preprocess
import config
class EnginesController:

View File

@ -35,4 +35,4 @@ class Measure:
self.name = name
def record(self) -> None:
    """Print the time elapsed since ``self.start``, in milliseconds.

    The output line has the form ``<name>: <elapsed> ms`` where ``<name>``
    is ``self.name``.
    """
    # time.time() works in seconds, so the factor for milliseconds is 1000
    # (the previous factor of 100 under-reported the value tenfold).
    elapsed_ms = (time.time() - self.start) * 1000
    print(f'{self.name}: {elapsed_ms} ms')

View File

@ -1,8 +1,11 @@
from EngineABC import EngineABC, ModelDescription
import typing
import torch.package
from loguru import logger
from EngineABC import EngineABC, ModelDescription
from to_wav import tensor2wav
import typing
if typing.TYPE_CHECKING:
from .multi_v2_package import TTSModelMulti_v2
from pathlib import Path
@ -39,7 +42,20 @@ class Silero(EngineABC):
ModelDescription(
engine=self.__class__.__name__,
name=speaker_name,
arguments=dict()
arguments=dict(),
description=desc
)
for speaker_name in self.model.speaker_to_id.keys()
for speaker_name, desc in {
'aidar': 'ru',
'baya': 'ru',
'kseniya': 'ru',
'irina': 'ru',
'ruslan': 'ru',
'natasha': 'ru',
'thorsten': 'de',
'tux': 'es',
'gilles': 'fr',
'lj': 'en',
'dilyara': 'tt'
}
)

View File

@ -1,8 +1,10 @@
import re
import wave
import torch
import warnings
import contextlib
import re
import warnings
import wave
import torch
# for type hints only
@ -46,21 +48,22 @@ class TTSModelMulti_v2:
text = [text]
symbol_to_id = {s: i for i, s in enumerate(symbols)}
if len(text) == 1:
return self.prepare_text_input(text[0], symbols, symbol_to_id).unsqueeze(0), torch.LongTensor(speakers), torch.LongTensor([0])
return self.prepare_text_input(text[0], symbols, symbol_to_id).unsqueeze(0), torch.LongTensor(
speakers), torch.LongTensor([0])
text_tensors = []
for string in text:
string_tensor = self.prepare_text_input(string, symbols, symbol_to_id)
text_tensors.append(string_tensor)
input_lengths, ids_sorted_decreasing = torch.sort(
torch.LongTensor([len(t) for t in text_tensors]),
dim=0, descending=True)
torch.LongTensor([len(t) for t in text_tensors]),
dim=0, descending=True)
max_input_len = input_lengths[0]
batch_size = len(text_tensors)
text_padded = torch.ones(batch_size, max_input_len, dtype=torch.int32)
if len(speakers) == 1:
speakers = speakers*batch_size
speakers = speakers * batch_size
speaker_ids = torch.LongTensor(batch_size).zero_()
for i, idx in enumerate(ids_sorted_decreasing):

View File

@ -1,12 +1,15 @@
import os
import onnxruntime
import numpy as np
import onnxruntime
from huggingface_hub import snapshot_download
from .tokenizer import TokenizerG2P
class TTS:
def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 1.0, tokenizer_load_dict=True) -> None:
def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 1.0,
tokenizer_load_dict=True) -> None:
if not os.path.exists(save_path):
os.mkdir(save_path)

View File

@ -1 +1 @@
from .g2p import Tokenizer as TokenizerG2P
from .g2p import Tokenizer as TokenizerG2P

View File

@ -1 +1 @@
from .tokenizer import Tokenizer
from .tokenizer import Tokenizer

View File

@ -1,59 +1,60 @@
softletters=set(u"яёюиье")
startsyl=set(u"#ъьаяоёуюэеиы-")
softletters = set(u"яёюиье")
startsyl = set(u"#ъьаяоёуюэеиы-")
others = set(["#", "+", "-", u"ь", u"ъ"])
softhard_cons = {
u"б" : u"b",
u"в" : u"v",
u"г" : u"g",
u"Г" : u"g",
u"д" : u"d",
u"з" : u"z",
u"к" : u"k",
u"л" : u"l",
u"м" : u"m",
u"н" : u"n",
u"п" : u"p",
u"р" : u"r",
u"с" : u"s",
u"т" : u"t",
u"ф" : u"f",
u"х" : u"h"
u"б": u"b",
u"в": u"v",
u"г": u"g",
u"Г": u"g",
u"д": u"d",
u"з": u"z",
u"к": u"k",
u"л": u"l",
u"м": u"m",
u"н": u"n",
u"п": u"p",
u"р": u"r",
u"с": u"s",
u"т": u"t",
u"ф": u"f",
u"х": u"h"
}
other_cons = {
u"ж" : u"zh",
u"ц" : u"c",
u"ч" : u"ch",
u"ш" : u"sh",
u"щ" : u"sch",
u"й" : u"j"
u"ж": u"zh",
u"ц": u"c",
u"ч": u"ch",
u"ш": u"sh",
u"щ": u"sch",
u"й": u"j"
}
vowels = {
u"а" : u"a",
u"я" : u"a",
u"у" : u"u",
u"ю" : u"u",
u"о" : u"o",
u"ё" : u"o",
u"э" : u"e",
u"е" : u"e",
u"и" : u"i",
u"ы" : u"y",
}
u"а": u"a",
u"я": u"a",
u"у": u"u",
u"ю": u"u",
u"о": u"o",
u"ё": u"o",
u"э": u"e",
u"е": u"e",
u"и": u"i",
u"ы": u"y",
}
def pallatize(phones):
    """Rewrite consonant entries of ``phones`` in place with phoneme codes.

    Every item except the last is inspected; its first element is treated
    as the letter. A letter found in ``softhard_cons`` is replaced by
    ``(code, 0)``, with ``"j"`` appended to the code when the next item's
    letter is in ``softletters``. A letter found in ``other_cons`` is
    replaced by ``(code, 0)`` from that table. Nothing is returned.
    """
    for idx in range(len(phones) - 1):
        # Capture the letter before any rewrite of this slot.
        letter = phones[idx][0]
        if letter in softhard_cons:
            code = softhard_cons[letter]
            if phones[idx + 1][0] in softletters:
                code += "j"
            phones[idx] = (code, 0)
        if letter in other_cons:
            phones[idx] = (other_cons[letter], 0)
def convert_vowels(phones):
new_phones = []
prev = ""
@ -69,10 +70,10 @@ def convert_vowels(phones):
return new_phones
def convert(stressword):
phones = ("#" + stressword + "#")
# Assign stress marks
stress_phones = []
stress = 0

View File

@ -1,19 +1,21 @@
import re
from .g2p import * #noqa
import json
import os
import re
from .g2p import * # noqa
class Tokenizer():
def __init__(self, data_path: str, load_dict=True) -> None:
    """Load the pronunciation dictionary and config found in ``data_path``.

    Args:
        data_path: Directory containing ``config.json`` and, when
            ``load_dict`` is true, ``dictionary.txt``.
        load_dict: Whether to read ``dictionary.txt`` into ``self.dic``.
            Not needed when an accent model such as ruaccent is used.

    Raises:
        OSError: If a required file cannot be opened.
    """
    self.dic = {}
    if load_dict:
        # Context managers close the files deterministically; utf-8 is
        # explicit so the result does not depend on the platform locale.
        # NOTE(review): assumes the data files are utf-8 encoded - confirm.
        with open(os.path.join(data_path, "dictionary.txt"), encoding="utf-8") as dict_file:
            for line in dict_file:
                items = line.split()
                if not items:
                    # Blank line: nothing to index.
                    continue
                self.dic[items[0]] = " ".join(items[1:])
    with open(os.path.join(data_path, "config.json"), encoding="utf-8") as config_file:
        self.config = json.load(config_file)
def g2p(self, text):
text = re.sub("", "-", text)
text = re.sub("([!'(),-.:;?])", r' \1 ', text)
@ -25,13 +27,13 @@ class Tokenizer():
continue
word = word.lower()
if len(phonemes) > 0:
if len(phonemes) > 0:
phonemes.append(' ')
if word in self.dic:
phonemes.extend(self.dic[word].split())
else:
phonemes.extend(convert(word).split()) #noqa
phonemes.extend(convert(word).split()) # noqa
phoneme_id_map = self.config["phoneme_id_map"]
phoneme_ids = []
@ -44,7 +46,7 @@ class Tokenizer():
phoneme_ids.extend(phoneme_id_map["$"])
return phoneme_ids, phonemes
def _get_seq(self, text: str) -> list[int]:
    """Return the first element of ``self.g2p(text)``, i.e. the phoneme ids."""
    return self.g2p(text)[0]

View File

@ -1,7 +1,9 @@
from to_wav import ndarray2wav
from pathlib import Path
from loguru import logger
from EngineABC import EngineABC, ModelDescription, Argument
from to_wav import ndarray2wav
from .TeraTTS import TTS
@ -14,7 +16,9 @@ class TeraTTSEngine(EngineABC):
arguments={
'lenght_scale': Argument(
type='float',
description="'length_scale' можно использовать для замедления аудио для лучшего звучания, по умолчанию 1.1")})
description="'length_scale' можно использовать для замедления аудио для лучшего звучания, по умолчанию 1.1")},
description='Вроде ru'
)
for model_name in self.speakers.keys()
)
@ -24,7 +28,8 @@ class TeraTTSEngine(EngineABC):
for speaker_name in ('natasha-g2p-vits', 'glados2-g2p-vits', 'glados-g2p-vits', 'girl_nice-g2p-vits'):
logger.debug(f"Loading speaker: {speaker_name}")
self.speakers[speaker_name] = TTS(f"TeraTTS/{speaker_name}", add_time_to_end=1.0, save_path=str(save_path / 'tts'))
self.speakers[speaker_name] = TTS(f"TeraTTS/{speaker_name}", add_time_to_end=1.0,
save_path=str(save_path / 'tts'))
def synth(self, text: str, model: str, **kwargs) -> bytes:
tts = self.speakers[model]

View File

@ -1,12 +1,11 @@
from contextlib import asynccontextmanager
from fastapi import FastAPI, Response
from pprint import pprint
from loguru import logger
from fastapi.concurrency import run_in_threadpool
from pydantic import BaseModel
from EnginesController import EnginesController
from fastapi.concurrency import run_in_threadpool
from EngineABC import ModelDescription
from EnginesController import EnginesController
def play_bytes(bytes_sound: bytes) -> None:

View File

@ -1,7 +1,9 @@
import re
from num2words import num2words
from transliterate import translit
from ruaccent import RUAccent
from transliterate import translit
from config import BASE

View File

@ -1,10 +1,11 @@
import numpy
import io
import contextlib
import io
import wave
import torch
from collections.abc import Iterable
import numpy
import torch
def frames2wav(resulting_array: Iterable[int], sample_rate: int) -> bytes:
res_io_stream = io.BytesIO()