WIP

parent f123cc1f86
commit a68051e1bb

Dockerfile (new file, 32 lines)
@@ -0,0 +1,32 @@
FROM python:3.10-slim as builder

WORKDIR /app

ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1

RUN apt update && apt install git gcc libc6-dev -y --no-install-recommends && apt clean && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN --mount=type=cache,target=/root/.cache/pip pip install Cython && \
    pip wheel --no-deps --wheel-dir /app/wheels -r requirements.txt

FROM python:3.10-slim

ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1

RUN useradd -ms /bin/bash user && rm -rf /var/lib/apt/lists/*
USER user

WORKDIR /app

COPY --from=builder /app/wheels /wheels
COPY --from=builder /app/requirements.txt .

RUN pip install --no-cache-dir /wheels/*

COPY . .

CMD ["python3", "/app/main.py"]
EngineABC.py
@@ -1,7 +1,6 @@
-from typing import Literal
 from abc import abstractmethod, ABC
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Literal

 from pydantic import BaseModel

@@ -29,5 +28,3 @@ class EngineABC(ABC):
     @abstractmethod
     def synth(self, text: str, model: str, *args, **kwargs) -> bytes:
         ...
EnginesController.py
@@ -1,10 +1,12 @@
-from loguru import logger
 import importlib
-from EngineABC import EngineABC, ModelDescription
 from pathlib import Path

+from loguru import logger
+
+import config
+from EngineABC import EngineABC, ModelDescription
 from analytics import measure
 from preprocessing import preprocess
-import config


 class EnginesController:
analytics.py
@@ -35,4 +35,4 @@ class Measure:
         self.name = name

     def record(self) -> None:
-        print(f'{self.name}: {(time.time() - self.start) * 100} ms')
+        print(f'{self.name}: {(time.time() - self.start) * 100} ms')
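
Note: time.time() returns seconds, so the "* 100" above is off by a factor
of ten from the "ms" label; seconds to milliseconds is "* 1000". A minimal
corrected sketch (only name and record appear in the hunk; the start
attribute is assumed from the "time.time() - self.start" expression):

    import time


    class Measure:
        def __init__(self, name: str) -> None:
            self.name = name
            self.start = time.time()  # assumed: set when the measurement begins

        def record(self) -> None:
            # seconds -> milliseconds is a factor of 1000, not 100
            print(f'{self.name}: {(time.time() - self.start) * 1000} ms')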
Silero engine
@@ -1,8 +1,11 @@
-from EngineABC import EngineABC, ModelDescription
+import typing

 import torch.package
 from loguru import logger

+from EngineABC import EngineABC, ModelDescription
 from to_wav import tensor2wav
-import typing
+
+if typing.TYPE_CHECKING:
+    from .multi_v2_package import TTSModelMulti_v2
 from pathlib import Path
@@ -39,7 +42,20 @@ class Silero(EngineABC):
             ModelDescription(
                 engine=self.__class__.__name__,
                 name=speaker_name,
-                arguments=dict()
+                arguments=dict(),
+                description=desc
             )
-            for speaker_name in self.model.speaker_to_id.keys()
+            for speaker_name, desc in {
+                'aidar': 'ru',
+                'baya': 'ru',
+                'kseniya': 'ru',
+                'irina': 'ru',
+                'ruslan': 'ru',
+                'natasha': 'ru',
+                'thorsten': 'de',
+                'tux': 'es',
+                'gilles': 'fr',
+                'lj': 'en',
+                'dilyara': 'tt'
+            }
         )
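
Note: iterating a dict directly yields only its keys, so the unpack in
"for speaker_name, desc in {...}" will raise ValueError for any key whose
length is not exactly two characters. The comprehension presumably wants
.items(); a minimal sketch (dict truncated):

    descriptions = {'aidar': 'ru', 'baya': 'ru', 'thorsten': 'de'}

    # .items() yields (key, value) pairs, matching the two-name unpack
    models = [(speaker_name, desc) for speaker_name, desc in descriptions.items()]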
multi_v2_package.py
@@ -1,8 +1,10 @@
-import re
-import wave
-import torch
-import warnings
 import contextlib
+import re
+import warnings
+import wave
+
+import torch


 # for type hints only
@@ -46,21 +48,22 @@ class TTSModelMulti_v2:
             text = [text]
         symbol_to_id = {s: i for i, s in enumerate(symbols)}
         if len(text) == 1:
-            return self.prepare_text_input(text[0], symbols, symbol_to_id).unsqueeze(0), torch.LongTensor(speakers), torch.LongTensor([0])
+            return self.prepare_text_input(text[0], symbols, symbol_to_id).unsqueeze(0), torch.LongTensor(
+                speakers), torch.LongTensor([0])

         text_tensors = []
         for string in text:
             string_tensor = self.prepare_text_input(string, symbols, symbol_to_id)
             text_tensors.append(string_tensor)
         input_lengths, ids_sorted_decreasing = torch.sort(
-            torch.LongTensor([len(t) for t in text_tensors]),
-            dim=0, descending=True)
+            torch.LongTensor([len(t) for t in text_tensors]),
+            dim=0, descending=True)
         max_input_len = input_lengths[0]
         batch_size = len(text_tensors)

         text_padded = torch.ones(batch_size, max_input_len, dtype=torch.int32)
         if len(speakers) == 1:
-            speakers = speakers*batch_size
+            speakers = speakers * batch_size
         speaker_ids = torch.LongTensor(batch_size).zero_()

         for i, idx in enumerate(ids_sorted_decreasing):
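
The batching above is the usual sort-by-length-then-pad scheme. A
self-contained sketch of just that pattern (toy tensors; pad value 1 as
with torch.ones above):

    import torch

    tensors = [torch.tensor([5, 6, 7], dtype=torch.int32),
               torch.tensor([8], dtype=torch.int32)]

    # sort lengths in descending order, remembering original indices
    input_lengths, ids_sorted = torch.sort(
        torch.LongTensor([len(t) for t in tensors]), dim=0, descending=True)

    # one row per sequence, padded with ones up to the longest length
    padded = torch.ones(len(tensors), int(input_lengths[0]), dtype=torch.int32)
    for i, idx in enumerate(ids_sorted):
        padded[i, :len(tensors[idx])] = tensors[idx]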
TeraTTS/__init__.py
@@ -1,12 +1,15 @@
 import os
-import onnxruntime
+
 import numpy as np
+import onnxruntime
 from huggingface_hub import snapshot_download

 from .tokenizer import TokenizerG2P


 class TTS:
-    def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 1.0, tokenizer_load_dict=True) -> None:
+    def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 1.0,
+                 tokenizer_load_dict=True) -> None:
         if not os.path.exists(save_path):
             os.mkdir(save_path)
(re-export module, whitespace-only change)
@@ -1 +1 @@
-from .g2p import Tokenizer as TokenizerG2P
+from .g2p import Tokenizer as TokenizerG2P

(re-export module, whitespace-only change)
@@ -1 +1 @@
-from .tokenizer import Tokenizer
+from .tokenizer import Tokenizer
g2p.py
@@ -1,59 +1,60 @@
-softletters=set(u"яёюиье")
-startsyl=set(u"#ъьаяоёуюэеиы-")
+softletters = set(u"яёюиье")
+startsyl = set(u"#ъьаяоёуюэеиы-")
 others = set(["#", "+", "-", u"ь", u"ъ"])

 softhard_cons = {
-    u"б" : u"b",
-    u"в" : u"v",
-    u"г" : u"g",
-    u"Г" : u"g",
-    u"д" : u"d",
-    u"з" : u"z",
-    u"к" : u"k",
-    u"л" : u"l",
-    u"м" : u"m",
-    u"н" : u"n",
-    u"п" : u"p",
-    u"р" : u"r",
-    u"с" : u"s",
-    u"т" : u"t",
-    u"ф" : u"f",
-    u"х" : u"h"
+    u"б": u"b",
+    u"в": u"v",
+    u"г": u"g",
+    u"Г": u"g",
+    u"д": u"d",
+    u"з": u"z",
+    u"к": u"k",
+    u"л": u"l",
+    u"м": u"m",
+    u"н": u"n",
+    u"п": u"p",
+    u"р": u"r",
+    u"с": u"s",
+    u"т": u"t",
+    u"ф": u"f",
+    u"х": u"h"
 }

 other_cons = {
-    u"ж" : u"zh",
-    u"ц" : u"c",
-    u"ч" : u"ch",
-    u"ш" : u"sh",
-    u"щ" : u"sch",
-    u"й" : u"j"
+    u"ж": u"zh",
+    u"ц": u"c",
+    u"ч": u"ch",
+    u"ш": u"sh",
+    u"щ": u"sch",
+    u"й": u"j"
 }

 vowels = {
-    u"а" : u"a",
-    u"я" : u"a",
-    u"у" : u"u",
-    u"ю" : u"u",
-    u"о" : u"o",
-    u"ё" : u"o",
-    u"э" : u"e",
-    u"е" : u"e",
-    u"и" : u"i",
-    u"ы" : u"y",
-}
+    u"а": u"a",
+    u"я": u"a",
+    u"у": u"u",
+    u"ю": u"u",
+    u"о": u"o",
+    u"ё": u"o",
+    u"э": u"e",
+    u"е": u"e",
+    u"и": u"i",
+    u"ы": u"y",
+}


 def pallatize(phones):
     for i, phone in enumerate(phones[:-1]):
         if phone[0] in softhard_cons:
-            if phones[i+1][0] in softletters:
+            if phones[i + 1][0] in softletters:
                 phones[i] = (softhard_cons[phone[0]] + "j", 0)
             else:
                 phones[i] = (softhard_cons[phone[0]], 0)
         if phone[0] in other_cons:
             phones[i] = (other_cons[phone[0]], 0)


 def convert_vowels(phones):
     new_phones = []
     prev = ""
@@ -69,10 +70,10 @@ def convert_vowels(phones):

     return new_phones


 def convert(stressword):
     phones = ("#" + stressword + "#")

     # Assign stress marks
     stress_phones = []
     stress = 0
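
For reference, the palatalization rule above in action, assuming phones are
(letter, stress) pairs as the function body implies:

    phones = [("п", 0), ("я", 0), ("#", 0)]
    pallatize(phones)
    # phones[0] is now ("pj", 0): "п" maps to "p", and the following
    # "я" is in softletters, so the consonant gains a "j"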
tokenizer.py
@@ -1,19 +1,21 @@
-import re
-from .g2p import * #noqa
 import json
 import os
+import re
+
+from .g2p import *  # noqa


 class Tokenizer():
     def __init__(self, data_path: str, load_dict=True) -> None:
         '''data_path - path to the data dir; load_dict - load the dictionary (not needed if you use an accent model like ruaccent)'''
         self.dic = {}
         if load_dict:
-            for line in open(os.path.join(data_path, "dictionary.txt")): #noqa
+            for line in open(os.path.join(data_path, "dictionary.txt")):  # noqa
                 items = line.split()
                 self.dic[items[0]] = " ".join(items[1:])

-        self.config = json.load(open(os.path.join(data_path, "config.json"))) #noqa
+        self.config = json.load(open(os.path.join(data_path, "config.json")))  # noqa

     def g2p(self, text):
         text = re.sub("—", "-", text)
         text = re.sub("([!'(),-.:;?])", r' \1 ', text)

@@ -25,13 +27,13 @@ class Tokenizer():
             continue

         word = word.lower()
-        if len(phonemes) > 0:
+        if len(phonemes) > 0:
             phonemes.append(' ')

         if word in self.dic:
             phonemes.extend(self.dic[word].split())
         else:
-            phonemes.extend(convert(word).split()) #noqa
+            phonemes.extend(convert(word).split())  # noqa

         phoneme_id_map = self.config["phoneme_id_map"]
         phoneme_ids = []

@@ -44,7 +46,7 @@ class Tokenizer():
         phoneme_ids.extend(phoneme_id_map["$"])

         return phoneme_ids, phonemes


     def _get_seq(self, text: str) -> list[int]:
         seq = self.g2p(text)[0]
-        return seq
+        return seq
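
A hypothetical usage sketch, using only the methods shown in these hunks
(the data directory containing dictionary.txt and config.json is an
assumption):

    tok = Tokenizer("data", load_dict=True)

    # g2p() returns (phoneme_ids, phonemes); _get_seq() keeps only the ids
    phoneme_ids, phonemes = tok.g2p("привет мир")
    seq = tok._get_seq("привет мир")  # same ids as phoneme_ids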
TeraTTS engine
@@ -1,7 +1,9 @@
-from to_wav import ndarray2wav
 from pathlib import Path

 from loguru import logger

 from EngineABC import EngineABC, ModelDescription, Argument
+from to_wav import ndarray2wav
 from .TeraTTS import TTS

@@ -14,7 +16,9 @@ class TeraTTSEngine(EngineABC):
             arguments={
                 'lenght_scale': Argument(
                     type='float',
-                    description="'length_scale' можно использовать для замедления аудио для лучшего звучания, по умолчанию 1.1")})
+                    description="'length_scale' можно использовать для замедления аудио для лучшего звучания, по умолчанию 1.1")},
+                description='Вроде ru'
             )
             for model_name in self.speakers.keys()
         )

@@ -24,7 +28,8 @@ class TeraTTSEngine(EngineABC):
         for speaker_name in ('natasha-g2p-vits', 'glados2-g2p-vits', 'glados-g2p-vits', 'girl_nice-g2p-vits'):
             logger.debug(f"Loading speaker: {speaker_name}")
-            self.speakers[speaker_name] = TTS(f"TeraTTS/{speaker_name}", add_time_to_end=1.0, save_path=str(save_path / 'tts'))
+            self.speakers[speaker_name] = TTS(f"TeraTTS/{speaker_name}", add_time_to_end=1.0,
+                                              save_path=str(save_path / 'tts'))

     def synth(self, text: str, model: str, **kwargs) -> bytes:
         tts = self.speakers[model]
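
Note: the Argument key is spelled 'lenght_scale' while its own description
talks about 'length_scale', so a caller following the description would pass
a kwarg that never matches the declared key. Hypothetical calls per the
synth signature above (engine is an assumed TeraTTSEngine instance):

    # as declared (typo preserved):
    audio = engine.synth("привет", model='natasha-g2p-vits', lenght_scale=1.1)

    # as the description suggests (does not match the declared key):
    audio = engine.synth("привет", model='natasha-g2p-vits', length_scale=1.1)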
main.py (7 lines changed)
@@ -1,12 +1,11 @@
 from contextlib import asynccontextmanager

 from fastapi import FastAPI, Response
-from pprint import pprint
 from loguru import logger
+from fastapi.concurrency import run_in_threadpool
 from pydantic import BaseModel

-from EnginesController import EnginesController
-from fastapi.concurrency import run_in_threadpool
+from EngineABC import ModelDescription
+from EnginesController import EnginesController


 def play_bytes(bytes_sound: bytes) -> None:
preprocessing.py
@@ -1,7 +1,9 @@
 import re

 from num2words import num2words
-from transliterate import translit
 from ruaccent import RUAccent
+from transliterate import translit

 from config import BASE