diff --git a/server/Exceptions.py b/server/Exceptions.py
index 5c8d2f197..57c2a8e82 100644
--- a/server/Exceptions.py
+++ b/server/Exceptions.py
@@ -1,11 +1,3 @@
-class NoModeLoadedException(Exception):
-    def __init__(self, framework):
-        self.framework = framework
-
-    def __str__(self):
-        return repr(f"No model for {self.framework} loaded. Please confirm the model uploaded.")
-
-
 class VoiceChangerIsNotSelectedException(Exception):
     def __str__(self):
         return repr("Voice Changer is not selected.")
diff --git a/server/MMVCServerSIO.spec b/server/MMVCServerSIO.spec
index c7d4d2f64..7da1fec0a 100644
--- a/server/MMVCServerSIO.spec
+++ b/server/MMVCServerSIO.spec
@@ -3,7 +3,6 @@ from PyInstaller.utils.hooks import collect_data_files, collect_all, collect_dyn
 import sys
 import os.path
 import site
-import logging
 
 sys.setrecursionlimit(sys.getrecursionlimit() * 5)
 
diff --git a/server/client.py b/server/client.py
index d099a4ea0..14c0177c8 100644
--- a/server/client.py
+++ b/server/client.py
@@ -1,12 +1,10 @@
 import asyncio
-import traceback
 
 from main import setupArgParser, main
 from utils.strtobool import strtobool
-from mods.log_control import VoiceChangaerLogger
 
-VoiceChangaerLogger.get_instance().initialize(initialize=True)
-logger = VoiceChangaerLogger.get_instance().getLogger()
+import logging
+logger = logging.getLogger(__name__)
 
 if __name__ == "__main__":
     parser = setupArgParser()
@@ -16,5 +14,5 @@
     try:
         asyncio.run(main(args))
     except Exception as e:
-        print(traceback.format_exc())
+        logger.exception(e)
         input('Press Enter to continue...')
diff --git a/server/const.py b/server/const.py
index e2ae0c20e..74b006ef5 100644
--- a/server/const.py
+++ b/server/const.py
@@ -40,7 +40,6 @@ class EnumInferenceTypes(Enum):
     pyTorchRVCv2Nono = "pyTorchRVCv2Nono"
     pyTorchWebUI = "pyTorchWebUI"
     pyTorchWebUINono = "pyTorchWebUINono"
-    pyTorchVoRASbeta = "pyTorchVoRASbeta"
     onnxRVC = "onnxRVC"
     onnxRVCNono = "onnxRVCNono"
 
diff --git a/server/data/ModelSlot.py b/server/data/ModelSlot.py
index 61969ec16..800585bf2 100644
--- a/server/data/ModelSlot.py
+++ b/server/data/ModelSlot.py
@@ -5,7 +5,8 @@
 import os
 import json
 
-
+import logging
+logger = logging.getLogger(__name__)
 
 @dataclass
 class ModelSlot:
@@ -76,7 +77,7 @@ def loadAllSlotInfo(model_dir: str):
 
 def saveSlotInfo(model_dir: str, slotIndex: int, slotInfo: ModelSlots):
     slotDir = os.path.join(model_dir, str(slotIndex))
-    print("SlotInfo:::", slotInfo)
+    logger.info(f"SlotInfo::: {slotInfo}")
    slotInfoDict = asdict(slotInfo)
    slotInfo.slotIndex = -1  # The slot index is injected dynamically
    json.dump(slotInfoDict, open(os.path.join(slotDir, "params.json"), "w"), indent=4)
diff --git a/server/downloader/Downloader.py b/server/downloader/Downloader.py
index 528d29c5b..1ee69eaab 100644
--- a/server/downloader/Downloader.py
+++ b/server/downloader/Downloader.py
@@ -4,13 +4,13 @@
 from downloader.HttpClient import HttpClient
 from tqdm import tqdm
 from threading import Lock
-from mods.log_control import VoiceChangaerLogger
 from xxhash import xxh128
 from utils.hasher import compute_hash
 from const import ASSETS_FILE
 from Exceptions import DownloadVerificationException
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+import logging
+logger = logging.getLogger(__name__)
 
 lock = Lock()
@@ -44,13 +44,13 @@ async def download(params: dict):
         # If hash was provided with the file - verify against provided hash
         if expected_hash is not None:
             if hash == expected_hash:
-                logger.info(f'[Voice Changer] Verified {saveTo}')
+                logger.info(f'Verified {saveTo}')
                 return
         # If hash was not provided - verify against local cache
         elif saveTo in files:
             fhash = files[saveTo]
             if hash == fhash:
-                logger.info(f'[Voice Changer] Verified {saveTo}')
+                logger.info(f'Verified {saveTo}')
                 return
             else:
                 hash = None
diff --git a/server/downloader/SampleDownloader.py b/server/downloader/SampleDownloader.py
index 7f0812c82..e313dcd3a 100644
--- a/server/downloader/SampleDownloader.py
+++ b/server/downloader/SampleDownloader.py
@@ -1,18 +1,17 @@
 import json
 import os
-import sys
 import asyncio
 from typing import Any, Tuple
 
 from const import RVCSampleMode, getSampleJsonAndModelIds
 from data.ModelSample import ModelSamples, generateModelSample
 from data.ModelSlot import ModelSlot, RVCModelSlot
-from mods.log_control import VoiceChangaerLogger
+import logging
 from voice_changer.ModelSlotManager import ModelSlotManager
 from voice_changer.RVC.RVCModelSlotGenerator import RVCModelSlotGenerator
 from downloader.Downloader import download
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 
 async def downloadInitialSamples(mode: RVCSampleMode, model_dir: str):
@@ -86,7 +85,7 @@ async def _downloadSamples(samples: list[ModelSamples], sampleModelIds: list[Tup
                 match = True
                 break
         if match is False:
-            logger.warn(f"[Voice Changer] initiail sample not found. {targetSampleId}")
+            logger.warning(f"Initial sample not found: {targetSampleId}")
             continue
 
         # Once found...
@@ -145,17 +144,17 @@ async def _downloadSamples(samples: list[ModelSamples], sampleModelIds: list[Tup
             slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
             modelSlotManager.save_model_slot(targetSlotIndex, slotInfo)
         else:
-            logger.warn(f"[Voice Changer] {sample.voiceChangerType} is not supported.")
+            logger.warning(f"{sample.voiceChangerType} is not supported.")
 
     # Download
-    logger.info("[Voice Changer] Downloading model files...")
+    logger.info("Downloading model files...")
     tasks: list[asyncio.Task] = []
     for file in downloadParams:
         tasks.append(asyncio.ensure_future(download(file)))
     await asyncio.gather(*tasks)
 
     # Create metadata
-    logger.info("[Voice Changer] Generating metadata...")
+    logger.info("Generating metadata...")
     for targetSlotIndex in slotIndex:
         slotInfo = modelSlotManager.get_slot_info(targetSlotIndex)
         modelPath = os.path.join(model_dir, str(slotInfo.slotIndex), os.path.basename(slotInfo.modelFile))
diff --git a/server/downloader/WeightDownloader.py b/server/downloader/WeightDownloader.py
index 9c655de78..e205e22de 100644
--- a/server/downloader/WeightDownloader.py
+++ b/server/downloader/WeightDownloader.py
@@ -1,14 +1,14 @@
 import asyncio
 
 from downloader.Downloader import download
-from mods.log_control import VoiceChangaerLogger
+import logging
 from settings import ServerSettings
 from Exceptions import WeightDownloadException
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 
 async def downloadWeight(params: ServerSettings):
-    logger.info('[Voice Changer] Loading weights.')
+    logger.info('Loading weights.')
     file_params = [
         # {
         #     "url": "https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/hubert_base.pt",
@@ -82,8 +82,8 @@ async def downloadWeight(params: ServerSettings):
     for res in await asyncio.gather(*tasks, return_exceptions=True):
         if isinstance(res, Exception):
             fail = True
-            logger.error(f'[Voice Changer] {res}')
+            logger.error(res, exc_info=res)
 
     if fail:
         raise WeightDownloadException()
 
-    logger.info('[Voice Changer] All weights are loaded!')
+    logger.info('All weights are loaded!')
diff --git a/server/main.py b/server/main.py
index b992b09cb..c2d228e8b 100644
--- a/server/main.py
+++ b/server/main.py
@@ -2,66 +2,48 @@
 import multiprocessing as mp
 # NOTE: This is required to avoid recursive process call bug for macOS
 mp.freeze_support()
 
-from const import SSL_KEY_DIR, DOTENV_FILE, ROOT_PATH, UPLOAD_DIR, TMP_DIR, get_version, get_edition
+from const import SSL_KEY_DIR, ROOT_PATH, UPLOAD_DIR, TMP_DIR, LOG_FILE, get_version, get_edition
 # NOTE: This is required to fix current working directory on macOS
 os.chdir(ROOT_PATH)
 
 import sys
 import uvicorn
 import asyncio
-import traceback
 import threading
 import socket
 import time
-from dotenv import set_key
+import logging
 from utils.strtobool import strtobool
 from datetime import datetime
-import platform
 import argparse
 from downloader.WeightDownloader import downloadWeight
 from downloader.SampleDownloader import downloadInitialSamples
 from mods.ssl import create_self_signed_cert
 from webbrowser import open_new_tab
 from settings import ServerSettings
-from mods.log_control import VoiceChangaerLogger
 
-VoiceChangaerLogger.get_instance().initialize(initialize=True)
-logger = VoiceChangaerLogger.get_instance().getLogger()
+stream_handler = logging.StreamHandler()
+stream_handler.setLevel(logging.INFO)
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)-15s %(levelname)-8s [%(module)s] %(message)s",
+    handlers=[logging.FileHandler(LOG_FILE), stream_handler]
+)
+logger = logging.getLogger(__name__)
 
 settings = ServerSettings()
 
 
 def setupArgParser():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--logLevel", type=str, default="error", help="Log level info|critical|error. (default: error)")
+    parser.add_argument("--log-level", type=str, default="error", help="Log level info|critical|error.")
     parser.add_argument("--https", type=strtobool, default=False, help="use https")
-    parser.add_argument("--httpsKey", type=str, default="ssl.key", help="path for the key of https")
-    parser.add_argument("--httpsCert", type=str, default="ssl.cert", help="path for the cert of https")
-    parser.add_argument("--httpsSelfSigned", type=strtobool, default=True, help="generate self-signed certificate")
+    parser.add_argument("--https-key", type=str, default="ssl.key", help="path for the key of https")
+    parser.add_argument("--https-cert", type=str, default="ssl.cert", help="path for the cert of https")
+    parser.add_argument("--https-self-signed", type=strtobool, default=True, help="generate self-signed certificate")
 
     return parser
 
 
-def printMessage(message, level=0):
-    pf = platform.system()
-    if pf == "Windows":
-        if level == 0:
-            message = f"{message}"
-        elif level == 1:
-            message = f"    {message}"
-        elif level == 2:
-            message = f"    {message}"
-        else:
-            message = f"    {message}"
-    else:
-        if level == 0:
-            message = f"\033[17m{message}\033[0m"
-        elif level == 1:
-            message = f"\033[34m    {message}\033[0m"
-        elif level == 2:
-            message = f"\033[32m    {message}\033[0m"
-        else:
-            message = f"\033[47m    {message}\033[0m"
-    logger.info(message)
-
-
 def check_port(port) -> int:
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
         sock.bind(("127.0.0.1", port))
@@ -73,13 +55,13 @@ def wait_for_server(proto: str, launch_browser: bool):
         result = sock.connect_ex(('127.0.0.1', settings.port))
         if result == 0:
             break
-    print('-' * 8)
-    print(f"The server is listening on {proto}://{settings.host}:{settings.port}/")
-    print('-' * 8)
+    logger.info('-' * 8)
+    logger.info(f"The server is listening on {proto}://{settings.host}:{settings.port}/")
+    logger.info('-' * 8)
     if launch_browser:
         open_new_tab(f'{proto}://127.0.0.1:{settings.port}')
-async def runServer(host: str, port: int, launch_browser: bool = False, logLevel: str = 'critical', key_path: str | None = None, cert_path: str | None = None):
+async def runServer(host: str, port: int, launch_browser: bool = False, log_level: str = 'error', key_path: str | None = None, cert_path: str | None = None):
     check_port(port)
 
     config = uvicorn.Config(
@@ -89,7 +71,7 @@ async def runServer(host: str, port: int, launch_browser: bool = False, logLevel
         reload=False,
         ssl_keyfile=key_path,
         ssl_certfile=cert_path,
-        log_level=logLevel
+        log_level=log_level
     )
     server = uvicorn.Server(config)
@@ -101,14 +83,8 @@ async def runServer(host: str, port: int, launch_browser: bool = False, logLevel
 
 async def main(args):
     logger.debug(args)
 
-    if not os.path.exists(DOTENV_FILE):
-        for key, value in settings.model_dump().items():
-            set_key(DOTENV_FILE, key.upper(), str(value))
-
-    printMessage(f"Python: {sys.version}", level=2)
-    printMessage(f"Voice changer version: {get_version()} {get_edition()}", level=2)
-    # printMessage("Voice Changerを起動しています。", level=2)
-    printMessage("Activating the Voice Changer.", level=2)
+    logger.info(f"Python: {sys.version}")
+    logger.info(f"Voice changer version: {get_version()} {get_edition()}")
 
     # Download weights
     await downloadWeight(settings)
 
@@ -116,8 +92,8 @@ async def main(args):
     try:
         await downloadInitialSamples(settings.sample_mode, settings.model_dir)
     except Exception as e:
-        print(traceback.format_exc())
-        printMessage(f"Failed to download samples. Reason: {e}", level=2)
+        logger.error("Failed to download samples.")
+        logger.exception(e)
 
     # FIXME: Need to refactor samples download logic
     os.makedirs(settings.model_dir, exist_ok=True)
     os.makedirs(UPLOAD_DIR, exist_ok=True)
     os.makedirs(TMP_DIR, exist_ok=True)
 
     # Create HTTPS key/cert
-    if args.https and args.httpsSelfSigned:
+    if args.https and args.https_self_signed:
         # HTTPS (generate self-signed certificate)
         os.makedirs(SSL_KEY_DIR, exist_ok=True)
         key_base_name = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}"
@@ -145,24 +121,23 @@ async def main(args):
         )
         key_path = os.path.join(SSL_KEY_DIR, keyname)
         cert_path = os.path.join(SSL_KEY_DIR, certname)
-        printMessage(f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1)
+        logger.info(f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}")
 
-    elif args.https and not args.httpsSelfSigned:
+    elif args.https and not args.https_self_signed:
         # HTTPS
-        key_path = args.httpsKey
-        cert_path = args.httpsCert
-        printMessage(f"protocol: HTTPS, key:{key_path}, cert:{cert_path}", level=1)
+        key_path = args.https_key
+        cert_path = args.https_cert
+        logger.info(f"protocol: HTTPS, key:{key_path}, cert:{cert_path}")
     else:
         # HTTP
-        printMessage("protocol: HTTP", level=1)
-        printMessage("-- ---- -- ", level=1)
+        logger.info("protocol: HTTP")
 
     # Start server
     if args.https:
         # Start HTTPS server
-        await runServer(settings.host, settings.port, args.launch_browser, args.logLevel, key_path, cert_path)
+        await runServer(settings.host, settings.port, args.launch_browser, args.log_level, key_path, cert_path)
     else:
-        await runServer(settings.host, settings.port, args.launch_browser, args.logLevel)
+        await runServer(settings.host, settings.port, args.launch_browser, args.log_level)
 
 
 if __name__ == "__main__":
@@ -170,10 +145,8 @@ async def main(args):
     args, _ = parser.parse_known_args()
     args.launch_browser = False
 
-    printMessage(f"Booting PHASE :{__name__}", level=2)
-
     try:
         asyncio.run(main(args))
     except Exception as e:
-        print(traceback.format_exc())
+        logger.exception(e)
         input('Press Enter to continue...')
diff --git a/server/mods/log_control.py b/server/mods/log_control.py
deleted file mode 100644
index 768a24e04..000000000
--- a/server/mods/log_control.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import logging
-import traceback
-import sys
-from const import LOG_FILE
-
-
-class UvicornSuppressFilter(logging.Filter):
-    def filter(self, record):
-        return False
-
-
-class NullHandler(logging.Handler):
-    def emit(self, record):
-        pass
-
-
-class DebugStreamHandler(logging.StreamHandler):
-    def emit(self, record):
-        try:
-            super().emit(record)
-        except Exception as e:
-            print(f"Error logging message: {e}", file=sys.stderr)
-            traceback.print_exc()
-
-
-class DebugFileHandler(logging.FileHandler):
-    def emit(self, record):
-        try:
-            super().emit(record)
-        except Exception as e:
-            print(f"Error writing log message to file: {e}", file=sys.stderr)
-            traceback.print_exc()
-
-
-class VoiceChangaerLogger:
-    _instance = None
-
-    @classmethod
-    def get_instance(cls):
-        if cls._instance is None:
-            cls._instance = cls()
-        return cls._instance
-
-    def __init__(self):
-        # logger = logging.getLogger("uvicorn.error")
-        # logger.addFilter(UvicornSuppressFilter())
-
-        # logging.basicConfig(filename='myapp.log', level=logging.INFO)
-        # logging.basicConfig(level=logging.NOTSET)
-        logging.root.handlers = [NullHandler()]
-
-        logger = logging.getLogger("fairseq.tasks.hubert_pretraining")
-        logger.addFilter(UvicornSuppressFilter())
-
-        logger = logging.getLogger("fairseq.models.hubert.hubert")
-        logger.addFilter(UvicornSuppressFilter())
-
-        logger = logging.getLogger("fairseq.tasks.text_to_speech")
-        logger.addFilter(UvicornSuppressFilter())
-
-        logger = logging.getLogger("numba.core.ssa")
-        logger.addFilter(UvicornSuppressFilter())
-
-        logger = logging.getLogger("numba.core.interpreter")
-        logger.addFilter(UvicornSuppressFilter())
-
-        logger = logging.getLogger("numba.core.byteflow")
-        logger.addFilter(UvicornSuppressFilter())
-
-        # logger.propagate = False
-
-        logger = logging.getLogger("multipart.multipart")
-        logger.propagate = False
-
-        logging.getLogger("asyncio").setLevel(logging.WARNING)
-
-        logger = logging.getLogger("vcclient")
-        logger.setLevel(logging.DEBUG)
-        self.logger = logger
-
-    def initialize(self, initialize: bool):
-        if not self.logger.handlers:
-            if initialize:
-                # file_handler = logging.FileHandler(LOG_FILE, encoding="utf-8", mode="w")
-                file_handler = DebugFileHandler(LOG_FILE, encoding="utf-8", mode="w")
-            else:
-                # file_handler = logging.FileHandler(LOG_FILE, encoding="utf-8")
-                file_handler = DebugFileHandler(LOG_FILE, encoding="utf-8")
-            file_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(process)d - %(message)s")
-            file_handler.setFormatter(file_formatter)
-            file_handler.setLevel(logging.DEBUG)
-            self.logger.addHandler(file_handler)
-
-            stream_formatter = logging.Formatter("%(message)s")
-            # stream_handler = logging.StreamHandler()
-            stream_handler = DebugStreamHandler()
-            stream_handler.setFormatter(stream_formatter)
-            stream_handler.setLevel(logging.INFO)
-            self.logger.addHandler(stream_handler)
-
-    def getLogger(self):
-        return self.logger
diff --git a/server/restapi/MMVC_Rest.py b/server/restapi/MMVC_Rest.py
index bc6cd0873..28cb260a9 100644
--- a/server/restapi/MMVC_Rest.py
+++ b/server/restapi/MMVC_Rest.py
@@ -1,18 +1,19 @@
+import logging
+
 from restapi.mods.trustedorigin import TrustedOriginMiddleware
 from fastapi import FastAPI, Request, Response, HTTPException
 from fastapi.routing import APIRoute
 from fastapi.staticfiles import StaticFiles
 from fastapi.exceptions import RequestValidationError
 from typing import Callable, Optional, Sequence
 
-from mods.log_control import VoiceChangaerLogger
 from voice_changer.VoiceChangerManager import VoiceChangerManager
 from restapi.MMVC_Rest_Hello import MMVC_Rest_Hello
 from restapi.MMVC_Rest_VoiceChanger import MMVC_Rest_VoiceChanger
 from restapi.MMVC_Rest_Fileuploader import MMVC_Rest_Fileuploader
-from const import UPLOAD_DIR, FRONTEND_DIR, TMP_DIR
+from const import UPLOAD_DIR, TMP_DIR
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 
 class ValidationErrorLoggingRoute(APIRoute):
@@ -22,10 +23,10 @@ def get_route_handler(self) -> Callable:
         async def custom_route_handler(request: Request) -> Response:
             try:
                 return await original_route_handler(request)
-            except RequestValidationError as exc:  # type: ignore
-                print("Exception", request.url, str(exc))
+            except RequestValidationError as e:  # type: ignore
+                logger.exception(e)
                 body = await request.body()
-                detail = {"errors": exc.errors(), "body": body.decode()}
+                detail = {"errors": e.errors(), "body": body.decode()}
                 raise HTTPException(status_code=422, detail=detail)
 
         return custom_route_handler
@@ -43,7 +44,7 @@ def get_instance(
         port: Optional[int] = None,
     ):
         if cls._instance is None:
-            logger.info("[Voice Changer] MMVC_Rest initializing...")
+            logger.info("Initializing...")
             app_fastapi = FastAPI()
             app_fastapi.router.route_class = ValidationErrorLoggingRoute
             app_fastapi.add_middleware(
@@ -69,7 +70,7 @@ def get_instance(
             app_fastapi.include_router(fileUploader.router)
 
             cls._instance = app_fastapi
-            logger.info("[Voice Changer] MMVC_Rest initializing... done.")
+            logger.info("Initialized.")
             return cls._instance
 
         return cls._instance
diff --git a/server/restapi/MMVC_Rest_Fileuploader.py b/server/restapi/MMVC_Rest_Fileuploader.py
index 5e8a00eac..f53c5e1b5 100644
--- a/server/restapi/MMVC_Rest_Fileuploader.py
+++ b/server/restapi/MMVC_Rest_Fileuploader.py
@@ -1,5 +1,4 @@
 import json
-import os
 from typing import Union
 from fastapi import APIRouter
 from fastapi.encoders import jsonable_encoder
@@ -11,7 +10,8 @@
 from const import UPLOAD_DIR
 from voice_changer.utils.LoadModelParams import LoadModelParamFile, LoadModelParams
 
-
+import logging
+logger = logging.getLogger(__name__)
 
 
 class MMVC_Rest_Fileuploader:
     def __init__(self, voiceChangerManager: VoiceChangerManager):
@@ -33,7 +33,7 @@ def post_upload_file(self, file: UploadFile, filename: str = Form(...)):
             json_compatible_item_data = jsonable_encoder(res)
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
-            print("[Voice Changer] post_upload_file ex:", e)
+            logger.exception(e)
 
     def get_info(self):
         try:
@@ -41,7 +41,7 @@
             json_compatible_item_data = jsonable_encoder(info)
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
-            print("[Voice Changer] get_info ex:", e)
+            logger.exception(e)
 
     def post_update_settings(self, key: str = Form(...), val: Union[int, str, float] = Form(...)):
         try:
@@ -49,9 +49,7 @@
             json_compatible_item_data = jsonable_encoder(info)
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
-            print("[Voice Changer] post_update_settings ex:", e)
-            import traceback
-            traceback.print_exc()
+            logger.exception(e)
 
     async def post_load_model(
         self,
@@ -61,18 +59,16 @@
     ):
         try:
             paramDict = json.loads(params)
-            print("paramDict", paramDict)
+            logger.info(f"paramDict: {paramDict}")
 
             loadModelparams = LoadModelParams(**paramDict)
             loadModelparams.files = [LoadModelParamFile(**x) for x in paramDict["files"]]
-            # print("paramDict", loadModelparams)
+            # logger.info(f"paramDict: {loadModelparams}")
 
             info = await self.voiceChangerManager.load_model(loadModelparams)
             json_compatible_item_data = jsonable_encoder(info)
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
-            print("[Voice Changer] post_load_model ex:", e)
-            import traceback
-            traceback.print_exc()
+            logger.exception(e)
 
     def get_onnx(self):
         try:
@@ -80,20 +76,16 @@
             json_compatible_item_data = jsonable_encoder(info)
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
-            print("[Voice Changer] get_onnx ex:", e)
-            import traceback
-            traceback.print_exc()
+            logger.exception(e)
 
     async def post_merge_models(self, request: str = Form(...)):
         try:
-            print(request)
+            logger.info(request)
             info = await self.voiceChangerManager.merge_models(request)
             json_compatible_item_data = jsonable_encoder(info)
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
-            print("[Voice Changer] post_merge_models ex:", e)
-            import traceback
-            traceback.print_exc()
+            logger.exception(e)
 
     def post_update_model_default(self):
         try:
@@ -101,9 +93,7 @@
             json_compatible_item_data = jsonable_encoder(info)
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
-            print("[Voice Changer] post_update_model_default ex:", e)
-            import traceback
-            traceback.print_exc()
+            logger.exception(e)
 
     def post_update_model_info(self, newData: str = Form(...)):
         try:
@@ -111,7 +101,7 @@
             json_compatible_item_data = jsonable_encoder(info)
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
-            print("[Voice Changer] post_update_model_info ex:", e)
+            logger.exception(e)
 
     def post_upload_model_assets(self, params: str = Form(...)):
         try:
@@ -119,4 +109,4 @@
             json_compatible_item_data = jsonable_encoder(info)
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
-            print("[Voice Changer] post_update_model_info ex:", e)
+            logger.exception(e)
diff --git a/server/restapi/MMVC_Rest_VoiceChanger.py b/server/restapi/MMVC_Rest_VoiceChanger.py
index d8e8faa13..4fe2952b5 100644
--- a/server/restapi/MMVC_Rest_VoiceChanger.py
+++ b/server/restapi/MMVC_Rest_VoiceChanger.py
@@ -1,7 +1,5 @@
 import base64
 import numpy as np
-import traceback
-import os
 
 from fastapi import APIRouter
 from fastapi.encoders import jsonable_encoder
@@ -9,8 +7,9 @@
 from const import get_edition, get_version
 from voice_changer.VoiceChangerManager import VoiceChangerManager
 from pydantic import BaseModel
-import threading
 
+import logging
+logger = logging.getLogger(__name__)
 
 
 class VoiceModel(BaseModel):
     timestamp: int
@@ -25,8 +24,6 @@ def __init__(self, voiceChangerManager: VoiceChangerManager):
         self.router.add_api_route("/edition", self.edition, methods=["GET"])
         self.router.add_api_route("/version", self.version, methods=["GET"])
 
-        self.tlock = threading.Lock()
-
     def edition(self):
         return PlainTextResponse(get_edition())
 
@@ -42,9 +39,7 @@ def test(self, voice: VoiceModel):
 
             unpackedData = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768
 
-            self.tlock.acquire()
             out_audio, perf, err = self.voiceChangerManager.changeVoice(unpackedData)
-            self.tlock.release()
             out_audio = (out_audio * 32767).astype(np.int16).tobytes()
 
             if err is not None:
@@ -66,7 +61,12 @@ def test(self, voice: VoiceModel):
                 }))
 
         except Exception as e:
-            print("REQUEST PROCESSING!!!! EXCEPTION!!!", e)
-            print(traceback.format_exc())
-            self.tlock.release()
-            return str(e)
+            logger.exception(e)
+            return JSONResponse(content=jsonable_encoder({
+                "error": True,
+                "timestamp": 0,
+                "details": {
+                    "code": "GENERIC_REST_SERVER_ERROR",
+                    "message": "Check command line for more details.",
+                },
+            }))
diff --git a/server/sio/MMVC_Namespace.py b/server/sio/MMVC_Namespace.py
index bbb3d219d..5509c699b 100644
--- a/server/sio/MMVC_Namespace.py
+++ b/server/sio/MMVC_Namespace.py
@@ -5,6 +5,8 @@
 
 import asyncio
 
+import logging
+logger = logging.getLogger(__name__)
 
 
 class MMVC_Namespace(socketio.AsyncNamespace):
     sid: int = 0
@@ -38,8 +40,7 @@ def get_instance(cls, voiceChangerManager: VoiceChangerManager):
 
     def on_connect(self, sid, environ):
         self.sid = sid
-        print("[{}] connet sid : {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), sid))
-        pass
+        logger.info(f"Connected SID: {sid}")
 
     async def on_request_message(self, sid, msg):
         self.sid = sid
@@ -57,5 +58,4 @@ async def on_request_message(self, sid, msg):
             await self.emit("response", [timestamp, out_audio, perf], to=sid)
 
     def on_disconnect(self, sid):
-        # print('[{}] disconnect'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
-        pass
+        logger.info(f"Disconnected SID: {sid}")
diff --git a/server/sio/MMVC_SocketIOApp.py b/server/sio/MMVC_SocketIOApp.py
index a044aeb99..055b10a93 100644
--- a/server/sio/MMVC_SocketIOApp.py
+++ b/server/sio/MMVC_SocketIOApp.py
@@ -1,5 +1,5 @@
 import socketio
-from mods.log_control import VoiceChangaerLogger
+import logging
 from mods.origins import compute_local_origins, normalize_origins
 from typing import Sequence, Optional
 
@@ -7,7 +7,7 @@
 from voice_changer.VoiceChangerManager import VoiceChangerManager
 from const import FRONTEND_DIR
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 
 class MMVC_SocketIOApp:
@@ -22,7 +22,7 @@ def get_instance(
         port: Optional[int] = None,
     ):
         if cls._instance is None:
-            logger.info("[Voice Changer] MMVC_SocketIOApp initializing...")
+            logger.info("Initializing...")
 
             allowed_origins: set[str] = set()
             if '*' in allowedOrigins:
@@ -65,7 +65,7 @@ def get_instance(
             )
 
             cls._instance = app_socketio
-            logger.info("[Voice Changer] MMVC_SocketIOApp initializing... done.")
+            logger.info("Initialized.")
             return cls._instance
 
         return cls._instance
diff --git a/server/voice_changer/IORecorder.py b/server/voice_changer/IORecorder.py
index 5cf946c65..e13821240 100644
--- a/server/voice_changer/IORecorder.py
+++ b/server/voice_changer/IORecorder.py
@@ -1,8 +1,8 @@
 import wave
 import os
 
-from mods.log_control import VoiceChangaerLogger
+import logging
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 
 class IORecorder:
@@ -22,10 +22,10 @@ def __init__(self, inputFilename: str, outputFilename: str, inputSamplingRate: i
 
     def _clearFile(self, filename: str):
         if os.path.exists(filename):
-            logger.info(f"[IORecorder] delete old analyze file. {filename}")
+            logger.info(f"Removing old analyze file: {filename}")
             os.remove(filename)
         else:
-            logger.info(f"[IORecorder] old analyze file not exist. {filename}")
+            logger.info(f"Old analyze file does not exist: {filename}")
 
     def writeInput(self, wav):
         self.fi.writeframes(wav)
diff --git a/server/voice_changer/Local/AudioDeviceList.py b/server/voice_changer/Local/AudioDeviceList.py
index 6c145efe4..eb8c057b6 100644
--- a/server/voice_changer/Local/AudioDeviceList.py
+++ b/server/voice_changer/Local/AudioDeviceList.py
@@ -4,11 +4,11 @@
 import numpy as np
 from const import ServerAudioDeviceType
-from mods.log_control import VoiceChangaerLogger
+import logging
 
 # from const import SERVER_DEVICE_SAMPLE_RATES
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 
 @dataclass
@@ -39,7 +39,7 @@ def checkSamplingRate(deviceId: int, desiredSamplingRate: int, type: ServerAudio
                 pass
             return True
         except Exception as e:  # NOQA
-            print("[checkSamplingRate]", e)
+            logger.warning(f"[checkSamplingRate] {e}")
             return False
     else:
         try:
@@ -52,7 +52,7 @@ def checkSamplingRate(deviceId: int, desiredSamplingRate: int, type: ServerAudio
                 pass
             return True
         except Exception as e:  # NOQA
-            print("[checkSamplingRate]", e)
+            logger.warning(f"[checkSamplingRate] {e}")
             return False
 
 
@@ -60,7 +60,6 @@ def list_audio_device():
     try:
         audioDeviceList = sd.query_devices()
     except Exception as e:
-        logger.error("[Voice Changer] ex:query_devices")
         logger.exception(e)
         raise e
 
@@ -68,10 +67,6 @@ def list_audio_device():
     outputAudioDeviceList = [d for d in audioDeviceList if d["max_output_channels"] > 0]
     hostapis = sd.query_hostapis()
 
-    # print("input:", inputAudioDeviceList)
-    # print("output:", outputDeviceList)
-    # print("hostapis", hostapis)
-
     serverAudioInputDevices: list[ServerAudioDevice] = []
     serverAudioOutputDevices: list[ServerAudioDevice] = []
     for d in inputAudioDeviceList:
@@ -97,20 +92,4 @@ def list_audio_device():
         )
         serverAudioOutputDevices.append(serverOutputAudioDevice)
 
-    # print("check sample rate1")
-    # for d in serverAudioInputDevices:
-    #     print("check sample rate1-1")
-    #     for sr in SERVER_DEVICE_SAMPLE_RATES:
-    #         print("check sample rate1-2")
-    #         if checkSamplingRate(d.index, sr, "input"):
-    #             d.available_samplerates.append(sr)
-    # print("check sample rate2")
-    # for d in serverAudioOutputDevices:
-    #     print("check sample rate2-1")
-    #     for sr in SERVER_DEVICE_SAMPLE_RATES:
-    #         print("check sample rate2-2")
-    #         if checkSamplingRate(d.index, sr, "output"):
-    #             d.available_samplerates.append(sr)
-    # print("check sample rate3")
-
     return serverAudioInputDevices, serverAudioOutputDevices
diff --git a/server/voice_changer/Local/ServerDevice.py b/server/voice_changer/Local/ServerDevice.py
index 59806227a..6cefe25e5 100644
--- a/server/voice_changer/Local/ServerDevice.py
+++ b/server/voice_changer/Local/ServerDevice.py
@@ -2,7 +2,7 @@
 from const import SERVER_DEVICE_SAMPLE_RATES
 
 from queue import Queue
-from mods.log_control import VoiceChangaerLogger
+import logging
 from voice_changer.VoiceChangerSettings import VoiceChangerSettings
 from voice_changer.Local.AudioDeviceList import checkSamplingRate, list_audio_device
 import time
@@ -13,7 +13,7 @@
 from typing import Protocol
 from typing import Union
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 
 class ServerDeviceCallbacks(Protocol):
     def on_request(self, unpackedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
@@ -74,7 +74,7 @@ def audio_stream_callback(self, indata: np.ndarray, outdata: np.ndarray, frames,
             outputChannels = outdata.shape[1]
             outdata[:] = (np.repeat(out_wav, outputChannels).reshape(-1, outputChannels) * self.settings.serverOutputAudioGain)
         except Exception as e:
-            print("[Voice Changer] ex:", e)
+            logger.exception(e)
 
     def audio_stream_callback_mon_queue(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
         try:
@@ -83,7 +83,7 @@ def audio_stream_callback_mon_queue(self, indata: np.ndarray, outdata: np.ndarra
             outputChannels = outdata.shape[1]
             outdata[:] = (np.repeat(out_wav, outputChannels).reshape(-1, outputChannels) * self.settings.serverOutputAudioGain)
         except Exception as e:
-            print("[Voice Changer] ex:", e)
+            logger.exception(e)
 
     def audio_monitor_callback(self, outdata: np.ndarray, frames, times, status):
         try:
@@ -93,9 +93,7 @@ def audio_monitor_callback(self, outdata: np.ndarray, frames, times, status):
             outputChannels = outdata.shape[1]
             outdata[:] = (np.repeat(mon_wav, outputChannels).reshape(-1, outputChannels) * self.settings.serverMonitorAudioGain)
         except Exception as e:
-            print("[Voice Changer][ServerDevice][audioMonitor_callback] ex:", e)
-            # import traceback
-            # traceback.print_exc()
+            logger.exception(e)
 
     ###########################################
     # Main Loop Section
     ###########################################
@@ -106,9 +104,6 @@ def run_no_monitor(self, block_frame: int, inputMaxChannel: int, outputMaxChanne
         ):
             while self.stream_loop:
                 time.sleep(2)
-                print(f"[Voice Changer] server audio performance {self.performance}")
-                print(f"                input  : id:{self.settings.serverInputDeviceId}, sr:{self.settings.serverInputAudioSampleRate}, ch:{inputMaxChannel}")
-                print(f"                output : id:{self.settings.serverOutputDeviceId}, sr:{self.settings.serverOutputAudioSampleRate}, ch:{outputMaxChannel}")
 
     def run_with_monitor(self, block_frame: int, inputMaxChannel: int, outputMaxChannel: int, monitorMaxChannel: int, inputExtraSetting, outputExtraSetting, monitorExtraSetting):
         with (
@@ -117,10 +112,6 @@ def run_with_monitor(self, block_frame: int, inputMaxChannel: int, outputMaxChan
         ):
             while self.stream_loop:
                 time.sleep(2)
-                print(f"[Voice Changer] server audio performance {self.performance}")
-                print(f"                input  : id:{self.settings.serverInputDeviceId}, sr:{self.settings.serverInputAudioSampleRate}, ch:{inputMaxChannel}")
-                print(f"                output : id:{self.settings.serverOutputDeviceId}, sr:{self.settings.serverOutputAudioSampleRate}, ch:{outputMaxChannel}")
-                print(f"                monitor: id:{self.settings.serverMonitorDeviceId}, sr:{self.settings.serverMonitorAudioSampleRate}, ch:{monitorMaxChannel}")
 
     ###########################################
     # Start Section
     ###########################################
@@ -151,14 +142,14 @@ def start(self):
                 if serverMonitorAudioDevice and "WASAPI" in serverMonitorAudioDevice.hostAPI:
                     monitorExtraSetting = sd.WasapiSettings(exclusive=bool(self.settings.exclusiveMode))
 
-                print("Devices:")
-                print("  [Input]:", serverInputAudioDevice, inputExtraSetting)
-                print("  [Output]:", serverOutputAudioDevice, outputExtraSetting)
-                print("  [Monitor]:", serverMonitorAudioDevice, monitorExtraSetting)
+                logger.info("Devices:")
+                logger.info(f"  [Input]: {serverInputAudioDevice} {inputExtraSetting}")
+                logger.info(f"  [Output]: {serverOutputAudioDevice} {outputExtraSetting}")
+                logger.info(f"  [Monitor]: {serverMonitorAudioDevice} {monitorExtraSetting}")
 
                 # Sleep for a bit if a device is missing
                 if serverInputAudioDevice is None or serverOutputAudioDevice is None:
-                    print("serverInputAudioDevice or serverOutputAudioDevice is None")
+                    logger.error("serverInputAudioDevice or serverOutputAudioDevice is None")
                     time.sleep(2)
                     continue
 
@@ -173,15 +164,15 @@ def start(self):
                 outputAudioSampleRateAvailable = checkSamplingRate(self.settings.serverOutputDeviceId, self.settings.serverOutputAudioSampleRate, "output")
                 monitorAudioSampleRateAvailable = checkSamplingRate(self.settings.serverMonitorDeviceId, self.settings.serverMonitorAudioSampleRate, "output") if serverMonitorAudioDevice else True
 
-                print("Sample Rate:")
-                print(f"  [Input]: {self.settings.serverInputAudioSampleRate} -> {inputAudioSampleRateAvailable}")
-                print(f"  [Output]: {self.settings.serverOutputAudioSampleRate} -> {outputAudioSampleRateAvailable}")
+                logger.info("Sample Rate:")
+                logger.info(f"  [Input]: {self.settings.serverInputAudioSampleRate} -> {inputAudioSampleRateAvailable}")
+                logger.info(f"  [Output]: {self.settings.serverOutputAudioSampleRate} -> {outputAudioSampleRateAvailable}")
                 if serverMonitorAudioDevice is not None:
-                    print(f"  [Monitor]: {self.settings.serverMonitorAudioSampleRate} -> {monitorAudioSampleRateAvailable}")
+                    logger.info(f"  [Monitor]: {self.settings.serverMonitorAudioSampleRate} -> {monitorAudioSampleRateAvailable}")
 
                 if not inputAudioSampleRateAvailable or not outputAudioSampleRateAvailable or not monitorAudioSampleRateAvailable:
-                    print("Sample Rate is not supported by device:")
-                    print("Checking Available Sample Rate:")
+                    logger.error("Sample Rate is not supported by device:")
+                    logger.info("Checking Available Sample Rate:")
                     availableInputSampleRate = []
                     availableOutputSampleRate = []
                     availableMonitorSampleRate = []
@@ -193,20 +184,17 @@ def start(self):
                         if serverMonitorAudioDevice is not None:
                             if checkSamplingRate(self.settings.serverMonitorDeviceId, sr, "output"):
                                 availableMonitorSampleRate.append(sr)
-                    print("Available Sample Rate:")
-                    print(f"  [Input]: {availableInputSampleRate}")
-                    print(f"  [Output]: {availableOutputSampleRate}")
+                    logger.info("Available Sample Rate:")
+                    logger.info(f"  [Input]: {availableInputSampleRate}")
+                    logger.info(f"  [Output]: {availableOutputSampleRate}")
                     if serverMonitorAudioDevice is not None:
-                        print(f"  [Monitor]: {availableMonitorSampleRate}")
-
-                    print("continue... ")
+                        logger.info(f"  [Monitor]: {availableMonitorSampleRate}")
                     time.sleep(2)
                     continue
 
-                # Calculate block size
+                # FIXME: In UI, block size is calculated based on 48kHz so we convert from 48kHz to input device sample rate.
                 block_frame = int((self.settings.serverReadChunkSize * 128 / 48000) * self.settings.serverInputAudioSampleRate)
 
-                # main loop
                 try:
                     self.stream_loop = True
                     if serverMonitorAudioDevice is None:
@@ -214,10 +202,7 @@ def start(self):
                     else:
                         self.run_with_monitor(block_frame, serverInputAudioDevice.maxInputChannels, serverOutputAudioDevice.maxOutputChannels, serverMonitorAudioDevice.maxOutputChannels, inputExtraSetting, outputExtraSetting, monitorExtraSetting)
                 except Exception as e:
-                    print("[Voice Changer] processing, ex:", e)
-                    import traceback
-
-                    traceback.print_exc()
+                    logger.exception(e)
                     time.sleep(2)
 
     ###########################################
@@ -230,7 +215,7 @@ def get_info(self):
             self.serverAudioInputDevices = audioinput
             self.serverAudioOutputDevices = audiooutput
         except Exception as e:
-            print(e)
+            logger.exception(e)
 
         data["serverAudioInputDevices"] = self.serverAudioInputDevices
         data["serverAudioOutputDevices"] = self.serverAudioOutputDevices
diff --git a/server/voice_changer/ModelSlotManager.py b/server/voice_changer/ModelSlotManager.py
index ae198e7c8..f50c00a2a 100644
--- a/server/voice_changer/ModelSlotManager.py
+++ b/server/voice_changer/ModelSlotManager.py
@@ -4,9 +4,9 @@
 import os
 import shutil
 
-from mods.log_control import VoiceChangaerLogger
+import logging
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 
 class ModelSlotManager:
@@ -15,7 +15,6 @@ class ModelSlotManager:
     def __init__(self, model_dir: str):
         self.model_dir = model_dir
         self.modelSlots = loadAllSlotInfo(self.model_dir)
-        logger.debug(f"[MODEL SLOT INFO] {self.modelSlots}")
 
     @classmethod
     def get_instance(cls, model_dir: str):
@@ -42,7 +41,7 @@ def save_model_slot(self, slotIndex: int, slotInfo: ModelSlots):
         self._save_model_slot(slotIndex, slotInfo)
 
     def update_model_info(self, newData: str):
-        logger.info(f"[Voice Changer] UPDATE MODEL INFO, {newData}")
+        logger.info(f"UPDATE MODEL INFO: {newData}")
         newDataDict = json.loads(newData)
         slotInfo = self._load_model_slot(newDataDict["slot"])
         if newDataDict["key"] == "speakers":
@@ -65,5 +64,4 @@ def store_model_assets(self, params: str):
                 setattr(slotInfo, paramsDict["name"], storePath)
                 self._save_model_slot(paramsDict["slot"], slotInfo)
         except Exception as e:
-            logger.info(f"[Voice Changer] Exception: {e}")
-            logger.error(e)
+            logger.exception(e)
diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
deleted file mode 100644
index ed4ec1023..000000000
--- a/server/voice_changer/RVC/RVC.py
+++ /dev/null
@@ -1,243 +0,0 @@
-from dataclasses import asdict
-import numpy as np
-import torch
-import torchaudio
-from data.ModelSlot import RVCModelSlot
-from mods.log_control import VoiceChangaerLogger
-
-from voice_changer.VoiceChangerSettings import VoiceChangerSettings
-from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
-from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
-from settings import ServerSettings
-from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
-from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
-from voice_changer.RVC.pipeline.PipelineGenerator import createPipeline
-from voice_changer.common.deviceManager.DeviceManager import DeviceManager
-from voice_changer.RVC.pipeline.Pipeline import Pipeline
-
-from Exceptions import PipelineCreateException, PipelineNotInitializedException
-
-logger = VoiceChangaerLogger.get_instance().getLogger()
-
-
-class RVC(VoiceChangerModel):
-    def __init__(self, params: ServerSettings, slotInfo: RVCModelSlot, settings: VoiceChangerSettings):
-        logger.info("[Voice Changer] [RVC] Creating instance ")
-        self.deviceManager = DeviceManager.get_instance()
-        EmbedderManager.initialize(params)
-        PitchExtractorManager.initialize(params)
-        self.settings = settings
-        self.params = params
-        # self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
-
-        self.pipeline: Pipeline | None = None
-
-        self.audio_buffer: AudioInOut | None = None
-        self.pitchf_buffer: PitchfInOut | None = None
-        self.feature_buffer: FeatureInOut | None = None
-        self.prevVol = 0.0
-        self.slotInfo = slotInfo
-        # self.initialize()
-
-    def initialize(self):
-        logger.info("[Voice Changer] [RVC] Initializing... ")
-
-        # pipelineの生成
-        try:
-            self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector)
-        except PipelineCreateException as e:  # NOQA
-            logger.error("[Voice Changer] pipeline create failed. check your model is valid.")
-            return
-
-        # その他の設定
-        self.settings.set_properties({
-            'tran': self.slotInfo.defaultTune,
-            'formantShift': self.slotInfo.defaultFormantShift,
-            'indexRatio': self.slotInfo.defaultIndexRatio,
-            'protect': self.slotInfo.defaultProtect
-        })
-        logger.info("[Voice Changer] [RVC] Initializing... done")
-
-    def update_settings(self, key: str, val, old_val):
-        logger.info(f"[Voice Changer][RVC]: update_settings {key}:{val}")
-        if key in self.settings.intData:
-            setattr(self.settings, key, int(val))
-            if key == "gpu":
-                self.initialize()
-        elif key in self.settings.floatData:
-            setattr(self.settings, key, float(val))
-        elif key in self.settings.strData:
-            setattr(self.settings, key, str(val))
-            if key == "f0Detector" and self.pipeline is not None:
-                pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
-                self.pipeline.setPitchExtractor(pitchExtractor)
-        else:
-            return False
-        return True
-
-    def get_info(self):
-        data = {}
-        if self.pipeline is not None:
-            pipelineInfo = self.pipeline.getPipelineInfo()
-            data["pipelineInfo"] = pipelineInfo
-        else:
-            data["pipelineInfo"] = "None"
-        return data
-
-    def get_processing_sampling_rate(self):
-        return self.slotInfo.samplingRate
-
-    def generate_input(
-        self,
-        newData: AudioInOut,
-        inputSize: int,
-        crossfadeSize: int,
-        solaSearchFrame: int = 0,
-    ):
-        newData = newData.astype(np.float32) / 32768.0  # RVCのモデルのサンプリングレートで入ってきている。(extraDataLength, Crossfade等も同じSRで処理)(★1)
-        # ↑newData.shape[0]//sampleRate でデータ秒数。これに16000かけてhubertの世界でのデータ長。これにhop数(160)でわるとfeatsのデータサイズになる。
-        new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate
-        if self.audio_buffer is not None:
-            # 過去のデータに連結
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
-            if self.slotInfo.f0:
-                self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0)
-            self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0)
-        else:
-            self.audio_buffer = newData
-            if self.slotInfo.f0:
-                self.pitchf_buffer = np.zeros(new_feature_length)
-            self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
-
-        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
-
-        if convertSize % 128 != 0:  # モデルの出力のホップサイズで切り捨てが発生するので補う。
-            convertSize = convertSize + (128 - (convertSize % 128))
-        outSize = convertSize - self.settings.extraConvertSize
-
-        # バッファがたまっていない場合はzeroで補う
-        if self.audio_buffer.shape[0] < convertSize:
-            self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
-            if self.slotInfo.f0:
-                self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer])
-            self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer])
-
-        convertOffset = -1 * convertSize
-        featureOffset = -convertSize * 100 // self.slotInfo.samplingRate
-        self.audio_buffer = self.audio_buffer[convertOffset:]  # 変換対象の部分だけ抽出
-        if self.slotInfo.f0:
-            self.pitchf_buffer = self.pitchf_buffer[featureOffset:]
-        self.feature_buffer = self.feature_buffer[featureOffset:]
-
-        # 出力部分だけ切り出して音量を確認。(TODO:段階的消音にする)
-        cropOffset = -1 * (inputSize + crossfadeSize)
-        cropEnd = -1 * (crossfadeSize)
-        crop = self.audio_buffer[cropOffset:cropEnd]
-        vol = np.sqrt(np.square(crop).mean())
-        vol = max(vol, self.prevVol * 0.0)
-        self.prevVol = vol
-
-        return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize)
-
-    def inference(self, data):
-        if self.pipeline is None:
-            logger.info("[Voice Changer] Pipeline is not initialized.111")
-            raise PipelineNotInitializedException()
-
-        audio, pitchf, feature, convertSize, vol, outSize = data
-
-        if vol < self.settings.silentThreshold:
-            return np.zeros(convertSize, dtype=np.int16) * np.sqrt(vol)
-
-        if self.pipeline is not None:
-            device = self.pipeline.device
-        else:
-            device = torch.device("cpu")
-        audio = torch.as_tensor(audio, device=device, dtype=torch.float32)
-        audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
-        repeat = 0
-        sid = self.settings.dstId
-        f0_up_key = self.settings.tran
-        index_rate = self.settings.indexRatio
-        protect = self.settings.protect
-
-        embOutputLayer = self.slotInfo.embOutputLayer
-        useFinalProj = self.slotInfo.useFinalProj
-
-        audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
-            sid,
-            audio,
-            pitchf,
-            feature,
-            f0_up_key,
-            index_rate,
-            self.slotInfo.f0,
-            self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # extaraDataSizeの秒数。RVCのモデルのサンプリングレートで処理(★1)。
-            embOutputLayer,
-            useFinalProj,
-            repeat,
-            protect,
-            outSize
-        )
-        result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
-
-        return result
-
-        return
-
-    def __del__(self):
-        del self.pipeline
-
-        # print("---------- REMOVING ---------------")
-
-        # remove_path = os.path.join("RVC")
-        # sys.path = [x for x in sys.path if x.endswith(remove_path) is False]
-
-        # for key in list(sys.modules):
-        #     val = sys.modules.get(key)
-        #     try:
-        #         file_path = val.__file__
-        #         if file_path.find("RVC" + os.path.sep) >= 0:
-        #             # print("remove", key, file_path)
-        #             sys.modules.pop(key)
-        #     except Exception:  # type:ignore
-        #         # print(e)
-        #         pass
-
-    def export2onnx(self):
-        modelSlot = self.slotInfo
-
-        if modelSlot.isONNX:
-            logger.warn("[Voice Changer] export2onnx, No pyTorch filepath.")
-            return {"status": "ng", "path": ""}
-
-        if self.pipeline is not None:
-            del self.pipeline
-            self.pipeline = None
-
-        torch.cuda.empty_cache()
-        self.initialize()
-
-        output_file_simple = export2onnx(self.settings.gpu, modelSlot)
-
-        return {
-            "status": "ok",
-            "path": f"/tmp/{output_file_simple}",
-            "filename": output_file_simple,
-        }
-
-    def get_model_current(self):
-        return [
-            {
-                "key": "defaultTune",
-                "val": self.settings.tran,
-            },
-            {
-                "key": "defaultIndexRatio",
-                "val": self.settings.indexRatio,
-            },
-            {
-                "key": "defaultProtect",
-                "val": self.settings.protect,
-            },
-        ]
diff --git a/server/voice_changer/RVC/RVCModelMerger.py b/server/voice_changer/RVC/RVCModelMerger.py
index 2ecd5c0aa..e2018dadc 100644
--- a/server/voice_changer/RVC/RVCModelMerger.py
+++ b/server/voice_changer/RVC/RVCModelMerger.py
@@ -5,7 +5,8 @@
 from voice_changer.RVC.modelMerger.MergeModel import merge_model
 from voice_changer.utils.ModelMerger import ModelMerger, ModelMergerRequest
 from settings import ServerSettings
-
+import logging
+logger = logging.getLogger(__name__)
 
 
 class RVCModelMerger(ModelMerger):
     @classmethod
@@ -15,7 +16,7 @@ def merge_models(cls, params: ServerSettings, request: ModelMergerRequest, store
         # Store in the upload folder for now. (Historical reasons.)
         # A subsequent loadmodel call moves it to the persistent model folder.
         storeDir = os.path.join(UPLOAD_DIR)
-        print("[Voice Changer] store merged model to:", storeDir)
+        logger.info(f"Store merged model to: {storeDir}")
         os.makedirs(storeDir, exist_ok=True)
         storeFile = os.path.join(storeDir, "merged.pth")
         torch.save(merged, storeFile)
diff --git a/server/voice_changer/RVC/RVCModelSlotGenerator.py b/server/voice_changer/RVC/RVCModelSlotGenerator.py
index 391712608..f7cb2bf62 100644
--- a/server/voice_changer/RVC/RVCModelSlotGenerator.py
+++ b/server/voice_changer/RVC/RVCModelSlotGenerator.py
@@ -11,6 +11,8 @@
 from voice_changer.utils.LoadModelParams import LoadModelParams
 from voice_changer.utils.ModelSlotGenerator import ModelSlotGenerator
 from settings import ServerSettings
+import logging
+logger = logging.getLogger(__name__)
 
 
 class RVCModelSlotGenerator(ModelSlotGenerator):
     @classmethod
@@ -29,7 +31,7 @@ def load_model(cls, props: LoadModelParams):
             slotInfo.defaultProtect = 0.5
             slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
         slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
-        print("RVC:: slotInfo.modelFile", slotInfo.modelFile)
+        logger.info(f"RVC:: slotInfo.modelFile {slotInfo.modelFile}")
 
         # slotInfo.iconFile = "/assets/icons/noimage.png"
 
@@ -59,27 +61,7 @@ def _setInfoByPytorch(cls, modelPath: str, slot: RVCModelSlot):
             slot = RVCModelSlot(**asdict(slot))
 
             slot.f0 = True if cpt["f0"] == 1 else False
-            if version == "voras_beta":
-                slot.f0 = True if cpt["f0"] == 1 else False
-                slot.modelType = EnumInferenceTypes.pyTorchVoRASbeta.value
-                slot.embChannels = 768
-                slot.embOutputLayer = cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9
-                slot.useFinalProj = False
-
-                slot.embedder = cpt["embedder_name"]
-                if slot.embedder.endswith("768"):
-                    slot.embedder = slot.embedder[:-3]
-
-                # if slot.embedder == "hubert":
-                #     slot.embedder = "hubert"
-                # elif slot.embedder == "contentvec":
-                #     slot.embedder = "contentvec"
-                # elif slot.embedder == "hubert_jp":
-                #     slot.embedder = "hubert_jp"
-                else:
-                    raise RuntimeError("[Voice Changer][setInfoByPytorch] unknown embedder")
-
-            elif config_len == 18:
+            if config_len == 18:
                 # Original RVC
                 if version == "v1":
                     slot.modelType = EnumInferenceTypes.pyTorchRVC.value if slot.f0 else EnumInferenceTypes.pyTorchRVCNono.value
                     slot.embChannels = 256
                     slot.embOutputLayer = 9
                     slot.useFinalProj = True
                     slot.embedder = "hubert_base"
-                    print("[Voice Changer] Official Model(pyTorch) : v1")
+                    logger.info("Official Model(pyTorch) : v1")
                 else:
                     slot.modelType = EnumInferenceTypes.pyTorchRVCv2.value if slot.f0 else EnumInferenceTypes.pyTorchRVCv2Nono.value
                     slot.embChannels = 768
                     slot.embOutputLayer = 12
                     slot.useFinalProj = False
                     slot.embedder = "hubert_base"
-                    print("[Voice Changer] Official Model(pyTorch) : v2")
+                    logger.info("Official Model(pyTorch) : v2")
 
             else:
                 # DDPN RVC
@@ -109,11 +91,11 @@
                 # Display DDPN model info
                 if slot.embChannels == 256 and slot.embOutputLayer == 9 and slot.useFinalProj:
-                    print("[Voice Changer] DDPN Model(pyTorch) : Official v1 like")
+                    logger.info("DDPN Model(pyTorch) : Official v1 like")
                 elif slot.embChannels == 768 and slot.embOutputLayer == 12 and slot.useFinalProj is False:
-                    print("[Voice Changer] DDPN Model(pyTorch): Official v2 like")
+                    logger.info("DDPN Model(pyTorch): Official v2 like")
                 else:
-                    print(f"[Voice Changer] DDPN Model(pyTorch): ch:{slot.embChannels}, L:{slot.embOutputLayer}, FP:{slot.useFinalProj}")
+                    logger.info(f"DDPN Model(pyTorch): ch:{slot.embChannels}, L:{slot.embOutputLayer}, FP:{slot.useFinalProj}")
 
                 slot.embedder = cpt["embedder_name"]
                 if slot.embedder.endswith("768"):
@@ -150,11 +132,11 @@ def _setInfoByONNX(cls, modelPath: str, slot: RVCModelSlot):
 
         # Display ONNX model info
         if slot.embChannels == 256 and slot.embOutputLayer == 9 and slot.useFinalProj:
-            print("[Voice Changer] ONNX Model: Official v1 like")
+            logger.info("ONNX Model: Official v1 like")
         elif slot.embChannels == 768 and slot.embOutputLayer == 12 and slot.useFinalProj is False:
-            print("[Voice Changer] ONNX Model: Official v2 like")
+            logger.info("ONNX Model: Official v2 like")
         else:
-            print(f"[Voice Changer] ONNX Model: ch:{slot.embChannels}, L:{slot.embOutputLayer}, FP:{slot.useFinalProj}")
+            logger.info(f"ONNX Model: ch:{slot.embChannels}, L:{slot.embOutputLayer}, FP:{slot.useFinalProj}")
 
         if "embedder" not in metadata:
             slot.embedder = "hubert_base"
@@ -185,10 +167,10 @@ def _setInfoByONNX(cls, modelPath: str, slot: RVCModelSlot):
             slot.samplingRate = 48000
             slot.deprecated = True
 
-            print("[Voice Changer] setInfoByONNX", e)
-            print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
-            print("[Voice Changer] This onnxfie is depricated. Please regenerate onnxfile.")
-            print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
+            logger.error(f"setInfoByONNX: {e}")
+            logger.error("############## !!!! CAUTION !!!! ####################")
+            logger.error("This onnx file is deprecated. Please regenerate the onnx file.")
+            logger.error("############## !!!! CAUTION !!!! ####################")
 
         del tmp_onnx_session
         return slot
diff --git a/server/voice_changer/RVC/RVCr2.py b/server/voice_changer/RVC/RVCr2.py
index e08227d2e..c9c37e312 100644
--- a/server/voice_changer/RVC/RVCr2.py
+++ b/server/voice_changer/RVC/RVCr2.py
@@ -3,7 +3,7 @@
 """
 import torch
 from data.ModelSlot import RVCModelSlot
-from mods.log_control import VoiceChangaerLogger
+import logging
 
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
 from voice_changer.utils.VoiceChangerModel import (
@@ -20,16 +20,14 @@
 from torchaudio import transforms as tat
 from voice_changer.VoiceChangerSettings import VoiceChangerSettings
 from Exceptions import (
-    PipelineCreateException,
     PipelineNotInitializedException,
 )
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 
 class RVCr2(VoiceChangerModel):
     def __init__(self, params: ServerSettings, slotInfo: RVCModelSlot, settings: VoiceChangerSettings):
-        logger.info("[Voice Changer] [RVCr2] Creating instance")
         self.voiceChangerType = "RVC"
 
         self.device_manager = DeviceManager.get_instance()
@@ -55,15 +53,15 @@ def __init__(self, params: ServerSettings, slotInfo: RVCModelSlot, settings: Voi
         self.resampler_in: tat.Resample | None = None
         self.resampler_out: tat.Resample | None = None
 
-        self.input_sample_rate = 44100
-        self.outputSampleRate = 44100
+        self.input_sample_rate = self.settings.inputSampleRate
+        self.output_sample_rate = self.settings.outputSampleRate
 
         self.is_half = False
 
         self.initialize()
 
     def initialize(self, force_reload: bool = False):
-        logger.info("[Voice Changer] [RVCr2] Initializing... ")
+        logger.info("Initializing...")
 
         self.is_half = self.device_manager.use_fp16()
 
@@ -72,10 +70,9 @@ def initialize(self, force_reload: bool = False):
             self.pipeline = createPipeline(
                 self.params, self.slotInfo, self.settings.f0Detector, force_reload
             )
-        except PipelineCreateException as e:  # NOQA
-            logger.error(
-                "[Voice Changer] pipeline create failed. check your model is valid."
- ) + except Exception as e: # NOQA + logger.error("Failed to create pipeline.") + logger.exception(e) return self.dtype = torch.float16 if self.is_half else torch.float32 @@ -98,11 +95,13 @@ def initialize(self, force_reload: bool = False): self.resampler_out = tat.Resample( orig_freq=self.slotInfo.samplingRate, - new_freq=self.outputSampleRate, + new_freq=self.output_sample_rate, dtype=torch.float32 ).to(self.device_manager.device) - def setSamplingRate(self, input_sample_rate, outputSampleRate): + logger.info("Initialized.") + + def setSamplingRate(self, input_sample_rate, output_sample_rate): if self.input_sample_rate != input_sample_rate: self.input_sample_rate = input_sample_rate self.resampler_in = tat.Resample( @@ -110,17 +109,15 @@ def setSamplingRate(self, input_sample_rate, outputSampleRate): new_freq=self.sr, dtype=torch.float32 ).to(self.device_manager.device) - if self.outputSampleRate != outputSampleRate: - self.outputSampleRate = outputSampleRate + if self.output_sample_rate != output_sample_rate: + self.output_sample_rate = output_sample_rate self.resampler_out = tat.Resample( orig_freq=self.slotInfo.samplingRate, - new_freq=self.outputSampleRate, + new_freq=self.output_sample_rate, dtype=torch.float32 ).to(self.device_manager.device) def update_settings(self, key: str, val, old_val): - logger.info(f"[Voice Changer] [RVCr2]: update_settings {key}:{val}") - if key in {"gpu", "forceFp32"}: self.initialize(True) elif key == "f0Detector" and self.pipeline is not None: @@ -170,9 +167,9 @@ def realloc(self, block_frame: int, extra_frame: int, crossfade_frame: int, sola # that can output additional feature. self.pitch_buffer = torch.zeros(self.convert_feature_size_16k + 1, dtype=torch.int64, device=self.device_manager.device) self.pitchf_buffer = torch.zeros(self.convert_feature_size_16k + 1, dtype=self.dtype, device=self.device_manager.device) - print('[Voice Changer] Allocated audio buffer:', self.audio_buffer.shape[0]) - print('[Voice Changer] Allocated convert buffer:', self.convert_buffer.shape[0]) - print('[Voice Changer] Allocated pitchf buffer:', self.pitchf_buffer.shape[0]) + logger.info(f'Allocated audio buffer size: {audio_buffer_size}') + logger.info(f'Allocated convert buffer size: {convert_size_16k}') + logger.info(f'Allocated pitchf buffer size: {self.convert_feature_size_16k + 1}') def convert(self, audio_in: AudioInOutFloat, sample_rate: int) -> torch.Tensor: if self.pipeline is None: @@ -265,27 +262,11 @@ def inference(self, audio_in: AudioInOutFloat): def __del__(self): del self.pipeline - # print("---------- REMOVING ---------------") - - # remove_path = os.path.join("RVC") - # sys.path = [x for x in sys.path if x.endswith(remove_path) is False] - - # for key in list(sys.modules): - # val = sys.modules.get(key) - # try: - # file_path = val.__file__ - # if file_path.find("RVC" + os.path.sep) >= 0: - # # print("remove", key, file_path) - # sys.modules.pop(key) - # except Exception: # type:ignore - # # print(e) - # pass - def export2onnx(self): modelSlot = self.slotInfo if modelSlot.isONNX: - logger.warn("[Voice Changer] export2onnx, No pyTorch filepath.") + logger.error("Model is already in ONNX format.") return {"status": "ng", "path": ""} if self.pipeline is not None: diff --git a/server/voice_changer/RVC/embedder/Embedder.py b/server/voice_changer/RVC/embedder/Embedder.py index 464ae14f2..a7c6dbf2d 100644 --- a/server/voice_changer/RVC/embedder/Embedder.py +++ b/server/voice_changer/RVC/embedder/Embedder.py @@ -5,7 +5,8 @@ from const import EmbedderType from 
voice_changer.RVC.embedder.EmbedderProtocol import EmbedderProtocol - +import logging +logger = logging.getLogger(__name__) class Embedder(EmbedderProtocol): def __init__(self): @@ -38,12 +39,4 @@ def set_props( self.file = file def matchCondition(self, embedderType: EmbedderType) -> bool: - # Check Type - if self.embedderType != embedderType: - print( - "[Voice Changer] embeder type is not match", - self.embedderType, - embedderType, - ) - return False - return True + return self.embedderType == embedderType diff --git a/server/voice_changer/RVC/embedder/EmbedderManager.py b/server/voice_changer/RVC/embedder/EmbedderManager.py index 169abcd85..b5fa3a832 100644 --- a/server/voice_changer/RVC/embedder/EmbedderManager.py +++ b/server/voice_changer/RVC/embedder/EmbedderManager.py @@ -2,7 +2,8 @@ from voice_changer.RVC.embedder.Embedder import Embedder from voice_changer.RVC.embedder.OnnxContentvec import OnnxContentvec from settings import ServerSettings - +import logging +logger = logging.getLogger(__name__) class EmbedderManager: embedder: Embedder | None = None @@ -17,13 +18,15 @@ def get_embedder(cls, embedder_type: EmbedderType, force_reload: bool = False) - if cls.embedder is not None \ and cls.embedder.matchCondition(embedder_type) \ and not force_reload: - print('[Voice Changer] Reusing embedder.') + logger.info('Reusing embedder.') return cls.embedder cls.embedder = cls.load_embedder(embedder_type) return cls.embedder @classmethod def load_embedder(cls, embedder_type: EmbedderType) -> Embedder: + logger.info(f'Loading embedder {embedder_type}') + if embedder_type not in ["hubert_base", "contentvec"]: raise RuntimeError(f'Unsupported embedder type: {embedder_type}') file = cls.params.content_vec_500_onnx diff --git a/server/voice_changer/RVC/inferencer/InferencerManager.py b/server/voice_changer/RVC/inferencer/InferencerManager.py index 9782b850f..a54c9df3c 100644 --- a/server/voice_changer/RVC/inferencer/InferencerManager.py +++ b/server/voice_changer/RVC/inferencer/InferencerManager.py @@ -37,12 +37,6 @@ def loadInferencer( return RVCInferencerNono().load_model(file) elif inferencerType == EnumInferenceTypes.pyTorchRVCv2: return RVCInferencerv2().load_model(file) - elif inferencerType is EnumInferenceTypes.pyTorchVoRASbeta: - if sys.platform.startswith("darwin") is False: - from voice_changer.RVC.inferencer.VorasInferencebeta import VoRASInferencer - return VoRASInferencer().load_model(file) - else: - raise RuntimeError("[Voice Changer] VoRAS is not supported on macOS") elif inferencerType is EnumInferenceTypes.pyTorchRVCv2Nono: return RVCInferencerv2Nono().load_model(file) elif inferencerType is EnumInferenceTypes.pyTorchWebUI: @@ -54,4 +48,4 @@ def loadInferencer( elif inferencerType is EnumInferenceTypes.onnxRVCNono: return OnnxRVCInferencerNono().load_model(file, inferencerTypeVersion) else: - raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType) + raise RuntimeError("Inferencer not found", inferencerType) diff --git a/server/voice_changer/RVC/inferencer/RVCInferencer.py b/server/voice_changer/RVC/inferencer/RVCInferencer.py index c540ef924..83b109144 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencer.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencer.py @@ -1,25 +1,33 @@ import torch +import json from const import EnumInferenceTypes - +from safetensors import safe_open from voice_changer.common.deviceManager.DeviceManager import DeviceManager from voice_changer.RVC.inferencer.Inferencer import Inferencer from 
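get_embedder above keeps a single embedder instance alive and reuses it while the requested type matches and force_reload is not set; matchCondition is now a plain equality check. A minimal sketch of the memoization pattern (the loader body is an illustrative stand-in, not the real ONNX loading code):

from typing import Optional

class CachedLoader:
    _instance: Optional[object] = None
    _kind: Optional[str] = None

    @classmethod
    def get(cls, kind: str, force_reload: bool = False) -> object:
        # Reuse the cached instance when the type matches and no reload was requested
        if cls._instance is not None and cls._kind == kind and not force_reload:
            return cls._instance
        cls._instance = cls._load(kind)
        cls._kind = kind
        return cls._instance

    @classmethod
    def _load(cls, kind: str) -> object:
        if kind not in ("hubert_base", "contentvec"):
            raise RuntimeError(f"Unsupported embedder type: {kind}")
        return object()  # stand-in for loading the actual model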
.rvc_models.infer_pack.models import SynthesizerTrnMs256NSFsid +from voice_changer.common.SafetensorsUtils import load_model class RVCInferencer(Inferencer): def load_model(self, file: str): - self.set_props(EnumInferenceTypes.pyTorchRVC, file) - device_manager = DeviceManager.get_instance() + dev = device_manager.device is_half = device_manager.use_fp16() + self.set_props(EnumInferenceTypes.pyTorchRVC, file) - cpt = torch.load(file, map_location="cpu") - model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) + # Keep torch.load for backward compatibility, but discourage the use of this loading method + if file.endswith('.safetensors'): + with safe_open(file, 'pt', device=str(dev) if dev.type == 'cuda' else 'cpu') as cpt: + config = json.loads(cpt.metadata()['config']) + model = SynthesizerTrnMs256NSFsid(*config, is_half=is_half).to(dev) + load_model(model, cpt, strict=False) + else: + cpt = torch.load(file, map_location=dev if dev.type == 'cuda' else 'cpu') + model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half).to(dev) + model.load_state_dict(cpt["weight"], strict=False) - model.eval() - model.load_state_dict(cpt["weight"], strict=False) + model.eval().remove_weight_norm() - model = model.to(device_manager.device) if is_half: model = model.half() @@ -50,4 +58,4 @@ def infer( formant_length=formant_length ) res = res[0][0, 0] - return torch.clip(res, -1.0, 1.0) + return torch.clip(res, -1.0, 1.0, out=res) diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerNono.py b/server/voice_changer/RVC/inferencer/RVCInferencerNono.py index 118d6cf37..fe57f0f86 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencerNono.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencerNono.py @@ -1,25 +1,33 @@ import torch +import json from const import EnumInferenceTypes - +from safetensors import safe_open from voice_changer.common.deviceManager.DeviceManager import DeviceManager from voice_changer.RVC.inferencer.Inferencer import Inferencer from .rvc_models.infer_pack.models import SynthesizerTrnMs256NSFsid_nono +from voice_changer.common.SafetensorsUtils import load_model class RVCInferencerNono(Inferencer): def load_model(self, file: str): - self.set_props(EnumInferenceTypes.pyTorchRVCNono, file) - device_manager = DeviceManager.get_instance() + dev = device_manager.device is_half = device_manager.use_fp16() + self.set_props(EnumInferenceTypes.pyTorchRVCNono, file) - cpt = torch.load(file, map_location="cpu") - model = SynthesizerTrnMs256NSFsid_nono(*cpt["config"], is_half=is_half) + # Keep torch.load for backward compatibility, but discourage the use of this loading method + if file.endswith('.safetensors'): + with safe_open(file, 'pt', device=str(dev) if dev.type == 'cuda' else 'cpu') as cpt: + config = json.loads(cpt.metadata()['config']) + model = SynthesizerTrnMs256NSFsid_nono(*config, is_half=is_half).to(dev) + load_model(model, cpt, strict=False) + else: + cpt = torch.load(file, map_location=dev if dev.type == 'cuda' else 'cpu') + model = SynthesizerTrnMs256NSFsid_nono(*cpt["config"], is_half=is_half).to(dev) + model.load_state_dict(cpt["weight"], strict=False) - model.eval() - model.load_state_dict(cpt["weight"], strict=False) + model.eval().remove_weight_norm() - model = model.to(device_manager.device) if is_half: model = model.half() @@ -46,4 +54,4 @@ def infer( formant_length=formant_length ) res = res[0][0, 0] - return torch.clip(res, -1.0, 1.0) + return torch.clip(res, -1.0, 1.0, out=res) diff --git 
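Both loaders above expect a .safetensors checkpoint to carry the synthesizer config as a JSON string in the file metadata (read back via cpt.metadata()['config']), keeping torch.load only for legacy .pth files. A sketch of the matching save side under that convention; the state dict and config values are stand-ins, and note that safetensors metadata must map strings to strings:

import json
import torch
from safetensors import safe_open
from safetensors.torch import save_file

state_dict = {"w": torch.zeros(2, 2)}  # stand-in for model.state_dict()
config = [513, 40, 1025]               # stand-in for the synthesizer config list

# Tensors go in the payload; the config rides along as a JSON string in the metadata
save_file(state_dict, "model.safetensors", metadata={"config": json.dumps(config)})

# Reading mirrors the loaders in this patch
with safe_open("model.safetensors", "pt", device="cpu") as cpt:
    cfg = json.loads(cpt.metadata()["config"])
    tensors = {k: cpt.get_tensor(k) for k in cpt.keys()}

Separately, the out= form of torch.clip at the end of both infer methods clips in place, saving one output-sized allocation per inference call.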
a/server/voice_changer/RVC/inferencer/VorasInferencebeta.py b/server/voice_changer/RVC/inferencer/VorasInferencebeta.py deleted file mode 100644 index 86925bfe8..000000000 --- a/server/voice_changer/RVC/inferencer/VorasInferencebeta.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch -from torch import device - -from const import EnumInferenceTypes -from voice_changer.RVC.inferencer.Inferencer import Inferencer -from voice_changer.common.deviceManager.DeviceManager import DeviceManager -from .voras_beta.models import Synthesizer - - -class VoRASInferencer(Inferencer): - def load_model(self, file: str): - super().set_props(EnumInferenceTypes.pyTorchVoRASbeta, file) - - dev = DeviceManager.get_instance().device - self.isHalf = False # DeviceManager.get_instance().is_fp16_available(gpu) - - cpt = torch.load(file, map_location="cpu") - model = Synthesizer(**cpt["params"]) - - model.eval() - model.load_state_dict(cpt["weight"], strict=False) - model.remove_weight_norm() - model.change_speaker(0) - - model = model.to(dev) - - self.model = model - print("load model comprete") - return self - - def infer( - self, - feats: torch.Tensor, - pitch_length: torch.Tensor, - pitch: torch.Tensor, - pitchf: torch.Tensor, - sid: torch.Tensor, - skip_head: int, - return_length: int, - formant_length: int, - ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, pitch, pitchf, sid) diff --git a/server/voice_changer/RVC/inferencer/voras_beta/commons.py b/server/voice_changer/RVC/inferencer/voras_beta/commons.py deleted file mode 100644 index 66019df2b..000000000 --- a/server/voice_changer/RVC/inferencer/voras_beta/commons.py +++ /dev/null @@ -1,154 +0,0 @@ -import math - -import torch -from torch.nn import functional as F - - -def init_weights(m, mean=0.0, std=0.01): - classname = m.__class__.__name__ - if classname.find("Conv") != -1: - m.weight.data.normal_(mean, std) - - -def get_padding(kernel_size, dilation=1): - return int((kernel_size * dilation - dilation) / 2) - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -def kl_divergence(m_p, logs_p, m_q, logs_q): - """KL(P||Q)""" - kl = (logs_q - logs_p) - 0.5 - kl += 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) - return kl - - -def rand_gumbel(shape): - """Sample from the Gumbel distribution, protect from overflows.""" - uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 - return -torch.log(-torch.log(uniform_samples)) - - -def rand_gumbel_like(x): - g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) - return g - - -def slice_segments(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :, :segment_size]) - for i in range(x.size(0)): - idx_str = ids_str[i] - idx_end = idx_str + segment_size - r = x[i, :, idx_str:idx_end] - ret[i, :, : r.size(1)] = r - return ret - - -def slice_segments2(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :segment_size]) - for i in range(x.size(0)): - idx_str = ids_str[i] - idx_end = idx_str + segment_size - r = x[i, idx_str:idx_end] - ret[i, : r.size(0)] = r - return ret - - -def rand_slice_segments(x, x_lengths, segment_size=4, ids_str=None): - b, d, t = x.size() - if ids_str is None: - ids_str = torch.zeros([b]).to(device=x.device, dtype=x_lengths.dtype) - ids_str_max = torch.maximum(torch.zeros_like(x_lengths).to(device=x_lengths.device, dtype=x_lengths.dtype), x_lengths - segment_size + 1 - ids_str) - ids_str += (torch.rand([b]).to(device=x.device) * 
ids_str_max).to(dtype=torch.long) - ret = slice_segments(x, ids_str, segment_size) - return ret, ids_str - - -def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): - position = torch.arange(length, dtype=torch.float) - num_timescales = channels // 2 - log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (num_timescales - 1) - inv_timescales = min_timescale * torch.exp(torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) - scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) - signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) - signal = F.pad(signal, [0, 0, 0, channels % 2]) - signal = signal.view(1, channels, length) - return signal - - -def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): - b, channels, length = x.size() - signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) - return x + signal.to(dtype=x.dtype, device=x.device) - - -def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): - b, channels, length = x.size() - signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) - return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) - - -def subsequent_mask(length): - mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) - return mask - - -# @torch.jit.script -@torch.jit._script_if_tracing -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -def shift_1d(x): - x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] - return x - - -def sequence_mask(length, max_length=None): - if max_length is None: - max_length = length.max() - x = torch.arange(max_length, dtype=length.dtype, device=length.device) - return x.unsqueeze(0) < length.unsqueeze(1) - - -def generate_path(duration, mask): - """ - duration: [b, 1, t_x] - mask: [b, 1, t_y, t_x] - """ - b, _, t_y, t_x = mask.shape - cum_duration = torch.cumsum(duration, -1) - - cum_duration_flat = cum_duration.view(b * t_x) - path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) - path = path.view(b, t_x, t_y) - path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path.unsqueeze(1).transpose(2, 3) * mask - return path - - -def clip_grad_value_(parameters, clip_value, norm_type=2): - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - norm_type = float(norm_type) - if clip_value is not None: - clip_value = float(clip_value) - - total_norm = 0 - for p in parameters: - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm.item() ** norm_type - if clip_value is not None: - p.grad.data.clamp_(min=-clip_value, max=clip_value) - total_norm = total_norm ** (1.0 / norm_type) - return total_norm diff --git a/server/voice_changer/RVC/inferencer/voras_beta/config.py b/server/voice_changer/RVC/inferencer/voras_beta/config.py deleted file mode 100644 index ddfb42716..000000000 --- a/server/voice_changer/RVC/inferencer/voras_beta/config.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import * - -from pydantic import BaseModel - - -class TrainConfigTrain(BaseModel): - log_interval: int - seed: int - epochs: int - learning_rate: float - betas: List[float] - 
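In passing, the kl_divergence helper in the deleted commons.py is the closed form for two diagonal Gaussians P = N(m_p, σ_p²) and Q = N(m_q, σ_q²) parameterized by log standard deviations (σ = e^{logs}):

\mathrm{KL}(P \,\|\, Q) = (\log\sigma_q - \log\sigma_p) - \tfrac{1}{2} + \frac{\sigma_p^2 + (m_p - m_q)^2}{2\sigma_q^2}

which matches the code term for term: (logs_q - logs_p) - 0.5 + 0.5 * (exp(2·logs_p) + (m_p - m_q)²) * exp(-2·logs_q).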
eps: float - batch_size: int - fp16_run: bool - lr_decay: float - segment_size: int - init_lr_ratio: int - warmup_epochs: int - c_mel: int - c_kl: float - - -class TrainConfigData(BaseModel): - max_wav_value: float - sampling_rate: int - filter_length: int - hop_length: int - win_length: int - n_mel_channels: int - mel_fmin: float - mel_fmax: Any - - -class TrainConfigModel(BaseModel): - emb_channels: int - inter_channels: int - n_layers: int - upsample_rates: List[int] - use_spectral_norm: bool - gin_channels: int - spk_embed_dim: int - - -class TrainConfig(BaseModel): - version: Literal["voras"] = "voras" - train: TrainConfigTrain - data: TrainConfigData - model: TrainConfigModel - - -class DatasetMetaItem(BaseModel): - gt_wav: str - co256: str - f0: Optional[str] - f0nsf: Optional[str] - speaker_id: int - - -class DatasetMetadata(BaseModel): - files: Dict[str, DatasetMetaItem] - # mute: DatasetMetaItem diff --git a/server/voice_changer/RVC/inferencer/voras_beta/models.py b/server/voice_changer/RVC/inferencer/voras_beta/models.py deleted file mode 100644 index 3168e590c..000000000 --- a/server/voice_changer/RVC/inferencer/voras_beta/models.py +++ /dev/null @@ -1,238 +0,0 @@ -import math -import os -import sys - -import numpy as np -import torch -from torch import nn -from torch.nn import Conv2d -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm - -from . import commons, modules -from .commons import get_padding -from .modules import (ConvNext2d, HarmonicEmbedder, IMDCTSymExpHead, - LoRALinear1d, SnakeFilter, WaveBlock) - -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) - -sr2sr = { - "24k": 24000, - "32k": 32000, - "40k": 40000, - "48k": 48000, -} - -class GeneratorVoras(torch.nn.Module): - def __init__( - self, - emb_channels, - inter_channels, - gin_channels, - n_layers, - sr, - hop_length, - ): - super(GeneratorVoras, self).__init__() - self.n_layers = n_layers - self.emb_pitch = HarmonicEmbedder(768, inter_channels, gin_channels, 16, 15) # # pitch 256 - self.plinear = LoRALinear1d(inter_channels, inter_channels, gin_channels, r=8) - self.glinear = weight_norm(nn.Conv1d(gin_channels, inter_channels, 1)) - self.resblocks = nn.ModuleList() - self.init_linear = LoRALinear1d(emb_channels, inter_channels, gin_channels, r=4) - for _ in range(self.n_layers): - self.resblocks.append(WaveBlock(inter_channels, gin_channels, [9] * 2, [1] * 2, [1, 9], 2, r=4)) - self.head = IMDCTSymExpHead(inter_channels, gin_channels, hop_length, padding="center", sample_rate=sr) - self.post = SnakeFilter(4, 8, 9, 2, eps=1e-5) - - def forward(self, x, pitchf, x_mask, g): - x = self.init_linear(x, g) + self.plinear(self.emb_pitch(pitchf, g), g) + self.glinear(g) - for i in range(self.n_layers): - x = self.resblocks[i](x, x_mask, g) - x = x * x_mask - x = self.head(x, g) - x = self.post(x) - return torch.tanh(x) - - def remove_weight_norm(self): - self.plinear.remove_weight_norm() - remove_weight_norm(self.glinear) - for l in self.resblocks: - l.remove_weight_norm() - self.init_linear.remove_weight_norm() - self.head.remove_weight_norm() - self.post.remove_weight_norm() - - def fix_speaker(self, g): - self.plinear.fix_speaker(g) - self.init_linear.fix_speaker(g) - for l in self.resblocks: - l.fix_speaker(g) - self.head.fix_speaker(g) - - def unfix_speaker(self, g): - self.plinear.unfix_speaker(g) - self.init_linear.unfix_speaker(g) - for l in self.resblocks: - l.unfix_speaker(g) - 
self.head.unfix_speaker(g) - - -class Synthesizer(nn.Module): - def __init__( - self, - segment_size, - n_fft, - hop_length, - inter_channels, - n_layers, - spk_embed_dim, - gin_channels, - emb_channels, - sr, - **kwargs - ): - super().__init__() - if type(sr) == type("strr"): - sr = sr2sr[sr] - self.segment_size = segment_size - self.n_fft = n_fft - self.hop_length = hop_length - self.inter_channels = inter_channels - self.n_layers = n_layers - self.spk_embed_dim = spk_embed_dim - self.gin_channels = gin_channels - self.emb_channels = emb_channels - self.sr = sr - - self.dec = GeneratorVoras( - emb_channels, - inter_channels, - gin_channels, - n_layers, - sr, - hop_length - ) - - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print( - "gin_channels:", - gin_channels, - "self.spk_embed_dim:", - self.spk_embed_dim, - "emb_channels:", - emb_channels, - ) - self.speaker = None - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - - def change_speaker(self, sid: int): - if self.speaker is not None: - g = self.emb_g(torch.from_numpy(np.array(self.speaker))).unsqueeze(-1) - self.dec.unfix_speaker(g) - g = self.emb_g(torch.from_numpy(np.array(sid))).unsqueeze(-1) - self.dec.fix_speaker(g) - self.speaker = sid - - def forward( - self, phone, phone_lengths, pitch, pitchf, ds - ): - g = self.emb_g(ds).unsqueeze(-1) - x = torch.transpose(phone, 1, -1) - x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to(phone.dtype) - x_slice, ids_slice = commons.rand_slice_segments( - x, phone_lengths, self.segment_size - ) - pitchf_slice = commons.slice_segments2(pitchf, ids_slice, self.segment_size) - mask_slice = commons.slice_segments(x_mask, ids_slice, self.segment_size) - o = self.dec(x_slice, pitchf_slice, mask_slice, g) - return o, ids_slice, x_mask, g - - def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - x = torch.transpose(phone, 1, -1) - x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to(phone.dtype) - o = self.dec((x * x_mask)[:, :, :max_len], nsff0, x_mask, g) - return o, x_mask, (None, None, None, None) - - -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, gin_channels, upsample_rates, final_dim=256, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - self.init_kernel_size = upsample_rates[-1] * 3 - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - N = len(upsample_rates) - self.init_conv = norm_f(Conv2d(1, final_dim // (2 ** (N - 1)), (self.init_kernel_size, 1), (upsample_rates[-1], 1))) - self.convs = nn.ModuleList() - for i, u in enumerate(upsample_rates[::-1][1:], start=1): - self.convs.append( - ConvNext2d( - final_dim // (2 ** (N - i)), - final_dim // (2 ** (N - i - 1)), - gin_channels, - (u*3, 1), - (u, 1), - 4, - r=2 + i//2 - ) - ) - self.conv_post = weight_norm(Conv2d(final_dim, 1, (3, 1), (1, 1))) - - def forward(self, x, g): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - x = F.pad(x, (n_pad, 0), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - x = torch.flip(x, dims=[2]) - x = F.pad(x, [0, 0, 0, self.init_kernel_size - 1], mode="constant") - x = self.init_conv(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = torch.flip(x, dims=[2]) - fmap.append(x) - - for i, l in enumerate(self.convs): - x = l(x, g) - 
fmap.append(x) - - x = F.pad(x, [0, 0, 2, 0], mode="constant") - x = self.conv_post(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class MultiPeriodDiscriminator(torch.nn.Module): - def __init__(self, upsample_rates, gin_channels, periods=[2, 3, 5, 7, 11, 17], **kwargs): - super(MultiPeriodDiscriminator, self).__init__() - - discs = [ - DiscriminatorP(i, gin_channels, upsample_rates, use_spectral_norm=False) for i in periods - ] - self.ups = np.prod(upsample_rates) - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat, g): - fmap_rs = [] - fmap_gs = [] - y_d_rs = [] - y_d_gs = [] - for d in self.discriminators: - y_d_r, fmap_r = d(y, g) - y_d_g, fmap_g = d(y_hat, g) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs diff --git a/server/voice_changer/RVC/inferencer/voras_beta/modules.py b/server/voice_changer/RVC/inferencer/voras_beta/modules.py deleted file mode 100644 index f6659c68b..000000000 --- a/server/voice_changer/RVC/inferencer/voras_beta/modules.py +++ /dev/null @@ -1,496 +0,0 @@ -import math - -import numpy as np -import scipy -import torch -from torch import nn -from torch.nn import Conv1d, Conv2d -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm -from torchaudio.functional.functional import _hz_to_mel, _mel_to_hz - -from . import commons, modules -from .commons import get_padding, init_weights -from .transforms import piecewise_rational_quadratic_transform - -LRELU_SLOPE = 0.1 - -class HarmonicEmbedder(nn.Module): - def __init__(self, num_embeddings, embedding_dim, gin_channels, num_head, num_harmonic=0, f0_min=50., f0_max=1100., device="cuda"): - super(HarmonicEmbedder, self).__init__() - self.embedding_dim = embedding_dim - self.num_head = num_head - self.num_harmonic = num_harmonic - - f0_mel_min = np.log(1 + f0_min / 700) - f0_mel_max = np.log(1 + f0_max * (1 + num_harmonic) / 700) - self.sequence = torch.from_numpy(np.linspace(f0_mel_min, f0_mel_max, num_embeddings-2)) - self.emb_layer = torch.nn.Embedding(num_embeddings, embedding_dim) - self.linear_q = Conv1d(gin_channels, num_head * (1 + num_harmonic), 1) - self.weight = None - - def forward(self, x, g): - b, l = x.size() - non_zero = (x != 0.).to(dtype=torch.long).unsqueeze(1) - mel = torch.log(1 + x / 700).unsqueeze(1) - harmonies = torch.arange(1 + self.num_harmonic, device=x.device, dtype=x.dtype).view(1, 1 + self.num_harmonic, 1) + 1. 
- ix = torch.searchsorted(self.sequence.to(x.device), mel * harmonies).to(x.device) + 1 - ix = ix * non_zero - emb = self.emb_layer(ix).transpose(1, 3).reshape(b, self.num_head, self.embedding_dim // self.num_head, 1 + self.num_harmonic, l) - if self.weight is None: - weight = torch.nn.functional.softmax(self.linear_q(g).reshape(b, self.num_head, 1, 1 + self.num_harmonic, 1), 3) - else: - weight = self.weight - res = torch.sum(emb * weight, dim=3).reshape(b, self.embedding_dim, l) - return res - - def fix_speaker(self, g): - self.weight = torch.nn.functional.softmax(self.linear_q(g).reshape(1, self.num_head, 1, 1 + self.num_harmonic, 1), 3) - - def unfix_speaker(self, g): - self.weight = None - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-5): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) - - -class DilatedCausalConv1d(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, groups=1, dilation=1, bias=True): - super(DilatedCausalConv1d, self).__init__() - self.kernel_size = kernel_size - self.dilation = dilation - self.stride = stride - self.conv = weight_norm(nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups, dilation=dilation, bias=bias)) - - def forward(self, x): - x = torch.flip(x, [2]) - x = F.pad(x, [0, (self.kernel_size - 1) * self.dilation], mode="constant", value=0.) - size = x.shape[2] // self.stride - x = self.conv(x)[:, :, :size] - x = torch.flip(x, [2]) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.conv) - - -class CausalConvTranspose1d(nn.Module): - """ - padding = 0, dilation = 1のとき - - Lout = (Lin - 1) * stride + kernel_rate * stride + output_padding - Lout = Lin * stride + (kernel_rate - 1) * stride + output_padding - output_paddingいらないね - """ - def __init__(self, in_channels, out_channels, kernel_rate=3, stride=1, groups=1): - super(CausalConvTranspose1d, self).__init__() - kernel_size = kernel_rate * stride - self.trim_size = (kernel_rate - 1) * stride - self.conv = weight_norm(nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups)) - - def forward(self, x): - x = self.conv(x) - return x[:, :, :-self.trim_size] - - def remove_weight_norm(self): - remove_weight_norm(self.conv) - - -class LoRALinear1d(nn.Module): - def __init__(self, in_channels, out_channels, info_channels, r): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.info_channels = info_channels - self.r = r - self.main_fc = weight_norm(nn.Conv1d(in_channels, out_channels, 1)) - self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1) - self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1) - nn.init.normal_(self.adapter_in.weight.data, 0, 0.01) - nn.init.constant_(self.adapter_out.weight.data, 1e-6) - self.adapter_in = weight_norm(self.adapter_in) - self.adapter_out = weight_norm(self.adapter_out) - self.speaker_fixed = False - - def forward(self, x, g): - x_ = self.main_fc(x) - if not self.speaker_fixed: - a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) - a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) - l = torch.einsum("brl,brc->bcl", torch.einsum("bcl,bcr->brl", x, a_in), a_out) - x_ = x_ + l - return x_ - - def 
remove_weight_norm(self): - remove_weight_norm(self.main_fc) - remove_weight_norm(self.adapter_in) - remove_weight_norm(self.adapter_out) - - def fix_speaker(self, g): - self.speaker_fixed = True - a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) - a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) - weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2) - self.main_fc.weight.data.add_(weight) - - def unfix_speaker(self, g): - self.speaker_fixed = False - a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) - a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) - weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2) - self.main_fc.weight.data.sub_(weight) - - -class LoRALinear2d(nn.Module): - def __init__(self, in_channels, out_channels, info_channels, r): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.info_channels = info_channels - self.r = r - self.main_fc = weight_norm(nn.Conv2d(in_channels, out_channels, (1, 1), (1, 1))) - self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1) - self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1) - nn.init.normal_(self.adapter_in.weight.data, 0, 0.01) - nn.init.constant_(self.adapter_out.weight.data, 1e-6) - self.adapter_in = weight_norm(self.adapter_in) - self.adapter_out = weight_norm(self.adapter_out) - self.speaker_fixed = False - - def forward(self, x, g): - x_ = self.main_fc(x) - if not self.speaker_fixed: - a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) - a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) - l = torch.einsum("brhw,brc->bchw", torch.einsum("bchw,bcr->brhw", x, a_in), a_out) - x_ = x_ + l - return x_ - - def remove_weight_norm(self): - remove_weight_norm(self.main_fc) - remove_weight_norm(self.adapter_in) - remove_weight_norm(self.adapter_out) - - def fix_speaker(self, g): - a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) - a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) - weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2).unsqueeze(3) - self.main_fc.weight.data.add_(weight) - - def unfix_speaker(self, g): - a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) - a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) - weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2).unsqueeze(3) - self.main_fc.weight.data.sub_(weight) - - -class MBConv2d(torch.nn.Module): - """ - Causal MBConv2D - """ - def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): - super(MBConv2d, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - inner_channels = int(in_channels * extend_ratio) - self.kernel_size = kernel_size - self.pwconv1 = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) - self.dwconv = norm_f(Conv2d(inner_channels, inner_channels, kernel_size, stride, groups=inner_channels)) - self.pwconv2 = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) - self.pwnorm = LayerNorm(in_channels) - self.dwnorm = LayerNorm(inner_channels) - - def forward(self, x, g): - x = self.pwnorm(x) - x = self.pwconv1(x, g) - x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") - x = self.dwnorm(x) - x = self.dwconv(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.pwconv2(x, g) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - return x - -class ConvNext2d(torch.nn.Module): - """ - Causal ConvNext Block - 
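Stepping back, the LoRALinear1d/LoRALinear2d modules being deleted here compute y = W x + A_out^T (A_in^T x): a shared weight W plus a rank-r, speaker-conditioned correction whose factors A_in and A_out are produced from the speaker embedding g. fix_speaker folds that correction into the main kernel so the adapter math disappears at inference time,

\Delta W_{oi} = \sum_{r=1}^{R} A^{\mathrm{in}}_{ir}\, A^{\mathrm{out}}_{ro}

(the einsum "bir,bro->oi" in the code), and unfix_speaker subtracts the same term to restore the shared weight.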
stride = 1 only - """ - def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): - super(ConvNext2d, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - inner_channels = int(in_channels * extend_ratio) - self.kernel_size = kernel_size - self.dwconv = norm_f(Conv2d(in_channels, in_channels, kernel_size, stride, groups=in_channels)) - self.pwconv1 = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) - self.pwconv2 = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) - self.act = nn.GELU() - self.norm = LayerNorm(in_channels) - - def forward(self, x, g): - x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") - x = self.dwconv(x) - x = self.norm(x) - x = self.pwconv1(x, g) - x = self.act(x) - x = self.pwconv2(x, g) - x = self.act(x) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.dwconv) - - -class WaveBlock(torch.nn.Module): - def __init__(self, inner_channels, gin_channels, kernel_sizes, strides, dilations, extend_rate, r): - super(WaveBlock, self).__init__() - norm_f = weight_norm - extend_channels = int(inner_channels * extend_rate) - self.dconvs = nn.ModuleList() - self.p1convs = nn.ModuleList() - self.p2convs = nn.ModuleList() - self.norms = nn.ModuleList() - self.act = nn.GELU() - - # self.ses = nn.ModuleList() - # self.norms = [] - for i, (k, s, d) in enumerate(zip(kernel_sizes, strides, dilations)): - self.dconvs.append(DilatedCausalConv1d(inner_channels, inner_channels, k, stride=s, dilation=d, groups=inner_channels)) - self.p1convs.append(LoRALinear1d(inner_channels, extend_channels, gin_channels, r)) - self.p2convs.append(LoRALinear1d(extend_channels, inner_channels, gin_channels, r)) - self.norms.append(LayerNorm(inner_channels)) - - def forward(self, x, x_mask, g): - x *= x_mask - for i in range(len(self.dconvs)): - residual = x.clone() - x = self.dconvs[i](x) - x = self.norms[i](x) - x *= x_mask - x = self.p1convs[i](x, g) - x = self.act(x) - x = self.p2convs[i](x, g) - x = residual + x - return x - - def remove_weight_norm(self): - for c in self.dconvs: - c.remove_weight_norm() - for c in self.p1convs: - c.remove_weight_norm() - for c in self.p2convs: - c.remove_weight_norm() - - def fix_speaker(self, g): - for c in self.p1convs: - c.fix_speaker(g) - for c in self.p2convs: - c.fix_speaker(g) - - def unfix_speaker(self, g): - for c in self.p1convs: - c.unfix_speaker(g) - for c in self.p2convs: - c.unfix_speaker(g) - - -class SnakeFilter(torch.nn.Module): - """ - Adaptive filter using snakebeta - """ - def __init__(self, channels, groups, kernel_size, num_layers, eps=1e-6): - super(SnakeFilter, self).__init__() - self.eps = eps - self.num_layers = num_layers - inner_channels = channels * groups - self.init_conv = DilatedCausalConv1d(1, inner_channels, kernel_size) - self.dconvs = torch.nn.ModuleList() - self.pconvs = torch.nn.ModuleList() - self.post_conv = DilatedCausalConv1d(inner_channels+1, 1, kernel_size, bias=False) - - for i in range(self.num_layers): - self.dconvs.append(DilatedCausalConv1d(inner_channels, inner_channels, kernel_size, stride=1, groups=inner_channels, dilation=kernel_size ** (i + 1))) - self.pconvs.append(weight_norm(Conv1d(inner_channels, inner_channels, 1, groups=groups))) - self.snake_alpha = torch.nn.Parameter(torch.zeros(inner_channels), requires_grad=True) - self.snake_beta = torch.nn.Parameter(torch.zeros(inner_channels), requires_grad=True) - - def forward(self, x): - y = x.clone() - x = 
self.init_conv(x) - for i in range(self.num_layers): - # snake activation - x = self.dconvs[i](x) - x = self.pconvs[i](x) - x = x + (1.0 / torch.clip(self.snake_beta.unsqueeze(0).unsqueeze(-1), min=self.eps)) * torch.pow(torch.sin(x * self.snake_alpha.unsqueeze(0).unsqueeze(-1)), 2) - x = torch.cat([x, y], 1) - x = self.post_conv(x) - return x - - def remove_weight_norm(self): - self.init_conv.remove_weight_norm() - for c in self.dconvs: - c.remove_weight_norm() - for c in self.pconvs: - remove_weight_norm(c) - self.post_conv.remove_weight_norm() - -""" -https://github.com/charactr-platform/vocos/blob/main/vocos/heads.py -""" -class FourierHead(nn.Module): - """Base class for inverse fourier modules.""" - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Args: - x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, - L is the sequence length, and H denotes the model dimension. - - Returns: - Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. - """ - raise NotImplementedError("Subclasses must implement the forward method.") - - -class IMDCT(nn.Module): - """ - Inverse Modified Discrete Cosine Transform (IMDCT) module. - - Args: - frame_len (int): Length of the MDCT frame. - padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". - """ - - def __init__(self, frame_len: int, padding: str = "same"): - super().__init__() - if padding not in ["center", "same"]: - raise ValueError("Padding must be 'center' or 'same'.") - self.padding = padding - self.frame_len = frame_len * 2 - N = frame_len - n0 = (N + 1) / 2 - window = torch.from_numpy(scipy.signal.cosine(N * 2)).float() - self.register_buffer("window", window) - - pre_twiddle = torch.exp(1j * torch.pi * n0 * torch.arange(N * 2) / N) - post_twiddle = torch.exp(1j * torch.pi * (torch.arange(N * 2) + n0) / (N * 2)) - self.register_buffer("pre_twiddle", torch.view_as_real(pre_twiddle)) - self.register_buffer("post_twiddle", torch.view_as_real(post_twiddle)) - - def forward(self, X: torch.Tensor) -> torch.Tensor: - """ - Apply the Inverse Modified Discrete Cosine Transform (IMDCT) to the input MDCT coefficients. - - Args: - X (Tensor): Input MDCT coefficients of shape (B, N, L), where B is the batch size, - L is the number of frames, and N is the number of frequency bins. - - Returns: - Tensor: Reconstructed audio waveform of shape (B, T), where T is the length of the audio. - """ - X = X.transpose(1, 2) - B, L, N = X.shape - Y = torch.zeros((B, L, N * 2), dtype=X.dtype, device=X.device) - Y[..., :N] = X - Y[..., N:] = -1 * torch.conj(torch.flip(X, dims=(-1,))) - y = torch.fft.ifft(Y * torch.view_as_complex(self.pre_twiddle).expand(Y.shape), dim=-1) - y = torch.real(y * torch.view_as_complex(self.post_twiddle).expand(y.shape)) * np.sqrt(N) * np.sqrt(2) - result = y * self.window.expand(y.shape) - output_size = (1, (L + 1) * N) - audio = torch.nn.functional.fold( - result.transpose(1, 2), - output_size=output_size, - kernel_size=(1, self.frame_len), - stride=(1, self.frame_len // 2), - )[:, 0, 0, :] - - if self.padding == "center": - pad = self.frame_len // 2 - elif self.padding == "same": - pad = self.frame_len // 4 - else: - raise ValueError("Padding must be 'center' or 'same'.") - - audio = audio[:, pad:-pad] - return audio.unsqueeze(1) - - -class IMDCTSymExpHead(FourierHead): - """ - IMDCT Head module for predicting MDCT coefficients with symmetric exponential function - - Args: - dim (int): Hidden dimension of the model. 
- mdct_frame_len (int): Length of the MDCT frame. - padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". - sample_rate (int, optional): The sample rate of the audio. If provided, the last layer will be initialized - based on perceptual scaling. Defaults to None. - clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False. - """ - - def __init__( - self, dim: int, gin_channels: int, mdct_frame_len: int, padding: str = "same", sample_rate: int = 24000, - ): - super().__init__() - out_dim = mdct_frame_len - self.dconv = DilatedCausalConv1d(dim, dim, 5, 1, dim, 1) - self.pconv1 = LoRALinear1d(dim, dim * 2, gin_channels, 2) - self.pconv2 = LoRALinear1d(dim * 2, out_dim, gin_channels, 2) - self.act = torch.nn.GELU() - self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding) - - if sample_rate is not None: - # optionally init the last layer following mel-scale - m_max = _hz_to_mel(sample_rate // 2) - m_pts = torch.linspace(0, m_max, out_dim) - f_pts = _mel_to_hz(m_pts) - scale = 1 - (f_pts / f_pts.max()) - - with torch.no_grad(): - self.pconv2.main_fc.weight.mul_(scale.view(-1, 1, 1)) - - def forward(self, x: torch.Tensor, g: torch.Tensor) -> torch.Tensor: - """ - Forward pass of the IMDCTSymExpHead module. - - Args: - x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, - L is the sequence length, and H denotes the model dimension. - - Returns: - Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. - """ - x = self.dconv(x) - x = self.pconv1(x, g) - x = self.act(x) - x = self.pconv2(x, g) - x = symexp(x) - x = torch.clip(x, min=-1e2, max=1e2) # safeguard to prevent excessively large magnitudes - audio = self.imdct(x) - return audio - - def remove_weight_norm(self): - self.dconv.remove_weight_norm() - self.pconv1.remove_weight_norm() - self.pconv2.remove_weight_norm() - - def fix_speaker(self, g): - self.pconv1.fix_speaker(g) - self.pconv2.fix_speaker(g) - - def unfix_speaker(self, g): - self.pconv1.unfix_speaker(g) - self.pconv2.unfix_speaker(g) - -def symexp(x: torch.Tensor) -> torch.Tensor: - return torch.sign(x) * (torch.exp(x.abs()) - 1) \ No newline at end of file diff --git a/server/voice_changer/RVC/inferencer/voras_beta/transforms.py b/server/voice_changer/RVC/inferencer/voras_beta/transforms.py deleted file mode 100644 index 6f30b7177..000000000 --- a/server/voice_changer/RVC/inferencer/voras_beta/transforms.py +++ /dev/null @@ -1,207 +0,0 @@ -import numpy as np -import torch -from torch.nn import functional as F - -DEFAULT_MIN_BIN_WIDTH = 1e-3 -DEFAULT_MIN_BIN_HEIGHT = 1e-3 -DEFAULT_MIN_DERIVATIVE = 1e-3 - - -def piecewise_rational_quadratic_transform( - inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails=None, - tail_bound=1.0, - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE, -): - if tails is None: - spline_fn = rational_quadratic_spline - spline_kwargs = {} - else: - spline_fn = unconstrained_rational_quadratic_spline - spline_kwargs = {"tails": tails, "tail_bound": tail_bound} - - outputs, logabsdet = spline_fn( - inputs=inputs, - unnormalized_widths=unnormalized_widths, - unnormalized_heights=unnormalized_heights, - unnormalized_derivatives=unnormalized_derivatives, - inverse=inverse, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - 
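Two definitions from this deleted module are worth spelling out. SnakeFilter applies the snake-beta activation with per-channel learned α and β,

\mathrm{snake}_{\alpha,\beta}(x) = x + \frac{1}{\max(\beta, \varepsilon)} \sin^2(\alpha x),

so with the zero initialization used above the nonlinear term vanishes at the start of training (sin²(0·x) = 0). The output nonlinearity of IMDCTSymExpHead is

\mathrm{symexp}(x) = \mathrm{sign}(x)\,\left(e^{|x|} - 1\right),

the inverse of symlog, which lets the head predict MDCT coefficients across a large dynamic range; the clip to ±100 right after it is the safeguard the code comment mentions.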
**spline_kwargs - ) - return outputs, logabsdet - - -def searchsorted(bin_locations, inputs, eps=1e-6): - bin_locations[..., -1] += eps - return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 - - -def unconstrained_rational_quadratic_spline( - inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails="linear", - tail_bound=1.0, - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE, -): - inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) - outside_interval_mask = ~inside_interval_mask - - outputs = torch.zeros_like(inputs) - logabsdet = torch.zeros_like(inputs) - - if tails == "linear": - unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) - constant = np.log(np.exp(1 - min_derivative) - 1) - unnormalized_derivatives[..., 0] = constant - unnormalized_derivatives[..., -1] = constant - - outputs[outside_interval_mask] = inputs[outside_interval_mask] - logabsdet[outside_interval_mask] = 0 - else: - raise RuntimeError("{} tails are not implemented.".format(tails)) - - ( - outputs[inside_interval_mask], - logabsdet[inside_interval_mask], - ) = rational_quadratic_spline( - inputs=inputs[inside_interval_mask], - unnormalized_widths=unnormalized_widths[inside_interval_mask, :], - unnormalized_heights=unnormalized_heights[inside_interval_mask, :], - unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], - inverse=inverse, - left=-tail_bound, - right=tail_bound, - bottom=-tail_bound, - top=tail_bound, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - ) - - return outputs, logabsdet - - -def rational_quadratic_spline( - inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - left=0.0, - right=1.0, - bottom=0.0, - top=1.0, - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE, -): - if torch.min(inputs) < left or torch.max(inputs) > right: - raise ValueError("Input to a transform is not within its domain") - - num_bins = unnormalized_widths.shape[-1] - - if min_bin_width * num_bins > 1.0: - raise ValueError("Minimal bin width too large for the number of bins") - if min_bin_height * num_bins > 1.0: - raise ValueError("Minimal bin height too large for the number of bins") - - widths = F.softmax(unnormalized_widths, dim=-1) - widths = min_bin_width + (1 - min_bin_width * num_bins) * widths - cumwidths = torch.cumsum(widths, dim=-1) - cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) - cumwidths = (right - left) * cumwidths + left - cumwidths[..., 0] = left - cumwidths[..., -1] = right - widths = cumwidths[..., 1:] - cumwidths[..., :-1] - - derivatives = min_derivative + F.softplus(unnormalized_derivatives) - - heights = F.softmax(unnormalized_heights, dim=-1) - heights = min_bin_height + (1 - min_bin_height * num_bins) * heights - cumheights = torch.cumsum(heights, dim=-1) - cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) - cumheights = (top - bottom) * cumheights + bottom - cumheights[..., 0] = bottom - cumheights[..., -1] = top - heights = cumheights[..., 1:] - cumheights[..., :-1] - - if inverse: - bin_idx = searchsorted(cumheights, inputs)[..., None] - else: - bin_idx = searchsorted(cumwidths, inputs)[..., None] - - input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] - input_bin_widths = widths.gather(-1, 
bin_idx)[..., 0] - - input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] - delta = heights / widths - input_delta = delta.gather(-1, bin_idx)[..., 0] - - input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] - input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] - - input_heights = heights.gather(-1, bin_idx)[..., 0] - - if inverse: - a = (inputs - input_cumheights) * ( - input_derivatives + input_derivatives_plus_one - 2 * input_delta - ) + input_heights * (input_delta - input_derivatives) - b = input_heights * input_derivatives - (inputs - input_cumheights) * ( - input_derivatives + input_derivatives_plus_one - 2 * input_delta - ) - c = -input_delta * (inputs - input_cumheights) - - discriminant = b.pow(2) - 4 * a * c - assert (discriminant >= 0).all() - - root = (2 * c) / (-b - torch.sqrt(discriminant)) - outputs = root * input_bin_widths + input_cumwidths - - theta_one_minus_theta = root * (1 - root) - denominator = input_delta + ( - (input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta - ) - derivative_numerator = input_delta.pow(2) * ( - input_derivatives_plus_one * root.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - root).pow(2) - ) - logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) - - return outputs, -logabsdet - else: - theta = (inputs - input_cumwidths) / input_bin_widths - theta_one_minus_theta = theta * (1 - theta) - - numerator = input_heights * ( - input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta - ) - denominator = input_delta + ( - (input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta - ) - outputs = input_cumheights + numerator / denominator - - derivative_numerator = input_delta.pow(2) * ( - input_derivatives_plus_one * theta.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - theta).pow(2) - ) - logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) - - return outputs, logabsdet diff --git a/server/voice_changer/RVC/inferencer/voras_beta/utils.py b/server/voice_changer/RVC/inferencer/voras_beta/utils.py deleted file mode 100644 index ee1f3b17a..000000000 --- a/server/voice_changer/RVC/inferencer/voras_beta/utils.py +++ /dev/null @@ -1,243 +0,0 @@ -import glob -import logging -import os -import shutil -import socket -import sys - -import ffmpeg -import numpy as np -import torch -from scipy.io.wavfile import read -from torch.nn import functional as F - -from modules.shared import ROOT_DIR - -from .config import TrainConfig - - -class AWP: - """ - Fast AWP - https://www.kaggle.com/code/junkoda/fast-awp - """ - def __init__(self, model, optimizer, *, adv_param='weight', - adv_lr=0.01, adv_eps=0.01): - self.model = model - self.optimizer = optimizer - self.adv_param = adv_param - self.adv_lr = adv_lr - self.adv_eps = adv_eps - self.backup = {} - - def perturb(self): - """ - Perturb model parameters for AWP gradient - Call before loss and loss.backward() - """ - self._save() # save model parameters - self._attack_step() # perturb weights - - def _attack_step(self): - e = 1e-6 - for name, param in self.model.named_parameters(): - if param.requires_grad and param.grad is not None and self.adv_param in name: - grad = self.optimizer.state[param]['exp_avg'] - norm_grad = torch.norm(grad) - norm_data = torch.norm(param.detach()) - - if norm_grad != 0 and not torch.isnan(norm_grad): - # Set lower and upper limit in change - limit_eps = 
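The spline machinery in this deleted transforms.py implements the monotonic rational-quadratic splines of Durkan et al., "Neural Spline Flows" (2019). Inside bin k, with θ = (x − x_k)/w_k, bin slope s_k = h_k/w_k, and knot derivatives d_k, the forward branch evaluates

g(x) = y_k + \frac{h_k\left[s_k\,\theta^2 + d_k\,\theta(1-\theta)\right]}{s_k + \left[d_{k+1} + d_k - 2 s_k\right]\,\theta(1-\theta)}

(input_heights, input_delta, and input_derivatives in the code), and the inverse branch solves the corresponding quadratic in θ, taking the numerically stable root 2c / (−b − √(b² − 4ac)).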
self.adv_eps * param.detach().abs() - param_min = param.data - limit_eps - param_max = param.data + limit_eps - - # Perturb along gradient - # w += (adv_lr * |w| / |grad|) * grad - param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e))) - - # Apply the limit to the change - param.data.clamp_(param_min, param_max) - - def _save(self): - for name, param in self.model.named_parameters(): - if param.requires_grad and param.grad is not None and self.adv_param in name: - if name not in self.backup: - self.backup[name] = param.clone().detach() - else: - self.backup[name].copy_(param.data) - - def restore(self): - """ - Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients - Call after loss.backward(), before optimizer.step() - """ - for name, param in self.model.named_parameters(): - if name in self.backup: - param.data.copy_(self.backup[name]) - - -def load_audio(file: str, sr): - try: - # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 - # This launches a subprocess to decode audio while down-mixing and resampling as necessary. - # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. - file = ( - file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) # Prevent small white copy path head and tail with spaces and " and return - out, _ = ( - ffmpeg.input(file, threads=0) - .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) - .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) - ) - except Exception as e: - raise RuntimeError(f"Failed to load audio: {e}") - - return np.frombuffer(out, np.float32).flatten() - - -def find_empty_port(): - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.bind(("", 0)) - s.listen(1) - port = s.getsockname()[1] - s.close() - return port - - -def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") - - saved_state_dict = checkpoint_dict["model"] - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict = {} - for k, v in state_dict.items(): # 模型需要的shape - try: - new_state_dict[k] = saved_state_dict[k] - if saved_state_dict[k].shape != state_dict[k].shape: - print( - f"shape-{k}-mismatch|need-{state_dict[k].shape}|get-{saved_state_dict[k].shape}" - ) - if saved_state_dict[k].dim() == 2: # NOTE: check is this ok? - # for embedded input 256 <==> 768 - # this achieves we can continue training from original's pretrained checkpoints when using embedder that 768-th dim output etc. 
- if saved_state_dict[k].dtype == torch.half: - new_state_dict[k] = ( - F.interpolate( - saved_state_dict[k].float().unsqueeze(0).unsqueeze(0), - size=state_dict[k].shape, - mode="bilinear", - ) - .half() - .squeeze(0) - .squeeze(0) - ) - else: - new_state_dict[k] = ( - F.interpolate( - saved_state_dict[k].unsqueeze(0).unsqueeze(0), - size=state_dict[k].shape, - mode="bilinear", - ) - .squeeze(0) - .squeeze(0) - ) - print( - "interpolated new_state_dict", - k, - "from", - saved_state_dict[k].shape, - "to", - new_state_dict[k].shape, - ) - else: - raise KeyError - except Exception as e: - # print(traceback.format_exc()) - print(f"{k} is not in the checkpoint") - print("error: %s" % e) - new_state_dict[k] = v # 模型自带的随机值 - if hasattr(model, "module"): - model.module.load_state_dict(new_state_dict, strict=False) - else: - model.load_state_dict(new_state_dict, strict=False) - print("Loaded model weights") - - epoch = checkpoint_dict["epoch"] - learning_rate = checkpoint_dict["learning_rate"] - if optimizer is not None and load_opt == 1: - optimizer.load_state_dict(checkpoint_dict["optimizer"]) - print("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, epoch)) - return model, optimizer, learning_rate, epoch - - -def save_state(model, optimizer, learning_rate, epoch, checkpoint_path): - print( - "Saving model and optimizer state at epoch {} to {}".format( - epoch, checkpoint_path - ) - ) - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save( - { - "model": state_dict, - "epoch": epoch, - "optimizer": optimizer.state_dict(), - "learning_rate": learning_rate, - }, - checkpoint_path, - ) - - -def summarize( - writer, - global_step, - scalars={}, - histograms={}, - images={}, - audios={}, - audio_sampling_rate=22050, -): - for k, v in scalars.items(): - writer.add_scalar(k, v, global_step) - for k, v in histograms.items(): - writer.add_histogram(k, v, global_step) - for k, v in images.items(): - writer.add_image(k, v, global_step, dataformats="HWC") - for k, v in audios.items(): - writer.add_audio(k, v, global_step, audio_sampling_rate) - - -def latest_checkpoint_path(dir_path, regex="G_*.pth"): - filelist = glob.glob(os.path.join(dir_path, regex)) - if len(filelist) == 0: - return None - filelist.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) - filepath = filelist[-1] - return filepath - - -def load_wav_to_torch(full_path): - sampling_rate, data = read(full_path) - return torch.FloatTensor(data.astype(np.float32)), sampling_rate - - -def load_config(training_dir: str, sample_rate: int, emb_channels: int): - if emb_channels == 256: - config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json") - else: - config_path = os.path.join( - ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json" - ) - config_save_path = os.path.join(training_dir, "config.json") - - shutil.copyfile(config_path, config_save_path) - - return TrainConfig.parse_file(config_save_path) diff --git a/server/voice_changer/RVC/modelMerger/MergeModel.py b/server/voice_changer/RVC/modelMerger/MergeModel.py index a54c388e2..5634deec4 100644 --- a/server/voice_changer/RVC/modelMerger/MergeModel.py +++ b/server/voice_changer/RVC/modelMerger/MergeModel.py @@ -7,6 +7,8 @@ from voice_changer.utils.ModelMerger import ModelMergerRequest from settings import ServerSettings import json +import logging +logger = logging.getLogger(__name__) def merge_model(params: ServerSettings, request: ModelMergerRequest): def extract(ckpt: Dict[str, Any]): @@ 
-21,7 +23,7 @@ def extract(ckpt: Dict[str, Any]): return opt def load_weight(path: str): - print(f"Loading {path}...") + logger.info(f"Loading {path}...") if path.endswith('.safetensors'): with safe_open(path, 'pt', device='cpu') as cpt: state_dict = cpt.metadata() @@ -38,8 +40,7 @@ def load_weight(path: str): files = request.files if len(files) == 0: - print("no merge file..............") - raise RuntimeError("no merge file..............") + raise RuntimeError("No merge file.") weights = [] alphas = [] @@ -64,12 +65,12 @@ def load_weight(path: str): merged: Dict[str, Any] = OrderedDict() merged["weight"] = {} - print("merge start.") + logger.info("Merge started.") for key in weights[0].keys(): merged["weight"][key] = 0 for i, weight in enumerate(weights): merged["weight"][key] += weight[key] * alphas[i] - print("merge done. write metadata.") + logger.info("Merge done. Writing metadata.") merged["config"] = config merged["params"] = state_dict["params"] if "params" in state_dict else None @@ -83,5 +84,5 @@ def load_weight(path: str): pass merged["embedder_name"] = state_dict["embedder_name"] if "embedder_name" in state_dict else None merged["embedder_output_layer"] = state_dict["embedder_output_layer"] if "embedder_output_layer" in state_dict else None - print("write metadata done.") + logger.info("Metadata written.") return merged
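For reference, the merge loop above computes a weighted sum over matching keys, merged[k] = Σ_i alpha_i · w_i[k]. A minimal, self-contained sketch of the same idea, assuming every checkpoint shares identical keys and tensor shapes (the alphas are applied as given, exactly as in the diff; normalizing them to sum to 1 keeps the result at the original scale):

import torch

def merge_state_dicts(weights: list[dict[str, torch.Tensor]], alphas: list[float]) -> dict[str, torch.Tensor]:
    # Weighted sum of matching tensors across checkpoints
    merged: dict[str, torch.Tensor] = {}
    for key in weights[0]:
        merged[key] = sum(w[key].float() * a for w, a in zip(weights, alphas))
    return merged

a = {"w": torch.ones(2)}
b = {"w": torch.zeros(2)}
print(merge_state_dicts([a, b], [0.75, 0.25])["w"])  # tensor([0.7500, 0.7500])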
diff --git a/server/voice_changer/RVC/onnxExporter/export2onnx.py b/server/voice_changer/RVC/onnxExporter/export2onnx.py
index dc46e3c79..52989dc1b 100644
--- a/server/voice_changer/RVC/onnxExporter/export2onnx.py
+++ b/server/voice_changer/RVC/onnxExporter/export2onnx.py
@@ -11,6 +11,8 @@
 from ..inferencer.rvc_models.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM  # type: ignore
 from settings import ServerSettings
 from io import BytesIO
+import logging
+logger = logging.getLogger(__name__)
 
 def export2onnx(modelSlot: RVCModelSlot):
     model_dir = ServerSettings().model_dir
@@ -30,7 +32,7 @@ def export2onnx(modelSlot: RVCModelSlot):
         "useFinalProj": modelSlot.useFinalProj,
     }
 
-    print("[Voice Changer] Exporting onnx...")
+    logger.info("Exporting onnx...")
     _export2onnx(modelFile, output_path_simple, metadata)
 
     return output_file_simple
@@ -59,7 +61,7 @@ def _export2onnx(input_model: str, output_model_simple: str, metadata: dict):
         'params': cpt['params']
     }
 
-    print(f'[Voice Changer] Exporting to ONNX on {dev}')
+    logger.info(f'Exporting to ONNX on {dev}')
 
     # EnumInferenceTypes cannot be serialized as-is, so convert it to text
     if metadata["modelType"] == EnumInferenceTypes.pyTorchRVC.value:
@@ -76,11 +78,7 @@ def _export2onnx(input_model: str, output_model_simple: str, metadata: dict):
     elif metadata["modelType"] == EnumInferenceTypes.pyTorchRVCv2Nono.value:
         net_g_onnx = SynthesizerTrnMsNSFsidM(*data["config"], 768, is_half=is_half)
     else:
-        print(
-            "unknwon::::: ",
-            metadata["modelType"],
-            EnumInferenceTypes.pyTorchRVCv2.value,
-        )
+        logger.error(f"Unknown model type: {metadata['modelType']}")
         return
 
     net_g_onnx.eval().to(dev)
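
The pattern this patch applies across all of these files is the standard module-level logger, configured once at the entrypoint. A minimal sketch of that convention (module and format names below are illustrative, not from this repository):

```python
import logging

logger = logging.getLogger(__name__)  # at the top of each module

def do_work():
    logger.info("Exporting onnx...")  # replaces print("[Voice Changer] ...")

if __name__ == "__main__":
    # One-time configuration at the entrypoint; module loggers inherit it,
    # so the old "[Voice Changer]" prefix can move into the format string.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    )
    do_work()
```
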
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 764faabbe..9da3db53f 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -14,7 +14,7 @@
 import onnxruntime
 from torchaudio import transforms as tat
 
 from voice_changer.common.deviceManager.DeviceManager import DeviceManager
-from mods.log_control import VoiceChangaerLogger
+import logging
 from voice_changer.common.TorchUtils import circular_write
 from voice_changer.RVC.embedder.Embedder import Embedder
@@ -24,7 +24,7 @@
 from voice_changer.utils.Timer import Timer2
 from const import F0_MEL_MIN, F0_MEL_MAX
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 class Pipeline:
@@ -114,7 +114,7 @@ def getPipelineInfo(self):
     def setPitchExtractor(self, pitchExtractor: PitchExtractor):
         self.pitchExtractor = pitchExtractor
 
-    def extractPitch(self, audio: torch.Tensor, pitch: torch.Tensor | None, pitchf: torch.Tensor | None, f0_up_key: int, formant_shift: float) -> tuple[torch.Tensor, torch.Tensor]:
+    def extract_pitch(self, audio: torch.Tensor, pitch: torch.Tensor | None, pitchf: torch.Tensor | None, f0_up_key: int, formant_shift: float) -> tuple[torch.Tensor, torch.Tensor]:
         f0 = self.pitchExtractor.extract(
             audio,
             self.sr,
@@ -137,20 +137,6 @@ def extractPitch(self, audio: torch.Tensor, pitch: torch.Tensor | None, pitchf:
         return pitch.unsqueeze(0), pitchf.unsqueeze(0)
 
-    def extract_features(self, feats: torch.Tensor, embOutputLayer: int, useFinalProj: bool):
-        try:
-            return self.embedder.extract_features(feats, embOutputLayer, useFinalProj)
-        except RuntimeError as e:
-            print("Failed to extract features:", e)
-            raise e
-
-    def infer(self, feats: torch.Tensor, p_len: torch.Tensor, pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, skip_head: int | None, return_length: int | None, formant_length: int | None) -> torch.Tensor:
-        try:
-            return self.inferencer.infer(feats, p_len, pitch, pitchf, sid, skip_head, return_length, formant_length)
-        except RuntimeError as e:
-            print("Failed to infer:", e)
-            raise e
-
     def _search_index(self, audio: torch.Tensor, top_k: int = 1):
         if top_k == 1:
             _, ix = self.index.search(audio if self.use_gpu_index else audio.detach().cpu(), 1)
@@ -198,11 +184,11 @@ def exec(
         t.record("pre-process")
 
         # Pitch detection
-        pitch, pitchf = self.extractPitch(audio[silence_front:], pitch, pitchf, f0_up_key, formant_shift) if self.use_f0 else (None, None)
+        pitch, pitchf = self.extract_pitch(audio[silence_front:], pitch, pitchf, f0_up_key, formant_shift) if self.use_f0 else (None, None)
         t.record("extract-pitch")
 
         # embedding
-        feats = self.extract_features(audio.view(1, -1), embOutputLayer, useFinalProj)
+        feats = self.embedder.extract_features(audio.view(1, -1), embOutputLayer, useFinalProj)
         feats = torch.cat((feats, feats[:, -1:, :]), 1)
         t.record("extract-feats")
@@ -245,7 +231,7 @@ def exec(
         sid = torch.tensor([sid], device=self.device, dtype=torch.int64)
         t.record("mid-precess")
         # Run inference
-        out_audio = self.infer(feats, p_len, pitch, pitchf, sid, skip_head, return_length, formant_length).float()
+        out_audio = self.inferencer.infer(feats, p_len, pitch, pitchf, sid, skip_head, return_length, formant_length).float()
         t.record("infer")
 
         # Formant shift sample rate adjustment
@@ -260,5 +246,4 @@ def exec(
             out_audio = self.resamplers[scaled_window](
                 out_audio[: return_length * scaled_window]
             )
-        # print("EXEC AVERAGE:", t.avrSecs)
         return out_audio
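
For context on the `pitch`/`pitchf` pair and the `F0_MEL_MIN`/`F0_MEL_MAX` import above: RVC-style pipelines typically shift the continuous f0 curve by `f0_up_key` semitones, then quantize it into 255 coarse mel-scale bins for the model. A sketch under that assumption; the exact range constants and bin count live in `const.py` and are not shown in this diff:

```python
import torch

# Assumed f0 range of 50..1100 Hz; the real bounds come from const.py.
F0_MEL_MIN = 1127 * torch.log(torch.tensor(1 + 50 / 700)).item()
F0_MEL_MAX = 1127 * torch.log(torch.tensor(1 + 1100 / 700)).item()

def to_coarse(f0: torch.Tensor, f0_up_key: int) -> tuple[torch.Tensor, torch.Tensor]:
    pitchf = f0 * 2 ** (f0_up_key / 12)          # shift the continuous curve by semitones
    f0_mel = 1127 * torch.log(1 + pitchf / 700)  # Hz -> mel
    f0_mel = (f0_mel - F0_MEL_MIN) * 254 / (F0_MEL_MAX - F0_MEL_MIN) + 1
    pitch = torch.clamp(f0_mel.round(), 1, 255).long()  # coarse bins 1..255
    return pitch, pitchf

pitch, pitchf = to_coarse(torch.tensor([220.0, 440.0]), f0_up_key=12)  # up one octave
```
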
diff --git a/server/voice_changer/RVC/pipeline/PipelineGenerator.py b/server/voice_changer/RVC/pipeline/PipelineGenerator.py
index c767aeb24..627fca04a 100644
--- a/server/voice_changer/RVC/pipeline/PipelineGenerator.py
+++ b/server/voice_changer/RVC/pipeline/PipelineGenerator.py
@@ -1,10 +1,8 @@
 import os
 import sys
-import traceback
 import faiss
 import faiss.contrib.torch_utils
 import torch
-from Exceptions import PipelineCreateException
 from data.ModelSlot import RVCModelSlot
 
 from voice_changer.common.deviceManager.DeviceManager import DeviceManager
@@ -14,24 +12,16 @@
 from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
 from settings import ServerSettings
 
+import logging
+logger = logging.getLogger(__name__)
 
 def createPipeline(params: ServerSettings, modelSlot: RVCModelSlot, f0Detector: str, force_reload: bool):
     # Create the inferencer
-    try:
-        modelPath = os.path.join(params.model_dir, str(modelSlot.slotIndex), os.path.basename(modelSlot.modelFile))
-        inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelPath, modelSlot.version)
-    except Exception as e:
-        print("[Voice Changer] exception! loading inferencer", e)
-        traceback.print_exc()
-        raise PipelineCreateException("[Voice Changer] exception! loading inferencer")
+    modelPath = os.path.join(params.model_dir, str(modelSlot.slotIndex), os.path.basename(modelSlot.modelFile))
+    inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelPath, modelSlot.version)
 
     # Create the embedder
-    try:
-        embedder = EmbedderManager.get_embedder(modelSlot.embedder, force_reload)
-    except Exception as e:
-        print("[Voice Changer] exception! loading embedder", e)
-        traceback.print_exc()
-        raise PipelineCreateException("[Voice Changer] exception! loading embedder")
+    embedder = EmbedderManager.get_embedder(modelSlot.embedder, force_reload)
 
     # pitchExtractor
     pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, force_reload)
@@ -57,26 +47,26 @@ def createPipeline(params: ServerSettings, modelSlot: RVCModelSlot, f0Detector:
 def _loadIndex(indexPath: str) -> tuple[faiss.Index | None, torch.Tensor | None]:
     dev = DeviceManager.get_instance().device
     # Load the index
-    print("[Voice Changer] Loading index...")
+    logger.info("Loading index...")
     # Return None if the specified file does not exist
     if os.path.exists(indexPath) is not True or os.path.isfile(indexPath) is not True:
-        print("[Voice Changer] Index file is not found")
+        logger.warning("Index file not found. Index will not be used.")
         return (None, None)
 
+    logger.info(f"Trying to load \"{indexPath}\"...")
     try:
-        print("Try loading...", indexPath)
         index: faiss.IndexIVFFlat = faiss.read_index(indexPath)
         if not index.is_trained:
-            print("[Voice Changer] Invalid index. You MUST use added_xxxx.index, not trained_xxxx.index. Index will not be used.")
+            logger.error("Invalid index. You MUST use added_xxxx.index, not trained_xxxx.index. Index will not be used.")
             return (None, None)
         # BUG: faiss-gpu does not support reconstruct on GPU indices
         # https://github.com/facebookresearch/faiss/issues/2181
         index_reconstruct = index.reconstruct_n(0, index.ntotal).to(dev)
         if sys.platform == 'linux' and '+cu' in torch.__version__ and dev.type == 'cuda':
             index: faiss.GpuIndexIVFFlat = faiss.index_cpu_to_gpus_list(index, gpus=[dev.index])
-    except:  # NOQA
-        print("[Voice Changer] Load index failed. Use no index.")
-        traceback.print_exc()
+    except Exception as e:
+        logger.error("Load index failed. Index will not be used.")
+        logger.exception(e)
        return (None, None)
 
     return index, index_reconstruct
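
The guarded index load in `_loadIndex`, reduced to its essentials: read the index, reject untrained ones, and reconstruct all stored vectors up front because `reconstruct` is unsupported on GPU indices. The file name here is hypothetical; in the server, `faiss.contrib.torch_utils` (imported above) additionally patches the index methods to work with torch tensors:

```python
import faiss
import numpy as np

index = faiss.read_index("added_model.index")  # must be an added_*.index
if not index.is_trained:
    raise ValueError("Index is not trained; use added_*.index, not trained_*.index")

# Pull every stored vector back out (ntotal x d), on CPU, before any GPU move.
vectors: np.ndarray = index.reconstruct_n(0, index.ntotal)
print(vectors.shape)
```
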
diff --git a/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py b/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py
index 29c2697da..4e8a9ebb0 100644
--- a/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py
+++ b/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py
@@ -8,7 +8,8 @@
 from voice_changer.RVC.pitchExtractor.FcpePitchExtractor import FcpePitchExtractor
 from voice_changer.RVC.pitchExtractor.FcpeOnnxPitchExtractor import FcpeOnnxPitchExtractor
 from settings import ServerSettings
-
+import logging
+logger = logging.getLogger(__name__)
 
 class PitchExtractorManager(Protocol):
     pitch_extractor: PitchExtractor | None = None
@@ -28,10 +29,10 @@ def loadPitchExtractor(cls, pitch_extractor: PitchExtractorType, force_reload: b
         if cls.pitch_extractor is not None \
                 and pitch_extractor == cls.pitch_extractor.type \
                 and not force_reload:
-            print('[Voice Changer] Reusing pitch extractor.')
+            logger.info('Reusing pitch extractor.')
             return cls.pitch_extractor
 
-        print(f'[Voice Changer] Loading pitch extractor {pitch_extractor}')
+        logger.info(f'Loading pitch extractor {pitch_extractor}')
         try:
             if pitch_extractor == 'crepe_tiny':
                 return CrepePitchExtractor(pitch_extractor, cls.params.crepe_tiny)
@@ -50,11 +51,9 @@ def loadPitchExtractor(cls, pitch_extractor: PitchExtractorType, force_reload: b
             elif pitch_extractor == "fcpe_onnx":
                 return FcpeOnnxPitchExtractor(cls.params.fcpe_onnx)
             else:
-                print(f"[Voice Changer] PitchExctractor not found {pitch_extractor}. Fallback to rmvpe_onnx")
+                logger.warning(f"PitchExtractor not found: {pitch_extractor}. Falling back to rmvpe_onnx.")
                 return RMVPEOnnxPitchExtractor(cls.params.rmvpe_onnx)
         except RuntimeError as e:
-            import traceback
-            print(traceback.format_exc())
-            print(e)
-            print(f'[Voice Changer] Failed to load {pitch_extractor}. Fallback to rmvpe_onnx.')
+            logger.error(f'Failed to load {pitch_extractor}. Falling back to rmvpe_onnx.')
+            logger.exception(e)
             return RMVPEOnnxPitchExtractor(cls.params.rmvpe_onnx)
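
The shape of `loadPitchExtractor` is a common load-with-fallback pattern: try the requested implementation, warn and fall back to a known-good default on failure. A generic sketch with stand-in names, not the server's actual classes:

```python
import logging

logger = logging.getLogger(__name__)

def load_extractor(name: str, registry: dict, default: str):
    factory = registry.get(name)
    if factory is None:
        logger.warning(f"Extractor not found: {name}. Falling back to {default}.")
        return registry[default]()
    try:
        return factory()
    except RuntimeError as e:
        logger.error(f"Failed to load {name}. Falling back to {default}.")
        logger.exception(e)
        return registry[default]()

# Usage with toy factories:
registry = {"rmvpe_onnx": lambda: "rmvpe", "crepe_tiny": lambda: "crepe"}
extractor = load_extractor("fcpe", registry, default="rmvpe_onnx")
```
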
{params}") await downloadSample(self.params.sample_mode, params.sampleId, self.params.model_dir, params.slot, params.params) self.modelSlotManager.getAllSlotInfo(reload=True) info = {"status": "OK"} @@ -114,14 +115,12 @@ async def load_model(self, params: LoadModelParams): ) dstPath = os.path.join(dstDir, file.name) os.makedirs(dstDir, exist_ok=True) - logger.info(f"move to {srcPath} -> {dstPath}") + logger.info(f"Moving {srcPath} -> {dstPath}") shutil.move(srcPath, dstPath) file.name = os.path.basename(dstPath) # メタデータ作成(各VCで定義) if params.voiceChangerType == "RVC": - from voice_changer.RVC.RVCModelSlotGenerator import RVCModelSlotGenerator # 起動時にインポートするとパラメータが取れない。 - slotInfo = RVCModelSlotGenerator.load_model(params) self.modelSlotManager.save_model_slot(params.slot, slotInfo) @@ -149,7 +148,7 @@ def get_info(self): def initialize(self, val: int): slotInfo = self.modelSlotManager.get_slot_info(val) if slotInfo is None: - logger.info(f"[Voice Changer] model slot is not found {val}") + logger.warn(f"Model slot is not found {val}") return if self.voiceChangerModel is not None and slotInfo.voiceChangerType == self.voiceChangerModel.voiceChangerType: @@ -159,17 +158,16 @@ def initialize(self, val: int): return if slotInfo.voiceChangerType == "RVC": - logger.info("................RVC") - from voice_changer.RVC.RVCr2 import RVCr2 + logger.info("Loading RVC...") self.voiceChangerModel = RVCr2(self.params, slotInfo, self.settings) self.voiceChanger = VoiceChangerV2(self.params, self.settings) self.voiceChanger.set_model(self.voiceChangerModel) else: - logger.info(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}") + logger.error(f"Unknown voice changer model: {slotInfo.voiceChangerType}") def update_settings(self, key: str, val: Any): - print("[Voice Changer] update configuration:", key, val) + logger.info(f"update configuration {key}: {val}") error, old_value = self.settings.set_property(key, val) if error: return self.get_info() @@ -181,7 +179,7 @@ def update_settings(self, key: str, val: Any): self.store_setting() if key == "modelSlotIndex": - logger.info(f"[Voice Changer] Model slot is changed {old_value} -> {val}") + logger.info(f"Model slot is changed {old_value} -> {val}") self.initialize(val) elif key == 'gpu': self.device_manager.set_device(val) @@ -213,23 +211,20 @@ def changeVoice(self, receivedData: AudioInOut) -> tuple[AudioInOut, tuple, tupl return receivedData, [0, 0, 0], None if self.voiceChanger is None: - logger.info("Voice Change is not loaded. Did you load a correct model?") + logger.error("Voice Change is not loaded. Did you load a correct model?") return np.zeros(1, dtype=np.float32), [0, 0, 0], ('NoVoiceChangerLoaded', "Voice Change is not loaded. Did you load a correct model?") try: - audio, perf = self.voiceChanger.on_request(receivedData) + with self.device_manager.lock: + audio, perf = self.voiceChanger.on_request(receivedData) return audio, perf, None - except NoModeLoadedException: - logger.warn(f"[Voice Changer] [Exception], {e}") - return np.zeros(1, dtype=np.float32), [0, 0, 0], ('NoModeLoadedException', format_exc()) - except VoiceChangerIsNotSelectedException: - logger.warn("[Voice Changer] Voice Changer is not selected. 
diff --git a/server/voice_changer/VoiceChangerSettings.py b/server/voice_changer/VoiceChangerSettings.py
index bd58dfcc2..54eb4bc99 100644
--- a/server/voice_changer/VoiceChangerSettings.py
+++ b/server/voice_changer/VoiceChangerSettings.py
@@ -1,6 +1,9 @@
 # from const import PitchExtractorType
 from typing import NamedTuple
 
+import logging
+logger = logging.getLogger(__name__)
+
 IGNORED_KEYS = { 'version' }
 STATEFUL_KEYS = [ 'serverAudioStated', 'passThrough', 'recordIO' ]
@@ -42,13 +45,13 @@ def set_property(self, key, value) -> SetPropertyResult:
         if key in IGNORED_KEYS:
             return SetPropertyResult(error=False, old_value=None)
         if key not in cls.__dict__:
-            print(f'[VoiceChangerSettings] Failed to set setting: {key} does not exist')
+            logger.error(f'Failed to set setting: {key} does not exist')
             return SetPropertyResult(error=True, old_value=None)
         p = cls.__dict__[key]
         if not isinstance(p, property):
             return SetPropertyResult(error=True, old_value=None)
         if p.fset is None:
-            print(f'[VoiceChangerSettings] Failed to set setting: {key} is immutable.')
+            logger.error(f'Failed to set setting: {key} is immutable.')
             return SetPropertyResult(error=True, old_value=None)
         old_value = p.fget(self)
         p.fset(self, value)
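
The introspection trick `set_property` relies on: class-level properties are looked up in `cls.__dict__`, and a missing `fset` marks a setting as read-only. A self-contained sketch with a toy settings class:

```python
class Settings:
    def __init__(self):
        self._gain = 1.0

    @property
    def gain(self):
        return self._gain

    @gain.setter
    def gain(self, value):
        self._gain = float(value)

    @property
    def version(self):  # no setter: immutable, p.fset is None
        return "1.0"

s = Settings()
p = type(s).__dict__["gain"]
assert isinstance(p, property) and p.fset is not None
old = p.fget(s)   # read the current value through the property object
p.fset(s, 2.0)    # write through the property object
assert s.gain == 2.0 and old == 1.0
assert type(s).__dict__["version"].fset is None  # detected as immutable
```
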
diff --git a/server/voice_changer/VoiceChangerV2.py b/server/voice_changer/VoiceChangerV2.py
index 1f6da67da..4a1fa0f81 100644
--- a/server/voice_changer/VoiceChangerV2.py
+++ b/server/voice_changer/VoiceChangerV2.py
@@ -5,7 +5,7 @@
 import torch
 import os
 import numpy as np
-from mods.log_control import VoiceChangaerLogger
+import logging
 from voice_changer.IORecorder import IORecorder
 from voice_changer.VoiceChangerSettings import VoiceChangerSettings
@@ -20,7 +20,7 @@
 STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
 STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
 
-logger = VoiceChangaerLogger.get_instance().getLogger()
+logger = logging.getLogger(__name__)
 
 class VoiceChangerV2(VoiceChangerIF):
@@ -39,9 +39,6 @@ def __init__(self, params: ServerSettings, settings: VoiceChangerSettings):
         self.sola_buffer: torch.Tensor | None = None
         self.ioRecorder: IORecorder | None = None
 
-        logger.info(f"VoiceChangerV2 Initialized")
-        np.set_printoptions(threshold=10000)
-
     def set_model(self, model: VoiceChangerModel):
         self.voiceChangerModel = model
@@ -68,7 +65,7 @@ def get_info(self):
     def update_settings(self, key: str, val: Any, old_val: Any):
         if self.voiceChangerModel is None:
-            logger.warning("Voice Changer model is not selected.")
+            logger.warning("Voice Changer model is not selected.")
             return
 
         if key == "serverReadChunkSize":
@@ -85,7 +82,7 @@ def update_settings(self, key: str, val: Any, old_val: Any):
                     self.settings.outputSampleRate,  # 16000,
                 )
-                print(f"-------------------------- - - - {self.settings.inputSampleRate}, {self.settings.outputSampleRate}")
+                logger.info(f"Started IORecorder: input sample rate {self.settings.inputSampleRate}, output sample rate {self.settings.outputSampleRate}")
             else:
                 self.ioRecorder.close()
         elif key == "inputSampleRate":
@@ -122,7 +119,7 @@ def _generate_strength(self):
         # The size differs from the previous result, so clear the recorded buffer.
         self.sola_buffer = torch.zeros(self.crossfade_frame, device=self.device_manager.device, dtype=torch.float32)
 
-        logger.info(f'[Voice Changer] Allocated sola buffer size: {self.sola_buffer.shape}')
+        logger.info(f'Allocated SOLA buffer size: {self.crossfade_frame}')
 
     def get_processing_sampling_rate(self):
         if self.voiceChangerModel is None:
@@ -179,12 +176,3 @@ def on_request(self, audio_in: AudioInOutFloat) -> tuple[AudioInOutFloat, list[U
     @torch.no_grad()
     def export2onnx(self):
         return self.voiceChangerModel.export2onnx()
-
-    ##############
-
-    def merge_models(self, request: str):
-        if self.voiceChangerModel is None:
-            logger.info("[Voice Changer] Voice Changer is not selected.")
-            return
-        self.voiceChangerModel.merge_models(request)
-        return self.get_info()
diff --git a/server/voice_changer/common/MelExtractorFcpe.py b/server/voice_changer/common/MelExtractorFcpe.py
index 86be3c757..5c4197fdc 100644
--- a/server/voice_changer/common/MelExtractorFcpe.py
+++ b/server/voice_changer/common/MelExtractorFcpe.py
@@ -2,6 +2,9 @@
 from .STFT import STFT
 from librosa.filters import mel
 
+import logging
+logger = logging.getLogger(__name__)
+
 # This module is used by FCPE
 # Modules are copied from torchfcpe and modified
 def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
@@ -153,9 +156,9 @@ def __call__(self,
         y = y.squeeze(-1)
 
         if torch.min(y) < -1.:
-            print('[error with torchfcpe.mel_extractor.MelModule]min value is ', torch.min(y))
+            logger.error(f'Input min value {torch.min(y)} is out of range')
         if torch.max(y) > 1.:
-            print('[error with torchfcpe.mel_extractor.MelModule]max value is ', torch.max(y))
+            logger.error(f'Input max value {torch.max(y)} is out of range')
 
         pad_left = (self.win_size - self.hop_length) // 2
         pad_right = max((self.win_size - self.hop_length + 1) // 2, self.win_size - y.size(-1) - pad_left)
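
The range checks above guard `dynamic_range_compression_torch`, which in torchfcpe-derived code is log compression with a clamp floor; the clamp is why out-of-range input is only logged rather than fatal. A sketch of that standard form, assumed from torchfcpe rather than shown in this diff:

```python
import torch

def dynamic_range_compression_torch(x: torch.Tensor, C: float = 1, clip_val: float = 1e-5) -> torch.Tensor:
    # clamp avoids log(0) on silent mel bins; C rescales before the log
    return torch.log(torch.clamp(x, min=clip_val) * C)

mel = torch.rand(1, 128, 50)          # toy mel spectrogram in [0, 1)
log_mel = dynamic_range_compression_torch(mel)
assert torch.isfinite(log_mel).all()  # the clamp keeps every value finite
```
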
diff --git a/server/voice_changer/common/OnnxLoader.py b/server/voice_changer/common/OnnxLoader.py
index 879bb6d1d..fd88f5638 100644
--- a/server/voice_changer/common/OnnxLoader.py
+++ b/server/voice_changer/common/OnnxLoader.py
@@ -8,6 +8,8 @@
 from onnxruntime.transformers.fusion_utils import FusionUtils
 from onnxruntime.transformers.onnx_model import OnnxModel
 
+import logging
+logger = logging.getLogger(__name__)
 
 def load_onnx_model(fpath: str, is_half: bool) -> ModelProto:
     if is_half:
@@ -25,20 +27,24 @@ def load_cached_fp16_model(fpath: str) -> ModelProto:
     fname, _ = os.path.splitext(os.path.basename(fpath))
     fp16_fpath = os.path.join(os.path.dirname(fpath), f'{fname}.fp16.onnx')
     if original_hash is None:
+        logger.info('Converting model to FP16...')
         model = convert_fp16(onnx.load(fpath))
         onnx.save(model, fp16_fpath)
         with open(fpath, 'rb') as f:
             computed_hash = compute_hash(f, xxh128())
         with open(hashfile, 'w', encoding='utf-8') as f:
             f.write(computed_hash)
+        logger.info('Done!')
     else:
         with open(fpath, 'rb') as f:
             computed_hash = compute_hash(f, xxh128())
         if computed_hash != original_hash:
+            logger.info('Original model has changed. Regenerating FP16 model...')
             model = convert_fp16(onnx.load(fpath))
             onnx.save(model, fp16_fpath)
             with open(hashfile, 'w', encoding='utf-8') as f:
                 f.write(computed_hash)
+            logger.info('Done!')
         else:
             model = onnx.load(fp16_fpath)
 
     return model
diff --git a/server/voice_changer/common/deviceManager/DeviceManager.py b/server/voice_changer/common/deviceManager/DeviceManager.py
index 4dc4b7fa9..d8d69352c 100644
--- a/server/voice_changer/common/deviceManager/DeviceManager.py
+++ b/server/voice_changer/common/deviceManager/DeviceManager.py
@@ -3,12 +3,16 @@
 import re
 from typing import TypedDict, Literal
 from enum import IntFlag
+from threading import Lock
 
 try:
     import torch_directml
 except ImportError:
     import voice_changer.common.deviceManager.DummyDML as torch_directml
 
+import logging
+logger = logging.getLogger(__name__)
+
 class CoreMLFlag(IntFlag):
     USE_CPU_ONLY = 0x001
     ENABLE_ON_SUBGRAPH = 0x002
@@ -42,10 +46,11 @@ def __init__(self):
         self.dml_enabled: bool = torch_directml.is_available()
         self.fp16_available = False
         self.force_fp32 = False
-        print('[Voice Changer] Initialized DeviceManager. Available backends:')
-        print(f'[Voice Changer] * DirectML: {self.dml_enabled}, device count: {torch_directml.device_count()}')
-        print(f'[Voice Changer] * CUDA: {self.cuda_enabled}, device count: {torch.cuda.device_count()}')
-        print(f'[Voice Changer] * MPS: {self.mps_enabled}')
+        self.lock = Lock()
+        logger.info('Initialized DeviceManager. Backend statuses:')
+        logger.info(f'* DirectML: {self.dml_enabled}, device count: {torch_directml.device_count()}')
+        logger.info(f'* CUDA: {self.cuda_enabled}, device count: {torch.cuda.device_count()}')
+        logger.info(f'* MPS: {self.mps_enabled}')
 
     def initialize(self, device_id: int, force_fp32: bool):
         self.set_device(device_id)
@@ -61,7 +66,7 @@ def set_device(self, id: int):
         self.device = device
         self.device_metadata = metadata
         self.fp16_available = self.is_fp16_available()
-        print(f'[Voice Changer] Switched to {metadata["name"]} ({device}). FP16 support: {self.fp16_available}')
+        logger.info(f'Switched to {metadata["name"]} ({device}). FP16 support: {self.fp16_available}')
 
     def use_fp16(self):
         return self.fp16_available and not self.force_fp32
diff --git a/server/voice_changer/common/rmvpe/rmvpe.py b/server/voice_changer/common/rmvpe/rmvpe.py
index b9b0c1d4a..8a73dfd36 100644
--- a/server/voice_changer/common/rmvpe/rmvpe.py
+++ b/server/voice_changer/common/rmvpe/rmvpe.py
@@ -8,9 +8,6 @@
 from voice_changer.common.SafetensorsUtils import load_model
 from librosa.filters import mel
 
-from mods.log_control import VoiceChangaerLogger
-
-logger = VoiceChangaerLogger.get_instance().getLogger()
 
 class BiGRU(nn.Module):
diff --git a/server/voice_changer/utils/VoiceChangerModel.py b/server/voice_changer/utils/VoiceChangerModel.py
index c7a9ad8e7..be97c1a2f 100644
--- a/server/voice_changer/utils/VoiceChangerModel.py
+++ b/server/voice_changer/utils/VoiceChangerModel.py
@@ -52,6 +52,3 @@ def realloc(self, block_frame: int, extra_frame: int, crossfade_frame: int, sola
     def export2onnx() -> Any:
         ...
-
-    def merge_models(request: str) -> Any:
-        ...
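
The cache-invalidation scheme in `load_cached_fp16_model`, generalized: keep a sidecar hash of the expensive conversion's input and regenerate only when that hash changes. Paths and the convert step below are illustrative, and the whole file is read into memory for hashing, which the real code avoids via its streaming `compute_hash` helper:

```python
import os
from xxhash import xxh128

def cached_convert(src: str, dst: str, convert) -> None:
    hashfile = dst + ".xxh128"
    with open(src, "rb") as f:
        current = xxh128(f.read()).hexdigest()
    stored = None
    if os.path.exists(hashfile):
        with open(hashfile, "r", encoding="utf-8") as f:
            stored = f.read().strip()
    if stored != current or not os.path.exists(dst):
        convert(src, dst)  # the expensive step, e.g. FP16 conversion
        with open(hashfile, "w", encoding="utf-8") as f:
            f.write(current)

# Usage with a hypothetical conversion function:
# cached_convert("model.onnx", "model.fp16.onnx", my_convert_fn)
```
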