test with global model
Browse files
- sparktts/modules/speaker/perceiver_encoder.py +9 -18
- webui.py +11 -6
sparktts/modules/speaker/perceiver_encoder.py
CHANGED
@@ -15,6 +15,7 @@
 
 # Adapted from https://github.com/lucidrains/naturalspeech2-pytorch/blob/659bec7f7543e7747e809e950cc2f84242fbeec7/naturalspeech2_pytorch/naturalspeech2_pytorch.py#L532
 
+from collections import namedtuple
 from functools import wraps
 
 import torch
@@ -45,21 +46,6 @@ def once(fn):
 
 print_once = once(print)
 
-# Define config class at module level
-class EfficientAttentionConfig:
-    def __init__(self, enable_flash, enable_math, enable_mem_efficient):
-        self.enable_flash = enable_flash
-        self.enable_math = enable_math
-        self.enable_mem_efficient = enable_mem_efficient
-
-    def _asdict(self):
-        return {
-            'enable_flash': self.enable_flash,
-            'enable_math': self.enable_math,
-            'enable_mem_efficient': self.enable_mem_efficient
-        }
-
-
 # main class
 
 
@@ -77,7 +63,12 @@ class Attend(nn.Module):
             use_flash and version.parse(torch.__version__) < version.parse("2.0.0")
         ), "in order to use flash attention, you must be using pytorch 2.0 or above"
 
-        self.cpu_config = EfficientAttentionConfig(True, True, True)
+        # determine efficient attention configs for cuda and cpu
+        self.config = namedtuple(
+            "EfficientAttentionConfig",
+            ["enable_flash", "enable_math", "enable_mem_efficient"],
+        )
+        self.cpu_config = self.config(True, True, True)
         self.cuda_config = None
 
         if not torch.cuda.is_available() or not use_flash:
@@ -89,12 +80,12 @@ class Attend(nn.Module):
             print_once(
                 "A100 GPU detected, using flash attention if input tensor is on cuda"
             )
-            self.cuda_config = EfficientAttentionConfig(True, False, False)
+            self.cuda_config = self.config(True, False, False)
         else:
             print_once(
                 "Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda"
             )
-            self.cuda_config = EfficientAttentionConfig(False, True, True)
+            self.cuda_config = self.config(False, True, True)
 
     def get_mask(self, n, device):
         if exists(self.mask) and self.mask.shape[-1] >= n:
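The net effect in perceiver_encoder.py: the hand-rolled module-level EfficientAttentionConfig class is dropped and the config is rebuilt as a namedtuple inside Attend.__init__, matching the upstream naturalspeech2-pytorch code this file is adapted from. The one method the old class carried, _asdict(), is exactly what a namedtuple provides for free; upstream, the attention forward pass splats it into PyTorch's kernel-selection context manager. A minimal sketch of that consumption pattern, assuming the PyTorch 2.0-era torch.backends.cuda.sdp_kernel API (deprecated in newer releases in favor of torch.nn.attention.sdpa_kernel); tensor shapes here are illustrative:

from collections import namedtuple

import torch
import torch.nn.functional as F

# same three flags as the config now built in Attend.__init__
EfficientAttentionConfig = namedtuple(
    "EfficientAttentionConfig",
    ["enable_flash", "enable_math", "enable_mem_efficient"],
)

# the non-A100 cuda config from the diff above
config = EfficientAttentionConfig(
    enable_flash=False, enable_math=True, enable_mem_efficient=True
)

q = k = v = torch.randn(1, 8, 32, 64)  # (batch, heads, seq_len, dim_head)

# namedtuple supplies _asdict() natively, which is what the deleted
# hand-written _asdict() method existed to imitate
with torch.backends.cuda.sdp_kernel(**config._asdict()):
    out = F.scaled_dot_product_attention(q, k, v)

print(out.shape)  # torch.Size([1, 8, 32, 64])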
webui.py
CHANGED
@@ -25,6 +25,8 @@ from sparktts.utils.token_parser import LEVELS_MAP_UI
 from huggingface_hub import snapshot_download
 import spaces
 
+MODEL = None
+
 def initialize_model(model_dir=None, device="cpu"):
     """Load the model once at the beginning."""
 
@@ -38,8 +40,7 @@ def initialize_model(model_dir=None, device="cpu"):
     return model
 
 @spaces.GPU
-def generate(model,
-    text,
+def generate(text,
     prompt_speech,
     prompt_text,
     gender,
@@ -47,6 +48,10 @@ def generate(model,
     speed,
 ):
     """Generate audio from text."""
+
+    global MODEL
+    model = MODEL
+
     # if gpu available, move model to gpu
     if torch.cuda.is_available():
         model = model.to("cuda")
@@ -66,7 +71,6 @@ def generate(model,
 
 def run_tts(
     text,
-    model,
     prompt_text=None,
     prompt_speech=None,
     gender=None,
@@ -90,7 +94,7 @@ def run_tts(
     logging.info("Starting inference...")
 
     # Perform inference and save the output audio
-    wav = generate(model, text,
+    wav = generate(text,
         prompt_speech,
         prompt_text,
         gender,
@@ -109,6 +113,9 @@ def build_ui(model_dir, device=0):
 
     # Initialize model
    model = initialize_model(model_dir, device=device)
+
+    global MODEL
+    MODEL = model
 
     # Define callback function for voice cloning
     def voice_clone(text, prompt_text, prompt_wav_upload, prompt_wav_record):
@@ -123,7 +130,6 @@ def build_ui(model_dir, device=0):
 
         audio_output_path = run_tts(
             text,
-            model,
             prompt_text=prompt_text_clean,
             prompt_speech=prompt_speech
         )
@@ -141,7 +147,6 @@ def build_ui(model_dir, device=0):
         speed_val = LEVELS_MAP_UI[int(speed)]
         audio_output_path = run_tts(
             text,
-            model,
             gender=gender,
             pitch=pitch_val,
             speed=speed_val
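The webui.py half matches the commit title: rather than threading the loaded model through run_tts and into the @spaces.GPU-decorated generate as an argument, build_ui now stashes it in a module-level MODEL global that generate reads back. On ZeroGPU Spaces, a @spaces.GPU function executes in a GPU worker process and its arguments are serialized across that boundary, so a plausible motivation is keeping the large model object out of the pickled argument list; since the worker is forked from the main process, globals set at startup remain visible inside the call. A self-contained sketch of the pattern under those assumptions; HeavyModel is a hypothetical stand-in for the real model, not part of this repo:

import spaces
import torch

MODEL = None  # populated once at startup, read inside GPU calls


class HeavyModel(torch.nn.Module):
    """Hypothetical stand-in for the real model; not part of the repo."""

    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(16, 16)

    def forward(self, x):
        return self.proj(x)


def initialize_model():
    global MODEL
    MODEL = HeavyModel()
    return MODEL


@spaces.GPU
def generate(x):
    # read the model from the module global, so it is never part of the
    # serialized argument list sent to the GPU worker
    model = MODEL
    if torch.cuda.is_available():
        model = model.to("cuda")
        x = x.to("cuda")
    with torch.no_grad():
        return model(x).cpu()


initialize_model()
print(generate(torch.randn(2, 16)).shape)  # torch.Size([2, 16])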