acambece25
/

csm-tts-endpoint

Model card Files Files and versions

xet

Community

acambece25 committed on Oct 1, 2025

Commit

99dfb5a

verified ·

1 Parent(s): b7b57b9

Update handler.py

Browse files

Files changed (1) hide show

handler.py +52 -75

handler.py CHANGED Viewed

@@ -1,114 +1,91 @@
-# handler.py — Sesame CSM endpoint: 24kHz output + leading-silence trim
-import os, io, wave, base64, math
-import numpy as np
-import torch
-from transformers import AutoProcessor, CsmForConditionalGeneration
 MODEL_ID = "sesame/csm-1b"
 TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
-TARGET_SR = 24000        # <- hard target sample rate for telephony bridge
-TRIM_DBFS = -42          # leading silence threshold
-TRIM_MAX_MS = 350        # max leading trim
-SPEED_MULTIPLIER = float(os.environ.get("CSM_SPEED_MULTIPLIER", "1.0"))  # optional 1.10–1.25
-device = "cuda" if torch.cuda.is_available() else "cpu"
-processor = AutoProcessor.from_pretrained(MODEL_ID, token=TOKEN)
-model = CsmForConditionalGeneration.from_pretrained(
-    MODEL_ID, token=TOKEN,
-    torch_dtype=torch.float16 if device == "cuda" else None
-).to(device)
 def _trim_leading_silence(x: np.ndarray, sr: int, thresh_dbfs: float, max_ms: int):
     x = np.asarray(x, dtype=np.float32)
-    thresh = (10.0 ** (thresh_dbfs / 20.0))
-    max_samples = int(sr * max(0, max_ms) / 1000)
     cut = 0
-    for i in range(min(len(x), max_samples)):
-        if abs(x[i]) > thresh:
-            cut = i
-            break
-        if i == min(len(x), max_samples) - 1:
-            cut = i
     return x[cut:], int(round(cut * 1000 / sr))
 def _tempo_boost(x: np.ndarray, sr: int, speed: float):
-    if not (speed and speed > 1.01):
-        return x
-    # crude tempo increase via resample to higher SR then back to original SR (raises pitch a bit)
     up_sr = int(round(sr * speed))
-    x_up = _resample_linear(x, sr, up_sr)
-    return _resample_linear(x_up, up_sr, sr)
-def _resample_linear(x: np.ndarray, src_sr: int, dst_sr: int):
-    if src_sr == dst_sr or len(x) == 0:
-        return x.astype(np.float32, copy=False)
-    # linear interpolation in float32
-    ratio = float(dst_sr) / float(src_sr)
-    out_len = max(1, int(round(len(x) * ratio)))
-    t = np.linspace(0.0, len(x) - 1, num=out_len, dtype=np.float32)
-    i0 = np.floor(t).astype(np.int32)
-    i1 = np.minimum(i0 + 1, len(x) - 1)
-    frac = t - i0
-    y = (1.0 - frac) * x[i0] + frac * x[i1]
-    return y.astype(np.float32, copy=False)
 def _float_to_wav_bytes(x: np.ndarray, sr: int) -> bytes:
-    # clamp -> int16
     x = np.clip(np.asarray(x, dtype=np.float32), -1.0, 1.0)
     i16 = (x * 32767.0).astype(np.int16)
     buf = io.BytesIO()
     with wave.open(buf, "wb") as wf:
-        wf.setnchannels(1)
-        wf.setsampwidth(2)
-        wf.setframerate(int(sr))
         wf.writeframes(i16.tobytes())
     return buf.getvalue()
 class EndpointHandler:
-    def __init__(self, path: str = ""):
-        pass
     def __call__(self, data: dict):
         try:
             text = (data.get("inputs") or data.get("text") or "").strip()
-            # ensure Sesame speaker prefix exists
-            if text and not text.startswith("["):
                 text = f"[0]{text}"
-            # 1) generate audio (model-native rate)
-            inputs = processor(text, add_special_tokens=True).to(model.device)
             audio = model.generate(**inputs, output_audio=True)
-            if isinstance(audio, torch.Tensor):
                 audio = audio.detach().cpu().float().numpy()
-            # 2) trim leading silence (model often leaves a big gap)
-            audio, _ = _trim_leading_silence(audio, sr=TARGET_SR, thresh_dbfs=TRIM_DBFS, max_ms=TRIM_MAX_MS)
-            # 3) tempo boost if requested (optional)
-            audio = _tempo_boost(audio, TARGET_SR, SPEED_MULTIPLIER)
-            # 4) upsample/downsample to TARGET_SR
-            audio_24k = _resample_linear(audio, src_sr=TARGET_SR, dst_sr=TARGET_SR)
-            # 5) to WAV (24k mono 16-bit)
-            wav_bytes = _float_to_wav_bytes(audio_24k, TARGET_SR)
-            b64 = base64.b64encode(wav_bytes).decode("ascii")
             return {
                 "status_code": 200,
-                "statusCode": 200,
                 "headers": {"Content-Type": "audio/wav"},
-                "body": b64,
-                "isBase64Encoded": True,
-                "is_base64_encoded": True,
             }
         except Exception as e:
-            return {
-                "status_code": 500,
-                "statusCode": 500,
-                "headers": {"Content-Type": "text/plain"},
-                "body": f"CSM error: {e}",
-                "isBase64Encoded": False,
-                "is_base64_encoded": False,
-            }

+# handler.py — Sesame CSM @ 24kHz + trim + optional tempo
+import os, io, wave, base64, numpy as np
+from transformers import AutoProcessor, AutoModel
 MODEL_ID = "sesame/csm-1b"
 TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+TARGET_SR = 24000                # force 24 kHz out
+TRIM_DBFS = -42                  # leading silence cutoff (≈ quiet room)
+TRIM_MAX_MS = 350                # cap leading trim
+SPEED_MULTIPLIER = float(os.environ.get("CSM_SPEED_MULTIPLIER", "1.0"))  # e.g. 1.12..1.22
+# ---- load via remote code (avoids missing Csm* import) ----
+processor = AutoProcessor.from_pretrained(MODEL_ID, token=TOKEN, trust_remote_code=True)
+model = AutoModel.from_pretrained(MODEL_ID, token=TOKEN, trust_remote_code=True)
+def _resample_linear(x: np.ndarray, src_sr: int, dst_sr: int):
+    if src_sr == dst_sr or x.size == 0: return x.astype(np.float32, copy=False)
+    ratio = float(dst_sr) / float(src_sr)
+    out_len = max(1, int(round(x.size * ratio)))
+    t = np.linspace(0.0, x.size - 1, num=out_len, dtype=np.float32)
+    i0 = np.floor(t).astype(np.int32)
+    i1 = np.minimum(i0 + 1, x.size - 1)
+    frac = t - i0
+    y = (1.0 - frac) * x[i0] + frac * x[i1]
+    return y.astype(np.float32, copy=False)
 def _trim_leading_silence(x: np.ndarray, sr: int, thresh_dbfs: float, max_ms: int):
     x = np.asarray(x, dtype=np.float32)
+    thresh = 10.0 ** (thresh_dbfs / 20.0)
+    max_n = int(sr * max(0, max_ms) / 1000)
     cut = 0
+    for i in range(min(x.size, max_n)):
+        if abs(x[i]) > thresh: cut = i; break
+        if i == min(x.size, max_n) - 1: cut = i
     return x[cut:], int(round(cut * 1000 / sr))
 def _tempo_boost(x: np.ndarray, sr: int, speed: float):
+    if not (speed and speed > 1.01): return x
     up_sr = int(round(sr * speed))
+    return _resample_linear(_resample_linear(x, sr, up_sr), up_sr, sr)
 def _float_to_wav_bytes(x: np.ndarray, sr: int) -> bytes:
     x = np.clip(np.asarray(x, dtype=np.float32), -1.0, 1.0)
     i16 = (x * 32767.0).astype(np.int16)
     buf = io.BytesIO()
     with wave.open(buf, "wb") as wf:
+        wf.setnchannels(1); wf.setsampwidth(2); wf.setframerate(int(sr))
         wf.writeframes(i16.tobytes())
     return buf.getvalue()
 class EndpointHandler:
+    def __init__(self, path: str = ""): pass
     def __call__(self, data: dict):
         try:
             text = (data.get("inputs") or data.get("text") or "").strip()
+            if not text:
+                return {"status_code":400,"headers":{"Content-Type":"text/plain"},"body":"Missing text"}
+            # CSM speaker prefix if absent
+            if not text.startswith("["):
                 text = f"[0]{text}"
+            # generate (model defines its own rate internally)
+            inputs = processor(text, add_special_tokens=True)
+            # sesame remote code supports output_audio=True
             audio = model.generate(**inputs, output_audio=True)
+            if hasattr(audio, "cpu"):  # torch tensor
                 audio = audio.detach().cpu().float().numpy()
+            audio = np.asarray(audio, dtype=np.float32)
+            # trim + (optional) tempo boost
+            audio, _ = _trim_leading_silence(audio, TARGET_SR, TRIM_DBFS, TRIM_MAX_MS)
+            if SPEED_MULTIPLIER and SPEED_MULTIPLIER > 1.01:
+                audio = _tempo_boost(audio, TARGET_SR, SPEED_MULTIPLIER)
+            # normalize gentle
+            peak = float(np.max(np.abs(audio))) or 1.0
+            if peak > 0: audio = (audio / peak) * 0.85
+            wav_b64 = base64.b64encode(_float_to_wav_bytes(audio, TARGET_SR)).decode("ascii")
             return {
                 "status_code": 200,
                 "headers": {"Content-Type": "audio/wav"},
+                "body": wav_b64,
+                "isBase64Encoded": True
             }
         except Exception as e:
+            return {"status_code":500,"headers":{"Content-Type":"text/plain"},"body":f"CSM error: {e}"}