ogflash committed
Commit d3a18f1 · 1 Parent(s): 69e578b

Fix: added web UI for ASR API

Files changed (2)
  1. main.py +34 -21
  2. static/index.html +210 -0
main.py CHANGED
@@ -1,13 +1,15 @@
 import os
 import torch
 import torchaudio
-from transformers import AutoModel # For the new model
-from pydub import AudioSegment # Requires ffmpeg installed on system
-import aiofiles # For asynchronous file operations
-import uuid # For generating unique filenames
+from transformers import AutoModel
+from pydub import AudioSegment
+import aiofiles
+import uuid

 from fastapi import FastAPI, HTTPException, File, UploadFile
-from starlette.concurrency import run_in_threadpool # For running blocking code in background thread
+from starlette.concurrency import run_in_threadpool
+from starlette.staticfiles import StaticFiles # <-- NEW IMPORT
+from starlette.responses import HTMLResponse, RedirectResponse # <-- NEW IMPORT

 # -----------------------------------------------------------
 # 1. FastAPI App Instance
@@ -16,7 +18,7 @@ app = FastAPI()

 # -----------------------------------------------------------
 # 2. Global Variables (for model and directories)
-# These will be initialized during startup
+# These will be initialized during startup
 # -----------------------------------------------------------
 ASR_MODEL = None
 DEVICE = None
@@ -27,7 +29,7 @@ TARGET_SAMPLE_RATE = 16000 # Required sample rate for the new model

 # -----------------------------------------------------------
 # 3. Startup Event: Load Model and Create Directories
-# This runs once when the FastAPI application starts
+# This runs once when the FastAPI application starts
 # -----------------------------------------------------------
 @app.on_event("startup")
 async def startup_event():
@@ -41,12 +43,29 @@ async def startup_event():
     DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     ASR_MODEL = AutoModel.from_pretrained("ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True)
     ASR_MODEL.to(DEVICE)
-    ASR_MODEL.eval() # Set model to evaluation mode
+    ASR_MODEL.eval()

+# -----------------------------------------------------------
+# 4. Mount Static Files and Define Root Endpoint (NEW)
+# -----------------------------------------------------------
+# Mount the 'static' directory to serve HTML, CSS, JS files
+# This makes files like 'static/index.html' accessible at /static/index.html
+app.mount("/static", StaticFiles(directory="static"), name="static")
+
+# Define a root endpoint that serves your main HTML page
+@app.get("/", response_class=HTMLResponse)
+async def read_root():
+    try:
+        # FastAPI will serve this index.html when users visit the root URL of your Space
+        with open("static/index.html", "r", encoding="utf-8") as f:
+            return HTMLResponse(content=f.read())
+    except FileNotFoundError:
+        # This fallback should ideally not be hit if your Dockerfile copies files correctly
+        return HTMLResponse("<h1>Error: index.html not found!</h1><p>Please ensure 'static/index.html' exists in your project.</p>", status_code=404)

 # -----------------------------------------------------------
-# 4. Helper Function: Audio Conversion
-# This function performs the actual audio conversion (blocking operation)
+# 5. Helper Function: Audio Conversion (Existing Code)
+# This function performs the actual audio conversion (blocking operation)
 # -----------------------------------------------------------
 def _convert_audio_sync(input_path: str, output_path: str, target_sample_rate: int = TARGET_SAMPLE_RATE, channels: int = 1):
     audio = AudioSegment.from_file(input_path)
@@ -55,7 +74,7 @@ def _convert_audio_sync(input_path: str, output_path: str, target_sample_rate: i


 # -----------------------------------------------------------
-# 5. Main API Endpoint: Handle File Upload and Transcription
+# 6. Main API Endpoint: Handle File Upload and Transcription (Existing Code)
 # -----------------------------------------------------------
 @app.post('/transcribefile/')
 async def transcribe_file(file: UploadFile = File(...)):
@@ -63,7 +82,6 @@ async def transcribe_file(file: UploadFile = File(...)):
     unique_id = str(uuid.uuid4())
     uploaded_file_path = os.path.join(UPLOAD_DIR, f"{unique_id}_{file.filename}")
     converted_audio_path = os.path.join(CONVERTED_AUDIO_DIR, f"{unique_id}.wav")
-    #transcription_output_path_ctc = os.path.join(TRANSCRIPTION_OUTPUT_DIR, f"{unique_id}_ctc.txt")
     transcription_output_path_rnnt = os.path.join(TRANSCRIPTION_OUTPUT_DIR, f"{unique_id}_rnnt.txt")

     try:
@@ -77,7 +95,7 @@ async def transcribe_file(file: UploadFile = File(...)):
             raise HTTPException(status_code=400, detail="Uploaded file is empty or could not be saved.")

         # 5.4. Convert audio (run blocking operation in a thread pool)
-        # This is where pydub uses ffmpeg
+        # This is where pydub uses ffmpeg
         await run_in_threadpool(
             _convert_audio_sync, uploaded_file_path, converted_audio_path
         )
@@ -92,21 +110,16 @@ async def transcribe_file(file: UploadFile = File(...)):

         wav = wav.to(DEVICE) # Move tensor to the correct device

-        # 5.6. Perform transcription using both CTC and RNNT decoding
+        # 5.6. Perform transcription using RNNT decoding
         with torch.no_grad(): # Disable gradient calculation for inference
-            #transcription_ctc = ASR_MODEL(wav, "ml", "ctc")
             transcription_rnnt = ASR_MODEL(wav, "ml", "rnnt")

-        # 5.7. Save transcriptions (optional)
-        #async with aiofiles.open(transcription_output_path_ctc, "w", encoding="utf-8") as f:
-        #    await f.write(transcription_ctc)
-
+        # 5.7. Save transcription (optional)
         async with aiofiles.open(transcription_output_path_rnnt, "w", encoding="utf-8") as f:
             await f.write(transcription_rnnt)

-        # 5.8. Return the transcriptions
+        # 5.8. Return the transcription
         return {
-            # "ctc_transcription": transcription_ctc,
             "rnnt_transcription": transcription_rnnt
         }
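For a quick sanity check of the endpoint above, here is a minimal Python client sketch. The base URL, port, and sample file name are assumptions, not part of this commit; only the route and the "file" field name come from main.py.

import requests  # third-party HTTP client, assumed installed

BASE_URL = "http://localhost:7860"  # assumed port; adjust to your deployment

# The multipart field name must be "file" to match the UploadFile parameter.
with open("sample.wav", "rb") as audio:  # hypothetical sample file
    resp = requests.post(f"{BASE_URL}/transcribefile/",
                         files={"file": ("sample.wav", audio, "audio/wav")})
resp.raise_for_status()
print(resp.json()["rnnt_transcription"])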
static/index.html ADDED
@@ -0,0 +1,210 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>ASR Transcription App</title>
+    <style>
+        body {
+            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+            margin: 0;
+            padding: 20px;
+            background-color: #011227;
+            color: #333;
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            min-height: 100vh;
+            box-sizing: border-box;
+        }
+        .container {
+            max-width: 650px;
+            width: 100%;
+            margin: auto;
+            background: linear-gradient(135deg, #ffffff, #f0f8ff);
+            padding: 40px;
+            border-radius: 12px;
+            box-shadow: 0 5px 20px rgba(0,0,0,0.1);
+            border: 1px solid #d0e0f0;
+        }
+        h1 {
+            text-align: center;
+            color: #0056b3;
+            margin-bottom: 30px;
+            font-size: 2em;
+        }
+        .form-group {
+            margin-bottom: 25px;
+        }
+        label {
+            display: block;
+            margin-bottom: 8px;
+            font-weight: bold;
+            color: #555;
+        }
+        input[type="file"] {
+            display: block;
+            width: 100%;
+            padding: 12px;
+            border: 1px solid #a7d0e0;
+            border-radius: 6px;
+            box-sizing: border-box;
+            background-color: #fcfdff;
+            cursor: pointer;
+        }
+        input[type="file"]::-webkit-file-upload-button {
+            background-color: #007bff;
+            color: white;
+            padding: 8px 15px;
+            border: none;
+            border-radius: 4px;
+            cursor: pointer;
+            margin-right: 15px;
+            transition: background-color 0.2s ease;
+        }
+        input[type="file"]::-webkit-file-upload-button:hover {
+            background-color: #0056b3;
+        }
+        button {
+            background-color: #28a745;
+            color: white;
+            padding: 15px 25px;
+            border: none;
+            border-radius: 6px;
+            cursor: pointer;
+            font-size: 1.1em;
+            width: 100%;
+            transition: background-color 0.2s ease, transform 0.1s ease;
+        }
+        button:hover {
+            background-color: #218838;
+            transform: translateY(-2px);
+        }
+        button:disabled {
+            background-color: #cccccc;
+            cursor: not-allowed;
+        }
+        #loading {
+            text-align: center;
+            margin-top: 30px;
+            font-weight: bold;
+            color: #007bff;
+            font-size: 1.1em;
+            display: none; /* Hidden by default */
+        }
+        #response-card {
+            margin-top: 30px;
+            padding: 20px;
+            background-color: #f8fafd;
+            border: 1px solid #d0e0f0;
+            border-radius: 8px;
+            min-height: 80px;
+            box-shadow: inset 0 1px 3px rgba(0,0,0,0.05);
+        }
+        #response-card strong {
+            color: #0056b3;
+            display: block;
+            margin-bottom: 10px;
+            font-size: 1.1em;
+        }
+        #transcriptionOutput {
+            white-space: pre-wrap; /* Preserve whitespace and line breaks */
+            word-wrap: break-word; /* Break long words */
+            font-size: 1.05em;
+            color: #333;
+        }
+        .error {
+            color: #dc3545;
+            font-weight: bold;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>Audio Transcription</h1>
+        <form id="uploadForm">
+            <div class="form-group">
+                <label for="audioFile">Select an audio or video file:</label>
+                <input type="file" id="audioFile" name="file" accept="audio/*,video/*">
+            </div>
+            <button type="submit" id="submitButton">Transcribe Audio</button>
+        </form>
+
+        <div id="loading">Processing... Please wait, this might take a moment.</div>
+
+        <div id="response-card">
+            <strong>Transcription Output:</strong>
+            <span id="transcriptionOutput"></span>
+        </div>
+    </div>
+
+    <script>
+        const uploadForm = document.getElementById('uploadForm');
+        const audioFile = document.getElementById('audioFile');
+        const loadingDiv = document.getElementById('loading');
+        const transcriptionOutput = document.getElementById('transcriptionOutput');
+        const submitButton = document.getElementById('submitButton');
+
+        uploadForm.addEventListener('submit', async (event) => {
+            event.preventDefault(); // Prevent default form submission
+
+            transcriptionOutput.textContent = ''; // Clear previous output
+            transcriptionOutput.classList.remove('error'); // Remove error styling
+            loadingDiv.style.display = 'block'; // Show loading text
+            submitButton.disabled = true; // Disable button during processing
+
+            const file = audioFile.files[0];
+            if (!file) {
+                transcriptionOutput.textContent = 'Please select an audio or video file.';
+                transcriptionOutput.classList.add('error');
+                loadingDiv.style.display = 'none';
+                submitButton.disabled = false;
+                return;
+            }
+
+            const formData = new FormData();
+            formData.append('file', file); // 'file' must match the parameter name in your FastAPI endpoint
+
+            try {
+                // Use a relative path to the API endpoint
+                const response = await fetch('/transcribefile/', {
+                    method: 'POST',
+                    body: formData,
+                    // fetch will automatically set the 'Content-Type' header correctly for FormData
+                });
+
+                if (response.ok) { // Check if HTTP status is 2xx (e.g., 200 OK)
+                    const data = await response.json();
+                    transcriptionOutput.textContent = data.rnnt_transcription || 'No transcription found.';
+                } else {
+                    // Handle API errors (e.g., 400 Bad Request, 500 Internal Server Error)
+                    let errorMessage = `Error: ${response.status} - ${response.statusText}`;
+                    const rawText = await response.text(); // Read the body once; json() followed by text() on the same response would fail
+                    try {
+                        const errorData = JSON.parse(rawText); // FastAPI often returns JSON for errors
+                        if (errorData.detail) {
+                            errorMessage = `Error: ${response.status} - ${errorData.detail}`;
+                        } else {
+                            errorMessage = `Error: ${response.status} - ${JSON.stringify(errorData)}`;
+                        }
+                    } catch (e) {
+                        // If the response is not JSON, fall back to the raw text
+                        errorMessage = `Error: ${response.status} - ${rawText.substring(0, 200)}...`; // Limit length
+                    }
+                    transcriptionOutput.textContent = errorMessage;
+                    transcriptionOutput.classList.add('error');
+                    console.error('API Error:', errorMessage);
+                }
+            } catch (error) {
+                // Handle network errors (e.g., server unreachable)
+                transcriptionOutput.textContent = `Network error: ${error.message}. Please check your connection or try again.`;
+                transcriptionOutput.classList.add('error');
+                console.error('Fetch error:', error);
+            } finally {
+                loadingDiv.style.display = 'none'; // Hide loading text
+                submitButton.disabled = false; // Re-enable button
+            }
+        });
+    </script>
+</body>
+</html>
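To verify the new web UI routes without loading the ASR model, here is a minimal smoke-test sketch using FastAPI's TestClient. It assumes main.py is importable as main, the static/ directory is present in the working directory, and httpx is installed; none of this is part of the commit.

from fastapi.testclient import TestClient

from main import app

# Requests made without entering the client as a context manager do not fire
# startup events, so the heavy model load in startup_event() is skipped here.
client = TestClient(app)

def test_root_serves_index():
    resp = client.get("/")
    assert resp.status_code == 200
    assert "Audio Transcription" in resp.text  # <h1> text from static/index.html

def test_static_mount_serves_file():
    resp = client.get("/static/index.html")
    assert resp.status_code == 200

if __name__ == "__main__":
    test_root_serves_index()
    test_static_mount_serves_file()
    print("Web UI routes OK")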