import tempfile
from pathlib import Path

import gradio as gr
import spaces
from llama_cookbook.inference.model_utils import load_model as load_model_llamarecipes
from llama_cookbook.inference.model_utils import load_peft_model
from transformers import AutoTokenizer

from src.data.single_video import SingleVideo
from src.data.utils_asr import PromptASR
from src.models.llama_inference import inference
from src.test.vidchapters import get_chapters
from tools.download.models import download_base_model, download_model

# Set up proxies
# from urllib.request import getproxies
# proxies = getproxies()
# os.environ["HTTP_PROXY"] = os.environ["http_proxy"] = proxies["http"]
# os.environ["HTTPS_PROXY"] = os.environ["https_proxy"] = proxies["https"]
# os.environ["NO_PROXY"] = os.environ["no_proxy"] = "localhost, 127.0.0.1/8, ::1"

# Global variables to store loaded models
base_model = None
tokenizer = None
current_peft_model = None
inference_model = None

LLAMA_CKPT_PATH = "meta-llama/Meta-Llama-3.1-8B-Instruct"


def load_base_model():
    """Load the base Llama model and tokenizer once at startup."""
    global base_model, tokenizer

    if base_model is None:
        print(f"Loading base model: {LLAMA_CKPT_PATH}")
        # base_model = load_model_llamarecipes(
        #     model_name=LLAMA_CKPT_PATH,
        #     device_map="auto",
        #     quantization=None,
        #     use_fast_kernels=True,
        # )
        # tokenizer = AutoTokenizer.from_pretrained(LLAMA_CKPT_PATH)

        # Download the base checkpoint locally; it is stored under
        # <working dir>/<LLAMA_CKPT_PATH> (on the hosted Space this is
        # /home/user/app/meta-llama/Meta-Llama-3.1-8B-Instruct)
        download_base_model("lucas-ventura/chapter-llama", local_dir=".")
        model_path = str(Path(".").resolve() / LLAMA_CKPT_PATH)
        print(f"Model path: {model_path}")

        base_model = load_model_llamarecipes(
            model_name=model_path,
            device_map="auto",
            quantization=None,
            use_fast_kernels=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)

        base_model.eval()
        tokenizer.pad_token = tokenizer.eos_token
        print("Base model loaded successfully")


class FastLlamaInference:
    """Callable wrapper that binds the (PEFT) model, the module-level tokenizer,
    and default generation parameters, so generation only needs a prompt."""

    def __init__(
        self,
        model,
        add_special_tokens: bool = True,
        temperature: float = 1.0,
        max_new_tokens: int = 1024,
        top_p: float = 1.0,
        top_k: int = 50,
        use_cache: bool = True,
        max_padding_length: int | None = None,
        do_sample: bool = False,
        min_length: int | None = None,
        repetition_penalty: float = 1.0,
        length_penalty: int = 1,
        max_prompt_tokens: int = 35_000,
    ):
        self.model = model
        self.tokenizer = tokenizer  # module-level tokenizer set by load_base_model()
        self.add_special_tokens = add_special_tokens
        self.temperature = temperature
        self.max_new_tokens = max_new_tokens
        self.top_p = top_p
        self.top_k = top_k
        self.use_cache = use_cache
        self.max_padding_length = max_padding_length
        self.do_sample = do_sample
        self.min_length = min_length
        self.repetition_penalty = repetition_penalty
        self.length_penalty = length_penalty
        self.max_prompt_tokens = max_prompt_tokens

    def __call__(self, prompt: str, **kwargs):
        # Create a dict of default parameters from instance attributes
        params = {
            "model": self.model,
            "tokenizer": self.tokenizer,
            "prompt": prompt,
            "add_special_tokens": self.add_special_tokens,
            "temperature": self.temperature,
            "max_new_tokens": self.max_new_tokens,
            "top_p": self.top_p,
            "top_k": self.top_k,
            "use_cache": self.use_cache,
            "max_padding_length": self.max_padding_length,
            "do_sample": self.do_sample,
            "min_length": self.min_length,
            "repetition_penalty": self.repetition_penalty,
            "length_penalty": self.length_penalty,
            "max_prompt_tokens": self.max_prompt_tokens,
        }
        # Update with any overrides passed in kwargs
        params.update(kwargs)

        return inference(**params)
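
# Example use of the wrapper (a sketch; in this app the prompt is built by
# PromptASR inside process_video, and the PEFT model comes from load_peft):
#   llm = FastLlamaInference(model=peft_model)
#   raw_output = llm(prompt_text, max_new_tokens=256, do_sample=True)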


def load_peft(model_name: str = "asr-10k"):
    """Load or switch PEFT model while reusing the base model."""
    global base_model, current_peft_model, inference_model

    # First make sure the base model is loaded
    if base_model is None:
        load_base_model()

    # Only load a new PEFT model if it's different from the current one
    if current_peft_model != model_name:
        print(f"Loading PEFT model: {model_name}")
        model_path = download_model(model_name)

        if not Path(model_path).exists():
            print(f"PEFT model does not exist at {model_path}")
            return False

        # Apply the PEFT model to the base model
        peft_model = load_peft_model(base_model, model_path)
        peft_model.eval()

        # Create the inference wrapper
        inference_model = FastLlamaInference(model=peft_model)
        current_peft_model = model_name
        print(f"PEFT model {model_name} loaded successfully")
        return True

    # Model already loaded
    return True
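
# Example: switching between the two LoRA checkpoints offered in the dropdown
# below; the second call reuses the base model that is already in memory:
#   load_peft("asr-10k")
#   load_peft("asr-1k")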


# On ZeroGPU Spaces this decorator requests a GPU for the duration of the call
# and has no effect on other hardware; it is presumably why `spaces` is
# imported above.
@spaces.GPU
def process_video(video_file, model_name: str = "asr-10k", do_sample: bool = False):
    """Process a video file and generate chapters."""
    progress = gr.Progress()
    progress(0, desc="Starting...")

    # Check if we have a valid input
    if video_file is None:
        return "Please upload a video file."

    # Load the PEFT model
    progress(0.1, desc=f"Loading LoRA parameters from {model_name}...")
    if not load_peft(model_name):
        return "Failed to load model. Please try again."

    # Create a temporary directory to save the uploaded video
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_video_path = Path(temp_dir) / "temp_video.mp4"

        # Write the uploaded bytes to the temporary file
        progress(0.2, desc="Processing uploaded video...")
        with open(temp_video_path, "wb") as f:
            f.write(video_file)

        # Process the video
        progress(0.3, desc="Extracting ASR transcript...")
        single_video = SingleVideo(temp_video_path)

        progress(0.4, desc="Creating prompt...")
        prompt = PromptASR(chapters=single_video)

        vid_id = single_video.video_ids[0]
        progress(0.5, desc="Adding transcript to prompt...")
        prompt = prompt.get_prompt_test(vid_id)
        transcript = single_video.get_asr(vid_id)
        prompt = prompt + transcript

        progress(0.6, desc="Generating chapters with Chapter-Llama...")
        _, chapters = get_chapters(
            inference_model,
            prompt,
            max_new_tokens=1024,
            do_sample=do_sample,
            vid_id=vid_id,
        )

        # Format the output
        progress(0.9, desc="Formatting results...")
        output = ""
        for timestamp, text in chapters.items():
            output += f"{timestamp}: {text}\n"

        progress(1.0, desc="Complete!")
        return output
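
# Sketch of how the UI below invokes this function (the File component passes
# the upload as raw bytes because it is configured with type="binary"):
#   chapters_text = process_video(uploaded_bytes, model_name="asr-10k", do_sample=False)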


# Page head: tab title and favicon
head = """
<head>
    <title>Chapter-Llama - VidChapters</title>
    <link rel="icon" type="image/x-icon" href="./favicon.ico">
</head>
"""

title_markdown = """
<div style="display: flex; justify-content: space-between; align-items: center; background: linear-gradient(90deg, rgba(72,219,251,0.1), rgba(29,209,161,0.1)); border-radius: 20px; box-shadow: 0 4px 6px rgba(0,0,0,0.1); padding: 20px; margin-bottom: 20px;">
    <div style="display: flex; align-items: center;">
        <a href="https://github.com/lucas-ventura/chapter-llama" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
            <img src="https://imagine.enpc.fr/~lucas.ventura/chapter-llama/images/chapter-llama.png" alt="Chapter-Llama" style="max-width: 100px; height: auto; border-radius: 15px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
        </a>
        <div>
            <h1 style="margin: 0; background: linear-gradient(90deg, #8F68C3, #477EF4); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-size: 2.5em; font-weight: 700;">Chapter-Llama</h1>
            <h2 style="margin: 10px 0; background: linear-gradient(90deg, #8F68C3, #477EF4); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-size: 1.8em; font-weight: 600;">Efficient Chaptering in Hour-Long Videos with LLMs</h2>
            <div style="display: flex; gap: 15px; margin-top: 10px;">
                <a href="https://github.com/lucas-ventura/chapter-llama" style="text-decoration: none; color: #8F68C3; font-weight: 500; transition: color 0.3s;">GitHub</a> |
                <a href="https://imagine.enpc.fr/~lucas.ventura/chapter-llama/" style="text-decoration: none; color: #8F68C3; font-weight: 500; transition: color 0.3s;">Project Page</a> |
                <a href="https://arxiv.org/abs/2504.00072" style="text-decoration: none; color: #8F68C3; font-weight: 500; transition: color 0.3s;">Paper</a>
            </div>
        </div>
    </div>
    <div style="text-align: right; margin-left: 20px;">
        <h2 style="margin: 10px 0; color: #24467C; font-weight: 700; font-size: 2.5em;">CVPR 2025</h2>
    </div>
</div>
"""

note_html = """
<div style="background-color: #f9f9f9; border-left: 5px solid #48dbfb; padding: 20px; margin-top: 20px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);">
    <p style="font-size: 1.1em; color: #ff9933; margin-bottom: 10px; font-weight: bold;">Note: If you encounter any errors with this demo, you can run the code locally using the following commands:</p>
    <pre style="background-color: #f1f1f1; padding: 15px; border-radius: 5px; overflow-x: auto;">
# Clone the repository
git clone https://github.com/lucas-ventura/chapter-llama.git
cd chapter-llama

# Install demo dependencies
python -m pip install -e ".[demo]"

# Launch the demo
python demo.py</pre>
    <p style="font-size: 1.1em; color: #555; margin-bottom: 10px;">If you find any issues, please report them on our <a href="https://github.com/lucas-ventura/chapter-llama/issues" style="color: #8F68C3; text-decoration: none;">GitHub repository</a>.</p>
</div>
"""

# Citation from demo_sample.py (raw string so the BibTeX \" escape is kept verbatim)
bibtext = r"""
### Citation
```
@InProceedings{ventura25chapter,
    title = {{Chapter-Llama}: Efficient Chaptering in Hour-Long Videos with {LLM}s},
    author = {Lucas Ventura and Antoine Yang and Cordelia Schmid and G{\"u}l Varol},
    booktitle = {CVPR},
    year = {2025}
}
```
"""

# Create the Gradio interface
with gr.Blocks(title="Chapter-Llama", head=head) as demo:
    gr.HTML(title_markdown)
    gr.Markdown(
        """
This demo is currently using only the audio data (ASR), without frame information.
We will add audio+captions functionality in the near future, which will improve
chapter generation by incorporating visual content.
"""
    )

    with gr.Row():
        with gr.Column():
            video_input = gr.File(
                label="Upload Video or Audio File",
                file_types=["video", "audio"],
                type="binary",
            )
            model_dropdown = gr.Dropdown(
                choices=["asr-10k", "asr-1k"],
                value="asr-10k",
                label="Select Model",
            )
            do_sample = gr.Checkbox(
                label="Use random sampling", value=False, interactive=True
            )
            submit_btn = gr.Button("Generate Chapters")

        with gr.Column():
            status_area = gr.Markdown("**Status:** Ready to process video")
            output_text = gr.Textbox(
                label="Generated Chapters", lines=10, interactive=False
            )

    def update_status_and_process(video_file, model_name, do_sample):
        if video_file is None:
            return (
                "**Status:** No video uploaded",
                "Please upload a video file.",
            )
        else:
            return "**Status:** Processing video...", process_video(
                video_file, model_name, do_sample
            )

    # Load the base model at startup
    load_base_model()

    submit_btn.click(
        fn=update_status_and_process,
        inputs=[video_input, model_dropdown, do_sample],
        outputs=[status_area, output_text],
    )

    gr.Markdown(bibtext)
    gr.HTML(note_html)


if __name__ == "__main__":
    # Launch the Gradio app
    demo.launch()
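    # When running locally, standard Gradio launch options can be passed here,
    # e.g. demo.launch(share=True) for a temporary public link (not needed on
    # the hosted Space).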