Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| import utils | |
def image_to_text_prompt(image_path: str, metadata: dict = None) -> str:
    """Build the ``<image>`` text-prompt block for an image file.

    Args:
        image_path: Path to the image; only its base name appears in the prompt.
        metadata: Optional mapping rendered as one ``- key: value`` line per
            item. ``None`` (or any other falsy value) means no metadata.

    Returns:
        The prompt text: the file name and the metadata lines wrapped in
        ``<image>`` tags, ending with a newline.
    """
    entries = [f'- {key}: {value}' for key, value in (metadata or {}).items()]
    # Every entry starts on its own line below the "Metadata:" label; with no
    # entries nothing follows the label.
    metadata_lines = ''.join('\n' + entry for entry in entries)
    filename = os.path.basename(image_path)
    return f'''<image>
Filename: {filename}
Metadata: {metadata_lines}
</image>
'''
def video_to_text_prompt(video_path: str, metadata: dict = None) -> str:
    """Build the ``<video>`` text-prompt block for a video file.

    Args:
        video_path: Path to the video; only its base name appears in the prompt.
        metadata: Optional mapping rendered as one ``- key: value`` line per
            item. ``None`` (or any other falsy value) means no metadata.

    Returns:
        The prompt text: the file name and the metadata lines wrapped in
        ``<video>`` tags, ending with a newline.
    """
    entries = [f'- {key}: {value}' for key, value in (metadata or {}).items()]
    # Every entry starts on its own line below the "Metadata:" label; with no
    # entries nothing follows the label.
    metadata_lines = ''.join('\n' + entry for entry in entries)
    filename = os.path.basename(video_path)
    return f'''<video>
Filename: {filename}
Metadata: {metadata_lines}
</video>
'''
def video_segment_to_text_prompt(
    start: float,
    end: float,
    transcript_segments: list[dict],
    frame_paths: list[str]
) -> str:
    """Build the ``<video_segment>`` text-prompt block for one slice of a video.

    Args:
        start: Segment start in seconds; truncated with ``int()`` before it is
            formatted by ``utils.seconds_to_hms``.
        end: Segment end in seconds; handled the same way as ``start``.
        transcript_segments: Dicts read through their ``"start"``, ``"end"``
            and ``"text"`` keys; each one becomes a ``- start-end: text`` line.
        frame_paths: Frame image paths; each is wrapped in ``<image>`` tags on
            its own line.

    Returns:
        The prompt text: timespan, transcript lines and frame image tags
        wrapped in ``<video_segment>`` tags, ending with a newline.
    """
    # Overall timespan of the segment.
    timespan_text = f'{utils.seconds_to_hms(int(start))} - {utils.seconds_to_hms(int(end))}'

    # Each transcript entry carries its own leading newline, so the entries
    # begin on the line after the "Transcript:" label and an empty transcript
    # contributes nothing.
    transcript_lines = ''.join(
        f'\n- {utils.seconds_to_hms(int(segment["start"]), drop_hours=True)}'
        f'-{utils.seconds_to_hms(int(segment["end"]), drop_hours=True)}: {segment["text"]}'
        for segment in transcript_segments
    )

    # One <image> tag per frame, one tag per line.
    frame_images_lines = '\n'.join(
        f'<image>{frame_path}</image>' for frame_path in frame_paths
    )

    return f'''<video_segment>
Timespan: {timespan_text}
Transcript: {transcript_lines}
{frame_images_lines}
</video_segment>
'''