""" Leaderboard HTML Table Generator Creates styled HTML tables for the leaderboard view """ import pandas as pd from typing import Optional from .metric_displays import ( get_rank_badge, get_success_rate_bar, get_gpu_utilization_bar, get_provider_badge, get_agent_type_badge, get_hardware_badge, format_cost, format_duration, get_tooltip_icon ) def generate_leaderboard_html( df: pd.DataFrame, sort_by: str = "success_rate", ascending: bool = False ) -> str: """ Generate styled HTML table for leaderboard Args: df: Leaderboard DataFrame sort_by: Column to sort by ascending: Sort order (False = descending) Returns: HTML string with complete styled table Expected DataFrame columns: - model (str): Model name - agent_type (str): tool, code, or both - provider (str): litellm or transformers - success_rate (float): 0-100 - total_tests (int): Number of tests - avg_duration_ms (float): Average duration - total_cost_usd (float): Total cost - co2_emissions_g (float): CO2 emissions - gpu_utilization_avg (float, optional): GPU utilization % - submitted_by (str): Username """ # Sort dataframe df_sorted = df.sort_values(by=sort_by, ascending=ascending).reset_index(drop=True) # Start HTML with embedded CSS html = """
""" # Generate table rows for idx, row in df_sorted.iterrows(): rank = idx + 1 # Convert row to dictionary for data attributes (like reference implementation) row_dict = row.to_dict() # Generate data attributes dynamically from all row data data_attrs_dict = {} for key, value in row_dict.items(): # Convert underscores to hyphens for HTML data attributes attr_name = f"data-{key.replace('_', '-')}" # Handle None/NaN values if pd.isna(value): data_attrs_dict[attr_name] = "None" else: data_attrs_dict[attr_name] = str(value) # Create the data attributes string data_attrs = " ".join([f'{key}="{value}"' for key, value in data_attrs_dict.items()]) # Get values with safe defaults for display model = row.get('model', 'Unknown') agent_type = row.get('agent_type', 'unknown') provider = row.get('provider', 'unknown') success_rate = row.get('success_rate', 0.0) total_tests = row.get('total_tests', 0) successful_tests = row.get('successful_tests', 0) failed_tests = row.get('failed_tests', 0) avg_steps = row.get('avg_steps', 0.0) avg_duration_ms = row.get('avg_duration_ms', 0.0) total_tokens = row.get('total_tokens', 0) total_cost_usd = row.get('total_cost_usd', 0.0) co2_emissions_g = row.get('co2_emissions_g', 0.0) gpu_utilization_avg = row.get('gpu_utilization_avg', None) gpu_memory_avg_mib = row.get('gpu_memory_avg_mib', None) gpu_memory_max_mib = row.get('gpu_memory_max_mib', None) gpu_temperature_avg = row.get('gpu_temperature_avg', None) gpu_temperature_max = row.get('gpu_temperature_max', None) gpu_power_avg_w = row.get('gpu_power_avg_w', None) timestamp = row.get('timestamp', '') submitted_by = row.get('submitted_by', 'Unknown') # Check if GPU job has_gpu = pd.notna(gpu_utilization_avg) and gpu_utilization_avg > 0 # Format GPU utilization if has_gpu: gpu_display = get_gpu_utilization_bar(gpu_utilization_avg) else: gpu_display = 'N/A' # Format CO2 if pd.notna(co2_emissions_g) and co2_emissions_g > 0: co2_display = f'{co2_emissions_g:.2f}g' else: co2_display = 'N/A' # Format GPU Memory if pd.notna(gpu_memory_avg_mib) and pd.notna(gpu_memory_max_mib): gpu_mem_display = f'{gpu_memory_avg_mib:.0f}/{gpu_memory_max_mib:.0f}' else: gpu_mem_display = 'N/A' # Format GPU Temperature if pd.notna(gpu_temperature_avg) and pd.notna(gpu_temperature_max): gpu_temp_display = f'{gpu_temperature_avg:.0f}/{gpu_temperature_max:.0f}°C' else: gpu_temp_display = 'N/A' # Format GPU Power if pd.notna(gpu_power_avg_w): gpu_power_display = f'{gpu_power_avg_w:.1f}W' else: gpu_power_display = 'N/A' # Format timestamp from datetime import datetime if pd.notna(timestamp): try: # Handle both string and Timestamp objects if isinstance(timestamp, pd.Timestamp): timestamp_display = timestamp.strftime('%Y-%m-%d %H:%M') else: dt = datetime.fromisoformat(str(timestamp).replace('Z', '+00:00')) timestamp_display = dt.strftime('%Y-%m-%d %H:%M') except Exception as e: timestamp_display = str(timestamp)[:16] if timestamp else 'N/A' else: timestamp_display = 'N/A' # Format Run ID (show first 8 characters) run_id = row.get('run_id', 'N/A') run_id_short = run_id[:8] + '...' if len(run_id) > 8 else run_id html += f""" """ html += """
Rank Run ID Model Type Provider Hardware Success Rate Tests (P/F) Steps Duration Tokens Cost CO2 GPU Util GPU Mem GPU Temp GPU Power Timestamp Submitted By
{get_rank_badge(rank)} {run_id_short} {model} {get_agent_type_badge(agent_type)} {get_provider_badge(provider)} {get_hardware_badge(has_gpu)} {get_success_rate_bar(success_rate)} {total_tests} / {successful_tests} / {failed_tests} {avg_steps:.1f} {format_duration(avg_duration_ms)} {total_tokens:,} {format_cost(total_cost_usd)} {co2_display} {gpu_display} {gpu_mem_display} {gpu_temp_display} {gpu_power_display} {timestamp_display} {submitted_by}
""" return html def generate_empty_state_html() -> str: """ Generate HTML for empty leaderboard state Returns: HTML string for empty state """ return """
📊

No Evaluation Results Yet

Run your first evaluation to see results appear here.

""" def generate_filter_summary_html( total_runs: int, filtered_runs: int, active_filters: dict ) -> str: """ Generate summary of active filters Args: total_runs: Total number of runs filtered_runs: Number of runs after filtering active_filters: Dict of active filter values Returns: HTML string with filter summary """ if filtered_runs == total_runs: return f"""
Showing all {total_runs} evaluation runs
""" filter_chips = [] for key, value in active_filters.items(): if value and value != "All": filter_chips.append(f""" {key}: {value} """) filters_html = "".join(filter_chips) if filter_chips else "" return f"""
Showing {filtered_runs} of {total_runs} runs
{filters_html}
"""