"""
Leaderboard HTML Table Generator
Creates styled HTML tables for the leaderboard view
"""
from datetime import datetime
from html import escape
from typing import Optional

import pandas as pd

from .metric_displays import (
    get_rank_badge,
    get_success_rate_bar,
    get_gpu_utilization_bar,
    get_provider_badge,
    get_agent_type_badge,
    get_hardware_badge,
    format_cost,
    format_duration,
    get_tooltip_icon,
)
def generate_leaderboard_html(
    df: pd.DataFrame,
    sort_by: str = "success_rate",
    ascending: bool = False
) -> str:
    """
    Generate the leaderboard table string for a DataFrame of evaluation runs

    Rows are ordered by ``sort_by`` and ranked 1..N in that order. If the
    frame has no such column (for example an empty frame), the rows are
    rendered in their existing order instead of raising ``KeyError``.

    Args:
        df: Leaderboard DataFrame, one row per evaluation run
        sort_by: Column to sort by
        ascending: Sort order (False = descending)

    Returns:
        A single string: the column labels, one per line, followed by one
        newline-delimited block of formatted cells per row.

    DataFrame columns read (missing columns fall back to the default shown):
        - run_id ('N/A'): shortened to its first 8 characters
        - model ('Unknown')
        - agent_type ('unknown')
        - provider ('unknown')
        - success_rate (0.0)
        - total_tests, successful_tests, failed_tests (0)
        - avg_steps (0.0)
        - avg_duration_ms (0.0)
        - total_tokens (0)
        - total_cost_usd (0.0)
        - co2_emissions_g (0.0): shown as 'N/A' unless positive
        - gpu_utilization_avg (None): a positive value marks the run as a GPU job
        - gpu_memory_avg_mib, gpu_memory_max_mib (None)
        - gpu_temperature_avg, gpu_temperature_max (None)
        - gpu_power_avg_w (None)
        - timestamp (''): ISO-8601 string or datetime/Timestamp
        - submitted_by ('Unknown')

    Free-text values (run_id, model, timestamp, submitted_by) are
    HTML-escaped before being emitted.
    """
    # EAFP: an empty or partially populated frame may lack the sort column;
    # show it unsorted rather than failing the whole view.
    try:
        df_sorted = df.sort_values(by=sort_by, ascending=ascending)
    except KeyError:
        df_sorted = df

    # Collect fragments and join once instead of repeated string +=.
    parts = ["\n" + "\n".join(_LEADERBOARD_COLUMNS) + "\n"]
    parts.extend(
        _render_leaderboard_row(rank, row)
        for rank, (_, row) in enumerate(df_sorted.iterrows(), start=1)
    )
    parts.append("\n")
    return "".join(parts)


# Column labels in display order; _render_leaderboard_row emits exactly one
# cell per label, in the same order.
_LEADERBOARD_COLUMNS = (
    "Rank",
    "Run ID",
    "Model",
    "Type",
    "Provider",
    "Hardware",
    "Success Rate",
    "Tests (P/F)",
    "Steps",
    "Duration",
    "Tokens",
    "Cost",
    "CO2",
    "GPU Util",
    "GPU Mem",
    "GPU Temp",
    "GPU Power",
    "Timestamp",
    "Submitted By",
)


def _shorten_run_id(run_id, max_chars: int = 8) -> str:
    """Return at most ``max_chars`` characters of the run id, or 'N/A' if missing."""
    if pd.isna(run_id):
        return 'N/A'
    # Coerce first: a numeric id has no len() and cannot be sliced.
    text = str(run_id)
    return text[:max_chars] + '...' if len(text) > max_chars else text


def _format_timestamp(timestamp) -> str:
    """Format a timestamp as 'YYYY-MM-DD HH:MM', falling back to its raw text."""
    if pd.isna(timestamp):
        return 'N/A'
    try:
        if isinstance(timestamp, datetime):
            # pd.Timestamp subclasses datetime, so both are handled here.
            moment = timestamp
        else:
            # fromisoformat() on older Pythons rejects a trailing 'Z'.
            moment = datetime.fromisoformat(str(timestamp).replace('Z', '+00:00'))
        return moment.strftime('%Y-%m-%d %H:%M')
    except (ValueError, TypeError):
        # Not parseable: show the leading part of the raw value as-is.
        return str(timestamp)[:16] if timestamp else 'N/A'


def _format_avg_max(avg, peak, unit: str = '') -> str:
    """Format an average/maximum pair as 'avg/max<unit>', or 'N/A' if either is missing."""
    if pd.notna(avg) and pd.notna(peak):
        return f'{avg:.0f}/{peak:.0f}{unit}'
    return 'N/A'


def _render_leaderboard_row(rank: int, row: pd.Series) -> str:
    """Format one leaderboard entry as a newline-delimited block of cells."""
    success_rate = row.get('success_rate', 0.0)
    total_tests = row.get('total_tests', 0)
    successful_tests = row.get('successful_tests', 0)
    failed_tests = row.get('failed_tests', 0)
    avg_steps = row.get('avg_steps', 0.0)
    total_tokens = row.get('total_tokens', 0)

    # A run counts as a GPU job only when a positive utilization was recorded.
    gpu_utilization_avg = row.get('gpu_utilization_avg', None)
    has_gpu = pd.notna(gpu_utilization_avg) and gpu_utilization_avg > 0
    gpu_display = get_gpu_utilization_bar(gpu_utilization_avg) if has_gpu else 'N/A'

    co2_emissions_g = row.get('co2_emissions_g', 0.0)
    if pd.notna(co2_emissions_g) and co2_emissions_g > 0:
        co2_display = f'{co2_emissions_g:.2f}g'
    else:
        co2_display = 'N/A'

    gpu_power_avg_w = row.get('gpu_power_avg_w', None)
    gpu_power_display = f'{gpu_power_avg_w:.1f}W' if pd.notna(gpu_power_avg_w) else 'N/A'

    cells = [
        get_rank_badge(rank),
        # Free-text values are escaped so they cannot inject markup.
        escape(_shorten_run_id(row.get('run_id', 'N/A'))),
        escape(str(row.get('model', 'Unknown'))),
        get_agent_type_badge(row.get('agent_type', 'unknown')),
        get_provider_badge(row.get('provider', 'unknown')),
        get_hardware_badge(has_gpu),
        get_success_rate_bar(success_rate),
        f'{total_tests}/{successful_tests}/{failed_tests}',
        f'{avg_steps:.1f}',
        format_duration(row.get('avg_duration_ms', 0.0)),
        f'{total_tokens:,}',
        format_cost(row.get('total_cost_usd', 0.0)),
        co2_display,
        gpu_display,
        _format_avg_max(
            row.get('gpu_memory_avg_mib', None),
            row.get('gpu_memory_max_mib', None),
        ),
        _format_avg_max(
            row.get('gpu_temperature_avg', None),
            row.get('gpu_temperature_max', None),
            unit='°C',
        ),
        gpu_power_display,
        escape(_format_timestamp(row.get('timestamp', ''))),
        escape(str(row.get('submitted_by', 'Unknown'))),
    ]
    return "\n" + "\n".join(str(cell) for cell in cells) + "\n"
def generate_empty_state_html() -> str:
    """
    Build the placeholder shown when the leaderboard has no rows

    Returns:
        A constant string: an icon line, a heading line and a one-line
        hint, each terminated by a newline.
    """
    # Adjacent literals are concatenated at compile time into one constant.
    return (
        "\n"
        "📊\n"
        "No Evaluation Results Yet\n"
        "Run your first evaluation to see results appear here.\n"
    )
def generate_filter_summary_html(
total_runs: int,
filtered_runs: int,
active_filters: dict
) -> str:
"""
Generate summary of active filters
Args:
total_runs: Total number of runs
filtered_runs: Number of runs after filtering
active_filters: Dict of active filter values
Returns:
HTML string with filter summary
"""
if filtered_runs == total_runs:
return f"""
Showing all {total_runs} evaluation runs
"""
filter_chips = []
for key, value in active_filters.items():
if value and value != "All":
filter_chips.append(f"""
{key}: {value}
""")
filters_html = "".join(filter_chips) if filter_chips else ""
return f"""