Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Pratik Bhavsar
committed on
Commit
·
69c6c68
1
Parent(s):
41c1420
added future perf prediction
Browse files- components/prediction_components.py +592 -0
- requirements.txt +2 -1
- results_v2.csv +23 -23
- tabs/leaderboard_v2.py +139 -4
components/prediction_components.py
ADDED
|
@@ -0,0 +1,592 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Components for AC prediction and visualization"""
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from datetime import datetime, timedelta
|
| 5 |
+
import plotly.graph_objects as go
|
| 6 |
+
|
| 7 |
+
def fallback_curve_fit(func, xdata, ydata, p0=None, maxfev=5000, bounds=None):
    """Rough stand-in for ``scipy.optimize.curve_fit`` built on numpy only.

    The model is selected by ``func.__name__``; only the three growth models
    of this module are recognised. The estimates come from closed-form
    linearisations or simple heuristics, not from an iterative optimiser.

    Args:
        func: Model callable. Its ``__name__`` must be ``'exponential_growth'``,
            ``'logistic_growth'`` or ``'power_law'`` to get an estimate.
        xdata: 1-D array-like of x values.
        ydata: 1-D array-like of y values, same length as ``xdata``.
        p0: Initial guess; returned unchanged when no estimate can be made.
        maxfev: Accepted for signature compatibility with scipy; ignored.
        bounds: Accepted for signature compatibility with scipy; ignored.

    Returns:
        Tuple ``(params, None)``. ``params`` is a list ordered like the
        parameters of ``func`` after ``x``. The second item (the covariance
        matrix in scipy) is always ``None``.
    """
    x = np.asarray(xdata, dtype=float)
    y = np.asarray(ydata, dtype=float)
    model_name = getattr(func, '__name__', '')
    log_offset = 0.01  # keeps log() finite when y contains zeros

    if model_name == 'exponential_growth':
        # Linearise: log(y + offset) = log(a) + b*x, so y = a*exp(b*x) - offset.
        # The additive term of the model is therefore -offset, not +offset.
        slope, intercept = np.polyfit(x, np.log(y + log_offset), 1)
        return [np.exp(intercept), slope, -log_offset], None

    if model_name == 'logistic_growth':
        # Ceiling L: slightly above the best value seen, capped at 1.0.
        L = min(1.0, max(y) * 1.2)

        # Midpoint x0: extrapolate the end-to-end linear trend to L/2 when the
        # data is still below it; otherwise fall back to the median x.
        x0 = np.median(x)
        span = x[-1] - x[0] if len(x) > 1 else 0.0
        if span != 0:  # a zero span would divide by zero below
            slope = (y[-1] - y[0]) / span
            midpoint_value = L / 2
            if slope > 0 and y[-1] < midpoint_value:
                x0 = x[-1] + (midpoint_value - y[-1]) / slope

        k = 0.003  # Conservative default growth rate
        return [L, k, x0], None

    if model_name == 'power_law':
        # Linearise: log(y) = log(a) + b*log(x). Only strictly positive points
        # have a logarithm, and shifting x or y would fit a different model
        # than power_law(x) = a * x**b.
        positive = (x > 0) & (y > 0)
        if np.count_nonzero(positive) < 2:
            return p0, None
        slope, intercept = np.polyfit(np.log(x[positive]), np.log(y[positive]), 1)
        return [np.exp(intercept), slope], None

    return p0, None


try:
    from scipy.optimize import curve_fit
    from scipy import stats
    HAS_SCIPY = True
except ImportError:
    HAS_SCIPY = False
    # Fallback to numpy polynomial fitting
    curve_fit = fallback_curve_fit
|
| 58 |
+
|
| 59 |
+
def exponential_growth(x, a, b, c):
    """Exponential growth function: y = a * exp(b * x) + c

    Args:
        x: Scalar or array-like of x values (lists and tuples are accepted).
        a: Scale factor of the exponential term.
        b: Growth rate inside the exponent.
        c: Constant offset added to the result.

    Returns:
        The model value(s) at ``x``.
    """
    # np.multiply accepts plain sequences, where ``b * x`` would raise
    # TypeError for a float ``b`` and a list ``x``.
    return a * np.exp(np.multiply(b, x)) + c
|
| 62 |
+
|
| 63 |
+
def logistic_growth(x, L, k, x0):
    """Logistic growth function: y = L / (1 + exp(-k*(x-x0)))

    Args:
        x: Scalar or array-like of x values (lists and tuples are accepted).
        L: Upper asymptote of the curve.
        k: Growth rate (steepness).
        x0: Midpoint, where the curve reaches L / 2.

    Returns:
        The model value(s) at ``x``.
    """
    z = k * np.subtract(x, x0)
    # Far below the midpoint exp(-z) overflows to inf. The quotient is then
    # the correct limit 0, so the overflow warning is only noise.
    with np.errstate(over='ignore'):
        return L / (1 + np.exp(-z))
|
| 66 |
+
|
| 67 |
+
def power_law(x, a, b):
    """Power law function: y = a * x^b

    Args:
        x: Scalar or array-like of x values (lists and tuples are accepted).
        a: Scale factor.
        b: Exponent.

    Returns:
        The model value(s) at ``x``, computed in floating point.
    """
    # float_power evaluates in float64, so integer inputs with a negative
    # exponent work instead of raising ValueError as np.power does.
    return a * np.float_power(x, b)
|
| 70 |
+
|
| 71 |
+
# Emoji prefixes used by the UI dropdown labels, mapped to plain domain names.
_DOMAIN_EMOJI_PREFIXES = (
    ('🌐', "All"),
    ('🏦', "Banking"),
    ('🏥', "Healthcare"),
    ('🛡️', "Insurance"),
    ('💰', "Investment"),
    ('📱', "Telecom"),
)


def _normalize_domain_filter(domain_filter):
    """Map an emoji-prefixed dropdown label to its plain domain name."""
    for prefix, domain in _DOMAIN_EMOJI_PREFIXES:
        if domain_filter.startswith(prefix):
            return domain
    return domain_filter


def _parse_release_dates(release_dates):
    """Parse a Series of release dates given as YYYY-MM or YYYY-MM-DD."""
    if pd.api.types.is_datetime64_any_dtype(release_dates):
        return release_dates
    as_text = release_dates.astype(str).str.strip()
    # Decide per row: month-only values get a day component so mixed
    # YYYY-MM / YYYY-MM-DD columns parse consistently.
    month_only = as_text.str.count('-') == 1
    as_text = as_text.where(~month_only, as_text + '-01')
    return pd.to_datetime(as_text)


def _lookup_by_model(df, models, column, default):
    """Return ``column`` of ``df`` looked up by model name, or ``default``."""
    if column not in df.columns:
        return pd.Series(default, index=models.index)
    return models.map(df.set_index('Model')[column].to_dict()).fillna(default)


def _hover_label(border_color):
    """Build the shared dark hover-label style with the given border colour."""
    return dict(
        bgcolor='rgba(26, 26, 46, 0.95)',
        bordercolor=border_color,
        font=dict(size=12, color='#F5F6F7', family='Geist, sans-serif'),
        align='left',
        namelength=-1
    )


def _empty_prediction_figure(message):
    """Build a placeholder figure shown when no rows survive the filters."""
    fig = go.Figure()
    fig.add_annotation(
        text=message,
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        showarrow=False,
        font=dict(size=16, color="#B1B5B9", family="Geist, sans-serif")
    )
    fig.update_layout(
        plot_bgcolor="rgba(1, 9, 26, 0.98)",
        paper_bgcolor="rgba(1, 9, 26, 0.98)",
        height=650
    )
    return fig


def create_ac_prediction_chart(df, domain_filter="All", model_type_filter="All"):
    """Create a prediction chart showing when AC will reach 99%

    The best AC achieved up to each release date is fitted with a straight
    line. Half of that slope is then projected forward from the latest point
    and the first date at which the projection reaches 0.99 is reported.

    Args:
        df: DataFrame with model data. The code reads the columns 'Model',
            'Release Date', 'Avg AC' (or '<domain> AC'), 'Model Type' when a
            model type filter is applied, and optionally 'Vendor' and
            'Avg Total Cost'.
        domain_filter: Domain to filter by (All, Banking, Healthcare, etc.).
            Emoji-prefixed dropdown labels are accepted.
        model_type_filter: Model type to filter by (All, Open Source, Proprietary)

    Returns:
        Tuple ``(fig, date_99, months_from_now)``: the plotly figure, the
        projected date of reaching 99% AC and its distance from now in
        months. The last two are ``None`` when the projection never reaches
        99% within 15 years or when no rows match the filters.
    """
    domain_filter = _normalize_domain_filter(domain_filter)

    # Use the domain-specific AC column when it exists, else the average.
    ac_column = 'Avg AC'
    if domain_filter != "All":
        domain_column = f'{domain_filter} AC'
        if domain_column in df.columns:
            ac_column = domain_column

    # Keep only models with a release date and a positive AC score.
    df_clean = df.dropna(subset=['Release Date', ac_column])
    df_clean = df_clean[df_clean[ac_column] > 0]

    if model_type_filter == "Open Source":
        df_clean = df_clean[df_clean['Model Type'] == 'Open source']
    elif model_type_filter == "Proprietary":
        df_clean = df_clean[df_clean['Model Type'] == 'Proprietary']

    # Process the chosen column under the name 'Avg AC'. The original
    # 'Avg AC' is dropped first to avoid duplicate column names.
    if ac_column != 'Avg AC':
        if 'Avg AC' in df_clean.columns:
            df_clean = df_clean.drop(columns=['Avg AC'])
        df_clean = df_clean.rename(columns={ac_column: 'Avg AC'})

    # Work on a copy so the caller's frame is never modified.
    df_clean = df_clean.copy()

    if df_clean.empty:
        # Nothing to fit or plot; everything below indexes the first row.
        placeholder = _empty_prediction_figure(
            "No models with a release date and AC score match the selected filters"
        )
        return placeholder, None, None

    df_clean['Release Date'] = _parse_release_dates(df_clean['Release Date'])
    df_clean = df_clean.sort_values('Release Date')

    # Running maximum: best performance achieved up to each model's release.
    df_clean['Cumulative_Max_AC'] = df_clean['Avg AC'].expanding().max()

    # One frontier value per date, forced to be monotonically non-decreasing.
    df_best = df_clean.groupby('Release Date')['Cumulative_Max_AC'].max().reset_index()
    df_best.columns = ['Release Date', 'Avg AC']
    df_best['Avg AC'] = df_best['Avg AC'].cummax()

    # Express dates as days since the first release for fitting.
    first_date = df_best['Release Date'].min()
    df_best['Days'] = (df_best['Release Date'] - first_date).dt.days
    x_data = df_best['Days'].values
    y_data = df_best['Avg AC'].values

    # With limited data, use a simple conservative linear projection rather
    # than complex curves that would overfit.
    best_model = 'linear'
    if len(x_data) > 1:
        z = np.polyfit(x_data, y_data, 1)

        # Assume diminishing returns: future improvements at 50% of the
        # fitted slope, anchored at the latest frontier point.
        conservative_slope = z[0] * 0.5
        best_fit = [conservative_slope, y_data[-1] - conservative_slope * x_data[-1]]

        # R² of the unadjusted linear fit. A flat frontier has zero total
        # variance, so report 0.0 instead of dividing by zero.
        residual_ss = np.sum((y_data - np.poly1d(z)(x_data)) ** 2)
        total_ss = np.sum((y_data - y_data.mean()) ** 2)
        best_r2 = 1 - residual_ss / total_ss if total_ss > 0 else 0.0
    else:
        # Single data point - use minimal growth
        best_fit = [0.0001, y_data[0]]
        best_r2 = 0.0

    # Prediction timeline: 15 years in 30-day intervals, capped at 1.0.
    future_days = np.arange(0, 5475, 30)
    future_ac = np.minimum(np.poly1d(best_fit)(future_days), 1.0)

    # Sample the clock once so every "from now" figure is consistent.
    now = datetime.now()

    # Find when we reach 99%
    target_ac = 0.99
    crossing_idx = np.where(future_ac >= target_ac)[0]
    if len(crossing_idx) > 0:
        days_to_99 = future_days[crossing_idx[0]]
        date_99 = first_date + timedelta(days=int(days_to_99))
        months_from_now = (date_99 - now).days / 30.4
    else:
        date_99 = None
        months_from_now = None

    future_dates = [first_date + timedelta(days=int(d)) for d in future_days]

    fig = go.Figure()

    # Confidence band goes first so it renders behind the other traces. Its
    # half-width grows linearly from 0.05 to 0.10 over the timeline.
    future_std = 0.05  # Base uncertainty
    confidence_multiplier = np.linspace(1.0, 2.0, len(future_dates))
    upper_bound = np.minimum(future_ac + future_std * confidence_multiplier, 1.0)
    lower_bound = np.maximum(future_ac - future_std * confidence_multiplier, 0)

    fig.add_trace(go.Scatter(
        x=future_dates + future_dates[::-1],
        y=list(upper_bound) + list(lower_bound[::-1]),
        fill='toself',
        fillcolor='rgba(16, 152, 247, 0.05)',
        line=dict(width=0),
        showlegend=False,
        hoverinfo='skip',
        name='Uncertainty'
    ))

    # Per-model hover details.
    df_with_vendor = df_clean.copy()
    models = df_with_vendor['Model']
    df_with_vendor['Vendor'] = _lookup_by_model(df, models, 'Vendor', 'Unknown')
    df_with_vendor['Model Type'] = _lookup_by_model(df, models, 'Model Type', 'Unknown')
    df_with_vendor['Gap_to_99'] = 0.99 - df_with_vendor['Avg AC']
    df_with_vendor['Gap_to_Best'] = df_with_vendor['Cumulative_Max_AC'] - df_with_vendor['Avg AC']
    df_with_vendor['Cost'] = _lookup_by_model(df, models, 'Avg Total Cost', 0)

    # A model is on the frontier when it matches the running best
    # (small tolerance for float comparison).
    df_with_vendor['Is_Frontier'] = (
        df_with_vendor['Avg AC'] >= df_with_vendor['Cumulative_Max_AC'] - 0.001
    )
    frontier_status = [
        '✅ Advanced SOTA' if on_frontier else '❌ Below existing best'
        for on_frontier in df_with_vendor['Is_Frontier']
    ]

    # Historical data points
    fig.add_trace(go.Scatter(
        x=df_clean['Release Date'],
        y=df_clean['Avg AC'],
        mode='markers',
        name='Individual Models',
        marker=dict(
            size=14,
            color='rgba(227, 84, 84, 0.8)',
            line=dict(width=2, color='rgba(255, 255, 255, 0.6)'),
            symbol='circle'
        ),
        customdata=list(zip(
            df_with_vendor['Vendor'].values,
            df_with_vendor['Model Type'].values,
            df_with_vendor['Gap_to_99'].values,
            df_with_vendor['Gap_to_Best'].values,
            df_with_vendor['Cost'].values,
            frontier_status
        )),
        hovertemplate=(
            '<b style="font-size: 18px; color: #E35454;">%{text}</b><br>'
            '<br>'
            '<b style="color: #1098F7;">Model Information:</b><br>'
            '• <b>Vendor:</b> %{customdata[0]}<br>'
            '• <b>Type:</b> %{customdata[1]}<br>'
            '• <b>Released:</b> %{x|%B %Y}<br>'
            '• <b>Frontier Status:</b> %{customdata[5]}<br>'
            '<br>'
            '<b style="color: #FFD700;">Performance Metrics:</b><br>'
            '• <b>Action Completion:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
            '• <b>Gap to 99%:</b> <span style="color: #FF6B6B;">-%{customdata[2]:.1%}</span><br>'
            '• <b>Behind Best:</b> <span style="color: #FFA500;">-%{customdata[3]:.1%}</span><br>'
            '<br>'
            '<b style="color: #28a745;">Cost Efficiency:</b><br>'
            '• <b>Avg Session Cost:</b> $%{customdata[4]:.4f}<br>'
            '<br>'
            '<i style="color: #B1B5B9; font-size: 11px;">Performance at release time</i>'
            '<extra></extra>'
        ),
        text=df_clean['Model'].values,
        hoverlabel=_hover_label('rgba(227, 84, 84, 0.5)')
    ))

    # Improvement metrics for the frontier hover.
    df_best['Improvement'] = df_best['Avg AC'].diff().fillna(0)
    df_best['Improvement_Pct'] = (df_best['Avg AC'].pct_change() * 100).fillna(0)
    df_best['Gap_to_99'] = 0.99 - df_best['Avg AC']

    # Name the model holding the best score at each frontier date.
    best_model_at_date = []
    for date_val in df_best['Release Date']:
        models_up_to_date = df_clean[df_clean['Release Date'] <= date_val]
        if models_up_to_date.empty:
            best_model_at_date.append('Unknown')
        else:
            best_idx = models_up_to_date['Avg AC'].idxmax()
            best_model_at_date.append(models_up_to_date.loc[best_idx, 'Model'])

    # Best performance line
    fig.add_trace(go.Scatter(
        x=df_best['Release Date'],
        y=df_best['Avg AC'],
        mode='lines+markers',
        name='Best Performance Trend',
        line=dict(color='#E35454', width=4, shape='linear'),
        marker=dict(
            size=16,
            color='#E35454',
            symbol='diamond',
            line=dict(width=2, color='white')
        ),
        customdata=list(zip(
            df_best['Improvement'].values,
            df_best['Improvement_Pct'].values,
            df_best['Gap_to_99'].values,
            best_model_at_date
        )),
        hovertemplate=(
            '<b style="font-size: 16px; color: #E35454;">📈 Best Performance Frontier</b><br>'
            '<br>'
            '<b>Date:</b> %{x|%B %Y}<br>'
            '<b>Leading Model:</b> %{customdata[3]}<br>'
            '<b>Cumulative Best AC:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
            '<br>'
            '<b>Progress Metrics:</b><br>'
            '• Improvement: <span style="color: #28a745;">+%{customdata[0]:.1%}</span><br>'
            '• Growth Rate: <span style="color: #28a745;">+%{customdata[1]:.1f}%</span><br>'
            '• Gap to 99%: <span style="color: #1098F7;">%{customdata[2]:.1%}</span><br>'
            '<br>'
            '<i style="color: #B1B5B9;">This represents the best performance achieved by any model up to this date</i>'
            '<extra></extra>'
        ),
        hoverlabel=_hover_label('rgba(227, 84, 84, 0.5)')
    ))

    # Distance from now for each prediction point.
    months_from_now_list = [(date - now).days / 30.4 for date in future_dates]
    years_from_now_list = [m / 12 for m in months_from_now_list]

    # Prediction line
    fig.add_trace(go.Scatter(
        x=future_dates,
        y=future_ac,
        mode='lines',
        name=f'Prediction ({best_model.capitalize()})',
        line=dict(color='#1098F7', width=4, dash='dash'),
        opacity=0.8,
        customdata=list(zip(
            [max(0, 0.99 - y) for y in future_ac],
            months_from_now_list,
            years_from_now_list,
            [best_r2] * len(future_ac)
        )),
        hovertemplate=(
            '<b style="font-size: 18px; color: #1098F7;">🔮 AI Performance Prediction</b><br>'
            '<br>'
            '<b style="color: #FFD700;">Forecast Details:</b><br>'
            '• <b>Date:</b> %{x|%B %Y}<br>'
            '• <b>Predicted AC:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
            '• <b>Gap to 99%:</b> <span style="color: #FF6B6B;">-%{customdata[0]:.1%}</span><br>'
            '<br>'
            '<b style="color: #28a745;">Timeline:</b><br>'
            '• <b>Months from now:</b> %{customdata[1]:.0f} months<br>'
            '• <b>Years from now:</b> %{customdata[2]:.1f} years<br>'
            '<br>'
            '<b style="color: #1098F7;">Model Confidence:</b><br>'
            f'• <b>Algorithm:</b> {best_model.capitalize()}<br>'
            '• <b>R² Score:</b> %{customdata[3]:.3f}<br>'
            '<br>'
            '<i style="color: #B1B5B9; font-size: 11px;">Based on historical performance trends</i>'
            '<extra></extra>'
        ),
        hoverlabel=_hover_label('rgba(16, 152, 247, 0.5)')
    ))

    # 99% threshold line
    fig.add_hline(
        y=0.99,
        line_dash="dash",
        line_color="rgba(40, 167, 69, 0.4)",
        line_width=2,
        annotation=dict(
            text="<b>Enterprise-Grade Threshold (99%)</b>",
            font=dict(size=13, color='#28a745', family='Geist, sans-serif'),
            bgcolor='rgba(40, 167, 69, 0.15)',
            bordercolor='#28a745',
            borderwidth=1,
            borderpad=4
        ),
        annotation_position="right"
    )

    # Marker for the 99% crossing point
    if date_99 is not None:
        days_until = (date_99 - now).days

        fig.add_trace(go.Scatter(
            x=[date_99],
            y=[0.99],
            mode='markers+text',
            name='🎯 99% Achievement',
            marker=dict(
                size=28,
                color='#28a745',
                symbol='star',
                line=dict(width=3, color='white')
            ),
            text=[f'<b>{date_99.strftime("%b %Y")}</b>'],
            textposition='top center',
            textfont=dict(size=16, color='#28a745', family='Geist, sans-serif'),
            hovertemplate=(
                '<b style="font-size: 18px; color: #28a745;">🎯 ENTERPRISE-READY MILESTONE</b><br>'
                '<br>'
                f'<b>Achievement Date:</b> <span style="font-size: 16px;">{date_99.strftime("%B %Y")}</span><br>'
                f'<b>Time from today:</b> <span style="font-size: 16px; color: #FFD700;">{months_from_now:.0f} months</span><br>'
                f'<b>Days remaining:</b> {days_until} days<br>'
                f'<b>Years:</b> {months_from_now/12:.1f} years<br>'
                '<br>'
                '<b style="color: #1098F7;">Strategic Implications:</b><br>'
                f'• Early adopters gain {months_from_now:.0f}-month advantage<br>'
                '• Infrastructure investment critical now<br>'
                '• 99% reliability enables production deployment<br>'
                '<extra></extra>'
            ),
            hoverlabel=_hover_label('rgba(40, 167, 69, 0.5)')
        ))

    # Title, including the active filters
    filter_parts = []
    if domain_filter != "All":
        filter_parts.append(f"{domain_filter} Domain")
    if model_type_filter != "All":
        filter_parts.append(f"{model_type_filter} Models")
    filter_text = f" - {', '.join(filter_parts)}" if filter_parts else ""

    title_text = "<span style='font-size: 24px;'>🚀 When Will AI Agents Reach Enterprise-Grade Reliability?</span>"
    if filter_text:
        title_text += f"<br><span style='font-size: 14px; color: #1098F7;'>{filter_text}</span>"

    # Explicit None checks: a crossing exactly "now" (0.0 months) is still a
    # valid prediction and must not fall through to the no-prediction text.
    if date_99 is not None and months_from_now is not None:
        if months_from_now > 0:
            title_text += f"<br><span style='font-size: 16px; color: #B1B5B9;'>Prediction: <b style='color: #FFD700;'>{date_99.strftime('%B %Y')}</b> (~{months_from_now:.0f} months)</span>"
        else:
            title_text += "<br><span style='font-size: 16px; color: #28a745;'>Already achieved!</span>"
    else:
        title_text += "<br><span style='font-size: 16px; color: #B1B5B9;'>Tracking performance improvements...</span>"

    fig.update_layout(
        title=dict(
            text=title_text,
            font=dict(size=20, family="Geist, sans-serif", color="#F5F6F7"),
            x=0.5,
            xanchor='center'
        ),
        xaxis=dict(
            title=dict(
                text="<b>Release Date</b>",
                font=dict(size=16, family="Geist, sans-serif", color="#F5F6F7"),
                standoff=20
            ),
            tickfont=dict(size=12, family="Geist Mono, monospace", color="#B1B5B9"),
            gridcolor="rgba(245, 246, 247, 0.08)",
            zerolinecolor="rgba(245, 246, 247, 0.15)",
            showgrid=True,
            gridwidth=1,
            tickangle=0,
            tickformat='%b %Y',
            showspikes=True,
            spikecolor="rgba(245, 246, 247, 0.3)",
            spikethickness=1,
            spikemode='across',
            spikedash='dot',
            # Show from 60 days before the first release to at most 800 days
            # ahead of now (or the end of the projection, if earlier).
            range=[first_date - timedelta(days=60),
                   min(now + timedelta(days=800), future_dates[-1])]
        ),
        yaxis=dict(
            title=dict(
                text="<b>Action Completion (AC)</b>",
                font=dict(size=16, family="Geist, sans-serif", color="#F5F6F7"),
                standoff=20
            ),
            tickfont=dict(size=12, family="Geist Mono, monospace", color="#B1B5B9"),
            gridcolor="rgba(245, 246, 247, 0.08)",
            zerolinecolor="rgba(245, 246, 247, 0.15)",
            showgrid=True,
            gridwidth=1,
            tickformat='.0%',
            dtick=0.1,
            showspikes=True,
            spikecolor="rgba(245, 246, 247, 0.3)",
            spikethickness=1,
            spikemode='across',
            spikedash='dot',
            range=[-0.05, 1.08]
        ),
        plot_bgcolor="rgba(1, 9, 26, 0.98)",
        paper_bgcolor="rgba(1, 9, 26, 0.98)",
        height=650,
        margin=dict(l=90, r=100, t=120, b=90),
        hovermode='closest',
        hoverdistance=30,
        spikedistance=50,
        legend=dict(
            bgcolor="rgba(1, 9, 26, 0.9)",
            bordercolor="rgba(245, 246, 247, 0.3)",
            borderwidth=2,
            font=dict(size=12, family="Geist, sans-serif", color="#F5F6F7"),
            x=0.02,
            y=0.98,
            xanchor='left',
            yanchor='top',
            orientation='v',
            itemsizing='constant',
            itemwidth=40,
            tracegroupgap=5,
            title=dict(
                text='<b>Legend</b>',
                font=dict(size=13, color='#F5F6F7')
            )
        ),
        showlegend=True,
        annotations=[
            dict(
                text="<b>Model:</b> Conservative Linear | <b>Note:</b> Limited data - projection assumes diminishing returns",
                xref="paper", yref="paper",
                x=0.01, y=-0.12,
                showarrow=False,
                font=dict(size=11, color="#B1B5B9", family="Geist, sans-serif"),
                bgcolor="rgba(1, 9, 26, 0.9)",
                bordercolor="rgba(245, 246, 247, 0.3)",
                borderwidth=1,
                borderpad=4
            )
        ]
    )

    return fig, date_99, months_from_now
|
requirements.txt
CHANGED
|
@@ -2,4 +2,5 @@ gradio==5.35.0
|
|
| 2 |
pandas
|
| 3 |
matplotlib
|
| 4 |
plotly==5.24.1
|
| 5 |
-
pydantic==2.10.6
|
|
|
|
|
|
| 2 |
pandas
|
| 3 |
matplotlib
|
| 4 |
plotly==5.24.1
|
| 5 |
+
pydantic==2.10.6
|
| 6 |
+
scipy
|
results_v2.csv
CHANGED
|
@@ -1,23 +1,23 @@
|
|
| 1 |
-
Model,Vendor,Avg AC,Avg TSQ,Avg Total Cost,Avg Session Duration,Avg Turns,Banking AC,Healthcare AC,Insurance AC,Investment AC,Telecom AC,Banking TSQ,Healthcare TSQ,Insurance TSQ,Investment TSQ,Telecom TSQ,Avg Input Cost ($),Avg Output Cost ($),Banking Cost,Healthcare Cost,Insurance Cost,Investment Cost,Telecom Cost,Banking Duration,Healthcare Duration,Insurance Duration,Investment Duration,Telecom Duration,Banking Turns,Healthcare Turns,Insurance Turns,Investment Turns,Telecom Turns,Model Type,$/M input token,$/M output token,Output Type
|
| 2 |
-
gpt-4.1-2025-04-14,OpenAI,0.62,0.8,0.0684,24.32,3.1,0.6,0.62,0.66,0.64,0.58,0.81,0.83,0.68,0.88,0.82,0.0577,0.0107,0.052,0.0711,0.0629,0.0777,0.0783,18.52,24.4,25.24,27.88,25.58,2.61,3.15,2.92,3.3,3.48,Proprietary,2.0,8.0,Normal
|
| 3 |
-
mistral-medium-2508,Mistral,0.61,0.77,0.0199,37.45,2.98,0.57,0.6,0.7,0.57,0.59,0.74,0.75,0.73,0.87,0.76,0.0164,0.0035,0.0185,0.0196,0.0195,0.0223,0.0195,31.91,34.94,36.96,43.91,39.53,3.06,2.85,2.92,3.12,2.97,Proprietary,0.4,2.0,Normal
|
| 4 |
-
gpt-4.1-mini-2025-04-14,OpenAI,0.56,0.79,0.0141,26.0,3.43,0.56,0.6,0.46,0.5,0.64,0.8,0.85,0.63,0.84,0.83,0.0123,0.0018,0.0115,0.0143,0.0131,0.0164,0.0156,21.28,26.82,23.32,30.5,28.07,2.99,3.32,3.28,3.76,3.79,Proprietary,0.4,1.6,Normal
|
| 5 |
-
claude-sonnet-4-20250514,Anthropic,0.55,0.92,0.1537,66.6,2.89,0.58,0.62,0.53,0.49,0.53,0.9,0.95,0.93,0.92,0.9,0.1212,0.0325,0.1359,0.1542,0.1442,0.1669,0.1675,55.36,57.93,56.87,86.44,76.38,2.54,2.84,2.79,3.06,3.22,Proprietary,3.0,15.0,Normal
|
| 6 |
-
kimi-k2-instruct,Moonshot AI,0.53,0.9,0.0386,163.62,2.84,0.58,0.49,0.58,0.47,0.53,0.89,0.91,0.88,0.93,0.91,0.0346,0.004,0.0344,0.0401,0.0367,0.0419,0.0397,165.45,155.42,164.9,161.14,171.17,2.58,2.81,2.79,3.1,2.93,Open source,1.0,3.0,Normal
|
| 7 |
-
qwen3-235b-a22b-instruct-2507,Alibaba,0.53,0.85,0.0074,238.02,2.4,0.44,0.49,0.74,0.41,0.58,0.88,0.84,0.85,0.91,0.79,0.0067,0.0007,0.0059,0.0079,0.0077,0.0079,0.008,206.53,267.85,233.49,252.94,229.29,1.99,2.41,2.53,2.47,2.62,Open source,0.2,0.6,Reasoning
|
| 8 |
-
qwen2.5-72b-instruct,Alibaba,0.51,0.8,0.0361,34.68,2.65,0.48,0.61,0.52,0.42,0.52,0.78,0.84,0.77,0.82,0.79,0.0338,0.0023,0.0292,0.0348,0.0338,0.0417,0.0415,27.34,29.09,30.46,41.32,45.2,2.3,2.46,2.47,3.0,3.0,Open source,0.9,0.9,Normal
|
| 9 |
-
gemini-2.5-flash-lite,Google,0.47,0.84,0.0039,9.8,3.11,0.45,0.6,0.54,0.35,0.41,0.82,0.9,0.78,0.86,0.84,0.0034,0.0005,0.003,0.0036,0.0039,0.0049,0.0043,7.96,8.82,9.52,11.31,11.39,2.61,3.0,2.87,3.69,3.36,Proprietary,0.1,0.4,Reasoning
|
| 10 |
-
glm-4.5-air,Zai,0.44,0.94,0.0194,69.17,4.96,0.49,0.4,0.53,0.46,0.33,0.94,0.91,0.94,0.96,0.94,0.014,0.0054,0.0152,0.021,0.0191,0.0216,0.0199,57.56,69.95,86.47,76.51,55.35,3.99,5.2,5.02,5.41,5.2,Open source,0.2,1.1,Reasoning
|
| 11 |
-
gemini-2.5-pro,Google,0.43,0.86,0.1447,125.85,3.57,0.45,0.4,0.54,0.31,0.44,0.88,0.87,0.87,0.85,0.83,0.0442,0.1005,0.1253,0.1475,0.1386,0.1464,0.1656,108.83,126.91,121.99,129.62,141.92,3.1,3.57,3.49,3.7,3.97,Proprietary,1.25,10.0,Reasoning
|
| 12 |
-
grok-4-0709,xAI,0.42,0.88,0.2387,225.94,3.19,0.29,0.4,0.48,0.5,0.42,0.92,0.84,0.91,0.9,0.82,0.1008,0.1379,0.2295,0.2679,0.2073,0.2257,0.2632,226.62,326.28,157.1,189.18,230.5,2.98,3.42,3.07,3.01,3.46,Proprietary,3.0,15.0,Reasoning
|
| 13 |
-
deepseek-v3,Deepseek,0.4,0.8,0.0141,59.97,3.71,0.38,0.32,0.48,0.36,0.47,0.8,0.74,0.76,0.87,0.81,0.0119,0.0022,0.0123,0.0158,0.0139,0.0151,0.0138,44.46,68.38,48.54,70.21,68.27,3.27,4.2,3.48,3.79,3.83,Open source,0.27,1.1,Normal
|
| 14 |
-
gemini-2.5-flash,Google,0.38,0.94,0.0271,39.84,3.9,0.48,0.38,0.44,0.22,0.36,0.94,0.94,0.94,0.94,0.95,0.0123,0.0148,0.0248,0.0283,0.0273,0.0308,0.0241,33.03,36.81,38.24,42.78,48.34,3.53,3.96,3.78,4.28,3.98,Proprietary,0.3,2.5,Reasoning
|
| 15 |
-
gpt-4.1-nano-2025-04-14,OpenAI,0.38,0.63,0.0038,12.36,3.56,0.4,0.4,0.41,0.29,0.38,0.64,0.54,0.54,0.77,0.65,0.0034,0.0004,0.0029,0.0038,0.004,0.0042,0.0041,14.16,10.9,12.23,12.68,11.83,2.88,3.24,3.78,4.01,3.91,Proprietary,0.1,0.4,Normal
|
| 16 |
-
qwen3-235b-a22b-thinking-2507,Alibaba,0.34,0.85,0.0584,302.24,3.12,0.42,0.3,0.42,0.23,0.34,0.84,0.82,0.86,0.91,0.84,0.0275,0.0309,0.0535,0.0679,0.0573,0.0562,0.0575,309.41,310.33,316.64,266.96,307.84,2.86,3.43,3.2,3.03,3.1,Open source,0.65,3.0,Reasoning
|
| 17 |
-
magistral-medium-2506,Mistral,0.32,0.59,0.1182,32.96,4.4,0.3,0.35,0.38,0.26,0.3,0.59,0.67,0.56,0.63,0.51,0.108,0.0102,0.1067,0.0994,0.1077,0.1476,0.1294,24.98,35.81,33.33,39.18,31.49,4.21,3.46,3.92,5.36,5.07,Proprietary,2.0,5.0,Reasoning
|
| 18 |
-
nova-pro-v1,Amazon,0.29,0.65,0.0359,27.96,3.04,0.33,0.29,0.39,0.17,0.29,0.6,0.57,0.64,0.83,0.6,0.0316,0.0043,0.0304,0.0353,0.0359,0.04,0.038,23.45,27.94,27.9,32.09,28.43,2.72,2.88,2.99,3.36,3.26,Proprietary,0.8,3.2,Normal
|
| 19 |
-
mistral-small-2506,Mistral,0.26,0.71,0.0053,35.69,4.37,0.37,0.28,0.22,0.2,0.21,0.73,0.71,0.65,0.76,0.69,0.0049,0.0004,0.0041,0.0057,0.0054,0.0058,0.0056,30.64,36.02,30.83,41.96,39.02,3.3,4.47,4.52,4.87,4.67,Open source,0.1,0.3,Normal
|
| 20 |
-
llama-3.3-70b-instruct,Meta,0.2,0.62,0.0599,19.92,3.83,0.11,0.29,0.29,0.14,0.16,0.62,0.64,0.62,0.64,0.57,0.0588,0.0011,0.055,0.0544,0.0545,0.0664,0.069,17.62,19.4,18.55,23.91,20.14,3.61,3.34,3.42,4.29,4.5,Open source,0.88,0.88,Normal
|
| 21 |
-
caller,Arcee,0.16,0.65,0.0297,25.66,4.2,0.23,0.14,0.22,0.09,0.12,0.69,0.6,0.68,0.61,0.67,0.0282,0.0015,0.0262,0.0303,0.0305,0.0331,0.0286,22.83,25.54,26.42,29.66,23.85,3.76,4.19,4.18,4.75,4.14,Open source,0.55,0.85,Normal
|
| 22 |
-
nova-lite-v1,Amazon,0.16,0.55,0.0031,20.26,3.73,0.12,0.18,0.19,0.15,0.18,0.48,0.49,0.58,0.72,0.49,0.0027,0.0004,0.0026,0.0033,0.0031,0.0034,0.0029,17.53,20.61,19.67,24.28,19.2,3.31,4.13,3.57,4.04,3.62,Proprietary,0.06,0.24,Normal
|
| 23 |
-
magistral-small-2506,Mistral,0.16,0.53,0.0301,17.42,5.68,0.23,0.18,0.13,0.16,0.12,0.57,0.46,0.42,0.62,0.6,0.0275,0.0026,0.0245,0.0335,0.0302,0.034,0.0281,14.53,21.36,14.65,19.67,16.87,4.74,6.28,6.14,6.06,5.19,Open source,0.5,1.5,Reasoning
|
|
|
|
| 1 |
+
Model,Vendor,Avg AC,Avg TSQ,Avg Total Cost,Avg Session Duration,Avg Turns,Banking AC,Healthcare AC,Insurance AC,Investment AC,Telecom AC,Banking TSQ,Healthcare TSQ,Insurance TSQ,Investment TSQ,Telecom TSQ,Avg Input Cost ($),Avg Output Cost ($),Banking Cost,Healthcare Cost,Insurance Cost,Investment Cost,Telecom Cost,Banking Duration,Healthcare Duration,Insurance Duration,Investment Duration,Telecom Duration,Banking Turns,Healthcare Turns,Insurance Turns,Investment Turns,Telecom Turns,Model Type,$/M input token,$/M output token,Output Type,Release Date
|
| 2 |
+
gpt-4.1-2025-04-14,OpenAI,0.62,0.8,0.0684,24.32,3.1,0.6,0.62,0.66,0.64,0.58,0.81,0.83,0.68,0.88,0.82,0.0577,0.0107,0.052,0.0711,0.0629,0.0777,0.0783,18.52,24.4,25.24,27.88,25.58,2.61,3.15,2.92,3.3,3.48,Proprietary,2.0,8.0,Normal,2025-04
|
| 3 |
+
mistral-medium-2508,Mistral,0.61,0.77,0.0199,37.45,2.98,0.57,0.6,0.7,0.57,0.59,0.74,0.75,0.73,0.87,0.76,0.0164,0.0035,0.0185,0.0196,0.0195,0.0223,0.0195,31.91,34.94,36.96,43.91,39.53,3.06,2.85,2.92,3.12,2.97,Proprietary,0.4,2.0,Normal,2025-08
|
| 4 |
+
gpt-4.1-mini-2025-04-14,OpenAI,0.56,0.79,0.0141,26.0,3.43,0.56,0.6,0.46,0.5,0.64,0.8,0.85,0.63,0.84,0.83,0.0123,0.0018,0.0115,0.0143,0.0131,0.0164,0.0156,21.28,26.82,23.32,30.5,28.07,2.99,3.32,3.28,3.76,3.79,Proprietary,0.4,1.6,Normal,2025-04
|
| 5 |
+
claude-sonnet-4-20250514,Anthropic,0.55,0.92,0.1537,66.6,2.89,0.58,0.62,0.53,0.49,0.53,0.9,0.95,0.93,0.92,0.9,0.1212,0.0325,0.1359,0.1542,0.1442,0.1669,0.1675,55.36,57.93,56.87,86.44,76.38,2.54,2.84,2.79,3.06,3.22,Proprietary,3.0,15.0,Normal,2025-05
|
| 6 |
+
kimi-k2-instruct,Moonshot AI,0.53,0.9,0.0386,163.62,2.84,0.58,0.49,0.58,0.47,0.53,0.89,0.91,0.88,0.93,0.91,0.0346,0.004,0.0344,0.0401,0.0367,0.0419,0.0397,165.45,155.42,164.9,161.14,171.17,2.58,2.81,2.79,3.1,2.93,Open source,1.0,3.0,Normal,2025-07
|
| 7 |
+
qwen3-235b-a22b-instruct-2507,Alibaba,0.53,0.85,0.0074,238.02,2.4,0.44,0.49,0.74,0.41,0.58,0.88,0.84,0.85,0.91,0.79,0.0067,0.0007,0.0059,0.0079,0.0077,0.0079,0.008,206.53,267.85,233.49,252.94,229.29,1.99,2.41,2.53,2.47,2.62,Open source,0.2,0.6,Reasoning,2025-07
|
| 8 |
+
qwen2.5-72b-instruct,Alibaba,0.51,0.8,0.0361,34.68,2.65,0.48,0.61,0.52,0.42,0.52,0.78,0.84,0.77,0.82,0.79,0.0338,0.0023,0.0292,0.0348,0.0338,0.0417,0.0415,27.34,29.09,30.46,41.32,45.2,2.3,2.46,2.47,3.0,3.0,Open source,0.9,0.9,Normal,2024-09
|
| 9 |
+
gemini-2.5-flash-lite,Google,0.47,0.84,0.0039,9.8,3.11,0.45,0.6,0.54,0.35,0.41,0.82,0.9,0.78,0.86,0.84,0.0034,0.0005,0.003,0.0036,0.0039,0.0049,0.0043,7.96,8.82,9.52,11.31,11.39,2.61,3.0,2.87,3.69,3.36,Proprietary,0.1,0.4,Reasoning,2025-07
|
| 10 |
+
glm-4.5-air,Zai,0.44,0.94,0.0194,69.17,4.96,0.49,0.4,0.53,0.46,0.33,0.94,0.91,0.94,0.96,0.94,0.014,0.0054,0.0152,0.021,0.0191,0.0216,0.0199,57.56,69.95,86.47,76.51,55.35,3.99,5.2,5.02,5.41,5.2,Open source,0.2,1.1,Reasoning,2025-07
|
| 11 |
+
gemini-2.5-pro,Google,0.43,0.86,0.1447,125.85,3.57,0.45,0.4,0.54,0.31,0.44,0.88,0.87,0.87,0.85,0.83,0.0442,0.1005,0.1253,0.1475,0.1386,0.1464,0.1656,108.83,126.91,121.99,129.62,141.92,3.1,3.57,3.49,3.7,3.97,Proprietary,1.25,10.0,Reasoning,2025-03
|
| 12 |
+
grok-4-0709,xAI,0.42,0.88,0.2387,225.94,3.19,0.29,0.4,0.48,0.5,0.42,0.92,0.84,0.91,0.9,0.82,0.1008,0.1379,0.2295,0.2679,0.2073,0.2257,0.2632,226.62,326.28,157.1,189.18,230.5,2.98,3.42,3.07,3.01,3.46,Proprietary,3.0,15.0,Reasoning,2025-07
|
| 13 |
+
deepseek-v3,Deepseek,0.4,0.8,0.0141,59.97,3.71,0.38,0.32,0.48,0.36,0.47,0.8,0.74,0.76,0.87,0.81,0.0119,0.0022,0.0123,0.0158,0.0139,0.0151,0.0138,44.46,68.38,48.54,70.21,68.27,3.27,4.2,3.48,3.79,3.83,Open source,0.27,1.1,Normal,2024-12
|
| 14 |
+
gemini-2.5-flash,Google,0.38,0.94,0.0271,39.84,3.9,0.48,0.38,0.44,0.22,0.36,0.94,0.94,0.94,0.94,0.95,0.0123,0.0148,0.0248,0.0283,0.0273,0.0308,0.0241,33.03,36.81,38.24,42.78,48.34,3.53,3.96,3.78,4.28,3.98,Proprietary,0.3,2.5,Reasoning,2025-06
|
| 15 |
+
gpt-4.1-nano-2025-04-14,OpenAI,0.38,0.63,0.0038,12.36,3.56,0.4,0.4,0.41,0.29,0.38,0.64,0.54,0.54,0.77,0.65,0.0034,0.0004,0.0029,0.0038,0.004,0.0042,0.0041,14.16,10.9,12.23,12.68,11.83,2.88,3.24,3.78,4.01,3.91,Proprietary,0.1,0.4,Normal,2025-04
|
| 16 |
+
qwen3-235b-a22b-thinking-2507,Alibaba,0.34,0.85,0.0584,302.24,3.12,0.42,0.3,0.42,0.23,0.34,0.84,0.82,0.86,0.91,0.84,0.0275,0.0309,0.0535,0.0679,0.0573,0.0562,0.0575,309.41,310.33,316.64,266.96,307.84,2.86,3.43,3.2,3.03,3.1,Open source,0.65,3.0,Reasoning,2025-07
|
| 17 |
+
magistral-medium-2506,Mistral,0.32,0.59,0.1182,32.96,4.4,0.3,0.35,0.38,0.26,0.3,0.59,0.67,0.56,0.63,0.51,0.108,0.0102,0.1067,0.0994,0.1077,0.1476,0.1294,24.98,35.81,33.33,39.18,31.49,4.21,3.46,3.92,5.36,5.07,Proprietary,2.0,5.0,Reasoning,2025-06
|
| 18 |
+
nova-pro-v1,Amazon,0.29,0.65,0.0359,27.96,3.04,0.33,0.29,0.39,0.17,0.29,0.6,0.57,0.64,0.83,0.6,0.0316,0.0043,0.0304,0.0353,0.0359,0.04,0.038,23.45,27.94,27.9,32.09,28.43,2.72,2.88,2.99,3.36,3.26,Proprietary,0.8,3.2,Normal,2024-12
|
| 19 |
+
mistral-small-2506,Mistral,0.26,0.71,0.0053,35.69,4.37,0.37,0.28,0.22,0.2,0.21,0.73,0.71,0.65,0.76,0.69,0.0049,0.0004,0.0041,0.0057,0.0054,0.0058,0.0056,30.64,36.02,30.83,41.96,39.02,3.3,4.47,4.52,4.87,4.67,Open source,0.1,0.3,Normal,2025-06
|
| 20 |
+
llama-3.3-70b-instruct,Meta,0.2,0.62,0.0599,19.92,3.83,0.11,0.29,0.29,0.14,0.16,0.62,0.64,0.62,0.64,0.57,0.0588,0.0011,0.055,0.0544,0.0545,0.0664,0.069,17.62,19.4,18.55,23.91,20.14,3.61,3.34,3.42,4.29,4.5,Open source,0.88,0.88,Normal,2024-12
|
| 21 |
+
caller,Arcee,0.16,0.65,0.0297,25.66,4.2,0.23,0.14,0.22,0.09,0.12,0.69,0.6,0.68,0.61,0.67,0.0282,0.0015,0.0262,0.0303,0.0305,0.0331,0.0286,22.83,25.54,26.42,29.66,23.85,3.76,4.19,4.18,4.75,4.14,Open source,0.55,0.85,Normal,2025-01
|
| 22 |
+
nova-lite-v1,Amazon,0.16,0.55,0.0031,20.26,3.73,0.12,0.18,0.19,0.15,0.18,0.48,0.49,0.58,0.72,0.49,0.0027,0.0004,0.0026,0.0033,0.0031,0.0034,0.0029,17.53,20.61,19.67,24.28,19.2,3.31,4.13,3.57,4.04,3.62,Proprietary,0.06,0.24,Normal,2024-12
|
| 23 |
+
magistral-small-2506,Mistral,0.16,0.53,0.0301,17.42,5.68,0.23,0.18,0.13,0.16,0.12,0.57,0.46,0.42,0.62,0.6,0.0275,0.0026,0.0245,0.0335,0.0302,0.034,0.0281,14.53,21.36,14.65,19.67,16.87,4.74,6.28,6.14,6.06,5.19,Open source,0.5,1.5,Reasoning,2025-06
|
tabs/leaderboard_v2.py
CHANGED
|
@@ -4,10 +4,11 @@ import plotly.graph_objects as go
|
|
| 4 |
|
| 5 |
# Import components and styles from modular files
|
| 6 |
from components.leaderboard_components import (
|
| 7 |
-
get_chart_colors, get_rank_badge, get_type_badge,
|
| 8 |
get_output_type_badge, get_score_bar, get_metric_tooltip,
|
| 9 |
get_responsive_styles, get_faq_section, SORT_COLUMN_MAP
|
| 10 |
)
|
|
|
|
| 11 |
from styles.leaderboard_styles import get_leaderboard_css
|
| 12 |
|
| 13 |
|
|
@@ -1639,7 +1640,112 @@ def create_leaderboard_v2_tab():
|
|
| 1639 |
</div>
|
| 1640 |
</div>
|
| 1641 |
""")
|
| 1642 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1643 |
# Radar Chart Section
|
| 1644 |
gr.HTML("""
|
| 1645 |
<div class="dark-container" style="margin-bottom: 24px;">
|
|
@@ -1847,22 +1953,51 @@ def create_leaderboard_v2_tab():
|
|
| 1847 |
|
| 1848 |
return create_domain_radar_chart(load_leaderboard_data(), sort_by, valid_selected)
|
| 1849 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1850 |
# Update table when filters change
|
| 1851 |
filter_inputs = [domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order]
|
| 1852 |
-
|
| 1853 |
for input_component in filter_inputs:
|
| 1854 |
input_component.change(
|
| 1855 |
fn=update_table,
|
| 1856 |
inputs=filter_inputs,
|
| 1857 |
outputs=[leaderboard_title, leaderboard_table]
|
| 1858 |
)
|
| 1859 |
-
|
| 1860 |
# Also update radar chart when filters change
|
| 1861 |
input_component.change(
|
| 1862 |
fn=update_radar_chart,
|
| 1863 |
inputs=filter_inputs + [model_selector],
|
| 1864 |
outputs=[model_selector, radar_chart]
|
| 1865 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1866 |
|
| 1867 |
# Update radar chart when model selection changes
|
| 1868 |
model_selector.change(
|
|
|
|
| 4 |
|
| 5 |
# Import components and styles from modular files
|
| 6 |
from components.leaderboard_components import (
|
| 7 |
+
get_chart_colors, get_rank_badge, get_type_badge,
|
| 8 |
get_output_type_badge, get_score_bar, get_metric_tooltip,
|
| 9 |
get_responsive_styles, get_faq_section, SORT_COLUMN_MAP
|
| 10 |
)
|
| 11 |
+
from components.prediction_components import create_ac_prediction_chart
|
| 12 |
from styles.leaderboard_styles import get_leaderboard_css
|
| 13 |
|
| 14 |
|
|
|
|
| 1640 |
</div>
|
| 1641 |
</div>
|
| 1642 |
""")
|
| 1643 |
+
|
| 1644 |
+
# AI Agent Reliability Prediction Section
|
| 1645 |
+
gr.HTML("""
|
| 1646 |
+
<div class="dark-container" style="margin-bottom: 24px;">
|
| 1647 |
+
<div class="section-header">
|
| 1648 |
+
<span class="section-icon" style="color: var(--accent-secondary);">📈</span>
|
| 1649 |
+
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
|
| 1650 |
+
Enterprise Readiness Prediction
|
| 1651 |
+
</h3>
|
| 1652 |
+
</div>
|
| 1653 |
+
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
|
| 1654 |
+
When will AI agents reach 99% reliability for enterprise deployment?
|
| 1655 |
+
</p>
|
| 1656 |
+
""")
|
| 1657 |
+
|
| 1658 |
+
# Add prediction chart - make it reactive
|
| 1659 |
+
gr.HTML('<div class="chart-container">')
|
| 1660 |
+
|
| 1661 |
+
# Create initial prediction with default filters
|
| 1662 |
+
initial_prediction_chart, initial_date_99, initial_months_to_99 = create_ac_prediction_chart(
|
| 1663 |
+
load_leaderboard_data(), domain_filter="All", model_type_filter="All"
|
| 1664 |
+
)
|
| 1665 |
+
|
| 1666 |
+
prediction_plot = gr.Plot(
|
| 1667 |
+
label="",
|
| 1668 |
+
value=initial_prediction_chart,
|
| 1669 |
+
elem_classes=["prediction-chart", "plot-container"]
|
| 1670 |
+
)
|
| 1671 |
+
gr.HTML('</div>')
|
| 1672 |
+
|
| 1673 |
+
# Add dynamic insights section
|
| 1674 |
+
def generate_insight_html(date_99, months_to_99, domain_filter="All", model_type_filter="All"):
    """Generate insight HTML based on prediction results and filters.

    Args:
        date_99: Projected date at which the 99% AC threshold is reached
            (anything exposing ``strftime``), or a falsy value when no
            projection is available.
        months_to_99: Number of months from now until ``date_99``; ``None``
            when no projection is available. Zero or negative values are
            rendered as "already achieved".
        domain_filter: Domain selector value. Either the plain default
            ``"All"`` or an emoji-prefixed label (e.g. one starting with 🏦).
        model_type_filter: Model-type selector value; ``"All"`` means no
            model-type filtering.

    Returns:
        An HTML fragment (str) describing the prediction for the active
        filters.
    """
    # Local import so this helper does not depend on the file's import block.
    from html import escape

    # Emoji prefix of the selector label -> plain domain name.
    domain_prefixes = (
        ('🌐', "All Domains"),
        ('🏦', "Banking"),
        ('🏥', "Healthcare"),
        ('🛡️', "Insurance"),
        ('💰', "Investment"),
        ('📱', "Telecom"),
    )
    if domain_filter == "All":
        # The plain default must count as "no domain filter"; otherwise the
        # unfiltered view is labelled "for All".
        domain_clean = "All Domains"
    else:
        domain_clean = next(
            (label for prefix, label in domain_prefixes if domain_filter.startswith(prefix)),
            domain_filter,
        )

    has_domain = domain_clean != "All Domains"
    has_model_type = model_type_filter != "All"

    # Filter values end up inside HTML, so escape them before interpolation.
    filter_context = ""
    if has_domain or has_model_type:
        filter_context = " for "
        if has_domain:
            filter_context += f"<strong>{escape(domain_clean)}</strong>"
        if has_model_type:
            if has_domain:
                filter_context += f" ({escape(model_type_filter)} models)"
            else:
                filter_context += f"<strong>{escape(model_type_filter)} models</strong>"

    # `months_to_99` is compared against None explicitly: a value of 0 is a
    # valid projection ("threshold reached now"), not a missing one.
    if date_99 and months_to_99 is not None:
        if months_to_99 > 0:
            return f"""
            <div style="margin-top: 20px; padding: 16px; background: linear-gradient(145deg, rgba(16, 152, 247, 0.1) 0%, rgba(227, 84, 84, 0.05) 100%); border: 1px solid var(--border-subtle); border-radius: 12px;">
                <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 12px;">
                    <span style="font-size: 1.5rem;">🎯</span>
                    <span style="font-weight: 700; color: var(--text-primary); font-size: 1.1rem;">Key Prediction Insights{filter_context}</span>
                </div>
                <ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.8;">
                    <li><strong style="color: var(--accent-primary);">Enterprise threshold (99% AC)</strong> predicted by <strong style="color: var(--accent-secondary);">{date_99.strftime('%B %Y')}</strong></li>
                    <li>Approximately <strong style="color: var(--accent-primary);">{months_to_99:.0f} months</strong> from current date</li>
                    <li>Performance trends based on <strong>historical data{filter_context}</strong></li>
                    <li>Early adopters who invest now gain <strong>{months_to_99:.0f} months</strong> of competitive advantage</li>
                </ul>
            </div>
            """
        else:
            return f"""
            <div style="margin-top: 20px; padding: 16px; background: linear-gradient(145deg, rgba(40, 167, 69, 0.1) 0%, rgba(227, 84, 84, 0.05) 100%); border: 1px solid var(--border-subtle); border-radius: 12px;">
                <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 12px;">
                    <span style="font-size: 1.5rem;">✅</span>
                    <span style="font-weight: 700; color: var(--text-primary); font-size: 1.1rem;">Enterprise Ready{filter_context}!</span>
                </div>
                <ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.8;">
                    <li>Models{filter_context} have <strong style="color: #28a745;">already achieved</strong> near-enterprise reliability</li>
                    <li>Focus should shift to <strong>implementation and scaling</strong></li>
                    <li>Investment in guardrails and observability is <strong>critical now</strong></li>
                </ul>
            </div>
            """

    # No usable projection: explain why, depending on whether a filter is active.
    if has_domain or has_model_type:
        message = (
            'Insufficient data' + filter_context
            + ' to make reliable predictions. More models need to be evaluated in this category.'
        )
    else:
        message = (
            'Based on current data trends, we are tracking the exponential improvement in AI agent capabilities. '
            'As more models are released, our predictions will become more accurate.'
        )
    return f"""
    <div style="margin-top: 20px; padding: 16px; background: rgba(245, 246, 247, 0.03); border: 1px solid var(--border-subtle); border-radius: 12px;">
        <p style="color: var(--text-secondary); margin: 0;">
            {message}
        </p>
    </div>
    """
|
| 1741 |
+
|
| 1742 |
+
# Create the insights HTML component
|
| 1743 |
+
prediction_insights = gr.HTML(
|
| 1744 |
+
generate_insight_html(initial_date_99, initial_months_to_99, "All", "All")
|
| 1745 |
+
)
|
| 1746 |
+
|
| 1747 |
+
gr.HTML("</div>")
|
| 1748 |
+
|
| 1749 |
# Radar Chart Section
|
| 1750 |
gr.HTML("""
|
| 1751 |
<div class="dark-container" style="margin-bottom: 24px;">
|
|
|
|
| 1953 |
|
| 1954 |
return create_domain_radar_chart(load_leaderboard_data(), sort_by, valid_selected)
|
| 1955 |
|
| 1956 |
+
# Function to update prediction chart and insights
|
| 1957 |
+
def update_prediction_chart_and_insights(domain_filter, model_type_filter):
    """Rebuild the prediction figure and its insight panel for the given filters.

    Reloads the leaderboard data, asks ``create_ac_prediction_chart`` for a
    fresh ``(figure, date_99, months_to_99)`` triple restricted to the
    selected domain and model type, and renders the matching insight HTML.

    Returns:
        A ``(figure, insights_html)`` pair, in the order expected by the
        ``[prediction_plot, prediction_insights]`` outputs.
    """
    figure, predicted_date, months_remaining = create_ac_prediction_chart(
        load_leaderboard_data(),
        domain_filter=domain_filter,
        model_type_filter=model_type_filter,
    )

    # The insight text is derived from the same prediction that drives the plot.
    return figure, generate_insight_html(
        predicted_date, months_remaining, domain_filter, model_type_filter
    )
|
| 1970 |
+
|
| 1971 |
# Update table when filters change
|
| 1972 |
filter_inputs = [domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order]
|
| 1973 |
+
|
| 1974 |
for input_component in filter_inputs:
|
| 1975 |
input_component.change(
|
| 1976 |
fn=update_table,
|
| 1977 |
inputs=filter_inputs,
|
| 1978 |
outputs=[leaderboard_title, leaderboard_table]
|
| 1979 |
)
|
| 1980 |
+
|
| 1981 |
# Also update radar chart when filters change
|
| 1982 |
input_component.change(
|
| 1983 |
fn=update_radar_chart,
|
| 1984 |
inputs=filter_inputs + [model_selector],
|
| 1985 |
outputs=[model_selector, radar_chart]
|
| 1986 |
)
|
| 1987 |
+
|
| 1988 |
+
# Update prediction chart when domain or model type filters change
|
| 1989 |
+
# Only react to domain_filter and model_type_filter, not other filters
|
| 1990 |
+
domain_filter.change(
|
| 1991 |
+
fn=update_prediction_chart_and_insights,
|
| 1992 |
+
inputs=[domain_filter, model_type_filter],
|
| 1993 |
+
outputs=[prediction_plot, prediction_insights]
|
| 1994 |
+
)
|
| 1995 |
+
|
| 1996 |
+
model_type_filter.change(
|
| 1997 |
+
fn=update_prediction_chart_and_insights,
|
| 1998 |
+
inputs=[domain_filter, model_type_filter],
|
| 1999 |
+
outputs=[prediction_plot, prediction_insights]
|
| 2000 |
+
)
|
| 2001 |
|
| 2002 |
# Update radar chart when model selection changes
|
| 2003 |
model_selector.change(
|