Pratik Bhavsar committed on
Commit
69c6c68
·
1 Parent(s): 41c1420

added future perf prediction

Browse files
components/prediction_components.py ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Components for AC prediction and visualization"""
2
+ import pandas as pd
3
+ import numpy as np
4
+ from datetime import datetime, timedelta
5
+ import plotly.graph_objects as go
6
+
7
# Offset added before taking logarithms so zero-valued scores do not hit log(0).
_LOG_OFFSET = 0.01


def fallback_curve_fit(func, xdata, ydata, p0=None, maxfev=5000, bounds=None):
    """Rough NumPy-only stand-in for ``scipy.optimize.curve_fit``.

    The model is recognised by the *name* of ``func`` and estimated with a
    closed-form heuristic instead of an iterative least-squares solve.

    Args:
        func: Model callable; its ``__name__`` selects the heuristic
            (``exponential_growth``, ``logistic_growth`` or ``power_law``).
        xdata: Independent variable values (any array-like).
        ydata: Dependent variable values (any array-like).
        p0: Initial guess; returned unchanged for unrecognised models.
        maxfev: Accepted for signature compatibility; unused.
        bounds: Accepted for signature compatibility; unused.

    Returns:
        ``(params, None)`` - the parameter list and a ``None`` placeholder
        where SciPy would return the covariance matrix.
    """
    x = np.asarray(xdata, dtype=float)
    y = np.asarray(ydata, dtype=float)
    # Callables such as functools.partial have no __name__; treat as unknown.
    model_name = getattr(func, '__name__', '')

    if model_name == 'exponential_growth':
        # Linearize: log(y + offset) = log(a) + b*x
        #   =>  y = a*exp(b*x) - offset, hence c = -offset.
        slope, intercept = np.polyfit(x, np.log(y + _LOG_OFFSET), 1)
        return [np.exp(intercept), slope, -_LOG_OFFSET], None

    if model_name == 'logistic_growth':
        # L (plateau): slightly above the best value seen, capped at 1.0.
        L = min(1.0, max(y) * 1.2)

        # x0 (midpoint): project the overall linear trend forward to L/2;
        # fall back to the median x when the trend is flat, undefined, or
        # the data is already past the midpoint.
        x0 = np.median(x)
        if len(x) > 1 and x[-1] != x[0]:
            slope = (y[-1] - y[0]) / (x[-1] - x[0])
            midpoint_value = L / 2
            if slope > 0 and y[-1] < midpoint_value:
                x0 = x[-1] + (midpoint_value - y[-1]) / slope

        # k (growth rate): conservative fixed default.
        k = 0.003
        return [L, k, x0], None

    if model_name == 'power_law':
        # Linearize: log(y + offset) = log(a) + b*log(x + 1).
        # NOTE(review): the +1 / +offset shifts mean this only approximates
        # y = a * x**b; the two-parameter model cannot absorb them.
        exponent, log_a = np.polyfit(np.log(x + 1), np.log(y + _LOG_OFFSET), 1)
        return [np.exp(log_a), exponent], None

    return p0, None


try:
    from scipy.optimize import curve_fit
    from scipy import stats
    HAS_SCIPY = True
except ImportError:
    HAS_SCIPY = False
    stats = None  # keep the name defined so later references fail clearly
    # Fallback to the NumPy polynomial-based approximation
    curve_fit = fallback_curve_fit
58
+
59
def exponential_growth(x, a, b, c):
    """Exponential growth function: y = a * exp(b * x) + c

    Args:
        x: Scalar or array-like of input values (plain lists are accepted).
        a: Scale factor.
        b: Growth rate.
        c: Vertical offset.

    Returns:
        The model value(s), as a NumPy scalar or array matching ``x``.
    """
    # Coerce so that Python lists are multiplied element-wise instead of
    # being repeated (``0 * [..]``) or raising (``0.5 * [..]``).
    x = np.asarray(x, dtype=float)
    return a * np.exp(b * x) + c
62
+
63
def logistic_growth(x, L, k, x0):
    """Logistic growth function: y = L / (1 + exp(-k*(x-x0)))

    Args:
        x: Scalar or array-like of input values (plain lists are accepted).
        L: Upper asymptote (plateau).
        k: Growth rate (steepness).
        x0: Midpoint, where the curve reaches ``L / 2``.

    Returns:
        The model value(s), as a NumPy scalar or array matching ``x``.
    """
    z = k * (np.asarray(x, dtype=float) - x0)
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))) == exp(-logaddexp(0, -z)).
    # This form never evaluates exp() of a large positive number, so it does
    # not overflow for inputs far below the midpoint.
    return L * np.exp(-np.logaddexp(0.0, -z))
66
+
67
def power_law(x, a, b):
    """Power law function: y = a * x^b

    Args:
        x: Scalar or array-like base values (plain lists are accepted).
        a: Scale factor.
        b: Exponent (may be negative or fractional).

    Returns:
        The model value(s), as a NumPy scalar or array matching ``x``.
    """
    # Cast the base to float: NumPy refuses to raise integers to negative
    # integer powers, which would otherwise break fits with b < 0.
    return a * np.power(np.asarray(x, dtype=float), b)
70
+
71
# Emoji prefixes used by the UI dropdown, mapped to the plain domain name.
_DOMAIN_EMOJI_PREFIXES = (
    ('🌐', "All"),
    ('🏦', "Banking"),
    ('🏥', "Healthcare"),
    ('🛡️', "Insurance"),
    ('💰', "Investment"),
    ('📱', "Telecom"),
)

_TARGET_AC = 0.99              # "enterprise-grade" threshold drawn on the chart
_FORECAST_HORIZON_DAYS = 5475  # 15 years
_FORECAST_STEP_DAYS = 30
_SLOPE_DAMPING = 0.5           # assume 50% slower future improvements


def _normalize_domain_filter(domain_filter):
    """Strip the emoji prefix from a dropdown label, returning the bare domain."""
    for prefix, name in _DOMAIN_EMOJI_PREFIXES:
        if domain_filter.startswith(prefix):
            return name
    return domain_filter


def _parse_release_dates(dates):
    """Convert a 'Release Date' column to datetimes, padding YYYY-MM with a day."""
    if pd.api.types.is_datetime64_any_dtype(dates):
        return dates
    text = dates.astype(str).str.strip()
    # Decide per row, so a mix of YYYY-MM and YYYY-MM-DD values parses cleanly.
    year_month_only = text.str.count('-') == 1
    text = text.where(~year_month_only, text + '-01')
    return pd.to_datetime(text)


def _fit_conservative_trend(x_data, y_data):
    """Return ``([slope, intercept], r2)`` for the damped linear projection."""
    if len(x_data) <= 1:
        # Single data point - use minimal growth
        return [0.0001, y_data[0]], 0.0

    z = np.polyfit(x_data, y_data, 1)
    conservative_slope = z[0] * _SLOPE_DAMPING
    # Anchor the damped line on the most recent frontier point.
    best_fit = [conservative_slope, y_data[-1] - conservative_slope * x_data[-1]]

    # R² of the undamped least-squares line against the historical frontier.
    ss_res = np.sum((y_data - np.poly1d(z)(x_data)) ** 2)
    ss_tot = np.sum((y_data - y_data.mean()) ** 2)
    # A perfectly flat frontier has zero variance; R² is undefined there,
    # so report 0.0 rather than dividing by zero.
    r2 = 1 - ss_res / ss_tot if ss_tot > 0 else 0.0
    return best_fit, r2


def _hover_label(border_color):
    """Build the shared dark hover-label style with the given border colour."""
    return dict(
        bgcolor='rgba(26, 26, 46, 0.95)',
        bordercolor=border_color,
        font=dict(size=12, color='#F5F6F7', family='Geist, sans-serif'),
        align='left',
        namelength=-1
    )


def _empty_prediction_figure():
    """Build a placeholder figure for the case where no rows survive filtering."""
    fig = go.Figure()
    fig.update_layout(
        annotations=[
            dict(
                text="No models with a release date and AC score match the selected filters",
                xref="paper", yref="paper",
                x=0.5, y=0.5,
                showarrow=False,
                font=dict(size=16, color="#B1B5B9", family="Geist, sans-serif")
            )
        ],
        plot_bgcolor="rgba(1, 9, 26, 0.98)",
        paper_bgcolor="rgba(1, 9, 26, 0.98)",
        height=650
    )
    return fig


def create_ac_prediction_chart(df, domain_filter="All", model_type_filter="All"):
    """Create a prediction chart showing when AC will reach 99%

    The best score achieved up to each release date (the "frontier") is
    fitted with a straight line whose slope is then halved, and that damped
    line is projected 15 years forward to find the first 99% crossing.

    Args:
        df: DataFrame with model data. Must contain 'Model', 'Release Date'
            and 'Avg AC'; 'Model Type' is required when a model type filter
            is applied. 'Vendor', 'Avg Total Cost' and '<Domain> AC' columns
            are used when present.
        domain_filter: Domain to filter by (All, Banking, Healthcare, etc.),
            optionally carrying the dropdown's emoji prefix.
        model_type_filter: Model type to filter by (All, Open Source, Proprietary)

    Returns:
        Tuple ``(fig, date_99, months_from_now)``: the Plotly figure, the
        projected date of the 99% crossing (``None`` if it is not reached
        within the horizon or there is no data) and the number of months
        between now and that date (``None`` likewise; negative if the date
        lies in the past).
    """
    now = datetime.now()  # sample once so every "from now" figure agrees

    # Clean up domain filter (remove emoji prefix if present)
    domain_filter = _normalize_domain_filter(domain_filter)

    # Determine which AC column to use based on domain filter,
    # falling back to the average when no domain-specific column exists
    ac_column = 'Avg AC'
    if domain_filter != "All" and f'{domain_filter} AC' in df.columns:
        ac_column = f'{domain_filter} AC'

    # Filter data to only include models with valid release dates and AC scores
    df_clean = df.dropna(subset=['Release Date', ac_column])
    df_clean = df_clean[df_clean[ac_column] > 0]

    # Apply model type filter
    if model_type_filter == "Open Source":
        df_clean = df_clean[df_clean['Model Type'] == 'Open source']
    elif model_type_filter == "Proprietary":
        df_clean = df_clean[df_clean['Model Type'] == 'Proprietary']

    # Nothing left to plot or extrapolate from
    if df_clean.empty:
        return _empty_prediction_figure(), None, None

    # Rename the AC column to 'Avg AC' for consistent processing (only if different)
    if ac_column != 'Avg AC':
        # Drop the original 'Avg AC' column if it exists to avoid duplicates
        if 'Avg AC' in df_clean.columns:
            df_clean = df_clean.drop(columns=['Avg AC'])
        df_clean = df_clean.rename(columns={ac_column: 'Avg AC'})

    # Make a copy to avoid any issues with the original data
    df_clean = df_clean.copy()

    # Handle both YYYY-MM and YYYY-MM-DD formats
    df_clean['Release Date'] = _parse_release_dates(df_clean['Release Date'])

    # Sort by release date
    df_clean = df_clean.sort_values('Release Date')

    # Create a running maximum (best performance achieved up to each date)
    df_clean['Cumulative_Max_AC'] = df_clean['Avg AC'].expanding().max()

    # Group by date and take the cumulative maximum for each date
    df_best = df_clean.groupby('Release Date')['Cumulative_Max_AC'].max().reset_index()
    df_best.columns = ['Release Date', 'Avg AC']

    # Apply cumulative maximum again to ensure monotonic increase
    df_best['Avg AC'] = df_best['Avg AC'].cummax()

    # Convert dates to days since first release for curve fitting
    first_date = df_best['Release Date'].min()
    df_best['Days'] = (df_best['Release Date'] - first_date).dt.days

    # Prepare data for fitting
    x_data = df_best['Days'].values
    y_data = df_best['Avg AC'].values

    # With limited data, use a simple conservative linear projection
    # rather than complex curves that will overfit
    best_model = 'linear'
    best_fit, best_r2 = _fit_conservative_trend(x_data, y_data)

    # Generate prediction timeline
    future_days = np.arange(0, _FORECAST_HORIZON_DAYS, _FORECAST_STEP_DAYS)

    # Simple conservative linear projection, capped at 1.0
    future_ac = np.minimum(np.poly1d(best_fit)(future_days), 1.0)

    # Find when we reach 99%
    crossing_idx = np.where(future_ac >= _TARGET_AC)[0]

    if len(crossing_idx) > 0:
        days_to_99 = future_days[crossing_idx[0]]
        date_99 = first_date + timedelta(days=int(days_to_99))
        months_from_now = (date_99 - now).days / 30.4
    else:
        date_99 = None
        months_from_now = None

    # Convert future days to dates
    future_dates = [first_date + timedelta(days=int(d)) for d in future_days]

    # Create the plot
    fig = go.Figure()

    # Add confidence bands FIRST (so they appear behind other traces).
    # The band is a fixed ±5% that widens linearly to ±10% at the horizon.
    future_std = 0.05  # Base uncertainty
    confidence_multiplier = np.linspace(1.0, 2.0, len(future_dates))

    upper_bound = np.minimum(future_ac + future_std * confidence_multiplier, 1.0)
    lower_bound = np.maximum(future_ac - future_std * confidence_multiplier, 0)

    # Add confidence band as filled area
    fig.add_trace(go.Scatter(
        x=future_dates + future_dates[::-1],
        y=list(upper_bound) + list(lower_bound[::-1]),
        fill='toself',
        fillcolor='rgba(16, 152, 247, 0.05)',
        line=dict(width=0),
        showlegend=False,
        hoverinfo='skip',
        name='Uncertainty'
    ))

    # Add vendor info and additional metrics
    df_with_vendor = df_clean.copy()
    if 'Vendor' in df.columns:
        vendor_map = df.set_index('Model')['Vendor'].to_dict()
        df_with_vendor['Vendor'] = df_with_vendor['Model'].map(vendor_map).fillna('Unknown')
    else:
        df_with_vendor['Vendor'] = 'Unknown'

    if 'Model Type' in df.columns:
        type_map = df.set_index('Model')['Model Type'].to_dict()
        df_with_vendor['Model Type'] = df_with_vendor['Model'].map(type_map).fillna('Unknown')
    else:
        df_with_vendor['Model Type'] = 'Unknown'

    # Calculate additional metrics for each model
    df_with_vendor['Gap_to_99'] = _TARGET_AC - df_with_vendor['Avg AC']
    df_with_vendor['Gap_to_Best'] = df_with_vendor['Cumulative_Max_AC'] - df_with_vendor['Avg AC']

    # Get cost info if available
    if 'Avg Total Cost' in df.columns:
        cost_map = df.set_index('Model')['Avg Total Cost'].to_dict()
        df_with_vendor['Cost'] = df_with_vendor['Model'].map(cost_map).fillna(0)
    else:
        df_with_vendor['Cost'] = 0

    # Check if each model improved the frontier
    # (small tolerance for float comparison)
    df_with_vendor['Is_Frontier'] = (
        df_with_vendor['Avg AC'] >= df_with_vendor['Cumulative_Max_AC'] - 0.001
    )

    # Create frontier status text
    frontier_status = [
        '✅ Advanced SOTA' if is_frontier else '❌ Below existing best'
        for is_frontier in df_with_vendor['Is_Frontier']
    ]

    vendor_info = df_with_vendor['Vendor'].values
    model_type = df_with_vendor['Model Type'].values
    gap_to_99 = df_with_vendor['Gap_to_99'].values
    gap_to_best = df_with_vendor['Gap_to_Best'].values
    cost_info = df_with_vendor['Cost'].values

    # Add historical data points with comprehensive hover
    fig.add_trace(go.Scatter(
        x=df_clean['Release Date'],
        y=df_clean['Avg AC'],
        mode='markers',
        name='Individual Models',
        marker=dict(
            size=14,
            color='rgba(227, 84, 84, 0.8)',
            line=dict(width=2, color='rgba(255, 255, 255, 0.6)'),
            symbol='circle'
        ),
        customdata=list(zip(vendor_info, model_type, gap_to_99, gap_to_best, cost_info, frontier_status)),
        hovertemplate=(
            '<b style="font-size: 18px; color: #E35454;">%{text}</b><br>'
            '<br>'
            '<b style="color: #1098F7;">Model Information:</b><br>'
            '• <b>Vendor:</b> %{customdata[0]}<br>'
            '• <b>Type:</b> %{customdata[1]}<br>'
            '• <b>Released:</b> %{x|%B %Y}<br>'
            '• <b>Frontier Status:</b> %{customdata[5]}<br>'
            '<br>'
            '<b style="color: #FFD700;">Performance Metrics:</b><br>'
            '• <b>Action Completion:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
            '• <b>Gap to 99%:</b> <span style="color: #FF6B6B;">-%{customdata[2]:.1%}</span><br>'
            '• <b>Behind Best:</b> <span style="color: #FFA500;">-%{customdata[3]:.1%}</span><br>'
            '<br>'
            '<b style="color: #28a745;">Cost Efficiency:</b><br>'
            '• <b>Avg Session Cost:</b> $%{customdata[4]:.4f}<br>'
            '<br>'
            '<i style="color: #B1B5B9; font-size: 11px;">Performance at release time</i>'
            '<extra></extra>'
        ),
        text=df_clean['Model'].values,
        hoverlabel=_hover_label('rgba(227, 84, 84, 0.5)')
    ))

    # Calculate improvement metrics for hover
    df_best['Improvement'] = df_best['Avg AC'].diff().fillna(0)
    df_best['Improvement_Pct'] = (df_best['Avg AC'].pct_change() * 100).fillna(0)
    df_best['Gap_to_99'] = _TARGET_AC - df_best['Avg AC']

    # Find which model is responsible for the best performance at each date
    best_model_at_date = []
    for date_val in df_best['Release Date']:
        # Find all models up to and including this date
        models_up_to_date = df_clean[df_clean['Release Date'] <= date_val]
        if models_up_to_date.empty:
            best_model_at_date.append('Unknown')
            continue
        # Find the model with the highest AC score up to this date
        best_idx = models_up_to_date['Avg AC'].idxmax()
        best_model_at_date.append(models_up_to_date.loc[best_idx, 'Model'])

    # Add best performance line with enhanced metrics
    fig.add_trace(go.Scatter(
        x=df_best['Release Date'],
        y=df_best['Avg AC'],
        mode='lines+markers',
        name='Best Performance Trend',
        line=dict(color='#E35454', width=4, shape='linear'),
        marker=dict(
            size=16,
            color='#E35454',
            symbol='diamond',
            line=dict(width=2, color='white')
        ),
        customdata=list(zip(
            df_best['Improvement'].values,
            df_best['Improvement_Pct'].values,
            df_best['Gap_to_99'].values,
            best_model_at_date
        )),
        hovertemplate=(
            '<b style="font-size: 16px; color: #E35454;">📈 Best Performance Frontier</b><br>'
            '<br>'
            '<b>Date:</b> %{x|%B %Y}<br>'
            '<b>Leading Model:</b> %{customdata[3]}<br>'
            '<b>Cumulative Best AC:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
            '<br>'
            '<b>Progress Metrics:</b><br>'
            '• Improvement: <span style="color: #28a745;">+%{customdata[0]:.1%}</span><br>'
            '• Growth Rate: <span style="color: #28a745;">+%{customdata[1]:.1f}%</span><br>'
            '• Gap to 99%: <span style="color: #1098F7;">%{customdata[2]:.1%}</span><br>'
            '<br>'
            '<i style="color: #B1B5B9;">This represents the best performance achieved by any model up to this date</i>'
            '<extra></extra>'
        ),
        hoverlabel=_hover_label('rgba(227, 84, 84, 0.5)')
    ))

    # Calculate months from now for each prediction point
    months_from_now_list = [(date - now).days / 30.4 for date in future_dates]
    years_from_now_list = [m / 12 for m in months_from_now_list]

    # Add prediction line with comprehensive metrics
    fig.add_trace(go.Scatter(
        x=future_dates,
        y=future_ac,
        mode='lines',
        name=f'Prediction ({best_model.capitalize()})',
        line=dict(color='#1098F7', width=4, dash='dash'),
        opacity=0.8,
        customdata=list(zip(
            [max(0, _TARGET_AC - y) for y in future_ac],
            months_from_now_list,
            years_from_now_list,
            [best_r2] * len(future_ac)
        )),
        hovertemplate=(
            '<b style="font-size: 18px; color: #1098F7;">🔮 AI Performance Prediction</b><br>'
            '<br>'
            '<b style="color: #FFD700;">Forecast Details:</b><br>'
            '• <b>Date:</b> %{x|%B %Y}<br>'
            '• <b>Predicted AC:</b> <span style="font-size: 20px; color: #FFD700;">%{y:.1%}</span><br>'
            '• <b>Gap to 99%:</b> <span style="color: #FF6B6B;">-%{customdata[0]:.1%}</span><br>'
            '<br>'
            '<b style="color: #28a745;">Timeline:</b><br>'
            '• <b>Months from now:</b> %{customdata[1]:.0f} months<br>'
            '• <b>Years from now:</b> %{customdata[2]:.1f} years<br>'
            '<br>'
            '<b style="color: #1098F7;">Model Confidence:</b><br>'
            f'• <b>Algorithm:</b> {best_model.capitalize()}<br>'
            '• <b>R² Score:</b> %{customdata[3]:.3f}<br>'
            '<br>'
            '<i style="color: #B1B5B9; font-size: 11px;">Based on historical performance trends</i>'
            '<extra></extra>'
        ),
        hoverlabel=_hover_label('rgba(16, 152, 247, 0.5)')
    ))

    # Add 99% threshold line with enhanced styling
    fig.add_hline(
        y=_TARGET_AC,
        line_dash="dash",
        line_color="rgba(40, 167, 69, 0.4)",
        line_width=2,
        annotation=dict(
            text="<b>Enterprise-Grade Threshold (99%)</b>",
            font=dict(size=13, color='#28a745', family='Geist, sans-serif'),
            bgcolor='rgba(40, 167, 69, 0.15)',
            bordercolor='#28a745',
            borderwidth=1,
            borderpad=4
        ),
        annotation_position="right"
    )

    # Add marker for 99% crossing point with enhanced visibility
    if date_99 is not None:
        # Calculate days until achievement
        days_until = (date_99 - now).days

        fig.add_trace(go.Scatter(
            x=[date_99],
            y=[_TARGET_AC],
            mode='markers+text',
            name='🎯 99% Achievement',
            marker=dict(
                size=28,
                color='#28a745',
                symbol='star',
                line=dict(width=3, color='white')
            ),
            text=[f'<b>{date_99.strftime("%b %Y")}</b>'],
            textposition='top center',
            textfont=dict(size=16, color='#28a745', family='Geist, sans-serif'),
            hovertemplate=(
                '<b style="font-size: 18px; color: #28a745;">🎯 ENTERPRISE-READY MILESTONE</b><br>'
                '<br>'
                f'<b>Achievement Date:</b> <span style="font-size: 16px;">{date_99.strftime("%B %Y")}</span><br>'
                f'<b>Time from today:</b> <span style="font-size: 16px; color: #FFD700;">{months_from_now:.0f} months</span><br>'
                f'<b>Days remaining:</b> {days_until} days<br>'
                f'<b>Years:</b> {months_from_now/12:.1f} years<br>'
                '<br>'
                '<b style="color: #1098F7;">Strategic Implications:</b><br>'
                f'• Early adopters gain {months_from_now:.0f}-month advantage<br>'
                '• Infrastructure investment critical now<br>'
                '• 99% reliability enables production deployment<br>'
                '<extra></extra>'
            ),
            hoverlabel=_hover_label('rgba(40, 167, 69, 0.5)')
        ))

    # Update layout with improved title showing active filters
    filter_text = ""
    if domain_filter != "All":
        filter_text += f" - {domain_filter} Domain"
    if model_type_filter != "All":
        if filter_text:
            filter_text += f", {model_type_filter} Models"
        else:
            filter_text += f" - {model_type_filter} Models"

    title_text = "<span style='font-size: 24px;'>🚀 When Will AI Agents Reach Enterprise-Grade Reliability?</span>"
    if filter_text:
        title_text += f"<br><span style='font-size: 14px; color: #1098F7;'>{filter_text}</span>"

    # Test against None explicitly: a crossing exactly "now" gives 0.0 months,
    # which is falsy but still a valid prediction.
    if date_99 is not None:
        if months_from_now > 0:
            title_text += f"<br><span style='font-size: 16px; color: #B1B5B9;'>Prediction: <b style='color: #FFD700;'>{date_99.strftime('%B %Y')}</b> (~{months_from_now:.0f} months)</span>"
        else:
            title_text += "<br><span style='font-size: 16px; color: #28a745;'>Already achieved!</span>"
    else:
        title_text += "<br><span style='font-size: 16px; color: #B1B5B9;'>Tracking performance improvements...</span>"

    fig.update_layout(
        title=dict(
            text=title_text,
            font=dict(size=20, family="Geist, sans-serif", color="#F5F6F7"),
            x=0.5,
            xanchor='center'
        ),
        xaxis=dict(
            title=dict(
                text="<b>Release Date</b>",
                font=dict(size=16, family="Geist, sans-serif", color="#F5F6F7"),
                standoff=20
            ),
            tickfont=dict(size=12, family="Geist Mono, monospace", color="#B1B5B9"),
            gridcolor="rgba(245, 246, 247, 0.08)",
            zerolinecolor="rgba(245, 246, 247, 0.15)",
            showgrid=True,
            gridwidth=1,
            tickangle=0,
            tickformat='%b %Y',
            showspikes=True,
            spikecolor="rgba(245, 246, 247, 0.3)",
            spikethickness=1,
            spikemode='across',
            spikedash='dot',
            # Show from just before the first release to ~800 days ahead
            # (or the end of the forecast, whichever comes first)
            range=[df_clean['Release Date'].min() - timedelta(days=60),
                   min(now + timedelta(days=800), future_dates[-1])]
        ),
        yaxis=dict(
            title=dict(
                text="<b>Action Completion (AC)</b>",
                font=dict(size=16, family="Geist, sans-serif", color="#F5F6F7"),
                standoff=20
            ),
            tickfont=dict(size=12, family="Geist Mono, monospace", color="#B1B5B9"),
            gridcolor="rgba(245, 246, 247, 0.08)",
            zerolinecolor="rgba(245, 246, 247, 0.15)",
            showgrid=True,
            gridwidth=1,
            tickformat='.0%',
            dtick=0.1,
            showspikes=True,
            spikecolor="rgba(245, 246, 247, 0.3)",
            spikethickness=1,
            spikemode='across',
            spikedash='dot',
            range=[-0.05, 1.08]
        ),
        plot_bgcolor="rgba(1, 9, 26, 0.98)",
        paper_bgcolor="rgba(1, 9, 26, 0.98)",
        height=650,
        margin=dict(l=90, r=100, t=120, b=90),
        hovermode='closest',
        hoverdistance=30,
        spikedistance=50,
        legend=dict(
            bgcolor="rgba(1, 9, 26, 0.9)",
            bordercolor="rgba(245, 246, 247, 0.3)",
            borderwidth=2,
            font=dict(size=12, family="Geist, sans-serif", color="#F5F6F7"),
            x=0.02,
            y=0.98,
            xanchor='left',
            yanchor='top',
            orientation='v',
            itemsizing='constant',
            itemwidth=40,
            tracegroupgap=5,
            title=dict(
                text='<b>Legend</b>',
                font=dict(size=13, color='#F5F6F7')
            )
        ),
        showlegend=True,
        annotations=[
            dict(
                text="<b>Model:</b> Conservative Linear | <b>Note:</b> Limited data - projection assumes diminishing returns",
                xref="paper", yref="paper",
                x=0.01, y=-0.12,
                showarrow=False,
                font=dict(size=11, color="#B1B5B9", family="Geist, sans-serif"),
                bgcolor="rgba(1, 9, 26, 0.9)",
                bordercolor="rgba(245, 246, 247, 0.3)",
                borderwidth=1,
                borderpad=4
            )
        ]
    )

    return fig, date_99, months_from_now
requirements.txt CHANGED
@@ -2,4 +2,5 @@ gradio==5.35.0
2
  pandas
3
  matplotlib
4
  plotly==5.24.1
5
- pydantic==2.10.6
 
 
2
  pandas
3
  matplotlib
4
  plotly==5.24.1
5
+ pydantic==2.10.6
6
+ scipy
results_v2.csv CHANGED
@@ -1,23 +1,23 @@
1
- Model,Vendor,Avg AC,Avg TSQ,Avg Total Cost,Avg Session Duration,Avg Turns,Banking AC,Healthcare AC,Insurance AC,Investment AC,Telecom AC,Banking TSQ,Healthcare TSQ,Insurance TSQ,Investment TSQ,Telecom TSQ,Avg Input Cost ($),Avg Output Cost ($),Banking Cost,Healthcare Cost,Insurance Cost,Investment Cost,Telecom Cost,Banking Duration,Healthcare Duration,Insurance Duration,Investment Duration,Telecom Duration,Banking Turns,Healthcare Turns,Insurance Turns,Investment Turns,Telecom Turns,Model Type,$/M input token,$/M output token,Output Type
2
- gpt-4.1-2025-04-14,OpenAI,0.62,0.8,0.0684,24.32,3.1,0.6,0.62,0.66,0.64,0.58,0.81,0.83,0.68,0.88,0.82,0.0577,0.0107,0.052,0.0711,0.0629,0.0777,0.0783,18.52,24.4,25.24,27.88,25.58,2.61,3.15,2.92,3.3,3.48,Proprietary,2.0,8.0,Normal
3
- mistral-medium-2508,Mistral,0.61,0.77,0.0199,37.45,2.98,0.57,0.6,0.7,0.57,0.59,0.74,0.75,0.73,0.87,0.76,0.0164,0.0035,0.0185,0.0196,0.0195,0.0223,0.0195,31.91,34.94,36.96,43.91,39.53,3.06,2.85,2.92,3.12,2.97,Proprietary,0.4,2.0,Normal
4
- gpt-4.1-mini-2025-04-14,OpenAI,0.56,0.79,0.0141,26.0,3.43,0.56,0.6,0.46,0.5,0.64,0.8,0.85,0.63,0.84,0.83,0.0123,0.0018,0.0115,0.0143,0.0131,0.0164,0.0156,21.28,26.82,23.32,30.5,28.07,2.99,3.32,3.28,3.76,3.79,Proprietary,0.4,1.6,Normal
5
- claude-sonnet-4-20250514,Anthropic,0.55,0.92,0.1537,66.6,2.89,0.58,0.62,0.53,0.49,0.53,0.9,0.95,0.93,0.92,0.9,0.1212,0.0325,0.1359,0.1542,0.1442,0.1669,0.1675,55.36,57.93,56.87,86.44,76.38,2.54,2.84,2.79,3.06,3.22,Proprietary,3.0,15.0,Normal
6
- kimi-k2-instruct,Moonshot AI,0.53,0.9,0.0386,163.62,2.84,0.58,0.49,0.58,0.47,0.53,0.89,0.91,0.88,0.93,0.91,0.0346,0.004,0.0344,0.0401,0.0367,0.0419,0.0397,165.45,155.42,164.9,161.14,171.17,2.58,2.81,2.79,3.1,2.93,Open source,1.0,3.0,Normal
7
- qwen3-235b-a22b-instruct-2507,Alibaba,0.53,0.85,0.0074,238.02,2.4,0.44,0.49,0.74,0.41,0.58,0.88,0.84,0.85,0.91,0.79,0.0067,0.0007,0.0059,0.0079,0.0077,0.0079,0.008,206.53,267.85,233.49,252.94,229.29,1.99,2.41,2.53,2.47,2.62,Open source,0.2,0.6,Reasoning
8
- qwen2.5-72b-instruct,Alibaba,0.51,0.8,0.0361,34.68,2.65,0.48,0.61,0.52,0.42,0.52,0.78,0.84,0.77,0.82,0.79,0.0338,0.0023,0.0292,0.0348,0.0338,0.0417,0.0415,27.34,29.09,30.46,41.32,45.2,2.3,2.46,2.47,3.0,3.0,Open source,0.9,0.9,Normal
9
- gemini-2.5-flash-lite,Google,0.47,0.84,0.0039,9.8,3.11,0.45,0.6,0.54,0.35,0.41,0.82,0.9,0.78,0.86,0.84,0.0034,0.0005,0.003,0.0036,0.0039,0.0049,0.0043,7.96,8.82,9.52,11.31,11.39,2.61,3.0,2.87,3.69,3.36,Proprietary,0.1,0.4,Reasoning
10
- glm-4.5-air,Zai,0.44,0.94,0.0194,69.17,4.96,0.49,0.4,0.53,0.46,0.33,0.94,0.91,0.94,0.96,0.94,0.014,0.0054,0.0152,0.021,0.0191,0.0216,0.0199,57.56,69.95,86.47,76.51,55.35,3.99,5.2,5.02,5.41,5.2,Open source,0.2,1.1,Reasoning
11
- gemini-2.5-pro,Google,0.43,0.86,0.1447,125.85,3.57,0.45,0.4,0.54,0.31,0.44,0.88,0.87,0.87,0.85,0.83,0.0442,0.1005,0.1253,0.1475,0.1386,0.1464,0.1656,108.83,126.91,121.99,129.62,141.92,3.1,3.57,3.49,3.7,3.97,Proprietary,1.25,10.0,Reasoning
12
- grok-4-0709,xAI,0.42,0.88,0.2387,225.94,3.19,0.29,0.4,0.48,0.5,0.42,0.92,0.84,0.91,0.9,0.82,0.1008,0.1379,0.2295,0.2679,0.2073,0.2257,0.2632,226.62,326.28,157.1,189.18,230.5,2.98,3.42,3.07,3.01,3.46,Proprietary,3.0,15.0,Reasoning
13
- deepseek-v3,Deepseek,0.4,0.8,0.0141,59.97,3.71,0.38,0.32,0.48,0.36,0.47,0.8,0.74,0.76,0.87,0.81,0.0119,0.0022,0.0123,0.0158,0.0139,0.0151,0.0138,44.46,68.38,48.54,70.21,68.27,3.27,4.2,3.48,3.79,3.83,Open source,0.27,1.1,Normal
14
- gemini-2.5-flash,Google,0.38,0.94,0.0271,39.84,3.9,0.48,0.38,0.44,0.22,0.36,0.94,0.94,0.94,0.94,0.95,0.0123,0.0148,0.0248,0.0283,0.0273,0.0308,0.0241,33.03,36.81,38.24,42.78,48.34,3.53,3.96,3.78,4.28,3.98,Proprietary,0.3,2.5,Reasoning
15
- gpt-4.1-nano-2025-04-14,OpenAI,0.38,0.63,0.0038,12.36,3.56,0.4,0.4,0.41,0.29,0.38,0.64,0.54,0.54,0.77,0.65,0.0034,0.0004,0.0029,0.0038,0.004,0.0042,0.0041,14.16,10.9,12.23,12.68,11.83,2.88,3.24,3.78,4.01,3.91,Proprietary,0.1,0.4,Normal
16
- qwen3-235b-a22b-thinking-2507,Alibaba,0.34,0.85,0.0584,302.24,3.12,0.42,0.3,0.42,0.23,0.34,0.84,0.82,0.86,0.91,0.84,0.0275,0.0309,0.0535,0.0679,0.0573,0.0562,0.0575,309.41,310.33,316.64,266.96,307.84,2.86,3.43,3.2,3.03,3.1,Open source,0.65,3.0,Reasoning
17
- magistral-medium-2506,Mistral,0.32,0.59,0.1182,32.96,4.4,0.3,0.35,0.38,0.26,0.3,0.59,0.67,0.56,0.63,0.51,0.108,0.0102,0.1067,0.0994,0.1077,0.1476,0.1294,24.98,35.81,33.33,39.18,31.49,4.21,3.46,3.92,5.36,5.07,Proprietary,2.0,5.0,Reasoning
18
- nova-pro-v1,Amazon,0.29,0.65,0.0359,27.96,3.04,0.33,0.29,0.39,0.17,0.29,0.6,0.57,0.64,0.83,0.6,0.0316,0.0043,0.0304,0.0353,0.0359,0.04,0.038,23.45,27.94,27.9,32.09,28.43,2.72,2.88,2.99,3.36,3.26,Proprietary,0.8,3.2,Normal
19
- mistral-small-2506,Mistral,0.26,0.71,0.0053,35.69,4.37,0.37,0.28,0.22,0.2,0.21,0.73,0.71,0.65,0.76,0.69,0.0049,0.0004,0.0041,0.0057,0.0054,0.0058,0.0056,30.64,36.02,30.83,41.96,39.02,3.3,4.47,4.52,4.87,4.67,Open source,0.1,0.3,Normal
20
- llama-3.3-70b-instruct,Meta,0.2,0.62,0.0599,19.92,3.83,0.11,0.29,0.29,0.14,0.16,0.62,0.64,0.62,0.64,0.57,0.0588,0.0011,0.055,0.0544,0.0545,0.0664,0.069,17.62,19.4,18.55,23.91,20.14,3.61,3.34,3.42,4.29,4.5,Open source,0.88,0.88,Normal
21
- caller,Arcee,0.16,0.65,0.0297,25.66,4.2,0.23,0.14,0.22,0.09,0.12,0.69,0.6,0.68,0.61,0.67,0.0282,0.0015,0.0262,0.0303,0.0305,0.0331,0.0286,22.83,25.54,26.42,29.66,23.85,3.76,4.19,4.18,4.75,4.14,Open source,0.55,0.85,Normal
22
- nova-lite-v1,Amazon,0.16,0.55,0.0031,20.26,3.73,0.12,0.18,0.19,0.15,0.18,0.48,0.49,0.58,0.72,0.49,0.0027,0.0004,0.0026,0.0033,0.0031,0.0034,0.0029,17.53,20.61,19.67,24.28,19.2,3.31,4.13,3.57,4.04,3.62,Proprietary,0.06,0.24,Normal
23
- magistral-small-2506,Mistral,0.16,0.53,0.0301,17.42,5.68,0.23,0.18,0.13,0.16,0.12,0.57,0.46,0.42,0.62,0.6,0.0275,0.0026,0.0245,0.0335,0.0302,0.034,0.0281,14.53,21.36,14.65,19.67,16.87,4.74,6.28,6.14,6.06,5.19,Open source,0.5,1.5,Reasoning
 
1
+ Model,Vendor,Avg AC,Avg TSQ,Avg Total Cost,Avg Session Duration,Avg Turns,Banking AC,Healthcare AC,Insurance AC,Investment AC,Telecom AC,Banking TSQ,Healthcare TSQ,Insurance TSQ,Investment TSQ,Telecom TSQ,Avg Input Cost ($),Avg Output Cost ($),Banking Cost,Healthcare Cost,Insurance Cost,Investment Cost,Telecom Cost,Banking Duration,Healthcare Duration,Insurance Duration,Investment Duration,Telecom Duration,Banking Turns,Healthcare Turns,Insurance Turns,Investment Turns,Telecom Turns,Model Type,$/M input token,$/M output token,Output Type,Release Date
2
+ gpt-4.1-2025-04-14,OpenAI,0.62,0.8,0.0684,24.32,3.1,0.6,0.62,0.66,0.64,0.58,0.81,0.83,0.68,0.88,0.82,0.0577,0.0107,0.052,0.0711,0.0629,0.0777,0.0783,18.52,24.4,25.24,27.88,25.58,2.61,3.15,2.92,3.3,3.48,Proprietary,2.0,8.0,Normal,2025-04
3
+ mistral-medium-2508,Mistral,0.61,0.77,0.0199,37.45,2.98,0.57,0.6,0.7,0.57,0.59,0.74,0.75,0.73,0.87,0.76,0.0164,0.0035,0.0185,0.0196,0.0195,0.0223,0.0195,31.91,34.94,36.96,43.91,39.53,3.06,2.85,2.92,3.12,2.97,Proprietary,0.4,2.0,Normal,2025-08
4
+ gpt-4.1-mini-2025-04-14,OpenAI,0.56,0.79,0.0141,26.0,3.43,0.56,0.6,0.46,0.5,0.64,0.8,0.85,0.63,0.84,0.83,0.0123,0.0018,0.0115,0.0143,0.0131,0.0164,0.0156,21.28,26.82,23.32,30.5,28.07,2.99,3.32,3.28,3.76,3.79,Proprietary,0.4,1.6,Normal,2025-04
5
+ claude-sonnet-4-20250514,Anthropic,0.55,0.92,0.1537,66.6,2.89,0.58,0.62,0.53,0.49,0.53,0.9,0.95,0.93,0.92,0.9,0.1212,0.0325,0.1359,0.1542,0.1442,0.1669,0.1675,55.36,57.93,56.87,86.44,76.38,2.54,2.84,2.79,3.06,3.22,Proprietary,3.0,15.0,Normal,2025-05
6
+ kimi-k2-instruct,Moonshot AI,0.53,0.9,0.0386,163.62,2.84,0.58,0.49,0.58,0.47,0.53,0.89,0.91,0.88,0.93,0.91,0.0346,0.004,0.0344,0.0401,0.0367,0.0419,0.0397,165.45,155.42,164.9,161.14,171.17,2.58,2.81,2.79,3.1,2.93,Open source,1.0,3.0,Normal,2025-07
7
+ qwen3-235b-a22b-instruct-2507,Alibaba,0.53,0.85,0.0074,238.02,2.4,0.44,0.49,0.74,0.41,0.58,0.88,0.84,0.85,0.91,0.79,0.0067,0.0007,0.0059,0.0079,0.0077,0.0079,0.008,206.53,267.85,233.49,252.94,229.29,1.99,2.41,2.53,2.47,2.62,Open source,0.2,0.6,Reasoning,2025-07
8
+ qwen2.5-72b-instruct,Alibaba,0.51,0.8,0.0361,34.68,2.65,0.48,0.61,0.52,0.42,0.52,0.78,0.84,0.77,0.82,0.79,0.0338,0.0023,0.0292,0.0348,0.0338,0.0417,0.0415,27.34,29.09,30.46,41.32,45.2,2.3,2.46,2.47,3.0,3.0,Open source,0.9,0.9,Normal,2024-09
9
+ gemini-2.5-flash-lite,Google,0.47,0.84,0.0039,9.8,3.11,0.45,0.6,0.54,0.35,0.41,0.82,0.9,0.78,0.86,0.84,0.0034,0.0005,0.003,0.0036,0.0039,0.0049,0.0043,7.96,8.82,9.52,11.31,11.39,2.61,3.0,2.87,3.69,3.36,Proprietary,0.1,0.4,Reasoning,2025-07
10
+ glm-4.5-air,Zai,0.44,0.94,0.0194,69.17,4.96,0.49,0.4,0.53,0.46,0.33,0.94,0.91,0.94,0.96,0.94,0.014,0.0054,0.0152,0.021,0.0191,0.0216,0.0199,57.56,69.95,86.47,76.51,55.35,3.99,5.2,5.02,5.41,5.2,Open source,0.2,1.1,Reasoning,2025-07
11
+ gemini-2.5-pro,Google,0.43,0.86,0.1447,125.85,3.57,0.45,0.4,0.54,0.31,0.44,0.88,0.87,0.87,0.85,0.83,0.0442,0.1005,0.1253,0.1475,0.1386,0.1464,0.1656,108.83,126.91,121.99,129.62,141.92,3.1,3.57,3.49,3.7,3.97,Proprietary,1.25,10.0,Reasoning,2025-03
12
+ grok-4-0709,xAI,0.42,0.88,0.2387,225.94,3.19,0.29,0.4,0.48,0.5,0.42,0.92,0.84,0.91,0.9,0.82,0.1008,0.1379,0.2295,0.2679,0.2073,0.2257,0.2632,226.62,326.28,157.1,189.18,230.5,2.98,3.42,3.07,3.01,3.46,Proprietary,3.0,15.0,Reasoning,2025-07
13
+ deepseek-v3,Deepseek,0.4,0.8,0.0141,59.97,3.71,0.38,0.32,0.48,0.36,0.47,0.8,0.74,0.76,0.87,0.81,0.0119,0.0022,0.0123,0.0158,0.0139,0.0151,0.0138,44.46,68.38,48.54,70.21,68.27,3.27,4.2,3.48,3.79,3.83,Open source,0.27,1.1,Normal,2024-12
14
+ gemini-2.5-flash,Google,0.38,0.94,0.0271,39.84,3.9,0.48,0.38,0.44,0.22,0.36,0.94,0.94,0.94,0.94,0.95,0.0123,0.0148,0.0248,0.0283,0.0273,0.0308,0.0241,33.03,36.81,38.24,42.78,48.34,3.53,3.96,3.78,4.28,3.98,Proprietary,0.3,2.5,Reasoning,2025-06
15
+ gpt-4.1-nano-2025-04-14,OpenAI,0.38,0.63,0.0038,12.36,3.56,0.4,0.4,0.41,0.29,0.38,0.64,0.54,0.54,0.77,0.65,0.0034,0.0004,0.0029,0.0038,0.004,0.0042,0.0041,14.16,10.9,12.23,12.68,11.83,2.88,3.24,3.78,4.01,3.91,Proprietary,0.1,0.4,Normal,2025-04
16
+ qwen3-235b-a22b-thinking-2507,Alibaba,0.34,0.85,0.0584,302.24,3.12,0.42,0.3,0.42,0.23,0.34,0.84,0.82,0.86,0.91,0.84,0.0275,0.0309,0.0535,0.0679,0.0573,0.0562,0.0575,309.41,310.33,316.64,266.96,307.84,2.86,3.43,3.2,3.03,3.1,Open source,0.65,3.0,Reasoning,2025-07
17
+ magistral-medium-2506,Mistral,0.32,0.59,0.1182,32.96,4.4,0.3,0.35,0.38,0.26,0.3,0.59,0.67,0.56,0.63,0.51,0.108,0.0102,0.1067,0.0994,0.1077,0.1476,0.1294,24.98,35.81,33.33,39.18,31.49,4.21,3.46,3.92,5.36,5.07,Proprietary,2.0,5.0,Reasoning,2025-06
18
+ nova-pro-v1,Amazon,0.29,0.65,0.0359,27.96,3.04,0.33,0.29,0.39,0.17,0.29,0.6,0.57,0.64,0.83,0.6,0.0316,0.0043,0.0304,0.0353,0.0359,0.04,0.038,23.45,27.94,27.9,32.09,28.43,2.72,2.88,2.99,3.36,3.26,Proprietary,0.8,3.2,Normal,2024-12
19
+ mistral-small-2506,Mistral,0.26,0.71,0.0053,35.69,4.37,0.37,0.28,0.22,0.2,0.21,0.73,0.71,0.65,0.76,0.69,0.0049,0.0004,0.0041,0.0057,0.0054,0.0058,0.0056,30.64,36.02,30.83,41.96,39.02,3.3,4.47,4.52,4.87,4.67,Open source,0.1,0.3,Normal,2025-06
20
+ llama-3.3-70b-instruct,Meta,0.2,0.62,0.0599,19.92,3.83,0.11,0.29,0.29,0.14,0.16,0.62,0.64,0.62,0.64,0.57,0.0588,0.0011,0.055,0.0544,0.0545,0.0664,0.069,17.62,19.4,18.55,23.91,20.14,3.61,3.34,3.42,4.29,4.5,Open source,0.88,0.88,Normal,2024-12
21
+ caller,Arcee,0.16,0.65,0.0297,25.66,4.2,0.23,0.14,0.22,0.09,0.12,0.69,0.6,0.68,0.61,0.67,0.0282,0.0015,0.0262,0.0303,0.0305,0.0331,0.0286,22.83,25.54,26.42,29.66,23.85,3.76,4.19,4.18,4.75,4.14,Open source,0.55,0.85,Normal,2025-01
22
+ nova-lite-v1,Amazon,0.16,0.55,0.0031,20.26,3.73,0.12,0.18,0.19,0.15,0.18,0.48,0.49,0.58,0.72,0.49,0.0027,0.0004,0.0026,0.0033,0.0031,0.0034,0.0029,17.53,20.61,19.67,24.28,19.2,3.31,4.13,3.57,4.04,3.62,Proprietary,0.06,0.24,Normal,2024-12
23
+ magistral-small-2506,Mistral,0.16,0.53,0.0301,17.42,5.68,0.23,0.18,0.13,0.16,0.12,0.57,0.46,0.42,0.62,0.6,0.0275,0.0026,0.0245,0.0335,0.0302,0.034,0.0281,14.53,21.36,14.65,19.67,16.87,4.74,6.28,6.14,6.06,5.19,Open source,0.5,1.5,Reasoning,2025-06
tabs/leaderboard_v2.py CHANGED
@@ -4,10 +4,11 @@ import plotly.graph_objects as go
4
 
5
  # Import components and styles from modular files
6
  from components.leaderboard_components import (
7
- get_chart_colors, get_rank_badge, get_type_badge,
8
  get_output_type_badge, get_score_bar, get_metric_tooltip,
9
  get_responsive_styles, get_faq_section, SORT_COLUMN_MAP
10
  )
 
11
  from styles.leaderboard_styles import get_leaderboard_css
12
 
13
 
@@ -1639,7 +1640,112 @@ def create_leaderboard_v2_tab():
1639
  </div>
1640
  </div>
1641
  """)
1642
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1643
  # Radar Chart Section
1644
  gr.HTML("""
1645
  <div class="dark-container" style="margin-bottom: 24px;">
@@ -1847,22 +1953,51 @@ def create_leaderboard_v2_tab():
1847
 
1848
  return create_domain_radar_chart(load_leaderboard_data(), sort_by, valid_selected)
1849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1850
  # Update table when filters change
1851
  filter_inputs = [domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order]
1852
-
1853
  for input_component in filter_inputs:
1854
  input_component.change(
1855
  fn=update_table,
1856
  inputs=filter_inputs,
1857
  outputs=[leaderboard_title, leaderboard_table]
1858
  )
1859
-
1860
  # Also update radar chart when filters change
1861
  input_component.change(
1862
  fn=update_radar_chart,
1863
  inputs=filter_inputs + [model_selector],
1864
  outputs=[model_selector, radar_chart]
1865
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1866
 
1867
  # Update radar chart when model selection changes
1868
  model_selector.change(
 
4
 
5
  # Import components and styles from modular files
6
  from components.leaderboard_components import (
7
+ get_chart_colors, get_rank_badge, get_type_badge,
8
  get_output_type_badge, get_score_bar, get_metric_tooltip,
9
  get_responsive_styles, get_faq_section, SORT_COLUMN_MAP
10
  )
11
+ from components.prediction_components import create_ac_prediction_chart
12
  from styles.leaderboard_styles import get_leaderboard_css
13
 
14
 
 
1640
  </div>
1641
  </div>
1642
  """)
1643
+
1644
+ # AI Agent Reliability Prediction Section
1645
+ gr.HTML("""
1646
+ <div class="dark-container" style="margin-bottom: 24px;">
1647
+ <div class="section-header">
1648
+ <span class="section-icon" style="color: var(--accent-secondary);">📈</span>
1649
+ <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
1650
+ Enterprise Readiness Prediction
1651
+ </h3>
1652
+ </div>
1653
+ <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
1654
+ When will AI agents reach 99% reliability for enterprise deployment?
1655
+ </p>
1656
+ """)
1657
+
1658
+ # Add prediction chart - make it reactive
1659
+ gr.HTML('<div class="chart-container">')
1660
+
1661
+ # Create initial prediction with default filters
1662
+ initial_prediction_chart, initial_date_99, initial_months_to_99 = create_ac_prediction_chart(
1663
+ load_leaderboard_data(), domain_filter="All", model_type_filter="All"
1664
+ )
1665
+
1666
+ prediction_plot = gr.Plot(
1667
+ label="",
1668
+ value=initial_prediction_chart,
1669
+ elem_classes=["prediction-chart", "plot-container"]
1670
+ )
1671
+ gr.HTML('</div>')
1672
+
1673
+ # Add dynamic insights section
1674
def generate_insight_html(date_99, months_to_99, domain_filter="All", model_type_filter="All"):
    """Build the insight panel HTML shown beneath the prediction chart.

    Args:
        date_99: Object exposing ``strftime`` (e.g. ``datetime``) for the
            projected date at which 99% AC is reached, or a falsy value
            (``None``) when no prediction is available.
        months_to_99: Months from now until ``date_99``. Zero or a negative
            number means the threshold is already reached. ``None`` means
            no prediction is available.
        domain_filter: Domain selection. Either the plain ``"All"`` default
            or an emoji-prefixed label such as ``"🏦 Banking"``.
        model_type_filter: Model type selection; ``"All"`` means unfiltered.

    Returns:
        An HTML fragment (``str``) for one of three states: a future
        prediction, "already enterprise ready", or "no prediction".
    """
    # Emoji prefix of the domain label -> human readable domain name.
    domain_prefixes = (
        ('🌐', "All Domains"),
        ('🏦', "Banking"),
        ('🏥', "Healthcare"),
        ('🛡️', "Insurance"),
        ('💰', "Investment"),
        ('📱', "Telecom"),
    )

    # A cleared selection (None / "") is treated the same as "All".
    domain_filter = domain_filter or "All"
    model_type_filter = model_type_filter or "All"

    if domain_filter == "All":
        # The plain "All" default carries no emoji prefix; without this case
        # the unfiltered view would be labelled " for <strong>All</strong>".
        domain_clean = "All Domains"
    else:
        domain_clean = next(
            (name for prefix, name in domain_prefixes if domain_filter.startswith(prefix)),
            domain_filter,
        )

    has_domain = domain_clean != "All Domains"
    has_model_type = model_type_filter != "All"

    # Suffix such as " for <strong>Banking</strong> (Reasoning models)".
    filter_context = ""
    if has_domain or has_model_type:
        filter_context = " for "
        if has_domain:
            filter_context += f"<strong>{domain_clean}</strong>"
        if has_model_type:
            if has_domain:
                filter_context += f" ({model_type_filter} models)"
            else:
                filter_context += f"<strong>{model_type_filter} models</strong>"

    # months_to_99 is compared against None explicitly: 0 is a valid value
    # meaning "threshold reached now" and must not be read as "no prediction".
    if not date_99 or months_to_99 is None:
        if has_domain or has_model_type:
            message = (
                'Insufficient data' + filter_context
                + ' to make reliable predictions. More models need to be evaluated in this category.'
            )
        else:
            message = (
                'Based on current data trends, we are tracking the exponential improvement '
                'in AI agent capabilities. As more models are released, our predictions will '
                'become more accurate.'
            )
        return f"""
        <div style="margin-top: 20px; padding: 16px; background: rgba(245, 246, 247, 0.03); border: 1px solid var(--border-subtle); border-radius: 12px;">
            <p style="color: var(--text-secondary); margin: 0;">
                {message}
            </p>
        </div>
        """

    if months_to_99 > 0:
        return f"""
        <div style="margin-top: 20px; padding: 16px; background: linear-gradient(145deg, rgba(16, 152, 247, 0.1) 0%, rgba(227, 84, 84, 0.05) 100%); border: 1px solid var(--border-subtle); border-radius: 12px;">
            <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 12px;">
                <span style="font-size: 1.5rem;">🎯</span>
                <span style="font-weight: 700; color: var(--text-primary); font-size: 1.1rem;">Key Prediction Insights{filter_context}</span>
            </div>
            <ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.8;">
                <li><strong style="color: var(--accent-primary);">Enterprise threshold (99% AC)</strong> predicted by <strong style="color: var(--accent-secondary);">{date_99.strftime('%B %Y')}</strong></li>
                <li>Approximately <strong style="color: var(--accent-primary);">{months_to_99:.0f} months</strong> from current date</li>
                <li>Performance trends based on <strong>historical data{filter_context}</strong></li>
                <li>Early adopters who invest now gain <strong>{months_to_99:.0f} months</strong> of competitive advantage</li>
            </ul>
        </div>
        """

    # months_to_99 <= 0: the threshold has already been reached.
    return f"""
        <div style="margin-top: 20px; padding: 16px; background: linear-gradient(145deg, rgba(40, 167, 69, 0.1) 0%, rgba(227, 84, 84, 0.05) 100%); border: 1px solid var(--border-subtle); border-radius: 12px;">
            <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 12px;">
                <span style="font-size: 1.5rem;">✅</span>
                <span style="font-weight: 700; color: var(--text-primary); font-size: 1.1rem;">Enterprise Ready{filter_context}!</span>
            </div>
            <ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.8;">
                <li>Models{filter_context} have <strong style="color: #28a745;">already achieved</strong> near-enterprise reliability</li>
                <li>Focus should shift to <strong>implementation and scaling</strong></li>
                <li>Investment in guardrails and observability is <strong>critical now</strong></li>
            </ul>
        </div>
        """
1741
+
1742
+ # Create the insights HTML component
1743
+ prediction_insights = gr.HTML(
1744
+ generate_insight_html(initial_date_99, initial_months_to_99, "All", "All")
1745
+ )
1746
+
1747
+ gr.HTML("</div>")
1748
+
1749
  # Radar Chart Section
1750
  gr.HTML("""
1751
  <div class="dark-container" style="margin-bottom: 24px;">
 
1953
 
1954
  return create_domain_radar_chart(load_leaderboard_data(), sort_by, valid_selected)
1955
 
1956
+ # Function to update prediction chart and insights
1957
def update_prediction_chart_and_insights(domain_filter, model_type_filter):
    """Rebuild the prediction figure and its insight panel for the given filters.

    Returns a ``(chart, insights_html)`` pair, in the order expected by the
    ``[prediction_plot, prediction_insights]`` outputs.
    """
    # Reload the leaderboard data and recompute the prediction for the filters.
    prediction = create_ac_prediction_chart(
        load_leaderboard_data(),
        domain_filter=domain_filter,
        model_type_filter=model_type_filter,
    )
    figure, target_date, months_left = prediction

    # Render the insight panel from the same prediction result.
    return figure, generate_insight_html(
        target_date, months_left, domain_filter, model_type_filter
    )
1970
+
1971
  # Update table when filters change
1972
  filter_inputs = [domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order]
1973
+
1974
  for input_component in filter_inputs:
1975
  input_component.change(
1976
  fn=update_table,
1977
  inputs=filter_inputs,
1978
  outputs=[leaderboard_title, leaderboard_table]
1979
  )
1980
+
1981
  # Also update radar chart when filters change
1982
  input_component.change(
1983
  fn=update_radar_chart,
1984
  inputs=filter_inputs + [model_selector],
1985
  outputs=[model_selector, radar_chart]
1986
  )
1987
+
1988
+ # Update prediction chart when domain or model type filters change
1989
+ # Only react to domain_filter and model_type_filter, not other filters
1990
+ domain_filter.change(
1991
+ fn=update_prediction_chart_and_insights,
1992
+ inputs=[domain_filter, model_type_filter],
1993
+ outputs=[prediction_plot, prediction_insights]
1994
+ )
1995
+
1996
+ model_type_filter.change(
1997
+ fn=update_prediction_chart_and_insights,
1998
+ inputs=[domain_filter, model_type_filter],
1999
+ outputs=[prediction_plot, prediction_insights]
2000
+ )
2001
 
2002
  # Update radar chart when model selection changes
2003
  model_selector.change(