|
|
"""
create_visualization.py

Creates visualization of multilingual embeddings and anomaly detection results
using actual training data.
"""
|
|
import sys
from pathlib import Path

import joblib
import matplotlib

matplotlib.use('Agg')  # select the non-interactive Agg backend before pyplot is imported

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
|
|
|
|
|
# On Windows, switch stdout to UTF-8 before anything is printed
# (presumably to avoid encoding errors on consoles that use a legacy code page).
if sys.platform == 'win32':
    sys.stdout.reconfigure(encoding='utf-8')

# Opening banner: the title framed by two 60-character rules.
print("=" * 60, " CREATING VECTORIZER VISUALIZATION", "=" * 60, sep="\n")
|
|
|
|
|
|
|
|
# Directory that holds the saved embedding matrices (embeddings_*.npy).
embeddings_path = Path('models/anomaly-detection/artifacts/data_transformation')

# Candidate files in lexicographic order; the last one is treated as the latest.
# NOTE(review): this assumes the file names embed a sortable timestamp/run id - confirm.
emb_files = sorted(embeddings_path.glob('embeddings_*.npy'))

# Nothing to visualize without embeddings: report and stop with a non-zero status.
if not emb_files:
    print('No embeddings found')
    sys.exit(1)

latest_emb = emb_files[-1]
embeddings = np.load(latest_emb)
print(f'Loaded embeddings: {embeddings.shape}')
|
|
|
|
|
|
|
|
# Per-sample language labels are read from the newest transformed_*.parquet
# file stored next to the embeddings (file names are compared lexicographically).
data_files = sorted(embeddings_path.glob('transformed_*.parquet'))

if data_files:
    latest_data = data_files[-1]
    df = pd.read_parquet(latest_data)

    if 'language' in df.columns and len(df) == len(embeddings):
        languages = df['language'].values
        lang_counts = df['language'].value_counts().to_dict()
    else:
        # The labels are later turned into boolean masks over the embedding
        # rows, so a missing column or a row-count mismatch would otherwise
        # end in a KeyError here or an IndexError while plotting. Keep going
        # with a neutral label so the rest of the figure can still be drawn.
        print(f"Warning: {latest_data.name} does not provide one 'language' label "
              f"per embedding ({len(df)} rows vs {len(embeddings)} embeddings); "
              "labelling all samples as 'unknown'")
        languages = ['unknown'] * len(embeddings)
        lang_counts = {'unknown': len(embeddings)}

    print(f'Languages: {lang_counts}')
else:
    # No transformed data available: every sample is labelled 'english'.
    languages = ['english'] * len(embeddings)
    lang_counts = {'english': len(embeddings)}
|
|
|
|
|
|
|
|
# Trained anomaly detector that is applied to the raw embeddings.
model_path = Path('models/anomaly-detection/artifacts/model_trainer/isolation_forest_embeddings_only.joblib')

# Handle a missing model the same way as missing embeddings: a short message
# and a non-zero exit status instead of a raw traceback.
if not model_path.is_file():
    print(f'No trained model found: {model_path}')
    sys.exit(1)

# NOTE: joblib.load unpickles the file, so only trusted artifacts should be loaded.
model = joblib.load(model_path)
predictions = model.predict(embeddings)

# Samples predicted as -1 are treated as anomalies; everything else is normal.
anomaly_mask = predictions == -1

print(f'Total samples: {len(embeddings)}')
print(f'Anomalies detected: {anomaly_mask.sum()}')
print(f'Normal samples: {(~anomaly_mask).sum()}')
|
|
|
|
|
|
|
|
print('\nRunning PCA...')

# A two-component projection needs a 2-D matrix with at least two samples and
# two features; report that clearly instead of failing inside scikit-learn.
if embeddings.ndim != 2 or min(embeddings.shape) < 2:
    print(f'Cannot project embeddings of shape {embeddings.shape} onto 2 components')
    sys.exit(1)

# random_state only matters when scikit-learn selects a randomized/arpack SVD
# solver; pinning it keeps the projection (and the saved figure) reproducible
# from run to run.
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(embeddings)
print(f'Explained variance: {pca.explained_variance_ratio_.sum():.2%}')
|
|
|
|
|
|
|
|
# One figure with two side-by-side panels: languages (left), anomalies (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: the 2-D projection coloured by language label.
ax1 = axes[0]
colors = {'english': '#3498db', 'sinhala': '#2ecc71', 'tamil': '#e74c3c', 'unknown': '#95a5a6'}

# Convert the labels once instead of once per palette entry.
language_labels = np.asarray(languages)
# Tracks which samples have been drawn so unlisted labels are not lost.
plotted = np.zeros(len(language_labels), dtype=bool)
point_style = dict(alpha=0.7, s=60, edgecolors='white', linewidth=0.5)

for lang, color in colors.items():
    mask = language_labels == lang
    if mask.any():
        ax1.scatter(X_2d[mask, 0], X_2d[mask, 1],
                    c=color, label=f'{lang.capitalize()} ({mask.sum()})',
                    **point_style)
        plotted |= mask

# Samples whose label is not a palette key used to be silently omitted from
# this panel; draw them under a single "Other" entry so every point is shown.
remaining = ~plotted
if remaining.any():
    ax1.scatter(X_2d[remaining, 0], X_2d[remaining, 1],
                c='#34495e', label=f'Other ({remaining.sum()})',
                **point_style)

ax1.set_title('Text Embeddings by Language (PCA Projection)', fontsize=14, fontweight='bold')
ax1.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
ax1.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
ax1.legend(loc='best', framealpha=0.9)
ax1.grid(True, alpha=0.3)
|
|
|
|
|
|
|
|
# Right panel: the same 2-D projection, split into normal samples and anomalies.
ax2 = axes[1]
normal_mask = ~anomaly_mask

# Normal samples are plotted first; the anomaly markers are added afterwards.
normal_points = X_2d[normal_mask]
ax2.scatter(
    normal_points[:, 0],
    normal_points[:, 1],
    c='#3498db',
    label=f'Normal ({normal_mask.sum()})',
    alpha=0.6,
    s=60,
    edgecolors='white',
    linewidth=0.5,
)

# Anomalies use a larger, more opaque 'X' marker with a black outline.
anomaly_points = X_2d[anomaly_mask]
ax2.scatter(
    anomaly_points[:, 0],
    anomaly_points[:, 1],
    c='#e74c3c',
    marker='X',
    label=f'Anomaly ({anomaly_mask.sum()})',
    alpha=0.9,
    s=120,
    edgecolors='black',
    linewidth=0.5,
)

# Axis labels show the share of variance explained by each principal component.
pc1_share = pca.explained_variance_ratio_[0]
pc2_share = pca.explained_variance_ratio_[1]

ax2.set_title('Anomaly Detection Results (Isolation Forest)', fontsize=14, fontweight='bold')
ax2.set_xlabel(f'PC1 ({pc1_share:.1%} variance)')
ax2.set_ylabel(f'PC2 ({pc2_share:.1%} variance)')
ax2.legend(loc='best', framealpha=0.9)
ax2.grid(True, alpha=0.3)
|
|
|
|
|
plt.tight_layout()

# Every copy of the figure is written with the same export settings.
export_options = {'dpi': 150, 'bbox_inches': 'tight', 'facecolor': 'white'}

# Primary output, written relative to the current working directory.
output_path = 'vectorizer_anomaly_visualization.png'
plt.savefig(output_path, **export_options)
print(f'\nVisualization saved: {output_path}')

# Optional second copy, written only when this directory exists.
# NOTE(review): this is a machine-specific absolute path - confirm it is still wanted.
artifacts_dir = Path(r'C:\Users\LENOVO\.gemini\antigravity\brain\b892f63f-afbc-4c4a-bbf1-37195faf04a5')
if artifacts_dir.exists():
    artifacts_output = artifacts_dir / 'vectorizer_visualization.png'
    plt.savefig(str(artifacts_output), **export_options)
    print(f'Also saved to: {artifacts_output}')

# Release the figure once all copies have been written.
plt.close()
|
|
|
|
|
# Closing banner: a blank line, then the completion message framed by two rules.
print("\n" + "=" * 60, " VISUALIZATION COMPLETE", "=" * 60, sep="\n")
|
|
|