Spaces:

jmjoseph
/

talktuner-probe-training

Build error

App Files Community

jmjoseph committed on Aug 11

Commit

3aa287c

verified ·

1 Parent(s): 5413412

Update app with full training capabilities

Browse files

Files changed (1) hide show

app.py +338 -262

app.py CHANGED Viewed

@@ -1,26 +1,26 @@
 #!/usr/bin/env python3
 """
 HuggingFace Spaces app for TalkTuner probe training.
-Provides a complete interface for training and visualizing probe performance.
 """
 import gradio as gr
 import torch
 import os
 import json
-import zipfile
-import tempfile
-import base64
 from pathlib import Path
-import subprocess
-import sys
 from datetime import datetime
 import matplotlib.pyplot as plt
 import pandas as pd
-from io import BytesIO
-# Import the minimal trainer
-from train_probes_minimal import MinimalProbeTrainer, run_full_training
 # Check if we're running on HF Spaces
 IS_HF_SPACE = os.getenv("SPACE_ID") is not None
@@ -28,9 +28,9 @@ IS_HF_SPACE = os.getenv("SPACE_ID") is not None
 def check_environment():
     """Check the environment and available resources."""
     info = {
-        "Python Version": sys.version.split()[0],
-        "PyTorch Version": torch.__version__,
-        "CUDA Available": torch.cuda.is_available(),
         "Device": "cuda" if torch.cuda.is_available() else "cpu",
         "HF Space": IS_HF_SPACE,
     }
@@ -40,286 +40,362 @@ def check_environment():
         info["GPU Memory"] = f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
     else:
         info["CPU Count"] = os.cpu_count()
     return pd.DataFrame(list(info.items()), columns=['Property', 'Value'])
-def train_single_attribute(attribute, num_layers, progress=gr.Progress()):
-    """Train probes for a single attribute."""
-    progress(0, desc=f"Initializing trainer for {attribute}...")
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    trainer = MinimalProbeTrainer(device=device)
-    progress(0.2, desc=f"Training {attribute} probes...")
-    results = trainer.train_probes(attribute=attribute, num_layers_to_train=num_layers)
-    progress(1.0, desc="Training complete!")
-    # Load the generated visualization
-    viz_file = f"probe_results_{attribute}_*.png"
-    viz_files = list(Path(".").glob(viz_file))
-    if viz_files:
-        with open(viz_files[-1], "rb") as f:
-            img_data = f.read()
-        return results, viz_files[-1]
-    return results, None
-def train_all_attributes(num_layers, progress=gr.Progress()):
-    """Train probes for all attributes."""
-    progress(0, desc="Starting comprehensive training...")
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    trainer = MinimalProbeTrainer(device=device)
-    all_results = {}
-    all_images = []
-    attributes = ["age", "gender", "socioeco", "education"]
-    for i, attribute in enumerate(attributes):
-        progress((i / len(attributes)) * 0.8,
-                desc=f"Training {attribute} probes...")
-        results = trainer.train_probes(
-            attribute=attribute,
-            num_layers_to_train=num_layers
-        )
-        all_results[attribute] = results
-        # Find the generated visualization
-        viz_files = list(Path(".").glob(f"probe_results_{attribute}_*.png"))
-        if viz_files:
-            all_images.append(viz_files[-1])
-    progress(0.9, desc="Generating summary...")
-    # Create summary dataframe
-    summary_data = []
-    for attr, res in all_results.items():
-        summary_data.append({
-            "Attribute": attr.capitalize(),
-            "Best Layer": res["best_layer"],
-            "Best Accuracy": f"{res['best_accuracy']:.1f}%",
-            "Improvement": f"+{res['best_accuracy'] - 100/res['num_classes']:.1f}%",
-            "Num Classes": res['num_classes']
-        })
-    summary_df = pd.DataFrame(summary_data)
-    # Save results
-    output_file = f"full_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
-    with open(output_file, "w") as f:
-        json.dump({attr: {
-            k: v if not hasattr(v, 'tolist') else v.tolist()
-            for k, v in res.items() if k != 'best_confusion_matrix'
-        } for attr, res in all_results.items()}, f, indent=2)
-    progress(1.0, desc="Training complete!")
-    return summary_df, all_images, output_file
-def create_performance_plot(results_json):
-    """Create a performance comparison plot from results."""
-    with open(results_json, 'r') as f:
-        data = json.load(f)
-    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
-    axes = axes.ravel()
-    for idx, (attr, res) in enumerate(data.items()):
-        ax = axes[idx]
-        layers = res['layers']
-        train_acc = res['train_accuracies']
-        test_acc = res['test_accuracies']
-        ax.plot(layers, train_acc, 'b-', label='Train', marker='o')
-        ax.plot(layers, test_acc, 'r-', label='Test', marker='s')
-        ax.axhline(y=100/res['num_classes'], color='gray',
-                  linestyle='--', label='Random')
-        ax.set_xlabel('Layer')
-        ax.set_ylabel('Accuracy (%)')
-        ax.set_title(f"{attr.capitalize()} - Best: Layer {res['best_layer']} ({res['best_accuracy']:.1f}%)")
-        ax.legend()
-        ax.grid(True, alpha=0.3)
-    plt.suptitle('Probe Performance Across All Attributes', fontsize=16)
-    plt.tight_layout()
-    # Save to bytes
-    buf = BytesIO()
-    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
-    buf.seek(0)
-    plt.close()
-    return buf
-# Create Gradio interface
-with gr.Blocks(title="TalkTuner Probe Training", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🎯 TalkTuner Probe Training System
-    This interface demonstrates probe training for detecting demographic attributes in language models.
-    The system trains linear probes on different layers to identify age, gender, socioeconomic status, and education level.
-    **Note:** This demo uses GPT-2 with synthetic data for demonstration. Production training would use Llama-2-13b with real datasets.
-    """)
-    with gr.Tab("🏠 Environment"):
-        gr.Markdown("## System Information")
-        env_df = gr.Dataframe(label="Environment Details", interactive=False)
-        check_btn = gr.Button("Check Environment", variant="primary")
-        check_btn.click(check_environment, outputs=env_df)
-    with gr.Tab("🚀 Quick Training"):
-        gr.Markdown("""
-        ## Train Individual Attributes
-        Select an attribute and number of layers to train probes.
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                attribute = gr.Dropdown(
-                    choices=["age", "gender", "socioeco", "education"],
-                    value="age",
-                    label="Attribute to Train"
-                )
-                num_layers = gr.Slider(
-                    minimum=2,
-                    maximum=12,
-                    value=5,
-                    step=1,
-                    label="Number of Layers"
-                )
-                train_btn = gr.Button("Train Probes", variant="primary")
-            with gr.Column(scale=2):
-                result_json = gr.JSON(label="Training Results")
-                result_image = gr.Image(label="Performance Visualization")
-        train_btn.click(
-            train_single_attribute,
-            inputs=[attribute, num_layers],
-            outputs=[result_json, result_image]
-        )
-    with gr.Tab("📊 Full Training"):
-        gr.Markdown("""
-        ## Comprehensive Training
-        Train probes for all attributes and compare performance.
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                full_num_layers = gr.Slider(
-                    minimum=2,
-                    maximum=12,
-                    value=8,
-                    step=1,
-                    label="Number of Layers for All Attributes"
-                )
-                full_train_btn = gr.Button("Train All Attributes", variant="primary")
-        summary_df = gr.Dataframe(label="Training Summary", interactive=False)
-        with gr.Row():
-            image_gallery = gr.Gallery(
-                label="Performance Visualizations",
-                show_label=True,
-                elem_id="gallery",
-                columns=2,
-                rows=2,
-                height="auto"
-            )
-        results_file = gr.File(label="Download Results (JSON)")
-        full_train_btn.click(
-            train_all_attributes,
-            inputs=[full_num_layers],
-            outputs=[summary_df, image_gallery, results_file]
-        )
-    with gr.Tab("📈 Results Analysis"):
-        gr.Markdown("""
-        ## Performance Analysis
-        ### Key Findings from Training:
-        1. **Layer Performance**: Middle layers (3-7) typically show best performance for attribute detection
-        2. **Attribute Difficulty**:
-           - Gender (2 classes): Easiest to detect (~50% improvement over random)
-           - Age (4 classes): Most challenging (~75% improvement needed)
-        3. **Convergence**: Most probes converge within 10-20 epochs
-        ### Interpretation:
-        - **High accuracy** indicates the model has internal representations of these attributes
-        - **Layer differences** suggest different attributes are encoded at different depths
-        - **Improvement over random** shows the model genuinely learns these patterns
-        """)
-        gr.Markdown("""
-        ### Upload Results for Analysis
-        Upload a JSON results file to visualize performance across layers.
-        """)
-        with gr.Row():
-            upload_file = gr.File(label="Upload Results JSON", file_types=[".json"])
-            analyze_btn = gr.Button("Analyze Results")
-        analysis_plot = gr.Image(label="Performance Analysis")
-        def analyze_uploaded(file):
-            if file:
-                buf = create_performance_plot(file.name)
-                return buf
-            return None
-        analyze_btn.click(analyze_uploaded, inputs=[upload_file], outputs=[analysis_plot])
-    with gr.Tab("📚 Documentation"):
-        gr.Markdown("""
-        ## How Probe Training Works
-        ### 1. **Data Preparation**
-        - Extract activations from each layer of the model
-        - Label data with demographic attributes
-        - Split into training and test sets
-        ### 2. **Probe Architecture**
-        - Simple linear classifier on top of frozen model activations
-        - One probe per layer per attribute
-        - Trained with cross-entropy loss
-        ### 3. **Evaluation**
-        - Test accuracy shows how well attributes can be decoded
-        - Compare across layers to find optimal depth
-        - Improvement over random baseline indicates genuine learning
-        ### 4. **Interpretation**
-        - High probe accuracy = model internally represents this attribute
-        - Best performing layer = where attribute is most strongly encoded
-        - Can be used for bias detection and model understanding
-        ## Resource Requirements
-        | Training Type | Time | Memory | GPU |
-        |--------------|------|--------|-----|
-        | Demo (GPT-2, synthetic) | 1-2 min | 2GB | Optional |
-        | Full (Llama-2-13b, real) | 2-3 hours | 32GB | Required |
-        ## Next Steps
-        1. **Deploy to Production**: Use real datasets with Llama-2-13b
-        2. **Bias Mitigation**: Use probe outputs to detect and reduce bias
-        3. **User Control**: Allow users to see/modify detected attributes
         """)
-# Launch the app
 if __name__ == "__main__":
-    if IS_HF_SPACE:
-        demo.launch()
-    else:
-        demo.launch(share=False, debug=True, server_name="0.0.0.0", server_port=7860)

 #!/usr/bin/env python3
 """
 HuggingFace Spaces app for TalkTuner probe training.
+Full training interface for GPT-2 and Llama models.
 """
 import gradio as gr
 import torch
 import os
 import json
+import time
+import pickle
+import numpy as np
 from pathlib import Path
 from datetime import datetime
 import matplotlib.pyplot as plt
 import pandas as pd
+from typing import Dict, List, Tuple
+import logging
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Check if we're running on HF Spaces
 IS_HF_SPACE = os.getenv("SPACE_ID") is not None
 def check_environment():
     """Check the environment and available resources."""
     info = {
+        "Python Version": "3.10",
+        "PyTorch Version": torch.__version__ if 'torch' in globals() else "Not installed",
+        "CUDA Available": torch.cuda.is_available() if 'torch' in globals() else False,
         "Device": "cuda" if torch.cuda.is_available() else "cpu",
         "HF Space": IS_HF_SPACE,
     }
         info["GPU Memory"] = f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
     else:
         info["CPU Count"] = os.cpu_count()
+        info["RAM Available"] = "Check system"
     return pd.DataFrame(list(info.items()), columns=['Property', 'Value'])
+def train_probes(
+    model_name: str,
+    probe_type: str,
+    num_layers: int,
+    progress=gr.Progress()
+) -> Tuple[Dict, List[str], str]:
+    """
+    Train linear reading probes on the hidden states of the selected model.
+    Synthetic sentences are built from fixed templates and paired with randomly
+    drawn demographic labels. For every attribute and every probed layer a
+    logistic-regression classifier is fitted on mean-pooled hidden states
+    (padding excluded) using the first 80% of the samples, and scored on the
+    remaining 20%.
+    Args:
+        model_name: Model id handed to ``AutoModel.from_pretrained``.
+        probe_type: "reading", "controlling" or "both". Only reading probes are
+            implemented; "controlling" on its own yields an error result.
+        num_layers: How many layers to probe, starting at the first transformer
+            block; capped at the layer count reported by the model config.
+        progress: Gradio progress tracker used for status updates.
+    Returns:
+        - results: Dictionary with training results, or ``{"error": ...}``
+        - plot_paths: List of paths to generated plots (empty on failure)
+        - summary: Text summary of results, or the failure message
+    """
+    progress(0, desc="Initializing training...")
+    # Imported lazily so a missing package is reported in the UI instead of
+    # stopping the app from starting.
+    try:
+        from transformers import AutoModel, AutoTokenizer
+        from sklearn.linear_model import LogisticRegression
+        from sklearn.preprocessing import LabelEncoder
+    except ImportError as e:
+        return {"error": str(e)}, [], f"Missing dependency: {e}"
+    # Only reading probes are implemented below; fail loudly instead of
+    # returning empty accuracy lists and blank plots.
+    if probe_type not in ("reading", "both"):
+        message = f"Probe type '{probe_type}' is not implemented yet; use 'reading' or 'both'."
+        logger.warning(message)
+        return {"error": message}, [], f"Training failed: {message}"
+    # The slider value may arrive as a float; range() below needs an int.
+    num_layers = int(num_layers)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    logger.info(f"Training on device: {device}")
+    # Initialize results
+    results = {
+        "model": model_name,
+        "probe_type": probe_type,
+        "num_layers": num_layers,
+        "device": str(device),
+        "timestamp": datetime.now().isoformat(),
+        "attributes": {}
+    }
+    # Bound before the try block so the finally clause can always release them.
+    model = None
+    fig = None
+    try:
+        # Load model and tokenizer
+        progress(0.1, desc=f"Loading {model_name}...")
+        logger.info(f"Loading model: {model_name}")
+        # NOTE(review): trust_remote_code=True lets a model repository execute its
+        # own Python code on load; confirm it is really required for these models.
+        model = AutoModel.from_pretrained(
+            model_name,
+            output_hidden_states=True,
+            trust_remote_code=True,
+            torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
+        ).to(device)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        # Get actual number of layers
+        if hasattr(model.config, 'num_hidden_layers'):
+            total_layers = model.config.num_hidden_layers
+        elif hasattr(model.config, 'n_layer'):
+            total_layers = model.config.n_layer
+        else:
+            total_layers = 12
+        num_layers = min(num_layers, total_layers)
+        results["num_layers"] = num_layers
+        logger.info(f"Training {num_layers}/{total_layers} layers")
+        # Generate synthetic data for demonstration
+        progress(0.2, desc="Generating training data...")
+        attributes = {
+            'age': ['18-24', '25-34', '35-44', '45+'],
+            'gender': ['male', 'female'],
+            'education': ['high_school', 'college', 'graduate'],
+            'socioeconomic': ['low', 'middle', 'high']
+        }
+        # Create synthetic conversations
+        n_samples = 200 if IS_HF_SPACE else 100  # 200 samples on Spaces, 100 elsewhere
+        conversations = []
+        labels = {attr: [] for attr in attributes}
+        templates = [
+            "I think {topic} is important.",
+            "My view on {topic} is clear.",
+            "Regarding {topic}, I believe we should act.",
+            "{topic} affects us all.",
+            "I've considered {topic} carefully."
+        ]
+        topics = ["education", "technology", "healthcare", "climate", "economy"]
+        # Local generator: reproducible data without reseeding NumPy's global RNG.
+        rng = np.random.RandomState(42)
+        for _ in range(n_samples):
+            topic = rng.choice(topics)
+            template = rng.choice(templates)
+            conversations.append(template.format(topic=topic))
+            for attr, values in attributes.items():
+                labels[attr].append(rng.choice(values))
+        # Encode labels
+        label_encoders = {}
+        encoded_labels = {}
+        for attr in attributes:
+            le = LabelEncoder()
+            encoded_labels[attr] = le.fit_transform(labels[attr])
+            label_encoders[attr] = le
+        # Extract features
+        progress(0.3, desc="Extracting features from model...")
+        all_features = {layer: [] for layer in range(num_layers)}
+        batch_size = 4 if device.type == "cuda" else 2
+        # Ceiling division so a final partial batch is counted in the progress text.
+        n_batches = -(-len(conversations) // batch_size)
+        for i in range(0, len(conversations), batch_size):
+            progress(0.3 + (i / len(conversations)) * 0.3,
+                    desc=f"Processing batch {i//batch_size + 1}/{n_batches}")
+            batch = conversations[i:i+batch_size]
+            inputs = tokenizer(
+                batch,
+                padding=True,
+                truncation=True,
+                max_length=128,
+                return_tensors="pt"
+            ).to(device)
+            with torch.no_grad():
+                outputs = model(**inputs, output_hidden_states=True)
+                hidden_states = outputs.hidden_states
+                # Weight tokens by the attention mask so padding does not dilute the mean.
+                mask = inputs["attention_mask"].unsqueeze(-1).float()
+                token_counts = mask.sum(dim=1).clamp(min=1.0)
+                for layer_idx in range(num_layers):
+                    # Index 0 is skipped: probed layer k reads hidden_states[k + 1].
+                    # Pool in float32 so float16 sums cannot overflow.
+                    layer_hidden = hidden_states[layer_idx + 1].float()
+                    pooled = (layer_hidden * mask).sum(dim=1) / token_counts
+                    all_features[layer_idx].extend(pooled.cpu().numpy())
+        # Convert to arrays
+        for layer_idx in range(num_layers):
+            all_features[layer_idx] = np.array(all_features[layer_idx])
+        # Train probes
+        progress(0.6, desc="Training probes...")
+        for attr_idx, attr in enumerate(attributes):
+            progress(0.6 + (attr_idx / len(attributes)) * 0.3,
+                    desc=f"Training {attr} probes...")
+            attr_results = {
+                "layers": [],
+                "train_acc": [],
+                "test_acc": []
+            }
+            results["attributes"][attr] = attr_results
+            y = encoded_labels[attr]
+            # Sequential 80/20 split; the labels were drawn at random above.
+            n_train = int(0.8 * len(y))
+            train_idx = np.arange(n_train)
+            test_idx = np.arange(n_train, len(y))
+            for layer_idx in range(num_layers):
+                X = all_features[layer_idx]
+                probe = LogisticRegression(max_iter=200, random_state=42)
+                probe.fit(X[train_idx], y[train_idx])
+                train_acc = probe.score(X[train_idx], y[train_idx])
+                test_acc = probe.score(X[test_idx], y[test_idx])
+                attr_results["layers"].append(layer_idx)
+                attr_results["train_acc"].append(float(train_acc))
+                attr_results["test_acc"].append(float(test_acc))
+        # Create visualizations
+        progress(0.9, desc="Creating visualizations...")
+        plot_paths = []
+        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
+        axes = axes.flatten()
+        for idx, attr in enumerate(attributes):
+            ax = axes[idx]
+            data = results["attributes"][attr]
+            ax.plot(data["layers"], data["train_acc"], 'o-', label='Train', linewidth=2)
+            ax.plot(data["layers"], data["test_acc"], 's-', label='Test', linewidth=2)
+            ax.set_xlabel('Layer')
+            ax.set_ylabel('Accuracy')
+            ax.set_title(f'{attr.capitalize()} Probe Performance')
+            ax.legend()
+            ax.grid(True, alpha=0.3)
+            ax.set_ylim([0, 1])
+            # Mark best layer
+            if data["test_acc"]:
+                best_idx = np.argmax(data["test_acc"])
+                best_layer = data["layers"][best_idx]
+                best_acc = data["test_acc"][best_idx]
+                ax.axvline(x=best_layer, color='red', linestyle='--', alpha=0.5)
+                ax.text(best_layer, best_acc, f'{best_acc:.2f}',
+                       fontsize=9, ha='center', va='bottom')
+        fig.suptitle(f'{model_name} - {probe_type.capitalize()} Probes', fontsize=14)
+        fig.tight_layout()
+        plot_path = f"probe_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
+        fig.savefig(plot_path, dpi=150, bbox_inches='tight')
+        plot_paths.append(plot_path)
+        # Create summary
+        summary_lines = [
+            f"Training Complete: {model_name}",
+            f"Probe Type: {probe_type}",
+            f"Layers Trained: {num_layers}/{total_layers}",
+            f"Device: {device}",
+            "",
+            "Best Performance by Attribute:"
+        ]
+        for attr in attributes:
+            test_accs = results["attributes"][attr]["test_acc"]
+            if test_accs:
+                best_idx = np.argmax(test_accs)
+                best_layer = results["attributes"][attr]["layers"][best_idx]
+                best_acc = test_accs[best_idx]
+                summary_lines.append(f"  {attr:15s}: {best_acc:.3f} (layer {best_layer})")
+        if probe_type == "both":
+            summary_lines.append("")
+            summary_lines.append("Note: controlling probes are not implemented yet; only reading probes were trained.")
+        summary = "\n".join(summary_lines)
+        progress(1.0, desc="Training complete!")
+        return results, plot_paths, summary
+    except Exception as e:
+        # UI boundary: log the traceback and report the failure in the interface.
+        logger.error(f"Training failed: {e}", exc_info=True)
+        return {"error": str(e)}, [], f"Training failed: {e}"
+    finally:
+        # Release the figure and the model on success and on failure alike.
+        if fig is not None:
+            plt.close(fig)
+        del model
+        if device.type == "cuda":
+            torch.cuda.empty_cache()
+def create_interface():
+    """
+    Create the Gradio interface.
+    Builds three tabs: an environment check backed by ``check_environment``,
+    a training form backed by ``train_probes``, and static usage instructions.
+    Returns:
+        The assembled ``gr.Blocks`` application (not yet launched).
+    """
+    with gr.Blocks(title="TalkTuner Probe Training") as interface:
+        gr.Markdown("""
+        # 🎯 TalkTuner Probe Training Interface
+        Train demographic probes on Large Language Models to understand and control their outputs.
+        Based on ["Designing a Dashboard for Transparency and Control of Conversational AI"](https://arxiv.org/abs/2406.07882)
         """)
+        with gr.Tab("Environment Check"):
+            gr.Markdown("### System Information")
+            env_button = gr.Button("Check Environment", variant="primary")
+            env_output = gr.Dataframe(label="Environment Details")
+            env_button.click(
+                fn=check_environment,
+                inputs=[],
+                outputs=env_output
+            )
+        with gr.Tab("Train Probes"):
+            gr.Markdown("""
+            ### Configure Training
+            Select your model and training parameters below.
+            """)
+            with gr.Row():
+                model_dropdown = gr.Dropdown(
+                    choices=[
+                        "gpt2",
+                        "meta-llama/Llama-2-7b-chat-hf",
+                        "meta-llama/Llama-2-13b-chat-hf"
+                    ],
+                    value="gpt2",
+                    label="Model",
+                    info="Select the model to probe"
+                )
+                probe_type = gr.Radio(
+                    choices=["reading", "controlling", "both"],
+                    value="reading",
+                    label="Probe Type",
+                    info="Type of probes to train"
+                )
+            with gr.Row():
+                num_layers = gr.Slider(
+                    minimum=1,
+                    maximum=40,
+                    value=5,
+                    step=1,
+                    label="Number of Layers",
+                    info="How many layers to train (will be capped by model's actual layers)"
+                )
+            train_button = gr.Button("Start Training", variant="primary", size="lg")
+            with gr.Row():
+                results_json = gr.JSON(label="Training Results", visible=False)
+                summary_text = gr.Textbox(label="Summary", lines=15)
+            # train_probes returns a list of plot paths (empty on failure), so the
+            # output has to be a gallery; a single-image component cannot take a list.
+            plot_output = gr.Gallery(label="Performance Visualization")
+            # Training action: output order matches train_probes' return tuple
+            train_button.click(
+                fn=train_probes,
+                inputs=[model_dropdown, probe_type, num_layers],
+                outputs=[results_json, plot_output, summary_text]
+            )
+        with gr.Tab("Instructions"):
+            gr.Markdown("""
+            ## How to Use This Interface
+            1. **Check Environment**: Verify your hardware capabilities in the Environment Check tab
+            2. **Select Model**: Choose from GPT-2 (fastest) or Llama models (more accurate)
+            3. **Configure Training**: Set probe type and number of layers
+            4. **Start Training**: Click the button and wait for results
+            5. **View Results**: Check the visualization and summary
+            ## Hardware Recommendations
+            - **GPT-2**: CPU Basic or T4 Small
+            - **Llama-2-7b**: T4 Small or A10G
+            - **Llama-2-13b**: A10G or A100
+            ## Training Time Estimates
+            - GPT-2 (5 layers): ~2-5 minutes
+            - Llama-2-7b (5 layers): ~10-15 minutes
+            - Llama-2-13b (5 layers): ~20-30 minutes
+            ## Note
+            This interface uses synthetic data for demonstration. For production use,
+            upload real conversation datasets to the Space's data folder.
+            """)
+    return interface
+# Entry point: build the UI, then serve it
 if __name__ == "__main__":
+    # On Spaces: bind to all interfaces, no share link.
+    # Elsewhere: bind to loopback and request a Gradio share link.
+    on_spaces = IS_HF_SPACE
+    app = create_interface()
+    app.launch(
+        server_name="0.0.0.0" if on_spaces else "127.0.0.1",
+        share=not on_spaces,
+    )