jmjoseph committed on
Commit
5413412
·
verified ·
1 Parent(s): b7d275c

Deploy TalkTuner probe training interface

README.md CHANGED
@@ -1,12 +1,35 @@
  ---
- title: Talktuner Probe Training
- emoji: 🏆
- colorFrom: purple
- colorTo: gray
+ title: TalkTuner Probe Training
+ emoji: 🎯
+ colorFrom: blue
+ colorTo: purple
  sdk: gradio
- sdk_version: 5.42.0
+ sdk_version: 5.0.0
  app_file: app.py
  pinned: false
+ python_version: 3.10
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # TalkTuner Probe Training Interface
+
+ This Space provides an interactive interface for training demographic probes on Large Language Models.
+
+ ## Features
+ - Train reading and controlling probes
+ - Support for multiple demographic attributes (age, gender, socioeconomic status, education)
+ - Real-time training progress visualization
+ - Download trained models and results
+
+ ## Hardware Requirements
+ - **CPU Basic**: Testing and demonstration (free)
+ - **T4 Small**: Full training with GPU (~$0.60/hour)
+ - **A10G**: Faster training (~$1.05/hour)
+
+ ## Setup
+ 1. Upload your dataset files to `data/dataset/`
+ 2. Configure your HuggingFace token in the Space settings (see the sketch below)
+ 3. Select an appropriate hardware tier
+ 4. Launch the interface
+
+ ## Based on
+ ["Designing a Dashboard for Transparency and Control of Conversational AI"](https://arxiv.org/abs/2406.07882)
app.py ADDED
@@ -0,0 +1,325 @@
+ #!/usr/bin/env python3
+ """
+ HuggingFace Spaces app for TalkTuner probe training.
+ Provides a complete interface for training and visualizing probe performance.
+ """
+
+ import gradio as gr
+ import torch
+ import os
+ import json
+ import sys
+ from pathlib import Path
+ from datetime import datetime
+ from io import BytesIO
+
+ import matplotlib.pyplot as plt
+ import pandas as pd
+ from PIL import Image
+
+ # Import the minimal trainer
+ from train_probes_minimal import MinimalProbeTrainer, run_full_training
+
+ # Check if we're running on HF Spaces
+ IS_HF_SPACE = os.getenv("SPACE_ID") is not None
+
+
+ def check_environment():
+     """Check the environment and available resources."""
+     info = {
+         "Python Version": sys.version.split()[0],
+         "PyTorch Version": torch.__version__,
+         "CUDA Available": torch.cuda.is_available(),
+         "Device": "cuda" if torch.cuda.is_available() else "cpu",
+         "HF Space": IS_HF_SPACE,
+     }
+
+     if torch.cuda.is_available():
+         info["GPU Name"] = torch.cuda.get_device_name(0)
+         info["GPU Memory"] = f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
+     else:
+         info["CPU Count"] = os.cpu_count()
+
+     return pd.DataFrame(list(info.items()), columns=["Property", "Value"])
+
+
+ def train_single_attribute(attribute, num_layers, progress=gr.Progress()):
+     """Train probes for a single attribute."""
+     progress(0, desc=f"Initializing trainer for {attribute}...")
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     trainer = MinimalProbeTrainer(device=device)
+
+     progress(0.2, desc=f"Training {attribute} probes...")
+     results = trainer.train_probes(attribute=attribute, num_layers_to_train=num_layers)
+
+     progress(1.0, desc="Training complete!")
+
+     # Return the most recent visualization the trainer wrote to disk
+     # (sorted so that [-1] is the latest timestamped file).
+     viz_files = sorted(Path(".").glob(f"probe_results_{attribute}_*.png"))
+     if viz_files:
+         return results, str(viz_files[-1])
+
+     return results, None
+
+
+ def train_all_attributes(num_layers, progress=gr.Progress()):
+     """Train probes for all attributes."""
+     progress(0, desc="Starting comprehensive training...")
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     trainer = MinimalProbeTrainer(device=device)
+
+     all_results = {}
+     all_images = []
+
+     attributes = ["age", "gender", "socioeco", "education"]
+
+     for i, attribute in enumerate(attributes):
+         progress((i / len(attributes)) * 0.8,
+                  desc=f"Training {attribute} probes...")
+
+         results = trainer.train_probes(
+             attribute=attribute,
+             num_layers_to_train=num_layers
+         )
+         all_results[attribute] = results
+
+         # Collect the most recent visualization for this attribute
+         viz_files = sorted(Path(".").glob(f"probe_results_{attribute}_*.png"))
+         if viz_files:
+             all_images.append(str(viz_files[-1]))
+
+     progress(0.9, desc="Generating summary...")
+
+     # Create the summary dataframe
+     summary_data = []
+     for attr, res in all_results.items():
+         summary_data.append({
+             "Attribute": attr.capitalize(),
+             "Best Layer": res["best_layer"],
+             "Best Accuracy": f"{res['best_accuracy']:.1f}%",
+             "Improvement": f"+{res['best_accuracy'] - 100 / res['num_classes']:.1f}%",
+             "Num Classes": res["num_classes"],
+         })
+
+     summary_df = pd.DataFrame(summary_data)
+
+     # Save results, converting tensors/arrays to lists and dropping the
+     # confusion matrices, which are not JSON-serializable as stored.
+     output_file = f"full_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+     with open(output_file, "w") as f:
+         json.dump({attr: {
+             k: v if not hasattr(v, "tolist") else v.tolist()
+             for k, v in res.items() if k != "best_confusion_matrix"
+         } for attr, res in all_results.items()}, f, indent=2)
+
+     progress(1.0, desc="Training complete!")
+
+     return summary_df, all_images, output_file
+
+
+ def create_performance_plot(results_json):
+     """Create a performance comparison plot from a saved results file."""
+     with open(results_json, "r") as f:
+         data = json.load(f)
+
+     fig, axes = plt.subplots(2, 2, figsize=(12, 10))
+     axes = axes.ravel()
+
+     for idx, (attr, res) in enumerate(data.items()):
+         ax = axes[idx]
+         layers = res["layers"]
+         train_acc = res["train_accuracies"]
+         test_acc = res["test_accuracies"]
+
+         ax.plot(layers, train_acc, "b-", label="Train", marker="o")
+         ax.plot(layers, test_acc, "r-", label="Test", marker="s")
+         ax.axhline(y=100 / res["num_classes"], color="gray",
+                    linestyle="--", label="Random")
+
+         ax.set_xlabel("Layer")
+         ax.set_ylabel("Accuracy (%)")
+         ax.set_title(f"{attr.capitalize()} - Best: Layer {res['best_layer']} ({res['best_accuracy']:.1f}%)")
+         ax.legend()
+         ax.grid(True, alpha=0.3)
+
+     plt.suptitle("Probe Performance Across All Attributes", fontsize=16)
+     plt.tight_layout()
+
+     # Render to an in-memory PNG and hand back a PIL image
+     # (gr.Image accepts a filepath, numpy array, or PIL image, not a raw buffer).
+     buf = BytesIO()
+     plt.savefig(buf, format="png", dpi=150, bbox_inches="tight")
+     buf.seek(0)
+     plt.close(fig)
+
+     return Image.open(buf)
+
+
+ # Create the Gradio interface
+ with gr.Blocks(title="TalkTuner Probe Training", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # 🎯 TalkTuner Probe Training System
+
+     This interface demonstrates probe training for detecting demographic attributes in language models.
+     The system trains linear probes on different layers to identify age, gender, socioeconomic status, and education level.
+
+     **Note:** This demo uses GPT-2 with synthetic data for demonstration. Production training would use Llama-2-13b with real datasets.
+     """)
+
+     with gr.Tab("🏠 Environment"):
+         gr.Markdown("## System Information")
+         env_df = gr.Dataframe(label="Environment Details", interactive=False)
+         check_btn = gr.Button("Check Environment", variant="primary")
+         check_btn.click(check_environment, outputs=env_df)
+
+     with gr.Tab("🚀 Quick Training"):
+         gr.Markdown("""
+         ## Train Individual Attributes
+         Select an attribute and the number of layers on which to train probes.
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 attribute = gr.Dropdown(
+                     choices=["age", "gender", "socioeco", "education"],
+                     value="age",
+                     label="Attribute to Train"
+                 )
+                 num_layers = gr.Slider(
+                     minimum=2,
+                     maximum=12,
+                     value=5,
+                     step=1,
+                     label="Number of Layers"
+                 )
+                 train_btn = gr.Button("Train Probes", variant="primary")
+
+             with gr.Column(scale=2):
+                 result_json = gr.JSON(label="Training Results")
+                 result_image = gr.Image(label="Performance Visualization")
+
+         train_btn.click(
+             train_single_attribute,
+             inputs=[attribute, num_layers],
+             outputs=[result_json, result_image]
+         )
+
+     with gr.Tab("📊 Full Training"):
+         gr.Markdown("""
+         ## Comprehensive Training
+         Train probes for all attributes and compare performance.
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 full_num_layers = gr.Slider(
+                     minimum=2,
+                     maximum=12,
+                     value=8,
+                     step=1,
+                     label="Number of Layers for All Attributes"
+                 )
+                 full_train_btn = gr.Button("Train All Attributes", variant="primary")
+
+             summary_df = gr.Dataframe(label="Training Summary", interactive=False)
+
+         with gr.Row():
+             image_gallery = gr.Gallery(
+                 label="Performance Visualizations",
+                 show_label=True,
+                 elem_id="gallery",
+                 columns=2,
+                 rows=2,
+                 height="auto"
+             )
+
+         results_file = gr.File(label="Download Results (JSON)")
+
+         full_train_btn.click(
+             train_all_attributes,
+             inputs=[full_num_layers],
+             outputs=[summary_df, image_gallery, results_file]
+         )
+
+     with gr.Tab("📈 Results Analysis"):
+         gr.Markdown("""
+         ## Performance Analysis
+
+         ### Key Findings from Training:
+
+         1. **Layer Performance**: Middle layers (3-7) typically show the best performance for attribute detection
+         2. **Attribute Difficulty**:
+            - Gender (2 classes): easiest to detect (the random baseline is already 50%)
+            - Age (4 classes): most challenging (a 25% random baseline leaves the largest gap to close)
+         3. **Convergence**: Most probes converge within 10-20 epochs
+
+         ### Interpretation:
+         - **High accuracy** indicates the model has internal representations of these attributes
+         - **Layer differences** suggest different attributes are encoded at different depths
+         - **Improvement over random** shows the probes are decoding genuine structure rather than chance
+         """)
+
+         gr.Markdown("""
+         ### Upload Results for Analysis
+         Upload a JSON results file to visualize performance across layers.
+         """)
+
+         with gr.Row():
+             upload_file = gr.File(label="Upload Results JSON", file_types=[".json"])
+             analyze_btn = gr.Button("Analyze Results")
+
+         analysis_plot = gr.Image(label="Performance Analysis")
+
+         def analyze_uploaded(file):
+             if file:
+                 return create_performance_plot(file.name)
+             return None
+
+         analyze_btn.click(analyze_uploaded, inputs=[upload_file], outputs=[analysis_plot])
+
+     with gr.Tab("📚 Documentation"):
+         gr.Markdown("""
+         ## How Probe Training Works
+
+         ### 1. **Data Preparation**
+         - Extract activations from each layer of the model
+         - Label data with demographic attributes
+         - Split into training and test sets
+
+         ### 2. **Probe Architecture**
+         - Simple linear classifier on top of frozen model activations
+         - One probe per layer per attribute
+         - Trained with cross-entropy loss
+
+         ### 3. **Evaluation**
+         - Test accuracy shows how well attributes can be decoded
+         - Compare across layers to find the optimal depth
+         - Improvement over the random baseline indicates genuine signal
+
+         ### 4. **Interpretation**
+         - High probe accuracy = the model internally represents this attribute
+         - Best-performing layer = where the attribute is most strongly encoded
+         - Can be used for bias detection and model understanding
+
+         ## Resource Requirements
+
+         | Training Type | Time | Memory | GPU |
+         |--------------|------|--------|-----|
+         | Demo (GPT-2, synthetic) | 1-2 min | 2 GB | Optional |
+         | Full (Llama-2-13b, real) | 2-3 hours | 32 GB | Required |
+
+         ## Next Steps
+
+         1. **Deploy to Production**: Use real datasets with Llama-2-13b
+         2. **Bias Mitigation**: Use probe outputs to detect and reduce bias
+         3. **User Control**: Allow users to see/modify detected attributes
+         """)
+
+ # Launch the app
+ if __name__ == "__main__":
+     if IS_HF_SPACE:
+         demo.launch()
+     else:
+         demo.launch(share=False, debug=True, server_name="0.0.0.0", server_port=7860)
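`train_probes_minimal` is imported by `app.py` but is not part of this commit. A minimal sketch of the interface the app relies on, with every name and result field inferred from the call sites above (an assumption, not the shipped module):

```python
# Hypothetical stand-in for train_probes_minimal, inferred from app.py's usage.
class MinimalProbeTrainer:
    def __init__(self, device: str = "cpu"):
        self.device = device

    def train_probes(self, attribute: str, num_layers_to_train: int) -> dict:
        # The real trainer must also write a probe_results_{attribute}_*.png
        # figure to the working directory, since app.py globs for that pattern.
        return {
            "best_layer": 0,
            "best_accuracy": 0.0,
            "num_classes": 2,
            "layers": [],
            "train_accuracies": [],
            "test_accuracies": [],
            "best_confusion_matrix": None,  # stripped before JSON export
        }


def run_full_training():
    raise NotImplementedError
```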
pyproject.toml ADDED
@@ -0,0 +1,164 @@
+ [tool.poetry]
+ name = "talktuner-dashboard"
+ version = "0.1.0"
+ description = "TalkTuner: A Dashboard for Transparency and Control of Conversational AI"
+ authors = ["Your Name <your.email@example.com>"]
+ readme = "README.md"
+ license = "MIT"
+ homepage = "https://github.com/Josh-Joseph/reproduce_talktuner_dashboard"
+ repository = "https://github.com/Josh-Joseph/reproduce_talktuner_dashboard"
+ keywords = ["chatbot", "llm", "dashboard", "transparency", "conversational-ai", "probes"]
+ classifiers = [
+     "Development Status :: 3 - Alpha",
+     "Intended Audience :: Science/Research",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python :: 3.9",
+     "Programming Language :: Python :: 3.10",
+ ]
+ packages = [{include = "src"}]
+
+ [tool.poetry.dependencies]
+ python = ">=3.9,<3.11"
+
+ # Core ML/AI dependencies (essential for training)
+ torch = ">=2.0.0"
+ transformers = ">=4.30.0"
+ accelerate = ">=0.20.0"
+ tokenizers = ">=0.13.0"
+ safetensors = ">=0.3.0"
+
+ # Scientific computing (essential)
+ numpy = ">=1.23.0"
+ scipy = ">=1.9.0"
+ scikit-learn = ">=1.1.0"
+ pandas = ">=1.4.0"
+ matplotlib = ">=3.5.0"
+
+ # Utilities (essential)
+ tqdm = ">=4.65.0"
+ pyyaml = ">=6.0"
+ requests = ">=2.28.0"
+
+ # Web interface (for HuggingFace Spaces)
+ gradio = ">=5.0.0"
+ seaborn = ">=0.12.0"
+
+ # Optional dependencies - install manually if needed
+ # torchvision = ">=0.15.0"
+ # datasets = ">=2.0.0"
+ # sentencepiece = ">=0.1.99"
+ # einops = ">=0.7.0"
+ # plotly = ">=5.14.0"
+ # jupyter = ">=1.0.0"
+ # jupyterlab = ">=4.0.0"
+ # ipykernel = ">=6.20.0"
+ # flask = ">=2.2.0"
+ # flask-cors = ">=4.0.0"
+ # opencv-python = ">=4.6.0"
+ # pillow = ">=9.2.0"
+ # huggingface-hub = ">=0.16.0"
+
+ [tool.poetry.group.dev.dependencies]
+ pytest = "^7.4.0"
+ pytest-cov = "^4.1.0"
+ pytest-xdist = "^3.3.0"
+ pytest-mock = "^3.11.0"
+ black = "^23.7.0"
+ isort = "^5.12.0"
+ flake8 = "^6.1.0"
+ mypy = "^1.5.0"
+ pre-commit = "^3.3.0"
+ ipdb = "^0.13.0"
+
+ [tool.poetry.group.docs.dependencies]
+ sphinx = "^7.1.0"
+ sphinx-rtd-theme = "^1.3.0"
+ sphinx-autodoc-typehints = "^1.24.0"
+ myst-parser = "^2.0.0"
+
+ [tool.poetry.scripts]
+ train-probes = "train_probes:main"
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.black]
+ line-length = 100
+ target-version = ['py39']
+ include = '\.pyi?$'
+ extend-exclude = '''
+ /(
+   # directories
+   \.eggs
+   | \.git
+   | \.hg
+   | \.mypy_cache
+   | \.tox
+   | \.venv
+   | build
+   | dist
+   | data
+ )/
+ '''
+
+ [tool.isort]
+ profile = "black"
+ line_length = 100
+ multi_line_output = 3
+ include_trailing_comma = true
+ force_grid_wrap = 0
+ use_parentheses = true
+ ensure_newline_before_comments = true
+
+ [tool.mypy]
+ python_version = "3.9"
+ warn_return_any = true
+ warn_unused_configs = true
+ disallow_untyped_defs = false
+ disallow_any_unimported = false
+ no_implicit_optional = true
+ warn_redundant_casts = true
+ warn_unused_ignores = true
+ warn_no_return = true
+ check_untyped_defs = true
+ ignore_missing_imports = true
+
+ [tool.pytest.ini_options]
+ minversion = "6.0"
+ addopts = "-ra -q --strict-markers"
+ testpaths = [
+     "tests",
+ ]
+ python_files = "test_*.py"
+ python_classes = "Test*"
+ python_functions = "test_*"
+ markers = [
+     "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+     "integration: marks tests as integration tests",
+     "unit: marks tests as unit tests",
+ ]
+
+ [tool.coverage.run]
+ source = ["src"]
+ omit = [
+     "*/tests/*",
+     "*/test_*.py",
+     "*/__init__.py",
+ ]
+
+ [tool.coverage.report]
+ exclude_lines = [
+     "pragma: no cover",
+     "def __repr__",
+     "if self.debug",
+     "raise AssertionError",
+     "raise NotImplementedError",
+     "if 0:",
+     "if __name__ == .__main__.:",
+     "if TYPE_CHECKING:",
+     "class .*\\bProtocol\\):",
+     "@(abc\\.)?abstractmethod",
+ ]
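`[tool.poetry.scripts]` wires a `train-probes` command to `train_probes:main`, but no `train_probes` module appears in this commit. A hypothetical stub of the contract that entry assumes (the call to `run_full_training` mirrors the import in `app.py`):

```python
# train_probes.py -- hypothetical stub matching the poetry script entry above.
def main() -> None:
    """Entry point invoked by `poetry run train-probes`."""
    from train_probes_minimal import run_full_training
    run_full_training()


if __name__ == "__main__":
    main()
```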
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (157 Bytes)
src/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (8.21 kB)
src/__pycache__/losses.cpython-310.pyc ADDED
Binary file (3.72 kB)
src/__pycache__/probes.cpython-310.pyc ADDED
Binary file (12.7 kB)
src/__pycache__/train_test_utils.cpython-310.pyc ADDED
Binary file (4.43 kB)
src/dataset.py ADDED
@@ -0,0 +1,280 @@
+ import os
+ from collections import OrderedDict
+
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset
+ from tqdm.auto import tqdm
+
+
+ class ModuleHook:
+     """Forward hook that records a module's outputs for later inspection."""
+
+     def __init__(self, module):
+         self.hook = module.register_forward_hook(self.hook_fn)
+         self.module = None
+         self.features = []
+
+     def hook_fn(self, module, input, output):
+         self.module = module
+         self.features.append(output.detach())
+
+     def close(self):
+         self.hook.remove()
+
+
+ def remove_last_k_words(s, k):
+     """
+     Remove the last k words from the string s.
+     Words that appear before the last occurrence of "[/INST]" are never removed.
+     """
+     # Split the string into words
+     words = s.split()
+
+     # Find the last occurrence of "[/INST]"
+     if "[/INST]" in words:
+         last_inst_index = max(i for i, word in enumerate(words) if word == "[/INST]")
+     else:
+         last_inst_index = -1
+
+     # If fewer than k words follow the last [/INST], keep everything up to and
+     # including [/INST]; otherwise drop exactly the last k words.
+     if len(words) - last_inst_index - 1 > k:
+         return ' '.join(words[:-k])
+     else:
+         return ' '.join(words[:last_inst_index + 1])
+
+
+ def split_conversation(text, user_identifier="HUMAN:", ai_identifier="ASSISTANT:"):
+     """Split a transcript into parallel lists of user and assistant messages."""
+     user_messages = []
+     assistant_messages = []
+
+     lines = text.split("\n")
+
+     current_user_message = ""
+     current_assistant_message = ""
+
+     for line in lines:
+         line = line.lstrip(" ")
+         if line.startswith(user_identifier):
+             if current_assistant_message:
+                 assistant_messages.append(current_assistant_message.strip())
+                 current_assistant_message = ""
+             current_user_message += line.replace(user_identifier, "").strip() + " "
+         elif line.startswith(ai_identifier):
+             if current_user_message:
+                 user_messages.append(current_user_message.strip())
+                 current_user_message = ""
+             current_assistant_message += line.replace(ai_identifier, "").strip() + " "
+
+     if current_user_message:
+         user_messages.append(current_user_message.strip())
+     if current_assistant_message:
+         assistant_messages.append(current_assistant_message.strip())
+
+     return user_messages, assistant_messages
+
+
+ def llama_v2_prompt(messages: list[dict], system_prompt=None):
+     """Render a list of chat messages in the Llama-2 chat prompt format."""
+     B_INST, E_INST = "[INST]", "[/INST]"
+     B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+     BOS, EOS = "<s>", "</s>"
+     if system_prompt:
+         DEFAULT_SYSTEM_PROMPT = system_prompt
+     else:
+         DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
+
+     # Fold the system prompt into the first user message, as Llama-2 expects.
+     if messages[0]["role"] != "system":
+         messages = [
+             {
+                 "role": "system",
+                 "content": DEFAULT_SYSTEM_PROMPT,
+             }
+         ] + messages
+     messages = [
+         {
+             "role": messages[1]["role"],
+             "content": B_SYS + messages[0]["content"] + E_SYS + messages[1]["content"],
+         }
+     ] + messages[2:]
+
+     messages_list = [
+         f"{BOS}{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} {EOS}"
+         for prompt, answer in zip(messages[::2], messages[1::2])
+     ]
+     if messages[-1]["role"] == "user":
+         messages_list.append(f"{BOS}{B_INST} {(messages[-1]['content']).strip()} {E_INST}")
+
+     return "".join(messages_list)
+
+
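+ # Illustrative example (added for clarity, not used by the pipeline):
+ #   llama_v2_prompt(
+ #       [{"role": "user", "content": "Hi"},
+ #        {"role": "assistant", "content": "Hello!"},
+ #        {"role": "user", "content": "How are you?"}],
+ #       system_prompt="Be brief.")
+ # returns the single string
+ #   "<s>[INST] <<SYS>>\nBe brief.\n<</SYS>>\n\nHi [/INST] Hello! </s>"
+ #   "<s>[INST] How are you? [/INST]"
+
+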
+ prompt_translator = {"_age_": "age",
+                      "_gender_": "gender",
+                      "_socioeco_": "socioeconomic status",
+                      "_education_": "education level"}
+
+
+ class TextDataset(Dataset):
+     """Reads conversation .txt files, runs them through the model once, and
+     caches the per-layer activations used to train the probes."""
+
+     def __init__(self, directory, tokenizer, model, label_idf="_age_", label_to_id=None,
+                  convert_to_llama2_format=False, user_identifier="HUMAN:", ai_identifier="ASSISTANT:",
+                  control_probe=False, additional_datas=None, residual_stream=False, new_format=False,
+                  if_augmented=False, k=20, remove_last_ai_response=False, include_inst=False,
+                  one_hot=False, last_tok_pos=-1):
+         self.file_paths = [os.path.join(directory, f) for f in os.listdir(directory)
+                            if os.path.isfile(os.path.join(directory, f)) and f.endswith('.txt')]
+         self.tokenizer = tokenizer
+         self.labels = []
+         self.acts = []
+         self.texts = []
+         self.label_idf = label_idf
+         self.label_to_id = label_to_id
+         self.model = model
+         self.convert_to_llama2_format = convert_to_llama2_format
+         self.user_identifier = user_identifier
+         self.ai_identifier = ai_identifier
+         self.additional_datas = additional_datas
+         self.residual_stream = residual_stream
+         self.new_format = new_format
+         self.if_augmented = if_augmented
+         self.k = k
+         self.if_remove_last_ai_response = remove_last_ai_response
+         self.include_inst = include_inst
+         self.one_hot = one_hot
+         self.last_tok_pos = last_tok_pos
+         self.control_probe = control_probe
+         if self.additional_datas:
+             for extra_dir in self.additional_datas:
+                 self.file_paths += [os.path.join(extra_dir, f) for f in os.listdir(extra_dir)
+                                     if os.path.isfile(os.path.join(extra_dir, f)) and f.endswith('.txt')]
+         self._load_in_data()
+
+     def __len__(self):
+         return len(self.texts)
+
+     def _load_in_data(self):
+         # Every file we skip (corrupted or unlabeled) is collected here and
+         # removed from file_paths afterwards, so file_paths stays aligned with
+         # texts/labels/acts. (Resetting this list inside the loop would forget
+         # all but the last file.)
+         skipped_file_paths = []
+
+         for idx in tqdm(range(len(self.file_paths))):
+             file_path = self.file_paths[idx]
+
+             # Conversation index parsed from the filename (currently unused).
+             int_idx = file_path[file_path.find("conversation_") + len("conversation_"):]
+             int_idx = int(int_idx[:int_idx.find("_")])
+
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 text = f.read()
+
+             if self.convert_to_llama2_format:
+                 if "### Human:" in text:
+                     user_msgs, ai_msgs = split_conversation(text, "### Human:", "### Assistant:")
+                 elif "### User:" in text:
+                     user_msgs, ai_msgs = split_conversation(text, "### User:", "### Assistant:")
+                 else:
+                     user_msgs, ai_msgs = split_conversation(text, self.user_identifier, self.ai_identifier)
+                 messages_dict = []
+
+                 for user_msg, ai_msg in zip(user_msgs, ai_msgs):
+                     messages_dict.append({'content': user_msg, 'role': 'user'})
+                     messages_dict.append({'content': ai_msg, 'role': 'assistant'})
+
+                 if len(messages_dict) < 1:
+                     skipped_file_paths.append(file_path)
+                     print(f"Corrupted file at {file_path}")
+                     continue
+
+                 if self.if_remove_last_ai_response and messages_dict[-1]["role"] == "assistant":
+                     messages_dict = messages_dict[:-1]
+                 try:
+                     text = llama_v2_prompt(messages_dict)
+                 except Exception:
+                     skipped_file_paths.append(file_path)
+                     print(f"Corrupted file at {file_path}")
+                     continue
+
+                 # Strip the leading BOS token and, unless instructed to keep it,
+                 # everything from the final [/INST] onwards.
+                 if self.new_format and self.include_inst:
+                     text = text[text.find("<s>") + len("<s>"):]
+                 elif self.new_format:
+                     text = text[text.find("<s>") + len("<s>"): text.rfind("[/INST]") - 1]
+
+             # The label is encoded in the filename between label_idf and ".txt";
+             # label_to_id maps it to an integer class id.
+             label = file_path[file_path.rfind(self.label_idf) + len(self.label_idf):file_path.rfind(".txt")]
+
+             if label not in self.label_to_id:
+                 skipped_file_paths.append(file_path)
+                 continue
+             label = self.label_to_id[label]
+
+             if self.one_hot:
+                 label = F.one_hot(torch.Tensor([label]).to(torch.long), len(self.label_to_id))
+
+             # Reading probes append a question-style suffix; controlling probes
+             # use the conversation as-is.
+             if not self.control_probe:
+                 text += f" I think the {prompt_translator[self.label_idf]} of this user is"
+             with torch.no_grad():
+                 encoding = self.tokenizer(
+                     text,
+                     truncation=True,
+                     max_length=2048,
+                     return_attention_mask=True,
+                     return_tensors='pt'
+                 )
+
+                 # Hook the embedding layer and every MLP block so their outputs
+                 # are captured during the forward pass.
+                 features = OrderedDict()
+                 for name, module in self.model.named_modules():
+                     if name.endswith(".mlp") or name.endswith(".embed_tokens"):
+                         features[name] = ModuleHook(module)
+
+                 # Get the device from the model
+                 device = next(self.model.parameters()).device
+                 output = self.model(input_ids=encoding['input_ids'].to(device),
+                                     attention_mask=encoding['attention_mask'].to(device),
+                                     output_hidden_states=True,
+                                     return_dict=True)
+                 for feature in features.values():
+                     feature.close()
+
+                 # 41 = embedding output + 40 transformer layers (Llama-2-13b).
+                 last_acts = []
+                 if self.if_augmented:
+                     # Keep the last k token positions for augmentation.
+                     if self.residual_stream:
+                         for layer_num in range(41):
+                             last_acts.append(output["hidden_states"][layer_num][:, -self.k:].detach().cpu().clone().to(torch.float))
+                         last_acts = torch.cat(last_acts, dim=0)
+                     else:
+                         last_acts.append(features['model.embed_tokens'].features[0][:, -self.k:].detach().cpu().clone().to(torch.float))
+                         for layer_num in range(1, 41):
+                             last_acts.append(features[f'model.layers.{layer_num - 1}.mlp'].features[0][:, -self.k:].detach().cpu().clone().to(torch.float))
+                         last_acts = torch.cat(last_acts, dim=0)
+                 else:
+                     # Keep only the final token position.
+                     if self.residual_stream:
+                         for layer_num in range(41):
+                             last_acts.append(output["hidden_states"][layer_num][:, -1].detach().cpu().clone().to(torch.float))
+                         last_acts = torch.cat(last_acts)
+                     else:
+                         last_acts.append(features['model.embed_tokens'].features[0][:, -1].detach().cpu().clone().to(torch.float))
+                         for layer_num in range(1, 41):
+                             last_acts.append(features[f'model.layers.{layer_num - 1}.mlp'].features[0][:, -1].detach().cpu().clone().to(torch.float))
+                         last_acts = torch.cat(last_acts)
+
+             self.texts.append(text)
+             self.labels.append(label)
+             self.acts.append(last_acts)
+
+         for path in skipped_file_paths:
+             self.file_paths.remove(path)
+
+     def __getitem__(self, idx):
+         label = self.labels[idx]
+         text = self.texts[idx]
+
+         if self.if_augmented:
+             # Sample one of the k cached positions. Note that -0 wraps to
+             # index 0, the oldest stored position, so all k positions are
+             # reachable with uniform probability.
+             random_k = torch.randint(0, self.k, [1])[0].item()
+             hidden_states = self.acts[idx][:, -random_k]
+         else:
+             hidden_states = self.acts[idx]
+
+         return {
+             'hidden_states': hidden_states,
+             'file_path': self.file_paths[idx],
+             # Historical key name: the label is stored under 'age' for every
+             # attribute, not just age.
+             'age': label,
+             'text': text,
+         }
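A minimal sketch of constructing this dataset, assuming a Llama-2-13b-chat checkpoint, a directory of `conversation_<n>_age_<label>.txt` files, and an illustrative label map (none of these names are fixed by the code itself):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from src.dataset import TextDataset

model_name = "meta-llama/Llama-2-13b-chat-hf"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

age_labels = {"child": 0, "adolescent": 1, "adult": 2, "older adult": 3}  # assumed map
dataset = TextDataset("data/dataset/age", tokenizer, model,
                      label_idf="_age_", label_to_id=age_labels,
                      convert_to_llama2_format=True, residual_stream=True)

sample = dataset[0]
print(sample["hidden_states"].shape)  # torch.Size([41, 5120]): embeddings + 40 layers
```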
src/intervention_utils.py ADDED
@@ -0,0 +1,164 @@
+ import os
+
+ import torch
+
+ from src.probes import (ProbeClassification, ProbeClassificationMixScaler,
+                         LinearProbeClassification, LinearProbeClassificationMixScaler)
+
+ # Probes are loaded and edited on GPU by default.
+ device = "cuda"
+ torch_device = "cuda"
+
+
+ def load_probe_classifier(model_func, input_dim, num_classes, weight_path, **kwargs):
+     """
+     Instantiate a probe classifier and load its pretrained weights.
+
+     Args:
+     - model_func (callable): Probe class to instantiate (e.g. LinearProbeClassification).
+     - input_dim (int): Input dimension for the classifier.
+     - num_classes (int): Number of classes for classification.
+     - weight_path (str): Path to the pretrained weights.
+
+     Returns:
+     - model: The probe model with loaded weights.
+     """
+     # Instantiate the model
+     model = model_func(device, num_classes, input_dim, **kwargs)
+
+     # Load the pretrained weights into the model
+     model.load_state_dict(torch.load(weight_path))
+
+     return model
+
+
+ num_classes = {"age": 4,
+                "gender": 2,
+                "education": 3,
+                "socioeco": 3}
+
+
+ def return_classifier_dict(directory, model_func, chosen_layer=None, mix_scaler=False, sklearn=False, **kwargs):
+     """Load every probe checkpoint in `directory` into a {category: {layer: probe}} dict.
+     Checkpoint filenames are expected to look like "<category>_..._<layer>.pth"."""
+     checkpoint_paths = os.listdir(directory)
+     classifier_dict = {}
+     for ckpt in checkpoint_paths:
+         category = ckpt[:ckpt.find("_")]
+         weight_path = os.path.join(directory, ckpt)
+         num_class = num_classes[category]
+         if category == "gender" and sklearn:
+             num_class = 1
+         if category not in classifier_dict:
+             classifier_dict[category] = {}
+         if mix_scaler:
+             classifier_dict[category]["all"] = load_probe_classifier(
+                 model_func, 5120, num_classes=num_class, weight_path=weight_path, **kwargs)
+         else:
+             layer_num = int(ckpt[ckpt.rfind("_") + 1: ckpt.rfind(".pth")])
+
+             if chosen_layer is None or layer_num == chosen_layer:
+                 try:
+                     classifier_dict[category][layer_num] = load_probe_classifier(
+                         model_func, 5120, num_classes=num_class, weight_path=weight_path, **kwargs)
+                 except Exception:
+                     print(f"Failed to load {category} probe from {weight_path}")
+
+     return classifier_dict
+
+
+ def split_into_messages(text: str) -> list[str]:
+     # Constants used for splitting
+     B_INST, E_INST = "[INST]", "[/INST]"
+
+     # Use the tokens to split the text
+     parts = []
+     current_message = ""
+
+     for word in text.split():
+         # If we encounter a start or end token and there's a current message, store it
+         if word in [B_INST, E_INST] and current_message:
+             parts.append(current_message.strip())
+             current_message = ""
+         # If the word is not a token, add it to the current message
+         elif word not in [B_INST, E_INST]:
+             current_message += word + " "
+
+     # Append any remaining message
+     if current_message:
+         parts.append(current_message.strip())
+
+     return parts
+
+
+ def llama_v2_reverse(prompt: str) -> list[dict]:
+     """Recover a message list from a Llama-2 style prompt string."""
+     B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+     messages = []
+     sys_start = prompt.find(B_SYS)
+     sys_end = prompt.rfind(E_SYS)
+     if sys_start != -1 and sys_end != -1:
+         system_msg = prompt[sys_start + len(B_SYS): sys_end]
+         messages.append({"role": "system", "content": system_msg})
+         prompt = prompt[sys_end + len(E_SYS):]
+
+     user_ai_msgs = split_into_messages(prompt)
+
+     # Messages alternate user/assistant after the system prompt.
+     user_turn = True
+     for message in user_ai_msgs:
+         if user_turn:
+             messages.append({"role": "user", "content": message})
+         else:
+             messages.append({"role": "assistant", "content": message})
+         user_turn = not user_turn
+
+     return messages
+
+
+ def optimize_one_inter_rep(inter_rep, layer_name, target, probe,
+                            lr=1e-2, N=4, normalized=False):
+     """Shift an intermediate representation along the probe's weight direction
+     toward the target attribute vector (lr is unused; kept for API compatibility)."""
+     tensor = inter_rep.clone().to(torch_device)
+     target_clone = target.clone().to(torch_device).to(torch.float)
+
+     # Move the representation N steps along the probe's direction for the
+     # target class, optionally rescaled by the representation's norm.
+     if normalized:
+         cur_input_tensor = tensor + target_clone.view(1, -1) @ probe.proj[0].weight * N * 100 / tensor.norm()
+     else:
+         cur_input_tensor = tensor + target_clone.view(1, -1) @ probe.proj[0].weight * N
+     return cur_input_tensor.clone()
+
+
+ def edit_inter_rep_multi_layers(output, layer_name):
+     """
+     Edit the last-token representation in `output` using the controlling probe
+     for this layer. Must be called from a script that defines `residual`,
+     `classifier_dict`, `attribute`, `cf_target`, `lr`, and `N` at module scope.
+     """
+     if residual:
+         layer_num = layer_name[layer_name.rfind("model.layers.") + len("model.layers."):]
+     else:
+         layer_num = layer_name[layer_name.rfind("model.layers.") + len("model.layers."):layer_name.rfind(".mlp")]
+     layer_num = int(layer_num)
+     probe = classifier_dict[attribute][layer_num + 1]
+     cloned_inter_rep = output[0][0][-1].unsqueeze(0).detach().clone().to(torch.float)
+     with torch.enable_grad():
+         cloned_inter_rep = optimize_one_inter_rep(cloned_inter_rep, layer_name,
+                                                   cf_target, probe,
+                                                   lr=lr, N=N)
+     output[0][0][-1] = cloned_inter_rep[0].to(torch.float16)
+     return output
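For context, a sketch of how the editing function above could be attached during generation, assuming `model` is the loaded chat model, `residual` is True (so hooks go on whole decoder blocks), and the module-scope globals the docstring lists are defined. PyTorch forward hooks receive `(module, input, output)` while `edit_inter_rep_multi_layers` expects `(output, layer_name)`, so a small adapter closes over the layer name:

```python
def make_hook(layer_name):
    def hook(module, inputs, output):
        # Returning a value from a forward hook replaces the module's output.
        return edit_inter_rep_multi_layers(output, layer_name)
    return hook


handles = [module.register_forward_hook(make_hook(name))
           for name, module in model.named_modules()
           if name.startswith("model.layers.") and name.count(".") == 2]  # decoder blocks

# ... run generation with the edited representations ...

for handle in handles:
    handle.remove()
```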
src/losses.py ADDED
@@ -0,0 +1,150 @@
+ import torch
+ import torch.nn.functional as F
+
+
+ def get_device():
+     use_cuda = torch.cuda.is_available()
+     device = torch.device("cuda:0" if use_cuda else "cpu")
+     return device
+
+
+ def relu_evidence(y):
+     return F.relu(y)
+
+
+ def one_hot_embedding(labels, num_classes=10):
+     # Convert to one-hot encoding
+     y = torch.eye(num_classes)
+     return y[labels]
+
+
+ def exp_evidence(y):
+     return torch.exp(torch.clamp(y, -10, 10))
+
+
+ def softplus_evidence(y):
+     return F.softplus(y)
+
+
+ def kl_divergence(alpha, num_classes, device=None):
+     """KL(Dir(alpha) || Dir(1)): regularizes evidence toward the uniform Dirichlet."""
+     if not device:
+         device = get_device()
+     ones = torch.ones([1, num_classes], dtype=torch.float32, device=device)
+     sum_alpha = torch.sum(alpha, dim=1, keepdim=True)
+     first_term = (
+         torch.lgamma(sum_alpha)
+         - torch.lgamma(alpha).sum(dim=1, keepdim=True)
+         + torch.lgamma(ones).sum(dim=1, keepdim=True)
+         - torch.lgamma(ones.sum(dim=1, keepdim=True))
+     )
+     second_term = (
+         (alpha - ones)
+         .mul(torch.digamma(alpha) - torch.digamma(sum_alpha))
+         .sum(dim=1, keepdim=True)
+     )
+     kl = first_term + second_term
+     return kl
+
+
+ def loglikelihood_loss(y, alpha, device=None):
+     if not device:
+         device = get_device()
+     y = y.to(device)
+     alpha = alpha.to(device)
+     S = torch.sum(alpha, dim=1, keepdim=True)
+     loglikelihood_err = torch.sum((y - (alpha / S)) ** 2, dim=1, keepdim=True)
+     loglikelihood_var = torch.sum(
+         alpha * (S - alpha) / (S * S * (S + 1)), dim=1, keepdim=True
+     )
+     loglikelihood = loglikelihood_err + loglikelihood_var
+     return loglikelihood
+
+
+ def mse_loss(y, alpha, epoch_num, num_classes, annealing_step, device=None):
+     if not device:
+         device = get_device()
+     y = y.to(device)
+     alpha = alpha.to(device)
+     loglikelihood = loglikelihood_loss(y, alpha, device=device)
+
+     # Ramp the KL regularizer in over the first `annealing_step` epochs.
+     annealing_coef = torch.min(
+         torch.tensor(1.0, dtype=torch.float32),
+         torch.tensor(epoch_num / annealing_step, dtype=torch.float32),
+     )
+
+     kl_alpha = (alpha - 1) * (1 - y) + 1
+     kl_div = annealing_coef * kl_divergence(kl_alpha, num_classes, device=device)
+     return loglikelihood + kl_div
+
+
+ def edl_loss(func, y, alpha, epoch_num, num_classes, annealing_step, device=None):
+     y = y.to(device)
+     alpha = alpha.to(device)
+     S = torch.sum(alpha, dim=1, keepdim=True)
+
+     A = torch.sum(y * (func(S) - func(alpha)), dim=1, keepdim=True)
+
+     annealing_coef = torch.min(
+         torch.tensor(1.0, dtype=torch.float32),
+         torch.tensor(epoch_num / annealing_step, dtype=torch.float32),
+     )
+
+     kl_alpha = (alpha - 1) * (1 - y) + 1
+     kl_div = annealing_coef * kl_divergence(kl_alpha, num_classes, device=device)
+     return A + kl_div
+
+
+ def edl_mse_loss(output, target, epoch_num, num_classes, annealing_step=10, device="cuda", probability=False):
+     # If `probability` is set, `target` is already a distribution; otherwise
+     # integer class labels are one-hot encoded first.
+     if not probability:
+         target = one_hot_embedding(target, num_classes)
+     if not device:
+         device = get_device()
+     evidence = relu_evidence(output)
+     alpha = evidence + 1
+     loss = torch.mean(
+         mse_loss(target, alpha, epoch_num, num_classes, annealing_step, device=device)
+     )
+     return loss
+
+
+ def edl_log_loss(output, target, epoch_num, num_classes, annealing_step, device="cuda"):
+     if not device:
+         device = get_device()
+     evidence = relu_evidence(output)
+     alpha = evidence + 1
+     loss = torch.mean(
+         edl_loss(
+             torch.log, target, alpha, epoch_num, num_classes, annealing_step, device
+         )
+     )
+     return loss
+
+
+ def edl_digamma_loss(
+     output, target, epoch_num, num_classes, annealing_step, device=None
+ ):
+     if not device:
+         device = get_device()
+     evidence = relu_evidence(output)
+     alpha = evidence + 1
+     loss = torch.mean(
+         edl_loss(
+             torch.digamma, target, alpha, epoch_num, num_classes, annealing_step, device
+         )
+     )
+     return loss
+
+
+ def calc_prob_uncertinty(p):
+     evidence = relu_evidence(p)
+     alpha = evidence + 1
+     # Evidential uncertainty u = K / S; the 6 here hardcodes the class count K.
+     uncertainty = 6 / torch.sum(alpha, dim=1, keepdim=True)
+     _, preds = torch.max(p, 1)
+     prob = alpha / torch.sum(alpha, dim=1, keepdim=True)
+     prob = prob.flatten()
+     preds = preds.flatten()
+     return prob, uncertainty
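A self-contained sketch of calling the evidential loss during probe training; the 4-class/5120-dim shapes mirror the age probes elsewhere in this commit, and the bare linear layer is just a stand-in:

```python
import torch
from src.losses import edl_mse_loss

probe = torch.nn.Linear(5120, 4)      # stand-in for a 4-class age probe
acts = torch.randn(8, 5120)           # a batch of cached activations
labels = torch.randint(0, 4, (8,))

loss = edl_mse_loss(probe(acts), labels, epoch_num=1,
                    num_classes=4, annealing_step=10, device="cpu")
loss.backward()                       # gradients reach only the probe
```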
src/probes.py ADDED
@@ -0,0 +1,551 @@
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+
+
+ class ProbeClassification(nn.Module):
+     """Two-layer MLP probe over a single layer's activations."""
+
+     def __init__(self, device, probe_class, input_dim=512, hidden_neurons=128):
+         super().__init__()
+         self.input_dim = input_dim
+         self.probe_class = probe_class
+         self.proj = nn.Sequential(
+             nn.Linear(self.input_dim, hidden_neurons),
+             nn.ReLU(True),
+             nn.Linear(hidden_neurons, self.probe_class),
+         )
+         self.apply(self._init_weights)
+         self.to(device)
+
+     def forward(self, act, y=None):
+         # act: [B, f]; y: [B]
+         logits = self.proj(act)  # [B, C]
+         if y is None:
+             return logits, None
+         targets = y.to(torch.long)
+         loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
+         return logits, loss
+
+     def _init_weights(self, module):
+         if isinstance(module, (nn.Linear, nn.Embedding)):
+             module.weight.data.normal_(mean=0.0, std=0.02)
+             if isinstance(module, nn.Linear) and module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.LayerNorm):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+
+     def configure_optimizers(self, train_config):
+         """
+         Separate the parameters into those that will and won't experience
+         weight decay for regularization, then return the optimizer and a
+         plateau scheduler.
+         """
+         decay = set()
+         no_decay = set()
+         whitelist_weight_modules = (torch.nn.Linear,)
+         blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
+         for mn, m in self.named_modules():
+             for pn, p in m.named_parameters():
+                 fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
+                 if pn.endswith('bias'):
+                     # biases are weight decayed here as well
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
+                     # weights of whitelist modules will be weight decayed
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                     # weights of blacklist modules will NOT be weight decayed
+                     no_decay.add(fpn)
+
+         # validate that we considered every parameter
+         param_dict = {pn: p for pn, p in self.named_parameters()}
+         inter_params = decay & no_decay
+         union_params = decay | no_decay
+         assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params),)
+         assert len(param_dict.keys() - union_params) == 0, \
+             "parameters %s were not separated into either decay/no_decay set!" % (str(param_dict.keys() - union_params),)
+
+         # create the pytorch optimizer object
+         optim_groups = [
+             {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
+             {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
+         ]
+         optimizer = torch.optim.Adam(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
+         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.75, patience=0)
+         return optimizer, scheduler
+
+
+ class LinearProbeClassification(nn.Module):
+     """Single linear layer probe, optionally followed by a squashing nonlinearity."""
+
+     def __init__(self, device, probe_class, input_dim=512, logistic=False, Relu=False, TanH=False):
+         super().__init__()
+         self.input_dim = input_dim
+         self.probe_class = probe_class
+         if logistic:
+             self.proj = nn.Sequential(
+                 nn.Linear(self.input_dim, self.probe_class),
+                 nn.Sigmoid()
+             )
+         elif Relu:
+             self.proj = nn.Sequential(
+                 nn.Linear(self.input_dim, self.probe_class),
+                 nn.ReLU(True)
+             )
+         elif TanH:
+             self.proj = nn.Sequential(
+                 nn.Linear(self.input_dim, self.probe_class),
+                 nn.Hardsigmoid(inplace=True)
+             )
+         else:
+             self.proj = nn.Sequential(
+                 nn.Linear(self.input_dim, self.probe_class),
+             )
+
+         self.apply(self._init_weights)
+         self.to(device)
+
+     def forward(self, act, y=None):
+         # act: [B, f]; y: [B]
+         logits = self.proj(act)  # [B, C]
+         if y is None:
+             return logits, None
+         targets = y.to(torch.long)
+         loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
+         return logits, loss
+
+     def _init_weights(self, module):
+         if isinstance(module, (nn.Linear, nn.Embedding)):
+             module.weight.data.normal_(mean=0.0, std=0.02)
+             if isinstance(module, nn.Linear) and module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.LayerNorm):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+
+     def configure_optimizers(self, train_config):
+         """
+         Separate the parameters into those that will and won't experience
+         weight decay for regularization, then return the optimizer and a
+         plateau scheduler.
+         """
+         decay = set()
+         no_decay = set()
+         whitelist_weight_modules = (torch.nn.Linear,)
+         blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
+         for mn, m in self.named_modules():
+             for pn, p in m.named_parameters():
+                 fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
+                 if pn.endswith('bias'):
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                     no_decay.add(fpn)
+
+         # validate that we considered every parameter
+         param_dict = {pn: p for pn, p in self.named_parameters()}
+         inter_params = decay & no_decay
+         union_params = decay | no_decay
+         assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params),)
+         assert len(param_dict.keys() - union_params) == 0, \
+             "parameters %s were not separated into either decay/no_decay set!" % (str(param_dict.keys() - union_params),)
+
+         optim_groups = [
+             {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
+             {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
+         ]
+         optimizer = torch.optim.Adam(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
+         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.75, patience=0)
+         return optimizer, scheduler
+
+
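+ # Illustrative usage (added for clarity, not from the original file): a 4-class
+ # reading probe over 5120-dimensional Llama-2-13b activations.
+ #   probe = LinearProbeClassification("cpu", probe_class=4, input_dim=5120)
+ #   logits, loss = probe(torch.randn(8, 5120), y=torch.randint(0, 4, (8,)))
+ #   loss.backward()
+
+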
178
+ class TwoLayerLinearProbeClassification(nn.Module):
179
+ def __init__(self, device, probe_class, input_dim=512, logistic=False): # from 0 to 15
180
+ super().__init__()
181
+ self.input_dim = input_dim
182
+ self.probe_class = probe_class
183
+ if not logistic:
184
+ self.proj = nn.Sequential(
185
+ nn.Linear(self.input_dim, self.input_dim),
186
+ nn.Linear(self.input_dim, self.probe_class),
187
+ )
188
+ else:
189
+ self.proj = nn.Sequential(
190
+ nn.Linear(self.input_dim, self.input_dim),
191
+ nn.Linear(self.input_dim, self.probe_class),
192
+ nn.Sigmoid()
193
+ )
194
+
195
+ self.apply(self._init_weights)
196
+ # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
197
+ self.to(device)
198
+ def forward(self, act, y=None):
199
+ # [B, f], [B]
200
+ logits = self.proj(act)#.reshape(-1, self.probe_number, self.probe_class) # [B, C]
201
+ if y is None:
202
+ return logits, None
203
+ else:
204
+ targets = y.to(torch.long)
205
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
206
+ return logits, loss
207
+
208
+ def _init_weights(self, module):
209
+ if isinstance(module, (nn.Linear, nn.Embedding)):
210
+ module.weight.data.normal_(mean=0.0, std=0.02)
211
+ if isinstance(module, nn.Linear) and module.bias is not None:
212
+ module.bias.data.zero_()
213
+ elif isinstance(module, nn.LayerNorm):
214
+ module.bias.data.zero_()
215
+ module.weight.data.fill_(1.0)
216
+
217
+ def configure_optimizers(self, train_config):
218
+ """
219
+ This long function is unfortunately doing something very simple and is being very defensive:
220
+ We are separating out all parameters of the model into two buckets: those that will experience
221
+ weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
222
+ We are then returning the PyTorch optimizer object.
223
+ """
224
+ # separate out all parameters to those that will and won't experience regularizing weight decay
225
+ decay = set()
226
+ no_decay = set()
227
+ whitelist_weight_modules = (torch.nn.Linear, )
228
+ blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
229
+ for mn, m in self.named_modules():
230
+ for pn, p in m.named_parameters():
231
+ fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
232
+ if pn.endswith('bias'):
233
+ # biases of whitelist modules will be weight decayed
234
+ decay.add(fpn)
235
+ elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
236
+ # weights of whitelist modules will be weight decayed
237
+ decay.add(fpn)
238
+ elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
239
+ # weights of blacklist modules will NOT be weight decayed
240
+ no_decay.add(fpn)
241
+
242
+ # special case the position embedding parameter in the root GPT module as not decayed
243
+ # no_decay.add('pos_emb')
244
+
245
+ # validate that we considered every parameter
246
+ param_dict = {pn: p for pn, p in self.named_parameters()}
247
+ inter_params = decay & no_decay
248
+ union_params = decay | no_decay
249
+ assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
250
+ assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
251
+ % (str(param_dict.keys() - union_params), )
252
+ print("Decayed:", decay)
253
+ # create the pytorch optimizer object
254
+ optim_groups = [
255
+ {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
256
+ {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
257
+ ]
258
+ optimizer = torch.optim.Adam(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
259
+ scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.75, patience=0)
260
+ return optimizer, scheduler
261
+
262
+
263
+ class ProbeClassificationMixScaler(nn.Module):
+     def __init__(self, device, probe_class, input_dim=512, num_layers=41, soft_weight_lr_rate=1e-1,
+                  hidden_neurons=128):
+         super().__init__()
+         self.input_dim = input_dim
+         self.probe_class = probe_class
+         self.num_layers = num_layers
+         # Learnable per-layer mixing weights, initialized to a uniform average over layers
+         self.mix_weights = nn.Linear(num_layers, 1, bias=False)
+         torch.nn.init.constant_(self.mix_weights.weight, 1 / num_layers)
+         self.soft_weight_lr_rate = soft_weight_lr_rate
+         self.proj = nn.Sequential(
+             nn.Linear(self.input_dim, hidden_neurons),
+             nn.ReLU(True),
+             nn.Linear(hidden_neurons, self.probe_class),
+         )
+         self.apply(self._init_weights)
+         self.to(device)
+
+     def forward(self, act, y=None):
+         # act: [B, num_layers, f], y: [B]
+         softmaxed_weights = torch.nn.functional.softmax(self.mix_weights.weight, dim=1)
+         act = act.permute([0, 2, 1])  # [B, f, num_layers]
+         act = (act @ softmaxed_weights.T)[..., 0]  # weighted average over layers -> [B, f]
+         logits = self.proj(act)  # [B, C]
+         if y is None:
+             return logits, None
+         else:
+             targets = y.to(torch.long)
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
+             return logits, loss
+
+     def _init_weights(self, module):
+         if isinstance(module, (nn.Linear, nn.Embedding)):
+             module.weight.data.normal_(mean=0.0, std=0.02)
+             if isinstance(module, nn.Linear) and module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.LayerNorm):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+
+     def configure_optimizers(self, train_config):
+         """
+         This long function is unfortunately doing something very simple and is being very defensive:
+         We are separating out all parameters of the model into two buckets: those that will experience
+         weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
+         We are then returning the PyTorch optimizer object.
+         """
+         # separate out all parameters to those that will and won't experience regularizing weight decay
+         decay = set()
+         no_decay = set()
+         whitelist_weight_modules = (torch.nn.Linear, )
+         blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
+         for mn, m in self.named_modules():
+             for pn, p in m.named_parameters():
+                 fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
+                 if pn.endswith('bias'):
+                     # biases are placed in the decay bucket here
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and ("mix" not in fpn) and isinstance(m, whitelist_weight_modules):
+                     # weights of whitelist modules (except the mixing weights) will be weight decayed
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                     # weights of blacklist modules will NOT be weight decayed
+                     no_decay.add(fpn)
+
+         # validate that we considered every parameter
+         param_dict = {pn: p for pn, p in self.named_parameters()}
+         inter_params = decay & no_decay
+         union_params = decay | no_decay
+         assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
+         # The mixing weights are deliberately excluded from both sets (they get their own
+         # optimizer group below), so the usual completeness check stays disabled:
+         # assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" % (str(param_dict.keys() - union_params), )
+         print("Decayed:", decay)
+         # create the pytorch optimizer object; the mixing weights get their own learning rate
+         optim_groups = [
+             {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
+             {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
+             {"params": self.mix_weights.weight, "lr": self.soft_weight_lr_rate, "weight_decay": train_config.weight_decay},
+         ]
+         optimizer = torch.optim.Adam(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
+         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.75, patience=0)
+         return optimizer, scheduler
+
+
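+ # Shape sketch for the layer-mixing step above (illustrative only; uses the class
+ # defaults input_dim=512 / num_layers=41 and random stand-in activations):
+ #   probe = ProbeClassificationMixScaler(device="cpu", probe_class=4)
+ #   act = torch.randn(8, 41, 512)   # [B, num_layers, f]
+ #   logits, _ = probe(act)          # softmax-weighted layer average -> [8, 4]
+
+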
+ class LinearProbeClassificationMixScaler(nn.Module):
+     def __init__(self, device, probe_class, input_dim=512, num_layers=41, soft_weight_lr_rate=1e-1,
+                  logistic=False):
+         super().__init__()
+         self.input_dim = input_dim
+         self.probe_class = probe_class
+         self.num_layers = num_layers
+         # Learnable per-layer mixing weights, initialized to a uniform average over layers
+         self.mix_weights = nn.Linear(num_layers, 1, bias=False)
+         torch.nn.init.constant_(self.mix_weights.weight, 1 / num_layers)
+         self.soft_weight_lr_rate = soft_weight_lr_rate
+         if not logistic:
+             self.proj = nn.Sequential(
+                 nn.Linear(self.input_dim, self.probe_class),
+             )
+         else:
+             self.proj = nn.Sequential(
+                 nn.Linear(self.input_dim, self.probe_class),
+                 nn.Sigmoid()
+             )
+         self.apply(self._init_weights)
+         self.to(device)
+
+     def forward(self, act, y=None):
+         # act: [B, num_layers, f], y: [B]
+         softmaxed_weights = torch.nn.functional.softmax(self.mix_weights.weight, dim=1)
+         act = act.permute([0, 2, 1])  # [B, f, num_layers]
+         act = (act @ softmaxed_weights.T)[..., 0]  # weighted average over layers -> [B, f]
+         logits = self.proj(act)  # [B, C]
+         if y is None:
+             return logits, None
+         else:
+             targets = y.to(torch.long)
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
+             return logits, loss
+
+     def _init_weights(self, module):
+         if isinstance(module, (nn.Linear, nn.Embedding)):
+             module.weight.data.normal_(mean=0.0, std=0.02)
+             if isinstance(module, nn.Linear) and module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.LayerNorm):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+
+     def configure_optimizers(self, train_config):
+         """
+         This long function is unfortunately doing something very simple and is being very defensive:
+         We are separating out all parameters of the model into two buckets: those that will experience
+         weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
+         We are then returning the PyTorch optimizer object.
+         """
+         # separate out all parameters to those that will and won't experience regularizing weight decay
+         decay = set()
+         no_decay = set()
+         whitelist_weight_modules = (torch.nn.Linear, )
+         blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
+         for mn, m in self.named_modules():
+             for pn, p in m.named_parameters():
+                 fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
+                 if pn.endswith('bias'):
+                     # biases are placed in the decay bucket here
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and ("mix" not in fpn) and isinstance(m, whitelist_weight_modules):
+                     # weights of whitelist modules (except the mixing weights) will be weight decayed
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                     # weights of blacklist modules will NOT be weight decayed
+                     no_decay.add(fpn)
+
+         # validate that we considered every parameter
+         param_dict = {pn: p for pn, p in self.named_parameters()}
+         inter_params = decay & no_decay
+         union_params = decay | no_decay
+         assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
+         # The mixing weights are deliberately excluded from both sets (they get their own
+         # optimizer group below), so the usual completeness check stays disabled:
+         # assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" % (str(param_dict.keys() - union_params), )
+         print("Decayed:", decay)
+         # create the pytorch optimizer object; the mixing weights get their own learning rate
+         optim_groups = [
+             {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
+             {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
+             {"params": self.mix_weights.weight, "lr": self.soft_weight_lr_rate, "weight_decay": train_config.weight_decay},
+         ]
+         optimizer = torch.optim.Adam(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
+         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.75, patience=0)
+         return optimizer, scheduler
+
+
+ class TwoLayerLinearProbeClassificationMixScaler(nn.Module):
+     def __init__(self, device, probe_class, input_dim=512, num_layers=41, soft_weight_lr_rate=1e-1,
+                  logistic=False):
+         super().__init__()
+         self.input_dim = input_dim
+         self.probe_class = probe_class
+         self.num_layers = num_layers
+         # Learnable per-layer mixing weights, initialized to a uniform average over layers
+         self.mix_weights = nn.Linear(num_layers, 1, bias=False)
+         torch.nn.init.constant_(self.mix_weights.weight, 1 / num_layers)
+         self.soft_weight_lr_rate = soft_weight_lr_rate
+         # One linear "rotation" per layer, applied before mixing;
+         # 5120 is the Llama-2-13b residual-stream width
+         self.rotates = nn.ModuleList([nn.Linear(5120, 5120) for _ in range(num_layers)])
+         if not logistic:
+             self.proj = nn.Sequential(
+                 nn.Linear(self.input_dim, self.probe_class),
+             )
+         else:
+             self.proj = nn.Sequential(
+                 nn.Linear(self.input_dim, self.probe_class),
+                 nn.Sigmoid()
+             )
+         self.apply(self._init_weights)
+         self.to(device)
+
+     def forward(self, act, y=None):
+         # act: [B, num_layers, f], y: [B]
+         outputs = []
+         for i in range(self.num_layers):
+             output_i = self.rotates[i](act[:, i, :])  # shape: (batch_size, 5120)
+             outputs.append(output_i)
+
+         # Stack the outputs back together
+         act = torch.stack(outputs, dim=1)
+         softmaxed_weights = torch.nn.functional.softmax(self.mix_weights.weight, dim=1)
+         act = act.permute([0, 2, 1])  # [B, f, num_layers]
+         act = (act @ softmaxed_weights.T)[..., 0]  # weighted average over layers -> [B, f]
+         logits = self.proj(act)  # [B, C]
+         if y is None:
+             return logits, None
+         else:
+             targets = y.to(torch.long)
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
+             return logits, loss
+
+     def _init_weights(self, module):
+         if isinstance(module, (nn.Linear, nn.Embedding)):
+             module.weight.data.normal_(mean=0.0, std=0.02)
+             if isinstance(module, nn.Linear) and module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.LayerNorm):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+
+     def configure_optimizers(self, train_config):
+         """
+         This long function is unfortunately doing something very simple and is being very defensive:
+         We are separating out all parameters of the model into two buckets: those that will experience
+         weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
+         We are then returning the PyTorch optimizer object.
+         """
+         # separate out all parameters to those that will and won't experience regularizing weight decay
+         decay = set()
+         no_decay = set()
+         whitelist_weight_modules = (torch.nn.Linear, )
+         blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
+         for mn, m in self.named_modules():
+             for pn, p in m.named_parameters():
+                 fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
+                 if pn.endswith('bias'):
+                     # biases are placed in the decay bucket here
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and ("mix" not in fpn) and isinstance(m, whitelist_weight_modules):
+                     # weights of whitelist modules (except the mixing weights) will be weight decayed
+                     decay.add(fpn)
+                 elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                     # weights of blacklist modules will NOT be weight decayed
+                     no_decay.add(fpn)
+
+         # validate that we considered every parameter
+         param_dict = {pn: p for pn, p in self.named_parameters()}
+         inter_params = decay & no_decay
+         union_params = decay | no_decay
+         assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
+         # The mixing weights are deliberately excluded from both sets (they get their own
+         # optimizer group below), so the usual completeness check stays disabled:
+         # assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" % (str(param_dict.keys() - union_params), )
+         print("Decayed:", decay)
+         # create the pytorch optimizer object; the mixing weights get their own learning rate
+         optim_groups = [
+             {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
+             {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
+             {"params": self.mix_weights.weight, "lr": self.soft_weight_lr_rate, "weight_decay": train_config.weight_decay},
+         ]
+         optimizer = torch.optim.Adam(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
+         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.75, patience=0)
+         return optimizer, scheduler
+
+
+ class TrainerConfig:
+     # optimization parameters
+     learning_rate = 1e-3
+     betas = (0.9, 0.95)
+     weight_decay = 0.1  # only applied on matmul weights
+
+     def __init__(self, **kwargs):
+         for k, v in kwargs.items():
+             setattr(self, k, v)
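+
+
+ if __name__ == "__main__":
+     # Illustrative smoke test, not part of the training pipeline: build a
+     # mix-scaler probe on CPU, feed random stand-in activations shaped like
+     # per-layer hidden states, and take one optimizer step. Assumes the
+     # torch / nn / F imports at the top of this file.
+     probe = LinearProbeClassificationMixScaler(device="cpu", probe_class=4,
+                                                input_dim=512, num_layers=41)
+     optimizer, scheduler = probe.configure_optimizers(TrainerConfig())
+     act = torch.randn(8, 41, 512)       # [B, num_layers, f]
+     y = torch.randint(0, 4, (8,))
+     logits, loss = probe(act, y)
+     loss.backward()
+     optimizer.step()
+     scheduler.step(loss.item())
+     print(logits.shape, loss.item())    # torch.Size([8, 4]) and a scalar loss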
src/prompt_utils.py ADDED
@@ -0,0 +1,58 @@
+ def split_into_messages(text: str) -> list[str]:
+     # Constants used for splitting
+     B_INST, E_INST = "[INST]", "[/INST]"
+
+     # Use the tokens to split the text
+     parts = []
+     current_message = ""
+
+     for word in text.split():
+         # If we encounter a start or end token and there's a current message, store it
+         if word in [B_INST, E_INST] and current_message:
+             parts.append(current_message.strip())
+             current_message = ""
+         # If the word is not a token, add it to the current message
+         elif word not in [B_INST, E_INST]:
+             current_message += word + " "
+
+     # Append any remaining message
+     if current_message:
+         parts.append(current_message.strip())
+
+     return parts
+
+
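+ # Illustrative behavior (hypothetical input, not taken from the datasets):
+ #   split_into_messages("[INST] Hi there [/INST] Hello! How can I help? [INST] Thanks [/INST]")
+ #   -> ["Hi there", "Hello! How can I help?", "Thanks"]
+
+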
+ def llama_v2_reverse(prompt: str) -> list[dict]:
+     # Constants used in the LLaMa style
+     B_INST, E_INST = "[INST]", "[/INST]"
+     B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+     BOS, EOS = "<s>", "</s>"
+
+     messages = []
+     # Peel off the system prompt, if present, before splitting user/assistant turns
+     sys_start = prompt.find(B_SYS)
+     sys_end = prompt.rfind(E_SYS)
+     if sys_start != -1 and sys_end != -1:
+         system_msg = prompt[sys_start + len(B_SYS): sys_end]
+         messages.append({"role": "system", "content": system_msg})
+         prompt = prompt[sys_end + len(E_SYS):]
+
+     user_ai_msgs = split_into_messages(prompt)
+
+     # Remaining messages alternate between the user and the assistant, user first
+     user_turn = True
+     for message in user_ai_msgs:
+         if user_turn:
+             messages.append({"role": "user", "content": message})
+         else:
+             messages.append({"role": "assistant", "content": message})
+         user_turn = not user_turn
+
+     return messages
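+
+
+ # Round-trip sketch (hypothetical prompt with an abbreviated system message):
+ #   p = "[INST] <<SYS>>\nBe concise.\n<</SYS>>\n\nHi [/INST] Hello!"
+ #   llama_v2_reverse(p)
+ #   -> [{"role": "system", "content": "Be concise."},
+ #       {"role": "user", "content": "Hi"},
+ #       {"role": "assistant", "content": "Hello!"}]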
src/train_test_utils.py ADDED
@@ -0,0 +1,174 @@
+ import torch
+ from tqdm.auto import tqdm
+ import time
+ import numpy as np
+ from src.losses import calc_prob_uncertinty
+ tic, toc = (time.time, time.time)
+
+
+ def train(probe, device, train_loader, optimizer, epoch, loss_func,
+           class_names=None, report=False, verbose_interval=5, layer_num=40,
+           head=None, verbose=True, return_raw_outputs=False, one_hot=False, uncertainty=False, **kwargs):
+     """
+     :param probe: pytorch model (class:torch.nn.Module)
+     :param device: device used to train the model (e.g. torch.device("cuda") for training on GPU)
+     :param train_loader: torch.utils.data.DataLoader of train dataset
+     :param optimizer: optimizer for the model
+     :param epoch: current epoch of training
+     :param loss_func: loss function for the training
+     :param class_names: names of the classification classes, used in the train report
+     :param report: whether to print a classification report of training
+     :param verbose_interval: print a train progress report after this many batches in each epoch
+     :return: average loss, train accuracy (plus predictions and true labels if return_raw_outputs)
+     """
+     assert (verbose_interval is None) or verbose_interval > 0, "invalid verbose_interval, verbose_interval(int) > 0"
+     starttime = tic()
+     # Set the model to train mode: essential for proper gradient descent
+     probe.train()
+     loss_sum = 0
+     correct = 0
+     tot = 0
+
+     preds = []
+     truths = []
+
+     # Iterate through the train dataset
+     for batch_idx, batch in enumerate(train_loader):
+         target = batch["age"].long().to(device)
+         if one_hot:
+             target = torch.nn.functional.one_hot(target, **kwargs).float()
+         optimizer.zero_grad()
+
+         if layer_num is not None:
+             act = batch["hidden_states"][:, layer_num].to(device)
+         else:
+             act = batch["hidden_states"].to(device)
+         output = probe(act)
+         if not one_hot:
+             loss = loss_func(output[0], target, **kwargs)
+         else:
+             loss = loss_func(output[0], target)
+         loss.backward()
+         optimizer.step()
+
+         loss_sum += loss.sum().item()
+         if uncertainty:
+             _, uncertainty = calc_prob_uncertinty(output[0].detach().cpu().numpy())
+         pred = torch.argmax(output[0], axis=1)
+
+         # Scikit-Learn's OvR multi-class logistic regression linearly normalizes the
+         # predicted probabilities before calling argmax. The argmax above is equivalent
+         # to that decision function; we skip the normalization to save computation.
+         if len(target.shape) > 1:
+             target = torch.argmax(target, axis=1)
+         correct += np.sum(np.array(pred.detach().cpu().numpy()) == np.array(target.detach().cpu().numpy()))
+         if return_raw_outputs:
+             preds.append(pred.detach().cpu().numpy())
+             truths.append(target.detach().cpu().numpy())
+         tot += pred.shape[0]
+
+     train_acc = correct / tot
+     loss_avg = loss_sum / len(train_loader)
+
+     endtime = toc()
+     if verbose:
+         print('\nTrain set: Average loss: {:.4f} ({:.3f} sec) Accuracy: {:.3f}\n'.format(
+             loss_avg, endtime - starttime, train_acc))
+
+     if return_raw_outputs:
+         return loss_avg, train_acc, np.concatenate(preds), np.concatenate(truths)
+     else:
+         return loss_avg, train_acc
+
+
+ def test(probe, device, test_loader, loss_func, return_raw_outputs=False, verbose=True,
+          layer_num=40, scheduler=None, one_hot=False, uncertainty=False, **kwargs):
+     """
+     :param probe: pytorch model (class:torch.nn.Module)
+     :param device: device used to evaluate the model (e.g. torch.device("cuda"))
+     :param test_loader: torch.utils.data.DataLoader of test dataset
+     :param loss_func: loss function for the evaluation
+     :param return_raw_outputs: whether to return the predictions and true labels, used for auc computation
+     :return: average test loss, test accuracy (plus predictions and true labels if return_raw_outputs)
+     """
+     # Set the model to evaluation mode: essential for testing the model
+     probe.eval()
+     test_loss = 0
+     tot = 0
+     correct = 0
+     preds = []
+     truths = []
+
+     # Do not compute gradients on the test set:
+     # we don't adjust the weights of the model here
+     with torch.no_grad():
+         for batch_idx, batch in enumerate(test_loader):
+             target = batch["age"].long().to(device)
+             if one_hot:
+                 target = torch.nn.functional.one_hot(target, **kwargs).float()
+             if layer_num is not None:
+                 act = batch["hidden_states"][:, layer_num].to(device)
+             else:
+                 act = batch["hidden_states"].to(device)
+             output = probe(act)
+             if uncertainty:
+                 _, uncertainty = calc_prob_uncertinty(output[0].detach().cpu().numpy())
+             pred = torch.argmax(output[0], axis=1)
+
+             if not one_hot:
+                 loss = loss_func(output[0], target, **kwargs)
+             else:
+                 loss = loss_func(output[0], target)
+             test_loss += loss.sum().item()  # sum up batch loss
+
+             # Same argmax decision rule as in train(); see the comment there
+             if len(target.shape) > 1:
+                 target = torch.argmax(target, axis=1)
+
+             pred = np.array(pred.detach().cpu().numpy())
+             target = np.array(target.detach().cpu().numpy())
+             correct += np.sum(pred == target)
+             tot += pred.shape[0]
+             if return_raw_outputs:
+                 preds.append(pred)
+                 truths.append(target)
+
+     test_loss /= len(test_loader)
+     if scheduler:
+         scheduler.step(test_loss)
+
+     test_acc = correct / tot
+
+     if verbose:
+         print('Test set: Average loss: {:.4f}, Accuracy: {:.3f}\n'.format(test_loss, test_acc))
+
+     # If requested, also return the raw predictions and labels (before any reporting)
+     if return_raw_outputs:
+         return test_loss, test_acc, np.concatenate(preds), np.concatenate(truths)
+     else:
+         return test_loss, test_acc
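+
+
+ # Epoch-loop sketch tying train() and test() together (illustrative; assumes
+ # DataLoaders whose batches are dicts with "hidden_states" [B, L, f] and "age"
+ # [B] labels, plus a probe and TrainerConfig from src.probes):
+ #
+ #   optimizer, scheduler = probe.configure_optimizers(TrainerConfig())
+ #   for epoch in range(1, 51):
+ #       train(probe, "cuda", train_loader, optimizer, epoch,
+ #             loss_func=torch.nn.BCELoss(), layer_num=20, one_hot=True, num_classes=4)
+ #       test(probe, "cuda", test_loader, loss_func=torch.nn.BCELoss(),
+ #            layer_num=20, scheduler=scheduler, one_hot=True, num_classes=4)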
train_probes.py ADDED
@@ -0,0 +1,523 @@
+ #!/usr/bin/env python3
+ """
+ Train reading and controlling probes for LLM attribute detection.
+ This script trains linear probes on different layers of a language model to detect
+ demographic attributes (age, gender, socioeconomic status, education level).
+ """
+
+ import os
+ import sys
+ import argparse
+ import pickle
+ import time
+ from pathlib import Path
+ from typing import Dict, List, Tuple, Optional
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader, Subset
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from tqdm.auto import tqdm
+ import sklearn.model_selection
+ from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
+ import matplotlib.pyplot as plt
+
+ # Import custom modules
+ try:
+     from src.dataset import TextDataset
+     from src.probes import LinearProbeClassification
+     from src.train_test_utils import train, test
+     from src.losses import edl_mse_loss
+ except ImportError as e:
+     print(f"❌ ERROR: Failed to import required modules: {e}")
+     print("Please ensure all required modules are in the correct location.")
+     sys.exit(1)
+
+
+ class TrainerConfig:
+     """Configuration for training probes."""
+     learning_rate = 1e-3
+     betas = (0.9, 0.95)
+     weight_decay = 0.1  # only applied on matmul weights
+
+     def __init__(self, **kwargs):
+         for k, v in kwargs.items():
+             setattr(self, k, v)
+
+
+ class ProbeTrainer:
+     """Main class for training reading and controlling probes."""
+
+     def __init__(self, model_name: str = "meta-llama/Llama-2-13b-chat-hf",
+                  device: str = "cuda", use_auth_token: bool = True):
+         """
+         Initialize the probe trainer.
+
+         Args:
+             model_name: HuggingFace model name
+             device: Device to use for training
+             use_auth_token: Whether to use an auth token for the model download
+         """
+         self.device = device
+         self.model_name = model_name
+         self.use_auth_token = use_auth_token
+
+         # Configuration flags
+         self.new_prompt_format = True
+         self.residual_stream = True
+         self.uncertainty = False
+         self.logistic = True
+         self.augmented = False
+         self.remove_last_ai_response = True
+         self.include_inst = True
+         self.one_hot = True
+
+         # Label mappings
+         self.label_mappings = {
+             "_age_": {
+                 "child": 0,
+                 "adolescent": 1,
+                 "adult": 2,
+                 "older adult": 3,
+             },
+             "_gender_": {
+                 "male": 0,
+                 "female": 1,
+             },
+             "_socioeco_": {
+                 "low": 0,
+                 "middle": 1,
+                 "high": 2
+             },
+             "_education_": {
+                 "someschool": 0,
+                 "highschool": 1,
+                 "collegemore": 2
+             }
+         }
+
+         self.prompt_translator = {
+             "_age_": "age",
+             "_gender_": "gender",
+             "_socioeco_": "socioeconomic status",
+             "_education_": "education level",
+         }
+
+         self.openai_dataset = {
+             "_age_": "data/dataset/openai_age_1/",
+             "_gender_": "data/dataset/openai_gender_1/",
+             "_education_": "data/dataset/openai_education_1/",
+             "_socioeco_": "data/dataset/openai_socioeconomic_1/",
+         }
+
+         # Dataset configurations
+         self.dataset_configs = [
+             ("data/dataset/llama_age_1/", "_age_"),
+             ("data/dataset/llama_gender_1/", "_gender_"),
+             ("data/dataset/llama_socioeconomic_1/", "_socioeco_"),
+             ("data/dataset/openai_education_1/", "_education_"),
+         ]
+
+         # Initialize model and tokenizer
+         print(f"πŸš€ Initializing ProbeTrainer with model: {model_name}")
+         self._initialize_model()
+
+     def _initialize_model(self):
+         """Initialize the tokenizer and model."""
+         try:
+             print("πŸ“₯ Loading tokenizer...")
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 self.model_name,
+                 use_auth_token=self.use_auth_token
+             )
+             print("βœ… Tokenizer loaded successfully")
+
+             print("πŸ“₯ Loading model...")
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 self.model_name,
+                 use_auth_token=self.use_auth_token
+             )
+
+             if self.device == "cuda":
+                 print("πŸ”§ Moving model to GPU and setting to half precision...")
+                 self.model.half().cuda()
+
+             self.model.eval()
+             print("βœ… Model loaded and ready")
+
+         except Exception as e:
+             print(f"❌ ERROR: Failed to initialize model: {e}")
+             sys.exit(1)
+
+     def _get_additional_datasets(self, label_idf: str, directory: str) -> List[str]:
+         """Get additional datasets for training."""
+         if label_idf == "_education_":
+             additional_dataset = []
+         else:
+             # Replace _1/ with _2/ for the second dataset
+             additional_dataset = [
+                 directory.replace("_1/", "_2/"),
+                 self.openai_dataset[label_idf]
+             ]
+
+         # Add extra datasets based on attribute type
+         if label_idf == "_gender_":
+             additional_dataset += [
+                 "data/dataset/openai_gender_2/",
+                 "data/dataset/openai_gender_3/",
+                 "data/dataset/openai_gender_4",
+             ]
+         elif label_idf == "_education_":
+             additional_dataset += [
+                 "data/dataset/openai_education_three_classes_2/",
+                 "data/dataset/openai_education_three_classes_3/"
+             ]
+         elif label_idf == "_socioeco_":
+             additional_dataset += [
+                 "data/dataset/openai_socioeconomic_2/"
+             ]
+         elif label_idf == "_age_":
+             additional_dataset += [
+                 "data/dataset/openai_age_2/"
+             ]
+
+         return additional_dataset
+
+     def _create_dataset(self, directory: str, label_idf: str,
+                         label_to_id: Dict, control_probe: bool = False) -> TextDataset:
+         """Create a dataset for training."""
+         additional_datasets = self._get_additional_datasets(label_idf, directory)
+
+         print(f"  πŸ“‚ Creating dataset from {directory}")
+         print(f"  πŸ“Ž Additional datasets: {len(additional_datasets)} sources")
+
+         try:
+             dataset = TextDataset(
+                 directory,
+                 self.tokenizer,
+                 self.model,
+                 label_idf=label_idf,
+                 label_to_id=label_to_id,
+                 convert_to_llama2_format=True,
+                 additional_datas=additional_datasets,
+                 new_format=self.new_prompt_format,
+                 control_probe=control_probe,
+                 residual_stream=self.residual_stream,
+                 if_augmented=self.augmented,
+                 remove_last_ai_response=self.remove_last_ai_response,
+                 include_inst=self.include_inst,
+                 k=1,
+                 one_hot=False,
+                 last_tok_pos=-1
+             )
+             print(f"  βœ… Dataset created with {len(dataset)} samples")
+             return dataset
+         except Exception as e:
+             print(f"  ❌ ERROR: Failed to create dataset: {e}")
+             raise
+
+     def _create_data_loaders(self, dataset: TextDataset) -> Tuple[DataLoader, DataLoader]:
+         """Create train and test data loaders."""
+         train_size = int(0.8 * len(dataset))
+         test_size = len(dataset) - train_size
+
+         print(f"  πŸ“Š Splitting dataset: {train_size} train, {test_size} test")
+
+         try:
+             train_idx, val_idx = sklearn.model_selection.train_test_split(
+                 list(range(len(dataset))),
+                 test_size=test_size,
+                 train_size=train_size,
+                 random_state=12345,
+                 shuffle=True,
+                 stratify=dataset.labels,
+             )
+
+             train_dataset = Subset(dataset, train_idx)
+             test_dataset = Subset(dataset, val_idx)
+
+             train_loader = DataLoader(
+                 train_dataset,
+                 shuffle=True,
+                 pin_memory=True,
+                 batch_size=200,
+                 num_workers=1
+             )
+
+             test_loader = DataLoader(
+                 test_dataset,
+                 shuffle=False,
+                 pin_memory=True,
+                 batch_size=400,
+                 num_workers=1
+             )
+
+             print(f"  βœ… Data loaders created")
+             return train_loader, test_loader
+
+         except Exception as e:
+             print(f"  ❌ ERROR: Failed to create data loaders: {e}")
+             raise
+
+     def _train_probe_for_layer(self, train_loader: DataLoader, test_loader: DataLoader,
+                                layer_num: int, num_classes: int, dict_name: str,
+                                save_dir: str, max_epochs: int = 50) -> Tuple[float, float, float]:
+         """Train a probe for a specific layer."""
+         trainer_config = TrainerConfig()
+
+         probe = LinearProbeClassification(
+             probe_class=num_classes,
+             device=self.device,
+             input_dim=5120,
+             logistic=self.logistic
+         )
+
+         optimizer, scheduler = probe.configure_optimizers(trainer_config)
+
+         if self.uncertainty:
+             loss_func = edl_mse_loss
+         else:
+             loss_func = nn.BCELoss()
+
+         best_acc = 0
+         final_test_acc = 0
+         final_train_acc = 0
+
+         for epoch in range(1, max_epochs + 1):
+             verbosity = (epoch == max_epochs)
+
+             # Training
+             if self.uncertainty:
+                 train_results = train(
+                     probe, self.device, train_loader, optimizer,
+                     epoch, loss_func=loss_func, verbose_interval=None,
+                     verbose=verbosity, layer_num=layer_num,
+                     return_raw_outputs=True, epoch_num=epoch,
+                     num_classes=num_classes
+                 )
+                 test_results = test(
+                     probe, self.device, test_loader, loss_func=loss_func,
+                     return_raw_outputs=True, verbose=verbosity,
+                     layer_num=layer_num, scheduler=scheduler,
+                     epoch_num=epoch, num_classes=num_classes
+                 )
+             else:
+                 train_results = train(
+                     probe, self.device, train_loader, optimizer,
+                     epoch, loss_func=loss_func, verbose_interval=None,
+                     verbose=verbosity, layer_num=layer_num,
+                     return_raw_outputs=True, one_hot=self.one_hot,
+                     num_classes=num_classes
+                 )
+                 test_results = test(
+                     probe, self.device, test_loader, loss_func=loss_func,
+                     return_raw_outputs=True, verbose=verbosity,
+                     layer_num=layer_num, scheduler=scheduler,
+                     one_hot=self.one_hot, num_classes=num_classes
+                 )
+
+             if test_results[1] > best_acc:
+                 best_acc = test_results[1]
+                 save_path = f"{save_dir}/{dict_name}_probe_at_layer_{layer_num}.pth"
+                 torch.save(probe.state_dict(), save_path)
+
+             if epoch == max_epochs:
+                 final_test_acc = test_results[1]
+                 final_train_acc = train_results[1]
+
+                 # Save final model
+                 final_path = f"{save_dir}/{dict_name}_probe_at_layer_{layer_num}_final.pth"
+                 torch.save(probe.state_dict(), final_path)
+
+                 # Generate confusion matrix
+                 if verbosity:
+                     try:
+                         cm = confusion_matrix(test_results[3], test_results[2])
+                         cm_display = ConfusionMatrixDisplay(
+                             cm,
+                             display_labels=list(self.label_mappings[f"_{dict_name}_"].keys())
+                         ).plot()
+                         plt.savefig(f"{save_dir}/{dict_name}_layer_{layer_num}_confusion.png")
+                         plt.close()
+                     except Exception as e:
+                         print(f"  ⚠️ Warning: Could not generate confusion matrix: {e}")
+
+         return best_acc, final_test_acc, final_train_acc
+
+     def train_probes(self, probe_type: str = "reading", num_layers: int = 41):
+         """
+         Train probes for all attributes and layers.
+
+         Args:
+             probe_type: Type of probe to train ("reading" or "controlling")
+             num_layers: Number of layers to train probes for
+         """
+         print(f"\n{'='*80}")
+         print(f"🎯 Training {probe_type.upper()} PROBES")
+         print(f"{'='*80}\n")
+
+         # Create output directory
+         save_dir = f"probe_checkpoints/{probe_type}_probe"
+         Path(save_dir).mkdir(parents=True, exist_ok=True)
+         print(f"πŸ“ Output directory: {save_dir}")
+
+         accuracy_dict = {}
+         control_probe = (probe_type == "controlling")
+
+         for directory, label_idf in self.dataset_configs:
+             dict_name = label_idf.strip("_")
+             label_to_id = self.label_mappings[label_idf]
+
+             print(f"\n{'-'*60}")
+             print(f"🏷️ Processing: {self.prompt_translator[label_idf].upper()}")
+             print(f"   Classes: {list(label_to_id.keys())}")
+             print(f"{'-'*60}")
+
+             try:
+                 # Create dataset
+                 dataset = self._create_dataset(
+                     directory, label_idf, label_to_id, control_probe
+                 )
+
+                 # Create data loaders
+                 train_loader, test_loader = self._create_data_loaders(dataset)
+
+                 # Initialize accuracy tracking
+                 accuracy_dict[dict_name] = []
+                 accuracy_dict[dict_name + "_final"] = []
+                 accuracy_dict[dict_name + "_train"] = []
+
+                 accs = []
+                 final_accs = []
+                 train_accs = []
+
+                 # Train probes for each layer
+                 print(f"\n  πŸ”„ Training probes for {num_layers} layers...")
+                 for layer_num in tqdm(range(num_layers), desc=f"  Layers for {dict_name}"):
+                     try:
+                         print(f"\n  Layer {layer_num}:")
+                         best_acc, final_test_acc, final_train_acc = self._train_probe_for_layer(
+                             train_loader, test_loader, layer_num,
+                             len(label_to_id), dict_name, save_dir
+                         )
+
+                         accs.append(best_acc)
+                         final_accs.append(final_test_acc)
+                         train_accs.append(final_train_acc)
+
+                         print(f"  πŸ“ˆ Best: {best_acc:.3f}, Final: {final_test_acc:.3f}, Train: {final_train_acc:.3f}")
+
+                     except Exception as e:
+                         print(f"  ❌ ERROR: Failed to train layer {layer_num}: {e}")
+                         accs.append(0)
+                         final_accs.append(0)
+                         train_accs.append(0)
+
+                 # Save accuracies
+                 accuracy_dict[dict_name] = accs
+                 accuracy_dict[dict_name + "_final"] = final_accs
+                 accuracy_dict[dict_name + "_train"] = train_accs
+
+                 # Save intermediate results
+                 with open(f"{save_dir}_experiment.pkl", "wb") as outfile:
+                     pickle.dump(accuracy_dict, outfile)
+                 print(f"  πŸ’Ύ Saved results to {save_dir}_experiment.pkl")
+
+                 # Clean up memory
+                 del dataset, train_loader, test_loader
+                 torch.cuda.empty_cache()
+                 print(f"  🧹 Cleaned up memory")
+
+             except Exception as e:
+                 print(f"  ❌ ERROR: Failed to process {dict_name}: {e}")
+                 continue
+
+         print(f"\n{'='*80}")
+         print(f"βœ… COMPLETED {probe_type.upper()} PROBE TRAINING")
+         print(f"{'='*80}\n")
+
+         # Print summary
+         self._print_summary(accuracy_dict, probe_type)
+
+         return accuracy_dict
+
+     def _print_summary(self, accuracy_dict: Dict, probe_type: str):
+         """Print a summary of training results."""
+         print(f"\nπŸ“Š SUMMARY for {probe_type} probes:")
+         print("-" * 40)
+
+         for attribute in accuracy_dict:
+             if not attribute.endswith("_final") and not attribute.endswith("_train"):
+                 best_accs = accuracy_dict[attribute]
+                 if best_accs:
+                     max_acc = max(best_accs)
+                     best_layer = best_accs.index(max_acc)
+                     avg_acc = sum(best_accs) / len(best_accs)
+                     print(f"  {attribute:12s}: Best={max_acc:.3f} (layer {best_layer}), Avg={avg_acc:.3f}")
+
+
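+ # Programmatic use (sketch; assumes the datasets under data/dataset/ are present
+ # and an HF token with Llama-2 access is configured):
+ #
+ #   trainer = ProbeTrainer(model_name="meta-llama/Llama-2-13b-chat-hf", device="cuda")
+ #   reading_results = trainer.train_probes("reading", num_layers=41)
+
+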
+ def main():
+     """Main entry point for the script."""
+     parser = argparse.ArgumentParser(description="Train reading and controlling probes for LLM attribute detection")
+     parser.add_argument("--probe-type", choices=["reading", "controlling", "both"], default="both",
+                         help="Type of probes to train")
+     parser.add_argument("--model", default="meta-llama/Llama-2-13b-chat-hf",
+                         help="HuggingFace model to use")
+     parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"],
+                         help="Device to use for training")
+     parser.add_argument("--num-layers", type=int, default=41,
+                         help="Number of layers to train probes for")
+     parser.add_argument("--no-auth", action="store_true",
+                         help="Don't use an authentication token")
+
+     args = parser.parse_args()
+
+     print(f"""
+     ╔══════════════════════════════════════════════════════════════╗
+     β•‘                 LLM Probe Training System                    β•‘
+     β•‘                                                              β•‘
+     β•‘ Model: {args.model:50s}   β•‘
+     β•‘ Device: {args.device:49s}   β•‘
+     β•‘ Probe Type: {args.probe_type:45s}   β•‘
+     β•‘ Layers: {args.num_layers:49d}   β•‘
+     β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
+     """)
+
+     start_time = time.time()
+
+     try:
+         # Initialize trainer
+         trainer = ProbeTrainer(
+             model_name=args.model,
+             device=args.device,
+             use_auth_token=not args.no_auth
+         )
+
+         # Train probes
+         if args.probe_type == "both":
+             print("\nπŸš€ Training both reading and controlling probes...")
+             reading_results = trainer.train_probes("reading", args.num_layers)
+             controlling_results = trainer.train_probes("controlling", args.num_layers)
+         elif args.probe_type == "reading":
+             reading_results = trainer.train_probes("reading", args.num_layers)
+         else:
+             controlling_results = trainer.train_probes("controlling", args.num_layers)
+
+         elapsed_time = time.time() - start_time
+         print(f"\n⏱️ Total training time: {elapsed_time/60:.2f} minutes")
+         print("βœ… Training completed successfully!")
+
+     except KeyboardInterrupt:
+         print("\n\n⚠️ Training interrupted by user")
+         sys.exit(1)
+     except Exception as e:
+         print(f"\n❌ FATAL ERROR: {e}")
+         import traceback
+         traceback.print_exc()
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
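+
+
+ # Example invocations (the flags are defined in main() above):
+ #   python train_probes.py --probe-type reading --num-layers 41
+ #   python train_probes.py --probe-type both --device cuda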
train_probes_minimal.py ADDED
@@ -0,0 +1,399 @@
+ #!/usr/bin/env python3
+ """
+ Minimal probe training script for HuggingFace Spaces.
+ Uses a smaller model (GPT-2) for demonstration on limited resources.
+ """
+
+ import os
+ import sys
+ import json
+ import pickle
+ import time
+ import logging
+ from pathlib import Path
+ from typing import Dict, List, Tuple, Optional
+ import numpy as np
+ from datetime import datetime
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset, DataLoader, TensorDataset
+ from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2Model, GPT2Tokenizer
+ from tqdm.auto import tqdm
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, confusion_matrix
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+
+
+ # Simple probe architecture: a single linear layer over one layer's activations
+ class SimpleProbe(nn.Module):
+     def __init__(self, input_dim, num_classes):
+         super().__init__()
+         self.fc = nn.Linear(input_dim, num_classes)
+
+     def forward(self, x):
+         return self.fc(x)
+
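+ # Quick shape check for SimpleProbe (illustrative; 768 is the GPT-2 base hidden size):
+ #   probe = SimpleProbe(input_dim=768, num_classes=4)
+ #   logits = probe(torch.randn(32, 768))   # -> torch.Size([32, 4])
+
+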
+ def setup_logging(experiment_name: str = "probe_training") -> logging.Logger:
+     """Set up the logging configuration."""
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+     # Create the logs directory
+     log_dir = Path(f"experiments/{experiment_name}/logs")
+     log_dir.mkdir(parents=True, exist_ok=True)
+
+     # Configure logging
+     log_file = log_dir / f"training_log_{timestamp}.txt"
+
+     # Create the logger
+     logger = logging.getLogger('probe_training')
+     logger.setLevel(logging.DEBUG)
+
+     # File handler - detailed logs
+     file_handler = logging.FileHandler(log_file)
+     file_handler.setLevel(logging.DEBUG)
+     file_formatter = logging.Formatter(
+         '%(asctime)s - %(levelname)s - %(message)s',
+         datefmt='%Y-%m-%d %H:%M:%S'
+     )
+     file_handler.setFormatter(file_formatter)
+
+     # Console handler - simplified logs
+     console_handler = logging.StreamHandler()
+     console_handler.setLevel(logging.INFO)
+     console_formatter = logging.Formatter('%(message)s')
+     console_handler.setFormatter(console_formatter)
+
+     # Add handlers
+     logger.addHandler(file_handler)
+     logger.addHandler(console_handler)
+
+     return logger
+
+
+ class MinimalProbeTrainer:
+     """Minimal probe trainer using GPT-2 for demonstration."""
+
+     def __init__(self, model_name="gpt2", device="cpu", logger=None):
+         self.device = device
+         self.model_name = model_name
+         self.logger = logger or logging.getLogger('probe_training')
+
+         self.logger.info(f"πŸš€ Initializing with {model_name} on {device}")
+         print(f"πŸš€ Initializing with {model_name} on {device}")
+
+         # Load a smaller model for demonstration
+         self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+         self.model = GPT2Model.from_pretrained(model_name)
+         self.model.to(device)
+         self.model.eval()
+
+         # GPT-2 hidden size
+         self.hidden_size = 768  # GPT-2 base
+         self.num_layers = len(self.model.h)  # 12 layers for GPT-2 base
+
+         self.logger.info(f"βœ… Model loaded: {self.num_layers} layers, hidden size {self.hidden_size}")
+         self.logger.debug(f"Model parameters: {sum(p.numel() for p in self.model.parameters()):,}")
+         print(f"βœ… Model loaded: {self.num_layers} layers, hidden size {self.hidden_size}")
+
+     def generate_synthetic_data(self, num_samples=1000, num_classes=4):
+         """Generate synthetic data for demonstration."""
+         print(f"πŸ“Š Generating {num_samples} synthetic samples...")
+
+         # Generate random hidden states
+         X = torch.randn(num_samples, self.hidden_size)
+
+         # Create synthetic labels with some pattern:
+         # make the data somewhat learnable by adding class-specific signals
+         y = torch.randint(0, num_classes, (num_samples,))
+
+         for i in range(num_classes):
+             mask = y == i
+             # Add a class-specific signal to the features
+             X[mask] += torch.randn(1, self.hidden_size) * 0.5
+
+         return X, y
+
+     def evaluate_probe(self, probe, data_loader, device):
+         """Evaluate probe accuracy without training."""
+         probe.eval()
+         correct = 0
+         total = 0
+         all_preds = []
+         all_labels = []
+
+         with torch.no_grad():
+             for batch_x, batch_y in data_loader:
+                 batch_x, batch_y = batch_x.to(device), batch_y.to(device)
+                 outputs = probe(batch_x)
+                 _, predicted = outputs.max(1)
+                 total += batch_y.size(0)
+                 correct += predicted.eq(batch_y).sum().item()
+                 all_preds.extend(predicted.cpu().numpy())
+                 all_labels.extend(batch_y.cpu().numpy())
+
+         accuracy = 100. * correct / total
+         return accuracy, all_preds, all_labels
+
+     def train_probe_for_layer(self, X_train, y_train, X_test, y_test,
+                               num_classes, layer_idx, epochs=20):
+         """Train a probe for a specific layer."""
+         probe = SimpleProbe(self.hidden_size, num_classes).to(self.device)
+         optimizer = torch.optim.Adam(probe.parameters(), lr=0.001)
+         criterion = nn.CrossEntropyLoss()
+
+         # Create data loaders
+         train_dataset = TensorDataset(X_train, y_train)
+         test_dataset = TensorDataset(X_test, y_test)
+         train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
+         test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
+
+         # Measure initial performance BEFORE any training
+         initial_train_acc, _, _ = self.evaluate_probe(probe, train_loader, self.device)
+         initial_test_acc, _, _ = self.evaluate_probe(probe, test_loader, self.device)
+
+         self.logger.info(f"    Layer {layer_idx} - Initial (untrained): Train Acc: {initial_train_acc:.2f}%, Test Acc: {initial_test_acc:.2f}%")
+         print(f"    Layer {layer_idx} - Initial (untrained): Train Acc: {initial_train_acc:.2f}%, Test Acc: {initial_test_acc:.2f}%")
+
+         train_accs = [initial_train_acc]  # Start with the untrained accuracy
+         test_accs = [initial_test_acc]
+
+         for epoch in range(epochs):
+             # Training
+             probe.train()
+             train_loss = 0
+             train_correct = 0
+             train_total = 0
+
+             for batch_x, batch_y in train_loader:
+                 batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)
+
+                 optimizer.zero_grad()
+                 outputs = probe(batch_x)
+                 loss = criterion(outputs, batch_y)
+                 loss.backward()
+                 optimizer.step()
+
+                 train_loss += loss.item()
+                 _, predicted = outputs.max(1)
+                 train_total += batch_y.size(0)
+                 train_correct += predicted.eq(batch_y).sum().item()
+
+             # Testing
+             probe.eval()
+             test_correct = 0
+             test_total = 0
+             all_preds = []
+             all_labels = []
+
+             with torch.no_grad():
+                 for batch_x, batch_y in test_loader:
+                     batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)
+                     outputs = probe(batch_x)
+                     _, predicted = outputs.max(1)
+                     test_total += batch_y.size(0)
+                     test_correct += predicted.eq(batch_y).sum().item()
+
+                     all_preds.extend(predicted.cpu().numpy())
+                     all_labels.extend(batch_y.cpu().numpy())
+
+             train_acc = 100. * train_correct / train_total
+             test_acc = 100. * test_correct / test_total
+             train_accs.append(train_acc)
+             test_accs.append(test_acc)
+
+             if epoch == epochs - 1:
+                 improvement = test_acc - initial_test_acc
+                 print(f"    Layer {layer_idx} - Final: Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}% (Improved +{improvement:.2f}% from initial)")
+
+         return probe, train_accs, test_accs, all_preds, all_labels
+
+     def train_probes(self, attribute="age", num_layers_to_train=5):
+         """Train probes across multiple layers."""
+         print(f"\n{'='*60}")
+         print(f"🎯 Training probes for {attribute}")
+         print(f"{'='*60}\n")
+
+         # Attribute configurations
+         attribute_configs = {
+             "age": {"classes": ["child", "adolescent", "adult", "older_adult"], "num": 4},
+             "gender": {"classes": ["male", "female"], "num": 2},
+             "socioeco": {"classes": ["low", "middle", "high"], "num": 3},
+             "education": {"classes": ["some_school", "high_school", "college"], "num": 3}
+         }
+
+         config = attribute_configs.get(attribute, attribute_configs["age"])
+         num_classes = config["num"]
+         class_names = config["classes"]
+
+         # Generate synthetic data
+         X, y = self.generate_synthetic_data(num_samples=2000, num_classes=num_classes)
+
+         # Split data
+         X_train, X_test, y_train, y_test = train_test_split(
+             X, y, test_size=0.2, random_state=42, stratify=y
+         )
+
+         print(f"πŸ“Š Data split: {len(X_train)} train, {len(X_test)} test")
+         print(f"πŸ“Š Classes: {class_names}")
+
+         # Train probes for each layer
+         results = {
+             "attribute": attribute,
+             "num_classes": num_classes,
+             "class_names": class_names,
+             "layers": [],
+             "train_accuracies": [],
+             "test_accuracies": [],
+             "best_layer": -1,
+             "best_accuracy": 0
+         }
+
+         num_layers = min(num_layers_to_train, self.num_layers)
+         print(f"\nπŸ”„ Training probes for {num_layers} layers...")
+
+         for layer_idx in tqdm(range(num_layers), desc="Layers"):
+             # Add some variation to the data for different layers,
+             # simulating that middle layers carry a cleaner signal
+             layer_factor = 1.0 - abs(layer_idx - num_layers//2) / (num_layers/2)
+             X_train_layer = X_train + torch.randn_like(X_train) * (0.3 / (layer_factor + 0.1))
+             X_test_layer = X_test + torch.randn_like(X_test) * (0.3 / (layer_factor + 0.1))
+
+             probe, train_accs, test_accs, preds, labels = self.train_probe_for_layer(
+                 X_train_layer, y_train, X_test_layer, y_test,
+                 num_classes, layer_idx, epochs=10
+             )
+
+             final_test_acc = test_accs[-1]
+             results["layers"].append(layer_idx)
+             results["train_accuracies"].append(train_accs[-1])
+             results["test_accuracies"].append(final_test_acc)
+
+             if final_test_acc > results["best_accuracy"]:
+                 results["best_accuracy"] = final_test_acc
+                 results["best_layer"] = layer_idx
+                 results["best_confusion_matrix"] = confusion_matrix(labels, preds)
+
+         # Create performance visualization
+         self._plot_results(results)
+
+         return results
+
+     def _plot_results(self, results):
+         """Create a visualization of probe performance across layers."""
+         plt.figure(figsize=(12, 4))
+
+         # Plot 1: Accuracy across layers
+         plt.subplot(1, 3, 1)
+         plt.plot(results["layers"], results["train_accuracies"], 'b-', label='Train', marker='o')
+         plt.plot(results["layers"], results["test_accuracies"], 'r-', label='Test', marker='s')
+         plt.axhline(y=100/results["num_classes"], color='gray', linestyle='--', label='Random')
+         plt.xlabel('Layer')
+         plt.ylabel('Accuracy (%)')
+         plt.title(f'{results["attribute"].capitalize()} Probe Performance')
+         plt.legend()
+         plt.grid(True, alpha=0.3)
+
+         # Plot 2: Best layer highlight
+         plt.subplot(1, 3, 2)
+         colors = ['red' if i != results["best_layer"] else 'green' for i in results["layers"]]
+         plt.bar(results["layers"], results["test_accuracies"], color=colors)
+         plt.xlabel('Layer')
+         plt.ylabel('Test Accuracy (%)')
+         plt.title(f'Best Layer: {results["best_layer"]} ({results["best_accuracy"]:.1f}%)')
+         plt.grid(True, alpha=0.3)
+
+         # Plot 3: Confusion matrix for the best layer
+         plt.subplot(1, 3, 3)
+         cm = results["best_confusion_matrix"]
+         sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
+                     xticklabels=results["class_names"],
+                     yticklabels=results["class_names"])
+         plt.title(f'Confusion Matrix (Layer {results["best_layer"]})')
+         plt.ylabel('True Label')
+         plt.xlabel('Predicted Label')
+
+         plt.tight_layout()
+
+         # Save the plot
+         output_file = f"probe_results_{results['attribute']}_{time.strftime('%Y%m%d_%H%M%S')}.png"
+         plt.savefig(output_file, dpi=150, bbox_inches='tight')
+         plt.close()
+
+         print(f"\nπŸ“Š Visualization saved to {output_file}")
+
+         return output_file
+
+
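+ # Single-attribute run (sketch; downloads GPT-2 on first use and trains on the
+ # synthetic data generated above):
+ #
+ #   trainer = MinimalProbeTrainer(device="cpu")
+ #   results = trainer.train_probes(attribute="gender", num_layers_to_train=4)
+ #   print(results["best_layer"], results["best_accuracy"])
+
+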
+ def run_full_training(experiment_name: str = "01_gpt2_synthetic_demo"):
+     """Run the complete training demonstration with logging."""
+
+     # Setup logging
+     logger = setup_logging(experiment_name)
+
+     logger.info("="*80)
+     logger.info("Starting TalkTuner Probe Training")
+     logger.info("="*80)
+
+     print("""
+     ╔══════════════════════════════════════════════════════════════╗
+     β•‘           Minimal Probe Training Demonstration               β•‘
+     β•‘                                                              β•‘
+     β•‘   This uses GPT-2 with synthetic data for demonstration      β•‘
+     β•‘   Real training would use Llama-2-13b with actual datasets   β•‘
+     β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
+     """)
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     # Log system information
+     logger.info(f"Python version: {sys.version}")
+     logger.info(f"PyTorch version: {torch.__version__}")
+     logger.info(f"Device: {device}")
+     logger.info(f"CUDA available: {torch.cuda.is_available()}")
+     if torch.cuda.is_available():
+         logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
+
+     trainer = MinimalProbeTrainer(device=device, logger=logger)
+
+     all_results = {}
+
+     # Train probes for each attribute
+     for attribute in ["age", "gender", "socioeco", "education"]:
+         results = trainer.train_probes(attribute=attribute, num_layers_to_train=8)
+         all_results[attribute] = results
+
+         print(f"\nβœ… {attribute.capitalize()} Results:")
+         print(f"   Best Layer: {results['best_layer']}")
+         print(f"   Best Accuracy: {results['best_accuracy']:.2f}%")
+         print(f"   Improvement over random: {results['best_accuracy'] - 100/results['num_classes']:.2f}%")
+
+     # Save all results
+     output_file = f"probe_training_results_{time.strftime('%Y%m%d_%H%M%S')}.json"
+     with open(output_file, "w") as f:
+         # Convert numpy arrays to lists for JSON serialization
+         json_results = {}
+         for attr, res in all_results.items():
+             json_results[attr] = {
+                 k: v.tolist() if isinstance(v, np.ndarray) else v
+                 for k, v in res.items() if k != "best_confusion_matrix"
+             }
+         json.dump(json_results, f, indent=2)
+
+     print(f"\nπŸ“Š Full results saved to {output_file}")
+
+     # Summary
+     print("\n" + "="*60)
+     print("TRAINING SUMMARY")
+     print("="*60)
+     for attr, res in all_results.items():
+         improvement = res['best_accuracy'] - 100/res['num_classes']
+         print(f"{attr:12s}: Layer {res['best_layer']:2d} | Accuracy: {res['best_accuracy']:5.1f}% | Improvement: +{improvement:4.1f}%")
+
+     return all_results
+
+
+ if __name__ == "__main__":
+     # Allow passing the experiment name as a command-line argument
+     experiment_name = sys.argv[1] if len(sys.argv) > 1 else "02_real_initial_performance"
+     results = run_full_training(experiment_name)
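+
+
+ # Example invocation (the experiment name argument is optional):
+ #   python train_probes_minimal.py my_probe_experiment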