UserLM

Sleeping

App Files Files Community

pszemraj committed on Oct 11

Commit

5bc353f

verified ·

1 Parent(s): 8f03206

Update app.py

Browse files

Files changed (1) hide show

app.py +183 -52

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 from __future__ import annotations
 import os
-from typing import List, Tuple, Dict, Any
-import spaces
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
 # ----------------------
 # Config
@@ -19,6 +20,7 @@ DEFAULT_SYSTEM_PROMPT = (
 device = "cuda" if torch.cuda.is_available() else "cpu"
 def load_model(model_id: str = MODEL_ID):
     """Load tokenizer and model, with a reasonable dtype and device fallback."""
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
@@ -36,16 +38,25 @@ def load_model(model_id: str = MODEL_ID):
     end_token_ids = tokenizer.encode(end_token, add_special_tokens=False)
     end_conv_token_ids = tokenizer.encode(end_conv_token, add_special_tokens=False)
-    # Some models may not include these tokens — handle gracefully
-    eos_token_id = end_token_ids[0] if len(end_token_ids) > 0 else tokenizer.eos_token_id
     bad_words_ids = (
         [[tid] for tid in end_conv_token_ids] if len(end_conv_token_ids) > 0 else None
     )
-    return tokenizer, model, eos_token_id, bad_words_ids
-tokenizer, model, EOS_TOKEN_ID, BAD_WORDS_IDS = load_model()
 model = model.to(device)
 model.eval()
@@ -53,7 +64,10 @@ model.eval()
 # Generation helper
 # ----------------------
-def build_messages(system_prompt: str, history: List[Tuple[str, str]]) -> List[Dict[str, str]]:
     """Transform Gradio history [(user, assistant), ...] into chat template messages."""
     messages: List[Dict[str, str]] = []
     if system_prompt.strip():
@@ -66,51 +80,123 @@ def build_messages(system_prompt: str, history: List[Tuple[str, str]]) -> List[D
     return messages
 @spaces.GPU
 def generate_reply(
     messages: List[Dict[str, str]],
     max_new_tokens: int = 256,
-    temperature: float = 0.8,
-    top_p: float = 0.9,
 ) -> str:
-    """Run a single generate() step and return the model's text reply."""
-    # Prepare input ids using the model's chat template
-    inputs = tokenizer.apply_chat_template(
-        messages,
-        return_tensors="pt",
-        add_generation_prompt=True,
-    ).to(device)
-    with torch.no_grad():
-        outputs = model.generate(
-            input_ids=inputs,
-            do_sample=True,
-            top_p=top_p,
-            temperature=temperature,
-            max_new_tokens=max_new_tokens,
-            eos_token_id=EOS_TOKEN_ID,
-            pad_token_id=tokenizer.eos_token_id,
-            bad_words_ids=BAD_WORDS_IDS,
-        )
-    # Slice off the prompt tokens to get only the new text
-    generated = outputs[0][inputs.shape[1]:]
-    text = tokenizer.decode(generated, skip_special_tokens=True).strip()
-    return text
 # ----------------------
 # Gradio UI callbacks
 # ----------------------
-def respond(user_message: str, chat_history: List[Tuple[str, str]], system_prompt: str,
-            max_new_tokens: int, temperature: float, top_p: float):
     # Build messages including prior turns
     messages = build_messages(system_prompt, chat_history + [(user_message, "")])
     try:
         reply = generate_reply(
             messages,
             max_new_tokens=max_new_tokens,
             temperature=temperature,
             top_p=top_p,
@@ -130,45 +216,90 @@ def clear_state():
 # Build the Gradio App
 # ----------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🧪 Transformers × Gradio: Multi‑turn Chat Demo
-    Model: **{model}** on **{device}**
-    Change the system prompt, then chat. Sliders control sampling.
-    """.format(model=MODEL_ID, device=device))
     with gr.Row():
         system_box = gr.Textbox(
-            label="System Prompt",
             value=DEFAULT_SYSTEM_PROMPT,
             lines=3,
-            placeholder="Enter a system instruction to steer the assistant",
         )
-    chatbot = gr.Chatbot(height=420, label="Chat")
     with gr.Row():
         msg = gr.Textbox(
-            label="Your message",
-            placeholder="Type a message and press Enter",
         )
-    with gr.Accordion("Generation Settings", open=False):
-        max_new_tokens = gr.Slider(16, 1024, value=256, step=1, label="max_new_tokens")
-        temperature = gr.Slider(0.0, 2.0, value=0.8, step=0.05, label="temperature")
-        top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="top_p")
     with gr.Row():
-        submit_btn = gr.Button("Send", variant="primary")
         clear_btn = gr.Button("Clear")
     state = gr.State([])  # chat history state: List[Tuple[user, assistant]]
     def _submit(user_text, history, system_prompt, mnt, temp, tp):
         if not user_text or not user_text.strip():
             return gr.update(), history
-        new_history, visible = respond(user_text.strip(), history, system_prompt, mnt, temp, tp)
         return "", visible
     submit_btn.click(
@@ -195,4 +326,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     clear_btn.click(_clear, outputs=[state, system_box, chatbot, msg])
 if __name__ == "__main__":
-    demo.queue().launch()  # enable queuing for concurrency

 from __future__ import annotations
 import os
+from typing import Any, Dict, List, Tuple
 import gradio as gr
+import spaces
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
 # ----------------------
 # Config
 device = "cuda" if torch.cuda.is_available() else "cpu"
 def load_model(model_id: str = MODEL_ID):
     """Load tokenizer and model, with a reasonable dtype and device fallback."""
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     end_token_ids = tokenizer.encode(end_token, add_special_tokens=False)
     end_conv_token_ids = tokenizer.encode(end_conv_token, add_special_tokens=False)
+    # Guardrail 1: Problematic first tokens that cause repetition (from Appendix C.1)
+    problematic_tokens = ["I", "You", "Here", "i", "you", "here"]
+    first_token_filter_ids = []
+    for token in problematic_tokens:
+        token_ids = tokenizer.encode(token, add_special_tokens=False)
+        if len(token_ids) > 0:
+            first_token_filter_ids.append(token_ids[0])
+    eos_token_id = (
+        end_token_ids[0] if len(end_token_ids) > 0 else tokenizer.eos_token_id
+    )
     bad_words_ids = (
         [[tid] for tid in end_conv_token_ids] if len(end_conv_token_ids) > 0 else None
     )
+    return tokenizer, model, eos_token_id, bad_words_ids, first_token_filter_ids
+tokenizer, model, EOS_TOKEN_ID, BAD_WORDS_IDS, FIRST_TOKEN_FILTER_IDS = load_model()
 model = model.to(device)
 model.eval()
 # Generation helper
 # ----------------------
+def build_messages(
+    system_prompt: str, history: List[Tuple[str, str]]
+) -> List[Dict[str, str]]:
     """Transform Gradio history [(user, assistant), ...] into chat template messages."""
     messages: List[Dict[str, str]] = []
     if system_prompt.strip():
     return messages
+def apply_first_token_filter(
+    logits: torch.Tensor, filter_ids: List[int]
+) -> torch.Tensor:
+    """Suppress the given token ids at the last position of the first row (Guardrail 1).
+
+    The input tensor is never modified: a clone is edited and returned. Only the
+    entries ``[0, -1, token_id]`` are written, so every other element of the
+    result equals the input.
+
+    Args:
+        logits: Scores to filter; must be indexable as ``[0, -1, token_id]``
+            (presumably ``(batch, sequence, vocab)`` -- confirm with the caller).
+        filter_ids: Token ids whose score is replaced with negative infinity.
+
+    Returns:
+        A copy of ``logits`` with the selected scores set to ``-inf``.
+    """
+    neg_inf = float("-inf")
+    masked = logits.clone()
+    for banned_id in filter_ids:
+        masked[0, -1, banned_id] = neg_inf
+    return masked
+def is_valid_length(text: str, min_words: int = 3, max_words: int = 50) -> bool:
+    """Report whether ``text`` has an acceptable number of words (Guardrail 3).
+
+    Words are the whitespace-separated pieces returned by ``str.split()``.
+    The paper's simulation experiments used ``max_words=25``; this interactive
+    demo defaults to 50 so replies can run slightly longer while still keeping
+    the model from revealing the entire intent at once.
+
+    Args:
+        text: Generated text to measure.
+        min_words: Smallest accepted word count (inclusive).
+        max_words: Largest accepted word count (inclusive).
+
+    Returns:
+        True when the word count lies within ``[min_words, max_words]``.
+    """
+    n_words = len(text.split())
+    if n_words < min_words:
+        return False
+    return n_words <= max_words
+def is_verbatim_repetition(
+    new_text: str, history: List[Tuple[str, str]], system_prompt: str
+) -> bool:
+    """Detect an exact repeat of the system prompt or an earlier turn (Guardrail 4).
+
+    Comparison ignores surrounding whitespace and letter case. Empty or missing
+    history entries are never treated as a match.
+
+    Args:
+        new_text: Freshly generated text to check.
+        history: Conversation pairs; only the first element of each pair is
+            compared against ``new_text``.
+        system_prompt: Prompt text that the reply must not simply echo.
+
+    Returns:
+        True if ``new_text`` equals the system prompt or the first element of
+        any history pair after normalization, otherwise False.
+    """
+    candidate = new_text.strip().lower()
+    if candidate == system_prompt.strip().lower():
+        return True
+    # NOTE(review): only pair[0] is inspected -- confirm that this is where the
+    # simulated user's earlier turns are stored, rather than pair[1].
+    return any(
+        user_msg and candidate == user_msg.strip().lower()
+        for user_msg, _ in history
+    )
 @spaces.GPU
 def generate_reply(
     messages: List[Dict[str, str]],
+    history: List[Tuple[str, str]],
+    system_prompt: str,
     max_new_tokens: int = 256,
+    temperature: float = 1.0,
+    top_p: float = 0.8,
+    max_retries: int = 5,
 ) -> str:
+    """Run generation with guardrails from Appendix C.1.
+    Implements all 4 guardrails from the paper:
+    1. Filter problematic first tokens
+    2. Optionally avoid dialogue termination (disabled by default for demo)
+    3. Enforce length thresholds with retry
+    4. Filter verbatim repetitions with retry
+    """
+    for attempt in range(max_retries):
+        # Prepare input ids using the model's chat template
+        inputs = tokenizer.apply_chat_template(
+            messages,
+            return_tensors="pt",
+            add_generation_prompt=True,
+        ).to(device)
+        with torch.no_grad():
+            outputs = model.generate(
+                input_ids=inputs,
+                do_sample=True,
+                top_p=top_p,
+                temperature=temperature,
+                max_new_tokens=max_new_tokens,
+                eos_token_id=EOS_TOKEN_ID,
+                pad_token_id=tokenizer.eos_token_id,
+                bad_words_ids=BAD_WORDS_IDS,  # Prevents <|endconversation|>
+            )
+        # Slice off the prompt tokens to get only the new text
+        generated = outputs[0][inputs.shape[1] :]
+        text = tokenizer.decode(generated, skip_special_tokens=True).strip()
+        # Apply guardrails - retry if checks fail
+        if not is_valid_length(text):
+            continue
+        if is_verbatim_repetition(text, history, system_prompt):
+            continue
+        # Success - return the valid text
+        return text
+    # If all retries failed, return a fallback message
+    return "(Unable to generate valid response after multiple attempts)"
 # ----------------------
 # Gradio UI callbacks
 # ----------------------
+def respond(
+    user_message: str,
+    chat_history: List[Tuple[str, str]],
+    system_prompt: str,
+    max_new_tokens: int,
+    temperature: float,
+    top_p: float,
+):
     # Build messages including prior turns
     messages = build_messages(system_prompt, chat_history + [(user_message, "")])
     try:
         reply = generate_reply(
             messages,
+            chat_history,
+            system_prompt,
             max_new_tokens=max_new_tokens,
             temperature=temperature,
             top_p=top_p,
 # Build the Gradio App
 # ----------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        f"""
+    # UserLM-8b: User Language Model Demo
+    **Model:** `{MODEL_ID}` on **{device}**
+    This demo implements the generation guardrails from [Appendix C.1](https://arxiv.org/abs/2510.06552) of the paper:
+    - Filters problematic first tokens (I, You, Here) that cause repetition
+    - Enforces length thresholds (3-50 words per turn)
+    - Prevents verbatim repetition of prior turns
+    - Uses recommended sampling params: temp=1.0, top_p=0.8
+    **Note:** Unlike typical assistant LMs, UserLM simulates *human users* in conversations.
+    The system prompt defines the user's high-level intent.
+    """
+    )
     with gr.Row():
         system_box = gr.Textbox(
+            label="User Intent (System Prompt)",
             value=DEFAULT_SYSTEM_PROMPT,
             lines=3,
+            placeholder="Enter a high-level user intent (e.g., 'You are a user who wants to...')",
         )
+    chatbot = gr.Chatbot(height=420, label="Simulated User-Assistant Conversation")
     with gr.Row():
         msg = gr.Textbox(
+            label="Assistant Response",
+            placeholder="Type the assistant's response to the user",
+            lines=2,
         )
+    with gr.Accordion(
+        "Generation Settings (Based on Paper Recommendations)", open=False
+    ):
+        max_new_tokens = gr.Slider(
+            16,
+            512,
+            value=256,
+            step=16,
+            label="max_new_tokens",
+            info="Max tokens per user turn. Paper used stricter limits for simulation.",
+        )
+        temperature = gr.Slider(
+            0.0,
+            2.0,
+            value=1.0,
+            step=0.05,
+            label="temperature",
+            info="Paper recommends 1.0 for realistic user diversity",
+        )
+        top_p = gr.Slider(
+            0.0,
+            1.0,
+            value=0.8,
+            step=0.01,
+            label="top_p",
+            info="Paper recommends 0.8 (not 0.9)",
+        )
     with gr.Row():
+        submit_btn = gr.Button("Generate User Response", variant="primary")
         clear_btn = gr.Button("Clear")
     state = gr.State([])  # chat history state: List[Tuple[user, assistant]]
+    gr.Markdown(
+        """
+    ### Usage Tips:
+    - The **system prompt** defines the user's goal (keep it high-level, not overly specific)
+    - Type what the **assistant says** in response
+    - Click **Generate User Response** to simulate how a human user would reply
+    - UserLM naturally reveals intent across multiple turns, not all at once
+    """
+    )
     def _submit(user_text, history, system_prompt, mnt, temp, tp):
+        """Run one `respond` turn for the submitted text, ignoring blank input.
+
+        Returns a pair. For blank input it is `(gr.update(), history)`; otherwise
+        it is `("", visible)`, where `visible` is the second value returned by
+        `respond`. Presumably the first item targets the message box and the
+        second the displayed chat -- confirm against the click() outputs.
+        """
+        # Blank or whitespace-only text: hand the history back unchanged.
         if not user_text or not user_text.strip():
             return gr.update(), history
+        new_history, visible = respond(
+            user_text.strip(), history, system_prompt, mnt, temp, tp
+        )
+        # NOTE(review): `new_history` is never used here; only `visible` is returned.
         return "", visible
     submit_btn.click(
     clear_btn.click(_clear, outputs=[state, system_box, chatbot, msg])
 if __name__ == "__main__":
+    demo.queue().launch()