PromptEnhancer_32B-FlashPack

Sleeping

App Files Files Community

rahul7star committed on Oct 16

Commit

2c51b55

verified ·

1 Parent(s): ebe1e32

Update app_low.py

Browse files

Files changed (1) hide show

app_low.py +45 -88

app_low.py CHANGED Viewed

@@ -1,114 +1,71 @@
-import os
-import torch
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
-# =========================================================
-# 1️⃣ Configuration
-# =========================================================
 MODEL_ID = "Qwen/Qwen2.5-1.5B"
-# Space-friendly settings
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-print(f"🔹 Loading model: {MODEL_ID}")
-# Device setup
-if torch.cuda.is_available():
-    device = "cuda"
-    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-    print("⚙️ Using GPU for inference.")
-else:
-    device = "cpu"
-    dtype = torch.float32
-    print("⚙️ Using CPU (with offload folder).")
-# =========================================================
-# 2️⃣ Load Model + Tokenizer (streaming from HF Hub)
-# =========================================================
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=dtype,
-    device_map="auto" if device == "cuda" else {"": "cpu"},
     low_cpu_mem_usage=True,
-    offload_folder="./offload" if device == "cpu" else None,
 )
-model.eval()
-# =========================================================
-# 3️⃣ Inference Function
-# =========================================================
-def chat_with_qwen(user_input, temperature, max_tokens, chat_history):
-    """Chat or enhance text using Qwen2.5-1.5B."""
-    if not user_input.strip():
-        return chat_history + [["", "⚠️ Please enter some text."]]
-    messages = [{"role": "user", "content": user_input}]
     inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
-        return_tensors="pt",
-    ).to(model.device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=int(max_tokens),
-            temperature=float(temperature),
-            top_p=0.9,
             do_sample=True,
-            repetition_penalty=1.05,
         )
-    result = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
-    chat_history = chat_history + [[user_input, result.strip()]]
-    return chat_history
-# =========================================================
-# 4️⃣ Gradio Interface
-# =========================================================
-with gr.Blocks(title="Qwen 2.5 1.5B Chat", theme=gr.themes.Soft()) as demo:
-    gr.Markdown(
-        """
-        # 🧠 Qwen 2.5 1.5B Chat / Prompt Enhancer
-        A lightweight reasoning-capable chat model that works fully on CPU or GPU.
-        Optimized for Hugging Face Spaces with offloading and streaming model load.
-        ---
-        """
-    )
-    with gr.Row():
-        chatbot = gr.Chatbot(height=420, label="Qwen 2.5 Chat")
-        with gr.Column(scale=1):
-            user_input = gr.Textbox(
-                placeholder="Type your question or prompt here...",
-                label="Your Message",
-                lines=3,
-            )
-            temperature = gr.Slider(0.0, 1.0, value=0.7, step=0.05, label="Temperature")
-            max_tokens = gr.Slider(32, 512, value=128, step=16, label="Max Tokens")
-            send_btn = gr.Button("🚀 Generate", variant="primary")
-            clear_btn = gr.Button("🧹 Clear Chat")
-    send_btn.click(chat_with_qwen, [user_input, temperature, max_tokens, chatbot], chatbot)
-    user_input.submit(chat_with_qwen, [user_input, temperature, max_tokens, chatbot], chatbot)
-    clear_btn.click(lambda: [], None, chatbot)
-    gr.Markdown(
-        """
-        ---
-        💡 **Tips:**
-        - Works with both creative and factual queries.
-        - Try: *“Describe a futuristic city skyline at dawn.”*
-        - Small enough to run smoothly on CPU (under 5 GB memory).
-        """
-    )
-# =========================================================
-# 5️⃣ Launch App
-# =========================================================
-if __name__ == "__main__":
-    demo.launch(show_error=True, share=True)

 import gradio as gr
+import torch
+# snapshot_download is provided by huggingface_hub (a hard dependency of
+# transformers); it is not importable from the top-level transformers package.
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import os
+# ============================================================
+# 1️⃣ Download model efficiently
+# ============================================================
 MODEL_ID = "Qwen/Qwen2.5-1.5B"
+# Download to /tmp to avoid HF Space quota overflow.
+# snapshot_download returns the local directory of the downloaded snapshot,
+# which is then passed to from_pretrained below instead of the repo id.
+model_dir = snapshot_download(repo_id=MODEL_ID, cache_dir="/tmp/qwen_model")
+# ============================================================
+# 2️⃣ Load model (fp16 + automatic device map on GPU, fp32 on CPU)
+# ============================================================
+device = "cuda" if torch.cuda.is_available() else "cpu"
 model = AutoModelForCausalLM.from_pretrained(
+    model_dir,
+    # Half precision only when a GPU is present; CPU stays in float32.
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    # Let the loader place weights when a GPU is present; default placement otherwise.
+    device_map="auto" if torch.cuda.is_available() else None,
     low_cpu_mem_usage=True,
 )
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+# ============================================================
+# 3️⃣ Define Chat Function
+# ============================================================
+def chat_with_qwen(message, history):
+    """Generate one assistant reply and return the updated chat history.
+
+    Args:
+        message: Text the user just submitted.
+        history: Previous turns. Entries may be role/content dicts (what a
+            ``type="messages"`` Chatbot supplies) or legacy
+            ``(user, assistant)`` pairs; ``None`` is treated as empty.
+
+    Returns:
+        A 2-tuple ``(history, history)`` holding the same updated list of
+        role/content dicts twice, matching the two outputs wired to this
+        callback.
+    """
+    # Normalise every prior turn to role/content dicts without mutating the input.
+    normalized = []
+    for turn in history or []:
+        if isinstance(turn, dict):
+            normalized.append(turn)
+        else:
+            human, bot = turn
+            normalized.append({"role": "user", "content": human})
+            normalized.append({"role": "assistant", "content": bot})
+    history = normalized
+    # Nothing to generate for an empty / whitespace-only submission.
+    if not message or not message.strip():
+        return history, history
+    messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
+    # Pass only role/content to the template; extra UI keys are dropped.
+    messages.extend({"role": turn["role"], "content": turn["content"]} for turn in history)
+    messages.append({"role": "user", "content": message})
     inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
+        # Without return_dict=True a bare tensor may come back, which cannot
+        # be splatted into generate() or indexed by "input_ids" below.
+        return_dict=True,
+        return_tensors="pt",
+    ).to(model.device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
+            max_new_tokens=300,
+            temperature=0.8,
             do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
         )
+    # generate() returns prompt + completion; keep only the newly generated tokens.
+    prompt_len = inputs["input_ids"].shape[-1]
+    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
+    history.append({"role": "user", "content": message})
+    history.append({"role": "assistant", "content": response})
+    return history, history
+# ============================================================
+# 4️⃣ Gradio UI
+# ============================================================
+with gr.Blocks(title="Qwen 2.5 Chatbot", theme="soft") as demo:
+    gr.Markdown("## 🧠 Qwen 2.5 — Lightweight Chatbot (Optimized for CPU & GPU Offload)")
+    chatbot = gr.Chatbot(type="messages", label="Chat with Qwen 2.5", height=480)
+    msg = gr.Textbox(label="Your message", placeholder="Ask me anything...")
+    clear = gr.Button("🧹 Clear Chat")
+    # Submitting the textbox calls the chat function with the text and the
+    # current chatbot value; both returned values are routed to the chatbot.
+    msg.submit(
+        fn=chat_with_qwen,
+        inputs=[msg, chatbot],
+        outputs=[chatbot, chatbot],
+    )
+    # The clear button sends None to the chatbot component, bypassing the queue.
+    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)
+# Serve on all interfaces, port 7860.
+demo.launch(server_port=7860, server_name="0.0.0.0")