rahul7star committed
Commit 2c51b55 · verified · 1 Parent(s): ebe1e32

Update app_low.py

Files changed (1)
  1. app_low.py +45 -88
app_low.py CHANGED
@@ -1,114 +1,71 @@
- import os
- import torch
  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM

- # =========================================================
- # 1️⃣ Configuration
- # =========================================================
  MODEL_ID = "Qwen/Qwen2.5-1.5B"

- # Space-friendly settings
- os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
- os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-
- print(f"🔹 Loading model: {MODEL_ID}")

- # Device setup
- if torch.cuda.is_available():
-     device = "cuda"
-     dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-     print("⚙️ Using GPU for inference.")
- else:
-     device = "cpu"
-     dtype = torch.float32
-     print("⚙️ Using CPU (with offload folder).")

- # =========================================================
- # 2️⃣ Load Model + Tokenizer (streaming from HF Hub)
- # =========================================================
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
  model = AutoModelForCausalLM.from_pretrained(
-     MODEL_ID,
-     torch_dtype=dtype,
-     device_map="auto" if device == "cuda" else {"": "cpu"},
      low_cpu_mem_usage=True,
-     offload_folder="./offload" if device == "cpu" else None,
  )
- model.eval()

- # =========================================================
- # 3️⃣ Inference Function
- # =========================================================
- def chat_with_qwen(user_input, temperature, max_tokens, chat_history):
-     """Chat or enhance text using Qwen2.5-1.5B."""
-     if not user_input.strip():
-         return chat_history + [["", "⚠️ Please enter some text."]]

-     messages = [{"role": "user", "content": user_input}]
      inputs = tokenizer.apply_chat_template(
          messages,
          add_generation_prompt=True,
          tokenize=True,
-         return_tensors="pt",
-     ).to(model.device)

      with torch.no_grad():
          outputs = model.generate(
              **inputs,
-             max_new_tokens=int(max_tokens),
-             temperature=float(temperature),
-             top_p=0.9,
              do_sample=True,
-             repetition_penalty=1.05,
          )

-     result = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
-     chat_history = chat_history + [[user_input, result.strip()]]
-     return chat_history
-
- # =========================================================
- # 4️⃣ Gradio Interface
- # =========================================================
- with gr.Blocks(title="Qwen 2.5 1.5B Chat", theme=gr.themes.Soft()) as demo:
-     gr.Markdown(
-         """
-         # 🧠 Qwen 2.5 1.5B Chat / Prompt Enhancer
-         A lightweight reasoning-capable chat model that works fully on CPU or GPU.
-         Optimized for Hugging Face Spaces with offloading and streaming model load.
-         ---
-         """
-     )

-     with gr.Row():
-         chatbot = gr.Chatbot(height=420, label="Qwen 2.5 Chat")
-         with gr.Column(scale=1):
-             user_input = gr.Textbox(
-                 placeholder="Type your question or prompt here...",
-                 label="Your Message",
-                 lines=3,
-             )
-             temperature = gr.Slider(0.0, 1.0, value=0.7, step=0.05, label="Temperature")
-             max_tokens = gr.Slider(32, 512, value=128, step=16, label="Max Tokens")
-             send_btn = gr.Button("🚀 Generate", variant="primary")
-             clear_btn = gr.Button("🧹 Clear Chat")

-     send_btn.click(chat_with_qwen, [user_input, temperature, max_tokens, chatbot], chatbot)
-     user_input.submit(chat_with_qwen, [user_input, temperature, max_tokens, chatbot], chatbot)
-     clear_btn.click(lambda: [], None, chatbot)

-     gr.Markdown(
-         """
-         ---
-         💡 **Tips:**
-         - Works with both creative and factual queries.
-         - Try: *“Describe a futuristic city skyline at dawn.”*
-         - Small enough to run smoothly on CPU (under 5 GB memory).
-         """
-     )

- # =========================================================
- # 5️⃣ Launch App
- # =========================================================
- if __name__ == "__main__":
-     demo.launch(show_error=True, share=True)

  import gradio as gr
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from huggingface_hub import snapshot_download
+ import os

+ # ============================================================
+ # 1️⃣ Download model efficiently
+ # ============================================================
  MODEL_ID = "Qwen/Qwen2.5-1.5B"

+ # Download to /tmp to avoid HF Space quota overflow
+ model_dir = snapshot_download(repo_id=MODEL_ID, cache_dir="/tmp/qwen_model")

+ # ============================================================
+ # 2️⃣ Load model with CPU/offload optimizations
+ # ============================================================
+ device = "cuda" if torch.cuda.is_available() else "cpu"

  model = AutoModelForCausalLM.from_pretrained(
+     model_dir,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+     device_map="auto" if torch.cuda.is_available() else None,
      low_cpu_mem_usage=True,
  )
+ tokenizer = AutoTokenizer.from_pretrained(model_dir)

+ # ============================================================
+ # 3️⃣ Define Chat Function
+ # ============================================================
+ def chat_with_qwen(message, history):
+     # `history` arrives in the Chatbot's "messages" format: a list of
+     # {"role": ..., "content": ...} dicts.
+     history = history or []
+     messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
+     for turn in history:
+         messages.append({"role": turn["role"], "content": turn["content"]})
+     messages.append({"role": "user", "content": message})

      inputs = tokenizer.apply_chat_template(
          messages,
          add_generation_prompt=True,
          tokenize=True,
+         return_dict=True,
+         return_tensors="pt",
+     ).to(device)

      with torch.no_grad():
          outputs = model.generate(
              **inputs,
+             max_new_tokens=300,
+             temperature=0.8,
              do_sample=True,
+             pad_token_id=tokenizer.eos_token_id,
          )

+     # Decode only the newly generated tokens, then record both sides of the turn.
+     response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+     history.append({"role": "user", "content": message})
+     history.append({"role": "assistant", "content": response})
+     return history

+ # ============================================================
+ # 4️⃣ Gradio UI
+ # ============================================================
+ with gr.Blocks(theme="soft", title="Qwen 2.5 Chatbot") as demo:
+     gr.Markdown("## 🧠 Qwen 2.5 — Lightweight Chatbot (Optimized for CPU & GPU Offload)")
+     chatbot = gr.Chatbot(height=480, label="Chat with Qwen 2.5", type="messages")
+     msg = gr.Textbox(placeholder="Ask me anything...", label="Your message")
+     clear = gr.Button("🧹 Clear Chat")

+     msg.submit(chat_with_qwen, [msg, chatbot], chatbot)
+     clear.click(lambda: None, None, chatbot, queue=False)

+ demo.launch(server_name="0.0.0.0", server_port=7860)
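
For a quick end-to-end check of this change, here is a minimal sketch (not part of the commit) of how the chat handler could be exercised from a separate process with gradio_client once app_low.py is serving. The local URL and the auto-generated /chat_with_qwen endpoint name are assumptions; client.view_api() lists the routes actually exposed.

# smoke_test.py: illustrative sketch only; assumes app_low.py is already running on port 7860.
from gradio_client import Client

client = Client("http://localhost:7860")       # assumed local URL; a Space id would also work

# chat_with_qwen takes (message, history) and returns the updated messages-format history,
# so the first turn is sent with an empty history.
history = client.predict(
    "Describe Qwen2.5-1.5B in one sentence.",  # message
    [],                                        # empty chat history
    api_name="/chat_with_qwen",                # assumed default endpoint name
)
print(history[-1]["content"])                  # assistant reply from the last turn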