ghosthets committed
Commit 05fe403 · verified · 1 Parent(s): 02f2b8a

Update app.py

Files changed (1)
  1. app.py +41 -22
app.py CHANGED
@@ -1,23 +1,29 @@
 import flask
 from flask import request, jsonify
-# Use AutoModelForCausalLM for Decoder-only models like TinyLlama
+# Use AutoModelForCausalLM for Decoder-only models like Qwen
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
+# Initialize the Flask application
 app = flask.Flask(__name__)
 
-model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+# Qwen1.5-0.5B-Chat Model ID
+model_id = "Qwen/Qwen1.5-0.5B-Chat"
 
-print("🔄 Loading TinyLlama model...")
+print(f"🔄 Loading {model_id} model...")
 
+# Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-# Load using AutoModelForCausalLM
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) # Using bfloat16 for better memory/speed on GPU
 
+# Load the model using the correct CausalLM class
+# Using bfloat16 for better memory/speed if a compatible GPU is available
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+
+# Set the device (GPU/CPU)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 
-print("✅ Model loaded instantly!")
+print(f"✅ {model_id} Model loaded successfully!")
 
 @app.route('/chat', methods=['POST'])
 def chat():
@@ -28,44 +34,57 @@ def chat():
         if not msg:
             return jsonify({"error": "No message sent"}), 400
 
-        # --- Key Change 1: Apply Chat Template ---
-        # Format the user message into the model's required chat template
+        # --- Qwen1.5 Chat Template Formatting ---
+        # Qwen models require input in the ChatML format.
         chat_history = [{"role": "user", "content": msg}]
-        # add_generation_prompt=True ensures the model knows it needs to respond
-        formatted_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
+
+        # apply_chat_template handles the specific formatting (e.g., <|im_start|>user\n...)
+        formatted_prompt = tokenizer.apply_chat_template(
+            chat_history,
+            tokenize=False,
+            add_generation_prompt=True
+        )
 
         # Tokenize the formatted prompt
         inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
 
-        # Generation
+        # Generation configuration
         output = model.generate(
             **inputs,
             max_length=256,
             do_sample=True,
-            top_p=0.9,
-            temperature=0.7,
-            eos_token_id=tokenizer.eos_token_id
+            top_p=0.8,
+            temperature=0.6,
+            # Set pad_token_id to eos_token_id, which is often necessary for Causal LMs
+            pad_token_id=tokenizer.eos_token_id
         )
 
-        # Decode the output
+        # Decode the full output
        full_reply = tokenizer.decode(output[0], skip_special_tokens=False)
 
-        # --- Key Change 2: Extract only the generated response ---
-        # The output includes the input prompt, so we extract only the response part.
+        # --- Extract only the Generated Response ---
+
+        # Qwen ChatML format uses '<|im_start|>assistant\n' before the response
+        assistant_tag = "<|im_start|>assistant\n"
 
-        # Identify the assistant marker used by TinyLlama's chat template
-        if "[/INST]" in full_reply:
-            # This structure is often used: <s>[INST] User Prompt [/INST] Assistant Reply
-            reply = full_reply.split("[/INST]")[-1].strip()
+        if assistant_tag in full_reply:
+            # Split the full reply and take the content after the assistant tag
+            reply = full_reply.split(assistant_tag)[-1].strip()
+
+            # Remove the end-of-message tag if it was generated
+            if "<|im_end|>" in reply:
+                reply = reply.split("<|im_end|>")[0].strip()
         else:
-            # Fallback: decode only the newly generated tokens
+            # Fallback: Decode only the newly generated tokens
             reply = tokenizer.decode(output[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()
 
         return jsonify({"reply": reply})
 
     except Exception as e:
+        # Catch any runtime errors
        return jsonify({"error": str(e)}), 500
 
 
 if __name__ == "__main__":
+    # Run the Flask app
     app.run(host='0.0.0.0', port=7860)
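
For reference, a minimal sketch of what the new apply_chat_template call produces, assuming the stock Qwen/Qwen1.5-0.5B-Chat tokenizer; the exact text comes from the template bundled with the tokenizer (a default system turn may also be prepended), so this is only an illustration of the ChatML shape the route later splits on.

# Sketch: inspect the ChatML prompt the Qwen1.5 tokenizer builds.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat")
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
    add_generation_prompt=True,  # appends the '<|im_start|>assistant\n' cue before generation
)
print(prompt)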
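
The fallback branch decodes only the newly generated tokens by slicing off the prompt length instead of splitting on template tags. A standalone sketch of that idea follows; the helper name extract_reply is made up for illustration and is not part of the commit.

# Hypothetical helper mirroring the fallback branch above: decode only the tokens
# produced after the prompt, so no string-splitting on ChatML tags is needed.
def extract_reply(tokenizer, output_ids, prompt_len):
    new_tokens = output_ids[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# e.g. reply = extract_reply(tokenizer, output, inputs["input_ids"].shape[1])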
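
A hedged client sketch for exercising the /chat endpoint once the app is running: it assumes the server listens on localhost:7860 and that the JSON field read by the route is "message" (the part of app.py that parses the request body sits outside this diff, so the key name is an assumption).

# Hypothetical client call; the "message" key is an assumption, since the code that
# reads the request body is not shown in this diff. Requires the requests package.
import requests

resp = requests.post("http://localhost:7860/chat", json={"message": "Hello!"})
print(resp.status_code, resp.json())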