ghosthets committed on
Commit d09bf4d · verified · 1 Parent(s): c389d98

Update app.py

Files changed (1)
  1. app.py +33 -16
app.py CHANGED
@@ -1,45 +1,62 @@
- import flask  # keeping Flask, not Gradio
  from flask import request, jsonify
- # from transformers import pipeline  # no longer needed
- # import torch  # no longer needed

- from ctransformers import AutoModelForCausalLM  # load the model with ctransformers

  app = flask.Flask(__name__)

  # ===========================
- # LOAD MODEL
  # ===========================
- model_id = "Qwen/Qwen1.5-1.8B-Chat-GGUF"
  print("🔄 Loading model...")

  try:
-     # Load the GGUF model on the CPU using ctransformers
-     ai = AutoModelForCausalLM.from_pretrained(
-         model_id,
-         model_file="qwen1_5-1_8b-chat-q5_k_m.gguf",  # name of the GGUF file
-         model_type="qwen",
-         gpu_layers=0  # run on CPU
      )
      print("✅ Model loaded!")
  except Exception as e:
      print(f"❌ Error loading model: {e}")
-     # fallback/exit strategy here if loading fails

  # ===========================
  # CHAT API
  # ===========================
  @app.route('/chat', methods=['POST'])
  def chat():
      try:
          data = request.get_json()
          msg = data.get("message", "")
          if not msg:
              return jsonify({"error": "No message sent"}), 400

-         # Generate a response with ctransformers
-         output = ai(msg, max_new_tokens=200, temperature=0.7)
-         return jsonify({"reply": output})
      except Exception as e:
          return jsonify({"error": str(e)}), 500
 
+ import flask
  from flask import request, jsonify
+ from transformers import pipeline
+ import torch
+ import warnings  # for silencing warnings

+ # Suppress warnings; the CPU path is otherwise noisy
+ warnings.filterwarnings("ignore")

  app = flask.Flask(__name__)

  # ===========================
+ # LOAD MODEL (StableLM-3B-Chat)
  # ===========================
+ model_id = "stabilityai/StableLM-3B-4E1T-Chat"
  print("🔄 Loading model...")

+ # CPU/GPU device selection:
+ # float32 on CPU for compatibility, bfloat16 on GPU to reduce memory
+ device = 0 if torch.cuda.is_available() else -1
+ dtype = torch.float32 if device == -1 else torch.bfloat16  # float32 for CPU
+
  try:
+     ai = pipeline(
+         "text-generation",
+         model=model_id,
+         max_new_tokens=200,
+         device=device,
+         torch_dtype=dtype,  # CPU/memory optimization
+         trust_remote_code=True  # required for StableLM
      )
      print("✅ Model loaded!")
  except Exception as e:
      print(f"❌ Error loading model: {e}")
+     ai = None  # if loading fails, guard the API below

  # ===========================
  # CHAT API
  # ===========================
  @app.route('/chat', methods=['POST'])
  def chat():
+     if ai is None:
+         return jsonify({"error": "Model initialization failed."}), 500
+
      try:
          data = request.get_json()
          msg = data.get("message", "")
          if not msg:
              return jsonify({"error": "No message sent"}), 400

+         # StableLM instruction format
+         prompt = f"<|user|>\n{msg}<|end|>\n<|assistant|>"
+
+         output = ai(prompt)[0]["generated_text"]
+
+         # Strip the prompt echo so only the assistant's reply remains
+         reply = output.split("<|assistant|>")[-1].strip()
+
+         return jsonify({"reply": reply})
      except Exception as e:
          return jsonify({"error": str(e)}), 500
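
A caveat on the hard-coded prompt: the <|user|> / <|end|> / <|assistant|> markers and the split("<|assistant|>") cleanup only behave if they match the chat markup the model was actually trained with, which this diff does not verify. A minimal alternative sketch, assuming the model repo ships a chat template (an assumption, not confirmed for this model id), is to let the tokenizer build the prompt:

# Sketch, not part of the commit: derive the prompt from the model's own
# chat template instead of hard-coding special tokens.
from transformers import AutoTokenizer

model_id = "stabilityai/StableLM-3B-4E1T-Chat"  # id used in the commit
msg = "Hello!"                                  # example user message

tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
prompt = tok.apply_chat_template(
    [{"role": "user", "content": msg}],
    tokenize=False,              # return a string, not token ids
    add_generation_prompt=True,  # append the assistant-turn marker
)
print(prompt)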
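
To smoke-test the endpoint once the app is running: the host and port below are assumptions (this hunk contains no app.run(...) call, so the serving details live elsewhere), so adjust them to match how the app is actually launched.

# Sketch only: exercise POST /chat from a local client.
import requests

resp = requests.post(
    "http://127.0.0.1:5000/chat",  # assumed host/port (Flask's default)
    json={"message": "Say hi in one line."},
    timeout=300,                   # CPU generation can be slow
)
print(resp.status_code, resp.json())  # expect {"reply": "..."} on success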