Commit 0169392 · 1 Parent(s): 23d2e0c
vungocthach1112 committed

Create GUI for OCR app

Files changed (4)
  1. .gitignore +45 -0
  2. app.py +91 -0
  3. models.py +94 -0
  4. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,45 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ env/
+ ENV/
+ .env
+ .venv
+ env.bak/
+ venv.bak/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Streamlit
+ .streamlit/secrets.toml
+
+ # Logs and local files
+ *.log
+ .DS_Store
+ Thumbs.db
+ .env
app.py ADDED
@@ -0,0 +1,91 @@
+ import gradio as gr
+ from PIL import Image
+ import json
+ from io import BytesIO
+ import base64
+ import torch
+ from tempfile import gettempdir
+ from os import path, makedirs, remove
+ import models
+
+ def get_safe_cache_dir():
+     try:
+         # Try writing to ~/.cache/huggingface (if it is available)
+         default_cache = path.expanduser("~/.cache/huggingface")
+         makedirs(default_cache, exist_ok=True)
+         test_file = path.join(default_cache, "test_write.txt")
+         with open(test_file, "w") as f:
+             f.write("ok")
+         remove(test_file)
+         return default_cache
+     except Exception:
+         # If that fails (e.g. on Hugging Face Spaces), fall back to the temp directory
+         return path.join(gettempdir(), "huggingface")
+
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ CACHE_DIR = get_safe_cache_dir()
+ AVAILABLE_MODELS = {
+     "TrOCR (Base Printed)": {
+         "id": "microsoft/trocr-base-printed",
+         "type": "trocr"
+     },
+     "EraX (VL-2B-V1.5)": {
+         "id": "erax-ai/EraX-VL-2B-V1.5",
+         "type": "erax"
+     }
+ }
+ _model_cache = {}
+
+ print("Using device:", DEVICE)
+ print("Cache directory:", CACHE_DIR)
+
+ def load_model(model_key):
+     model_id = AVAILABLE_MODELS[model_key]["id"]
+     model_type = AVAILABLE_MODELS[model_key]["type"]
+
+     # Reuse a previously loaded model; cache lookups and inserts both use model_key
+     if model_key in _model_cache:
+         return _model_cache[model_key]
+
+     if "trocr" in model_type:
+         model = models.TrOCRModel(model_id, cache_dir=CACHE_DIR, device=DEVICE)
+     elif "erax" in model_type:
+         model = models.EraXModel(model_id, cache_dir=CACHE_DIR, device=DEVICE)
+     else:
+         raise ValueError(f"Unknown model type: {model_type}")
+
+     _model_cache[model_key] = model
+     print("Loaded model:", model_id, "successfully!")
+     return model
+
+ # Process the uploaded image with the selected model
+ def gradio_process(image: Image.Image, model_key: str):
+     if image is None:
+         return {"error": "No image provided"}
+
+     model = load_model(model_key)
+     result = model.predict(image)
+
+     return json.dumps({
+         "texts": result,
+         "image_size": {
+             "width": image.width,
+             "height": image.height
+         },
+         "mode": image.mode,
+     }, indent=4)
+
+ # Gradio interface
+ demo = gr.Interface(
+     fn=gradio_process,
+     inputs=[
+         gr.Image(type="pil", label="Upload Image"),
+         gr.Dropdown(choices=list(AVAILABLE_MODELS.keys()), label="Select model", value="TrOCR (Base Printed)"),
+         # gr.Textbox(label="Prompt (EraX only)", placeholder="What is in this image?")
+     ],
+     outputs=gr.JSON(label="Output (Text/JSON Extract)"),
+     title="Image to Text/JSON Extractor",
+     description="Upload an image and extract structured text using OCR."
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
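
As a quick sanity check, gradio_process can be called directly without launching the UI. The sketch below is only illustrative: it assumes app.py and models.py are importable from the working directory, and "sample.png" is a placeholder path for a local test image, not a file in this repository.

# Minimal smoke-test sketch: calls gradio_process directly with a PIL image.
# "sample.png" is a placeholder path; importing app does not launch the UI
# because demo.launch() sits behind the __main__ guard.
from PIL import Image
import app

img = Image.open("sample.png")
output = app.gradio_process(img, "TrOCR (Base Printed)")
print(output)  # JSON string with "texts", "image_size", and "mode"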
models.py ADDED
@@ -0,0 +1,94 @@
+ from transformers import pipeline, AutoTokenizer, VisionEncoderDecoderModel, AutoProcessor
+ import torch
+ from PIL import Image
+ from io import BytesIO
+ import base64
+
+ # Convert a PIL image to base64 (optional, in case you need to display or export it)
+ def pil_to_base64(image: Image.Image, format="PNG") -> str:
+     buffered = BytesIO()
+     image.save(buffered, format=format)
+     return base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+ def parse_to_json(result_text):
+     """
+     If the output consists of 'key: value' lines, parse them into a dict.
+     Otherwise, wrap the raw text in a 'text' field.
+     """
+     data = {}
+     lines = [line.strip() for line in result_text.splitlines() if line.strip()]
+     for line in lines:
+         if ":" in line:
+             key, val = line.split(":", 1)
+             data[key.strip()] = val.strip()
+         else:
+             # Lines that cannot be split are collected into a shared list
+             data.setdefault("text", []).append(line)
+     # If only the 'text' list is present, join it back into a single string
+     if set(data.keys()) == {"text"}:
+         data = {"text": "\n".join(data["text"])}
+     return data
+
+ # class TrOCRModel:
+ #     def __init__(self, model_id="microsoft/trocr-base-printed", cache_dir=None, device=None):
+ #         self.model_id = model_id
+ #         self.cache_dir = cache_dir
+ #         self.device = device
+
+ #         self.processor = TrOCRProcessor.from_pretrained(self.model_id, cache_dir=self.cache_dir)
+ #         self.model = VisionEncoderDecoderModel.from_pretrained(self.model_id, cache_dir=self.cache_dir)
+ #         self.model.to(self.device)
+
+ #     def predict(self, image: Image.Image) -> str:
+ #         if image is None:
+ #             raise ValueError("No image provided")
+
+ #         image = image.convert("RGB")
+ #         pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.to(self.device)
+
+ #         with torch.no_grad():
+ #             generated_ids = self.model.generate(pixel_values)
+ #             generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+ #         return generated_text
+
+ class TrOCRModel:
+     def __init__(self, model_id="microsoft/trocr-base-printed", cache_dir=None, device=None):
+         # Forward cache_dir so the weights are downloaded into the writable cache chosen by app.py
+         self.pipe = pipeline("image-to-text", model=model_id, device=device, model_kwargs={"cache_dir": cache_dir})
+
+     def predict(self, image: Image.Image) -> str:
+         if image is None:
+             raise ValueError("No image provided")
+
+         image = image.convert("RGB")
+         result = self.pipe(image)
+         return result[0]['generated_text'] if result else ""
+
+ class EraXModel:
+     def __init__(self, model_id="erax-ai/EraX-VL-2B-V1.5", cache_dir=None, device=None):
+         self.pipe = pipeline("image-to-text", model=model_id, device=device, model_kwargs={"cache_dir": cache_dir})
+
+     def predict(self, image: Image.Image) -> dict:
+         if image is None:
+             raise ValueError("No image provided")
+
+         decoded_image_text = pil_to_base64(image)
+         base64_data = f"data:image;base64,{decoded_image_text}"
+         # Chat-style message with a Vietnamese prompt ("Extract the content from the provided image").
+         # Note: the image-to-text pipeline call below does not consume this message yet.
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image",
+                         "image": base64_data,
+                     },
+                     {
+                         "type": "text",
+                         "text": "Trích xuất thông tin nội dung từ hình ảnh được cung cấp."
+                     },
+                 ],
+             }
+         ]
+
+         result = self.pipe(image)[0]['generated_text']
+         return parse_to_json(result) if result else {}
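
The parse_to_json helper is what turns the EraX model's raw text into structured output. A small illustration of its two branches, using made-up sample strings:

# Illustration of parse_to_json (the sample strings are invented for the example).
from models import parse_to_json

# 'key: value' lines become dict entries...
print(parse_to_json("Name: Nguyen Van A\nDate of birth: 01/01/1990"))
# -> {'Name': 'Nguyen Van A', 'Date of birth': '01/01/1990'}

# ...while free-form lines are joined back into a single "text" field.
print(parse_to_json("HELLO\nWORLD"))
# -> {'text': 'HELLO\nWORLD'}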
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ Pillow
+ transformers
+ torch
+ torchvision
+ gradio