kawaiipeace committed on
Commit
83783c7
·
1 Parent(s): 0364029
Files changed (2) hide show
  1. app.py +63 -54
  2. app_model.py +114 -0
app.py CHANGED
@@ -1,22 +1,25 @@
1
  import os
2
- import re
3
- from PIL import Image
4
- from dotenv import load_dotenv
5
- from fastapi import FastAPI, UploadFile, File, HTTPException, Header
6
  from fastapi.middleware.cors import CORSMiddleware
7
- from pdf2image import convert_from_bytes
8
  import gradio as gr
9
- from transformers import pipeline
 
 
 
 
10
 
11
- # Load .env
12
  load_dotenv()
13
- API_KEY = os.getenv("API_KEY")
14
- MODEL_ID = "scb10x/typhoon-ocr-7b"
15
 
16
- ocr_pipeline = pipeline("image-to-text", model="scb10x/typhoon-ocr-7b")
 
 
 
17
 
18
- # FastAPI app init
19
  app = FastAPI()
 
 
20
  app.add_middleware(
21
  CORSMiddleware,
22
  allow_origins=["*"],
@@ -24,40 +27,43 @@ app.add_middleware(
24
  allow_headers=["*"],
25
  )
26
 
27
- # --- UTILS ---
28
- def pdf_to_image(file_bytes: bytes) -> Image.Image:
29
- images = convert_from_bytes(file_bytes)
30
- return images[0] # Only first page for now
31
-
32
- def run_ocr(image: Image.Image) -> str:
33
- result = ocr_pipeline(image)
34
- return result[0]["generated_text"]
35
-
36
- def preprocess_text(text: str) -> str:
37
- text = re.sub(r"</?(figure|table|tr|td|th|b|i|u|p|div|span)[^>]*>", "\n", text)
38
- text = re.sub(r"<.*?>", "", text)
39
- text = re.sub(r"\n+", "\n", text)
40
- text = re.sub(r"\s{2,}", " ", text)
41
- return text.strip()
42
-
43
  def extract_fields_regex(text: str) -> dict:
 
 
 
 
 
 
44
  patterns = {
45
- "tax_id": r"(?:TAX\s*ID|เลขที่ผู้เสียภาษี)[\s:\-\.]*([0-9]{10,13})",
46
- "tax_invoice": r"(?:TAX\s*INV\.?|เลขที่ใบกำกับภาษี|ใบกำกับ)[\s:\-\.]*([0-9A-Z\-\/]{6,20})",
47
- "tax_date": r"(?:DATE|วันที่|ออกใบกำกับวันที่)?[\s:\-\.]*([0-9]{2,4}/[0-9]{1,2}/[0-9]{1,2})",
48
- "amount": r"(?:จำนวนเงิน(?:\s*บาทต่อลิตร)?|AMOUNT\s*THB|รวมเงิน)[\s:\-\.]*([0-9,]+\.[0-9]{2})",
49
- "baht_per_litre": r"(?:บาทต่อลิตร|ราคาต่อลิตร|Baht/Litr|Bath/Ltr)[\s:\-\.]*([0-9,]+\.[0-9]{2})",
50
- "litre": r"(?:ลิตร|Ltr\.?|Ltrs?\.?)[\s:\-\.]*([0-9,]+\.[0-9]{2,3})",
51
- "vat": r"(?:VAT|ภาษีมูลค่าเพิ่ม)[\s:\-\.]*([0-9,]+\.[0-9]{2})",
52
- "total": r"(?:TOTAL\s*THB|ยอดรวม|รวมทั้งสิ้น|รวมเงินทั้งสิ้น|ยอดเงินสุทธิ)[\s:\-\.]*([0-9,]+\.[0-9]{2})",
53
  }
54
 
55
  results = {}
56
  for field, pattern in patterns.items():
57
  match = re.search(pattern, text, re.IGNORECASE)
58
  results[field] = match.group(1).strip() if match else None
 
 
 
 
 
 
 
59
  return results
60
 
 
 
 
 
 
 
61
  # --- API Endpoint ---
62
  @app.post("/api/ocr_receipt")
63
  async def ocr_receipt(
@@ -70,45 +76,48 @@ async def ocr_receipt(
70
  content = await file.read()
71
 
72
  try:
 
73
  if file.filename.lower().endswith(".pdf"):
74
  image = pdf_to_image(content)
 
75
  else:
76
- image = Image.open(file.file).convert("RGB")
77
 
78
- text = run_ocr(image)
79
- text_cleaned = preprocess_text(text)
80
- extracted = extract_fields_regex(text_cleaned)
81
 
82
  return {
83
  "raw_ocr": text,
84
- "preprocessed_text": text_cleaned,
85
  "extracted_fields": extracted,
86
  }
87
 
88
  except Exception as e:
89
  raise HTTPException(status_code=500, detail=str(e))
90
 
 
91
  # --- Gradio UI ---
92
  def gradio_interface(image_path: str | Image.Image):
93
  if isinstance(image_path, str) and image_path.lower().endswith(".pdf"):
94
  with open(image_path, "rb") as f:
95
  image = pdf_to_image(f.read())
96
- elif isinstance(image_path, str):
97
- image = Image.open(image_path).convert("RGB")
98
  else:
99
- image = image_path.convert("RGB")
 
 
 
 
 
100
 
101
- text = run_ocr(image)
102
- text_cleaned = preprocess_text(text)
103
- extracted = extract_fields_regex(text_cleaned)
104
- return text_cleaned, extracted
105
 
106
  with gr.Blocks() as demo:
107
- gr.Markdown("## 🧾 Thai Receipt OCR (Typhoon 7B)")
108
  with gr.Row():
109
- img = gr.Image(type="filepath", label="📤 Upload receipt (Image or PDF)")
110
- out_text = gr.Textbox(label="📝 OCR Text", lines=12)
111
- out_fields = gr.JSON(label="🧠 Extracted Fields")
112
- gr.Button("🔍 Run OCR").click(fn=gradio_interface, inputs=img, outputs=[out_text, out_fields])
113
-
114
- demo.launch()
 
 
 
 
1
  import os
2
+ from fastapi import FastAPI, HTTPException, Header, UploadFile, File
 
 
 
3
  from fastapi.middleware.cors import CORSMiddleware
 
4
  import gradio as gr
5
+ from typhoon_ocr import ocr_document
6
+ from pdf2image import convert_from_bytes
7
+ from PIL import Image
8
+ import re
9
+ from dotenv import load_dotenv
10
 
11
+ # --- Load environment variables from .env ---
12
  load_dotenv()
 
 
13
 
14
+ # --- Config ---
15
+ API_KEY = os.getenv("API_KEY")
16
+ TYPHOON_API_KEY = os.getenv("TYPHOON_OCR_API_KEY")
17
+ TYPHOON_BASE_URL = os.getenv("TYPHOON_BASE_URL", "https://api.opentyphoon.ai/v1")
18
 
19
+ # --- FastAPI App ---
20
  app = FastAPI()
21
+
22
+ # CORS (optional for public usage)
23
  app.add_middleware(
24
  CORSMiddleware,
25
  allow_origins=["*"],
 
27
  allow_headers=["*"],
28
  )
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def extract_fields_regex(text: str) -> dict:
    """Extract receipt fields from OCR output using label-anchored regexes.

    Args:
        text: Raw OCR output; may contain HTML-like markup such as table tags.

    Returns:
        A dict with the keys ``tax_id``, ``tax_invoice``, ``tax_date``,
        ``amount``, ``baht_per_litre``, ``litre``, ``vat`` and ``total``.
        Each value is the first captured match as a stripped string, or
        ``None`` when the field's pattern does not match.
    """
    # Replace tags with a space instead of deleting them, so the contents of
    # adjacent table cells ("<td>123</td><td>456</td>") are not fused into a
    # single token before matching.
    text = re.sub(r"<[^>]*>", " ", text)
    # Every pattern below is whitespace-agnostic ([\s...]*), so collapsing all
    # whitespace runs (spaces, tabs, newlines) to one space is sufficient.
    text = re.sub(r"\s+", " ", text)

    patterns = {
        "tax_id": r"(?:TAX\s*ID|เลขที่ผู้เสียภาษี)[\s:\-\.]*([\d]{10,13})",
        "tax_invoice": r"(?:TAX\s*INV\.?|เลขที่ใบกำกับภาษี|ใบกำกับ)[\s:\-\.]*([\dA-Z\-\/]{6,20})",
        "tax_date": r"(?:DATE|วันที่|ออกใบกำกับวันที่)?[\s:\-\.]*([\d]{2,4}/[\d]{1,2}/[\d]{1,2})",
        "amount": r"(?:AMOUNT\s*THB|จำนวนเงิน|รวมเงิน)[\s:\-\.]*([\d,]+\.\d{2})",
        "baht_per_litre": r"(?:Baht\/Litr\.?|Bath\/Ltr\.?|ราคาต่อลิตร|ราคา\/ลิตร|ราคาน้ำมัน|บาทต่อลิตร)[\s:\-\.]*([\d,]+\.\d{2})",
        # The lookbehinds stop the volume label from matching the tail of a
        # price-per-litre label ("Bath/Ltr", "ราคา/ลิตร", "ราคาต่อลิตร"), which
        # would otherwise report the unit price as the volume.
        "litre": r"(?<!ต่อ)(?<!/)(?:Ltrs?\.?|ลิตร)[\s:\-\.]*([\d,]+\.\d{2,3})",
        "vat": r"(?:VAT|ภาษีมูลค่าเพิ่ม)[\s:\-\.]*([\d,]+\.\d{2})",
        "total": r"(?:TOTAL\s*THB|ยอดรวม|รวมทั้งสิ้น|รวมเงินทั้งสิ้น|ยอดเงินสุทธิ)[\s:\-\.]*([\d,]+\.\d{2})",
    }

    results = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        results[field] = match.group(1).strip() if match else None
    return results
60
 
61
+
def pdf_to_image(file_bytes: bytes) -> Image.Image:
    """Render the first page of an in-memory PDF as an image.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The first page as returned by ``convert_from_bytes``.

    Raises:
        ValueError: If the converter produced no pages.
    """
    # Only the first page is used, so ask the converter to render just that
    # page instead of rasterising the whole document.
    pages = convert_from_bytes(file_bytes, first_page=1, last_page=1)
    if not pages:
        raise ValueError("PDF contains no renderable pages")
    return pages[0]
65
+
66
+
67
  # --- API Endpoint ---
68
  @app.post("/api/ocr_receipt")
69
  async def ocr_receipt(
 
76
  content = await file.read()
77
 
78
  try:
79
+ # Handle PDF and image
80
  if file.filename.lower().endswith(".pdf"):
81
  image = pdf_to_image(content)
82
+ raw_output = ocr_document(image, task_type="structure")
83
  else:
84
+ raw_output = ocr_document(content, task_type="structure")
85
 
86
+ text = raw_output if isinstance(raw_output, str) else raw_output.get("text", "")
87
+ extracted = extract_fields_regex(text)
 
88
 
89
  return {
90
  "raw_ocr": text,
 
91
  "extracted_fields": extracted,
92
  }
93
 
94
  except Exception as e:
95
  raise HTTPException(status_code=500, detail=str(e))
96
 
97
+
98
  # --- Gradio UI ---
def gradio_interface(image_path: str | Image.Image):
    """Run OCR on an uploaded receipt and extract its fields for the UI.

    Args:
        image_path: Path to the uploaded file (image or PDF), a PIL image,
            or ``None`` when nothing has been uploaded.

    Returns:
        A ``(text, fields)`` tuple: the OCR text and the dict produced by
        ``extract_fields_regex``. Returns ``("", {})`` when there is no input.
    """
    if image_path is None:
        # Button pressed without an upload: nothing to process.
        return "", {}

    if isinstance(image_path, str):
        # NOTE(review): ocr_document is documented as taking a file path
        # (image or PDF, first page by default), so the path is forwarded
        # as-is instead of a pre-rendered PIL image -- confirm against the
        # installed typhoon-ocr version.
        raw = ocr_document(image_path, task_type="structure")
    else:
        import tempfile

        # A PIL image has no path; persist it briefly so it can be passed
        # to ocr_document the same way as an uploaded file.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = os.path.join(tmp_dir, "upload.png")
            image_path.convert("RGB").save(tmp_path)
            raw = ocr_document(tmp_path, task_type="structure")

    text = raw if isinstance(raw, str) else raw.get("text", "")
    extracted = extract_fields_regex(text)
    return text, extracted
110
 
 
 
 
 
111
 
112
  with gr.Blocks() as demo:
113
+ gr.Markdown("# 🧾 แปลงและตรวจสอบใบเสร็จ")
114
  with gr.Row():
115
+ img = gr.Image(type="filepath", label="อัปโหลดไฟล์ PDF หรือรูปภาพ")
116
+ out_text = gr.Textbox(label="ข้อความทั้งหมด", lines=10)
117
+ out_fields = gr.JSON(label="ข้อความที่ดึงออกมา")
118
+ btn = gr.Button("ประมวลผลใบเสร็จ")
119
+ btn.click(fn=gradio_interface, inputs=img, outputs=[out_text, out_fields])
120
+
121
+ # --- Mount Gradio on FastAPI ---
122
+ # app = gr.mount_gradio_app(app, demo, path="/ui")
123
+ demo.launch(share=False)
app_model.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from PIL import Image
4
+ from dotenv import load_dotenv
5
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Header
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pdf2image import convert_from_bytes
8
+ import gradio as gr
9
+ from transformers import pipeline
10
+
11
+ # Load .env
12
+ load_dotenv()
13
+ API_KEY = os.getenv("API_KEY")
14
+ MODEL_ID = "scb10x/typhoon-ocr-7b"
15
+
16
+ ocr_pipeline = pipeline("image-to-text", model="scb10x/typhoon-ocr-7b")
17
+
18
+ # FastAPI app init
19
+ app = FastAPI()
20
+ app.add_middleware(
21
+ CORSMiddleware,
22
+ allow_origins=["*"],
23
+ allow_methods=["*"],
24
+ allow_headers=["*"],
25
+ )
26
+
27
+ # --- UTILS ---
def pdf_to_image(file_bytes: bytes) -> Image.Image:
    """Render the first page of an in-memory PDF as an image.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The first page as returned by ``convert_from_bytes``.

    Raises:
        ValueError: If the converter produced no pages.
    """
    # Only the first page is used, so ask the converter to render just that
    # page instead of rasterising the whole document.
    pages = convert_from_bytes(file_bytes, first_page=1, last_page=1)
    if not pages:
        raise ValueError("PDF contains no renderable pages")
    return pages[0]
31
+
def run_ocr(image: Image.Image) -> str:
    """Run the module-level OCR pipeline on one image.

    Returns the ``generated_text`` entry of the pipeline's first result.
    """
    predictions = ocr_pipeline(image)
    first_prediction = predictions[0]
    return first_prediction["generated_text"]
35
+
def preprocess_text(text: str) -> str:
    """Strip markup from OCR output and normalise its whitespace.

    Tags whose name starts with one of the listed names become line breaks;
    all remaining tags are removed. Runs of spaces/tabs collapse to one
    space, and blank lines and spaces around line breaks are dropped.

    Args:
        text: Raw OCR output, possibly containing HTML-like tags.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    text = re.sub(r"</?(figure|table|tr|td|th|b|i|u|p|div|span)[^>]*>", "\n", text)
    text = re.sub(r"<.*?>", "", text)
    # Collapse horizontal whitespace only. Using \s here would also swallow
    # the line breaks inserted above whenever they sit next to a space.
    text = re.sub(r"[^\S\n]+", " ", text)
    # Merge consecutive breaks and trim the spaces surrounding them.
    text = re.sub(r" ?\n[ \n]*", "\n", text)
    return text.strip()
42
+
def extract_fields_regex(text: str) -> dict:
    """Extract receipt fields from cleaned OCR text using label-anchored regexes.

    Args:
        text: OCR text to search.

    Returns:
        A dict with the keys ``tax_id``, ``tax_invoice``, ``tax_date``,
        ``amount``, ``baht_per_litre``, ``litre``, ``vat`` and ``total``.
        Each value is the first captured match as a stripped string, or
        ``None`` when the field's pattern does not match.
    """
    patterns = {
        "tax_id": r"(?:TAX\s*ID|เลขที่ผู้เสียภาษี)[\s:\-\.]*([0-9]{10,13})",
        "tax_invoice": r"(?:TAX\s*INV\.?|เลขที่ใบกำกับภาษี|ใบกำกับ)[\s:\-\.]*([0-9A-Z\-\/]{6,20})",
        "tax_date": r"(?:DATE|วันที่|ออกใบกำกับวันที่)?[\s:\-\.]*([0-9]{2,4}/[0-9]{1,2}/[0-9]{1,2})",
        "amount": r"(?:จำนวนเงิน(?:\s*บาทต่อลิตร)?|AMOUNT\s*THB|รวมเงิน)[\s:\-\.]*([0-9,]+\.[0-9]{2})",
        "baht_per_litre": r"(?:บาทต่อลิตร|ราคาต่อลิตร|Baht/Litr|Bath/Ltr)[\s:\-\.]*([0-9,]+\.[0-9]{2})",
        # The lookbehinds stop the volume label from matching the tail of a
        # price-per-litre label ("Bath/Ltr", "ราคาต่อลิตร", "บาทต่อลิตร"), which
        # would otherwise report the unit price as the volume.
        "litre": r"(?<!ต่อ)(?<!/)(?:ลิตร|Ltrs?\.?)[\s:\-\.]*([0-9,]+\.[0-9]{2,3})",
        "vat": r"(?:VAT|ภาษีมูลค่าเพิ่ม)[\s:\-\.]*([0-9,]+\.[0-9]{2})",
        "total": r"(?:TOTAL\s*THB|ยอดรวม|รวมทั้งสิ้น|รวมเงินทั้งสิ้น|ยอดเงินสุทธิ)[\s:\-\.]*([0-9,]+\.[0-9]{2})",
    }

    results = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        results[field] = match.group(1).strip() if match else None
    return results
60
+
61
+ # --- API Endpoint ---
@app.post("/api/ocr_receipt")
async def ocr_receipt(
    file: UploadFile = File(...),
    x_api_key: str | None = Header(None),
):
    """OCR an uploaded receipt (image or PDF) and return extracted fields.

    Args:
        file: The uploaded receipt. Files whose name ends in ``.pdf`` are
            rendered via ``pdf_to_image``; anything else is opened as an image.
        x_api_key: Value of the ``X-API-Key`` header. Checked only when the
            module-level ``API_KEY`` is set.

    Returns:
        A dict with ``raw_ocr``, ``preprocessed_text`` and ``extracted_fields``.

    Raises:
        HTTPException: 401 when the API key is missing or wrong; 500 when
            decoding, OCR or extraction fails.
    """
    import io
    import secrets

    if API_KEY:
        # Constant-time comparison so the key cannot be probed via timing.
        # Compare bytes because compare_digest rejects non-ASCII str input.
        key_ok = x_api_key is not None and secrets.compare_digest(
            x_api_key.encode(), API_KEY.encode()
        )
        if not key_ok:
            raise HTTPException(status_code=401, detail="Invalid API key")

    content = await file.read()

    try:
        # filename is optional on uploads; treat a missing name as non-PDF.
        filename = (file.filename or "").lower()
        if filename.endswith(".pdf"):
            image = pdf_to_image(content)
        else:
            # Decode from the bytes already read rather than the consumed stream.
            image = Image.open(io.BytesIO(content)).convert("RGB")

        text = run_ocr(image)
        text_cleaned = preprocess_text(text)
        extracted = extract_fields_regex(text_cleaned)

        return {
            "raw_ocr": text,
            "preprocessed_text": text_cleaned,
            "extracted_fields": extracted,
        }

    except Exception as e:
        # Top-level boundary: surface the failure as a 500, keeping the cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
90
+
91
+ # --- Gradio UI ---
def gradio_interface(image_path: str | Image.Image):
    """Run OCR on an uploaded receipt and extract its fields for the UI.

    Args:
        image_path: Path to the uploaded file (image or PDF), a PIL image,
            or ``None`` when nothing has been uploaded.

    Returns:
        A ``(cleaned_text, fields)`` tuple: the preprocessed OCR text and the
        dict produced by ``extract_fields_regex``. Returns ``("", {})`` when
        there is no input.
    """
    if image_path is None:
        # Button pressed without an upload: nothing to process.
        return "", {}

    if isinstance(image_path, str):
        if image_path.lower().endswith(".pdf"):
            with open(image_path, "rb") as f:
                image = pdf_to_image(f.read())
        else:
            image = Image.open(image_path).convert("RGB")
    else:
        image = image_path.convert("RGB")

    text = run_ocr(image)
    text_cleaned = preprocess_text(text)
    extracted = extract_fields_regex(text_cleaned)
    return text_cleaned, extracted
105
+
106
+ with gr.Blocks() as demo:
107
+ gr.Markdown("## 🧾 Thai Receipt OCR (Typhoon 7B)")
108
+ with gr.Row():
109
+ img = gr.Image(type="filepath", label="📤 Upload receipt (Image or PDF)")
110
+ out_text = gr.Textbox(label="📝 OCR Text", lines=12)
111
+ out_fields = gr.JSON(label="🧠 Extracted Fields")
112
+ gr.Button("🔍 Run OCR").click(fn=gradio_interface, inputs=img, outputs=[out_text, out_fields])
113
+
114
+ demo.launch()