import os
from fastapi import FastAPI, HTTPException, Header, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
import gradio as gr
from typhoon_ocr import ocr_document
from pdf2image import convert_from_bytes
from PIL import Image
import re
from dotenv import load_dotenv

# --- Load environment variables from .env ---
load_dotenv()

# --- Config ---
# os.getenv returns None when a variable is unset, so API_KEY and
# TYPHOON_API_KEY may be None; only TYPHOON_BASE_URL has a fallback value.
API_KEY = os.getenv("API_KEY")
TYPHOON_API_KEY = os.getenv("TYPHOON_OCR_API_KEY")
TYPHOON_BASE_URL = os.getenv("TYPHOON_BASE_URL", "https://api.opentyphoon.ai/v1")

# --- FastAPI App ---
app = FastAPI()

# CORS (optional for public usage)
# NOTE(review): every origin, method and header is allowed; confirm this is
# intended before exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


def extract_fields_regex(text: str) -> dict:
    """Pull tax-invoice fields out of OCR text with regular expressions.

    The text is first normalised (markup tags removed, whitespace collapsed).
    Each field pattern is then searched case-insensitively, and ``results``
    maps the field name to the first captured value with surrounding
    whitespace stripped, or ``None`` when the pattern does not match.

    Args:
        text: Raw text to scan; may contain ``<...>`` tags.
    """
    # Preprocess text
    text = re.sub(r"<.*?>", "", text)  # Strip tags
    text = re.sub(r"\n+", "\n", text)  # Collapse newlines
    # `\s` also matches "\n", so a newline followed by more whitespace is
    # turned into a single space here.
    text = re.sub(r"\s{2,}", " ", text)  # Collapse multiple spaces
    text = re.sub(r"\t+", " ", text)

    # Every pattern has the shape: label alternatives (English / Thai),
    # optional separators, then one capture group holding the value.
    patterns = {
        "tax_id": r"(?:TAX\s*ID|เลขที่ผู้เสียภาษี)[\s:\-\.]*([\d]{10,13})",
        "tax_invoice": r"(?:TAX\s*INV\.?|เลขที่ใบกำกับภาษี|ใบกำกับ)[\s:\-\.]*([\dA-Z\-\/]{6,20})",
        # NOTE(review): the label group is optional here, so the first
        # slash-separated date-like token anywhere in the text is captured.
        "tax_date": r"(?:DATE|วันที่|ออกใบกำกับวันที่)?[\s:\-\.]*([\d]{2,4}/[\d]{1,2}/[\d]{1,2})",
        "amount": r"(?:AMOUNT\s*THB|จำนวนเงิน|รวมเงิน)[\s:\-\.]*([\d,]+\.\d{2})",
        "baht_per_litre": r"(?:Baht\/Litr\.?|Bath\/Ltr\.?|ราคาต่อลิตร|ราคา\/ลิตร|ราคาน้ำมัน|บาทต่อลิตร)[\s:\-\.]*([\d,]+\.\d{2})",
        # NOTE(review): "Ltr" / "ลิตร" are substrings of several
        # price-per-litre labels above (e.g. "Bath/Ltr", "ราคาต่อลิตร"), so
        # this pattern can capture the unit price when that label comes
        # first in the text. Verify against real receipts.
        "litre": r"(?:Ltr\.?|Ltrs?\.?|ลิตร)[\s:\-\.]*([\d,]+\.\d{2,3})",
        "vat": r"(?:VAT|ภาษีมูลค่าเพิ่ม)[\s:\-\.]*([\d,]+\.\d{2})",
        "total": r"(?:TOTAL\s*THB|ยอดรวม|รวมทั้งสิ้น|รวมเงินทั้งสิ้น|ยอดเงินสุทธิ)[\s:\-\.]*([\d,]+\.\d{2})",
    }

    results = {}
    for field, pattern in patterns.items():
        # re.search returns the leftmost match; IGNORECASE lets the Latin
        # labels match regardless of case.
        match = re.search(pattern, text, re.IGNORECASE)
        results[field] = match.group(1).strip() if match else None

    # Optional fallback if regex fails
    # if not results["เลขที่ใบกำกับภาษี"]:
    # match = re.search(r"TAX\s*INV\.?\s*\s*