Spaces:
Sleeping
Sleeping
| from PIL import Image | |
| import io | |
| import fitz | |
| import re | |
| import pytesseract | |
| import google.generativeai as genai | |
| from fastapi import FastAPI, UploadFile, File, Form, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| import platform | |
def extract_images_from_pdf_bytes(pdf_bytes: bytes) -> list:
    """Render every page of an in-memory PDF to PNG-encoded bytes.

    Args:
        pdf_bytes: Raw content of a PDF file.

    Returns:
        A list holding one PNG ``bytes`` object per page, in page order.
    """
    # The context manager closes the document even when rendering a page
    # raises, so the opened document is not leaked on every call.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # tobytes("png") already yields bytes; no intermediate buffer needed.
        return [page.get_pixmap().tobytes("png") for page in doc]
def clean_ocr_text(text: str) -> str:
    """Normalize raw OCR output into a single-line, whitespace-tidy string.

    Steps, applied in this order:
      1. Form feeds and non-breaking spaces become plain spaces.
      2. Decimals split by stray whitespace ("3 . 14") are re-joined ("3.14").
      3. Every run of whitespace collapses to one space.
      4. Leading and trailing whitespace is removed.

    Args:
        text: Text as returned by the OCR engine.

    Returns:
        The cleaned text.
    """
    # One pass replaces both special characters with a regular space.
    to_space = {ord("\x0c"): " ", ord("\u00a0"): " "}
    normalized = text.translate(to_space)

    # Re-join digits separated from their decimal point by whitespace.
    rejoined = re.sub(r'(\d)\s*\.\s*(\d)', r'\1.\2', normalized)

    # Collapse whitespace runs (spaces, tabs, newlines) into a single space.
    collapsed = re.sub(r'\s+', ' ', rejoined)
    return collapsed.strip()
def ocr_text_from_image(image_bytes: bytes) -> str:
    """Run OCR over an encoded image and return the recognized text.

    Args:
        image_bytes: Encoded image data that PIL is able to open.

    Returns:
        The string produced by ``pytesseract.image_to_string``, returned
        as-is without any cleanup.
    """
    stream = io.BytesIO(image_bytes)
    # The image is converted to RGB mode before being handed to the OCR call.
    rgb_image = Image.open(stream).convert("RGB")
    recognized = pytesseract.image_to_string(rgb_image)
    return recognized
def load_pytesseract():
    """Point pytesseract at the Tesseract executable on macOS and Windows.

    On macOS both Homebrew install prefixes are probed and the first one
    holding an executable ``tesseract`` wins; if neither is present the
    Apple Silicon path is used, matching the previous hard-coded behavior.
    On Windows the default installer location is used. On any other
    platform ``tesseract_cmd`` is left untouched.
    """
    import shutil  # local import: only needed for the macOS probe

    system = platform.system()
    if system == "Darwin":
        candidates = (
            "/opt/homebrew/bin/tesseract",  # Homebrew prefix on Apple Silicon
            "/usr/local/bin/tesseract",  # Homebrew prefix on Intel Macs
        )
        # shutil.which() on a path with a directory component checks that
        # exact file for existence and executability.
        chosen = next(
            (path for path in candidates if shutil.which(path)),
            candidates[0],
        )
        pytesseract.pytesseract.tesseract_cmd = chosen
    elif system == "Windows":
        pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def load_genai(genai_api_key: str):
    """Configure the Gemini (``google.generativeai``) client with an API key.

    Args:
        genai_api_key: API key passed to ``genai.configure``.

    Raises:
        RuntimeError: If configuration raises any exception. The original
            exception is attached as ``__cause__``.
    """
    try:
        genai.configure(api_key=genai_api_key)
    except Exception as e:
        # "from e" chains the underlying error explicitly so the traceback
        # shows it as the direct cause of the RuntimeError.
        raise RuntimeError(f"Failed to configure Gemini API: {e}") from e
def setupFastAPI() -> FastAPI:
    """Create the FastAPI application with CORS enabled for local front ends.

    Returns:
        A new ``FastAPI`` instance with ``CORSMiddleware`` installed,
        allowing credentials, all methods and all headers for the listed
        localhost origins.
    """
    # Each origin must be its own list element. Without the separating
    # commas Python concatenates adjacent string literals into one bogus
    # origin, and no real origin would ever match.
    allowed_origins = [
        "http://localhost:8002",
        "http://localhost:9000",
        "http://localhost:5501",
    ]

    app = FastAPI()
    app.add_middleware(
        CORSMiddleware,
        allow_origins=allowed_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    return app