Spaces:
Running
Running
import os | |
from fastapi import FastAPI, UploadFile, File, HTTPException | |
from fastapi.responses import JSONResponse | |
from fastapi.middleware.cors import CORSMiddleware | |
from typing import List | |
from smoldocling import cli | |
import shutil | |
import dotenv | |
# Point the Hugging Face / transformers caches at paths under /app.
# NOTE(review): these are assigned after the imports above; any library that
# reads its cache location at import time would not pick them up — confirm.
os.environ.update(
    {
        "TRANSFORMERS_CACHE": "/app/.cache/transformers",
        "HF_HUB_CACHE": "/app/.cache/hub",
    }
)

# Read additional settings from a .env file, if one is present.
dotenv.load_dotenv()

# The ASGI application object.
app = FastAPI()

# Wide-open CORS policy (any origin, method and header); handy for dev/testing.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)

# Working directories: uploads are written to UPLOAD_DIR, pipeline results to
# OUTPUT_DIR. Both are created at import time if they do not exist yet.
UPLOAD_DIR = "/tmp/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)
OUTPUT_DIR = "/tmp/output"
os.makedirs(OUTPUT_DIR, exist_ok=True)
def docling_process_files(file_list: List[str]) -> str:
    """Run the smoldocling pipeline on ``file_list`` and return stitched text.

    Every path is handed to ``cli.process_files`` with JSON output requested
    in ``OUTPUT_DIR``. The overlay and the returned text are then built from
    the FIRST path only.

    Args:
        file_list: Paths of the documents to process. Must not be empty.

    Returns:
        Whatever ``cli.stitch_text_from_json`` yields for the first file's
        JSON output, with GPT fixing disabled.

    Raises:
        ValueError: If ``file_list`` is empty.
    """
    # Fail fast: without this, the pipeline runs on nothing and the function
    # then dies with an opaque IndexError on file_list[0].
    if not file_list:
        raise ValueError("file_list must contain at least one file path.")

    cli.process_files(file_list, OUTPUT_DIR, output_format='json')

    # Normalise Windows separators so basename() splits on them as well.
    file_path = file_list[0].replace('\\', '/')
    file_name = os.path.splitext(os.path.basename(file_path))[0]

    # Presumably cli.process_files writes "<stem>.json" into OUTPUT_DIR —
    # TODO confirm against the smoldocling CLI.
    json_output = os.path.join(OUTPUT_DIR, f"{file_name}.json")
    overlay_html = os.path.join(OUTPUT_DIR, f"{file_name}_overlay.html")

    # Generate overlay (optional)
    cli.generate_docling_overlay(file_path, json_output, overlay_html)

    # Stitch final cleaned text (gpt_fix can be toggled here)
    cleaned_text = cli.stitch_text_from_json(json_output, gpt_fix=False)
    return cleaned_text
def root():
    """Return a JSON response carrying a fixed "Root is working" message."""
    # NOTE(review): no route decorator is visible on this function in this
    # view — confirm it is registered with `app`.
    payload = {"message": "Root is working"}
    return JSONResponse(content=payload)
def health_check():
    """Return a JSON response with a constant ``{"status": "ok"}`` body."""
    # NOTE(review): no route decorator is visible on this function in this
    # view — confirm it is registered with `app`.
    payload = {"status": "ok"}
    return JSONResponse(content=payload)
def _safe_upload_name(filename):
    """Return the last path component of a client-supplied name, or None if unusable."""
    if not filename or "\x00" in filename:
        return None
    # Treat backslashes as separators too, so Windows-style paths are reduced
    # the same way as POSIX ones.
    candidate = os.path.basename(filename.replace("\\", "/"))
    if candidate in ("", ".", ".."):
        return None
    return candidate


async def parse_docling(file: UploadFile = File(...)):
    """Save an uploaded document, run it through docling, and return its text.

    Args:
        file: The uploaded file. Its contents are copied to ``UPLOAD_DIR``
            under the final component of its file name.

    Returns:
        ``JSONResponse`` with ``{"text": ...}`` on success, or a 500 response
        with ``{"error": ...}`` if processing raises.

    Raises:
        HTTPException: 400 if no file was supplied or its name is unusable.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view — confirm it is registered with `app`.
    if not file:
        raise HTTPException(status_code=400, detail="No file uploaded.")

    # file.filename comes from the client. Joining it unchecked would let
    # "../x" or an absolute path escape UPLOAD_DIR, so keep only the base name.
    safe_name = _safe_upload_name(file.filename)
    if safe_name is None:
        raise HTTPException(status_code=400, detail="Invalid file name.")

    save_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(save_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # NOTE(review): docling_process_files is a plain synchronous call inside
    # an async handler — consider offloading it if request latency matters.
    try:
        text_output = docling_process_files([save_path])
        return JSONResponse(content={"text": text_output})
    except Exception as e:
        # Endpoint boundary: report any processing failure as a 500 payload.
        return JSONResponse(status_code=500, content={"error": str(e)})