Spaces:
Sleeping
Sleeping
| """ | |
| Unified Document Extraction API - Docling + DocStrange | |
| Deploy this as a SINGLE app on Hugging Face Spaces | |
| Provides both Docling AND DocStrange extraction in one service | |
| """ | |
| import os | |
| import sys | |
| import tempfile | |
| from pathlib import Path | |
| from fastapi import FastAPI, File, UploadFile, HTTPException, Query | |
| from fastapi.responses import JSONResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| import uvicorn | |
| # ============================================================================ | |
| # INITIALIZATION | |
| # ============================================================================ | |
# Docling setup: record whether the optional dependency can be imported.
# The converter instance itself starts out unset.
docling_converter = None
try:
    from docling.document_converter import DocumentConverter
except ImportError:
    HAS_DOCLING = False
else:
    HAS_DOCLING = True
# DocStrange setup: record whether the optional dependency can be imported.
# The extractor instance itself starts out unset.
docstrange_extractor = None
# Put the sibling "docstrange" directory first on the import path.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'docstrange'))
try:
    from docstrange import DocumentExtractor
except ImportError:
    HAS_DOCTSTRANGE = False
else:
    HAS_DOCTSTRANGE = True
# FastAPI application object shared by every handler in this module.
app = FastAPI(
    version="2.0.0",
    title="Unified Document Extraction API",
    description="Extract documents using Docling OR DocStrange AI engines",
)

# Allow CORS for DataSync integration.
# NOTE(review): wildcard origins together with allow_credentials=True lets any
# site make credentialed requests — confirm this is intended.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)
| # ============================================================================ | |
| # LAZY INITIALIZATION | |
| # ============================================================================ | |
def get_docling_converter():
    """Return the shared Docling converter, creating it on first use.

    Returns:
        The module-level ``DocumentConverter`` instance, or ``None`` when
        Docling is not available and no converter has been created.
    """
    global docling_converter
    # Reuse an existing instance; never try to build one without Docling.
    if docling_converter is not None or not HAS_DOCLING:
        return docling_converter
    docling_converter = DocumentConverter()
    return docling_converter
def get_docstrange_extractor():
    """Return the shared DocStrange extractor, creating it on first use.

    The extractor is created with ``gpu=True`` only when torch imports and
    reports CUDA as available; any failure of that probe falls back to
    ``gpu=False``.

    Returns:
        The module-level ``DocumentExtractor`` instance, or ``None`` when
        DocStrange is not available and no extractor has been created.
    """
    global docstrange_extractor
    if docstrange_extractor is None and HAS_DOCTSTRANGE:
        # Auto-detect GPU. Catch Exception rather than using a bare except so
        # KeyboardInterrupt/SystemExit are not swallowed by the probe.
        try:
            import torch
            gpu = torch.cuda.is_available()
        except Exception:
            gpu = False
        docstrange_extractor = DocumentExtractor(gpu=gpu)
    return docstrange_extractor
| # ============================================================================ | |
| # HEALTH & INFO ENDPOINTS | |
| # ============================================================================ | |
def root():
    """Return the basic health payload: status, service name, version, engines."""
    # NOTE(review): no route decorator is visible on this handler in this view —
    # confirm it is registered on `app`.
    engine_flags = {
        "docling": HAS_DOCLING,
        "docstrange": HAS_DOCTSTRANGE,
    }
    payload = {
        "status": "ok",
        "service": "Unified Document Extraction API",
        "version": "2.0.0",
        "engines": engine_flags,
    }
    return payload
def health():
    """Return a detailed health payload including GPU availability and VRAM.

    Returns:
        dict with ``status``, ``gpu`` (bool), ``vram`` (for example ``"15.7GB"``,
        or ``"N/A"`` when no GPU is usable) and the per-engine availability
        flags.
    """
    # NOTE(review): no route decorator is visible on this handler in this view —
    # confirm it is registered on `app`.
    gpu = False
    vram = "N/A"
    try:
        import torch
        gpu = torch.cuda.is_available()
        if gpu:
            # The device-properties attribute is `total_memory`; the previous
            # `total_mem` raised AttributeError, which was swallowed and made
            # GPU hosts report gpu=False.
            total_bytes = torch.cuda.get_device_properties(0).total_memory
            vram = f"{total_bytes/1024**3:.1f}GB"
    except Exception:
        # torch missing or the CUDA probe failed: report no GPU.
        gpu = False
        vram = "N/A"
    return {
        "status": "ok",
        "gpu": gpu,
        "vram": vram,
        "engines": {
            "docling": HAS_DOCLING,
            "docstrange": HAS_DOCTSTRANGE
        }
    }
def list_engines():
    """Describe each extraction engine and whether it is currently available."""
    # NOTE(review): no route decorator is visible on this handler in this view —
    # confirm it is registered on `app`.
    docling_entry = {
        "id": "docling",
        "name": "Docling AI",
        "available": HAS_DOCLING,
        "description": "Advanced document parsing with structure preservation",
    }
    docstrange_entry = {
        "id": "docstrange",
        "name": "DocStrange",
        "available": HAS_DOCTSTRANGE,
        "description": "GPU-accelerated intelligent document processing",
    }
    return {"engines": [docling_entry, docstrange_entry]}
| # ============================================================================ | |
| # EXTRACTION ENDPOINTS | |
| # ============================================================================ | |
async def convert_document(
    file: UploadFile = File(...),
    engine: str = Query("docling", description="Extraction engine: docling or docstrange"),
    output_format: str = Query("markdown", description="Output format: markdown, json, tables")
):
    """
    Convert an uploaded document using the specified engine.

    Args:
        file: Document file (PDF, DOCX, XLSX, Images, etc.)
        engine: docling or docstrange
        output_format: markdown, json, tables

    Returns:
        JSONResponse wrapping the dict produced by the engine-specific helper,
        with ``file_name`` set to the uploaded file's name.

    Raises:
        HTTPException: 400 for a missing file name, unknown engine or
            unsupported extension; 503 when the chosen engine is not
            installed; 500 when extraction itself fails.
    """
    # NOTE(review): no route decorator is visible on this handler in this view —
    # confirm it is registered on `app`.
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # Validate engine
    if engine not in ('docling', 'docstrange'):
        raise HTTPException(status_code=400, detail=f"Unknown engine: {engine}. Use 'docling' or 'docstrange'")

    # Check engine availability
    if engine == 'docling' and not HAS_DOCLING:
        raise HTTPException(status_code=503, detail="Docling engine not available")
    if engine == 'docstrange' and not HAS_DOCTSTRANGE:
        raise HTTPException(status_code=503, detail="DocStrange engine not available")

    # Validate file extension
    supported_extensions = ['.pdf', '.docx', '.xlsx', '.pptx', '.png', '.jpg', '.jpeg',
                            '.bmp', '.tiff', '.webp', '.gif', '.txt', '.html', '.md', '.csv']
    ext = Path(file.filename).suffix.lower()
    if ext not in supported_extensions:
        raise HTTPException(status_code=400, detail=f"Unsupported format: {ext}")

    tmp_path = None
    try:
        # Save the upload to disk. Record the path before writing so the file
        # is still cleaned up if reading or writing the upload fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        # Extract using selected engine
        if engine == 'docling':
            result = _extract_with_docling(tmp_path, output_format)
        else:  # docstrange
            result = _extract_with_docstrange(tmp_path, output_format)

        # The helpers only see the temp path, so they report a random temp
        # name; expose the name the client actually uploaded instead.
        result["file_name"] = file.filename
        return JSONResponse(content=result)
    except HTTPException:
        # Never rewrite a deliberate HTTP error into a generic 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}") from e
    finally:
        # Single cleanup path for both success and failure.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
async def convert_to_markdown(
    file: UploadFile = File(...),
    engine: str = Query("docling", description="docling or docstrange")
):
    """Extract document to markdown only (lightweight endpoint).

    Args:
        file: Uploaded document.
        engine: docling or docstrange.

    Returns:
        dict with ``success``, ``markdown``, ``engine`` and ``file_name``.

    Raises:
        HTTPException: 400 when no file name is supplied; 503 when the
            requested engine is unknown or not installed; 500 when
            extraction fails.
    """
    # NOTE(review): no route decorator is visible on this handler in this view —
    # confirm it is registered on `app`.
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    use_docling = engine == 'docling' and HAS_DOCLING
    use_docstrange = engine == 'docstrange' and HAS_DOCTSTRANGE
    # Check availability before touching the disk. Previously this 503 was
    # raised inside the try block and re-wrapped as a 500 by the generic
    # exception handler.
    if not (use_docling or use_docstrange):
        raise HTTPException(status_code=503, detail=f"{engine} engine not available")

    tmp_path = None
    try:
        suffix = Path(file.filename).suffix.lower()
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            # Record the path first so a failed read/write is still cleaned up.
            tmp_path = tmp.name
            tmp.write(await file.read())

        if use_docling:
            converter = get_docling_converter()
            result = converter.convert(tmp_path)
            markdown = result.document.export_to_markdown()
        else:
            extractor = get_docstrange_extractor()
            result = extractor.extract_document(tmp_path, output_format='markdown')
            markdown = result.get('data', '')

        return {
            "success": True,
            "markdown": markdown,
            "engine": engine,
            "file_name": file.filename
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
async def convert_tables(
    file: UploadFile = File(...),
    engine: str = Query("docling", description="docling or docstrange")
):
    """Extract tables only from document.

    Args:
        file: Uploaded document.
        engine: docling or docstrange.

    Returns:
        dict with ``success``, ``tables`` (one entry per table that could be
        exported, with ``table_index``, ``headers``, ``rows``, ``row_count``),
        ``tables_count``, ``engine`` and ``file_name``.

    Raises:
        HTTPException: 400 for a missing file name or unknown engine; 503
            when the requested engine is not installed; 500 when extraction
            fails.
    """
    # NOTE(review): no route decorator is visible on this handler in this view —
    # confirm it is registered on `app`.
    import logging

    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")
    # Previously an unknown or unavailable engine silently produced a
    # "success" response with zero tables.
    if engine not in ('docling', 'docstrange'):
        raise HTTPException(status_code=400, detail=f"Unknown engine: {engine}. Use 'docling' or 'docstrange'")
    if engine == 'docling' and not HAS_DOCLING:
        raise HTTPException(status_code=503, detail="Docling engine not available")
    if engine == 'docstrange' and not HAS_DOCTSTRANGE:
        raise HTTPException(status_code=503, detail="DocStrange engine not available")

    tables_data = []
    tmp_path = None
    try:
        suffix = Path(file.filename).suffix.lower()
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            # Record the path first so a failed read/write is still cleaned up.
            tmp_path = tmp.name
            tmp.write(await file.read())

        if engine == 'docling':
            converter = get_docling_converter()
            result = converter.convert(tmp_path)
            for table_idx, table in enumerate(result.document.tables):
                try:
                    df = table.export_to_dataframe()
                    tables_data.append({
                        "table_index": table_idx,
                        "headers": list(df.columns),
                        "rows": df.to_dict('records'),
                        "row_count": len(df)
                    })
                except Exception:
                    # Best effort: skip a table that cannot be exported, but
                    # leave a trace instead of hiding the failure.
                    logging.getLogger(__name__).warning(
                        "Skipping table %d: export failed", table_idx, exc_info=True
                    )
        # NOTE(review): there is no table extraction path for docstrange here,
        # so that engine always reports zero tables — confirm intended.

        return {
            "success": True,
            "tables": tables_data,
            "tables_count": len(tables_data),
            "engine": engine,
            "file_name": file.filename
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
| # ============================================================================ | |
| # ENGINE-SPECIFIC EXTRACTION FUNCTIONS | |
| # ============================================================================ | |
def _extract_with_docling(file_path, output_format):
    """Extract a document with Docling and build the response payload.

    Args:
        file_path: Path of the document on disk.
        output_format: Requested format; ``'json'`` and ``'tables'``
            additionally include per-table data under ``document.tables``.

    Returns:
        dict with ``success``, ``file_name`` (basename of ``file_path``),
        ``engine``, ``format``, ``document`` (markdown, page count, table
        count and optionally tables) and ``metadata``.

    Raises:
        RuntimeError: If no Docling converter is available.
    """
    import logging

    converter = get_docling_converter()
    if converter is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError("Docling engine not available")

    result = converter.convert(file_path)
    doc = result.document
    response = {
        "success": True,
        "file_name": os.path.basename(file_path),
        "engine": "docling",
        "format": output_format,
        "document": {
            "markdown": doc.export_to_markdown(),
            "num_pages": len(doc.pages) if hasattr(doc, 'pages') else 0,
            "tables_count": len(doc.tables)
        },
        "metadata": {
            "engine": "docling",
            "model": "docling-default"
        }
    }

    # Add tables if requested
    if output_format in ('json', 'tables'):
        tables_data = []
        for table_idx, table in enumerate(doc.tables):
            try:
                df = table.export_to_dataframe()
                tables_data.append({
                    "table_index": table_idx,
                    # Same shape as the tables-only endpoint.
                    "headers": list(df.columns),
                    "rows": df.to_dict('records'),
                    "row_count": len(df)
                })
            except Exception:
                # Best effort: skip a table that cannot be exported, but
                # leave a trace instead of hiding the failure.
                logging.getLogger(__name__).warning(
                    "Skipping table %d: export failed", table_idx, exc_info=True
                )
        response['document']['tables'] = tables_data
    return response
def _extract_with_docstrange(file_path, output_format):
    """Extract a document with DocStrange and build the response payload.

    Args:
        file_path: Path of the document on disk.
        output_format: Format forwarded to the extractor; also used as the
            reported format when the extractor result carries none.

    Returns:
        dict with ``success``, ``file_name`` (basename of ``file_path``),
        ``engine``, ``format``, ``data`` and ``metadata``.
    """
    extractor = get_docstrange_extractor()
    extraction = extractor.extract_document(file_path, output_format=output_format)
    # Look the metadata up once; both fields below default when it is absent.
    meta = extraction.get('metadata', {})
    return {
        "success": True,
        "file_name": os.path.basename(file_path),
        "engine": "docstrange",
        "format": extraction.get('format', output_format),
        "data": extraction.get('data', {}),
        "metadata": {
            "engine": "docstrange",
            "file_size": meta.get('file_size', 0),
            "gpu_mode": meta.get('gpu_mode', False),
        },
    }
| # ============================================================================ | |
| # MAIN ENTRY POINT | |
| # ============================================================================ | |
if __name__ == "__main__":
    # Print a startup banner with engine availability, then serve on port 7860.
    rule = "=" * 60
    banner_lines = [
        "\n" + rule,
        "Unified Document Extraction API",
        rule,
        f"Docling: {'✅ Available' if HAS_DOCLING else '❌ Not installed'}",
        f"DocStrange: {'✅ Available' if HAS_DOCTSTRANGE else '❌ Not installed'}",
        rule,
        "URL: http://localhost:7860",
        "Docs: http://localhost:7860/docs",
        rule + "\n",
    ]
    for banner_line in banner_lines:
        print(banner_line)
    uvicorn.run("app:app", host="0.0.0.0", port=7860)