Spaces:

arjunbhargav212
/

docling-processor

Sleeping

App Files Files Community

docling-processor / app.py

arjunbhargav212

Upload 4 files

dc23f92 verified about 2 months ago

raw

history blame contribute delete

12.1 kB

	"""
	Unified Document Extraction API - Docling + DocStrange
	Deploy this as a SINGLE app on Hugging Face Spaces
	Provides both Docling AND DocStrange extraction in one service
	"""
	import logging
	import os
	import sys
	import tempfile
	from pathlib import Path

	from fastapi import FastAPI, File, UploadFile, HTTPException, Query
	from fastapi.responses import JSONResponse
	from fastapi.middleware.cors import CORSMiddleware
	import uvicorn

	# ============================================================================
	# INITIALIZATION
	# ============================================================================

	# Docling setup: the engine is optional, so probe for it once at import time
	# and record the outcome instead of failing when the package is missing.
	docling_converter = None  # filled in on first use by get_docling_converter()
	try:
	    from docling.document_converter import DocumentConverter
	except ImportError:
	    HAS_DOCLING = False
	else:
	    HAS_DOCLING = True

	# DocStrange setup: same optional-import probe. The 'docstrange' directory
	# next to this file is put at the front of sys.path before the import is tried.
	docstrange_extractor = None  # filled in on first use by get_docstrange_extractor()
	sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'docstrange'))
	try:
	    from docstrange import DocumentExtractor
	except ImportError:
	    HAS_DOCTSTRANGE = False
	else:
	    HAS_DOCTSTRANGE = True

	# The ASGI application object; the title, description and version below are
	# passed straight to FastAPI.
	app = FastAPI(
	    title="Unified Document Extraction API",
	    description="Extract documents using Docling OR DocStrange AI engines",
	    version="2.0.0",
	)

	# Allow CORS for DataSync integration: every origin, method and header is
	# accepted.
	# NOTE(review): wildcard origins combined with allow_credentials=True is a very
	# permissive policy - confirm that credentialed cross-origin calls are needed.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_methods=["*"],
	    allow_headers=["*"],
	    allow_credentials=True,
	)


	# ============================================================================
	# LAZY INITIALIZATION
	# ============================================================================

	def get_docling_converter():
	    """Return the shared Docling converter, building it on the first call.

	    Returns:
	        The module-level ``DocumentConverter`` instance, or ``None`` when
	        Docling could not be imported (``HAS_DOCLING`` is false).
	    """
	    global docling_converter
	    already_built = docling_converter is not None
	    if already_built or not HAS_DOCLING:
	        # Nothing to create: either it is cached or the engine is missing.
	        return docling_converter
	    docling_converter = DocumentConverter()
	    return docling_converter


	def get_docstrange_extractor():
	    """Return the shared DocStrange extractor, creating it on first use.

	    The extractor is created with ``gpu=True`` only when ``torch`` imports
	    and reports an available CUDA device; otherwise ``gpu=False`` is used.

	    Returns:
	        The module-level ``DocumentExtractor`` instance, or ``None`` when
	        DocStrange could not be imported (``HAS_DOCTSTRANGE`` is false).
	    """
	    global docstrange_extractor
	    if docstrange_extractor is None and HAS_DOCTSTRANGE:
	        # Auto-detect GPU. torch is optional, so any failure to import or
	        # probe it falls back to CPU mode. Catch Exception rather than using
	        # a bare except so KeyboardInterrupt/SystemExit still propagate.
	        try:
	            import torch
	            gpu = torch.cuda.is_available()
	        except Exception:
	            gpu = False
	        docstrange_extractor = DocumentExtractor(gpu=gpu)
	    return docstrange_extractor


	# ============================================================================
	# HEALTH & INFO ENDPOINTS
	# ============================================================================

	@app.get("/")
	def root():
	    """Health check: report the service identity and which engines imported."""
	    engine_flags = {"docling": HAS_DOCLING, "docstrange": HAS_DOCTSTRANGE}
	    payload = {
	        "status": "ok",
	        "service": "Unified Document Extraction API",
	        "version": "2.0.0",
	    }
	    payload["engines"] = engine_flags
	    return payload


	@app.get("/health")
	def health():
	    """Detailed health check.

	    Returns:
	        A dict with ``status``, ``gpu`` (whether torch reports a CUDA device),
	        ``vram`` (total memory of CUDA device 0 formatted like ``"8.0GB"``,
	        or ``"N/A"``) and the per-engine availability flags.
	    """
	    gpu = False
	    vram = "N/A"
	    try:
	        import torch
	        if torch.cuda.is_available():
	            # The device-properties attribute is `total_memory` (bytes).
	            # The previous `total_mem` raised AttributeError, which the
	            # blanket handler swallowed, so GPUs were never reported.
	            total_bytes = torch.cuda.get_device_properties(0).total_memory
	            vram = f"{total_bytes / 1024**3:.1f}GB"
	            gpu = True
	    except Exception:
	        # torch missing or the CUDA probe failed: report no GPU.
	        gpu = False
	        vram = "N/A"

	    return {
	        "status": "ok",
	        "gpu": gpu,
	        "vram": vram,
	        "engines": {
	            "docling": HAS_DOCLING,
	            "docstrange": HAS_DOCTSTRANGE
	        }
	    }


	@app.get("/engines")
	def list_engines():
	    """Describe each extraction engine and whether it is currently usable."""
	    # (id, display name, availability flag, description) per engine.
	    catalog = (
	        ("docling", "Docling AI", HAS_DOCLING,
	         "Advanced document parsing with structure preservation"),
	        ("docstrange", "DocStrange", HAS_DOCTSTRANGE,
	         "GPU-accelerated intelligent document processing"),
	    )
	    return {
	        "engines": [
	            {
	                "id": engine_id,
	                "name": label,
	                "available": is_available,
	                "description": blurb,
	            }
	            for engine_id, label, is_available, blurb in catalog
	        ]
	    }


	# ============================================================================
	# EXTRACTION ENDPOINTS
	# ============================================================================

	@app.post("/convert")
	async def convert_document(
	    file: UploadFile = File(...),
	    engine: str = Query("docling", description="Extraction engine: docling or docstrange"),
	    output_format: str = Query("markdown", description="Output format: markdown, json, tables")
	):
	    """
	    Convert document using specified engine

	    Args:
	        file: Document file (PDF, DOCX, XLSX, Images, etc.)
	        engine: docling or docstrange
	        output_format: markdown, json, tables (passed through to the engine)

	    Returns: JSON with extracted data. ``file_name`` is the uploaded file's
	        name, matching the other /convert/* endpoints.

	    Raises:
	        HTTPException: 400 for a missing filename, unknown engine or
	            unsupported extension; 503 when the requested engine is not
	            installed; 500 when extraction itself fails.
	    """
	    if not file.filename:
	        raise HTTPException(status_code=400, detail="No file provided")

	    # Validate engine
	    if engine not in ('docling', 'docstrange'):
	        raise HTTPException(status_code=400, detail=f"Unknown engine: {engine}. Use 'docling' or 'docstrange'")

	    # Check engine availability
	    if engine == 'docling' and not HAS_DOCLING:
	        raise HTTPException(status_code=503, detail="Docling engine not available")
	    if engine == 'docstrange' and not HAS_DOCTSTRANGE:
	        raise HTTPException(status_code=503, detail="DocStrange engine not available")

	    # Validate file extension
	    supported_extensions = ('.pdf', '.docx', '.xlsx', '.pptx', '.png', '.jpg', '.jpeg',
	                            '.bmp', '.tiff', '.webp', '.gif', '.txt', '.html', '.md', '.csv')
	    ext = Path(file.filename).suffix.lower()
	    if ext not in supported_extensions:
	        raise HTTPException(status_code=400, detail=f"Unsupported format: {ext}")

	    tmp_path = None
	    try:
	        content = await file.read()

	        # Save uploaded file temporarily. The path is recorded before
	        # writing so the file is still removed if the write fails.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
	            tmp_path = tmp.name
	            tmp.write(content)

	        # Extract using selected engine
	        if engine == 'docling':
	            result = _extract_with_docling(tmp_path, output_format)
	        else:  # docstrange
	            result = _extract_with_docstrange(tmp_path, output_format)

	        # The extractors only see the temp path, so they report its random
	        # name; replace it with the name the client uploaded.
	        result["file_name"] = file.filename

	        return JSONResponse(content=result)

	    except HTTPException:
	        # Never rewrap an HTTP error that already has a status code.
	        raise
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}") from e
	    finally:
	        # Single cleanup path for success and failure alike.
	        if tmp_path is not None:
	            try:
	                os.unlink(tmp_path)
	            except OSError:
	                pass  # best effort: the temp file may already be gone


	@app.post("/convert/markdown")
	async def convert_to_markdown(
	    file: UploadFile = File(...),
	    engine: str = Query("docling", description="docling or docstrange")
	):
	    """Extract document to markdown only (lightweight endpoint)

	    Args:
	        file: Uploaded document.
	        engine: docling or docstrange.

	    Returns:
	        Dict with ``success``, ``markdown``, ``engine`` and ``file_name``.

	    Raises:
	        HTTPException: 400 for a missing filename or unknown engine, 503
	            when the requested engine is not installed, 500 when the
	            extraction itself fails.
	    """
	    # Validate up front, outside the try block, so these status codes are
	    # not rewritten to 500 by the generic handler below.
	    if not file.filename:
	        raise HTTPException(status_code=400, detail="No file provided")
	    if engine not in ('docling', 'docstrange'):
	        raise HTTPException(status_code=400, detail=f"Unknown engine: {engine}. Use 'docling' or 'docstrange'")
	    if (engine == 'docling' and not HAS_DOCLING) or (engine == 'docstrange' and not HAS_DOCTSTRANGE):
	        raise HTTPException(status_code=503, detail=f"{engine} engine not available")

	    tmp_path = None
	    try:
	        content = await file.read()

	        # Record the path before writing so a failed write is still cleaned up.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix.lower()) as tmp:
	            tmp_path = tmp.name
	            tmp.write(content)

	        if engine == 'docling':
	            converter = get_docling_converter()
	            result = converter.convert(tmp_path)
	            markdown = result.document.export_to_markdown()
	        else:  # docstrange
	            extractor = get_docstrange_extractor()
	            result = extractor.extract_document(tmp_path, output_format='markdown')
	            markdown = result.get('data', '')

	    except HTTPException:
	        raise
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e
	    finally:
	        if tmp_path is not None:
	            try:
	                os.unlink(tmp_path)
	            except OSError:
	                pass  # best effort: the temp file may already be gone

	    return {
	        "success": True,
	        "markdown": markdown,
	        "engine": engine,
	        "file_name": file.filename
	    }


	@app.post("/convert/tables")
	async def convert_tables(
	    file: UploadFile = File(...),
	    engine: str = Query("docling", description="docling or docstrange")
	):
	    """Extract tables only from document

	    Only the Docling engine has a table export path in this endpoint.

	    Args:
	        file: Uploaded document.
	        engine: docling or docstrange.

	    Returns:
	        Dict with ``success``, ``tables`` (one entry per exported table with
	        ``table_index``, ``headers``, ``rows``, ``row_count``),
	        ``tables_count``, ``engine`` and ``file_name``.

	    Raises:
	        HTTPException: 400 for a missing filename or unknown engine, 501 for
	            ``engine=docstrange`` (no table export implemented here), 503
	            when Docling is not installed, 500 when conversion fails.
	    """
	    if not file.filename:
	        raise HTTPException(status_code=400, detail="No file provided")
	    if engine not in ('docling', 'docstrange'):
	        raise HTTPException(status_code=400, detail=f"Unknown engine: {engine}. Use 'docling' or 'docstrange'")
	    if engine == 'docstrange':
	        # Previously this fell through and reported success with zero
	        # tables, which is indistinguishable from "document has no tables".
	        raise HTTPException(status_code=501, detail="Table extraction is not implemented for the docstrange engine")
	    if not HAS_DOCLING:
	        raise HTTPException(status_code=503, detail=f"{engine} engine not available")

	    tmp_path = None
	    try:
	        content = await file.read()

	        # Record the path before writing so a failed write is still cleaned up.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix.lower()) as tmp:
	            tmp_path = tmp.name
	            tmp.write(content)

	        tables_data = []
	        converter = get_docling_converter()
	        result = converter.convert(tmp_path)
	        for table_idx, table in enumerate(result.document.tables):
	            try:
	                df = table.export_to_dataframe()
	                tables_data.append({
	                    "table_index": table_idx,
	                    "headers": list(df.columns),
	                    "rows": df.to_dict('records'),
	                    "row_count": len(df)
	                })
	            except Exception:
	                # Best effort: one bad table must not fail the request,
	                # but leave a trace instead of dropping it silently.
	                logging.getLogger(__name__).warning(
	                    "Skipping table %d: export failed", table_idx, exc_info=True
	                )

	    except HTTPException:
	        raise
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e
	    finally:
	        if tmp_path is not None:
	            try:
	                os.unlink(tmp_path)
	            except OSError:
	                pass  # best effort: the temp file may already be gone

	    return {
	        "success": True,
	        "tables": tables_data,
	        "tables_count": len(tables_data),
	        "engine": engine,
	        "file_name": file.filename
	    }


	# ============================================================================
	# ENGINE-SPECIFIC EXTRACTION FUNCTIONS
	# ============================================================================

	def _extract_with_docling(file_path, output_format):
	    """Extract using Docling

	    Args:
	        file_path: Path of the document on disk.
	        output_format: Requested format; ``'json'`` and ``'tables'``
	            additionally attach the exported tables to the response.

	    Returns:
	        Dict with ``success``, ``file_name`` (basename of ``file_path``),
	        ``engine``, ``format``, a ``document`` section (markdown, page
	        count, table count and optionally ``tables``) and ``metadata``.
	    """
	    converter = get_docling_converter()
	    result = converter.convert(file_path)
	    doc = result.document

	    response = {
	        "success": True,
	        "file_name": os.path.basename(file_path),
	        "engine": "docling",
	        "format": output_format,
	        "document": {
	            "markdown": doc.export_to_markdown(),
	            "num_pages": len(doc.pages) if hasattr(doc, 'pages') else 0,
	            "tables_count": len(doc.tables)
	        },
	        "metadata": {
	            "engine": "docling",
	            "model": "docling-default"
	        }
	    }

	    # Add tables if requested
	    if output_format in ('json', 'tables'):
	        tables_data = []
	        for table_idx, table in enumerate(doc.tables):
	            try:
	                df = table.export_to_dataframe()
	                tables_data.append({
	                    "table_index": table_idx,
	                    # Same shape as the /convert/tables endpoint emits.
	                    "headers": list(df.columns),
	                    "rows": df.to_dict('records'),
	                    "row_count": len(df)
	                })
	            except Exception:
	                # Best effort: skip a table that cannot be exported, but
	                # log it rather than swallowing the failure silently.
	                logging.getLogger(__name__).warning(
	                    "Skipping table %d: export failed", table_idx, exc_info=True
	                )
	        response['document']['tables'] = tables_data

	    return response


	def _extract_with_docstrange(file_path, output_format):
	    """Run DocStrange on ``file_path`` and wrap its result for the API.

	    Args:
	        file_path: Path of the document on disk.
	        output_format: Format forwarded to the extractor.

	    Returns:
	        Dict with ``success``, ``file_name`` (basename of ``file_path``),
	        ``engine``, ``format``, ``data`` and a ``metadata`` section. Keys
	        missing from the extractor's result fall back to defaults.
	    """
	    extractor = get_docstrange_extractor()
	    raw = extractor.extract_document(file_path, output_format=output_format)

	    base_name = os.path.basename(file_path)
	    reported_format = raw.get('format', output_format)
	    payload = raw.get('data', {})
	    meta = raw.get('metadata', {})

	    return {
	        "success": True,
	        "file_name": base_name,
	        "engine": "docstrange",
	        "format": reported_format,
	        "data": payload,
	        "metadata": {
	            "engine": "docstrange",
	            "file_size": meta.get('file_size', 0),
	            "gpu_mode": meta.get('gpu_mode', False),
	        },
	    }


	# ============================================================================
	# MAIN ENTRY POINT
	# ============================================================================

	if __name__ == "__main__":
	    # Print a startup banner showing which engines imported, then serve.
	    rule = "=" * 60
	    docling_status = '✅ Available' if HAS_DOCLING else '❌ Not installed'
	    docstrange_status = '✅ Available' if HAS_DOCTSTRANGE else '❌ Not installed'
	    banner = [
	        "\n" + rule,
	        "Unified Document Extraction API",
	        rule,
	        f"Docling: {docling_status}",
	        f"DocStrange: {docstrange_status}",
	        rule,
	        "URL: http://localhost:7860",
	        "Docs: http://localhost:7860/docs",
	        rule + "\n",
	    ]
	    for banner_line in banner:
	        print(banner_line)

	    uvicorn.run("app:app", host="0.0.0.0", port=7860)