Spaces:
Running
Running
# PDF handling
PyMuPDF  # provides the `fitz` module; use this instead of the PyPI 'fitz' package to avoid the 'frontend' import conflict
pdf2image  # PDF → PIL images wrapper
# OCR & layout
paddleocr  # PaddleOCR toolkit
paddlepaddle  # Paddle backend for PaddleOCR
# Table extraction
camelot-py[base]  # Camelot's core + cv dependencies
# Data processing
numpy
pandas
# NLP & ML
spacy
transformers
torch
tqdm
# Vision (only needed if any OpenCV operations are used)
opencv-python
# HTML parsing (if needed)
beautifulsoup4
# System dependency (note: poppler-utils is a system package, not a pip package)
# poppler-utils  ← required by pdf2image; install via apt/conda, not pip
# spaCy small English model, installed directly from its wheel URL
en_core_web_sm @ https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl