# Python dependencies for the PDF / OCR / table-extraction pipeline.
# Install with:  pip install -r requirements.txt
# One requirement per line; anything after " #" on a line is a comment.

# --- PDF handling ---
# PyMuPDF provides the `fitz` import. Install this package rather than the
# unrelated PyPI package named "fitz", which causes the `frontend` import conflict.
PyMuPDF
pdf2image  # PDF -> PIL images wrapper (needs Poppler, see "System dependencies" below)

# --- OCR & layout ---
paddleocr     # PaddleOCR toolkit
paddlepaddle  # Paddle backend required by PaddleOCR

# --- Table extraction ---
camelot-py[base]  # Camelot core + cv dependencies

# --- Data processing ---
numpy
pandas

# --- NLP & ML ---
spacy
transformers
torch
tqdm

# --- Vision (if using any OpenCV ops) ---
opencv-python

# --- HTML parsing (if needed) ---
beautifulsoup4

# --- System dependencies (NOT installable via pip) ---
# poppler-utils is a system package used by pdf2image; install it via apt/conda:
#   apt-get install poppler-utils    (or)    conda install -c conda-forge poppler

# --- spaCy small English model ---
# Installed as a direct wheel reference (PEP 508 "name @ url" form).
en_core_web_sm @ https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl