from docling.document_converter import DocumentConverter
from paddleocr import PaddleOCR
from llama_parse import LlamaParse
from pdf2image import convert_from_path
import numpy as np
import os

llama_key = os.getenv('LLAMA_INDEX_API_KEY')
def process_text(res):
    """Collect Docling text items into a dict keyed by page number."""
    page_texts = {}
    texts = res.get('texts', [])
    for item in texts:
        for prov in item['prov']:
            page_no = prov['page_no']
            text = item['text']
            page_key = f'{page_no}'
            if page_key not in page_texts:
                page_texts[page_key] = text
            else:
                page_texts[page_key] += ' ' + text
    return page_texts
def get_table_text(grids):
    """Flatten a Docling table grid into comma-separated text, one row per line."""
    table_text = "Here is a Table : \n"
    for row in grids:
        for col in row:
            val = col.get('text')
            table_text += f'{val} ,'
        table_text += '\n'
    return table_text
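
# A minimal sketch of the grid shape get_table_text expects: Docling exports a
# table 'grid' as a list of rows, each row a list of cell dicts carrying a
# 'text' key. The sample values below are invented for illustration only.
_EXAMPLE_GRID = [
    [{'text': 'Name'}, {'text': 'Score'}],
    [{'text': 'Alice'}, {'text': '9.5'}],
]
# get_table_text(_EXAMPLE_GRID) would return:
# "Here is a Table : \nName ,Score ,\nAlice ,9.5 ,\n"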
def process_tables(res, page_texts=None):
    """Append Docling table content to the per-page text dict."""
    if page_texts is None:
        page_texts = {}
    try:
        tables = res.get('tables', [])
        if not isinstance(tables, list):
            raise ValueError("Expected 'tables' to be a list.")
        for table in tables:
            try:
                # Ensure 'prov' exists and has the necessary structure
                prov = table.get('prov', [])
                if not prov or not isinstance(prov, list):
                    raise ValueError("Missing or invalid 'prov' structure in table.")
                page_no = prov[0].get('page_no')
                if page_no is None:
                    raise ValueError("Missing or invalid 'page_no' in 'prov'.")
                page_no = str(page_no)
                # Ensure 'data' and 'grid' exist
                data = table.get('data', {})
                grids = data.get('grid', [])
                if not isinstance(grids, list):
                    raise ValueError("Missing or invalid 'grid' structure in 'data'.")
                # Flatten the grid into text
                text = get_table_text(grids)
                if not isinstance(text, str):
                    raise ValueError("get_table_text did not return a string.")
                # Append the table text to the page's accumulated text
                if page_no not in page_texts:
                    page_texts[page_no] = text
                else:
                    page_texts[page_no] += '\n' + text
                print(f"Processed page {page_no}")
            except Exception as table_error:
                print(f"Error processing table: {table_error}")
        return page_texts
    except Exception as e:
        print(f"Error processing tables: {e}")
        return page_texts
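
# Hedged usage sketch: a tiny fake Docling-style export dict showing how
# process_text and process_tables fold plain text and table content into the
# same per-page dictionary. The keys mirror those accessed above; the values
# are made up, and this helper is illustrative only (it is never called).
def _demo_docling_merge():
    fake_res = {
        'texts': [{'text': 'Intro paragraph', 'prov': [{'page_no': 1}]}],
        'tables': [{'prov': [{'page_no': 1}],
                    'data': {'grid': [[{'text': 'A'}, {'text': 'B'}]]}}],
    }
    pages = process_text(fake_res)
    pages = process_tables(fake_res, pages)
    # pages == {'1': 'Intro paragraph\nHere is a Table : \nA ,B ,\n'}
    return pages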
def process_docs(doc_path):
    """
    Process the uploaded PDF document with LlamaParse, PaddleOCR, and Docling.
    Args:
        doc_path (str): Path to the uploaded PDF document.
    Returns:
        docs, docs2, docs3: Per-page text from LlamaParse (markdown strings),
        PaddleOCR (plain-text strings), and Docling (dict mapping page numbers
        to merged text and table content).
    """
    ## LlamaParse
    parser = LlamaParse(
        api_key=llama_key,
        result_type='markdown',
        verbose=True,
        language='en',
        num_workers=2
    )
    documents = parser.load_data(doc_path)
    docs = [doc.text for doc in documents]
    ## PaddleOCR
    ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
    images_pdf = convert_from_path(doc_path, 300)
    documents2 = []
    for image in images_pdf:
        result = ocr.ocr(np.array(image), cls=True)
        # result[0] may be None when no text is detected on a page
        lines = result[0] if result and result[0] else []
        text = " ".join([line[1][0] for line in lines])
        documents2.append(text)
    docs2 = documents2
    ## Docling
    converter = DocumentConverter()
    result = converter.convert(doc_path)
    res = result.document.export_to_dict()
    docs3 = process_text(res)
    docs3 = process_tables(res, docs3)
    return docs, docs2, docs3
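
# Hedged usage sketch: running the full pipeline on a local PDF. 'sample.pdf'
# is a placeholder path; this assumes LLAMA_INDEX_API_KEY is set, poppler is
# available for pdf2image, and PaddleOCR is installed with GPU support.
if __name__ == '__main__':
    llama_docs, paddle_docs, docling_pages = process_docs('sample.pdf')
    print(f"LlamaParse pages: {len(llama_docs)}")
    print(f"PaddleOCR pages: {len(paddle_docs)}")
    for page_no, page_text in docling_pages.items():
        print(f"Docling page {page_no}: {page_text[:80]}")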