parser_benchmarking / helper.py
Gopal2002's picture
Added LLama Module (#2)
2df4c2a verified
raw
history blame
3.78 kB
from docling.document_converter import DocumentConverter
from paddleocr import PaddleOCR
from llama_parse import LlamaParse
from pdf2image import convert_from_path
import numpy as np
import os
llama_key = os.getenv('LLAMA_INDEX_API_KEY')
def process_text(res):
    """Collect per-page plain text from a docling document dict.

    Args:
        res (dict): Output of ``DoclingDocument.export_to_dict()``. Only the
            ``'texts'`` list is consulted; each item is expected to carry a
            ``'text'`` string and a ``'prov'`` list of ``{'page_no': ...}``.

    Returns:
        dict[str, str]: Mapping of page number (as a string key) to the
        space-joined text of every item attributed to that page, in document
        order.
    """
    page_texts = {}
    # `or []` guards against a missing/None 'texts' entry, which would
    # otherwise raise TypeError when iterated.
    for item in res.get('texts') or []:
        text = item['text']
        # Items without provenance can't be attributed to a page; skip them.
        for prov in item.get('prov', []):
            page_key = str(prov['page_no'])
            if page_key in page_texts:
                page_texts[page_key] += ' ' + text
            else:
                page_texts[page_key] = text
    return page_texts
def get_table_text(grids):
    """Render a docling table grid as a plain-text block.

    Args:
        grids (list[list[dict]]): Rows of cells, each cell a dict whose
            ``'text'`` value is written out (missing keys render as ``None``).

    Returns:
        str: A header line followed by one line per row; every cell is
        terminated by ``" ,"`` and every row by a newline.
    """
    header = "Here is a Table : \n"
    rendered_rows = (
        ''.join(f'{cell.get("text")} ,' for cell in row) + '\n'
        for row in grids
    )
    return header + ''.join(rendered_rows)
def process_tables(res, page_texts=None):
    """Append rendered table text to a per-page text mapping.

    Args:
        res (dict): Output of ``DoclingDocument.export_to_dict()``; only the
            ``'tables'`` list is consulted.
        page_texts (dict | None): Existing page-number -> text mapping to
            extend (typically the result of ``process_text``). A fresh dict
            is created when omitted.

    Returns:
        dict[str, str]: ``page_texts`` with each table's text appended to its
        page entry. Malformed tables are reported and skipped (best-effort).
    """
    # A mutable default argument ({}) would be shared across calls and leak
    # pages between documents; create the dict per-call instead.
    if page_texts is None:
        page_texts = {}
    try:
        tables = res.get('tables', [])
        if not isinstance(tables, list):
            raise ValueError("Expected 'tables' to be a list.")
        for table in tables:
            try:
                # Ensure 'prov' exists and has the necessary structure
                prov = table.get('prov', [])
                if not prov or not isinstance(prov, list):
                    raise ValueError("Missing or invalid 'prov' structure in table.")
                # Validate before str(): str(None) is the truthy 'None' and
                # would slip past a post-conversion emptiness check.
                raw_page_no = prov[0].get('page_no')
                if raw_page_no is None:
                    raise ValueError("Missing or invalid 'page_no' in 'prov'.")
                page_no = str(raw_page_no)
                # Ensure 'data' and 'grid' exist
                data = table.get('data', {})
                grids = data.get('grid', [])
                if not isinstance(grids, list):
                    raise ValueError("Missing or invalid 'grid' structure in 'data'.")
                # Process grid data into text
                text = get_table_text(grids)
                if not isinstance(text, str):
                    raise ValueError("get_table_text did not return a string.")
                # Add text to page_texts
                if page_no not in page_texts:
                    page_texts[page_no] = text
                else:
                    page_texts[page_no] += '\n' + text
                print(f"Processed page {page_no}")
            except Exception as table_error:
                # Best-effort: report the bad table and keep processing the rest.
                print(f"Error processing table: {table_error}")
        return page_texts
    except Exception as e:
        # Top-level guard: return whatever was accumulated so far.
        print(f"Error processing tables: {e}")
        return page_texts
def process_docs(doc_path):
    """Extract text from a PDF through three independent pipelines.

    Args:
        doc_path (str): Path to the PDF document to process.

    Returns:
        tuple: ``(docs, docs2, docs3)`` where
            - ``docs`` is a list of per-document markdown strings from
              LlamaParse (requires the ``LLAMA_INDEX_API_KEY`` env var),
            - ``docs2`` is a list of per-page OCR strings from PaddleOCR over
              300-DPI page renders,
            - ``docs3`` is a page-number -> text dict from docling (body text
              plus rendered tables).
    """
    ## LLama Parser
    llama_docs = LlamaParse(
        api_key=llama_key,
        result_type='markdown',
        verbose=True,
        language='en',
        num_workers=2
    ).load_data(doc_path)
    docs = [d.text for d in llama_docs]
    ## Paddle OCR
    engine = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
    docs2 = []
    for page_image in convert_from_path(doc_path, 300):
        ocr_result = engine.ocr(np.array(page_image), cls=True)
        page_text = " ".join(entry[1][0] for entry in ocr_result[0])
        docs2.append(page_text)
    ## Docling
    doc_dict = DocumentConverter().convert(doc_path).document.export_to_dict()
    docs3 = process_tables(doc_dict, process_text(doc_dict))
    return docs, docs2, docs3