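# NOTE: "Spaces: Runtime error" was a status banner captured from the hosting
# page together with this source; it is not part of the program.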
from typing import Optional

import numpy as np
from docling.document_converter import DocumentConverter
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
def process_text(res):
    """Group the text items of an exported document by page number.

    Args:
        res: Mapping with an optional ``'texts'`` entry. Each text item is
            read as ``item['text']`` and ``item['prov']``, where ``prov`` is
            an iterable of mappings carrying a ``'page_no'``.

    Returns:
        dict: Page number (as a string) mapped to the text of every item
        recorded on that page, joined with single spaces in document order.
        An item with several ``prov`` entries contributes to each of those
        pages. Empty when ``res`` has no ``'texts'`` entry.

    Raises:
        KeyError: If an item lacks ``'text'``/``'prov'`` or a ``prov`` entry
            lacks ``'page_no'``.
    """
    fragments_by_page = {}
    # `.get('texts')` yields None when the key is absent; treat that as
    # "no text items" instead of failing on iteration.
    for item in res.get('texts') or []:
        for prov in item['prov']:
            page_key = f"{prov['page_no']}"
            fragments_by_page.setdefault(page_key, []).append(item['text'])
    # Join once per page rather than growing each string with repeated `+=`.
    return {
        page_key: ' '.join(fragments)
        for page_key, fragments in fragments_by_page.items()
    }
def get_table_text(grids):
    """Render a table grid as plain text, one output line per row.

    Args:
        grids: Iterable of rows; each row is an iterable of cell mappings
            whose ``'text'`` entry is rendered. A cell without ``'text'``
            renders as ``None`` because ``.get('text')`` is formatted as-is.

    Returns:
        str: The header ``"Here is a Table : "`` on its own line, followed by
        one line per row in which every cell is written as ``"<text> ,"``.
    """
    lines = ["Here is a Table : \n"]
    for row in grids:
        cells = ''.join(f"{cell.get('text')} ," for cell in row)
        lines.append(cells + '\n')
    # Single join instead of growing the string cell by cell.
    return ''.join(lines)
def process_tables(res, page_texts: Optional[dict] = None):
    """Append the text of every table in ``res`` to its page's entry.

    Processing is best-effort: a table that fails validation is reported with
    ``print`` and skipped, and a failure at the document level is reported
    and ends processing; in both cases the text collected so far is returned.

    Args:
        res: Mapping with an optional ``'tables'`` list. Each table is read
            via ``table['prov'][0]['page_no']`` and ``table['data']['grid']``.
        page_texts: Existing page-number-string -> text mapping to extend in
            place. A new dict is created when omitted.

    Returns:
        dict: ``page_texts`` with each table's rendered text stored under its
        page number (as a string), appended after a newline when the page
        already has text.
    """
    # Create the dict per call: a `{}` default in the signature is shared by
    # every call and would carry text over from one document to the next.
    if page_texts is None:
        page_texts = {}
    try:
        tables = res.get('tables', [])
        if not isinstance(tables, list):
            raise ValueError("Expected 'tables' to be a list.")
        for table in tables:
            try:
                # Ensure 'prov' exists and has the necessary structure
                prov = table.get('prov', [])
                if not prov or not isinstance(prov, list):
                    raise ValueError("Missing or invalid 'prov' structure in table.")
                raw_page_no = prov[0].get('page_no')
                # Validate before converting: str(None) is the truthy string
                # 'None', which would slip past an emptiness check.
                if raw_page_no is None or str(raw_page_no) == '':
                    raise ValueError("Missing or invalid 'page_no' in 'prov'.")
                page_no = str(raw_page_no)
                # Ensure 'data' and 'grid' exist
                data = table.get('data', {})
                grids = data.get('grid', [])
                if not isinstance(grids, list):
                    raise ValueError("Missing or invalid 'grid' structure in 'data'.")
                # Process grid data into text
                text = get_table_text(grids)
                if not isinstance(text, str):
                    raise ValueError("get_table_text did not return a string.")
                # Add text to page_texts
                if page_no not in page_texts:
                    page_texts[page_no] = text
                else:
                    page_texts[page_no] += '\n' + text
                print(f"Processed page {page_no}")
            except Exception as table_error:
                # Deliberately broad: one malformed table must not stop the rest.
                print(f"Error processing table: {table_error}")
        return page_texts
    except Exception as e:
        print(f"Error processing tables: {e}")
        return page_texts
def process_docs(doc_path):
    """Extract the text of a PDF twice: with PaddleOCR and with Docling.

    Args:
        doc_path (str): Path to the PDF document.

    Returns:
        tuple: ``(ocr_pages, docling_pages)``.
            ``ocr_pages`` is a list with one string per rendered page image,
            holding that page's recognized text joined with spaces.
            ``docling_pages`` is the dict produced by ``process_text`` and
            then extended by ``process_tables`` from the Docling export.
    """
    ## Paddle OCR
    ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
    # 300 is the second positional argument of convert_from_path
    # (the render resolution in pdf2image -- confirm against the installed version).
    images_pdf = convert_from_path(doc_path, 300)
    ocr_pages = []
    for image in images_pdf:
        result = ocr.ocr(np.array(image), cls=True)
        # PaddleOCR can report a page with no detected text as None instead
        # of an empty list; fall back to "no lines" so one blank page does
        # not abort the whole document.
        page_lines = (result[0] if result else None) or []
        # Each line is indexed as line[1][0] to obtain its recognized text.
        ocr_pages.append(" ".join(line[1][0] for line in page_lines))
    ## Docling
    converter = DocumentConverter()
    conversion = converter.convert(doc_path)
    res = conversion.document.export_to_dict()
    docling_pages = process_text(res)
    docling_pages = process_tables(res, docling_pages)
    return ocr_pages, docling_pages