Yaswanth123 committed on
Commit
eac4c3d
·
verified ·
1 Parent(s): e003d31

Create resource_processor.py

Browse files
Files changed (1) hide show
  1. resource_processor.py +80 -0
resource_processor.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FILE: resource_processor.py
2
+
3
+ import io
4
+ import PyPDF2
5
+ import docx
6
+ import logging
7
+ from typing import Optional, Dict, List, Any
8
+ import os
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
def _extract_text_from_pdf_bytes(file_bytes: bytes, filename: str) -> Optional[str]:
    """Extract the text of a PDF supplied as raw bytes.

    Args:
        file_bytes: Complete content of the PDF file.
        filename: Name used only in log messages.

    Returns:
        The text of all pages that yielded any text, joined with newlines and
        stripped, or None when no text was found or the PDF could not be
        parsed (the failure is logged, not raised).
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
        # Run the extraction once per page and filter afterwards; calling
        # extract_text() both in the filter and in the join parses each page twice.
        page_texts = (page.extract_text() for page in reader.pages)
        full_text = "\n".join(chunk for chunk in page_texts if chunk).strip()
        logger.info(f"Successfully extracted text from PDF '{filename}'. Length: {len(full_text)} chars.")
        return full_text or None
    except Exception as e:
        # Best-effort by design: a broken PDF must not abort the caller's batch.
        logger.error(f"Error extracting PDF '{filename}': {e}", exc_info=True)
        return None
22
def _extract_text_from_docx_bytes(file_bytes: bytes, filename: str) -> Optional[str]:
    """Extract the paragraph text of a DOCX file supplied as raw bytes.

    Args:
        file_bytes: Complete content of the .docx file.
        filename: Name used only in log messages.

    Returns:
        All non-blank paragraphs joined with newlines and stripped, or None
        when the document has no text or could not be parsed (the failure is
        logged with a traceback, not raised).
    """
    try:
        document = docx.Document(io.BytesIO(file_bytes))

        # Keep only paragraphs that contain something besides whitespace.
        kept_paragraphs = []
        for para in document.paragraphs:
            if para.text.strip():
                kept_paragraphs.append(para.text)

        full_text = "\n".join(kept_paragraphs).strip()
        logger.info(f"Successfully extracted text from DOCX '{filename}'. Length: {len(full_text)} chars.")
        return full_text or None
    except Exception as e:
        logger.error(f"Error extracting text from DOCX '{filename}': {e}", exc_info=True)
        return None
33
+
34
def _extract_text_from_txt_bytes(file_bytes: bytes, filename: str) -> Optional[str]:
    """Decode the raw bytes of a plain-text file.

    UTF-8 is tried first (a leading byte-order mark is dropped); if the bytes
    are not valid UTF-8 the content is decoded as latin-1 instead.

    Args:
        file_bytes: Complete content of the text file.
        filename: Name used only in log messages.

    Returns:
        The decoded text with surrounding whitespace stripped, or None when
        the file is empty/blank or decoding failed unexpectedly (the failure
        is logged with a traceback, not raised).
    """
    try:
        try:
            # 'utf-8-sig' decodes ordinary UTF-8 too, but also removes a
            # leading BOM, which str.strip() would otherwise leave in the text.
            text = file_bytes.decode('utf-8-sig')
        except UnicodeDecodeError:
            logger.warning(f"UTF-8 decoding failed for '{filename}', trying latin-1.")
            # latin-1 assigns a character to every byte value, so this
            # fallback cannot raise UnicodeDecodeError for bytes input.
            text = file_bytes.decode('latin-1')
    except Exception as e:
        # Reached only for non-decoding problems (e.g. file_bytes is not bytes).
        logger.error(f"Error reading TXT file '{filename}': {e}", exc_info=True)
        return None

    full_text = text.strip()
    logger.info(f"Successfully read text from TXT '{filename}'. Length: {len(full_text)} chars.")
    return full_text or None
54
+
55
+
56
def process_uploaded_files(uploaded_files: Optional[List[Any]]) -> Optional[Dict[str, str]]:
    """Extract text from every supported file in an upload batch.

    Supported extensions (case-insensitive) are .pdf, .docx and .txt; files
    with any other extension are skipped without being read.

    Args:
        uploaded_files: Upload objects whose ``.name`` attribute is the path
            of the file on disk (in Gradio, the path to a temporary file), or
            plain path strings. None or an empty list is accepted.

    Returns:
        A dict mapping each file's basename to its extracted text, or None
        when nothing was uploaded or no file yielded any text. Files sharing
        a basename overwrite one another; the last one processed wins.
    """
    if not uploaded_files:
        return None

    supported_suffixes = ('.pdf', '.docx', '.txt')
    extracted_texts: Dict[str, str] = {}

    for uploaded_file in uploaded_files:
        # Accept both wrapper objects exposing the path as .name and bare path strings.
        filename = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
        lowered = filename.lower()

        # Decide before touching the disk so unsupported files are never loaded.
        if not lowered.endswith(supported_suffixes):
            continue

        try:
            with open(filename, 'rb') as f:
                content_bytes = f.read()
        except OSError as e:
            # One unreadable file must not abort the rest of the batch; the
            # extractors follow the same log-and-skip policy.
            logger.error(f"Error reading uploaded file '{filename}': {e}", exc_info=True)
            continue

        if lowered.endswith('.pdf'):
            text_content = _extract_text_from_pdf_bytes(content_bytes, filename)
        elif lowered.endswith('.docx'):
            text_content = _extract_text_from_docx_bytes(content_bytes, filename)
        else:
            text_content = _extract_text_from_txt_bytes(content_bytes, filename)

        if text_content:
            # Key by basename so callers see a file name, not a temp path.
            extracted_texts[os.path.basename(filename)] = text_content

    return extracted_texts or None