Commit 7d7f295 · Parent: f825473
Added handwritten

Files changed:
- app.py (+50, -49)
- requirements.txt (+2, -0)
- temp_uploaded_image_paddle.jpg (+3, -0)
app.py
CHANGED
@@ -25,7 +25,7 @@ import matplotlib
 import boto3
 from decimal import Decimal
 import uuid
-
+from paddleocr import PaddleOCR
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -193,6 +193,27 @@ def merge_extractions(regex_fields, llm_fields):
     merged["products"] = llm_fields.get("products") or regex_fields.get("products")
     return merged
 
+def extract_handwritten_text(image):
+    processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
+    model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
+    pixel_values = processor(images=image, return_tensors="pt").pixel_values
+    generated_ids = model.generate(pixel_values)
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    return generated_text
+
+@st.cache_resource
+def get_paddle_ocr():
+    return PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
+
+def extract_handwritten_text_paddle(image):
+    ocr = get_paddle_ocr()
+    # Save PIL image to a temporary file
+    temp_path = 'temp_uploaded_image_paddle.jpg'
+    image.save(temp_path)
+    result = ocr.ocr(temp_path, cls=True)
+    lines = [line[1][0] for line in result[0]]
+    return '\n'.join(lines)
+
 def main():
     st.set_page_config(
         page_title="FormIQ - Intelligent Document Parser",
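The list comprehension in extract_handwritten_text_paddle assumes the PaddleOCR 2.x result layout: for a single image, ocr.ocr(...) returns a one-element list whose first entry is a list of [bounding_box, (text, confidence)] pairs. A minimal sketch of that parsing step, with an optional confidence filter added for illustration (the 0.5 threshold and the guard for empty results are assumptions, not part of this commit):

from paddleocr import PaddleOCR

# Assumes the PaddleOCR 2.x API used in app.py: one result entry per input image,
# each entry a list of [box, (text, score)] pairs (or None when nothing is detected).
ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
result = ocr.ocr('temp_uploaded_image_paddle.jpg', cls=True)

lines = []
for box, (text, score) in (result[0] or []):
    if score >= 0.5:  # illustrative threshold, not part of the commit
        lines.append(text)

handwritten_text = '\n'.join(lines)
print(handwritten_text)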
@@ -246,49 +267,43 @@ def main():
     )
 
     if uploaded_file is not None:
-
-        if uploaded_file.type == "application/pdf":
-            images = convert_from_bytes(uploaded_file.read())
-            image = images[0]  # Use the first page
-        else:
-            image = Image.open(uploaded_file)
+        image = Image.open(uploaded_file).convert("RGB")
         st.image(image, caption="Uploaded Document", width=600)
 
+        handwritten_text = None
+        # Option to extract handwritten text with PaddleOCR
+        if st.checkbox("Extract handwritten text (PaddleOCR)?"):
+            with st.spinner("Extracting handwritten text with PaddleOCR..."):
+                handwritten_text = extract_handwritten_text_paddle(image)
+            st.subheader("Handwritten Text Extracted (PaddleOCR)")
+            st.write(handwritten_text)
+
         # Process button
         if st.button("Process Document"):
            with st.spinner("Processing document..."):
                try:
-                    # Save the uploaded file to a temporary location
                    temp_path = "temp_uploaded_image.jpg"
                    image.save(temp_path)
 
- … (old lines 265-283, also removed, are not shown in this view)
-                            st.error(f"Failed to parse LLM output as JSON: {e}")
-                    else:
-                        st.warning("No valid JSON found in LLM output.")
-
-                    # Display extracted products if present
-                    if "products" in llm_data and llm_data["products"]:
-                        st.subheader("Products (LLM Extracted)")
-                        st.dataframe(pd.DataFrame(llm_data["products"]))
+                    # Use handwritten text if available, else fallback to pytesseract
+                    if handwritten_text:
+                        llm_input_text = handwritten_text
+                    else:
+                        llm_input_text = pytesseract.image_to_string(Image.open(temp_path))
+
+                    llm_result = extract_with_perplexity_llm(llm_input_text)
+                    llm_json = extract_json_from_llm_output(llm_result)
+                    st.subheader("Structured Data (Perplexity LLM)")
+                    if llm_json:
+                        try:
+                            llm_data = json.loads(llm_json)
+                            st.json(llm_data)
+                            save_to_dynamodb(llm_data)
+                            st.success("Saved to DynamoDB!")
+                        except Exception as e:
+                            st.error(f"Failed to parse LLM output as JSON: {e}")
+                    else:
+                        st.warning("No valid JSON found in LLM output.")
 
                 except Exception as e:
                     logger.error(f"Error processing document: {str(e)}")
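Both OCR paths in the new flow round-trip through a file on disk (temp_uploaded_image.jpg for pytesseract, temp_uploaded_image_paddle.jpg for PaddleOCR). Not part of this commit, but as a possible refinement: pytesseract.image_to_string accepts a PIL image directly, and PaddleOCR accepts a NumPy array, so the same selection logic could run in memory. A sketch under those assumptions (ocr_text_in_memory is a hypothetical helper, not a function in app.py):

import numpy as np
import pytesseract
from PIL import Image
from paddleocr import PaddleOCR

_ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)  # same settings as get_paddle_ocr()

def ocr_text_in_memory(image: Image.Image, use_paddle: bool) -> str:
    """Hypothetical helper: run either OCR engine without writing a temp file."""
    if use_paddle:
        # PaddleOCR works on NumPy arrays and expects BGR channel order,
        # so reverse the channels of the RGB PIL image.
        bgr = np.array(image.convert("RGB"))[:, :, ::-1].copy()
        result = _ocr.ocr(bgr, cls=True)
        return '\n'.join(line[1][0] for line in (result[0] or []))
    # pytesseract accepts a PIL image directly.
    return pytesseract.image_to_string(image)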
@@ -351,19 +366,5 @@ def main():
     else:
         st.info("Confusion matrix not found.")
 
-# Load model and processor
-processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
-model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
-
-# Load your image (crop to handwritten region if possible)
-image = Image.open('handwritten_sample.jpg').convert("RGB")
-
-# Preprocess and predict
-pixel_values = processor(images=image, return_tensors="pt").pixel_values
-generated_ids = model.generate(pixel_values)
-generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-print("Handwritten text:", generated_text)
-
 if __name__ == "__main__":
     main()
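The script-level TrOCR snippet removed here survives as the extract_handwritten_text() helper added earlier in this commit. As the removed comment noted, the handwritten TrOCR checkpoint is trained on single text lines, so it works best on a crop of the handwritten region. A standalone sketch of that usage; the file name and crop box below are placeholders:

from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Load the handwritten TrOCR checkpoint (downloaded from the Hub on first use).
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')

# Placeholder path and crop box: crop to the handwritten line before running the model.
image = Image.open('handwritten_sample.jpg').convert("RGB")
line_crop = image.crop((0, 0, image.width, image.height // 4))

pixel_values = processor(images=line_crop, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Handwritten text:", generated_text)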
requirements.txt
CHANGED
@@ -34,3 +34,5 @@ plotly==5.18.0
 matplotlib
 scikit-learn
 pdf2image
+paddleocr
+paddlepaddle
temp_uploaded_image_paddle.jpg
ADDED
(binary image file, stored with Git LFS)