Spaces:

opentyphoon
/

typhoon-ocr

Running

App Files Files Community

opentyphoon committed 4 days ago

Commit

9766910

verified ·

1 Parent(s): d01316b

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -55

app.py CHANGED Viewed

@@ -2,14 +2,15 @@ import base64
 from io import BytesIO
 import json
 import os
-from meta_prompt import get_prompt
 from openai import OpenAI
-from utils import render_pdf_to_base64png, image_to_pdf, get_anchor_text
 import gradio as gr
 from PIL import Image
-openai = OpenAI(base_url=os.environ.get("TYPHOON_BASE_URL"), api_key=os.environ.get("TYPHOON_API_KEY"))
 theme = gr.themes.Soft(
     primary_hue=gr.themes.Color(
@@ -29,61 +30,51 @@ theme = gr.themes.Soft(
     neutral_hue="stone",
 )
-def process_pdf(pdf_or_image_file, task_type):
     if pdf_or_image_file is None:
         return None, "No file uploaded"
     orig_filename = pdf_or_image_file.name
-    ext = os.path.splitext(orig_filename)[1].lower()
-    filename = orig_filename  # default to original file if PDF
-    # If the file is not a PDF, assume it's an image and convert it to PDF.
-    if ext not in [".pdf"]:
-        filename = image_to_pdf(orig_filename)
-        if filename is None:
-            return None, "Error converting image to PDF"
-    # Render the first page to base64 PNG and then load it into a PIL image.
-    image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1800)
-    image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
-    # Extract anchor text from the PDF (first page)
-    anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=8000)
-    # Retrieve and fill in the prompt template with the anchor_text
-    prompt_template_fn = get_prompt(task_type)
-    PROMPT = prompt_template_fn(anchor_text)
-    # Create a messages structure including text and image URL
-    messages = [{
-        "role": "user",
-        "content": [
-            {"type": "text", "text": PROMPT},
-            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
-        ],
-    }]
-    # send messages to openai compatible api
-    response = openai.chat.completions.create(
-        model=os.environ.get("TYPHOON_OCR_MODEL"),
-        messages=messages,
-        max_tokens=16384,
-        extra_body={
-            "repetition_penalty": 1.2,
-            "temperature": 0.1,
-            "top_p": 0.6,
-        },
-    )
-    text_output = response.choices[0].message.content
-    # Try to parse the output as JSON, expecting an object with a 'natural_text' key
-    try:
-        json_data = json.loads(text_output)
-        markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
     except Exception as e:
-        markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
-    return image_pil, markdown_out
 # Build the Gradio UI.
@@ -112,10 +103,24 @@ with gr.Blocks(theme=theme) as demo:
     with gr.Row():
         with gr.Column(scale=1):
             # Update file_types to accept PDF as well as common image formats.
-            pdf_input = gr.File(label="📄 Upload Image file or PDF file (only the first page will be processed)", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
-            task_dropdown = gr.Dropdown(["default", "structure"], label="🎯 Select Task", value="default")
             run_button = gr.Button("🚀 Run")
-            image_output = gr.Image(label="📸 Preview Image (Page 1)", type="pil")
         with gr.Column(scale=2):
             markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
@@ -123,9 +128,9 @@ with gr.Blocks(theme=theme) as demo:
     # Connect the UI inputs to the processing function.
     run_button.click(
         fn=process_pdf,
-        inputs=[pdf_input, task_dropdown],
         outputs=[image_output, markdown_output]
     )
 # Launch the Gradio demo (share=False, so no temporary public share link is created)
-demo.launch(share=False)

 from io import BytesIO
 import json
 import os
 from openai import OpenAI
+from dotenv import load_dotenv
+from typhoon_ocr import prepare_ocr_messages
 import gradio as gr
 from PIL import Image
+load_dotenv()
+openai = OpenAI(base_url=os.getenv("TYPHOON_BASE_URL"), api_key=os.getenv("TYPHOON_API_KEY"))
 theme = gr.themes.Soft(
     primary_hue=gr.themes.Color(
     neutral_hue="stone",
 )
+def process_pdf(pdf_or_image_file, task_type, page_number):
     if pdf_or_image_file is None:
         return None, "No file uploaded"
     orig_filename = pdf_or_image_file.name
+    try:
+        # Use the new simplified function to prepare OCR messages with page number
+        messages = prepare_ocr_messages(
+            pdf_or_image_path=orig_filename,
+            task_type=task_type,
+            target_image_dim=1800,
+            target_text_length=8000,
+            page_num=page_number if page_number else 1
+        )
+        # Extract the image from the message content for display
+        image_url = messages[0]["content"][1]["image_url"]["url"]
+        image_base64 = image_url.replace("data:image/png;base64,", "")
+        image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
+        # Send messages to OpenAI compatible API
+        response = openai.chat.completions.create(
+            model=os.getenv("TYPHOON_OCR_MODEL"),
+            messages=messages,
+            max_tokens=16384,
+            extra_body={
+                "repetition_penalty": 1.2,
+                "temperature": 0.1,
+                "top_p": 0.6,
+            },
+        )
+        text_output = response.choices[0].message.content
+        # Try to parse the output as JSON, expecting an object with a 'natural_text' key
+        try:
+            json_data = json.loads(text_output)
+            markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
+        except Exception as e:
+            markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
+        return image_pil, markdown_out
     except Exception as e:
+        return None, f"Error processing file: {str(e)}"
 # Build the Gradio UI.
     with gr.Row():
         with gr.Column(scale=1):
             # Update file_types to accept PDF as well as common image formats.
+            pdf_input = gr.File(label="📄 Upload Image file or PDF file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
+            with gr.Box():
+                task_dropdown = gr.Radio(["default", "structure"], label="🎯 Select Task", value="default")
+                gr.HTML("""
+                <p><b>default</b>: for infographic and general documents</p>
+                <p><b>structure</b>: for documents with complex layout and images</p>
+                <p>We recommend using trying both and see which one works better for your use case.</p>
+                """, elem_classes=["task-dropdown-info"])
+                demo.css = """
+                .task-dropdown-info {
+                    padding: 0 16px;
+                    font-size: 12px;
+                }
+                """
+            page_number = gr.Number(label="📄 Page Number (for PDFs only)", value=1, minimum=1, step=1)
             run_button = gr.Button("🚀 Run")
+            image_output = gr.Image(label="📸 Preview Image", type="pil")
         with gr.Column(scale=2):
             markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
     # Connect the UI inputs to the processing function.
     run_button.click(
         fn=process_pdf,
+        inputs=[pdf_input, task_dropdown, page_number],
         outputs=[image_output, markdown_output]
     )
 # Launch the Gradio demo (share=False, so no temporary public share link is created)
+demo.launch(share=False)