import base64
from io import BytesIO
import json
import os
from openai import OpenAI
from dotenv import load_dotenv
from typhoon_ocr import prepare_ocr_messages
import gradio as gr
from PIL import Image
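
# Load TYPHOON_* settings from a local .env file and create an OpenAI-compatible
# client pointed at the Typhoon API endpoint.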
load_dotenv()
openai = OpenAI(base_url=os.getenv("TYPHOON_BASE_URL"), api_key=os.getenv("TYPHOON_API_KEY"))
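
# Custom Gradio theme: a purple primary palette with rose and stone accents.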
theme = gr.themes.Soft(
    primary_hue=gr.themes.Color(
        c50="#f7f7fd",
        c100="#dfdef8",
        c200="#c4c1f2",
        c300="#a29eea",
        c400="#8f8ae6",
        c500="#756fe0",
        c600="#635cc1",
        c700="#4f4a9b",
        c800="#433f83",
        c900="#302d5e",
        c950="#302d5e",
    ),
    secondary_hue="rose",
    neutral_hue="stone",
)
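

# Run OCR on one page of the uploaded PDF or image and return a
# (preview image, Markdown text) pair for the Gradio outputs.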
def process_pdf(pdf_or_image_file, task_type, page_number):
    if pdf_or_image_file is None:
        return None, "No file uploaded"

    orig_filename = pdf_or_image_file.name
    try:
        # Use the new simplified function to prepare OCR messages with page number
        messages = prepare_ocr_messages(
            pdf_or_image_path=orig_filename,
            task_type=task_type,
            target_image_dim=1800,
            target_text_length=8000,
            page_num=page_number if page_number else 1
        )
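
        # prepare_ocr_messages returns a chat-style message list whose first message
        # carries the task prompt at content[0] and a base64-encoded page image at content[1].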
        # Extract the image from the message content for display
        image_url = messages[0]["content"][1]["image_url"]["url"]
        image_base64 = image_url.replace("data:image/png;base64,", "")
        image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))

        # Send messages to the OpenAI-compatible API
        response = openai.chat.completions.create(
            model=os.getenv("TYPHOON_OCR_MODEL"),
            messages=messages,
            max_tokens=16384,
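            # extra_body forwards additional request fields (like repetition_penalty)
            # that OpenAI-compatible servers such as vLLM accept beyond the standard API.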
            extra_body={
                "repetition_penalty": 1.2,
                "temperature": 0.1,
                "top_p": 0.6,
            },
        )
        text_output = response.choices[0].message.content

        # Try to parse the output as JSON containing a 'natural_text' field
        try:
            json_data = json.loads(text_output)
            # Strip <figure> tags that the model may emit around figure regions
            # (assumed cleanup; adjust to taste).
            markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
        except Exception:
            markdown_out = f"⚠️ Could not parse the model output as JSON.\n\nRaw output:\n{text_output}"

        return image_pil, markdown_out
    except Exception as e:
        return None, f"Error processing file: {str(e)}"


# Build the Gradio UI: file upload, task selector, and page number on the left;
# page preview and Markdown result on the right. Widget labels are illustrative.
with gr.Blocks(theme=theme) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="📄 Upload a PDF or image file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
            # This group carries the "task-background" class targeted by the CSS below.
            with gr.Group(elem_classes=["task-background"]):
                task_dropdown = gr.Dropdown(choices=["default", "structure"], label="🎯 Select Task", value="default")
                gr.Markdown("""
default: This mode works for most cases and is recommended for files without a clear template, such as infographics.

structure: This mode offers improved performance on documents with complex layouts, such as those containing images, tables, and forms.

We recommend trying both and seeing which one works better for your use case.
""", elem_classes=["task-dropdown-info"]) demo.css = """ .task-background { background: var(--block-background-fill) !important; } .task-background > * { background: var(--block-background-fill) !important; } .task-dropdown-info { padding: 0 16px; font-size: 12px; } """ page_number = gr.Number(label="📄 Page Number (for PDFs only)", value=1, minimum=1, step=1) run_button = gr.Button("🚀 Run") image_output = gr.Image(label="📸 Preview Image", type="pil") with gr.Column(scale=2): markdown_output = gr.Markdown(label='Markdown Result', show_label=True) # Connect the UI inputs to the processing function. run_button.click( fn=process_pdf, inputs=[pdf_input, task_dropdown, page_number], outputs=[image_output, markdown_output] ) # Launch the Gradio demo (temporary public share for 72 hours) demo.launch(share=False)