Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -2,14 +2,15 @@ import base64
|
|
2 |
from io import BytesIO
|
3 |
import json
|
4 |
import os
|
5 |
-
from meta_prompt import get_prompt
|
6 |
from openai import OpenAI
|
7 |
-
from
|
|
|
8 |
import gradio as gr
|
9 |
from PIL import Image
|
10 |
|
|
|
11 |
|
12 |
-
openai = OpenAI(base_url=os.
|
13 |
|
14 |
theme = gr.themes.Soft(
|
15 |
primary_hue=gr.themes.Color(
|
@@ -29,61 +30,51 @@ theme = gr.themes.Soft(
|
|
29 |
neutral_hue="stone",
|
30 |
)
|
31 |
|
32 |
-
def process_pdf(pdf_or_image_file, task_type):
|
33 |
if pdf_or_image_file is None:
|
34 |
return None, "No file uploaded"
|
35 |
|
36 |
orig_filename = pdf_or_image_file.name
|
37 |
-
ext = os.path.splitext(orig_filename)[1].lower()
|
38 |
-
filename = orig_filename # default to original file if PDF
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
# Extract anchor text from the PDF (first page)
|
51 |
-
anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=8000)
|
52 |
-
|
53 |
-
# Retrieve and fill in the prompt template with the anchor_text
|
54 |
-
prompt_template_fn = get_prompt(task_type)
|
55 |
-
PROMPT = prompt_template_fn(anchor_text)
|
56 |
-
|
57 |
-
# Create a messages structure including text and image URL
|
58 |
-
messages = [{
|
59 |
-
"role": "user",
|
60 |
-
"content": [
|
61 |
-
{"type": "text", "text": PROMPT},
|
62 |
-
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
|
63 |
-
],
|
64 |
-
}]
|
65 |
-
# send messages to openai compatible api
|
66 |
-
response = openai.chat.completions.create(
|
67 |
-
model=os.environ.get("TYPHOON_OCR_MODEL"),
|
68 |
-
messages=messages,
|
69 |
-
max_tokens=16384,
|
70 |
-
extra_body={
|
71 |
-
"repetition_penalty": 1.2,
|
72 |
-
"temperature": 0.1,
|
73 |
-
"top_p": 0.6,
|
74 |
-
},
|
75 |
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
-
# Try to parse the output as a JSON object containing a 'natural_text' key
|
80 |
-
try:
|
81 |
-
json_data = json.loads(text_output)
|
82 |
-
markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
|
83 |
except Exception as e:
|
84 |
-
|
85 |
-
|
86 |
-
return image_pil, markdown_out
|
87 |
|
88 |
|
89 |
# Build the Gradio UI.
|
@@ -112,10 +103,24 @@ with gr.Blocks(theme=theme) as demo:
|
|
112 |
with gr.Row():
|
113 |
with gr.Column(scale=1):
|
114 |
# Update file_types to accept PDF as well as common image formats.
|
115 |
-
pdf_input = gr.File(label="📄 Upload Image file or PDF file
|
116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
run_button = gr.Button("🚀 Run")
|
118 |
-
image_output = gr.Image(label="📸 Preview Image
|
119 |
with gr.Column(scale=2):
|
120 |
markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
|
121 |
|
@@ -123,9 +128,9 @@ with gr.Blocks(theme=theme) as demo:
|
|
123 |
# Connect the UI inputs to the processing function.
|
124 |
run_button.click(
|
125 |
fn=process_pdf,
|
126 |
-
inputs=[pdf_input, task_dropdown],
|
127 |
outputs=[image_output, markdown_output]
|
128 |
)
|
129 |
|
130 |
# Launch the Gradio demo (share=False: no temporary public share link is created)
|
131 |
-
demo.launch(share=False)
|
|
|
2 |
from io import BytesIO
|
3 |
import json
|
4 |
import os
|
|
|
5 |
from openai import OpenAI
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from typhoon_ocr import prepare_ocr_messages
|
8 |
import gradio as gr
|
9 |
from PIL import Image
|
10 |
|
11 |
+
load_dotenv()
|
12 |
|
13 |
+
openai = OpenAI(base_url=os.getenv("TYPHOON_BASE_URL"), api_key=os.getenv("TYPHOON_API_KEY"))
|
14 |
|
15 |
theme = gr.themes.Soft(
|
16 |
primary_hue=gr.themes.Color(
|
|
|
30 |
neutral_hue="stone",
|
31 |
)
|
32 |
|
33 |
+
def process_pdf(pdf_or_image_file, task_type, page_number):
|
34 |
if pdf_or_image_file is None:
|
35 |
return None, "No file uploaded"
|
36 |
|
37 |
orig_filename = pdf_or_image_file.name
|
|
|
|
|
38 |
|
39 |
+
try:
|
40 |
+
# Use the new simplified function to prepare OCR messages with page number
|
41 |
+
messages = prepare_ocr_messages(
|
42 |
+
pdf_or_image_path=orig_filename,
|
43 |
+
task_type=task_type,
|
44 |
+
target_image_dim=1800,
|
45 |
+
target_text_length=8000,
|
46 |
+
page_num=page_number if page_number else 1
|
47 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
+
# Extract the image from the message content for display
|
50 |
+
image_url = messages[0]["content"][1]["image_url"]["url"]
|
51 |
+
image_base64 = image_url.replace("data:image/png;base64,", "")
|
52 |
+
image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
|
53 |
+
|
54 |
+
# Send messages to OpenAI compatible API
|
55 |
+
response = openai.chat.completions.create(
|
56 |
+
model=os.getenv("TYPHOON_OCR_MODEL"),
|
57 |
+
messages=messages,
|
58 |
+
max_tokens=16384,
|
59 |
+
extra_body={
|
60 |
+
"repetition_penalty": 1.2,
|
61 |
+
"temperature": 0.1,
|
62 |
+
"top_p": 0.6,
|
63 |
+
},
|
64 |
+
)
|
65 |
+
text_output = response.choices[0].message.content
|
66 |
+
|
67 |
+
# Try to parse the output as a JSON object containing a 'natural_text' key
|
68 |
+
try:
|
69 |
+
json_data = json.loads(text_output)
|
70 |
+
markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
|
71 |
+
except Exception as e:
|
72 |
+
markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
|
73 |
+
|
74 |
+
return image_pil, markdown_out
|
75 |
|
|
|
|
|
|
|
|
|
76 |
except Exception as e:
|
77 |
+
return None, f"Error processing file: {str(e)}"
|
|
|
|
|
78 |
|
79 |
|
80 |
# Build the Gradio UI.
|
|
|
103 |
with gr.Row():
|
104 |
with gr.Column(scale=1):
|
105 |
# Update file_types to accept PDF as well as common image formats.
|
106 |
+
pdf_input = gr.File(label="📄 Upload Image file or PDF file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
|
107 |
+
|
108 |
+
with gr.Box():
|
109 |
+
task_dropdown = gr.Radio(["default", "structure"], label="🎯 Select Task", value="default")
|
110 |
+
gr.HTML("""
|
111 |
+
<p><b>default</b>: for infographic and general documents</p>
|
112 |
+
<p><b>structure</b>: for documents with complex layout and images</p>
|
113 |
+
<p>We recommend trying both to see which one works better for your use case.</p>
|
114 |
+
""", elem_classes=["task-dropdown-info"])
|
115 |
+
demo.css = """
|
116 |
+
.task-dropdown-info {
|
117 |
+
padding: 0 16px;
|
118 |
+
font-size: 12px;
|
119 |
+
}
|
120 |
+
"""
|
121 |
+
page_number = gr.Number(label="📄 Page Number (for PDFs only)", value=1, minimum=1, step=1)
|
122 |
run_button = gr.Button("🚀 Run")
|
123 |
+
image_output = gr.Image(label="📸 Preview Image", type="pil")
|
124 |
with gr.Column(scale=2):
|
125 |
markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
|
126 |
|
|
|
128 |
# Connect the UI inputs to the processing function.
|
129 |
run_button.click(
|
130 |
fn=process_pdf,
|
131 |
+
inputs=[pdf_input, task_dropdown, page_number],
|
132 |
outputs=[image_output, markdown_output]
|
133 |
)
|
134 |
|
135 |
# Launch the Gradio demo (share=False: no temporary public share link is created)
|
136 |
+
demo.launch(share=False)
|