opentyphoon committed on
Commit
9766910
·
verified ·
1 Parent(s): d01316b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -55
app.py CHANGED
@@ -2,14 +2,15 @@ import base64
2
  from io import BytesIO
3
  import json
4
  import os
5
- from meta_prompt import get_prompt
6
  from openai import OpenAI
7
- from utils import render_pdf_to_base64png, image_to_pdf, get_anchor_text
 
8
  import gradio as gr
9
  from PIL import Image
10
 
 
11
 
12
- openai = OpenAI(base_url=os.environ.get("TYPHOON_BASE_URL"), api_key=os.environ.get("TYPHOON_API_KEY"))
13
 
14
  theme = gr.themes.Soft(
15
  primary_hue=gr.themes.Color(
@@ -29,61 +30,51 @@ theme = gr.themes.Soft(
29
  neutral_hue="stone",
30
  )
31
 
32
- def process_pdf(pdf_or_image_file, task_type):
33
  if pdf_or_image_file is None:
34
  return None, "No file uploaded"
35
 
36
  orig_filename = pdf_or_image_file.name
37
- ext = os.path.splitext(orig_filename)[1].lower()
38
- filename = orig_filename # default to original file if PDF
39
 
40
- # If the file is not a PDF, assume it's an image and convert it to PDF.
41
- if ext not in [".pdf"]:
42
- filename = image_to_pdf(orig_filename)
43
- if filename is None:
44
- return None, "Error converting image to PDF"
45
-
46
- # Render the first page to base64 PNG and then load it into a PIL image.
47
- image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1800)
48
- image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
49
-
50
- # Extract anchor text from the PDF (first page)
51
- anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=8000)
52
-
53
- # Retrieve and fill in the prompt template with the anchor_text
54
- prompt_template_fn = get_prompt(task_type)
55
- PROMPT = prompt_template_fn(anchor_text)
56
-
57
- # Create a messages structure including text and image URL
58
- messages = [{
59
- "role": "user",
60
- "content": [
61
- {"type": "text", "text": PROMPT},
62
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
63
- ],
64
- }]
65
- # send messages to openai compatible api
66
- response = openai.chat.completions.create(
67
- model=os.environ.get("TYPHOON_OCR_MODEL"),
68
- messages=messages,
69
- max_tokens=16384,
70
- extra_body={
71
- "repetition_penalty": 1.2,
72
- "temperature": 0.1,
73
- "top_p": 0.6,
74
- },
75
 
76
- )
77
- text_output = response.choices[0].message.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- # Try to parse the output assuming it is a Python dictionary containing 'natural_text'
80
- try:
81
- json_data = json.loads(text_output)
82
- markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
83
  except Exception as e:
84
- markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
85
-
86
- return image_pil, markdown_out
87
 
88
 
89
  # Build the Gradio UI.
@@ -112,10 +103,24 @@ with gr.Blocks(theme=theme) as demo:
112
  with gr.Row():
113
  with gr.Column(scale=1):
114
  # Update file_types to accept PDF as well as common image formats.
115
- pdf_input = gr.File(label="📄 Upload Image file or PDF file (only the first page will be processed)", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
116
- task_dropdown = gr.Dropdown(["default", "structure"], label="🎯 Select Task", value="default")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  run_button = gr.Button("🚀 Run")
118
- image_output = gr.Image(label="📸 Preview Image (Page 1)", type="pil")
119
  with gr.Column(scale=2):
120
  markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
121
 
@@ -123,9 +128,9 @@ with gr.Blocks(theme=theme) as demo:
123
  # Connect the UI inputs to the processing function.
124
  run_button.click(
125
  fn=process_pdf,
126
- inputs=[pdf_input, task_dropdown],
127
  outputs=[image_output, markdown_output]
128
  )
129
 
130
  # Launch the Gradio demo (temporary public share for 72 hours)
131
- demo.launch(share=False)
 
2
  from io import BytesIO
3
  import json
4
  import os
 
5
  from openai import OpenAI
6
+ from dotenv import load_dotenv
7
+ from typhoon_ocr import prepare_ocr_messages
8
  import gradio as gr
9
  from PIL import Image
10
 
11
+ load_dotenv()
12
 
13
+ openai = OpenAI(base_url=os.getenv("TYPHOON_BASE_URL"), api_key=os.getenv("TYPHOON_API_KEY"))
14
 
15
  theme = gr.themes.Soft(
16
  primary_hue=gr.themes.Color(
 
30
  neutral_hue="stone",
31
  )
32
 
33
+ def process_pdf(pdf_or_image_file, task_type, page_number):
34
  if pdf_or_image_file is None:
35
  return None, "No file uploaded"
36
 
37
  orig_filename = pdf_or_image_file.name
 
 
38
 
39
+ try:
40
+ # Use the new simplified function to prepare OCR messages with page number
41
+ messages = prepare_ocr_messages(
42
+ pdf_or_image_path=orig_filename,
43
+ task_type=task_type,
44
+ target_image_dim=1800,
45
+ target_text_length=8000,
46
+ page_num=page_number if page_number else 1
47
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
+ # Extract the image from the message content for display
50
+ image_url = messages[0]["content"][1]["image_url"]["url"]
51
+ image_base64 = image_url.replace("data:image/png;base64,", "")
52
+ image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
53
+
54
+ # Send messages to OpenAI compatible API
55
+ response = openai.chat.completions.create(
56
+ model=os.getenv("TYPHOON_OCR_MODEL"),
57
+ messages=messages,
58
+ max_tokens=16384,
59
+ extra_body={
60
+ "repetition_penalty": 1.2,
61
+ "temperature": 0.1,
62
+ "top_p": 0.6,
63
+ },
64
+ )
65
+ text_output = response.choices[0].message.content
66
+
67
+ # Try to parse the output assuming it is a Python dictionary containing 'natural_text'
68
+ try:
69
+ json_data = json.loads(text_output)
70
+ markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
71
+ except Exception as e:
72
+ markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
73
+
74
+ return image_pil, markdown_out
75
 
 
 
 
 
76
  except Exception as e:
77
+ return None, f"Error processing file: {str(e)}"
 
 
78
 
79
 
80
  # Build the Gradio UI.
 
103
  with gr.Row():
104
  with gr.Column(scale=1):
105
  # Update file_types to accept PDF as well as common image formats.
106
+ pdf_input = gr.File(label="📄 Upload Image file or PDF file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
107
+
108
+ with gr.Box():
109
+ task_dropdown = gr.Radio(["default", "structure"], label="🎯 Select Task", value="default")
110
+ gr.HTML("""
111
+ <p><b>default</b>: for infographic and general documents</p>
112
+ <p><b>structure</b>: for documents with complex layout and images</p>
113
+ <p>We recommend using trying both and see which one works better for your use case.</p>
114
+ """, elem_classes=["task-dropdown-info"])
115
+ demo.css = """
116
+ .task-dropdown-info {
117
+ padding: 0 16px;
118
+ font-size: 12px;
119
+ }
120
+ """
121
+ page_number = gr.Number(label="📄 Page Number (for PDFs only)", value=1, minimum=1, step=1)
122
  run_button = gr.Button("🚀 Run")
123
+ image_output = gr.Image(label="📸 Preview Image", type="pil")
124
  with gr.Column(scale=2):
125
  markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
126
 
 
128
  # Connect the UI inputs to the processing function.
129
  run_button.click(
130
  fn=process_pdf,
131
+ inputs=[pdf_input, task_dropdown, page_number],
132
  outputs=[image_output, markdown_output]
133
  )
134
 
135
  # Launch the Gradio demo (temporary public share for 72 hours)
136
+ demo.launch(share=False)