Spaces:
Paused
Paused
import gradio as gr | |
import markdown | |
from markdown.extensions.tables import TableExtension | |
from markdown.extensions.fenced_code import FencedCodeExtension | |
from markdown.extensions.toc import TocExtension | |
from markdown.extensions.attr_list import AttrListExtension | |
from markdown.extensions.codehilite import CodeHiliteExtension | |
import requests | |
from bs4 import BeautifulSoup | |
# For ReaderLM-2 functionality | |
from transformers import pipeline | |
# For ReaderLM-1 functionality | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
import spaces | |
import re | |
from markdownify import markdownify | |
###################################### | |
# 1) MARKDOWN-STUDIO FUNCTIONALITY # | |
###################################### | |
def render_markdown(md_text):
    """
    Convert a Markdown string into an HTML string.
    - Enables tables, fenced code blocks, a table of contents (base heading level 2),
      attribute lists, and code highlighting (no line numbers, CSS class "highlight").
    """
    print("Rendering markdown input to HTML...")  # Debug log

    # Fresh extension instances are created on every call and passed in this order.
    enabled_extensions = [
        TableExtension(),
        FencedCodeExtension(),
        TocExtension(baselevel=2),
        AttrListExtension(),
        CodeHiliteExtension(linenums=False, css_class="highlight"),
    ]
    rendered_html = markdown.markdown(md_text, extensions=enabled_extensions)
    return rendered_html
###################################### | |
# 2) VIEW SOURCE INFO FUNCTIONALITY # | |
###################################### | |
def view_source_info(url, timeout=30):
    """
    Fetch the HTML source of the given URL.
    - Supports a leading `view-source:` prefix or plain URLs.
    - Prepends `https://` when the URL has no `http://` or `https://` scheme.
    - Returns an empty string, without making a request, when no URL is given.
    - `timeout` is the number of seconds to wait for the server before giving up.

    Network failures (including a timeout) propagate as `requests` exceptions.
    """
    prefix = "view-source:"
    # Trim first so pasted input with stray whitespace is still recognised.
    url = (url or "").strip()
    # Drop only the leading prefix; any later occurrence is part of the URL itself.
    if url.startswith(prefix):
        url = url[len(prefix):].strip()
    if not url:
        return ""
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    print(f"Fetching source for URL: {url}...")  # Debug log
    # requests applies no timeout by default, so an unresponsive host would
    # otherwise block this handler indefinitely.
    response = requests.get(url, timeout=timeout)
    return response.text
###################################### | |
# 3) READERLM-2 FUNCTIONALITY # | |
###################################### | |
# The JinaAI ReaderLM-v2 text-generation pipeline is built once at import time;
# convert_html() reuses this single instance for every request.
model_name = "jinaai/ReaderLM-v2"
print(f"Loading model: {model_name}...")  # Debug log
html_converter = pipeline(task="text-generation", model=model_name)
def convert_html(html_input, output_format, custom_prompt=None):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
    - `html_input`: raw HTML to convert.
    - `output_format`: name of the target format; only used by the default prompt.
    - `custom_prompt`: optional instruction that replaces the default prompt.
      A missing or whitespace-only value falls back to the default prompt.
    - Returns the generated text with the echoed prompt removed and surrounding
      whitespace stripped, or an empty string when `html_input` is blank.
    """
    # Nothing to convert: skip the (expensive) generation entirely.
    if not html_input or not html_input.strip():
        return ""

    custom_prompt = (custom_prompt or "").strip()
    if custom_prompt:
        prompt = f"{custom_prompt}\n\n{html_input}"
    else:
        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
    print(f"Converting HTML to {output_format} with prompt: {custom_prompt or 'default'}...")  # Debug log
    print(f"HTML input: {html_input[:100]}...")  # Debug log, preview first 100 characters of input

    # Use the pipeline to generate the conversion.
    # NOTE(review): max_length caps prompt tokens plus generated tokens, so very
    # long HTML leaves little or no room for output -- consider max_new_tokens.
    response = html_converter(prompt, max_length=9999, num_return_sequences=1)
    converted_output = response[0]['generated_text']

    # Remove only the echoed prompt at the start of the output. A global replace
    # would also delete any later repetition of that text inside the real answer.
    if converted_output.startswith(prompt):
        converted_output = converted_output[len(prompt):]
    converted_output = converted_output.strip()
    print("Conversion completed.")  # Debug log
    return converted_output
###################################### | |
# 4) READERLM-1 FUNCTIONALITY # | |
###################################### | |
# Prepare models and tokenizers for ReaderLM-1
print("Loading ReaderLM-1 models and tokenizers...")  # Debug log

# Checkpoint ids in load order: the smaller 0.5b model first, then the larger 1.5b model.
_READER_LM_IDS = ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")

# Every model is switched to eval mode and moved onto the GPU right after loading.
models = {
    checkpoint: AutoModelForCausalLM.from_pretrained(
        checkpoint, trust_remote_code=True
    ).eval().to("cuda")
    for checkpoint in _READER_LM_IDS
}

# One tokenizer per checkpoint, keyed by the same id as its model.
tokenizers = {
    checkpoint: AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    for checkpoint in _READER_LM_IDS
}
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Generate Markdown from HTML using ReaderLM (0.5b or 1.5b) models.
    - `html_content`: raw HTML to convert.
    - `model_id`: key into the module-level `models` / `tokenizers` dicts.
    - Returns a `(model_output, markdownify_output)` tuple: the model-generated
      Markdown and the rule-based markdownify conversion of the same HTML.
    - Blank input returns `("", "")` without running the model.
    """
    print(f"Running example with model: {model_id}...")  # Debug log
    # Nothing to convert: avoid a pointless generation on an empty prompt.
    if not html_content or not html_content.strip():
        return "", ""

    model = models[model_id]  # Select the model based on the input ID
    tokenizer = tokenizers[model_id]  # Retrieve the corresponding tokenizer

    # Construct the chat-based input for the model
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)  # Format input text for the model
    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log, preview input text

    # Tokenize the input text
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate output using the model. Decoding is greedy (do_sample=False), so no
    # temperature is passed: it only applies to sampling-based generation.
    outputs = model.generate(
        inputs, max_new_tokens=1024, do_sample=False, repetition_penalty=1.08
    )

    # Extract the assistant's response from the generated output
    assistant_response = _extract_assistant_reply(tokenizer.decode(outputs[0]))
    if assistant_response is None:
        # No assistant header was produced at all: fall back to the newly
        # generated tokens (everything after the prompt) instead of failing.
        assistant_response = tokenizer.decode(
            outputs[0][inputs.shape[-1]:], skip_special_tokens=True
        )
    print("Model generation completed.")  # Debug log

    # Use markdownify as a rule-based fallback for comparison
    markdownify_output = markdownify(html_content)
    print("Rule-based markdownify output generated.")  # Debug log

    # Return both model-based and rule-based outputs
    return assistant_response, markdownify_output


def _extract_assistant_reply(decoded_text):
    """
    Return the text of the first assistant turn in decoded chat output, or None
    when no assistant turn is present.
    - The turn ends at `<|im_end|>` or, if generation stopped at the token limit
      before emitting that marker, at the end of the text.
    """
    match = re.search(
        r"<\|im_start\|>assistant(.*?)(?:<\|im_end\|>|\Z)", decoded_text, re.DOTALL
    )
    return match.group(1) if match else None
# Example HTML for ReaderLM-1: a small to-do list page (header with an input and
# an "Add" button, followed by a list of items). It is offered as the clickable
# example input in the ReaderLM-1 tab.
example_html = """<div id="myDIV" class="header">
<h2>My To Do List</h2>
<input type="text" id="myInput" placeholder="Title...">
<span onclick="newElement()" class="addBtn">Add</span>
</div>
<ul id="myUL">
<li>Hit the gym</li>
<li class="checked">Pay bills</li>
<li>Meet George</li>
<li>Buy eggs</li>
<li>Read a book</li>
<li>Organize office</li>
</ul>"""
######################################################## | |
# Combine everything into a single Gradio Blocks app # | |
######################################################## | |
# Optional extra CSS for styling the ReaderLM-1 tab.
# Notes inside the stylesheet must use /* ... */: CSS has no "#" comment syntax,
# so a "# ..." note is parsed as part of the next declaration and invalidates it.
# NOTE(review): no component visible in this file sets elem_id="output", so this
# rule may not currently match anything -- confirm the intended target.
css = """
#output {
    height: 500px;  /* Set the height of the output box */
    overflow: auto;  /* Enable scrolling for large content */
    border: 1px solid #ccc;  /* Add a border around the box */
}
"""
# Initialize the Gradio app with the Nymbo/Nymbo_Theme for styling.
# The app has four tabs: live Markdown preview, HTML source viewer,
# ReaderLM-v2 conversion, and ReaderLM-1 conversion.
print("Initializing Gradio app...")  # Debug log
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")  # Add a title for the tab
        with gr.Row():
            with gr.Column():
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",  # Input for Markdown text
                )
            with gr.Column():
                md_output = gr.HTML(
                    label="Rendered Output"  # Display the rendered HTML output
                )
        # Update the output whenever the input changes
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)

    ########################################################
    # TAB 2: View Source Info
    ########################################################
    with gr.Tab("View Source Info"):
        gr.Markdown("## View HTML Source Code")  # Tab description
        source_input = gr.Textbox(
            label="Enter URL (with or without 'view-source:')",
            placeholder="e.g., https://example.com or view-source:example.com"
        )
        source_output = gr.Textbox(
            label="HTML Source Code",
            lines=20
        )
        source_button = gr.Button("Fetch Source")  # Button to fetch source
        source_button.click(
            fn=view_source_info,
            inputs=source_input,
            outputs=source_output
        )

    ########################################################
    # TAB 3: ReaderLM-2 Converter (HTML → Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")  # Tab description
        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input"  # Input for raw HTML
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],  # Choose the output format
                label="Output Format",
                value="Markdown"  # Default to Markdown output
            )
            custom_prompt_2 = gr.Textbox(
                lines=2,
                placeholder="Optional: Enter a custom prompt...",
                label="Custom System Prompt"
            )
        convert_btn_2 = gr.Button("Convert")  # Button to trigger conversion
        converted_output_2 = gr.Textbox(
            lines=20,
            label="Converted Output"  # Display the converted output
        )
        # Provide usage details for the converter
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )
        # Connect the button click event to the conversion function
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2
        )
        # Add example inputs for demonstration.
        # The custom prompt is left empty on purpose: any non-empty text there
        # replaces the default conversion instruction in convert_html, so a
        # placeholder sentence would make the example ignore the chosen format.
        gr.Examples(
            examples=[
                ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown", ""],
                ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON", ""]
            ],
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2,
            fn=convert_html,
            cache_examples=False  # Disable caching for dynamic examples
        )

    ########################################################
    # TAB 4: ReaderLM-1 HTML-to-Markdown
    ########################################################
    with gr.Tab("ReaderLM-1 Converter"):
        gr.Markdown("""
# HTML-to-Markdown with ReaderLM-1
Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b**
to convert HTML to Markdown. Compare against rule-based `markdownify`.
""")
        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),  # Allow selection between the two models
                    label="Model",
                    value="jinaai/reader-lm-1.5b"  # Default to the larger model
                )
                html_content = gr.Textbox(
                    label="HTML"  # Input for raw HTML
                )
                submit_btn = gr.Button(value="Submit")  # Button to trigger the model
            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output")  # Model-generated Markdown
                markdownify_output = gr.Textbox(label="Markdownify Output")  # Rule-based Markdown
        # Add example HTML input for demonstration.
        # Only the HTML is supplied, so run_example uses its default model_id here.
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            cache_examples=True,  # Cache example outputs
            label="Try example HTML"
        )
        # Connect the submit button to the run_example function
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output]
        )

# Finally, launch the combined demo app
print("Launching the demo...")  # Debug log
demo.launch()