Markdown-Studio / app.py
Nymbo's picture
adding source fetcher
da60cd1 verified
import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension
import requests
from bs4 import BeautifulSoup
# For ReaderLM-2 functionality
from transformers import pipeline
# For ReaderLM-1 functionality
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify
######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY #
######################################
def render_markdown(md_text):
    """
    Convert a string of Markdown text into an HTML fragment.
    - Enables tables, fenced code blocks, a TOC (heading base level 2),
      attribute lists, and code highlighting under the "highlight" CSS class.
    """
    print("Rendering markdown input to HTML...")  # Debug log
    # Extension instances are created fresh on every call; order is explicit.
    enabled_extensions = [
        TableExtension(),  # Markdown tables
        FencedCodeExtension(),  # ``` fenced code blocks
        TocExtension(baselevel=2),  # Table of Contents, headings start at level 2
        AttrListExtension(),  # Attribute lists on elements
        CodeHiliteExtension(linenums=False, css_class="highlight"),  # Code highlighting, no line numbers
    ]
    html_fragment = markdown.markdown(md_text, extensions=enabled_extensions)
    return html_fragment
######################################
# 2) VIEW SOURCE INFO FUNCTIONALITY #
######################################
def view_source_info(url, timeout=15):
    """
    Fetch the HTML source of the given URL.
    - Supports a leading `view-source:` prefix or plain URLs.
    - Prepends `https://` when the address has no http(s) scheme.

    Args:
        url: Address to fetch; surrounding whitespace is ignored.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text (returned whatever the HTTP status is).

    Raises:
        ValueError: If no address remains after cleaning the input.
        requests.RequestException: If the request fails or times out.
    """
    prefix = "view-source:"
    url = (url or "").strip()
    # Drop only a leading marker; the same text later in the URL is data.
    if url.startswith(prefix):
        url = url[len(prefix):].strip()
    if not url:
        raise ValueError("Please enter a URL to fetch.")
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    print(f"Fetching source for URL: {url}...")  # Debug log
    # Without a timeout an unresponsive host would block this handler forever.
    response = requests.get(url, timeout=timeout)
    return response.text
######################################
# 3) READERLM-2 FUNCTIONALITY #
######################################
# Load the JinaAI ReaderLM-v2 model as a text-generation pipeline.
# This runs once at import time; convert_html calls `html_converter`
# for every conversion request.
model_name = "jinaai/ReaderLM-v2"
print(f"Loading model: {model_name}...") # Debug log
html_converter = pipeline("text-generation", model=model_name)
def convert_html(html_input, output_format, custom_prompt=None):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
    - Takes raw HTML as input and converts it to the specified output format.
    - A non-blank custom prompt replaces the default instruction entirely
      (in that case `output_format` is only used for logging).

    Args:
        html_input: Raw HTML to convert.
        output_format: Target format named in the default instruction,
            e.g. "Markdown" or "JSON".
        custom_prompt: Optional instruction placed before the HTML.

    Returns:
        The generated text with surrounding whitespace removed, or an empty
        string when there is no HTML to convert.
    """
    if not html_input or not html_input.strip():
        # Nothing to convert: skip the model call instead of letting it
        # generate text from an instruction with no content.
        return ""

    if custom_prompt and custom_prompt.strip():
        prompt = f"{custom_prompt}\n\n{html_input}"
    else:
        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

    print(f"Converting HTML to {output_format} with prompt: {custom_prompt or 'default'}...")  # Debug log
    print(f"HTML input: {html_input[:100]}...")  # Debug log, preview first 100 characters of input

    # return_full_text=False makes the pipeline return only the newly
    # generated text, so the prompt never has to be stripped by string
    # matching (which removed every occurrence and could miss entirely).
    # NOTE(review): max_length counts prompt tokens as well, so very long
    # HTML leaves little or no room for output - consider max_new_tokens.
    response = html_converter(
        prompt,
        max_length=9999,
        num_return_sequences=1,
        return_full_text=False,
    )
    converted_output = response[0]['generated_text'].strip()
    print("Conversion completed.")  # Debug log
    return converted_output
######################################
# 4) READERLM-1 FUNCTIONALITY #
######################################
# Prepare models and tokenizers for ReaderLM-1
print("Loading ReaderLM-1 models and tokenizers...")  # Debug log
# Checkpoints offered in the UI, smaller model first; both dicts share these keys.
_READER_LM_CHECKPOINTS = ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")
# Each model is switched to eval mode and moved onto the GPU right after loading.
models = {
    checkpoint: AutoModelForCausalLM.from_pretrained(
        checkpoint, trust_remote_code=True
    ).eval().to("cuda")
    for checkpoint in _READER_LM_CHECKPOINTS
}
# Matching tokenizer for every checkpoint.
tokenizers = {
    checkpoint: AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    for checkpoint in _READER_LM_CHECKPOINTS
}
@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Generate Markdown from HTML using ReaderLM (0.5b or 1.5b) models.
    - Includes both model-based generation and a rule-based markdownify output.

    Args:
        html_content: Raw HTML to convert.
        model_id: Key into `models` / `tokenizers` selecting the checkpoint.

    Returns:
        A `(model_markdown, markdownify_markdown)` tuple of strings.

    Raises:
        KeyError: If `model_id` is not one of the loaded checkpoints.
    """
    print(f"Running example with model: {model_id}...")  # Debug log
    if not html_content:
        # Nothing to convert: avoid generating from an empty chat message.
        return "", ""

    model = models[model_id]  # Select the model based on the input ID
    tokenizer = tokenizers[model_id]  # Retrieve the corresponding tokenizer

    # Construct the chat-based input for the model
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log, preview input text

    # Tokenize the input text
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Greedy decoding. No temperature is passed: it has no effect when
    # sampling is disabled and only produces a configuration warning.
    outputs = model.generate(
        inputs, max_new_tokens=1024, do_sample=False, repetition_penalty=1.08
    )

    # Extract the assistant's turn. The closing marker is optional because a
    # generation cut off at max_new_tokens never emits <|im_end|>; requiring
    # it made the lookup come back empty and the indexing raise IndexError.
    decoded = tokenizer.decode(outputs[0])
    match = re.search(
        r"<\|im_start\|>assistant(.*?)(?:<\|im_end\|>|\Z)", decoded, re.DOTALL
    )
    if match:
        assistant_response = match.group(1)
    else:
        # No assistant header at all: fall back to the newly generated tokens.
        assistant_response = tokenizer.decode(
            outputs[0][inputs.shape[-1]:], skip_special_tokens=True
        )
    print("Model generation completed.")  # Debug log

    # Use markdownify as a rule-based fallback for comparison
    markdownify_output = markdownify(html_content)
    print("Rule-based markdownify output generated.")  # Debug log

    # Return both model-based and rule-based outputs
    return assistant_response, markdownify_output
# Example HTML for ReaderLM-1: a small to-do list fragment (header with an
# input and an "Add" control, followed by a six-item list). It is offered as
# the sample input of the ReaderLM-1 tab's gr.Examples.
example_html = """<div id="myDIV" class="header">
<h2>My To Do List</h2>
<input type="text" id="myInput" placeholder="Title...">
<span onclick="newElement()" class="addBtn">Add</span>
</div>
<ul id="myUL">
<li>Hit the gym</li>
<li class="checked">Pay bills</li>
<li>Meet George</li>
<li>Buy eggs</li>
<li>Read a book</li>
<li>Organize office</li>
</ul>"""
########################################################
# Combine everything into a single Gradio Blocks app #
########################################################
# Optional extra CSS for styling the ReaderLM-1 tab.
# CSS only recognises /* ... */ comments. A '#' note written after a
# declaration is parsed as the start of the next declaration, which makes
# that declaration invalid and silently dropped by the browser.
css = """
#output {
    height: 500px; /* Set the height of the output box */
    overflow: auto; /* Enable scrolling for large content */
    border: 1px solid #ccc; /* Add a border around the box */
}
"""
# Initialize the Gradio app with the Nymbo/Nymbo_Theme for styling.
# The four tabs below wire the module's functions to UI components:
# render_markdown, view_source_info, convert_html and run_example.
print("Initializing Gradio app...")  # Debug log
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")  # Add a title for the tab
        with gr.Row():
            with gr.Column():
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",  # Input for Markdown text
                )
            with gr.Column():
                md_output = gr.HTML(
                    label="Rendered Output"  # Display the rendered HTML output
                )
        # Update the output whenever the input changes
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)

    ########################################################
    # TAB 2: View Source Info
    ########################################################
    with gr.Tab("View Source Info"):
        gr.Markdown("## View HTML Source Code")  # Tab description
        source_input = gr.Textbox(
            label="Enter URL (with or without 'view-source:')",
            placeholder="e.g., https://example.com or view-source:example.com",
        )
        source_output = gr.Textbox(
            label="HTML Source Code",
            lines=20,
        )
        source_button = gr.Button("Fetch Source")  # Button to fetch source
        source_button.click(
            fn=view_source_info,
            inputs=source_input,
            outputs=source_output,
        )

    ########################################################
    # TAB 3: ReaderLM-2 Converter (HTML → Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")  # Tab description
        # NOTE(review): the row is assumed to hold the HTML input and the
        # format selector only - confirm against the intended layout.
        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input",  # Input for raw HTML
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],  # Choose the output format
                label="Output Format",
                value="Markdown",  # Default to Markdown output
            )
        custom_prompt_2 = gr.Textbox(
            lines=2,
            placeholder="Optional: Enter a custom prompt...",
            label="Custom System Prompt",
        )
        convert_btn_2 = gr.Button("Convert")  # Button to trigger conversion
        converted_output_2 = gr.Textbox(
            lines=20,
            label="Converted Output",  # Display the converted output
        )
        # Provide usage details for the converter
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )
        # Connect the button click event to the conversion function
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2,
        )
        # Add example inputs for demonstration. The custom prompt is left
        # empty on purpose: convert_html treats any non-empty value as the
        # whole instruction, so placeholder text here would replace the
        # "Convert the following HTML into ..." prompt.
        gr.Examples(
            examples=[
                ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown", ""],
                ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON", ""],
            ],
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2,
            fn=convert_html,
            cache_examples=False,  # Disable caching for dynamic examples
        )

    ########################################################
    # TAB 4: ReaderLM-1 HTML-to-Markdown
    ########################################################
    with gr.Tab("ReaderLM-1 Converter"):
        gr.Markdown(
            "\n"
            "# HTML-to-Markdown with ReaderLM-1\n"
            "Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b**\n"
            "to convert HTML to Markdown. Compare against rule-based `markdownify`.\n"
        )
        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),  # Allow selection between the two models
                    label="Model",
                    value="jinaai/reader-lm-1.5b",  # Default to the larger model
                )
                html_content = gr.Textbox(
                    label="HTML"  # Input for raw HTML
                )
                submit_btn = gr.Button(value="Submit")  # Button to trigger the model
            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output")  # Model-generated Markdown
                markdownify_output = gr.Textbox(label="Markdownify Output")  # Rule-based Markdown
        # Add example HTML input for demonstration. Only the HTML is supplied,
        # so run_example uses its default model for the example.
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            cache_examples=True,  # Cache example outputs
            label="Try example HTML",
        )
        # Connect the submit button to the run_example function
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output],
        )
# Finally, launch the combined demo app: log the step, then start serving
# the `demo` Blocks interface built above.
print("Launching the demo...") # Debug log
demo.launch()