Spaces:

Nymbo
/

Markdown-Studio

Paused

App Files Files Community

Markdown-Studio / app.py

Nymbo

adding source fetcher

da60cd1 verified 4 days ago

raw

history blame contribute delete

12.5 kB

	import gradio as gr
	import markdown
	from markdown.extensions.tables import TableExtension
	from markdown.extensions.fenced_code import FencedCodeExtension
	from markdown.extensions.toc import TocExtension
	from markdown.extensions.attr_list import AttrListExtension
	from markdown.extensions.codehilite import CodeHiliteExtension
	import requests
	from bs4 import BeautifulSoup

	# For ReaderLM-2 functionality
	from transformers import pipeline

	# For ReaderLM-1 functionality
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import spaces
	import re
	from markdownify import markdownify

	######################################
	# 1) MARKDOWN-STUDIO FUNCTIONALITY #
	######################################
	def render_markdown(md_text):
	    """
	    Convert Markdown source text into an HTML string.

	    The conversion enables tables, fenced code blocks, a table of contents
	    (base heading level 2), attribute lists, and code highlighting emitted
	    under the "highlight" CSS class without line numbers.
	    """
	    print("Rendering markdown input to HTML...")  # Debug log
	    enabled_extensions = [
	        TableExtension(),  # Markdown tables
	        FencedCodeExtension(),  # ``` fenced code blocks
	        TocExtension(baselevel=2),  # Table of contents, headings start at level 2
	        AttrListExtension(),  # {: #id .class } attribute lists
	        CodeHiliteExtension(linenums=False, css_class="highlight"),  # Code highlighting
	    ]
	    rendered_html = markdown.markdown(md_text, extensions=enabled_extensions)
	    return rendered_html

	######################################
	# 2) VIEW SOURCE INFO FUNCTIONALITY #
	######################################
	def view_source_info(url):
	    """
	    Fetch the raw HTML source of a web page.

	    Args:
	        url: Address to fetch. A leading ``view-source:`` prefix is dropped,
	            and ``https://`` is prepended when no http(s) scheme is present.

	    Returns:
	        The response body as text, or a short human-readable message when
	        the URL is empty or the request fails.
	    """
	    prefix = "view-source:"
	    url = (url or "").strip()
	    if url.startswith(prefix):
	        # Slice rather than str.replace so only the leading prefix is removed,
	        # not an identical substring that happens to appear later in the URL.
	        url = url[len(prefix):].strip()
	    if not url:
	        return "Please enter a URL."
	    if not url.startswith(("http://", "https://")):
	        url = "https://" + url
	    print(f"Fetching source for URL: {url}...")  # Debug log

	    # NOTE(security): the URL comes straight from the UI, so this is a
	    # server-side fetch of an arbitrary address (SSRF risk). Restrict the
	    # allowed hosts if this runs somewhere with access to internal services.
	    try:
	        # Without a timeout, requests can wait forever on an unresponsive host.
	        response = requests.get(url, timeout=15)
	    except requests.RequestException as exc:
	        return f"Error fetching {url}: {exc}"
	    # The body is returned even for non-2xx statuses: the source of an error
	    # page is still the source the user asked to see.
	    return response.text

	######################################
	# 3) READERLM-2 FUNCTIONALITY #
	######################################
	# Checkpoint behind the ReaderLM-2 tab; the text-generation pipeline is
	# built once, when this module is first executed.
	model_name = "jinaai/ReaderLM-v2"
	print(f"Loading model: {model_name}...")  # Debug log
	html_converter = pipeline(task="text-generation", model=model_name)

	def convert_html(html_input, output_format, custom_prompt=None):
	    """
	    Convert raw HTML to Markdown or JSON with the ReaderLM-v2 pipeline.

	    Args:
	        html_input: Raw HTML text to convert.
	        output_format: Target format name inserted into the default prompt
	            (the UI offers "Markdown" and "JSON").
	        custom_prompt: Optional instruction that replaces the default
	            "Convert the following HTML into ..." prompt. Blank or
	            whitespace-only values are treated as absent.

	    Returns:
	        The generated text with the leading prompt echo removed, or an empty
	        string when there is no HTML to convert.
	    """
	    if not html_input or not html_input.strip():
	        # Nothing to convert: skip the expensive model call entirely.
	        print("No HTML input provided; skipping conversion.")  # Debug log
	        return ""

	    instruction = (custom_prompt or "").strip()
	    if instruction:
	        prompt = f"{instruction}\n\n{html_input}"
	    else:
	        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

	    print(f"Converting HTML to {output_format} with prompt: {instruction or 'default'}...")  # Debug log
	    print(f"HTML input: {html_input[:100]}...")  # Debug log, preview first 100 characters of input

	    # Use the pipeline to generate the conversion
	    response = html_converter(prompt, max_length=9999, num_return_sequences=1)
	    converted_output = response[0]["generated_text"]

	    # The generated text starts with an echo of the prompt. Remove only that
	    # leading copy; a blanket str.replace would also delete any later
	    # occurrence of the same text inside the model's answer.
	    if converted_output.startswith(prompt):
	        converted_output = converted_output[len(prompt):]
	    converted_output = converted_output.strip()
	    print("Conversion completed.")  # Debug log
	    return converted_output

	######################################
	# 4) READERLM-1 FUNCTIONALITY #
	######################################
	# ReaderLM-1 checkpoints are loaded eagerly, keyed by their model ID:
	# the 0.5b model first, then the larger 1.5b model.
	print("Loading ReaderLM-1 models and tokenizers...")  # Debug log
	models = {
	    checkpoint: AutoModelForCausalLM.from_pretrained(
	        checkpoint, trust_remote_code=True
	    ).eval().to("cuda")  # eval mode, moved onto the GPU
	    for checkpoint in ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")
	}
	# One tokenizer per loaded model, in the same key order as `models`.
	tokenizers = {
	    checkpoint: AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
	    for checkpoint in models
	}

	def _extract_assistant_text(decoded):
	    """Return the assistant turn from chat-formatted model output, or None if absent."""
	    # In a raw string a single `\|` is a literal pipe. The previous `\\|` was an
	    # escaped backslash followed by regex alternation, so the pattern matched a
	    # bare ">" and always captured an empty string.
	    # The closing marker is optional (`\Z`) so output cut off by the token
	    # limit before <|im_end|> still yields the text produced so far.
	    match = re.search(
	        r"<\|im_start\|>assistant(.*?)(?:<\|im_end\|>|\Z)", decoded, re.DOTALL
	    )
	    return None if match is None else match.group(1)

	@spaces.GPU
	def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
	    """
	    Generate Markdown from HTML using a ReaderLM (0.5b or 1.5b) model.

	    Args:
	        html_content: Raw HTML to convert.
	        model_id: Key into the module-level `models` / `tokenizers` dicts.

	    Returns:
	        A ``(model_markdown, markdownify_markdown)`` tuple: the model's
	        assistant response and the rule-based `markdownify` conversion of the
	        same HTML. Both are empty strings when no HTML is given.

	    Raises:
	        KeyError: If `model_id` is not one of the loaded models.
	    """
	    if not html_content or not html_content.strip():
	        # Nothing to convert: avoid running generation on an empty message.
	        return "", ""

	    print(f"Running example with model: {model_id}...")  # Debug log
	    model = models[model_id]  # Select the model based on the input ID
	    tokenizer = tokenizers[model_id]  # Retrieve the corresponding tokenizer

	    # Construct the chat-based input for the model
	    messages = [{"role": "user", "content": html_content}]
	    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
	    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log, preview input text

	    # Tokenize the input text
	    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

	    # Generate output using the model
	    outputs = model.generate(
	        inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
	    )

	    # Extract the assistant's response from the generated output
	    assistant_response = _extract_assistant_text(tokenizer.decode(outputs[0]))
	    if assistant_response is None:
	        # No assistant marker in the output: fall back to only the tokens
	        # generated after the prompt instead of raising IndexError.
	        assistant_response = tokenizer.decode(
	            outputs[0][inputs.shape[-1]:], skip_special_tokens=True
	        )
	    print("Model generation completed.")  # Debug log

	    # Use markdownify as a rule-based fallback for comparison
	    markdownify_output = markdownify(html_content)
	    print("Rule-based markdownify output generated.")  # Debug log

	    # Return both model-based and rule-based outputs
	    return assistant_response, markdownify_output

	# Sample HTML (a small to-do list widget) offered as the clickable example
	# input in the ReaderLM-1 tab.
	example_html = """<div id="myDIV" class="header">
	<h2>My To Do List</h2>
	<input type="text" id="myInput" placeholder="Title...">
	<span onclick="newElement()" class="addBtn">Add</span>
	</div>

	<ul id="myUL">
	<li>Hit the gym</li>
	<li class="checked">Pay bills</li>
	<li>Meet George</li>
	<li>Buy eggs</li>
	<li>Read a book</li>
	<li>Organize office</li>
	</ul>"""

	########################################################
	# Combine everything into a single Gradio Blocks app #
	########################################################

	# Optional extra CSS for styling the ReaderLM-1 tab.
	# CSS only recognises /* ... */ comments. A "#" comment is parsed as the start
	# of the next declaration, which makes the browser discard that declaration
	# (previously `overflow` and `border` were silently dropped).
	# NOTE(review): no component visible in this file sets elem_id="output", so
	# this rule may currently match nothing -- confirm the intended target.
	css = """
	#output {
	    height: 500px; /* Set the height of the output box */
	    overflow: auto; /* Enable scrolling for large content */
	    border: 1px solid #ccc; /* Add a border around the box */
	}
	"""

	# Build the combined Gradio app: one Blocks container with four tabs, styled
	# with the Nymbo/Nymbo_Theme theme and the extra CSS defined above.
	print("Initializing Gradio app...")  # Debug log
	with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:

	    ########################################################
	    # TAB 1: Markdown Suite (live preview)
	    ########################################################
	    with gr.Tab("Live Preview"):
	        gr.Markdown("# Markdown Suite")  # Add a title for the tab

	        with gr.Row():
	            with gr.Column():
	                md_input = gr.Textbox(
	                    lines=20,
	                    placeholder="Write your markdown here...",
	                    label="Markdown Input",  # Input for Markdown text
	                )
	            with gr.Column():
	                md_output = gr.HTML(
	                    label="Rendered Output"  # Display the rendered HTML output
	                )

	        # Update the output whenever the input changes
	        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)

	    ########################################################
	    # TAB 2: View Source Info
	    ########################################################
	    with gr.Tab("View Source Info"):
	        gr.Markdown("## View HTML Source Code")  # Tab description

	        source_input = gr.Textbox(
	            label="Enter URL (with or without 'view-source:')",
	            placeholder="e.g., https://example.com or view-source:example.com",
	        )
	        source_output = gr.Textbox(
	            label="HTML Source Code",
	            lines=20,
	        )
	        source_button = gr.Button("Fetch Source")  # Button to fetch source

	        source_button.click(
	            fn=view_source_info,
	            inputs=source_input,
	            outputs=source_output,
	        )

	    ########################################################
	    # TAB 3: ReaderLM-2 Converter (HTML → Markdown/JSON)
	    ########################################################
	    with gr.Tab("ReaderLM-2 Converter"):
	        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")  # Tab description

	        with gr.Row():
	            html_input_2 = gr.Textbox(
	                lines=10,
	                placeholder="Paste your raw HTML here...",
	                label="Raw HTML Input",  # Input for raw HTML
	            )
	            output_format_2 = gr.Radio(
	                ["Markdown", "JSON"],  # Choose the output format
	                label="Output Format",
	                value="Markdown",  # Default to Markdown output
	            )
	            custom_prompt_2 = gr.Textbox(
	                lines=2,
	                placeholder="Optional: Enter a custom prompt...",
	                label="Custom System Prompt",
	            )

	        convert_btn_2 = gr.Button("Convert")  # Button to trigger conversion
	        converted_output_2 = gr.Textbox(
	            lines=20,
	            label="Converted Output",  # Display the converted output
	        )

	        # Provide usage details for the converter
	        gr.Markdown(
	            "Convert raw HTML into formatted Markdown or JSON using JinaAI ReaderLM-v2."
	        )

	        # Connect the button click event to the conversion function
	        convert_btn_2.click(
	            fn=convert_html,
	            inputs=[html_input_2, output_format_2, custom_prompt_2],
	            outputs=converted_output_2,
	        )

	        # Add example inputs for demonstration.
	        # The custom-prompt column is left empty on purpose: convert_html uses
	        # any non-empty custom prompt INSTEAD of its default "Convert the
	        # following HTML into ..." instruction, so placeholder text there would
	        # be sent to the model as the actual prompt.
	        gr.Examples(
	            examples=[
	                ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown", ""],
	                ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON", ""],
	            ],
	            inputs=[html_input_2, output_format_2, custom_prompt_2],
	            outputs=converted_output_2,
	            fn=convert_html,
	            cache_examples=False,  # Disable caching for dynamic examples
	        )

	    ########################################################
	    # TAB 4: ReaderLM-1 HTML-to-Markdown
	    ########################################################
	    with gr.Tab("ReaderLM-1 Converter"):
	        gr.Markdown("""
	        # HTML-to-Markdown with ReaderLM-1
	        Use either jinaai/reader-lm-0.5b or jinaai/reader-lm-1.5b
	        to convert HTML to Markdown. Compare against rule-based `markdownify`.
	        """)

	        with gr.Row():
	            with gr.Column():
	                model_selector = gr.Dropdown(
	                    choices=list(models.keys()),  # Allow selection between the two models
	                    label="Model",
	                    value="jinaai/reader-lm-1.5b",  # Default to the larger model
	                )
	                html_content = gr.Textbox(
	                    label="HTML"  # Input for raw HTML
	                )
	                submit_btn = gr.Button(value="Submit")  # Button to trigger the model

	            with gr.Column():
	                model_output_text = gr.Textbox(label="Reader LM Output")  # Model-generated Markdown
	                markdownify_output = gr.Textbox(label="Markdownify Output")  # Rule-based Markdown

	        # Add example HTML input for demonstration. Only the HTML is supplied,
	        # so run_example falls back to its default model_id for this example.
	        gr.Examples(
	            examples=[
	                [example_html],
	            ],
	            inputs=[html_content],
	            outputs=[model_output_text, markdownify_output],
	            fn=run_example,
	            cache_examples=True,  # Cache example outputs
	            label="Try example HTML",
	        )

	        # Connect the submit button to the run_example function
	        submit_btn.click(
	            fn=run_example,
	            inputs=[html_content, model_selector],
	            outputs=[model_output_text, markdownify_output],
	        )

	# Finally, launch the combined demo app.
	# launch() is called with no arguments, so Gradio's default settings apply.
	print("Launching the demo...") # Debug log
	demo.launch()