Spaces:

JournalistsonHF
/

ai-scraper

Running

App Files Files Community

ai-scraper / app.py

fdaudens HF staff

add json checker

037744c 4 months ago

raw

history blame contribute delete

4.32 kB

	import os
	from dotenv import load_dotenv
	from scrapegraphai.graphs import SmartScraperGraph
	from scrapegraphai.utils import prettify_exec_info
	from langchain_community.llms import HuggingFaceEndpoint
	from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
	import gradio as gr
	import subprocess
	import json
	import re

	# Ensure Playwright installs required browsers and dependencies
	subprocess.run(["playwright", "install"])
	#subprocess.run(["playwright", "install-deps"])

	# Load environment variables
	load_dotenv()
	HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

	# Initialize the model instances
	#repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
	repo_id = "Qwen/Qwen2.5-72B-Instruct"

	llm_model_instance = HuggingFaceEndpoint(
	repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
	)

	embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
	api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
	)

	graph_config = {
	"llm": {
	"model_instance": llm_model_instance,
	"model_tokens": 100000,
	},
	"embeddings": {"model_instance": embedder_model_instance}
	}
	#######
	def clean_json_string(json_str):
	"""
	Removes any comments or prefixes before the actual JSON content.
	Returns the cleaned JSON string.
	"""
	# Find the first occurrence of '{'
	json_start = json_str.find('{')
	if json_start == -1:
	# If no '{' is found, try with '[' for arrays
	json_start = json_str.find('[')
	if json_start == -1:
	return json_str # Return original if no JSON markers found

	# Extract everything from the first JSON marker
	cleaned_str = json_str[json_start:]

	# Verify it's valid JSON
	try:
	json.loads(cleaned_str)
	return cleaned_str
	except json.JSONDecodeError:
	return json_str # Return original if cleaning results in invalid JSON

	def scrape_and_summarize(prompt, source):
	smart_scraper_graph = SmartScraperGraph(
	prompt=prompt,
	source=source,
	config=graph_config
	)
	result = smart_scraper_graph.run()

	# Clean the result if it's a string
	if isinstance(result, str):
	result = clean_json_string(result)

	exec_info = smart_scraper_graph.get_execution_info()
	return result, prettify_exec_info(exec_info)



	#######
	# def scrape_and_summarize(prompt, source):
	# smart_scraper_graph = SmartScraperGraph(
	# prompt=prompt,
	# source=source,
	# config=graph_config
	# )
	# result = smart_scraper_graph.run()
	# exec_info = smart_scraper_graph.get_execution_info()
	# return result, prettify_exec_info(exec_info)

	# Gradio interface
	with gr.Blocks() as demo:
	gr.Markdown("# Scrape websites, no-code version")
	gr.Markdown("""
	Easily scrape and summarize web content using advanced AI models on the Hugging Face Hub without writing any code. Input your desired prompt and source URL to get started.

	This is a no-code version of the excellent library [ScrapeGraphAI](https://github.com/VinciGit00/Scrapegraph-ai).

	It's a basic demo and a work in progress. Please contribute to it to make it more useful!

	Note: You might need to add "Output only the results; do not add any comments or include 'JSON OUTPUT' or similar phrases" in your prompt to ensure the LLM only provides the result.
	""")
	with gr.Row():
	with gr.Column():

	model_dropdown = gr.Textbox(label="Model", value="Qwen/Qwen2.5-72B-Instruct")
	prompt_input = gr.Textbox(label="Prompt", value="List all the press releases with their headlines and urls. Output only the results; do not add any comments or include 'JSON OUTPUT' or similar phrases.")
	source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
	scrape_button = gr.Button("Scrape and Summarize")

	with gr.Column():
	result_output = gr.JSON(label="Result")
	exec_info_output = gr.Textbox(label="Execution Info")

	scrape_button.click(
	scrape_and_summarize,
	inputs=[prompt_input, source_input],
	outputs=[result_output, exec_info_output]
	)

	# Launch the Gradio app
	if __name__ == "__main__":
	demo.launch()