# Streamlit front-end for the Scrapy-based web scraper.
# (Origin: ParitKansal, "Add all files", commit 1728c48)
import streamlit as st
import subprocess
import os
from jsonToText import convert_json_to_text
from llm import process_and_save_json
# Default XPath used when the user leaves the XPath field unchanged/blank.
DEFAULT_XPATH = '//body' # You can change this to whatever default XPath you prefer
# Function to update the spider with user inputs and run it
def run_spider(website_url, xpath):
    """Configure the Scrapy spider for *website_url*/*xpath* and run it.

    Rewrites the ``start_urls``, ``custom_xpath`` and ``allowed_domains``
    class attributes inside the spider source file, then invokes
    ``scrapy crawl websiteSpider`` from the project directory.

    Args:
        website_url: Full URL to crawl, e.g. ``"https://example.com/page"``.
        xpath: XPath expression the spider should extract.
    """
    # Derive the bare domain ("example.com") from the URL for allowed_domains.
    domain = website_url.split("//")[-1].split("/")[0]

    spider_path = 'webscraper/webscraper/spiders/websiteSpider.py'

    # Read the current spider source.
    with open(spider_path, 'r') as file:
        spider_code = file.readlines()

    # Rewrite the three configurable class attributes in place.
    # NOTE(review): user input is interpolated verbatim into Python source;
    # a quote character in the URL/XPath would break (or inject code into)
    # the generated spider — consider escaping, or passing the values via
    # `scrapy crawl -a key=value` instead of editing the file.
    for idx, line in enumerate(spider_code):
        stripped = line.strip()
        # The three prefixes are mutually exclusive, so elif avoids
        # re-testing a line that already matched.
        if stripped.startswith('start_urls ='):
            spider_code[idx] = f'    start_urls = ["{website_url}"]\n'
        elif stripped.startswith('custom_xpath ='):
            spider_code[idx] = f'    custom_xpath = "{xpath}"\n'
        elif stripped.startswith('allowed_domains ='):
            spider_code[idx] = f'    allowed_domains = ["{domain}"]\n'

    # Write back the modified spider code.
    with open(spider_path, 'w') as file:
        file.writelines(spider_code)

    # Run the crawl with an argument list and the default shell=False:
    # nothing here needs a shell, and this removes the shell-injection
    # surface the previous `shell=True` string command carried.
    subprocess.run(['scrapy', 'crawl', 'websiteSpider'],
                   cwd='webscraper/webscraper')
# --- Streamlit UI: crawl inputs and trigger ---
st.title('Web Scraper Interface')

# Collect the target URL and the XPath to extract (pre-filled with default).
website_url = st.text_input('Enter the website URL:', '')
xpath = st.text_input('Enter the XPath:', DEFAULT_XPATH)

# True only on the rerun in which the spider actually executed.
spider_ran = False

if st.button('Run Spider'):
    # Guard clause: refuse to crawl without a target URL.
    if not website_url:
        st.error('Please provide a website URL.')
    else:
        st.write(f'Running the spider on {website_url} using XPath: {xpath}')
        run_spider(website_url, xpath)
        st.success('Spider finished running!')
        # Flatten the freshly scraped JSON into a plain-text report.
        convert_json_to_text("webscraper/webscraper/scraped.json", "output.txt")
        spider_ran = True
# --- Download section: offer result files once a crawl has produced them ---
# The filesystem check (not just `spider_ran`) keeps the buttons visible
# across Streamlit reruns, where top-level variables reset to False.
if spider_ran or os.path.exists("webscraper/webscraper/scraped.json"):
    # Guard each file individually: scraped.json can exist while output.txt
    # does not (e.g. the JSON-to-text conversion failed or the app was
    # restarted mid-pipeline). The previous unconditional open() would
    # crash the whole app with FileNotFoundError in that case.
    if os.path.exists("output.txt"):
        # Offer the flattened text report for download.
        with open("output.txt", "r") as file:
            st.download_button(
                label="Download Output Text",
                data=file,
                file_name="output.txt",
                mime="text/plain"
            )
    if os.path.exists("webscraper/webscraper/scraped.json"):
        # Offer the raw scraped JSON for download.
        with open("webscraper/webscraper/scraped.json", "r") as json_file:
            st.download_button(
                label="Download Scraped JSON",
                data=json_file,
                file_name="scraped.json",
                mime="application/json"
            )
# --- Organize section: optionally post-process output.txt via the LLM ---
st.title("Do you want to organize the scraped data?")

# Persist the user's choice across Streamlit reruns.
if "organize_requested" not in st.session_state:
    st.session_state.organize_requested = False

# Button to reveal the organize inputs.
if st.button("Yes"):
    st.session_state.organize_requested = True

if st.session_state.organize_requested:
    # Short description of the data, used to steer the LLM.
    about = st.text_input('Enter one or two words that describe the data like "books" or "events":', '')
    # Comma-separated field names to extract from the scraped text.
    details = st.text_input('Enter the details to extract (comma separated) like name, date', '')
    # Groq API key (masked input).
    api_key = st.text_input('Enter your Groq API key:', type="password")

    if st.button("Organize"):
        if about and details and api_key:
            # Convert comma-separated details into a clean list.
            details_list = [detail.strip() for detail in details.split(',')]
            # Run the LLM post-processing and persist the result.
            process_and_save_json("output.txt", "organize.json", api_key=api_key, about=about, details=details_list)
            st.success('Data has been organized and saved to organize.json.')
        else:
            st.error('Please provide a description, details, and your API key before organizing.')

    # Render the download whenever the organized file exists. Keeping it
    # OUTSIDE the "Organize" button branch fixes a Streamlit pitfall:
    # clicking st.download_button triggers a rerun in which the button
    # branch is False, so a download button nested inside it would vanish
    # the moment the user clicked it.
    if os.path.exists("organize.json"):
        with open("organize.json", "r") as organized_json_file:
            st.download_button(
                label="Download Organized JSON",
                data=organized_json_file,
                file_name="organize.json",
                mime="application/json"
            )