# Streamlit app: scrape a website with a Scrapy spider, convert the results
# to text, and optionally organize them into JSON via an LLM (Groq API).
import os
import subprocess

import streamlit as st

from jsonToText import convert_json_to_text
from llm import process_and_save_json
# XPath used when the user does not supply one of their own.
DEFAULT_XPATH = '//body'
def run_spider(website_url, xpath):
    """Patch the Scrapy spider's configuration in place and run it.

    Rewrites the ``start_urls``, ``custom_xpath`` and ``allowed_domains``
    class attributes of the spider source file from the user's input, then
    launches ``scrapy crawl websiteSpider`` as a subprocess.

    Args:
        website_url: Full URL to crawl (e.g. "https://example.com/page").
        xpath: XPath expression the spider will use for extraction.
    """
    # Derive the bare host ("https://example.com/x" -> "example.com") so the
    # spider's allowed_domains filter matches the target site.
    domain = website_url.split("//")[-1].split("/")[0]

    spider_path = 'webscraper/webscraper/spiders/websiteSpider.py'

    # Read the spider source so its class attributes can be patched in place.
    with open(spider_path, 'r') as file:
        spider_code = file.readlines()

    # NOTE(review): the values are interpolated into Python source unescaped;
    # a URL or XPath containing a double quote would corrupt the spider file.
    for idx, line in enumerate(spider_code):
        stripped = line.strip()
        if stripped.startswith('start_urls ='):
            spider_code[idx] = f'    start_urls = ["{website_url}"]\n'
        if stripped.startswith('custom_xpath ='):
            spider_code[idx] = f'    custom_xpath = "{xpath}"\n'
        if stripped.startswith('allowed_domains ='):
            spider_code[idx] = f'    allowed_domains = ["{domain}"]\n'

    # Write the modified spider source back.
    with open(spider_path, 'w') as file:
        file.writelines(spider_code)

    # Run scrapy with an argument list and shell=False (the default) instead
    # of a shell string: the command is fixed text, so there is no reason to
    # involve a shell, and this removes any shell-metacharacter attack surface.
    spider_dir = 'webscraper/webscraper'  # scrapy project root (where scrapy.cfg lives)
    subprocess.run(['scrapy', 'crawl', 'websiteSpider'], cwd=spider_dir)
# ---- Streamlit UI: run the spider ----
st.title('Web Scraper Interface')

# Collect the crawl target and the extraction XPath from the user.
website_url = st.text_input('Enter the website URL:', '')
xpath = st.text_input('Enter the XPath:', DEFAULT_XPATH)

# Tracks whether this script run actually executed the spider.
spider_ran = False

if st.button('Run Spider'):
    if not website_url:
        st.error('Please provide a website URL.')
    else:
        st.write(f'Running the spider on {website_url} using XPath: {xpath}')
        run_spider(website_url, xpath)
        st.success('Spider finished running!')
        # Flatten the scraped JSON into a plain-text file for download/LLM use.
        convert_json_to_text("webscraper/webscraper/scraped.json", "output.txt")
        spider_ran = True
# ---- Download section: offer results from this run or a previous one ----
if spider_ran or os.path.exists("webscraper/webscraper/scraped.json"):
    # Guard each file individually: scraped.json can exist while output.txt
    # does not (e.g. the text conversion failed or the file was removed),
    # which previously raised FileNotFoundError and crashed the app.
    if os.path.exists("output.txt"):
        with open("output.txt", "r") as file:
            st.download_button(
                label="Download Output Text",
                data=file,
                file_name="output.txt",
                mime="text/plain"
            )
    if os.path.exists("webscraper/webscraper/scraped.json"):
        with open("webscraper/webscraper/scraped.json", "r") as json_file:
            st.download_button(
                label="Download Scraped JSON",
                data=json_file,
                file_name="scraped.json",
                mime="application/json"
            )
# ---- Optional LLM-based organization of the scraped text ----
st.title("Do you want to organize the scraped data?")

# Persist the user's choice in session state: Streamlit reruns the whole
# script on every widget interaction, so a plain local flag would reset.
if "organize_requested" not in st.session_state:
    st.session_state.organize_requested = False

if st.button("Yes"):
    st.session_state.organize_requested = True

if st.session_state.organize_requested:
    # Short description of the data, the fields to extract, and the API key.
    about = st.text_input('Enter one or two words that describe the data like "books" or "events":', '')
    details = st.text_input('Enter the details to extract (comma separated) like name, date', '')
    api_key = st.text_input('Enter your Groq API key:', type="password")

    if st.button("Organize"):
        if about and details and api_key:
            # Split on commas, trimming whitespace and dropping empty entries
            # (e.g. a trailing comma in "name, date," previously produced '').
            details_list = [detail.strip() for detail in details.split(',') if detail.strip()]
            # Run the LLM organization pass over the scraped text.
            process_and_save_json("output.txt", "organize.json", api_key=api_key, about=about, details=details_list)
            st.success('Data has been organized and saved to organize.json.')
            # Offer the organized JSON for download.
            with open("organize.json", "r") as organized_json_file:
                st.download_button(
                    label="Download Organized JSON",
                    data=organized_json_file,
                    file_name="organize.json",
                    mime="application/json"
                )
        else:
            st.error('Please provide a description, details, and your API key before organizing.')