import os
import subprocess

import streamlit as st

from jsonToText import convert_json_to_text
from llm import process_and_save_json

# Default XPath used when the user does not provide one
DEFAULT_XPATH = '//body'  # Change this to whatever default XPath you prefer


# Update the spider with the user's inputs and run it
def run_spider(website_url, xpath):
    # Extract the domain from the website URL
    domain = website_url.split("//")[-1].split("/")[0]

    # Rewrite the spider file in place with the user input
    # (start_urls, custom_xpath, and allowed_domains)
    spider_path = 'webscraper/webscraper/spiders/websiteSpider.py'

    # Read the spider file
    with open(spider_path, 'r') as file:
        spider_code = file.readlines()

    # Replace the start_urls, custom_xpath, and allowed_domains attributes
    for idx, line in enumerate(spider_code):
        if line.strip().startswith('start_urls ='):
            spider_code[idx] = f'    start_urls = ["{website_url}"]\n'
        if line.strip().startswith('custom_xpath ='):
            spider_code[idx] = f'    custom_xpath = "{xpath}"\n'
        if line.strip().startswith('allowed_domains ='):
            spider_code[idx] = f'    allowed_domains = ["{domain}"]\n'

    # Write back the modified spider code
    with open(spider_path, 'w') as file:
        file.writelines(spider_code)

    # Run the Scrapy spider using subprocess
    spider_dir = 'webscraper/webscraper'  # Directory where the spider is located
    subprocess.run(['scrapy', 'crawl', 'websiteSpider'], cwd=spider_dir)


# Streamlit UI
st.title('Web Scraper Interface')

# User input for the website link and the XPath
website_url = st.text_input('Enter the website URL:', '')
xpath = st.text_input('Enter the XPath:', DEFAULT_XPATH)  # Falls back to DEFAULT_XPATH

# Tracks whether the spider ran during this execution; Streamlit reruns the
# whole script on every interaction, so this resets between clicks and the
# os.path.exists() check below keeps the download buttons visible afterwards
spider_ran = False

# Button to run the spider
if st.button('Run Spider'):
    if website_url:
        st.write(f'Running the spider on {website_url} using XPath: {xpath}')
        run_spider(website_url, xpath)
        st.success('Spider finished running!')
        convert_json_to_text("webscraper/webscraper/scraped.json", "output.txt")
        spider_ran = True
    else:
        st.error('Please provide a website URL.')

# If the spider has run (now or on an earlier interaction), show download buttons
if spider_ran or os.path.exists("webscraper/webscraper/scraped.json"):
    # Option to download the output.txt file
    with open("output.txt", "r") as file:
        st.download_button(
            label="Download Output Text",
            data=file,
            file_name="output.txt",
            mime="text/plain"
        )

    # Option to download the scraped.json file
    with open("webscraper/webscraper/scraped.json", "r") as json_file:
        st.download_button(
            label="Download Scraped JSON",
            data=json_file,
            file_name="scraped.json",
            mime="application/json"
        )

# Section header for organizing the scraped data
st.title("Do you want to organize the scraped data?")

# Use session state so the "Yes" click survives Streamlit reruns
if "organize_requested" not in st.session_state:
    st.session_state.organize_requested = False

# Button to reveal the organize section
if st.button("Yes"):
    st.session_state.organize_requested = True

# If the user clicked "Yes", show inputs for 'about', the details, and the API key
if st.session_state.organize_requested:
    # One or two words describing the data, used to guide the organizing step
    about = st.text_input('Enter one or two words that describe the data, like "books" or "events":', '')

    # Custom details to extract from the data (e.g., name, price, stock)
    details = st.text_input('Enter the details to extract (comma separated), like name, date:', '')

    # API key for the LLM that organizes the data
    api_key = st.text_input('Enter your Groq API key:', type="password")

    # Button to organize and save the JSON
    if st.button("Organize"):
        if about and details and api_key:
            # Convert the comma-separated details into a list
            details_list = [detail.strip() for detail in details.split(',')]

            # Process and save the JSON with the provided details and API key
            process_and_save_json("output.txt", "organize.json",
                                  api_key=api_key, about=about, details=details_list)
            st.success('Data has been organized and saved to organize.json.')

            # Option to download the organized JSON file
            with open("organize.json", "r") as organized_json_file:
                st.download_button(
                    label="Download Organized JSON",
                    data=organized_json_file,
                    file_name="organize.json",
                    mime="application/json"
                )
        else:
            st.error('Please provide a description, details, and your API key before organizing.')