# Streamlit front-end for the Scrapy-based web scraper.
# (Origin: ParitKansal, "Add all files", commit 1728c48)
import streamlit as st
import subprocess
import os
from jsonToText import convert_json_to_text
from llm import process_and_save_json
# Default XPath used when the user leaves the XPath field unchanged/blank.
DEFAULT_XPATH = '//body' # You can change this to whatever default XPath you prefer
# Function to update the spider with user inputs and run it
def run_spider(website_url, xpath):
    """Configure the Scrapy spider for *website_url*/*xpath* and run it.

    Rewrites the ``start_urls``, ``custom_xpath`` and ``allowed_domains``
    class attributes inside the spider source file, then invokes
    ``scrapy crawl websiteSpider`` from the project directory.

    Args:
        website_url: Full URL to crawl, e.g. ``"https://example.com/page"``.
        xpath: XPath expression the spider should extract.
    """
    # Derive the bare domain ("example.com") from the URL for allowed_domains.
    domain = website_url.split("//")[-1].split("/")[0]

    spider_path = 'webscraper/webscraper/spiders/websiteSpider.py'

    # Read the current spider source.
    with open(spider_path, 'r') as file:
        spider_code = file.readlines()

    # Rewrite the three configurable class attributes in place.
    # NOTE(review): user input is interpolated verbatim into Python source;
    # a quote character in the URL/XPath would break (or inject code into)
    # the generated spider — consider escaping, or passing the values via
    # `scrapy crawl -a key=value` instead of editing the file.
    for idx, line in enumerate(spider_code):
        stripped = line.strip()
        # The three prefixes are mutually exclusive, so elif avoids
        # re-testing a line that already matched.
        if stripped.startswith('start_urls ='):
            spider_code[idx] = f'    start_urls = ["{website_url}"]\n'
        elif stripped.startswith('custom_xpath ='):
            spider_code[idx] = f'    custom_xpath = "{xpath}"\n'
        elif stripped.startswith('allowed_domains ='):
            spider_code[idx] = f'    allowed_domains = ["{domain}"]\n'

    # Write back the modified spider code.
    with open(spider_path, 'w') as file:
        file.writelines(spider_code)

    # Run the crawl with an argument list and the default shell=False:
    # nothing here needs a shell, and this removes the shell-injection
    # surface the previous `shell=True` string command carried.
    subprocess.run(['scrapy', 'crawl', 'websiteSpider'],
                   cwd='webscraper/webscraper')
# --- Streamlit UI: crawl inputs and trigger ---
st.title('Web Scraper Interface')

# Collect the target URL and the XPath to extract (pre-filled with default).
website_url = st.text_input('Enter the website URL:', '')
xpath = st.text_input('Enter the XPath:', DEFAULT_XPATH)

# True only on the rerun in which the spider actually executed.
spider_ran = False

if st.button('Run Spider'):
    # Guard clause: refuse to crawl without a target URL.
    if not website_url:
        st.error('Please provide a website URL.')
    else:
        st.write(f'Running the spider on {website_url} using XPath: {xpath}')
        run_spider(website_url, xpath)
        st.success('Spider finished running!')
        # Flatten the freshly scraped JSON into a plain-text report.
        convert_json_to_text("webscraper/webscraper/scraped.json", "output.txt")
        spider_ran = True
# --- Download section: offer result files once a crawl has produced them ---
# The filesystem check (not just `spider_ran`) keeps the buttons visible
# across Streamlit reruns, where top-level variables reset to False.
if spider_ran or os.path.exists("webscraper/webscraper/scraped.json"):
    # Guard each file individually: scraped.json can exist while output.txt
    # does not (e.g. the JSON-to-text conversion failed or the app was
    # restarted mid-pipeline). The previous unconditional open() would
    # crash the whole app with FileNotFoundError in that case.
    if os.path.exists("output.txt"):
        # Offer the flattened text report for download.
        with open("output.txt", "r") as file:
            st.download_button(
                label="Download Output Text",
                data=file,
                file_name="output.txt",
                mime="text/plain"
            )
    if os.path.exists("webscraper/webscraper/scraped.json"):
        # Offer the raw scraped JSON for download.
        with open("webscraper/webscraper/scraped.json", "r") as json_file:
            st.download_button(
                label="Download Scraped JSON",
                data=json_file,
                file_name="scraped.json",
                mime="application/json"
            )
# --- Organize section: optionally post-process output.txt via the LLM ---
st.title("Do you want to organize the scraped data?")

# Persist the user's choice across Streamlit reruns.
if "organize_requested" not in st.session_state:
    st.session_state.organize_requested = False

# Button to reveal the organize inputs.
if st.button("Yes"):
    st.session_state.organize_requested = True

if st.session_state.organize_requested:
    # Short description of the data, used to steer the LLM.
    about = st.text_input('Enter one or two words that describe the data like "books" or "events":', '')
    # Comma-separated field names to extract from the scraped text.
    details = st.text_input('Enter the details to extract (comma separated) like name, date', '')
    # Groq API key (masked input).
    api_key = st.text_input('Enter your Groq API key:', type="password")

    if st.button("Organize"):
        if about and details and api_key:
            # Convert comma-separated details into a clean list.
            details_list = [detail.strip() for detail in details.split(',')]
            # Run the LLM post-processing and persist the result.
            process_and_save_json("output.txt", "organize.json", api_key=api_key, about=about, details=details_list)
            st.success('Data has been organized and saved to organize.json.')
        else:
            st.error('Please provide a description, details, and your API key before organizing.')

    # Render the download whenever the organized file exists. Keeping it
    # OUTSIDE the "Organize" button branch fixes a Streamlit pitfall:
    # clicking st.download_button triggers a rerun in which the button
    # branch is False, so a download button nested inside it would vanish
    # the moment the user clicked it.
    if os.path.exists("organize.json"):
        with open("organize.json", "r") as organized_json_file:
            st.download_button(
                label="Download Organized JSON",
                data=organized_json_file,
                file_name="organize.json",
                mime="application/json"
            )