import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import streamlit as st
from io import BytesIO


def extract_article_info(url):
    """
    Extract the meta title, meta description, main heading, subheadings,
    and all paragraph text from a blog post URL.

    Args:
        url (str): The URL of the blog post.

    Returns:
        str: A string containing the extracted information, or an error
        message if the page could not be fetched or parsed.
    """
    try:
        # Fetch the HTML content of the URL
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the meta title from the <title> tag
        title_tag = soup.find('title')
        meta_title = title_tag.get_text(strip=True) if title_tag else None

        # Extract the meta description
        meta_description = None
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag and meta_tag.get('content'):
            meta_description = meta_tag['content']

        # Extract the main heading (assuming <h1> is used for it)
        h1_tag = soup.find('h1')
        heading = h1_tag.get_text(strip=True) if h1_tag else None

        # Extract subheadings (assuming <h2> tags are used for them)
        subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

        # Extract all text from <p> tags, separating paragraphs with a blank line
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
        article_text = "\n\n".join(all_paragraphs)

        # Combine the meta fields, heading, and subheadings with the article
        # text; the meta fields were previously computed but never included
        # in the output, contradicting the docstring
        parts = [part for part in (meta_title, meta_description, heading) if part]
        parts.extend(subheadings)
        parts.append(article_text)
        return "\n\n".join(parts)
    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        return f"Error processing the content: {e}"


def process_file(uploaded_file):
    # Load the Excel file
    df = pd.read_excel(uploaded_file)

    # Check that the 'URL' column exists
    if 'URL' not in df.columns:
        return None, "The 'URL' column is missing from the Excel file."

    # Pre-size the results list so each result is stored at the row index of
    # its URL; appending in as_completed order would misalign results with rows
    results = [None] * len(df)

    # Use ThreadPoolExecutor to fetch the URLs in parallel
    with ThreadPoolExecutor() as executor:
        future_to_index = {
            executor.submit(extract_article_info, url): index
            for index, url in enumerate(df['URL'])
        }
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                results[index] = future.result()
            except Exception as e:
                results[index] = f"Error processing the URL {df['URL'].iloc[index]}: {e}"

    # Add the results as a new column in the DataFrame
    df['Article Text'] = results

    # Write the updated DataFrame to an in-memory Excel file
    output = BytesIO()
    df.to_excel(output, index=False, engine='openpyxl')
    output.seek(0)
    return output, None


# Streamlit app
st.title("Web Article Extractor")
st.markdown("Upload an Excel file with a column named 'URL' containing the links to process.")

# File upload
uploaded_file = st.file_uploader("Upload Excel file", type=["xlsx"])

if uploaded_file is not None:
    with st.spinner("Processing your file..."):
        output, error = process_file(uploaded_file)
    if error:
        st.error(error)
    else:
        st.success("File processed successfully!")
        st.download_button(
            label="Download Processed File",
            data=output,
            file_name="processed_file.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
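
# ---------------------------------------------------------------------------
# Usage sketch (not part of the app): a minimal way to sanity-check
# extract_article_info from a Python shell before exercising the full
# Streamlit flow. The file name "app.py" and the URL below are assumptions
# for illustration, not values from this script.
#
#     >>> from app import extract_article_info
#     >>> print(extract_article_info("https://example.com/some-post")[:500])
#
# To launch the app itself, use Streamlit's standard entry point:
#
#     streamlit run app.py
#
# The input spreadsheet needs a 'URL' column; the download button returns the
# same sheet with an added 'Article Text' column, one row per URL.
# ---------------------------------------------------------------------------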