# app.py — Web Article Extractor (Streamlit app)
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import streamlit as st
from io import BytesIO
def extract_article_info(url):
    """
    Fetch a blog-post URL and return its heading, subheadings, and paragraph
    text as a single string.

    Args:
        url (str): The URL of the blog post.

    Returns:
        str: "<h1>\n\n<h2...>\n\n<paragraphs>" on success, or an error
        message string ("Error fetching ..." / "Error processing ...") on
        failure — this function never raises.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Hoist each lookup so find() runs once per tag, not twice.
        title_tag = soup.find('title')
        meta_title = title_tag.get_text(strip=True) if title_tag else None

        meta_description = None
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag and meta_tag.get('content'):
            meta_description = meta_tag['content']
        # NOTE(review): meta_title / meta_description are computed but never
        # included in the returned text, despite the docstring's original
        # claim — confirm whether they should be prepended to the output.

        h1_tag = soup.find('h1')
        heading = h1_tag.get_text(strip=True) if h1_tag else None

        # <h2> tags are assumed to be the article's subheadings.
        subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

        # Two newlines between paragraphs, matching the join separator below.
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
        article_text = "\n\n".join(all_paragraphs)

        # Assemble heading -> subheadings -> body with blank-line separators
        # via a single join instead of quadratic string concatenation.
        parts = []
        if heading:
            parts.append(heading)
        parts.extend(subheadings)
        parts.append(article_text)
        return "\n\n".join(parts)
    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        return f"Error processing the content: {e}"
def process_file(uploaded_file):
    """
    Read an Excel workbook with a 'URL' column, scrape every URL in
    parallel, and return the workbook with a new 'Article Text' column.

    Args:
        uploaded_file: File-like object containing an .xlsx workbook.

    Returns:
        tuple: (BytesIO with the updated workbook, None) on success, or
        (None, error message str) if the 'URL' column is missing.
    """
    df = pd.read_excel(uploaded_file)

    if 'URL' not in df.columns:
        return None, "The 'URL' column is missing from the Excel file."

    urls = df['URL'].tolist()

    # BUG FIX: the previous version appended results in *completion* order
    # (as_completed yields whichever future finishes first), so article text
    # could be attached to the wrong row. Pre-size the list and write each
    # result at its source row's index instead.
    results = [None] * len(urls)

    with ThreadPoolExecutor() as executor:
        future_to_index = {
            executor.submit(extract_article_info, url): i
            for i, url in enumerate(urls)
        }
        for future in as_completed(future_to_index):
            i = future_to_index[future]
            try:
                results[i] = future.result()
            except Exception as e:
                # extract_article_info catches its own errors; this guards
                # against unexpected executor-level failures.
                results[i] = f"Error processing the URL {urls[i]}: {e}"

    df['Article Text'] = results

    # Serialize the updated DataFrame to an in-memory xlsx for download.
    output = BytesIO()
    df.to_excel(output, index=False, engine='openpyxl')
    output.seek(0)
    return output, None
# --- Streamlit UI: upload -> process -> download ---
st.title("Web Article Extractor")
st.markdown("Upload an Excel file with a column named 'URL' containing the links to process.")

# Only .xlsx workbooks are accepted; process_file expects that format.
uploaded_file = st.file_uploader("Upload Excel file", type=["xlsx"])

if uploaded_file is not None:
    with st.spinner("Processing your file..."):
        output, error = process_file(uploaded_file)
        if error is not None:
            # Missing 'URL' column (or similar) — surface the message.
            st.error(error)
        else:
            st.success("File processed successfully!")
            st.download_button(
                label="Download Processed File",
                data=output,
                file_name="processed_file.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )