import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import streamlit as st
from io import BytesIO
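
# Dependency note (not in the original file): besides the packages imported above,
# the Excel I/O further down relies on the 'openpyxl' package being installed,
# since it is passed explicitly as the engine for DataFrame.to_excel and is also
# what pandas uses to read .xlsx uploads.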

def extract_article_info(url):
    """
    Extracts the meta title, meta description, main heading, subheadings, and all
    text in <p> tags from a blog post URL.

    Args:
        url (str): The URL of the blog post.

    Returns:
        str: The heading, subheadings, and paragraph text joined with blank lines,
            or an error message if the request or parsing fails.
    """
    try:
        # Fetch the HTML content of the URL
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the meta title
        meta_title = soup.find('title').get_text(strip=True) if soup.find('title') else None

        # Extract the meta description
        meta_description = None
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag and meta_tag.get('content'):
            meta_description = meta_tag['content']

        # Extract the main heading (assuming <h1> is used for the main heading)
        heading = soup.find('h1').get_text(strip=True) if soup.find('h1') else None

        # Extract subheadings (assuming <h2> tags are used for subheadings)
        subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

        # Extract all text from <p> tags and separate paragraphs with a blank line
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
        article_text = "\n\n".join(all_paragraphs)

        # Combine the heading and subheadings with the article text
        full_article_text = f"{heading}\n\n" if heading else ""
        for subheading in subheadings:
            full_article_text += f"{subheading}\n\n"
        full_article_text += article_text

        return full_article_text
    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        return f"Error processing the content: {e}"

def process_file(uploaded_file):
    # Load the Excel file
    df = pd.read_excel(uploaded_file)

    # Check that the 'URL' column exists
    if 'URL' not in df.columns:
        return None, "The 'URL' column is missing from the Excel file."

    # Pre-size the results list so each extracted text stays aligned with its row,
    # even though futures complete out of order
    results = [None] * len(df)

    # Use ThreadPoolExecutor to fetch the URLs in parallel
    with ThreadPoolExecutor() as executor:
        # Map each future to the row index of the URL it is processing
        future_to_index = {
            executor.submit(extract_article_info, url): i
            for i, url in enumerate(df['URL'])
        }
        for future in as_completed(future_to_index):
            i = future_to_index[future]
            try:
                results[i] = future.result()
            except Exception as e:
                # Handle exceptions raised while processing this URL
                results[i] = f"Error processing the URL {df['URL'].iloc[i]}: {e}"

    # Add the results as a new column in the DataFrame
    df['Article Text'] = results

    # Save the updated DataFrame to a BytesIO object
    output = BytesIO()
    df.to_excel(output, index=False, engine='openpyxl')
    output.seek(0)
    return output, None

# Streamlit App
st.title("Web Article Extractor")
st.markdown("Upload an Excel file with a column named 'URL' containing the links to process.")

# File upload
uploaded_file = st.file_uploader("Upload Excel file", type=["xlsx"])

if uploaded_file is not None:
    with st.spinner("Processing your file..."):
        output, error = process_file(uploaded_file)

    if error:
        st.error(error)
    else:
        st.success("File processed successfully!")
        st.download_button(
            label="Download Processed File",
            data=output,
            file_name="processed_file.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
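
# Usage sketch (assumes this script is saved as app.py in the project root):
#   streamlit run app.py
# The uploaded .xlsx file is expected to contain a 'URL' column, e.g.:
#   URL
#   https://example.com/blog/post-1
#   https://example.com/blog/post-2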