abdulllah01 committed
Commit 36702ae · verified · 1 Parent(s): f2bcb62

Update app.py

Files changed (1)
  1. app.py +68 -42
app.py CHANGED
@@ -1,12 +1,22 @@
-import streamlit as st
 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+import streamlit as st
 from io import BytesIO
 
 def extract_article_info(url):
+    """
+    Extracts meta title, meta description, heading, subheadings, and all text in <p> tags from a blog post URL.
+    Args:
+        url (str): The URL of the blog post.
+    Returns:
+        str: A string containing the extracted information.
+    """
     try:
-        response = requests.get(url)
+        # Fetch the HTML content of the URL
+        response = requests.get(url, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
 
@@ -19,15 +29,15 @@ def extract_article_info(url):
         if meta_tag and meta_tag.get('content'):
             meta_description = meta_tag['content']
 
-        # Extract heading
+        # Extract heading (Assuming <h1> is used for the main heading)
         heading = soup.find('h1').get_text(strip=True) if soup.find('h1') else None
 
-        # Extract subheadings
+        # Extract subheadings (Assuming <h2> tags are used for subheadings)
         subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]
 
         # Extract all text from <p> tags and add two breaks between paragraphs
         all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
-        article_text = "\n\n".join(all_paragraphs)
+        article_text = "\n\n".join(all_paragraphs)  # Add two breaks between paragraphs
 
         # Combine heading and subheadings with article text
         full_article_text = f"{heading}\n\n" if heading else ""
@@ -42,43 +52,59 @@ def extract_article_info(url):
     except Exception as e:
         return f"Error processing the content: {e}"
 
-def process_excel(file):
-    # Read the uploaded Excel file
-    df = pd.read_excel(file)
+def process_file(uploaded_file):
+    # Load the Excel file
+    df = pd.read_excel(uploaded_file)
+
+    # Check if 'URL' column exists
+    if 'URL' not in df.columns:
+        return None, "The 'URL' column is missing from the Excel file."
+
+    # List to hold results
+    results = []
+
+    # Use ThreadPoolExecutor for parallel processing
+    with ThreadPoolExecutor() as executor:
+        # Submit tasks to the executor
+        future_to_url = {executor.submit(extract_article_info, url): url for url in df['URL']}
+
+        for future in as_completed(future_to_url):
+            url = future_to_url[future]
+            try:
+                # Append the result to the results list
+                results.append(future.result())
+            except Exception as e:
+                # Handle exceptions during execution
+                results.append(f"Error processing the URL {url}: {e}")
+
+    # Add the results to a new column in the DataFrame
+    df['Article Text'] = results
+
+    # Save the updated DataFrame to a BytesIO object
+    output = BytesIO()
+    df.to_excel(output, index=False, engine='openpyxl')
+    output.seek(0)
+
+    return output, None
+
+# Streamlit App
+st.title("Web Article Extractor")
+st.markdown("Upload an Excel file with a column named 'URL' containing the links to process.")
+
+# File upload
+uploaded_file = st.file_uploader("Upload Excel file", type=["xlsx"])
 
-    if 'URL' in df.columns:
-        # Apply extract_article_info to each URL and store the result in a new column
-        df['Article Text'] = df['URL'].apply(extract_article_info)
+if uploaded_file is not None:
+    with st.spinner("Processing your file..."):
+        output, error = process_file(uploaded_file)
 
-        # Save the updated DataFrame to a BytesIO object to prepare it for download
-        output = BytesIO()
-        df.to_excel(output, index=False)
-        output.seek(0)
-        return output
+    if error:
+        st.error(error)
     else:
-        return None
-
-def main():
-    st.title("Excel URL Processor")
-    st.markdown("Upload an Excel file with a column named 'URL' to extract article information.")
-
-    # Upload Excel file
-    uploaded_file = st.file_uploader("Choose an Excel file", type=["xlsx"])
-
-    if uploaded_file:
-        # Process the file
-        processed_file = process_excel(uploaded_file)
-
-        if processed_file:
-            st.success("File processed successfully!")
-            st.download_button(
-                label="Download Modified Excel File",
-                data=processed_file,
-                file_name="updated_file.xlsx",
-                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-            )
-        else:
-            st.error("The uploaded file does not contain a column named 'URL'.")
-
-if __name__ == "__main__":
-    main()
+        st.success("File processed successfully!")
+        st.download_button(
+            label="Download Processed File",
+            data=output,
+            file_name="processed_file.xlsx",
+            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+        )
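
A caveat on the new process_file: concurrent.futures.as_completed yields futures in completion order, not submission order, so appending each future.result() as it arrives and then assigning df['Article Text'] = results can attach article text to the wrong rows. A minimal order-preserving sketch, assuming the extract_article_info function from this commit (the fetch_all_in_order name and max_workers=8 are illustrative, not part of the commit):

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_all_in_order(urls):
    # Preallocate so each result lands at its submission index,
    # regardless of which future finishes first.
    results = [None] * len(urls)
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_idx = {
            executor.submit(extract_article_info, url): idx
            for idx, url in enumerate(urls)
        }
        for future in as_completed(future_to_idx):
            idx = future_to_idx[future]
            try:
                results[idx] = future.result()
            except Exception as e:
                results[idx] = f"Error processing the URL {urls[idx]}: {e}"
    return results

# Inside process_file, the column assignment would then be:
# df['Article Text'] = fetch_all_in_order(list(df['URL']))

Relatedly, tqdm is imported but never used in this revision; if progress reporting is wanted, wrapping the loop iterator as tqdm(as_completed(future_to_idx), total=len(future_to_idx)) is the usual pattern, or st.progress could be updated inside the loop.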
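
The download path never touches disk: df.to_excel writes the workbook into an in-memory BytesIO buffer, which st.download_button then serves to the browser. A minimal sketch of that round-trip in isolation (the DataFrame contents here are placeholders):

import pandas as pd
from io import BytesIO

df = pd.DataFrame({"URL": ["https://example.com"], "Article Text": ["..."]})

output = BytesIO()
df.to_excel(output, index=False, engine="openpyxl")  # engine="openpyxl" requires the openpyxl package
output.seek(0)  # rewind so a sequential reader sees the file from the first byte

# In the app this buffer is passed directly to st.download_button as data=output.

The output.seek(0) matters: after to_excel the buffer's position sits at the end of the written bytes, so anything that reads it sequentially without rewinding would get an empty stream.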
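
The new timeout=10 on requests.get keeps a stalled server from hanging a worker thread indefinitely, and response.raise_for_status() turns HTTP error codes into exceptions that the surrounding except clause reports per URL. For flaky hosts, a retrying Session is a common further hardening; this sketch is an optional extension, not part of the commit:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry transient failures (rate limits and 5xx responses) with backoff.
session = requests.Session()
retries = Retry(total=3, backoff_factor=0.5,
                status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))
session.mount("http://", HTTPAdapter(max_retries=retries))

# Drop-in replacement for the fetch inside extract_article_info:
# response = session.get(url, timeout=10)

A shared Session also reuses pooled connections across the threads, which helps when many of the URLs point at the same host.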