"""Streamlit app that crawls a single web page and lists its header tags."""

import os
import tempfile
from urllib.parse import urlparse

import advertools as adv
import pandas as pd
import streamlit as st

# Sidebar instructions
st.sidebar.markdown("### Web Page Header Extractor")
st.sidebar.markdown("""
Enter your webpage URL into the tool to analyze header tags.
Shout out to Elias Dabbas for [Advertools](https://github.com/eliasdabbas/advertools) which i used in the backend
and as always, thanks to Koray Tuğberk Gübür for all the knowledge I have learned from Topical Authority SEO course.
[topicalauthority.digital](https://www.topicalauthority.digital/)""")
st.sidebar.markdown("## Tool uploaded and maintained by: [Blazing SEO](http://blazing-seo.com/)")


def _explode_headers(crawl_df):
    """Reshape crawl output into one row per individual header.

    Args:
        crawl_df: DataFrame loaded from the crawl's JSON-lines output.

    Returns:
        A DataFrame with columns ``Header`` (the source column name, e.g.
        ``h1``) and ``Content`` (one header text per row).
    """
    # Keep only columns named "h" followed by digits (h1, h2, ...).
    header_columns = [
        col for col in crawl_df.columns
        if col.startswith('h') and col[1:].isdigit()
    ]
    print("Header columns found:", header_columns)

    melted = (
        crawl_df[header_columns]
        .melt(var_name='Header', value_name='Content')
        .dropna()
    )
    # Several headers of the same level arrive joined by "@@" in one cell;
    # non-string cells become empty lists and vanish after explode + dropna.
    melted['Content'] = melted['Content'].apply(
        lambda x: x.split('@@') if isinstance(x, str) else []
    )
    return melted.explode('Content').dropna().reset_index(drop=True)


def extract_headers(url):
    """Crawl ``url`` (that page only) and return its header tags.

    Args:
        url: Absolute URL of the page, including the scheme
            (e.g. ``https://example.com/page``).

    Returns:
        On success, a DataFrame with ``Header`` and ``Content`` columns, one
        row per header found. On any failure, the error message as a ``str``
        (callers distinguish the two cases with ``isinstance``).
    """
    try:
        url = url.strip()
        # urlparse copes with ports, query strings and missing paths, which
        # the previous manual string splitting did not.
        domain = urlparse(url).hostname
        if not domain:
            raise ValueError(
                "URL must include a scheme and host, e.g. https://example.com"
            )

        # Crawl into a fresh temporary directory: the crawler appends to an
        # existing output file, so a fixed path would mix in results from
        # earlier runs and from other concurrent sessions.
        with tempfile.TemporaryDirectory() as tmp_dir:
            # The crawler expects a JSON-lines output path ending in ".jl".
            output_file = os.path.join(tmp_dir, "crawl_output.jl")

            # Perform the crawl with restricted settings
            adv.crawl(
                url,
                output_file=output_file,
                follow_links=False,  # Do not follow links
                allowed_domains=[domain],  # Restrict to the base domain
            )

            # Load the crawl data before the temporary directory is removed
            crawl_df = pd.read_json(output_file, lines=True)

        # Display the column names for debugging
        print("Columns in the crawl data:", crawl_df.columns)

        return _explode_headers(crawl_df)
    except Exception as e:
        # Top-level boundary for the UI: report the failure as text rather
        # than crashing the Streamlit script.
        print("Error occurred:", e)
        return str(e)


def main():
    """Render the page: URL input, trigger button and results/error output."""
    st.title("Web Page Header Extractor")
    url = st.text_input("Enter the URL of the web page:")

    if st.button("Extract Headers"):
        if not url:
            st.error("Please enter a valid URL.")
            return

        headers = extract_headers(url)
        if isinstance(headers, pd.DataFrame) and not headers.empty:
            st.write("Extracted Headers:")
            st.write(headers)
        elif isinstance(headers, str):
            # extract_headers returns the exception text on failure; show it
            # so the user can tell a bad URL from a page without headers.
            st.error(f"No headers found or an error occurred. Details: {headers}")
        else:
            st.error("No headers found or an error occurred.")


if __name__ == "__main__":
    main()