import streamlit as st
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import pandas as pd
import nltk
import string
import io
import os

api_key = os.getenv("API_KEY")

# ========== 1 - NLTK DOWNLOADS ==========
def ensure_nltk_data():
    resources = [
        ("tokenizers/punkt", "punkt"),
        ("corpora/stopwords", "stopwords"),
        ("tokenizers/punkt_tab", "punkt_tab"),
    ]
    for resource_path, download_name in resources:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(download_name)

ensure_nltk_data()

# ========== 2 - EXTRACT FUNCTION WITH USER AGENT ==========
def extract_blog_content(url):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; rv:105.0) "
            "Gecko/20100101 Firefox/105.0"
        )
    }
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # <meta name="title"> keeps its text in the content attribute; the <title>
    # fallback keeps it as tag text, so the two cases must be read differently.
    meta_title_tag = soup.find("meta", attrs={"name": "title"})
    if meta_title_tag and meta_title_tag.get("content"):
        meta_title = meta_title_tag["content"]
    else:
        title_tag = soup.find("title")
        meta_title = title_tag.get_text(strip=True) if title_tag else ""

    meta_description_tag = soup.find("meta", attrs={"name": "description"})
    meta_description = meta_description_tag.get("content", "") if meta_description_tag else ""

    article_title_element = soup.find("h1")
    article_title = article_title_element.get_text(strip=True) if article_title_element else ""

    blog_text = " ".join(p.get_text() for p in soup.find_all("p"))
    return meta_title, meta_description, article_title, blog_text

# ========== 3 - PREPROCESSING + TF-IDF LOGIC ==========
def preprocess_text(text):
    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word not in stop_words and word not in string.punctuation]

def generate_ngrams(tokens, max_n=3):
    all_ngrams = []
    for n in range(1, max_n + 1):
        all_ngrams.extend(" ".join(gram) for gram in ngrams(tokens, n))
    return all_ngrams

# ========== 4 - KEYWORD TOOL API + SELECTION LOGIC ==========
def get_keyword_metrics(keywords):
    if not keywords:
        st.error("No keywords to process.")
        return {}
    if not api_key:
        st.error("API_KEY environment variable is not set.")
        return {}
    url = "https://api.keywordtool.io/v2/search/volume/google"
    payload = {
        "metrics_network": "googlesearchnetwork",
        "metrics_currency": "USD",
        "complete": False,
        "output": "json",
        "apikey": api_key,
        "keyword": keywords,
    }
    headers = {"content-type": "application/json"}
    response = requests.post(url, json=payload, headers=headers, timeout=30)
    if response.status_code == 200:
        return response.json()
    st.error("API Error: " + response.text)
    return {}

def select_top_keywords(metrics_response, percentage, scored_keywords):
    keywords_data = metrics_response.get("results", {})
    keyword_scores = []
    for keyword, data in keywords_data.items():
        search_volume = data.get("volume", 0) or 0
        trend = data.get("trend", 0) or 0
        cpc = data.get("cpc", 0) or 0
        competition = data.get("cmp", 0) or 0
        tfidf_score = next((score for kw, score in scored_keywords if kw == keyword), 0)
        percentage_score = tfidf_score * 100  # express the TF-IDF score as a percentage
        keyword_scores.append((keyword, percentage_score, search_volume, trend, cpc, competition))
    sorted_keywords = sorted(keyword_scores, key=lambda x: x[1], reverse=True)
    top_count = max(1, int(len(sorted_keywords) * (percentage / 100)))
    return sorted_keywords[:top_count]
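# For reference, select_top_keywords assumes a response payload shaped roughly
# like the sketch below. This is inferred purely from the keys this script
# reads ("results", "volume", "trend", "cpc", "cmp"), not from official
# Keyword Tool documentation, and the values are made-up examples:
#
#   {
#       "results": {
#           "keyword research": {"volume": 12100, "trend": 3,  "cpc": 1.40, "cmp": 0.25},
#           "seo tool":         {"volume": 880,   "trend": -2, "cpc": 0.90, "cmp": 0.10}
#       }
#   }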
# ========== 5 - UI & LOGIC FLOW ==========
st.title("Keyword Analysis Tool")

# A. URL input
url = st.text_input("Enter the URL:", key="url_input")

for field in ("meta_title", "meta_description", "article_title", "article_text"):
    if field not in st.session_state:
        st.session_state[field] = ""

# B. Step 1: fetch data
if st.button("Fetch Data"):
    if url.strip():
        meta_title, meta_description, article_title, blog_text = extract_blog_content(url)
        st.session_state.meta_title = meta_title
        st.session_state.meta_description = meta_description
        st.session_state.article_title = article_title
        st.session_state.article_text = blog_text
    else:
        st.error("Please enter a valid URL.")

# C. Show the fetched data so the user can modify it
st.subheader("Modify Fetched Content")
st.session_state.meta_title = st.text_input("Meta Title", st.session_state.meta_title)
st.session_state.meta_description = st.text_area("Meta Description", st.session_state.meta_description)
st.session_state.article_title = st.text_input("Article Title", st.session_state.article_title)
st.session_state.article_text = st.text_area("Article Text", st.session_state.article_text)

# D. Checkboxes to select which parts to analyze
include_meta_title = st.checkbox("Include Meta Title")
include_meta_description = st.checkbox("Include Meta Description")
include_article_title = st.checkbox("Include Article Title")
include_article_text = st.checkbox("Include Article Text")

# E. Top % of keywords to keep
top_percentage = st.number_input("Top % of Keywords to Display", min_value=1, max_value=100, value=100, step=1)

# F. Analyze button -> runs the analysis pipeline
if st.button("Analyze"):
    if not url.strip():
        st.error("Please enter a valid URL.")
    else:
        selected_text = ""
        if include_meta_title:
            selected_text += st.session_state.meta_title + " "
        if include_meta_description:
            selected_text += st.session_state.meta_description + " "
        if include_article_title:
            selected_text += st.session_state.article_title + " "
        if include_article_text:
            selected_text += st.session_state.article_text

        if not selected_text.strip():
            st.error("No text selected for analysis. Please check at least one option.")
        else:
            # ========== ANALYSIS LOGIC ==========
            tokens = preprocess_text(selected_text)
            ngrams_list = generate_ngrams(tokens, max_n=3)
            unique_ngrams = list(set(ngrams_list))
            if not unique_ngrams:
                st.error("Vocabulary is empty. Please ensure valid input data.")
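            # Scoring note: with a single document, sklearn's smoothed IDF is the
            # same constant (1.0) for every n-gram that actually occurs, so the
            # TF-IDF scores below reduce to L2-normalized term frequencies --
            # phrases are ranked by how often they appear in the selected text.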
            else:
                # ngram_range must span the multi-word vocabulary entries; with
                # the default (1, 1) every bigram and trigram would score 0.
                tfidf_vectorizer = TfidfVectorizer(vocabulary=unique_ngrams, ngram_range=(1, 3))
                tfidf_scores = tfidf_vectorizer.fit_transform([" ".join(tokens)]).toarray()[0]
                scored_keywords = sorted(
                    zip(unique_ngrams, tfidf_scores),
                    key=lambda x: x[1],
                    reverse=True,
                )[:100]
                keywords = [kw for kw, _ in scored_keywords]
                metrics_response = get_keyword_metrics(keywords)
                if metrics_response:
                    # Select the top keywords based on the user's percentage
                    top_keywords_data = select_top_keywords(metrics_response, top_percentage, scored_keywords)
                    df = pd.DataFrame({
                        "Keyword": [k[0] for k in top_keywords_data],
                        "Score (%)": [f"{k[1]:.2f}" for k in top_keywords_data],
                        "Search Volume": [k[2] for k in top_keywords_data],
                        "Trend": [k[3] for k in top_keywords_data],
                        "CPC": [k[4] for k in top_keywords_data],
                        "Competition": [k[5] for k in top_keywords_data],
                    })
                    # Persist the result: Streamlit reruns the whole script on
                    # every widget interaction, so anything rendered only inside
                    # this button branch would vanish as soon as the user touched
                    # the download controls.
                    st.session_state.results_df = df

# G. Results + download (outside the Analyze branch so they survive reruns)
if "results_df" in st.session_state:
    df = st.session_state.results_df
    st.dataframe(df)
    output_format = st.selectbox("Download format", ["CSV", "Excel"])
    if output_format == "CSV":
        csv_data = df.to_csv(index=False).encode("utf-8")
        st.download_button(
            label="Download CSV",
            data=csv_data,
            file_name="keywords.csv",
            mime="text/csv",
            key="download-csv",
        )
    else:  # Excel
        excel_buffer = io.BytesIO()
        with pd.ExcelWriter(excel_buffer, engine="xlsxwriter") as writer:
            df.to_excel(writer, index=False, sheet_name="Sheet1")
        excel_data = excel_buffer.getvalue()
        st.download_button(
            label="Download Excel",
            data=excel_data,
            file_name="keywords.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            key="download-excel",
        )
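# --- Illustration only (never called by the app) ----------------------------
# A minimal sketch of why ngram_range=(1, 3) is needed above: TfidfVectorizer's
# default analyzer emits only unigrams, so the bigram/trigram vocabulary
# entries produced by generate_ngrams would otherwise always score 0. The
# tokens below are hypothetical example data.
def _tfidf_ngram_demo():
    tokens = ["keyword", "research", "tool"]
    vocab = generate_ngrams(tokens, max_n=2)
    # vocab == ["keyword", "research", "tool", "keyword research", "research tool"]
    vectorizer = TfidfVectorizer(vocabulary=vocab, ngram_range=(1, 2))
    scores = vectorizer.fit_transform([" ".join(tokens)]).toarray()[0]
    # Each of the five n-grams occurs exactly once, so every score comes out
    # to 1/sqrt(5) ≈ 0.447 after L2 normalization.
    return dict(zip(vocab, scores))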