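"""Keyword Analysis Tool (Streamlit app).

Fetches a page's meta title/description, H1, and paragraph text, lets the
user edit them, scores 1-3 word n-grams with TF-IDF, enriches the top
candidates with Keyword Tool API metrics, and offers the results as a
CSV or Excel download. Run with: streamlit run <this file>.
"""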
import streamlit as st
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import pandas as pd
import nltk
import string
import io
import os

# Keyword Tool API key; set the API_KEY environment variable before running.
api_key = os.getenv("API_KEY")

# ========== 1. NLTK DOWNLOADS ==========
def ensure_nltk_data():
    resources = [
        ("tokenizers/punkt", "punkt"),
        ("corpora/stopwords", "stopwords"),
        ("tokenizers/punkt_tab", "punkt_tab")  
    ]
    for resource_path, download_name in resources:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(download_name)

ensure_nltk_data()

# ========== 2. EXTRACT FUNCTION WITH USER AGENT ==========
def extract_blog_content(url):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; rv:105.0) "
            "Gecko/20100101 Firefox/105.0"
        )
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Prefer <meta name="title">; fall back to the <title> tag, whose text
    # lives in its body rather than a 'content' attribute.
    meta_title_tag = soup.find('meta', attrs={'name': 'title'})
    if meta_title_tag and 'content' in meta_title_tag.attrs:
        meta_title = meta_title_tag['content']
    else:
        title_tag = soup.find('title')
        meta_title = title_tag.get_text(strip=True) if title_tag else ''

    meta_description_tag = soup.find('meta', attrs={'name': 'description'})
    meta_description = (
        meta_description_tag['content']
        if meta_description_tag and 'content' in meta_description_tag.attrs
        else ''
    )

    article_title_element = soup.find('h1')
    article_title = article_title_element.get_text(strip=True) if article_title_element else ''

    blog_text = " ".join(p.get_text() for p in soup.find_all('p'))
    return meta_title, meta_description, article_title, blog_text

# ========== 3. PREPROCESSING + TF-IDF LOGIC ==========
def preprocess_text(text):
    # Lowercase, tokenize, and drop English stopwords and punctuation tokens.
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word not in stop_words and word not in string.punctuation]

def generate_ngrams(tokens, max_n=3):
    all_ngrams = []
    for n in range(1, max_n + 1):
        ngrams_list = [" ".join(gram) for gram in ngrams(tokens, n)]
        all_ngrams.extend(ngrams_list)
    return all_ngrams
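
# Illustrative example: generate_ngrams(["best", "seo", "tools"], max_n=2)
# -> ["best", "seo", "tools", "best seo", "seo tools"]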

# ========== 4. KEYWORD TOOL API + SELECTION LOGIC ==========
def get_keyword_metrics(keywords):
    if not keywords:
        st.error("No keywords to process.")
        return {}
    if not api_key:
        st.error("API_KEY environment variable is not set.")
        return {}
    url = "https://api.keywordtool.io/v2/search/volume/google"
    payload = {
        "metrics_network": "googlesearchnetwork",
        "metrics_currency": "USD",
        "complete": False,
        "output": "json",
        "apikey": api_key,
        "keyword": keywords
    }
    headers = {"content-type": "application/json"}
    response = requests.post(url, json=payload, headers=headers, timeout=30)
    if response.status_code == 200:
        return response.json()
    else:
        st.error("API Error: " + response.text)
        return {}
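
# Expected Keyword Tool response shape, as inferred from the parsing below
# (not an official schema):
#   {"results": {"<keyword>": {"volume": ..., "trend": ..., "cpc": ..., "cmp": ...}}}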
def select_top_keywords(metrics_response, percentage, scored_keywords):
    keywords_data = metrics_response.get('results', {})
    keyword_scores = []

    for keyword, data in keywords_data.items():
        search_volume = data.get('volume', 0) or 0
        trend = data.get('trend', 0) or 0
        cpc = data.get('cpc', 0) or 0
        competition = data.get('cmp', 0) or 0

        # Look up the keyword's TF-IDF score and scale it to 0-100 for display.
        tfidf_score = next((score for kw, score in scored_keywords if kw == keyword), 0)
        percentage_score = tfidf_score * 100
        keyword_scores.append((keyword, percentage_score, search_volume, trend, cpc, competition))

    sorted_keywords = sorted(keyword_scores, key=lambda x: x[1], reverse=True)
    top_count = max(1, int(len(sorted_keywords) * (percentage / 100)))
    return sorted_keywords[:top_count]
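
# Example: with 40 scored keywords and percentage=25,
# top_count = max(1, int(40 * 0.25)) = 10, so the ten highest-scoring
# keywords are returned.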

# ========== 5. UI & LOGIC FLOW ==========

st.title("Keyword Analysis Tool")

# A. URL input
url = st.text_input("Enter the URL:", key="url_input")

if "meta_title" not in st.session_state:
    st.session_state.meta_title = ""
if "meta_description" not in st.session_state:
    st.session_state.meta_description = ""
if "article_title" not in st.session_state:
    st.session_state.article_title = ""
if "article_text" not in st.session_state:
    st.session_state.article_text = ""

# B- Step 1: Fetch Data
if st.button("Fetch Data"):
    if url.strip():
        meta_title, meta_description, article_title, blog_text = extract_blog_content(url)
        st.session_state.meta_title = meta_title
        st.session_state.meta_description = meta_description
        st.session_state.article_title = article_title
        st.session_state.article_text = blog_text
    else:
        st.error("Please enter a valid URL.")

# C-Show the fetched data so user can modify
st.subheader("Modify Fetched Content")
st.session_state.meta_title = st.text_input("Meta Title", st.session_state.meta_title)
st.session_state.meta_description = st.text_area("Meta Description", st.session_state.meta_description)
st.session_state.article_title = st.text_input("Article Title", st.session_state.article_title)
st.session_state.article_text = st.text_area("Article Text", st.session_state.article_text)

# D- Checkboxes to select which parts to analyze
include_meta_title = st.checkbox("Include Meta Title")
include_meta_description = st.checkbox("Include Meta Description")
include_article_title = st.checkbox("Include Article Title")
include_article_text = st.checkbox("Include Article Text")

# E- Top % of Keywords
top_percentage = st.number_input("Top % of Keywords to Display", min_value=1, max_value=100, value=100, step=1)

# F- Analyze Button -> runs the original logic
if st.button("Analyze"):
    if not url.strip():
        st.error("Please enter a valid URL.")
    else:
        selected_text = ""
        if include_meta_title:
            selected_text += st.session_state.meta_title + " "
        if include_meta_description:
            selected_text += st.session_state.meta_description + " "
        if include_article_title:
            selected_text += st.session_state.article_title + " "
        if include_article_text:
            selected_text += st.session_state.article_text

        if not selected_text.strip():
            st.error("No text selected for analysis. Please check at least one option.")
        else:
            # ========== CORE ANALYSIS LOGIC ==========
            tokens = preprocess_text(selected_text)
            ngrams_list = generate_ngrams(tokens, max_n=3)
            unique_ngrams = list(set(ngrams_list))

            if not unique_ngrams:
                st.error("Vocabulary is empty. Please ensure valid input data.")
            else:
                # ngram_range must cover the multi-word vocabulary entries:
                # with the default (1, 1) every bigram and trigram in the
                # vocabulary would score 0.
                tfidf_vectorizer = TfidfVectorizer(vocabulary=unique_ngrams, ngram_range=(1, 3))
                tfidf_scores = tfidf_vectorizer.fit_transform([" ".join(tokens)]).toarray()[0]

                scored_keywords = sorted(
                    zip(unique_ngrams, tfidf_scores),
                    key=lambda x: x[1],
                    reverse=True
                )[:100]

                keywords = [kw for kw, _ in scored_keywords]

                metrics_response = get_keyword_metrics(keywords)
                if metrics_response:
                    # Select top keywords based on user percentage
                    top_keywords_data = select_top_keywords(metrics_response, top_percentage, scored_keywords)

                    data = {
                        "Keyword": [k[0] for k in top_keywords_data],
                        "Score (%)": [f"{k[1]:.2f}" for k in top_keywords_data],
                        "Search Volume": [k[2] for k in top_keywords_data],
                        "Trend": [k[3] for k in top_keywords_data],
                        "CPC": [k[4] for k in top_keywords_data],
                        "Competition": [k[5] for k in top_keywords_data],
                    }
                    df = pd.DataFrame(data)

                    st.dataframe(df)

                    # A format selectbox here would reset the view: changing
                    # it reruns the script and st.button("Analyze") returns
                    # False, hiding the results. Offer both formats directly.
                    csv_data = df.to_csv(index=False).encode('utf-8')
                    st.download_button(
                        label="Download CSV",
                        data=csv_data,
                        file_name="keywords.csv",
                        mime="text/csv",
                        key="download-csv",
                    )

                    excel_buffer = io.BytesIO()
                    with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
                        df.to_excel(writer, index=False, sheet_name="Sheet1")
                    excel_data = excel_buffer.getvalue()

                    st.download_button(
                        label="Download Excel",
                        data=excel_data,
                        file_name="keywords.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        key="download-excel",
                    )