import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import jieba
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import streamlit as st
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from wordcloud import WordCloud
from gensim import corpora, models
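
# Dependencies (assumed pip package names): requests, beautifulsoup4, pandas,
# jieba, keybert, scikit-learn, streamlit, matplotlib, wordcloud, gensim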

# Download a font file (needed so matplotlib and WordCloud can render Chinese glyphs)
def download_font(url, save_path):
    if os.path.exists(save_path):  # Streamlit reruns the script on every interaction; skip re-downloading
        return
    response = requests.get(url)
    response.raise_for_status()  # fail loudly if the download did not succeed
    with open(save_path, 'wb') as f:
        f.write(response.content)

# Font URL and local save path
font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'
font_path = 'TaipeiSansTCBeta-Regular.ttf'

# Download the font
download_font(font_url, font_path)

# Register the font for use in matplotlib labels and titles
font_prop = FontProperties(fname=font_path)

# Fetch the title and body text of a Yahoo News article
def fetch_yahoo_news(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    title_tag = soup.find('h1')
    article_tag = soup.find('article')
    if title_tag is None or article_tag is None:
        raise ValueError('Could not find the title or article body on this page')
    return title_tag.text, article_tag.text

# Tokenizer: segment Chinese text into words with jieba
def jieba_tokenizer(text):
    return jieba.lcut(text)
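
# Illustrative example (actual segmentation depends on jieba's dictionary):
#   jieba_tokenizer('今天天氣很好') -> ['今天', '天氣', '很', '好']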

# Initialize CountVectorizer with the jieba tokenizer and create the KeyBERT model
# (token_pattern=None silences scikit-learn's warning that the default pattern is
# ignored when a custom tokenizer is supplied)
vectorizer = CountVectorizer(tokenizer=jieba_tokenizer, token_pattern=None)
kw_model = KeyBERT()
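# Note: KeyBERT() loads its default sentence-transformer, which is English-oriented
# (all-MiniLM-L6-v2 in recent KeyBERT releases); that is why a multilingual model
# is also tried further down.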

# Extract keywords with MMR (Maximal Marginal Relevance) to balance relevance and diversity
def extract_keywords(doc, diversity=0.7):
    keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer, use_mmr=True, diversity=diversity)
    return keywords
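
# extract_keywords returns a list of (keyword, relevance) pairs,
# e.g. [('疫情', 0.62), ('口罩', 0.55)] (values illustrative only).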

# Plot keyword scores as a horizontal bar chart
def plot_keywords(keywords, title):
    words = [kw[0] for kw in keywords]
    scores = [kw[1] for kw in keywords]

    fig = plt.figure(figsize=(10, 6))
    bars = plt.barh(words, scores, color='skyblue', edgecolor='black', linewidth=1.2)
    plt.xlabel('Score', fontproperties=font_prop, fontsize=14)
    plt.title(title, fontproperties=font_prop, fontsize=16)
    plt.gca().invert_yaxis()  # put the highest-scoring keyword at the top
    plt.xticks(fontproperties=font_prop, fontsize=12)
    plt.yticks(fontproperties=font_prop, fontsize=12)
    plt.grid(axis='x', linestyle='--', alpha=0.7)

    # Annotate each bar with its score
    for bar in bars:
        plt.gca().text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
                       f'{bar.get_width():.4f}', va='center', ha='left', fontsize=12, fontproperties=font_prop)

    st.pyplot(fig)  # pass the figure explicitly; st.pyplot(plt) is deprecated

# Generate a TF-IDF-weighted word cloud
def plot_wordcloud(text):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, token_pattern=None)
    tfidf_matrix = tfidf_vectorizer.fit_transform([text])
    tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix.toarray().flatten()))

    wordcloud = WordCloud(font_path=font_path, background_color='white', max_words=100, width=800, height=400)
    wordcloud.generate_from_frequencies(tfidf_scores)

    fig = plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('TF-IDF Word Cloud', fontproperties=font_prop, fontsize=16)
    st.pyplot(fig)

# LDA topic modeling
def lda_topic_modeling(text, num_topics=5):
    # Tokenize, drop whitespace-only tokens, and build the dictionary and corpus
    tokens = [t for t in jieba_tokenizer(text) if t.strip()]
    dictionary = corpora.Dictionary([tokens])
    corpus = [dictionary.doc2bow(tokens)]

    # Train the LDA model
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

    # Extract the top words per topic
    topics = lda_model.print_topics(num_words=5)
    return topics
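
# Caveat: the corpus above contains a single document, so the "topics" mostly
# reflect word frequencies within one article; LDA only gives meaningful topics
# when trained on a collection of documents.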

# Build the Streamlit web app
st.title("🤙🤙🤙 Yahoo News Keyword Extraction Tool 👂👂")

# Controls live outside the button handler below: st.button is only True on the
# rerun triggered by the click, so widgets defined inside its block would vanish
# (and lose their value) as soon as any other widget is touched.
diversity = st.slider("MMR diversity parameter", 0.0, 1.0, 0.7)
num_topics = st.slider("Number of LDA topics", 1, 10, 5)

# Input box for the Yahoo News URL
url = st.text_input("Enter a Yahoo News URL:")

if st.button("Fetch article and extract keywords"):
    if url:
        try:
            title, content = fetch_yahoo_news(url)
        except Exception as e:
            st.error(f"Failed to fetch the article: {e}")
            st.stop()
        st.write("Title:", title)
        st.write("Content:", content)

        # Put the article into a DataFrame
        data = {'Title': [title], 'Content': [content]}
        df = pd.DataFrame(data)
        st.write("Article as a DataFrame:")
        st.write(df)

        # Extract keywords with the default model
        keywords = extract_keywords(content, diversity=diversity)
        st.write("Extracted keywords:")
        for keyword in keywords:
            st.write(f"{keyword[0]}: {keyword[1]:.4f}")

        plot_keywords(keywords, "Keyword extraction results")

        # Extract keywords again with a multilingual sentence-transformer model
        kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
        keywords_multilingual = kw_model_multilingual.extract_keywords(content, vectorizer=vectorizer, use_mmr=True, diversity=diversity)
        st.write("Multilingual model keywords:")
        for keyword in keywords_multilingual:
            st.write(f"{keyword[0]}: {keyword[1]:.4f}")

        plot_keywords(keywords_multilingual, "Multilingual model keyword results")

        # TF-IDF word cloud
        plot_wordcloud(content)

        # LDA topic model (num_topics comes from the slider defined above)
        lda_topics = lda_topic_modeling(content, num_topics=num_topics)
        st.write("LDA topics:")
        for topic in lda_topics:
            st.write(f"Topic {topic[0]}: {topic[1]}")
    else:
        st.write("Please enter a valid Yahoo News URL.")