import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import jieba
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import streamlit as st
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from wordcloud import WordCloud
from gensim import corpora, models
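# Assumed environment: the third-party packages imported above must be
# installed first, e.g.
#   pip install requests beautifulsoup4 pandas jieba keybert scikit-learn \
#       streamlit matplotlib wordcloud gensim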
# Download a CJK-capable font so matplotlib and WordCloud can render Chinese text
def download_font(url, save_path):
    response = requests.get(url)
    with open(save_path, 'wb') as f:
        f.write(response.content)

# Font URL and local save path
font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'
font_path = 'TaipeiSansTCBeta-Regular.ttf'

# Download the font only if it is not already present: Streamlit reruns this
# script on every interaction, so an unconditional download would hit the
# network on each rerun
if not os.path.exists(font_path):
    download_font(font_url, font_path)

# Register the font for matplotlib
font_prop = FontProperties(fname=font_path)
# Fetch the title and body text of a Yahoo News article.
# Note: the 'h1'/'article' selectors assume Yahoo News's current page layout
# and will raise AttributeError if the markup changes or the URL is not an
# article page.
def fetch_yahoo_news(url):
    response = requests.get(url)
    web_content = response.content
    soup = BeautifulSoup(web_content, 'html.parser')
    title = soup.find('h1').text
    content = soup.find('article').text
    return title, content
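# Illustrative usage (the URL below is a hypothetical placeholder, not a
# real article):
#   title, content = fetch_yahoo_news('https://tw.news.yahoo.com/example-article.html')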
# Tokenizer: segment Chinese text with jieba
def jieba_tokenizer(text):
    return jieba.lcut(text)

# Initialize the CountVectorizer (with jieba tokenization) and the KeyBERT model
vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
kw_model = KeyBERT()
# Extract keywords with KeyBERT using Maximal Marginal Relevance (MMR);
# diversity values near 1.0 favor varied keywords, values near 0.0 favor
# relevance to the document
def extract_keywords(doc, diversity=0.7):
    keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer, use_mmr=True, diversity=diversity)
    return keywords
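# Illustrative usage (the sample sentence is a placeholder):
#   extract_keywords('台北市今日天氣晴朗,適合出門走走', diversity=0.5)
# returns a list of (keyword, score) tuples sorted by descending relevance.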
# Plot keyword scores as a horizontal bar chart
def plot_keywords(keywords, title):
    words = [kw[0] for kw in keywords]
    scores = [kw[1] for kw in keywords]
    plt.figure(figsize=(10, 6))
    bars = plt.barh(words, scores, color='skyblue', edgecolor='black', linewidth=1.2)
    plt.xlabel('分數', fontproperties=font_prop, fontsize=14)
    plt.title(title, fontproperties=font_prop, fontsize=16)
    plt.gca().invert_yaxis()  # invert the y-axis so the highest-scoring keyword is on top
    plt.xticks(fontproperties=font_prop, fontsize=12)
    plt.yticks(fontproperties=font_prop, fontsize=12)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    # Add a score label next to each bar
    for bar in bars:
        plt.gca().text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
                       f'{bar.get_width():.4f}', va='center', ha='left', fontsize=12, fontproperties=font_prop)
    st.pyplot(plt)
# Generate a TF-IDF-weighted word cloud
def plot_wordcloud(text):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
    tfidf_matrix = tfidf_vectorizer.fit_transform([text])
    tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix.toarray().flatten()))
    wordcloud = WordCloud(font_path=font_path, background_color='white', max_words=100, width=800, height=400)
    wordcloud.generate_from_frequencies(tfidf_scores)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('TF-IDF文字雲', fontproperties=font_prop, fontsize=16)
    st.pyplot(plt)
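# Note: with a single-document corpus, the IDF factor is identical for every
# term, so these "TF-IDF" weights are effectively normalized term
# frequencies; fitting the vectorizer on multiple articles would yield
# true TF-IDF scores.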
# LDA topic modeling
def lda_topic_modeling(text, num_topics=5):
    # Tokenize, then build the dictionary and corpus
    tokens = jieba_tokenizer(text)
    dictionary = corpora.Dictionary([tokens])
    corpus = [dictionary.doc2bow(tokens)]
    # Train the LDA model
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    # Extract the topics
    topics = lda_model.print_topics(num_words=5)
    return topics
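# Caveat: the corpus built above contains exactly one document, so the
# learned topics tend to be near-duplicates of each other. The variant below
# is a sketch (not part of the original app) that splits the article into
# per-line pseudo-documents so LDA has a multi-document corpus to work with.
def lda_topic_modeling_by_line(text, num_topics=5):
    docs = [jieba.lcut(line) for line in text.splitlines() if line.strip()]
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    return lda_model.print_topics(num_words=5)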
# Build the Streamlit web app
st.title("🤙🤙🤙YAHOO新聞關鍵詞提取工具👂👂")

# MMR diversity parameter
diversity = st.slider("選擇MMR多樣性參數", 0.0, 1.0, 0.7)

# Number of LDA topics. This slider must live outside the button block:
# a widget created inside an `if st.button(...)` branch vanishes on the
# rerun that adjusting it triggers, because the button resets to False.
num_topics = st.slider("選擇LDA主題數量", 1, 10, 5)

# Input box for the Yahoo News URL
url = st.text_input("請輸入Yahoo新聞的URL:")

if st.button("抓取並提取關鍵詞"):
    if url:
        title, content = fetch_yahoo_news(url)
        st.write("新聞標題:", title)
        st.write("新聞內容:", content)

        # Put the article into a DataFrame
        data = {'Title': [title], 'Content': [content]}
        df = pd.DataFrame(data)
        st.write("新聞內容的DataFrame:")
        st.write(df)

        # Extract keywords with the default KeyBERT model
        keywords = extract_keywords(content, diversity=diversity)
        st.write("關鍵詞提取結果:")
        for keyword in keywords:
            st.write(f"{keyword[0]}: {keyword[1]:.4f}")
        plot_keywords(keywords, "關鍵詞提取結果")

        # Extract keywords again with a multilingual sentence-transformer
        # (instantiated on every click here; caching it would be faster)
        kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
        keywords_multilingual = kw_model_multilingual.extract_keywords(content, vectorizer=vectorizer, use_mmr=True, diversity=diversity)
        st.write("多語言模型關鍵詞提取結果:")
        for keyword in keywords_multilingual:
            st.write(f"{keyword[0]}: {keyword[1]:.4f}")
        plot_keywords(keywords_multilingual, "多語言模型關鍵詞提取結果")

        # TF-IDF word cloud
        plot_wordcloud(content)

        # LDA topic modeling
        lda_topics = lda_topic_modeling(content, num_topics=num_topics)
        st.write("LDA主題模型結果:")
        for topic in lda_topics:
            st.write(f"主題 {topic[0]}: {topic[1]}")
    else:
        st.write("請輸入有效的Yahoo新聞URL。")