Update app.py
app.py CHANGED
@@ -1,4 +1,7 @@
+# -*- coding: utf-8 -*-
 import requests
+from bs4 import BeautifulSoup
+import pandas as pd
 import jieba
 from keybert import KeyBERT
 from sklearn.feature_extraction.text import CountVectorizer
@@ -22,9 +25,6 @@ download_font(font_url, font_path)
 # Set up the font
 font_prop = FontProperties(fname=font_path)
 
-# Load a Traditional Chinese dictionary
-# jieba.set_dictionary('path_to_your_dict.txt')  # actual path of the Traditional Chinese dictionary; uncomment and set the correct path if you need one
-
 # Define the word-segmentation function
 def jieba_tokenizer(text):
     return jieba.lcut(text)
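Note: the hunks that follow call extract_keywords(content) and pass vectorizer=vectorizer, yet neither name is defined in any hunk shown here; both presumably live in the unchanged middle of app.py. The usual KeyBERT wiring for Chinese text feeds jieba_tokenizer into a CountVectorizer, roughly as in this sketch (the helper signature and the top_n value are assumptions, not taken from the commit):

# Sketch only: assumes the imports and jieba_tokenizer defined above.
vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
kw_model = KeyBERT()

def extract_keywords(text, top_n=5):
    # Returns (keyword, score) pairs, matching the keyword[0]/keyword[1]
    # indexing used by the Streamlit display loops in the diff.
    return kw_model.extract_keywords(text, vectorizer=vectorizer, top_n=top_n)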
@@ -52,13 +52,26 @@ def plot_keywords(keywords, title):
     plt.yticks(fontproperties=font_prop)
     st.pyplot(plt)
 
-#
+# Web scraping section
+def fetch_article(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
+    title = soup.find('h1').get_text()
+    content_paragraphs = soup.find_all('p')
+    content = ' '.join([para.get_text() for para in content_paragraphs])
+    return title, content
+
+# Streamlit application
 st.title("中文關鍵詞提取工具")
-doc = st.text_area("請輸入文章:")
 
-
-
-
+url = st.text_input("請輸入Yahoo新聞文章的URL:")
+if url:
+    title, content = fetch_article(url)
+    st.write("文章標題:", title)
+    st.write("文章內容:", content)
+
+    if st.button("提取關鍵詞"):
+        keywords = extract_keywords(content)
         st.write("關鍵詞提取結果:")
         for keyword in keywords:
             st.write(f"{keyword[0]}: {keyword[1]:.4f}")
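Note: fetch_article in the hunk above assumes the request succeeds and that the page has an <h1>; if either fails, soup.find('h1') returns None and .get_text() raises AttributeError in the middle of the Streamlit run. Not part of the commit, but a hardened variant could look like this sketch (the _safe name and the 10-second timeout are placeholders):

def fetch_article_safe(url):
    # Bound the request and surface HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # News pages usually carry the headline in an <h1>, but guard anyway.
    h1 = soup.find('h1')
    title = h1.get_text(strip=True) if h1 else ""
    content = ' '.join(p.get_text(strip=True) for p in soup.find_all('p'))
    return title, content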
@@ -67,11 +80,11 @@ if st.button("提取關鍵詞"):
 
         # Use another model for keyword extraction
         kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
-        keywords_multilingual = kw_model_multilingual.extract_keywords(doc, vectorizer=vectorizer)
+        keywords_multilingual = kw_model_multilingual.extract_keywords(content, vectorizer=vectorizer)
         st.write("多語言模型關鍵詞提取結果:")
         for keyword in keywords_multilingual:
             st.write(f"{keyword[0]}: {keyword[1]:.4f}")
 
         plot_keywords(keywords_multilingual, "多語言模型關鍵詞提取結果")
-
-
+else:
+    st.write("請輸入文章的URL以進行關鍵詞提取。")
|