Roberta2024 committed on
Commit
46427cb
·
verified ·
1 Parent(s): 64b8404

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -11
app.py CHANGED
@@ -1,4 +1,7 @@
 
1
  import requests
 
 
2
  import jieba
3
  from keybert import KeyBERT
4
  from sklearn.feature_extraction.text import CountVectorizer
@@ -22,9 +25,6 @@ download_font(font_url, font_path)
22
  # 設置字體
23
  font_prop = FontProperties(fname=font_path)
24
 
25
- # 讀取繁體中文詞典
26
- # jieba.set_dictionary('path_to_your_dict.txt') # 繁體中文詞典的實際路徑,若需要繁體字典請取消註解並設置正確路徑
27
-
28
  # 定義斷詞函數
29
def jieba_tokenizer(text):
    """Segment *text* with jieba and return the resulting list of tokens."""
    tokens = jieba.lcut(text)
    return tokens
@@ -52,13 +52,26 @@ def plot_keywords(keywords, title):
52
  plt.yticks(fontproperties=font_prop)
53
  st.pyplot(plt)
54
 
55
- # 建立Streamlit網頁應用程式
 
 
 
 
 
 
 
 
 
56
  st.title("中文關鍵詞提取工具")
57
- doc = st.text_area("請輸入文章:")
58
 
59
- if st.button("提取關鍵詞"):
60
- if doc:
61
- keywords = extract_keywords(doc)
 
 
 
 
 
62
  st.write("關鍵詞提取結果:")
63
  for keyword in keywords:
64
  st.write(f"{keyword[0]}: {keyword[1]:.4f}")
@@ -67,11 +80,11 @@ if st.button("提取關鍵詞"):
67
 
68
  # 使用另一個模型進行關鍵詞提取
69
  kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
70
- keywords_multilingual = kw_model_multilingual.extract_keywords(doc, vectorizer=vectorizer)
71
  st.write("多語言模型關鍵詞提取結果:")
72
  for keyword in keywords_multilingual:
73
  st.write(f"{keyword[0]}: {keyword[1]:.4f}")
74
 
75
  plot_keywords(keywords_multilingual, "多語言模型關鍵詞提取結果")
76
- else:
77
- st.write("請輸入文章內容以進行關鍵詞提取。")
 
1
+ # -*- coding: utf-8 -*-
2
  import requests
3
+ from bs4 import BeautifulSoup
4
+ import pandas as pd
5
  import jieba
6
  from keybert import KeyBERT
7
  from sklearn.feature_extraction.text import CountVectorizer
 
25
  # 設置字體
26
  font_prop = FontProperties(fname=font_path)
27
 
 
 
 
28
  # 定義斷詞函數
29
def jieba_tokenizer(text):
    """Segment *text* with jieba and return the resulting list of tokens."""
    tokens = jieba.lcut(text)
    return tokens
 
52
  plt.yticks(fontproperties=font_prop)
53
  st.pyplot(plt)
54
 
55
+ # Web scraping部分
56
def fetch_article(url, timeout=10):
    """Download a web page and return its headline and paragraph text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the text of the first
        ``<h1>`` element, or an empty string when the page has none;
        ``content`` is the text of every ``<p>`` element joined by single
        spaces.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout an unresponsive host would block the app forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than extracting keywords from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # find() returns None when the page has no <h1>; avoid an AttributeError.
    heading = soup.find('h1')
    title = heading.get_text() if heading is not None else ''
    content = ' '.join(para.get_text() for para in soup.find_all('p'))
    return title, content
63
+
64
+ # Streamlit應用程式
65
  st.title("中文關鍵詞提取工具")
 
66
 
67
+ url = st.text_input("請輸入Yahoo新聞文章的URL:")
68
+ if url:
69
+ title, content = fetch_article(url)
70
+ st.write("文章標題:", title)
71
+ st.write("文章內容:", content)
72
+
73
+ if st.button("提取關鍵詞"):
74
+ keywords = extract_keywords(content)
75
  st.write("關鍵詞提取結果:")
76
  for keyword in keywords:
77
  st.write(f"{keyword[0]}: {keyword[1]:.4f}")
 
80
 
81
  # 使用另一個模型進行關鍵詞提取
82
  kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
83
+ keywords_multilingual = kw_model_multilingual.extract_keywords(content, vectorizer=vectorizer)
84
  st.write("多語言模型關鍵詞提取結果:")
85
  for keyword in keywords_multilingual:
86
  st.write(f"{keyword[0]}: {keyword[1]:.4f}")
87
 
88
  plot_keywords(keywords_multilingual, "多語言模型關鍵詞提取結果")
89
+ else:
90
+ st.write("請輸入文章的URL以進行關鍵詞提取。")