dryade36513 committed on
Commit
cff7733
·
verified ·
1 Parent(s): bd8c43b

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +151 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ import jieba
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import difflib
7
+ import numpy as np
8
+ import time
9
+
10
# Configure the browser tab title, icon and overall page layout.
st.set_page_config(
    page_title="哞哞文章相似度檢測",
    page_icon="🐮",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Custom CSS: larger text-area font plus the two classes used by the result
# panel (`big-font` for the score, `result-font` for the verdict).
st.markdown("""
<style>
.stTextArea textarea {
    font-size: 16px !important;
}
.big-font {
    font-size: 24px !important;
    font-weight: bold !important;
    color: #FF4B4B !important;
}
.result-font {
    font-size: 20px !important;
    color: #1E88E5 !important;
}
</style>
""", unsafe_allow_html=True)

# Page heading.
st.markdown("<h1 style='text-align: center; color: #FF4B4B;'>🐮 哞哞文章相似度檢測</h1>", unsafe_allow_html=True)

# Two side-by-side columns, one per article.
col1, col2 = st.columns(2)

# The visible headings come from the markdown lines; the widgets still get a
# real (hidden) label instead of an empty string, which Streamlit discourages
# for accessibility reasons.
with col1:
    st.markdown("### 📝 文章1")
    text1 = st.text_area(
        "文章1",
        height=300,
        placeholder="請在這裡輸入第一篇文章...",
        key="text1",
        label_visibility="collapsed",
    )

with col2:
    st.markdown("### 📝 文章2")
    text2 = st.text_area(
        "文章2",
        height=300,
        placeholder="請在這裡輸入第二篇文章...",
        key="text2",
        label_visibility="collapsed",
    )

# Three equal columns so the start button sits in the middle one.
col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 1])

with col_btn2:
    start_btn = st.button("🚀 開始計算相似度", type="primary", use_container_width=True)
56
def calculate_similarity(text1, text2):
    """Score how similar two texts are and classify the result.

    Three measures are combined with weights 0.4 / 0.3 / 0.3:

    1. Jaccard overlap of the sets of jieba-segmented words.
    2. ``difflib.SequenceMatcher`` ratio over the sentences (split on "。").
    3. Cosine similarity of TF-IDF vectors built from the segmented words.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A ``(score, verdict)`` tuple where ``score`` is a percentage rounded
        to two decimals and ``verdict`` is the matching description string.
        Returns ``(None, None)`` when either text is empty or whitespace-only.
    """
    if not text1.strip() or not text2.strip():
        return None, None

    # 1. Word overlap. Whitespace-only tokens are dropped so that shared
    #    spaces or newlines do not count as shared vocabulary.
    words1 = [word for word in jieba.cut(text1) if word.strip()]
    words2 = [word for word in jieba.cut(text2) if word.strip()]
    word_set1 = set(words1)
    word_set2 = set(words2)
    all_words = word_set1 | word_set2
    word_similarity = len(word_set1 & word_set2) / len(all_words) if all_words else 0.0

    # 2. Sentence-level similarity. Empty pieces (e.g. the one after a
    #    trailing "。") are discarded so they cannot match each other.
    sentences1 = [part.strip() for part in text1.split("。") if part.strip()]
    sentences2 = [part.strip() for part in text2.split("。") if part.strip()]
    sentence_similarity = difflib.SequenceMatcher(None, sentences1, sentences2).ratio()

    # 3. TF-IDF cosine similarity over the segmented words. The words are
    #    re-joined with spaces and the token pattern keeps every
    #    whitespace-delimited token, including single-character words.
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\S+")
    try:
        tfidf_matrix = vectorizer.fit_transform([" ".join(words1), " ".join(words2)])
    except ValueError:
        # fit_transform rejects input that yields an empty vocabulary.
        cosine_sim = 0.0
    else:
        cosine_sim = float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    # Weighted total, expressed as a percentage.
    weights = [0.4, 0.3, 0.3]
    total_similarity = (word_similarity * weights[0] +
                        sentence_similarity * weights[1] +
                        cosine_sim * weights[2]) * 100

    similarity_score = round(float(total_similarity), 2)

    # Map the score onto one of the four verdict bands.
    if similarity_score <= 30:
        result = "兩篇文章沒有關係"
    elif similarity_score <= 60:
        result = "兩篇文章似乎有那麼一點關係"
    elif similarity_score <= 80:
        result = "兩篇文章很類似"
    else:
        result = "兩篇文章有抄襲犯罪的味道"

    return similarity_score, result
101
+
102
# Run the analysis only when the button was pressed and both inputs contain
# something other than whitespace; otherwise fall through to the hint below.
similarity_score = None
result = None

if start_btn and text1.strip() and text2.strip():
    with st.spinner('🔍 分析中,請稍等...'):
        # Animated progress bar shown while the spinner is active.
        progress_text = "計算中..."
        my_bar = st.progress(0, text=progress_text)
        for percent_complete in range(100):
            time.sleep(0.01)
            my_bar.progress(percent_complete + 1, text=progress_text)

        similarity_score, result = calculate_similarity(text1, text2)

        # Always remove the bar, even if no score could be computed.
        my_bar.empty()

if similarity_score is not None:
    # Result panel.
    st.markdown("---")
    st.markdown("<h3 style='text-align: center;'>✨ 分析結果</h3>", unsafe_allow_html=True)

    # Assembled without leading indentation so Markdown cannot mistake the
    # HTML for an indented code block.
    result_text = (
        "<div style='text-align: center;'>"
        f"<p class='big-font'>相似度:{similarity_score}%</p>"
        f"<p class='result-font'>分析結果:{result}</p>"
        "</div>"
    )
    st.markdown(result_text, unsafe_allow_html=True)

    # Emoji matching the same score bands as the verdict text.
    if similarity_score <= 30:
        emoji = "😌"
    elif similarity_score <= 60:
        emoji = "🤔"
    elif similarity_score <= 80:
        emoji = "😮"
    else:
        emoji = "😱"
    st.markdown(f"<h1 style='text-align: center;'>{emoji}</h1>", unsafe_allow_html=True)
else:
    st.info('👆 請在上方輸入兩篇要比較的文章,然後點擊"開始計算相似度"按鈕')

# Footer explaining the score bands.
st.markdown("---")
st.markdown("""
<div style='text-align: center;'>
<p style='color: gray; font-size: 14px;'>
💡 判定標準:<br>
0-30%:文章沒有關係 | 31-60%:稍有關係 | 61-80%:很類似 | 81-100%:疑似抄襲
</p>
</div>
""", unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ jieba
3
+ scikit-learn
4
+ numpy