Spaces:
Sleeping
Sleeping
File size: 5,261 Bytes
cff7733 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# app.py
import streamlit as st
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib
import numpy as np
import time
# Browser-tab metadata and overall page layout. Streamlit requires
# set_page_config to be called before any other st.* command.
_PAGE_SETTINGS = {
    "page_title": "哞哞文章相似度檢測",
    "page_icon": "🐮",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS: larger text inside the text areas, plus the two classes
# (big-font / result-font) used when the result is rendered.
_CUSTOM_CSS = """
<style>
.stTextArea textarea {
font-size: 16px !important;
}
.big-font {
font-size: 24px !important;
font-weight: bold !important;
color: #FF4B4B !important;
}
.result-font {
font-size: 20px !important;
color: #1E88E5 !important;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Centered page heading.
_TITLE_HTML = "<h1 style='text-align: center; color: #FF4B4B;'>🐮 哞哞文章相似度檢測</h1>"
st.markdown(_TITLE_HTML, unsafe_allow_html=True)
# Two side-by-side columns, one text area per article.
# Each text area gets a real (non-empty) label for accessibility; the label is
# collapsed because the markdown heading above the widget already names it.
col1, col2 = st.columns(2)
with col1:
    st.markdown("### 📝 文章1")
    text1 = st.text_area(
        "文章1",
        height=300,
        placeholder="請在這裡輸入第一篇文章...",
        key="text1",
        label_visibility="collapsed",
    )
with col2:
    st.markdown("### 📝 文章2")
    text2 = st.text_area(
        "文章2",
        height=300,
        placeholder="請在這裡輸入第二篇文章...",
        key="text2",
        label_visibility="collapsed",
    )

# Three equal columns; the button sits in the middle one so it is centered.
col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 1])
with col_btn2:
    start_btn = st.button("🚀 開始計算相似度", type="primary", use_container_width=True)
def _tokenize(text):
    """Segment *text* with jieba, dropping whitespace-only tokens."""
    return [token for token in jieba.cut(text) if token.strip()]


def _split_sentences(text):
    """Split *text* on the full stop "。", dropping empty fragments."""
    return [part.strip() for part in text.split("。") if part.strip()]


def calculate_similarity(text1, text2):
    """Score how similar two texts are and map the score to a verdict.

    The score is a weighted sum of three measures, scaled to 0-100:

    * word overlap (Jaccard index of the jieba token sets), weight 0.4
    * sentence-sequence similarity (difflib ratio), weight 0.3
    * cosine similarity of TF-IDF vectors, weight 0.3

    Args:
        text1: First article.
        text2: Second article.

    Returns:
        A ``(score, verdict)`` tuple, where ``score`` is a float rounded to
        two decimals and ``verdict`` is a user-facing description, or
        ``(None, None)`` when either text is empty or whitespace-only.
    """
    if not text1.strip() or not text2.strip():
        return None, None

    # 1. Word overlap: Jaccard index over the distinct tokens of each text.
    word_set1 = set(_tokenize(text1))
    word_set2 = set(_tokenize(text2))
    all_words = word_set1 | word_set2
    word_similarity = len(word_set1 & word_set2) / len(all_words) if all_words else 0.0

    # 2. Sentence similarity. Empty fragments (e.g. the one after a trailing
    #    "。") are removed first; otherwise they count as matching sentences
    #    and inflate the score of unrelated texts.
    sentence_matcher = difflib.SequenceMatcher(
        None, _split_sentences(text1), _split_sentences(text2)
    )
    sentence_similarity = sentence_matcher.ratio()

    # 3. TF-IDF cosine similarity. The vectorizer's default regex tokenizer
    #    treats an unbroken run of Chinese characters as a single token, so
    #    jieba segmentation is supplied explicitly (token_pattern=None because
    #    the pattern is unused once a tokenizer is given).
    vectorizer = TfidfVectorizer(tokenizer=_tokenize, token_pattern=None)
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # Raised when no vocabulary can be built; treat as "no similarity".
        cosine_sim = 0.0
    else:
        cosine_sim = float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    # Weighted total, expressed as a percentage.
    weights = [0.4, 0.3, 0.3]
    total_similarity = (
        word_similarity * weights[0]
        + sentence_similarity * weights[1]
        + cosine_sim * weights[2]
    ) * 100
    similarity_score = round(float(total_similarity), 2)

    # Map the score to a verdict.
    if similarity_score <= 30:
        result = "兩篇文章沒有關係"
    elif similarity_score <= 60:
        result = "兩篇文章似乎有那麼一點關係"
    elif similarity_score <= 80:
        result = "兩篇文章很類似"
    else:
        result = "兩篇文章有抄襲犯罪的味道"
    return similarity_score, result
# Run the analysis only when the button was pressed and both articles contain
# something other than whitespace; otherwise show the usage hint.
if start_btn and text1.strip() and text2.strip():
    with st.spinner('🔍 分析中,請稍等...'):
        # Purely cosmetic progress animation (100 steps of 0.01 s each).
        progress_text = "計算中..."
        my_bar = st.progress(0, text=progress_text)
        for percent_complete in range(100):
            time.sleep(0.01)
            my_bar.progress(percent_complete + 1, text=progress_text)
        similarity_score, result = calculate_similarity(text1, text2)

    # Always remove the progress bar, even if no score was produced, so it
    # is never left on screen in a finished state.
    my_bar.empty()

    if similarity_score is not None:
        st.markdown("---")
        st.markdown("<h3 style='text-align: center;'>✨ 分析結果</h3>", unsafe_allow_html=True)
        # Built line by line so the HTML does not depend on source indentation.
        result_text = (
            "\n<div style='text-align: center;'>\n"
            f"<p class='big-font'>相似度:{similarity_score}%</p>\n"
            f"<p class='result-font'>分析結果:{result}</p>\n"
            "</div>\n"
        )
        st.markdown(result_text, unsafe_allow_html=True)

        # Emoji reaction, using the same score bands as the verdict text.
        if similarity_score <= 30:
            emoji = "😌"
        elif similarity_score <= 60:
            emoji = "🤔"
        elif similarity_score <= 80:
            emoji = "😮"
        else:
            emoji = "😱"
        st.markdown(f"<h1 style='text-align: center;'>{emoji}</h1>", unsafe_allow_html=True)
else:
    st.info('👆 請在上方輸入兩篇要比較的文章,然後點擊"開始計算相似度"按鈕')
# Footer: legend explaining how score ranges map to verdicts.
st.markdown("---")
st.markdown("""
<div style='text-align: center;'>
<p style='color: gray; font-size: 14px;'>
💡 判定標準:<br>
0-30%:文章沒有關係 | 31-60%:稍有關係 | 61-80%:很類似 | 81-100%:疑似抄襲
</p>
</div>
""", unsafe_allow_html=True)