Spaces:
Sleeping
Sleeping
# app.py | |
import streamlit as st | |
import jieba | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
import difflib | |
import numpy as np | |
import time | |
# Page-level settings (browser tab title, icon, layout); applied through
# st.set_page_config before any other element is rendered.
_page_settings = {
    "page_title": "哞哞文章相似度檢測",
    "page_icon": "🐮",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_page_settings)

# Custom CSS: larger text-area font plus the two classes (.big-font and
# .result-font) used by the result panel further down.
_custom_css = """
<style>
.stTextArea textarea {
font-size: 16px !important;
}
.big-font {
font-size: 24px !important;
font-weight: bold !important;
color: #FF4B4B !important;
}
.result-font {
font-size: 20px !important;
color: #1E88E5 !important;
}
</style>
"""
st.markdown(_custom_css, unsafe_allow_html=True)

# Centered page heading.
_heading_html = "<h1 style='text-align: center; color: #FF4B4B;'>🐮 哞哞文章相似度檢測</h1>"
st.markdown(_heading_html, unsafe_allow_html=True)
# Two side-by-side columns, one text area per article.
# Each text area gets a real (non-empty) label that is hidden with
# label_visibility="collapsed": Streamlit discourages empty labels for
# accessibility reasons, and the visible heading is already provided by the
# markdown line above each widget.
col1, col2 = st.columns(2)
with col1:
    st.markdown("### 📝 文章1")
    text1 = st.text_area(
        "文章1",
        height=300,
        placeholder="請在這裡輸入第一篇文章...",
        key="text1",
        label_visibility="collapsed",
    )
with col2:
    st.markdown("### 📝 文章2")
    text2 = st.text_area(
        "文章2",
        height=300,
        placeholder="請在這裡輸入第二篇文章...",
        key="text2",
        label_visibility="collapsed",
    )

# Three equal columns; the button sits in the middle one so it is centered.
col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 1])
with col_btn2:
    start_btn = st.button("🚀 開始計算相似度", type="primary", use_container_width=True)
def calculate_similarity(text1, text2):
    """Score how similar two texts are and classify the result.

    The score is a weighted blend of three measures, scaled to 0-100:

    * 40% - Jaccard overlap of the two jieba word sets,
    * 30% - ``difflib.SequenceMatcher`` ratio over the lists of
      "。"-separated sentences,
    * 30% - cosine similarity of the TF-IDF vectors built from the jieba
      tokens.

    Args:
        text1: First article, as a string.
        text2: Second article, as a string.

    Returns:
        A ``(score, verdict)`` tuple, where ``score`` is a float rounded to
        two decimals and ``verdict`` is the user-facing message for the band
        the score falls in. Returns ``(None, None)`` when either text is
        empty or contains only whitespace.
    """
    if not text1.strip() or not text2.strip():
        return None, None

    # 1. Word overlap (Jaccard index over the jieba tokens).
    words1 = list(jieba.cut(text1))
    words2 = list(jieba.cut(text2))
    word_set1 = set(words1)
    word_set2 = set(words2)
    all_words = word_set1 | word_set2
    # Guard the division in case segmentation yields no tokens at all.
    word_similarity = len(word_set1 & word_set2) / len(all_words) if all_words else 0.0

    # 2. Sentence-level similarity.
    # Empty segments are dropped: a text ending in "。" would otherwise
    # contribute a trailing "" that matches the other text's trailing ""
    # and inflates the ratio for completely unrelated articles.
    sentences1 = [s.strip() for s in text1.split("。") if s.strip()]
    sentences2 = [s.strip() for s in text2.split("。") if s.strip()]
    sentence_matcher = difflib.SequenceMatcher(None, sentences1, sentences2)
    sentence_similarity = sentence_matcher.ratio()

    # 3. TF-IDF cosine similarity.
    # The vectorizer is fed the space-joined jieba tokens rather than the
    # raw text: its default token pattern only splits on non-word
    # characters and requires two or more characters per token, so
    # unsegmented Chinese text would be treated as a few clause-long tokens.
    # The custom pattern also keeps single-character words.
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    try:
        tfidf_matrix = vectorizer.fit_transform([" ".join(words1), " ".join(words2)])
    except ValueError:
        # Raised when no token survives (e.g. punctuation-only input), which
        # leaves the vocabulary empty; treat that as "no similarity".
        cosine_sim = 0.0
    else:
        cosine_sim = float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    # Weighted total, expressed as a percentage.
    weights = [0.4, 0.3, 0.3]
    total_similarity = (word_similarity * weights[0] +
                        sentence_similarity * weights[1] +
                        cosine_sim * weights[2]) * 100
    similarity_score = float(round(total_similarity, 2))

    # Map the score to its verdict band.
    if similarity_score <= 30:
        result = "兩篇文章沒有關係"
    elif similarity_score <= 60:
        result = "兩篇文章似乎有那麼一點關係"
    elif similarity_score <= 80:
        result = "兩篇文章很類似"
    else:
        result = "兩篇文章有抄襲犯罪的味道"
    return similarity_score, result
# Run the comparison only when the button was pressed and both articles hold
# something other than whitespace. Whitespace-only input is rejected here
# because calculate_similarity returns (None, None) for it, which would
# otherwise leave a finished progress bar on screen with no result or hint.
if start_btn and text1.strip() and text2.strip():
    with st.spinner('🔍 分析中,請稍等...'):
        # Cosmetic progress bar: fills over roughly one second before the
        # actual computation starts.
        progress_text = "計算中..."
        my_bar = st.progress(0, text=progress_text)
        for percent_complete in range(100):
            time.sleep(0.01)
            my_bar.progress(percent_complete + 1, text=progress_text)
        # Compute the similarity score and its verdict.
        similarity_score, result = calculate_similarity(text1, text2)
        # Always remove the bar, whether or not a score was produced.
        my_bar.empty()

    if similarity_score is not None:
        # Result panel: heading, score and verdict.
        st.markdown("---")
        st.markdown("<h3 style='text-align: center;'>✨ 分析結果</h3>", unsafe_allow_html=True)
        result_text = f"""
<div style='text-align: center;'>
<p class='big-font'>相似度:{similarity_score}%</p>
<p class='result-font'>分析結果:{result}</p>
</div>
"""
        st.markdown(result_text, unsafe_allow_html=True)
        # Pick the emoji for the band the score falls in (<=30, <=60, <=80,
        # otherwise the last one).
        for upper_bound, emoji in ((30, "😌"), (60, "🤔"), (80, "😮")):
            if similarity_score <= upper_bound:
                break
        else:
            emoji = "😱"
        st.markdown(f"<h1 style='text-align: center;'>{emoji}</h1>", unsafe_allow_html=True)
else:
    st.info('👆 請在上方輸入兩篇要比較的文章,然後點擊"開始計算相似度"按鈕')
# Footer: a divider followed by a small gray legend listing the score bands
# shown to the user.
st.markdown("---")
_legend_html = """
<div style='text-align: center;'>
<p style='color: gray; font-size: 14px;'>
💡 判定標準:<br>
0-30%:文章沒有關係 | 31-60%:稍有關係 | 61-80%:很類似 | 81-100%:疑似抄襲
</p>
</div>
"""
st.markdown(_legend_html, unsafe_allow_html=True)