# app.py
import streamlit as st
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib
import numpy as np
import time
# Configure the page: title, icon, wide layout, sidebar collapsed.
st.set_page_config(
    page_title="哞哞文章相似度檢測",
    page_icon="🐮",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# Custom CSS styles (the injected block is currently empty).
st.markdown("""
""", unsafe_allow_html=True)
# Page title. The literal is written with explicit "\n" escapes because a
# plain double-quoted string cannot span several source lines.
st.markdown("\n🐮 哞哞文章相似度檢測\n", unsafe_allow_html=True)
# Two-column layout, one column per article.
col1, col2 = st.columns(2)
with col1:
    st.markdown("### 📝 文章1")
    # Streamlit discourages empty labels; give the widget a real label and
    # hide it, since the heading above already names the field.
    text1 = st.text_area(
        "文章1",
        height=300,
        placeholder="請在這裡輸入第一篇文章...",
        key="text1",
        label_visibility="collapsed",
    )
with col2:
    st.markdown("### 📝 文章2")
    text2 = st.text_area(
        "文章2",
        height=300,
        placeholder="請在這裡輸入第二篇文章...",
        key="text2",
        label_visibility="collapsed",
    )
# Button row: three equal columns so the button sits in the middle one.
col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 1])
with col_btn2:
    start_btn = st.button("🚀 開始計算相似度", type="primary", use_container_width=True)
def calculate_similarity(text1, text2):
    """Score how similar two texts are and map the score to a verdict.

    Three measures are blended with weights 0.4 / 0.3 / 0.3:

    1. Jaccard overlap of the sets of jieba word tokens.
    2. ``difflib.SequenceMatcher`` ratio over the lists of sentences
       obtained by splitting each text on the full stop "。".
    3. Cosine similarity of the TF-IDF vectors of the two texts.

    Args:
        text1: First article.
        text2: Second article.

    Returns:
        ``(score, verdict)`` where ``score`` is a percentage rounded to two
        decimals and ``verdict`` is the matching user-facing message, or
        ``(None, None)`` when either text is missing or only whitespace.
    """
    if not text1 or not text2 or not text1.strip() or not text2.strip():
        return None, None

    # 1. Word overlap. jieba also yields whitespace runs as tokens; drop
    #    them so shared spaces/newlines do not count as shared vocabulary.
    words1 = [tok for tok in jieba.cut(text1) if tok.strip()]
    words2 = [tok for tok in jieba.cut(text2) if tok.strip()]
    word_set1 = set(words1)
    word_set2 = set(words2)
    all_words = word_set1 | word_set2
    word_similarity = len(word_set1 & word_set2) / len(all_words) if all_words else 0.0

    # 2. Sentence similarity. Strip each piece and drop empty ones: a text
    #    ending in "。" would otherwise contribute a trailing "" that
    #    matches the other text's trailing "" and inflates the ratio.
    sentences1 = [part.strip() for part in text1.split("。") if part.strip()]
    sentences2 = [part.strip() for part in text2.split("。") if part.strip()]
    if sentences1 and sentences2:
        sentence_similarity = difflib.SequenceMatcher(None, sentences1, sentences2).ratio()
    else:
        # SequenceMatcher reports 1.0 for two empty sequences; treat
        # "no sentences at all" as no evidence of similarity instead.
        sentence_similarity = 0.0

    # 3. TF-IDF similarity. Feed the vectorizer the jieba-segmented text
    #    (space separated) and accept single-character tokens; the default
    #    token pattern would treat a whole unsegmented Chinese clause as one
    #    token and skip one-character words.
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    try:
        tfidf_matrix = vectorizer.fit_transform([" ".join(words1), " ".join(words2)])
    except ValueError:
        # Raised when no token survives (e.g. punctuation-only input).
        cosine_sim = 0.0
    else:
        cosine_sim = float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    # Weighted total, expressed as a percentage.
    weights = [0.4, 0.3, 0.3]
    total_similarity = (word_similarity * weights[0] +
                        sentence_similarity * weights[1] +
                        cosine_sim * weights[2]) * 100
    similarity_score = round(total_similarity, 2)

    # Map the score to a verdict.
    if similarity_score <= 30:
        result = "兩篇文章沒有關係"
    elif similarity_score <= 60:
        result = "兩篇文章似乎有那麼一點關係"
    elif similarity_score <= 80:
        result = "兩篇文章很類似"
    else:
        result = "兩篇文章有抄襲犯罪的味道"
    return similarity_score, result
# Run the analysis only when the button was pressed and both articles contain
# something other than whitespace; otherwise fall through to the usage hint.
if start_btn and text1 and text2 and text1.strip() and text2.strip():
    with st.spinner('🔍 分析中,請稍等...'):
        # Animated progress bar (100 steps of 0.01 s, i.e. about one second).
        progress_text = "計算中..."
        my_bar = st.progress(0, text=progress_text)
        for percent_complete in range(100):
            time.sleep(0.01)
            my_bar.progress(percent_complete + 1, text=progress_text)
        # Compute the similarity.
        similarity_score, result = calculate_similarity(text1, text2)
        # Always remove the progress bar, whether or not a score came back.
        my_bar.empty()
    if similarity_score is not None:
        # Show the result.
        st.markdown("---")
        st.markdown("✨ 分析結果\n", unsafe_allow_html=True)
        result_text = f"\n相似度:{similarity_score}%\n分析結果:{result}\n"
        st.markdown(result_text, unsafe_allow_html=True)
        # Pick an emoji using the same thresholds as the verdict text.
        if similarity_score <= 30:
            st.markdown("😌\n", unsafe_allow_html=True)
        elif similarity_score <= 60:
            st.markdown("🤔\n", unsafe_allow_html=True)
        elif similarity_score <= 80:
            st.markdown("😮\n", unsafe_allow_html=True)
        else:
            st.markdown("😱\n", unsafe_allow_html=True)
else:
    st.info('👆 請在上方輸入兩篇要比較的文章,然後點擊"開始計算相似度"按鈕')
# Footer: legend explaining the score bands.
st.markdown("---")
st.markdown("""
💡 判定標準:
0-30%:文章沒有關係 | 31-60%:稍有關係 | 61-80%:很類似 | 81-100%:疑似抄襲
""", unsafe_allow_html=True)