# app.py
"""Streamlit app that scores how similar two articles are.

The score blends three signals: jieba word overlap (Jaccard), sentence-level
sequence matching, and TF-IDF cosine similarity over the jieba tokens.
"""
import difflib
import time

import jieba
import numpy as np
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Weights for (word overlap, sentence similarity, TF-IDF cosine); they sum to 1.
SCORE_WEIGHTS = (0.4, 0.3, 0.3)

# (inclusive upper bound in percent, verdict text, emoji), checked in order.
VERDICT_LEVELS = (
    (30, "兩篇文章沒有關係", "😌"),
    (60, "兩篇文章似乎有那麼一點關係", "🤔"),
    (80, "兩篇文章很類似", "😮"),
)
# Used when the score is above every bound in VERDICT_LEVELS.
TOP_VERDICT = ("兩篇文章有抄襲犯罪的味道", "😱")


def _tokenize(text):
    """Segment *text* with jieba, dropping whitespace-only tokens."""
    return [token for token in jieba.cut(text) if token.strip()]


def _split_sentences(text):
    """Split *text* on the full stop "。" and drop empty fragments."""
    return [part.strip() for part in text.split("。") if part.strip()]


def _classify(score):
    """Return the ``(verdict, emoji)`` pair for a percentage *score*."""
    for upper_bound, verdict, emoji in VERDICT_LEVELS:
        if score <= upper_bound:
            return verdict, emoji
    return TOP_VERDICT


def calculate_similarity(text1, text2):
    """Compute the similarity of two texts.

    Args:
        text1: First article.
        text2: Second article.

    Returns:
        ``(similarity_score, result)`` where ``similarity_score`` is a
        percentage rounded to two decimals and ``result`` is the verdict
        text, or ``(None, None)`` when either text is empty or whitespace.
    """
    if not text1.strip() or not text2.strip():
        return None, None

    # 1. Word overlap: Jaccard index over the jieba vocabularies. Both texts
    #    contain non-whitespace characters here, so the union is never empty.
    words1 = _tokenize(text1)
    words2 = _tokenize(text2)
    vocab1, vocab2 = set(words1), set(words2)
    word_similarity = len(vocab1 & vocab2) / len(vocab1 | vocab2)

    # 2. Sentence similarity. Empty fragments are removed first: otherwise the
    #    trailing "" produced by a final "。" matches in both texts and
    #    inflates the ratio, and two empty lists would score 1.0.
    sentences1 = _split_sentences(text1)
    sentences2 = _split_sentences(text2)
    if sentences1 and sentences2:
        matcher = difflib.SequenceMatcher(None, sentences1, sentences2)
        sentence_similarity = matcher.ratio()
    else:
        sentence_similarity = 0.0

    # 3. TF-IDF cosine similarity. The default token pattern needs separators
    #    between words, which unsegmented Chinese lacks, so feed it the jieba
    #    tokens joined by spaces and also accept single-character words.
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    try:
        tfidf_matrix = vectorizer.fit_transform(
            [" ".join(words1), " ".join(words2)]
        )
    except ValueError:
        # Raised for an empty vocabulary (e.g. punctuation-only input).
        cosine_sim = 0.0
    else:
        cosine_sim = float(
            cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        )

    # Weighted total, expressed as a percentage.
    components = (word_similarity, sentence_similarity, cosine_sim)
    total_similarity = 100 * sum(
        weight * value for weight, value in zip(SCORE_WEIGHTS, components)
    )
    similarity_score = round(total_similarity, 2)

    result, _ = _classify(similarity_score)
    return similarity_score, result


# Page title and layout settings (must be the first Streamlit call).
st.set_page_config(
    page_title="哞哞文章相似度檢測",
    page_icon="🐮",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Custom CSS.
# NOTE(review): the stylesheet body is empty in this source — restore the
# original rules here if they are available.
st.markdown(""" """, unsafe_allow_html=True)

# Page heading.
# NOTE(review): the HTML wrappers used in this file's st.markdown calls were
# missing from the source; minimal centered markup is used as a stand-in.
st.markdown(
    "<h1 style='text-align: center;'>🐮 哞哞文章相似度檢測</h1>",
    unsafe_allow_html=True,
)

# Two-column layout, one article per column. The visible caption is the
# markdown heading, so the widget label itself is collapsed rather than empty.
col1, col2 = st.columns(2)
with col1:
    st.markdown("### 📝 文章1")
    text1 = st.text_area(
        "文章1",
        height=300,
        placeholder="請在這裡輸入第一篇文章...",
        key="text1",
        label_visibility="collapsed",
    )
with col2:
    st.markdown("### 📝 文章2")
    text2 = st.text_area(
        "文章2",
        height=300,
        placeholder="請在這裡輸入第二篇文章...",
        key="text2",
        label_visibility="collapsed",
    )

# Button row: the button sits in the middle of three equal columns.
col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 1])
with col_btn2:
    start_btn = st.button("🚀 開始計算相似度", type="primary", use_container_width=True)

# Whitespace-only input is treated like missing input so the user gets the
# hint below instead of a progress bar followed by no result.
if start_btn and text1.strip() and text2.strip():
    with st.spinner('🔍 分析中，請稍等...'):
        # Cosmetic progress bar (about one second in total).
        progress_text = "計算中..."
        my_bar = st.progress(0, text=progress_text)
        for percent_complete in range(100):
            time.sleep(0.01)
            my_bar.progress(percent_complete + 1, text=progress_text)

        similarity_score, result = calculate_similarity(text1, text2)

        # Always clear the bar, even if no score could be computed.
        my_bar.empty()

    if similarity_score is not None:
        st.markdown("---")
        st.markdown(
            "<h2 style='text-align: center;'>✨ 分析結果</h2>",
            unsafe_allow_html=True,
        )
        result_text = f"""

相似度：{similarity_score}%

分析結果：{result}

"""
        st.markdown(result_text, unsafe_allow_html=True)

        # Emoji matching the verdict level.
        _, emoji = _classify(similarity_score)
        st.markdown(
            f"<div style='text-align: center; font-size: 3rem;'>{emoji}</div>",
            unsafe_allow_html=True,
        )
else:
    st.info('👆 請在上方輸入兩篇要比較的文章，然後點擊"開始計算相似度"按鈕')

# Footer: legend for the verdict thresholds.
st.markdown("---")
st.markdown("""

💡 判定標準：
0-30%：文章沒有關係 | 31-60%：稍有關係 | 61-80%：很類似 | 81-100%：疑似抄襲

""", unsafe_allow_html=True)