File size: 5,261 Bytes
cff7733
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# app.py
import streamlit as st
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib
import numpy as np
import time

# Browser-tab metadata and overall page layout; this has to run before any
# other Streamlit call renders output.
page_settings = {
    "page_title": "哞哞文章相似度檢測",
    "page_icon": "🐮",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**page_settings)

# Custom CSS: a larger font inside the text areas, plus the two helper classes
# (.big-font / .result-font) used when the result is rendered.
custom_css = """

<style>

    .stTextArea textarea {

        font-size: 16px !important;

    }

    .big-font {

        font-size: 24px !important;

        font-weight: bold !important;

        color: #FF4B4B !important;

    }

    .result-font {

        font-size: 20px !important;

        color: #1E88E5 !important;

    }

</style>

"""
st.markdown(custom_css, unsafe_allow_html=True)

# Page heading
st.markdown("<h1 style='text-align: center; color: #FF4B4B;'>🐮 哞哞文章相似度檢測</h1>", unsafe_allow_html=True)

# Two side-by-side columns, one text area per article
col1, col2 = st.columns(2)

with col1:
    st.markdown("### 📝 文章1")
    # Streamlit discourages empty widget labels (accessibility; may become an
    # error), so pass a real label and hide it. The markdown heading above
    # remains the visible caption.
    text1 = st.text_area(
        "文章1",
        height=300,
        placeholder="請在這裡輸入第一篇文章...",
        key="text1",
        label_visibility="collapsed",
    )

with col2:
    st.markdown("### 📝 文章2")
    text2 = st.text_area(
        "文章2",
        height=300,
        placeholder="請在這裡輸入第二篇文章...",
        key="text2",
        label_visibility="collapsed",
    )

# Three equal columns; only the middle one is used so the button is centred
col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 1])

with col_btn2:
    start_btn = st.button("🚀 開始計算相似度", type="primary", use_container_width=True)

def calculate_similarity(text1, text2):
    """Score how similar two texts are and pick a verdict message.

    Three measures are blended into a single percentage:

    * 40% - Jaccard overlap of the sets of jieba tokens of each text,
    * 30% - ``difflib.SequenceMatcher`` ratio over the "。"-separated sentences,
    * 30% - cosine similarity of TF-IDF vectors built from the jieba tokens.

    Args:
        text1: First article.
        text2: Second article.

    Returns:
        A ``(score, verdict)`` tuple. ``score`` is a float percentage rounded
        to two decimals and ``verdict`` is the user-facing message for the
        band the score falls in. ``(None, None)`` is returned when either
        text is empty or whitespace-only.
    """
    if not text1.strip() or not text2.strip():
        return None, None

    # 1. Word overlap. Whitespace-only tokens (spaces, newlines) say nothing
    #    about content and would count as shared "words", so drop them.
    words1 = [w for w in jieba.cut(text1) if w.strip()]
    words2 = [w for w in jieba.cut(text2) if w.strip()]
    word_set1 = set(words1)
    word_set2 = set(words2)
    union = word_set1 | word_set2
    word_similarity = len(word_set1 & word_set2) / len(union) if union else 0.0

    # 2. Sentence-sequence similarity. Strip each fragment and skip empty
    #    ones: a text ending in "。" otherwise contributes a trailing ""
    #    "sentence" that matches the other text's trailing "".
    sentences1 = [s.strip() for s in text1.split("。") if s.strip()]
    sentences2 = [s.strip() for s in text2.split("。") if s.strip()]
    sentence_similarity = difflib.SequenceMatcher(None, sentences1, sentences2).ratio()

    # 3. TF-IDF cosine similarity. The default analyzer splits on word
    #    boundaries, so a run of Chinese characters becomes one giant token;
    #    feed it the jieba tokens instead via a pass-through analyzer.
    vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
    try:
        tfidf_matrix = vectorizer.fit_transform([words1, words2])
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary ends up empty.
        cosine_sim = 0.0
    else:
        cosine_sim = float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    # Weighted blend, expressed as a percentage
    weights = [0.4, 0.3, 0.3]
    total_similarity = (word_similarity * weights[0] +
                        sentence_similarity * weights[1] +
                        cosine_sim * weights[2]) * 100

    # Plain float so callers never see a numpy scalar
    similarity_score = round(float(total_similarity), 2)

    # Verdict bands: <=30, <=60, <=80, otherwise above 80
    if similarity_score <= 30:
        result = "兩篇文章沒有關係"
    elif similarity_score <= 60:
        result = "兩篇文章似乎有那麼一點關係"
    elif similarity_score <= 80:
        result = "兩篇文章很類似"
    else:
        result = "兩篇文章有抄襲犯罪的味道"

    return similarity_score, result

# Run the analysis only once the button was clicked and both text areas hold
# some input; otherwise show the usage hint.
if start_btn and text1 and text2:
    with st.spinner('🔍 分析中,請稍等...'):
        # Cosmetic progress animation (about one second in total); the actual
        # computation happens right after it.
        progress_text = "計算中..."
        my_bar = st.progress(0, text=progress_text)
        for percent_complete in range(100):
            time.sleep(0.01)
            my_bar.progress(percent_complete + 1, text=progress_text)

        similarity_score, result = calculate_similarity(text1, text2)

        # Always remove the bar, including when no score could be computed,
        # so it is never left stuck at 100%.
        my_bar.empty()

        if similarity_score is None:
            # calculate_similarity returns (None, None) for whitespace-only input.
            st.warning("⚠️ 請確認兩篇文章都有輸入文字內容")
        else:
            st.markdown("---")
            st.markdown("<h3 style='text-align: center;'>✨ 分析結果</h3>", unsafe_allow_html=True)

            # Build the HTML without leading indentation or blank lines:
            # Markdown renders lines indented four or more spaces as a code
            # block, which would show the raw tags instead of the styled text.
            result_html = (
                "<div style='text-align: center;'>"
                f"<p class='big-font'>相似度:{similarity_score}%</p>"
                f"<p class='result-font'>分析結果:{result}</p>"
                "</div>"
            )
            st.markdown(result_html, unsafe_allow_html=True)

            # Emoji matching the verdict band (same thresholds as the verdict text)
            if similarity_score <= 30:
                emoji = "😌"
            elif similarity_score <= 60:
                emoji = "🤔"
            elif similarity_score <= 80:
                emoji = "😮"
            else:
                emoji = "😱"
            st.markdown(f"<h1 style='text-align: center;'>{emoji}</h1>", unsafe_allow_html=True)
else:
    st.info('👆 請在上方輸入兩篇要比較的文章,然後點擊"開始計算相似度"按鈕')

# Footer: legend explaining the score bands.
st.markdown("---")
# Keep the HTML free of blank lines and leading indentation: in Markdown a
# blank line ends the <div> HTML block, and a following line indented four or
# more spaces is rendered as a code block rather than as HTML.
footer_html = (
    "<div style='text-align: center;'>"
    "<p style='color: gray; font-size: 14px;'>"
    "💡 判定標準:<br>"
    "0-30%:文章沒有關係 | 31-60%:稍有關係 | 61-80%:很類似 | 81-100%:疑似抄襲"
    "</p>"
    "</div>"
)
st.markdown(footer_html, unsafe_allow_html=True)