Spaces:

KemmerEdition
/

arxive_classifier

Running

App Files Files Community

KemmerEdition committed 17 days ago

Commit

d81b5f5

1 Parent(s): 5721366

add lab materials

Browse files

Files changed (3) hide show

app.py +131 -0
categories.csv +10 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import streamlit as st
+import numpy as np
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+@st.cache_resource
+def pipeline_getter():
+    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
+    model = AutoModelForSequenceClassification.from_pretrained('KemmerEdition/my-distill-classifier')
+    mapping = pd.read_csv('./categories.csv').values.squeeze()
+    return tokenizer, model, mapping
+tokenizer, model, mapping = pipeline_getter()
+def predict_article_categories_with_confidence(
+    text_data,
+    abstract_text=None,
+    confidence_level=0.95,
+    max_categories=9
+):
+    tokenized_input = tokenizer(
+        text=text_data,
+        text_pair=abstract_text,
+        padding=True,
+        truncation=True,
+        return_tensors='pt'
+    )
+    model_output = model(**tokenized_input)
+    logits = model_output.logits
+    probs = torch.sigmoid(logits).detach().numpy().flatten()
+    sorted_indices = np.argsort(probs)[::-1]
+    sorted_probs = probs[sorted_indices]
+    cumulative_probs = np.cumsum(sorted_probs)
+    selected_indices = []
+    for i, cum_prob in enumerate(cumulative_probs):
+        if cum_prob >= confidence_level or i >= max_categories - 1:
+            selected_indices = sorted_indices[:i+1]
+            break
+    result = {
+        'probabilities': probs,
+        'predicted_categories': [mapping[idx] for idx in selected_indices],
+        'confidence': cumulative_probs[len(selected_indices)-1],
+        'top_category': mapping[sorted_indices[0]],
+        'used_categories': len(selected_indices)
+    }
+    return result
+st.markdown("""
+<style>
+    .header {
+        font-size: 36px !important;
+        color: #1f77b4;
+        margin-bottom: 20px;
+    }
+    .input-box {
+        background-color: #f0f2f6;
+        padding: 20px;
+        border-radius: 10px;
+        margin-bottom: 20px;
+    }
+    .result-box {
+        background-color: #e6f3ff;
+        padding: 20px;
+        border-radius: 10px;
+        margin-top: 20px;
+    }
+    .category-badge {
+        display: inline-block;
+        background-color: #1f77b4;
+        color: white;
+        padding: 5px 10px;
+        margin: 5px;
+        border-radius: 15px;
+        font-size: 14px;
+    }
+</style>
+""", unsafe_allow_html=True)
+st.markdown('<div class="header">Classificator of Paper from arxiv</div>', unsafe_allow_html=True)
+with st.container():
+    st.markdown('<div class="input-box">', unsafe_allow_html=True)
+    title_input = st.text_input('**Here you can write title:**', placeholder="e.g. Quantum Machine Learning Approaches")
+    abstract_input = st.text_area('**Here you can write summary from arxiv:**',
+                                placeholder="Paste the abstract here for more accurate categorization...",
+                                height=150)
+    st.markdown('</div>', unsafe_allow_html=True)
+    col1, col2 = st.columns(2)
+    with col1:
+        confidence_level = st.slider('**Confidence level (%)**', 80, 100, 95)
+    with col2:
+        max_categories = st.slider('**Maximum categories**', 1, 10, 3)
+if st.button('**Press F (just press)**', type="primary"):
+    if len(title_input) > 0:
+        with st.spinner('Analyzing paper content...'):
+            result = predict_article_categories_with_confidence(
+                title_input,
+                abstract_input if abstract_input else None,
+                confidence_level=confidence_level/100,
+                max_categories=max_categories
+            )
+        with st.container():
+            st.markdown('<div class="result-box">', unsafe_allow_html=True)
+            st.subheader("Categorization Results")
+            st.markdown(f"**Most likely category:**")
+            st.markdown(f'<div class="category-badge">{result["top_category"]} (p={result["probabilities"][np.argmax(result["probabilities"])]:.3f})</div>',
+                       unsafe_allow_html=True)
+            if len(result["predicted_categories"]) > 1:
+                st.markdown(f"Additional categories:")
+                for category in result["predicted_categories"][1:]:
+                    st.markdown(f'<div class="category-badge">{category}</div>', unsafe_allow_html=True)
+            st.markdown("---")
+    else:
+        st.warning("Please enter at least the paper title")

categories.csv ADDED Viewed

	@@ -0,0 +1,10 @@

+category
+cs
+econ
+eess
+math
+nlin
+physics
+q-bio
+q-fin
+stat

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+transformers==4.47.0
+torch==2.2.2
+pandas==2.2.2
+numpy==1.26.4