import streamlit as st import numpy as np import pandas as pd import torch from transformers import AutoTokenizer, AutoModelForSequenceClassification @st.cache_resource def pipeline_getter(): tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased') model = AutoModelForSequenceClassification.from_pretrained('KemmerEdition/my-distill-classifier') mapping = pd.read_csv('./categories.csv').values.squeeze() return tokenizer, model, mapping tokenizer, model, mapping = pipeline_getter() def predict_article_categories_with_confidence( text_data, abstract_text=None, confidence_level=0.95, max_categories=9 ): tokenized_input = tokenizer( text=text_data, text_pair=abstract_text, padding=True, truncation=True, return_tensors='pt' ) model_output = model(**tokenized_input) logits = model_output.logits probs = torch.sigmoid(logits).detach().numpy().flatten() sorted_indices = np.argsort(probs)[::-1] sorted_probs = probs[sorted_indices] cumulative_probs = np.cumsum(sorted_probs) selected_indices = [] for i, cum_prob in enumerate(cumulative_probs): if cum_prob >= confidence_level or i >= max_categories - 1: selected_indices = sorted_indices[:i+1] break result = { 'probabilities': probs, 'predicted_categories': [mapping[idx] for idx in selected_indices], 'confidence': cumulative_probs[len(selected_indices)-1], 'top_category': mapping[sorted_indices[0]], 'used_categories': len(selected_indices) } return result st.markdown(""" """, unsafe_allow_html=True) st.markdown('', unsafe_allow_html=True) with st.container(): st.markdown('

', unsafe_allow_html=True) title_input = st.text_input('**Here you can write title:**', placeholder="e.g. Quantum Machine Learning Approaches") abstract_input = st.text_area('**Here you can write summary from arxiv:**', placeholder="Paste the abstract here for more accurate categorization...", height=150) st.markdown('

', unsafe_allow_html=True) col1, col2 = st.columns(2) with col1: confidence_level = st.slider('**Confidence level (%)**', 80, 100, 95) with col2: max_categories = st.slider('**Maximum categories**', 1, 10, 3) if st.button('**Press F (just press)**', type="primary"): if len(title_input) > 0: with st.spinner('Analyzing paper content...'): result = predict_article_categories_with_confidence( title_input, abstract_input if abstract_input else None, confidence_level=confidence_level/100, max_categories=max_categories ) with st.container(): st.markdown('

', unsafe_allow_html=True) st.subheader("Categorization Results") st.markdown(f"**Most likely category:**") st.markdown(f'

{result["top_category"]} (p={result["probabilities"][np.argmax(result["probabilities"])]:.3f})

', unsafe_allow_html=True) if len(result["predicted_categories"]) > 1: st.markdown(f"Additional categories:") for category in result["predicted_categories"][1:]: st.markdown(f'

{category}

', unsafe_allow_html=True) st.markdown("---") else: st.warning("Please enter at least the paper title")