katsukiai committed
Commit 1f53a43 · verified · 1 Parent(s): eb0aef7

Update app.py

Files changed (1)
  1. app.py +104 -34
app.py CHANGED
@@ -3,18 +3,20 @@ import arxiv
 import requests
 import os
 from pathlib import Path
-from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
+from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
 from huggingface_hub import login, HfApi
 import fitz # PyMuPDF
 import pandas as pd
 from collections import Counter
 import re
+import json
 
 # Constants
-MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
-SECONDARY_MODEL = "distilbert-base-uncased"
+MODEL_NAME = "google/flan-t5-large"
+SECONDARY_MODEL = "facebook/bart-large-cnn"
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "your_username/<name>")
 SPACE_NAME = f"unpaper/<name>" if not HUGGINGFACE_TOKEN.startswith("your_username") else f"your_username/<name>"
+HF_API_URL = "https://huggingface.co/api/models"
 
 # CSS
 st.markdown("""
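For orientation, the two new default checkpoints behave quite differently: google/flan-t5-large is an instruction-following text-to-text model, while facebook/bart-large-cnn is a summarizer fine-tuned on CNN/DailyMail. A minimal sketch of each (not part of the commit; the sample input is illustrative and both models are downloaded on first use):

from transformers import pipeline

# Instruction-style prompting; output comes back under "generated_text".
t5 = pipeline("text2text-generation", model="google/flan-t5-large")
print(t5("summarize: <a few paragraphs of paper text>", max_length=60)[0]["generated_text"])

# Plain summarization; output comes back under "summary_text".
bart = pipeline("summarization", model="facebook/bart-large-cnn")
print(bart("<a few paragraphs of paper text>", max_length=60, min_length=10)[0]["summary_text"])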
@@ -28,6 +30,13 @@ st.markdown("""
     border-radius: 5px;
     display: inline-block;
 }
+.warning {
+    background-color: #fff3cd;
+    color: #856404;
+    padding: 10px;
+    border-radius: 5px;
+    margin: 10px 0;
+}
 </style>
 """, unsafe_allow_html=True)
 
@@ -38,20 +47,42 @@ arxiv_id = st.sidebar.text_input("Enter arXiv ID", "2407.21783")
 upload_pdf = st.sidebar.file_uploader("Upload PDF", type="pdf")
 space_name = st.sidebar.text_input("Hugging Face Space Name", SPACE_NAME)
 token = st.sidebar.text_input("Hugging Face Token", HUGGINGFACE_TOKEN, type="password")
+model_choice = st.sidebar.selectbox("Select Model", ["Text-to-Text (FLAN-T5)", "Text Generation (BART)"])
 
 # Login to Hugging Face
 if token:
     login(token=token)
 
+# Fetch available models from Hugging Face API
+@st.cache_data(ttl=3600)
+def fetch_hf_models():
+    try:
+        response = requests.get(HF_API_URL, headers={"Authorization": f"Bearer {token}"})
+        if response.status_code == 200:
+            return response.json()
+        else:
+            st.warning("Failed to fetch models from Hugging Face API. Using default models.")
+            return None
+    except Exception as e:
+        st.warning(f"Error fetching models: {str(e)}. Using default models.")
+        return None
+
+hf_models = fetch_hf_models()
+
 # Initialize models
 @st.cache_resource
 def load_models():
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
-    secondary_model = pipeline("text-classification", model=SECONDARY_MODEL)
-    return tokenizer, model, secondary_model
+    if model_choice == "Text-to-Text (FLAN-T5)":
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+        pipeline_model = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(SECONDARY_MODEL)
+        model = AutoModelForSeq2SeqLM.from_pretrained(SECONDARY_MODEL)
+        pipeline_model = pipeline("summarization", model=model, tokenizer=tokenizer)
+    return tokenizer, model, pipeline_model
 
-tokenizer, model, secondary_model = load_models()
+tokenizer, model, pipeline_model = load_models()
 
 # Functions
 def fetch_arxiv_paper(paper_id):
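One caveat in load_models() above: st.cache_resource keys its cache on the function's arguments, and the function takes none while reading model_choice from module scope, so switching the selectbox after the first run keeps serving the originally cached model. A sketch of a variant that avoids this by passing the choice in explicitly (an alternative pattern, not what the commit does):

@st.cache_resource
def load_models(choice):
    # Cache is keyed on `choice`, so each selectbox option gets its own cached model.
    name = MODEL_NAME if choice == "Text-to-Text (FLAN-T5)" else SECONDARY_MODEL
    task = "text2text-generation" if choice == "Text-to-Text (FLAN-T5)" else "summarization"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSeq2SeqLM.from_pretrained(name)
    return tokenizer, model, pipeline(task, model=model, tokenizer=tokenizer)

tokenizer, model, pipeline_model = load_models(model_choice)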
@@ -80,27 +111,53 @@ def analyze_authors(text):
         author_list.extend([name.strip() for name in names])
     return Counter(author_list)
 
-def process_text_with_models(text):
-    inputs = tokenizer(text[:512], return_tensors="pt", truncation=True)
-    outputs = model(**inputs)
-    secondary_results = secondary_model(text[:512])
-    return outputs, secondary_results
+def process_text_with_model(text, task="summarize"):
+    if model_choice == "Text-to-Text (FLAN-T5)":
+        prompt = f"{task} the following text: {text[:1000]}"
+        result = pipeline_model(prompt, max_length=512, num_beams=4)
+    else:
+        result = pipeline_model(text[:1000], max_length=512, min_length=30, do_sample=False)
+    return result[0]['generated_text' if model_choice == "Text-to-Text (FLAN-T5)" else 'summary_text']
 
 def create_huggingface_space(space_name, metadata):
     api = HfApi()
-    api.create_repo(repo_id=space_name, repo_type="space", space_sdk="static", private=False)
-    api.upload_file(
-        path_or_fileobj="README.md",
-        path_in_repo="README.md",
-        repo_id=space_name,
-        repo_type="space"
-    )
-    return f"https://huggingface.co/spaces/{space_name}"
+    try:
+        api.create_repo(repo_id=space_name, repo_type="space", space_sdk="static", private=False)
+        # Upload metadata
+        with open("metadata.json", "w") as f:
+            json.dump(metadata, f, indent=2)
+        api.upload_file(
+            path_or_fileobj="metadata.json",
+            path_in_repo="metadata.json",
+            repo_id=space_name,
+            repo_type="space"
+        )
+        api.upload_file(
+            path_or_fileobj="README.md",
+            path_in_repo="README.md",
+            repo_id=space_name,
+            repo_type="space"
+        )
+        return f"https://huggingface.co/spaces/{space_name}"
+    except Exception as e:
+        st.error(f"Failed to create space: {str(e)}")
+        return None
+    finally:
+        if os.path.exists("metadata.json"):
+            os.remove("metadata.json")
 
 # Main App
 st.title("arXiv Paper to Hugging Face Space Converter")
 st.markdown("<div class='badge'>Beta Community - Open Discussion in Community Tab</div>", unsafe_allow_html=True)
 
+# Warning about model usage
+st.markdown("""
+<div class='warning'>
+<strong>Warning:</strong> Ensure you have proper permissions to use selected models.
+Model outputs are stored in metadata and will be publicly visible in the space.
+</div>
+""", unsafe_allow_html=True)
+
 # Process arXiv or PDF
 if arxiv_id or upload_pdf:
     if upload_pdf:
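Since create_huggingface_space() creates the repo with space_sdk="static", the resulting Space serves static HTML, and this commit only uploads metadata.json and README.md (the README is expected to exist in the working directory). A landing page could be pushed the same way; a sketch under that assumption, with a hypothetical helper and illustrative file content:

def upload_index_html(api, space_name, title):
    # Write a bare-bones landing page and upload it next to metadata.json.
    html = f"<!doctype html><html><body><h1>{title}</h1><p>See metadata.json for the analysis.</p></body></html>"
    with open("index.html", "w") as f:
        f.write(html)
    api.upload_file(path_or_fileobj="index.html", path_in_repo="index.html",
                    repo_id=space_name, repo_type="space")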
@@ -114,7 +171,10 @@ if arxiv_id or upload_pdf:
     # Extract and analyze
     text = extract_text_from_pdf(pdf_path)
     author_analysis = analyze_authors(text)
-    model_outputs, secondary_outputs = process_text_with_models(text)
+
+    # Model processing
+    summary = process_text_with_model(text, "summarize")
+    key_points = process_text_with_model(text, "extract key points" if model_choice == "Text-to-Text (FLAN-T5)" else "summarize")
 
     # Display results
     st.header("Paper Analysis")
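Note that process_text_with_model() only sees text[:1000], so for a full paper the summary and key points reflect roughly the first page. A rough way to cover more of the document (a sketch, not part of the commit; the helper name is hypothetical) is to summarize fixed-size chunks and then summarize the combined result:

def summarize_long_text(text, chunk_size=1000):
    # Summarize each chunk, then summarize the concatenation of the partial summaries.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    partial = [process_text_with_model(chunk, "summarize") for chunk in chunks]
    return process_text_with_model(" ".join(partial), "summarize")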
@@ -122,30 +182,40 @@ if arxiv_id or upload_pdf:
     st.dataframe(pd.DataFrame.from_dict(author_analysis, orient='index', columns=['Count']))
 
     st.subheader("AI Analysis")
-    st.write("Primary Model Outputs:", model_outputs.logits)
-    st.write("Secondary Model Outputs:", secondary_outputs)
+    st.write("Summary:", summary)
+    st.write("Key Points:", key_points)
 
-    # Metadata
+    # Enhanced metadata
     metadata = {
         "title": paper.title if arxiv_id else "Uploaded PDF",
         "authors": list(author_analysis.keys()),
         "arxiv_id": arxiv_id if arxiv_id else "N/A",
         "model_analysis": {
-            "primary": str(model_outputs.logits),
-            "secondary": str(secondary_outputs)
+            "summary": summary,
+            "key_points": key_points,
+            "model_used": model_choice,
+            "model_name": MODEL_NAME if model_choice == "Text-to-Text (FLAN-T5)" else SECONDARY_MODEL,
+            "model_license": "Check model card on Hugging Face",
+            "processing_date": pd.Timestamp.now().isoformat()
+        },
+        "warnings": {
+            "model_usage": "Ensure proper model licensing",
+            "content_visibility": "All outputs will be public in space",
+            "data_source": "Verify arXiv/paper permissions"
         }
     }
 
     # Create Space
     if st.button("Create Hugging Face Space"):
        space_url = create_huggingface_space(space_name, metadata)
-        st.success(f"Space created: {space_url}")
-        st.markdown(f"""
-        <a href="{space_url}" target="_blank">
-            <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
-                 alt="Hugging Face Space" width="150">
-        </a>
-        """, unsafe_allow_html=True)
+        if space_url:
+            st.success(f"Space created: {space_url}")
+            st.markdown(f"""
+            <a href="{space_url}" target="_blank">
+                <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
+                     alt="Hugging Face Space" width="150">
+            </a>
+            """, unsafe_allow_html=True)
 
     # Cleanup
     if os.path.exists("temp.pdf"):
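Because the Space is created public, the uploaded metadata.json can be read back with the Hub client once it exists; a sketch (the repo id is a placeholder):

import json
from huggingface_hub import hf_hub_download

# Download metadata.json from the Space repo and load it.
path = hf_hub_download(repo_id="your_username/<name>", filename="metadata.json", repo_type="space")
with open(path) as f:
    metadata = json.load(f)
print(metadata["model_analysis"]["summary"])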
 