katsukiai committed
Commit 1f53a43 · verified · 1 Parent(s): eb0aef7

Update app.py

Files changed (1)
  1. app.py +104 -34
app.py CHANGED
@@ -3,18 +3,20 @@ import arxiv
 import requests
 import os
 from pathlib import Path
-from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
+from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
 from huggingface_hub import login, HfApi
 import fitz # PyMuPDF
 import pandas as pd
 from collections import Counter
 import re
+import json
 
 # Constants
-MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
-SECONDARY_MODEL = "distilbert-base-uncased"
+MODEL_NAME = "google/flan-t5-large"
+SECONDARY_MODEL = "facebook/bart-large-cnn"
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "your_username/<name>")
 SPACE_NAME = f"unpaper/<name>" if not HUGGINGFACE_TOKEN.startswith("your_username") else f"your_username/<name>"
+HF_API_URL = "https://huggingface.co/api/models"
 
 # CSS
 st.markdown("""
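For orientation, the two new default checkpoints behave quite differently: google/flan-t5-large is an instruction-following text-to-text model, while facebook/bart-large-cnn is a summarizer fine-tuned on CNN/DailyMail. A minimal sketch of each (not part of the commit; the sample input is illustrative and both models are downloaded on first use):

from transformers import pipeline

# Instruction-style prompting; output comes back under "generated_text".
t5 = pipeline("text2text-generation", model="google/flan-t5-large")
print(t5("summarize: <a few paragraphs of paper text>", max_length=60)[0]["generated_text"])

# Plain summarization; output comes back under "summary_text".
bart = pipeline("summarization", model="facebook/bart-large-cnn")
print(bart("<a few paragraphs of paper text>", max_length=60, min_length=10)[0]["summary_text"])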
@@ -28,6 +30,13 @@ st.markdown("""
     border-radius: 5px;
     display: inline-block;
 }
+.warning {
+    background-color: #fff3cd;
+    color: #856404;
+    padding: 10px;
+    border-radius: 5px;
+    margin: 10px 0;
+}
 </style>
 """, unsafe_allow_html=True)
 
@@ -38,20 +47,42 @@ arxiv_id = st.sidebar.text_input("Enter arXiv ID", "2407.21783")
 upload_pdf = st.sidebar.file_uploader("Upload PDF", type="pdf")
 space_name = st.sidebar.text_input("Hugging Face Space Name", SPACE_NAME)
 token = st.sidebar.text_input("Hugging Face Token", HUGGINGFACE_TOKEN, type="password")
+model_choice = st.sidebar.selectbox("Select Model", ["Text-to-Text (FLAN-T5)", "Text Generation (BART)"])
 
 # Login to Hugging Face
 if token:
     login(token=token)
 
+# Fetch available models from Hugging Face API
+@st.cache_data(ttl=3600)
+def fetch_hf_models():
+    try:
+        response = requests.get(HF_API_URL, headers={"Authorization": f"Bearer {token}"})
+        if response.status_code == 200:
+            return response.json()
+        else:
+            st.warning("Failed to fetch models from Hugging Face API. Using default models.")
+            return None
+    except Exception as e:
+        st.warning(f"Error fetching models: {str(e)}. Using default models.")
+        return None
+
+hf_models = fetch_hf_models()
+
 # Initialize models
 @st.cache_resource
 def load_models():
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
-    secondary_model = pipeline("text-classification", model=SECONDARY_MODEL)
-    return tokenizer, model, secondary_model
+    if model_choice == "Text-to-Text (FLAN-T5)":
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+        pipeline_model = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(SECONDARY_MODEL)
+        model = AutoModelForSeq2SeqLM.from_pretrained(SECONDARY_MODEL)
+        pipeline_model = pipeline("summarization", model=model, tokenizer=tokenizer)
+    return tokenizer, model, pipeline_model
 
-tokenizer, model, secondary_model = load_models()
+tokenizer, model, pipeline_model = load_models()
 
 # Functions
 def fetch_arxiv_paper(paper_id):
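One caveat in load_models() above: st.cache_resource keys its cache on the function's arguments, and the function takes none while reading model_choice from module scope, so switching the selectbox after the first run keeps serving the originally cached model. A sketch of a variant that avoids this by passing the choice in explicitly (an alternative pattern, not what the commit does):

@st.cache_resource
def load_models(choice):
    # Cache is keyed on `choice`, so each selectbox option gets its own cached model.
    name = MODEL_NAME if choice == "Text-to-Text (FLAN-T5)" else SECONDARY_MODEL
    task = "text2text-generation" if choice == "Text-to-Text (FLAN-T5)" else "summarization"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSeq2SeqLM.from_pretrained(name)
    return tokenizer, model, pipeline(task, model=model, tokenizer=tokenizer)

tokenizer, model, pipeline_model = load_models(model_choice)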
@@ -80,27 +111,53 @@ def analyze_authors(text):
         author_list.extend([name.strip() for name in names])
     return Counter(author_list)
 
-def process_text_with_models(text):
-    inputs = tokenizer(text[:512], return_tensors="pt", truncation=True)
-    outputs = model(**inputs)
-    secondary_results = secondary_model(text[:512])
-    return outputs, secondary_results
+def process_text_with_model(text, task="summarize"):
+    if model_choice == "Text-to-Text (FLAN-T5)":
+        prompt = f"{task} the following text: {text[:1000]}"
+        result = pipeline_model(prompt, max_length=512, num_beams=4)
+    else:
+        result = pipeline_model(text[:1000], max_length=512, min_length=30, do_sample=False)
+    return result[0]['generated_text' if model_choice == "Text-to-Text (FLAN-T5)" else 'summary_text']
 
 def create_huggingface_space(space_name, metadata):
     api = HfApi()
-    api.create_repo(repo_id=space_name, repo_type="space", space_sdk="static", private=False)
-    api.upload_file(
-        path_or_fileobj="README.md",
-        path_in_repo="README.md",
-        repo_id=space_name,
-        repo_type="space"
-    )
-    return f"https://huggingface.co/spaces/{space_name}"
+    try:
+        api.create_repo(repo_id=space_name, repo_type="space", space_sdk="static", private=False)
+        # Upload metadata
+        with open("metadata.json", "w") as f:
+            json.dump(metadata, f, indent=2)
+        api.upload_file(
+            path_or_fileobj="metadata.json",
+            path_in_repo="metadata.json",
+            repo_id=space_name,
+            repo_type="space"
+        )
+        api.upload_file(
+            path_or_fileobj="README.md",
+            path_in_repo="README.md",
+            repo_id=space_name,
+            repo_type="space"
+        )
+        return f"https://huggingface.co/spaces/{space_name}"
+    except Exception as e:
+        st.error(f"Failed to create space: {str(e)}")
+        return None
+    finally:
+        if os.path.exists("metadata.json"):
+            os.remove("metadata.json")
 
 # Main App
 st.title("arXiv Paper to Hugging Face Space Converter")
 st.markdown("<div class='badge'>Beta Community - Open Discussion in Community Tab</div>", unsafe_allow_html=True)
 
+# Warning about model usage
+st.markdown("""
+<div class='warning'>
+<strong>Warning:</strong> Ensure you have proper permissions to use selected models.
+Model outputs are stored in metadata and will be publicly visible in the space.
+</div>
+""", unsafe_allow_html=True)
+
 # Process arXiv or PDF
 if arxiv_id or upload_pdf:
     if upload_pdf:
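Since create_huggingface_space() creates the repo with space_sdk="static", the resulting Space serves static HTML, and this commit only uploads metadata.json and README.md (the README is expected to exist in the working directory). A landing page could be pushed the same way; a sketch under that assumption, with a hypothetical helper and illustrative file content:

def upload_index_html(api, space_name, title):
    # Write a bare-bones landing page and upload it next to metadata.json.
    html = f"<!doctype html><html><body><h1>{title}</h1><p>See metadata.json for the analysis.</p></body></html>"
    with open("index.html", "w") as f:
        f.write(html)
    api.upload_file(path_or_fileobj="index.html", path_in_repo="index.html",
                    repo_id=space_name, repo_type="space")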
@@ -114,7 +171,10 @@ if arxiv_id or upload_pdf:
     # Extract and analyze
     text = extract_text_from_pdf(pdf_path)
     author_analysis = analyze_authors(text)
-    model_outputs, secondary_outputs = process_text_with_models(text)
+
+    # Model processing
+    summary = process_text_with_model(text, "summarize")
+    key_points = process_text_with_model(text, "extract key points" if model_choice == "Text-to-Text (FLAN-T5)" else "summarize")
 
     # Display results
     st.header("Paper Analysis")
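Note that process_text_with_model() only sees text[:1000], so for a full paper the summary and key points reflect roughly the first page. A rough way to cover more of the document (a sketch, not part of the commit; the helper name is hypothetical) is to summarize fixed-size chunks and then summarize the combined result:

def summarize_long_text(text, chunk_size=1000):
    # Summarize each chunk, then summarize the concatenation of the partial summaries.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    partial = [process_text_with_model(chunk, "summarize") for chunk in chunks]
    return process_text_with_model(" ".join(partial), "summarize")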
@@ -122,30 +182,40 @@ if arxiv_id or upload_pdf:
     st.dataframe(pd.DataFrame.from_dict(author_analysis, orient='index', columns=['Count']))
 
     st.subheader("AI Analysis")
-    st.write("Primary Model Outputs:", model_outputs.logits)
-    st.write("Secondary Model Outputs:", secondary_outputs)
+    st.write("Summary:", summary)
+    st.write("Key Points:", key_points)
 
-    # Metadata
+    # Enhanced metadata
     metadata = {
         "title": paper.title if arxiv_id else "Uploaded PDF",
         "authors": list(author_analysis.keys()),
         "arxiv_id": arxiv_id if arxiv_id else "N/A",
         "model_analysis": {
-            "primary": str(model_outputs.logits),
-            "secondary": str(secondary_outputs)
+            "summary": summary,
+            "key_points": key_points,
+            "model_used": model_choice,
+            "model_name": MODEL_NAME if model_choice == "Text-to-Text (FLAN-T5)" else SECONDARY_MODEL,
+            "model_license": "Check model card on Hugging Face",
+            "processing_date": pd.Timestamp.now().isoformat()
+        },
+        "warnings": {
+            "model_usage": "Ensure proper model licensing",
+            "content_visibility": "All outputs will be public in space",
+            "data_source": "Verify arXiv/paper permissions"
         }
     }
 
     # Create Space
     if st.button("Create Hugging Face Space"):
        space_url = create_huggingface_space(space_name, metadata)
-        st.success(f"Space created: {space_url}")
-        st.markdown(f"""
-        <a href="{space_url}" target="_blank">
-            <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
-                 alt="Hugging Face Space" width="150">
-        </a>
-        """, unsafe_allow_html=True)
+        if space_url:
+            st.success(f"Space created: {space_url}")
+            st.markdown(f"""
+            <a href="{space_url}" target="_blank">
+                <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
+                     alt="Hugging Face Space" width="150">
+            </a>
+            """, unsafe_allow_html=True)
 
     # Cleanup
     if os.path.exists("temp.pdf"):
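Because the Space is created public, the uploaded metadata.json can be read back with the Hub client once it exists; a sketch (the repo id is a placeholder):

import json
from huggingface_hub import hf_hub_download

# Download metadata.json from the Space repo and load it.
path = hf_hub_download(repo_id="your_username/<name>", filename="metadata.json", repo_type="space")
with open(path) as f:
    metadata = json.load(f)
print(metadata["model_analysis"]["summary"])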
 