katsukiai committed on
Commit
f4fdb5d
·
verified ·
1 Parent(s): a7ef7e7

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -222
app.py DELETED
@@ -1,222 +0,0 @@
1
- import streamlit as st
2
- import arxiv
3
- import requests
4
- import os
5
- from pathlib import Path
6
- from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
7
- from huggingface_hub import login, HfApi
8
- import fitz # PyMuPDF
9
- import pandas as pd
10
- from collections import Counter
11
- import re
12
- import json
13
-
14
- # Constants
15
# Checkpoint used for the "Text-to-Text (FLAN-T5)" option.
MODEL_NAME = "google/flan-t5-large"
# Checkpoint used for the "Text Generation (BART)" option.
SECONDARY_MODEL = "facebook/bart-large-cnn"
# Access token read from the environment. Defaults to an empty string so that
# "no token configured" is falsy and is never mistaken for a real credential
# (the previous default was a truthy placeholder shaped like a repo id).
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
# Default Space id offered in the sidebar: the "unpaper" namespace when a
# token is configured, otherwise a placeholder the user is expected to edit.
SPACE_NAME = "unpaper/<name>" if HUGGINGFACE_TOKEN else "your_username/<name>"
# Hub endpoint queried for the model listing.
HF_API_URL = "https://huggingface.co/api/models"
20
-
21
- # CSS
22
# Page-wide style sheet: background colours plus the ``badge`` and ``warning``
# classes used by the HTML snippets rendered further down the script.
_CUSTOM_CSS = """
<style>
.main { background-color: #f5f5f5; }
.sidebar .sidebar-content { background-color: #ffffff; }
.badge {
background-color: #ff4b4b;
color: white;
padding: 5px 10px;
border-radius: 5px;
display: inline-block;
}
.warning {
background-color: #fff3cd;
color: #856404;
padding: 10px;
border-radius: 5px;
margin: 10px 0;
}
</style>
"""
# Raw HTML must be explicitly allowed for the <style> tag to be emitted.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
42
-
43
- # Sidebar
44
# Sidebar widgets: every value read here is a module-level global consumed by
# the functions and the main script below.
st.sidebar.title("arXiv Paper Converter")
st.sidebar.header("Settings")
arxiv_id = st.sidebar.text_input("Enter arXiv ID", "2407.21783")
upload_pdf = st.sidebar.file_uploader("Upload PDF", type="pdf")
space_name = st.sidebar.text_input("Hugging Face Space Name", SPACE_NAME)
# Do not place the environment token into the widget's value; fall back to it
# only when the user leaves the field empty.
token = st.sidebar.text_input("Hugging Face Token", type="password") or HUGGINGFACE_TOKEN
model_choice = st.sidebar.selectbox("Select Model", ["Text-to-Text (FLAN-T5)", "Text Generation (BART)"])

# Login to Hugging Face
if token:
    try:
        login(token=token)
    except Exception as e:
        # A bad or placeholder token must not abort the whole script; report
        # it and let the rest of the page render.
        st.sidebar.warning(f"Hugging Face login failed: {str(e)}")
55
-
56
- # Fetch available models from Hugging Face API
57
@st.cache_data(ttl=3600)
def fetch_hf_models():
    """Fetch the model listing from the Hugging Face Hub API.

    Uses the module-level ``token`` for authentication when one is set.
    The result (including a ``None`` failure result) is cached by
    ``st.cache_data`` for one hour.

    Returns:
        The decoded JSON payload when the API answers with HTTP 200,
        otherwise ``None`` after showing a warning in the UI.
    """
    # NOTE(review): the cache key does not include ``token``, so a token
    # entered later in the sidebar is not picked up until the TTL expires.
    # Only attach credentials when a token was actually provided.
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    try:
        # A timeout keeps a stalled connection from hanging the script run.
        response = requests.get(HF_API_URL, headers=headers, timeout=10)
    except requests.RequestException as e:
        st.warning(f"Error fetching models: {str(e)}. Using default models.")
        return None

    if response.status_code != 200:
        st.warning("Failed to fetch models from Hugging Face API. Using default models.")
        return None

    try:
        return response.json()
    except ValueError as e:
        # The body was not valid JSON.
        st.warning(f"Error fetching models: {str(e)}. Using default models.")
        return None


hf_models = fetch_hf_models()
71
-
72
- # Initialize models
73
@st.cache_resource
def load_models(choice=None):
    """Load the tokenizer, model and pipeline for the selected option.

    Args:
        choice: The sidebar label of the model to load. Passing it as an
            argument makes it part of the ``st.cache_resource`` key, so each
            option gets its own cached entry. When omitted, the module-level
            ``model_choice`` is used (the original call style).

    Returns:
        A ``(tokenizer, model, pipeline_model)`` tuple. The pipeline task is
        ``"text2text-generation"`` for the FLAN-T5 option and
        ``"summarization"`` otherwise.
    """
    if choice is None:
        choice = model_choice

    if choice == "Text-to-Text (FLAN-T5)":
        checkpoint, pipeline_task = MODEL_NAME, "text2text-generation"
    else:
        checkpoint, pipeline_task = SECONDARY_MODEL, "summarization"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipeline_model = pipeline(pipeline_task, model=model, tokenizer=tokenizer)
    return tokenizer, model, pipeline_model


# Pass the selection explicitly so switching models in the sidebar loads the
# matching pipeline instead of reusing whichever one was cached first.
tokenizer, model, pipeline_model = load_models(model_choice)
86
-
87
- # Functions
88
def fetch_arxiv_paper(paper_id):
    """Look up a single paper on arXiv by its identifier.

    Args:
        paper_id: The arXiv identifier to search for.

    Returns:
        The first result yielded by the arXiv client for that identifier.

    Raises:
        ValueError: If the search yields no result for ``paper_id``.
    """
    client = arxiv.Client()
    search = arxiv.Search(id_list=[paper_id])
    # Use a default so an empty result set does not leak StopIteration.
    paper = next(client.results(search), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for ID {paper_id!r}")
    return paper
93
-
94
def download_pdf(paper, filename):
    """Save the PDF of *paper* under *filename* and hand the name back.

    Args:
        paper: An object exposing ``download_pdf(filename=...)``.
        filename: Name passed through to ``paper.download_pdf``.

    Returns:
        ``filename``, unchanged.
    """
    target = filename
    paper.download_pdf(filename=target)
    return target
97
-
98
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The page texts joined in page order, with no separator added.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
104
-
105
def analyze_authors(text):
    """Count author names found on "Author:" / "Authors:" lines in *text*.

    Each matching line (case-insensitive) is split on commas and every
    non-empty, whitespace-stripped piece is counted as one author name.

    Args:
        text: The text to scan.

    Returns:
        A ``Counter`` mapping each author name to its number of occurrences,
        in order of first appearance. Empty when no byline is found.
    """
    author_pattern = r"Author[s]?:\s*(.+?)(?:\n|$)"
    author_counts = Counter()
    for author_line in re.findall(author_pattern, text, re.IGNORECASE):
        stripped_parts = (part.strip() for part in author_line.split(","))
        # Trailing or doubled commas leave empty fragments; they are not names.
        author_counts.update(name for name in stripped_parts if name)
    return author_counts
113
-
114
def process_text_with_model(text, task="summarize"):
    """Run the loaded pipeline on the first 1000 characters of *text*.

    Args:
        text: Source text; only ``text[:1000]`` is sent to the model.
        task: Instruction prefixed to the prompt for the FLAN-T5 option.
            Ignored for the other option, which receives the raw snippet.

    Returns:
        The text produced by the pipeline for the first (only) input.
    """
    snippet = text[:1000]
    if model_choice == "Text-to-Text (FLAN-T5)":
        prompt = f"{task} the following text: {snippet}"
        result = pipeline_model(prompt, max_length=512, num_beams=4)
    else:
        result = pipeline_model(snippet, max_length=512, min_length=30, do_sample=False)

    output = result[0]
    # Text2text pipelines report "generated_text" while summarization
    # pipelines report "summary_text"; accept whichever one is present.
    if "generated_text" in output:
        return output["generated_text"]
    return output["summary_text"]
121
-
122
def create_huggingface_space(space_name, metadata):
    """Create a public static Space and upload the analysis metadata to it.

    Args:
        space_name: Repo id of the Space to create.
        metadata: JSON-serialisable dict, uploaded as ``metadata.json``.

    Returns:
        The URL of the Space on success, or ``None`` after reporting the
        error in the UI if any Hub call fails.
    """
    api = HfApi()
    try:
        api.create_repo(repo_id=space_name, repo_type="space", space_sdk="static", private=False)
        # Upload metadata straight from memory: no fixed-name temp file in the
        # working directory, so nothing to clean up or to share across runs.
        metadata_bytes = json.dumps(metadata, indent=2).encode("utf-8")
        api.upload_file(
            path_or_fileobj=metadata_bytes,
            path_in_repo="metadata.json",
            repo_id=space_name,
            repo_type="space"
        )
        # The README is read from the working directory; skip it when absent
        # so a Space that was already created is not reported as a failure.
        if os.path.exists("README.md"):
            api.upload_file(
                path_or_fileobj="README.md",
                path_in_repo="README.md",
                repo_id=space_name,
                repo_type="space"
            )
        else:
            st.warning("README.md not found; the space was created without it.")
        return f"https://huggingface.co/spaces/{space_name}"
    except Exception as e:
        st.error(f"Failed to create space: {str(e)}")
        return None
148
-
149
- # Main App
150
st.title("arXiv Paper to Hugging Face Space Converter")
st.markdown("<div class='badge'>Beta Community - Open Discussion in Community Tab</div>", unsafe_allow_html=True)

# Warning about model usage
st.markdown("""
<div class='warning'>
<strong>Warning:</strong> Ensure you have proper permissions to use selected models.
Model outputs are stored in metadata and will be publicly visible in the space.
</div>
""", unsafe_allow_html=True)

# Process arXiv or PDF
if arxiv_id or upload_pdf:
    # ``paper`` stays None for uploads. The arXiv ID field is pre-filled, so
    # its truthiness cannot be used to tell the two input paths apart.
    paper = None
    pdf_path = "temp.pdf"
    try:
        if upload_pdf:
            # An uploaded file takes precedence over the arXiv ID.
            with open(pdf_path, "wb") as f:
                f.write(upload_pdf.getbuffer())
        else:
            paper = fetch_arxiv_paper(arxiv_id)
            download_pdf(paper, pdf_path)

        # Extract and analyze
        text = extract_text_from_pdf(pdf_path)
    finally:
        # Cleanup: the PDF is only needed for text extraction, and must be
        # removed even when fetching or parsing fails.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

    author_analysis = analyze_authors(text)

    # Model processing
    summary = process_text_with_model(text, "summarize")
    key_points = process_text_with_model(text, "extract key points" if model_choice == "Text-to-Text (FLAN-T5)" else "summarize")

    # Display results
    st.header("Paper Analysis")
    st.subheader("Authors")
    st.dataframe(pd.DataFrame.from_dict(author_analysis, orient='index', columns=['Count']))

    st.subheader("AI Analysis")
    st.write("Summary:", summary)
    st.write("Key Points:", key_points)

    # Enhanced metadata
    metadata = {
        "title": paper.title if paper is not None else "Uploaded PDF",
        "authors": list(author_analysis.keys()),
        # Only record the arXiv ID when the paper actually came from arXiv.
        "arxiv_id": arxiv_id if paper is not None else "N/A",
        "model_analysis": {
            "summary": summary,
            "key_points": key_points,
            "model_used": model_choice,
            "model_name": MODEL_NAME if model_choice == "Text-to-Text (FLAN-T5)" else SECONDARY_MODEL,
            "model_license": "Check model card on Hugging Face",
            "processing_date": pd.Timestamp.now().isoformat()
        },
        "warnings": {
            "model_usage": "Ensure proper model licensing",
            "content_visibility": "All outputs will be public in space",
            "data_source": "Verify arXiv/paper permissions"
        }
    }

    # Create Space
    if st.button("Create Hugging Face Space"):
        space_url = create_huggingface_space(space_name, metadata)
        if space_url:
            st.success(f"Space created: {space_url}")
            st.markdown(f"""
<a href="{space_url}" target="_blank">
<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
alt="Hugging Face Space" width="150">
</a>
""", unsafe_allow_html=True)