darwincb committed on
Commit
4b392a8
Β·
1 Parent(s): ab4eb68

Add Jan v1 Research Assistant with web scraping, multi-source analysis, and entity extraction

Browse files
Files changed (2) hide show
  1. app.py +406 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Jan v1 Research Assistant for Hugging Face Spaces
3
+ Optimized for research tasks and source analysis
4
+ """
5
+
6
+ import gradio as gr
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer
8
+ import torch
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
+ import json
12
+ from datetime import datetime
13
+ from typing import List, Dict, Optional
14
+ import hashlib
15
+
16
+ # Initialize model
17
+ print("πŸš€ Loading Jan v1 model...")
18
+ model_name = "janhq/Jan-v1-4B"
19
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
20
+ model = AutoModelForCausalLM.from_pretrained(
21
+ model_name,
22
+ torch_dtype=torch.bfloat16,
23
+ device_map="auto",
24
+ load_in_8bit=True # Reduce memory usage
25
+ )
26
+ print("βœ… Model loaded successfully!")
27
+
28
+ # Cache for responses
29
+ response_cache = {}
30
+
31
def get_cache_key(query: str, context: str) -> str:
    """Derive a deterministic cache key for a query/context pair.

    The two strings are joined with a '|' separator and hashed with MD5,
    yielding the 32-character hex digest used as the response-cache key.
    """
    return hashlib.md5(f"{query}|{context}".encode()).hexdigest()
35
+
36
def scrape_url(url: str) -> str:
    """Fetch a web page and return its visible text, truncated to 4000 chars.

    On any failure (network error, timeout, HTTP error status) returns an
    "Error scraping URL: ..." string instead of raising, so callers can show
    the failure directly in the UI.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        # Fail fast on 4xx/5xx instead of silently parsing an error page
        # and presenting it to the model as real source material.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Script/style contents are not human-readable text; drop them.
        for element in soup(["script", "style"]):
            element.decompose()

        # Collapse all runs of whitespace to single spaces.
        text = ' '.join(soup.get_text().split())

        return text[:4000]  # Limit so scraped text fits in the model prompt
    except Exception as e:
        return f"Error scraping URL: {str(e)}"
57
+
58
def research_assistant(
    query: str,
    context: str = "",
    temperature: float = 0.6,
    use_cache: bool = True,
    research_mode: str = "comprehensive"
) -> str:
    """
    Run one research query through Jan v1 and return the generated analysis.

    Args:
        query: The research question or task.
        context: Optional source material (already-scraped text).
        temperature: Sampling temperature; lower is more factual.
        use_cache: Reuse a previous answer for an identical query+context.
        research_mode: "comprehensive", "fact_extraction", or
            "source_comparison"; any other value falls back to a quick summary.

    Returns:
        The generated text (prefixed with "📌 [Cached] " on a cache hit).
    """
    # Check cache first — identical query+context pairs are answered once.
    cache_key = get_cache_key(query, context)
    if use_cache and cache_key in response_cache:
        return "📌 [Cached] " + response_cache[cache_key]

    # Build prompt based on research mode
    if research_mode == "comprehensive":
        prompt = f"""You are an expert research analyst. Provide comprehensive analysis.

Context/Sources:
{context if context else "No specific context provided"}

Research Query:
{query}

Provide your analysis with:
1. Key Findings & Insights
2. Supporting Evidence
3. Critical Analysis
4. Confidence Level
5. Suggested Follow-up Questions
6. Potential Limitations

Analysis:"""

    elif research_mode == "fact_extraction":
        prompt = f"""Extract and verify factual information.

Source Material:
{context}

Task: {query}

Extract:
- Factual claims with confidence scores (0-100%)
- Key entities and relationships
- Dates, numbers, and statistics
- Contradictions or inconsistencies

Facts:"""

    elif research_mode == "source_comparison":
        prompt = f"""Compare and contrast multiple sources.

Sources:
{context}

Comparison Task: {query}

Analyze:
- Common themes
- Contradictions
- Unique perspectives
- Reliability assessment
- Synthesis

Comparison:"""

    else:  # quick_summary
        prompt = f"""Provide a quick summary.

Content: {context}
Task: {query}

Summary:"""

    # Tokenize; truncate oversized prompts so prompt + generation fit the model.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    # Move tensors to the model's device — with device_map="auto" the model may
    # not be on CPU, and generate() requires inputs on the same device.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=temperature,
            top_p=0.95,
            top_k=20,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. String-replacing the prompt out
    # of the full decode is fragile: decode() does not always reproduce the
    # prompt byte-for-byte, which would leave prompt text in the response.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Cache the response
    if use_cache:
        response_cache[cache_key] = response

    return response
157
+
158
def process_multiple_sources(sources_text: str, query: str, temperature: float = 0.6) -> str:
    """Analyze up to five sources (one per line; URLs or raw text) together.

    Each non-empty line is scraped if it looks like a URL, otherwise used
    verbatim; every source is truncated to 800 chars, labelled, and the
    combined context is analyzed in "source_comparison" mode.

    Returns the model's comparison, or "No valid sources provided" when the
    input contains no usable lines.
    """
    # Filter out blank lines BEFORE applying the 5-source cap, so empty lines
    # between sources do not silently consume source slots.
    sources = [line.strip() for line in sources_text.strip().split('\n') if line.strip()]
    if not sources:
        return "No valid sources provided"

    combined_context = ""
    for source_count, source in enumerate(sources[:5], start=1):  # Limit to 5 sources
        if source.startswith('http'):
            content = scrape_url(source)
            combined_context += f"\n\n--- Source {source_count} (URL: {source[:50]}...) ---\n{content[:800]}"
        else:
            combined_context += f"\n\n--- Source {source_count} (Text) ---\n{source[:800]}"

    return research_assistant(
        query=query,
        context=combined_context,
        temperature=temperature,
        research_mode="source_comparison"
    )
185
+
186
def extract_entities(text: str) -> str:
    """Pull people, organizations, locations, dates, and key concepts from *text*.

    Delegates to the research assistant in fact-extraction mode with a low
    temperature so the output stays close to the source material.
    """
    entity_query = "Extract all people, organizations, locations, dates, and key concepts"
    return research_assistant(
        query=entity_query,
        context=text,
        research_mode="fact_extraction",
        temperature=0.3,
    )
194
+
195
def generate_research_questions(topic: str, context: str = "") -> str:
    """Produce ten concrete, actionable research questions about *topic*.

    Runs in comprehensive mode with a higher temperature to encourage varied,
    open-ended questions; *context* can narrow the focus.
    """
    question_task = f"Generate 10 specific, actionable research questions about: {topic}"
    return research_assistant(
        query=question_task,
        context=context,
        research_mode="comprehensive",
        temperature=0.7,
    )
203
+
204
# Create Gradio interface.
# Tabs map one-to-one onto the helper functions above; each button wires its
# tab's inputs to one handler and streams the result into an output textbox.
with gr.Blocks(title="Jan v1 Research Assistant", theme=gr.themes.Soft()) as demo:
    # Header / feature overview shown above all tabs.
    gr.Markdown("""
    # 🔬 Jan v1 Research Assistant

    Powered by Jan-v1-4B (91.1% accuracy) - Optimized for research and analysis

    ### Features:
    - 🌐 Web scraping and analysis
    - 📊 Multi-source comparison
    - 🔍 Entity extraction
    - ❓ Research question generation
    - 💾 Response caching
    """)

    # Tab 1: analyze a single pasted text or URL via research_assistant().
    with gr.Tab("Single Source Analysis"):
        with gr.Row():
            with gr.Column():
                single_query = gr.Textbox(
                    label="Research Query",
                    placeholder="What would you like to research?",
                    lines=2
                )
                single_context = gr.Textbox(
                    label="Context (paste text or URL)",
                    placeholder="Paste article text or enter URL to analyze",
                    lines=5
                )
                # NOTE(review): "source_comparison" is intentionally absent here
                # (it has its own tab) — confirm that is the desired UX.
                single_mode = gr.Radio(
                    ["comprehensive", "fact_extraction", "quick_summary"],
                    label="Analysis Mode",
                    value="comprehensive"
                )
                single_temp = gr.Slider(0.1, 1.0, value=0.6, label="Temperature")
                single_cache = gr.Checkbox(label="Use cache", value=True)
                single_btn = gr.Button("🔍 Analyze", variant="primary")

            with gr.Column():
                single_output = gr.Textbox(
                    label="Analysis Results",
                    lines=15
                )

        def analyze_single(query, context, mode, temp, cache):
            # A context that starts with "http" is treated as a URL and
            # scraped before analysis; anything else is used verbatim.
            if context.startswith('http'):
                context = scrape_url(context)

            return research_assistant(
                query=query,
                context=context,
                temperature=temp,
                use_cache=cache,
                research_mode=mode
            )

        single_btn.click(
            analyze_single,
            inputs=[single_query, single_context, single_mode, single_temp, single_cache],
            outputs=single_output
        )

    # Tab 2: compare up to five sources via process_multiple_sources().
    with gr.Tab("Multi-Source Comparison"):
        with gr.Row():
            with gr.Column():
                multi_sources = gr.Textbox(
                    label="Sources (one per line, URLs or text)",
                    placeholder="https://example.com/article1\nhttps://example.com/article2\nOr paste text directly",
                    lines=6
                )
                multi_query = gr.Textbox(
                    label="Comparison Query",
                    placeholder="What aspects should I compare?",
                    lines=2
                )
                multi_temp = gr.Slider(0.1, 1.0, value=0.6, label="Temperature")
                multi_btn = gr.Button("🔄 Compare Sources", variant="primary")

            with gr.Column():
                multi_output = gr.Textbox(
                    label="Comparison Results",
                    lines=15
                )

        multi_btn.click(
            process_multiple_sources,
            inputs=[multi_sources, multi_query, multi_temp],
            outputs=multi_output
        )

    # Tab 3: entity extraction from pasted text or a URL.
    with gr.Tab("Entity Extraction"):
        with gr.Row():
            with gr.Column():
                entity_input = gr.Textbox(
                    label="Text or URL",
                    placeholder="Paste text or URL to extract entities from",
                    lines=8
                )
                entity_btn = gr.Button("🏷️ Extract Entities", variant="primary")

            with gr.Column():
                entity_output = gr.Textbox(
                    label="Extracted Entities",
                    lines=10
                )

        def extract_entities_wrapper(text):
            # Same URL-vs-text convention as analyze_single above.
            if text.startswith('http'):
                text = scrape_url(text)
            return extract_entities(text)

        entity_btn.click(
            extract_entities_wrapper,
            inputs=entity_input,
            outputs=entity_output
        )

    # Tab 4: generate research questions for a topic.
    with gr.Tab("Research Question Generator"):
        with gr.Row():
            with gr.Column():
                rq_topic = gr.Textbox(
                    label="Research Topic",
                    placeholder="Enter your research topic",
                    lines=2
                )
                rq_context = gr.Textbox(
                    label="Additional Context (optional)",
                    placeholder="Any specific focus areas or constraints",
                    lines=4
                )
                rq_btn = gr.Button("💡 Generate Questions", variant="primary")

            with gr.Column():
                rq_output = gr.Textbox(
                    label="Research Questions",
                    lines=12
                )

        rq_btn.click(
            generate_research_questions,
            inputs=[rq_topic, rq_context],
            outputs=rq_output
        )

    # Tab 5: static documentation showing how to call this Space over HTTP.
    with gr.Tab("API Integration"):
        gr.Markdown("""
        ### 🔌 Integrate with your Research App

        Once deployed, you can call this Space via API:

        ```javascript
        // JavaScript/TypeScript
        const response = await fetch('https://[your-username].hf.space/api/predict', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                data: [
                    "Your research query",
                    "Context or URL",
                    "comprehensive", // mode
                    0.6, // temperature
                    true // use cache
                ]
            })
        });
        const result = await response.json();
        ```

        ```python
        # Python
        import requests

        response = requests.post(
            'https://[your-username].hf.space/api/predict',
            json={
                "data": [
                    "Your research query",
                    "Context or URL",
                    "comprehensive",
                    0.6,
                    True
                ]
            }
        )
        result = response.json()
        ```
        """)

    # Footer tips shown beneath all tabs.
    gr.Markdown("""
    ---
    ### 💡 Tips:
    - Lower temperature (0.1-0.3) for factual extraction
    - Higher temperature (0.7-0.9) for creative research questions
    - Cache is cleared when Space restarts
    - URLs are automatically scraped and analyzed
    """)
400
+
401
if __name__ == "__main__":
    # Bind to all interfaces on 7860, the standard Hugging Face Spaces port;
    # no public share link is needed since the Space itself is the host.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Jan v1 Research Assistant Requirements
2
+ transformers==4.36.2
3
+ torch==2.1.2
4
+ gradio==4.19.2
5
+ accelerate==0.25.0
6
+ bitsandbytes==0.42.0
7
+ sentencepiece==0.1.99
8
+ beautifulsoup4==4.12.3
9
+ requests==2.31.0
10
+ lxml==5.1.0