# Hugging Face Space (status when this file was captured: Paused)
"""
Jan v1 Research Assistant for Hugging Face Spaces.

Optimized for research tasks and source analysis.
"""
import gradio as gr | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
import torch | |
import requests | |
from bs4 import BeautifulSoup | |
import json | |
from datetime import datetime | |
from typing import List, Dict, Optional | |
import hashlib | |
# Initialize model
print("π Loading Jan v1 model...")
model_name = "janhq/Jan-v1-4B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Keyword arguments shared by the quantized and unquantized load paths.
_model_load_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
if torch.cuda.is_available():
    # Passing `load_in_8bit=True` directly to `from_pretrained` is deprecated;
    # the supported spelling is a BitsAndBytesConfig in `quantization_config`.
    # Imported here so the file's top-level import block does not need to change.
    from transformers import BitsAndBytesConfig

    # Reduce memory usage
    _model_load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
# NOTE(review): without CUDA the model is loaded unquantized, on the assumption
# that bitsandbytes 8-bit loading needs a GPU -- confirm for the target hardware.
model = AutoModelForCausalLM.from_pretrained(model_name, **_model_load_kwargs)
print("β Model loaded successfully!")

# Cache for responses: maps a cache-key digest to the generated response text.
response_cache = {}
def get_cache_key(query: str, context: str) -> str:
    """Return a stable hex digest identifying a (query, context) pair.

    The query is length-prefixed before being joined with the context so that
    two different pairs can never serialise to the same string. A bare ``|``
    separator would make ``("a|b", "c")`` and ``("a", "b|c")`` collide.

    Args:
        query: The research question.
        context: Source text supplied with the question (may be empty).

    Returns:
        A 32-character hexadecimal MD5 digest. MD5 serves only as a compact
        fingerprint for dictionary keys here, not as a security measure.
    """
    combined = f"{len(query)}:{query}|{context}"
    return hashlib.md5(combined.encode("utf-8")).hexdigest()
def scrape_url(url: str, max_chars: int = 4000, timeout: float = 10) -> str:
    """Download a web page and return its text with whitespace collapsed.

    Failures are reported in the return value rather than raised: every error
    path returns a string starting with ``"Error scraping URL:"``.

    Args:
        url: Address to fetch. Must be an absolute ``http://`` or ``https://``
            URL; surrounding whitespace is ignored.
        max_chars: Maximum number of characters of page text to return.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text with ``<script>``/``<style>`` content removed and all
        whitespace runs collapsed to single spaces, truncated to ``max_chars``;
        or an ``"Error scraping URL: ..."`` message.
    """
    # Local import keeps this block self-contained (no change to file imports).
    from urllib.parse import urlparse

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        target = (url or "").strip()
        parsed = urlparse(target)
        # Reject anything that is not a plain web URL before touching the
        # network (e.g. "ftp://...", "file://...", or text starting with "http").
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            return "Error scraping URL: unsupported URL (expected http:// or https://)"

        # NOTE(security): the URL is user-supplied and fetched server-side, so
        # it can still point at internal network addresses. Restrict hosts here
        # if this app runs anywhere that matters.
        response = requests.get(target, headers=headers, timeout=timeout)
        # Without this, 4xx/5xx error pages would be returned as page content.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove script and style elements
        for tag in soup(["script", "style"]):
            tag.decompose()

        # str.split() with no argument splits on every whitespace run
        # (newlines, tabs, non-breaking spaces), so one join normalises it all.
        text = ' '.join(soup.get_text().split())
        return text[:max_chars]
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as text instead
        # of raising.
        return f"Error scraping URL: {str(e)}"
# Maximum number of responses kept in ``response_cache``; the oldest entry is
# evicted first so a long-running process cannot grow the cache without bound.
_MAX_CACHED_RESPONSES = 256

# Prompt templates, filled with ``str.format(context=..., query=...)``.
_COMPREHENSIVE_PROMPT = """You are an expert research analyst. Provide comprehensive analysis.
Context/Sources:
{context}
Research Query:
{query}
Provide your analysis with:
1. Key Findings & Insights
2. Supporting Evidence
3. Critical Analysis
4. Confidence Level
5. Suggested Follow-up Questions
6. Potential Limitations
Analysis:"""

_FACT_EXTRACTION_PROMPT = """Extract and verify factual information.
Source Material:
{context}
Task: {query}
Extract:
- Factual claims with confidence scores (0-100%)
- Key entities and relationships
- Dates, numbers, and statistics
- Contradictions or inconsistencies
Facts:"""

_SOURCE_COMPARISON_PROMPT = """Compare and contrast multiple sources.
Sources:
{context}
Comparison Task: {query}
Analyze:
- Common themes
- Contradictions
- Unique perspectives
- Reliability assessment
- Synthesis
Comparison:"""

_QUICK_SUMMARY_PROMPT = """Provide a quick summary.
Content: {context}
Task: {query}
Summary:"""

_PROMPT_TEMPLATES = {
    "comprehensive": _COMPREHENSIVE_PROMPT,
    "fact_extraction": _FACT_EXTRACTION_PROMPT,
    "source_comparison": _SOURCE_COMPARISON_PROMPT,
}


def _build_prompt(query: str, context: str, research_mode: str) -> str:
    """Fill the prompt template for ``research_mode`` (unknown modes -> quick summary)."""
    if research_mode == "comprehensive" and not context:
        context = "No specific context provided"
    template = _PROMPT_TEMPLATES.get(research_mode, _QUICK_SUMMARY_PROMPT)
    return template.format(context=context, query=query)


def research_assistant(
    query: str,
    context: str = "",
    temperature: float = 0.6,
    use_cache: bool = True,
    research_mode: str = "comprehensive"
) -> str:
    """Run one research prompt through the model and return the generated text.

    Args:
        query: The question or task for the model.
        context: Source material inserted into the prompt (may be empty).
        temperature: Sampling temperature. A value of 0 (or less) switches to
            greedy decoding, because sampling requires a positive temperature.
        use_cache: When true, return a previously generated answer for the same
            (mode, query, context) if one exists, and store new answers.
        research_mode: ``"comprehensive"``, ``"fact_extraction"`` or
            ``"source_comparison"``; any other value produces a quick summary.

    Returns:
        The model's answer with the prompt removed. Cache hits are prefixed
        with a "[Cached]" marker.
    """
    query = query or ""
    context = context or ""

    # The mode is folded into the key: the same query/context under another
    # mode uses a different prompt and must not reuse that mode's answer.
    cache_key = get_cache_key(query, f"[mode={research_mode}]\n{context}")
    if use_cache:
        cached = response_cache.get(cache_key)
        if cached is not None:
            return "π [Cached] " + cached

    prompt = _build_prompt(query, context, research_mode)

    # Tokenize and generate. Inputs must live on the same device as the model,
    # which `device_map="auto"` may have placed on a GPU.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = inputs.to(model.device)

    generation_kwargs = {
        "max_new_tokens": 1024,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature and temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=0.95,
            top_k=20,
        )
    else:
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Drop the prompt by token position. Decoding everything and string-
    # replacing the prompt fails whenever the decoded text differs from the
    # prompt (for example after truncation to max_length).
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Cache the response
    if use_cache:
        if cache_key not in response_cache and len(response_cache) >= _MAX_CACHED_RESPONSES:
            # Dicts preserve insertion order, so the first key is the oldest.
            response_cache.pop(next(iter(response_cache)))
        response_cache[cache_key] = response

    return response
def process_multiple_sources(sources_text: str, query: str, temperature: float = 0.6) -> str:
    """Compare up to five sources, given one per line as URLs or plain text.

    Each non-blank line is one source. A line that is a single ``http://`` or
    ``https://`` token is fetched with ``scrape_url``; any other line is used
    as literal text. At most 800 characters of each source are used. The
    combined material is passed to ``research_assistant`` in
    ``source_comparison`` mode.

    Args:
        sources_text: Newline-separated sources; blank lines are ignored.
        query: What to compare across the sources.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The comparison text, or ``"No valid sources provided"`` when there is
        no non-blank source line.
    """
    max_sources = 5
    max_chars_per_source = 800
    max_url_label = 50

    # Drop blank lines BEFORE applying the limit, so empty lines between
    # sources do not use up the five available slots.
    stripped_lines = (line.strip() for line in (sources_text or "").splitlines())
    sources = [line for line in stripped_lines if line][:max_sources]
    if not sources:
        return "No valid sources provided"

    sections = []
    for source_count, source in enumerate(sources, start=1):
        # Only a lone URL token is fetched; prose that merely starts with
        # "http" stays text.
        is_url = source.startswith(('http://', 'https://')) and len(source.split()) == 1
        if is_url:
            content = scrape_url(source)
            # Add the ellipsis only when the URL was actually shortened.
            label = source if len(source) <= max_url_label else f"{source[:max_url_label]}..."
            sections.append(
                f"\n\n--- Source {source_count} (URL: {label}) ---\n{content[:max_chars_per_source]}"
            )
        else:
            sections.append(
                f"\n\n--- Source {source_count} (Text) ---\n{source[:max_chars_per_source]}"
            )

    return research_assistant(
        query=query,
        context="".join(sections),
        temperature=temperature,
        research_mode="source_comparison"
    )
def extract_entities(text: str) -> str:
    """Ask the model to list the entities found in ``text``.

    Delegates to ``research_assistant`` in ``fact_extraction`` mode with a
    fixed instruction and a temperature of 0.3.
    """
    request = {
        "query": "Extract all people, organizations, locations, dates, and key concepts",
        "context": text,
        "temperature": 0.3,
        "research_mode": "fact_extraction",
    }
    return research_assistant(**request)
def generate_research_questions(topic: str, context: str = "") -> str:
    """Ask the model for ten research questions about ``topic``.

    Delegates to ``research_assistant`` in ``comprehensive`` mode with a
    temperature of 0.7; ``context`` is forwarded unchanged.
    """
    instruction = f"Generate 10 specific, actionable research questions about: {topic}"
    request = {
        "query": instruction,
        "context": context,
        "temperature": 0.7,
        "research_mode": "comprehensive",
    }
    return research_assistant(**request)
# Create Gradio interface

# Markdown shown at the top of the page.
_HEADER_MARKDOWN = """
# π¬ Jan v1 Research Assistant
Powered by Jan-v1-4B (91.1% accuracy) - Optimized for research and analysis
### Features:
- π Web scraping and analysis
- π Multi-source comparison
- π Entity extraction
- β Research question generation
- πΎ Response caching
"""

# Markdown shown in the "API Integration" tab. The example payload order
# (query, context, mode, temperature, use cache) matches `analyze_single`.
_API_MARKDOWN = """
### π Integrate with your Research App
Once deployed, you can call this Space via API:
```javascript
// JavaScript/TypeScript
const response = await fetch('https://[your-username].hf.space/api/predict', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
data: [
"Your research query",
"Context or URL",
"comprehensive", // mode
0.6, // temperature
true // use cache
]
})
});
const result = await response.json();
```
```python
# Python
import requests
response = requests.post(
'https://[your-username].hf.space/api/predict',
json={
"data": [
"Your research query",
"Context or URL",
"comprehensive",
0.6,
True
]
}
)
result = response.json()
```
"""

# Markdown footer with usage tips.
_TIPS_MARKDOWN = """
---
### π‘ Tips:
- Lower temperature (0.1-0.3) for factual extraction
- Higher temperature (0.7-0.9) for creative research questions
- Cache is cleared when Space restarts
- URLs are automatically scraped and analyzed
"""


def _resolve_context(text):
    """Return scraped page text when ``text`` is a single URL, else ``text``.

    Input counts as a URL only when, after trimming whitespace, it is one
    token starting with ``http://`` or ``https://``. Pasted prose that merely
    begins with "http" is kept as text, and a URL with stray leading
    whitespace is still recognised.
    """
    candidate = (text or "").strip()
    is_single_url = (
        candidate.startswith(("http://", "https://")) and len(candidate.split()) == 1
    )
    if is_single_url:
        return scrape_url(candidate)
    return text or ""


with gr.Blocks(title="Jan v1 Research Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_HEADER_MARKDOWN)

    with gr.Tab("Single Source Analysis"):
        with gr.Row():
            with gr.Column():
                single_query = gr.Textbox(
                    label="Research Query",
                    placeholder="What would you like to research?",
                    lines=2
                )
                single_context = gr.Textbox(
                    label="Context (paste text or URL)",
                    placeholder="Paste article text or enter URL to analyze",
                    lines=5
                )
                single_mode = gr.Radio(
                    ["comprehensive", "fact_extraction", "quick_summary"],
                    label="Analysis Mode",
                    value="comprehensive"
                )
                single_temp = gr.Slider(0.1, 1.0, value=0.6, label="Temperature")
                single_cache = gr.Checkbox(label="Use cache", value=True)
                single_btn = gr.Button("π Analyze", variant="primary")
            with gr.Column():
                single_output = gr.Textbox(
                    label="Analysis Results",
                    lines=15
                )

        def analyze_single(query, context, mode, temp, cache):
            """Analyze one source; a lone URL in ``context`` is scraped first."""
            return research_assistant(
                query=query,
                context=_resolve_context(context),
                temperature=temp,
                use_cache=cache,
                research_mode=mode
            )

        # `api_name="predict"` binds this handler to the endpoint name used in
        # the API examples; without an explicit name, no handler is registered
        # as "predict".
        single_btn.click(
            analyze_single,
            inputs=[single_query, single_context, single_mode, single_temp, single_cache],
            outputs=single_output,
            api_name="predict"
        )

    with gr.Tab("Multi-Source Comparison"):
        with gr.Row():
            with gr.Column():
                multi_sources = gr.Textbox(
                    label="Sources (one per line, URLs or text)",
                    placeholder="https://example.com/article1\nhttps://example.com/article2\nOr paste text directly",
                    lines=6
                )
                multi_query = gr.Textbox(
                    label="Comparison Query",
                    placeholder="What aspects should I compare?",
                    lines=2
                )
                multi_temp = gr.Slider(0.1, 1.0, value=0.6, label="Temperature")
                multi_btn = gr.Button("π Compare Sources", variant="primary")
            with gr.Column():
                multi_output = gr.Textbox(
                    label="Comparison Results",
                    lines=15
                )

        multi_btn.click(
            process_multiple_sources,
            inputs=[multi_sources, multi_query, multi_temp],
            outputs=multi_output
        )

    with gr.Tab("Entity Extraction"):
        with gr.Row():
            with gr.Column():
                entity_input = gr.Textbox(
                    label="Text or URL",
                    placeholder="Paste text or URL to extract entities from",
                    lines=8
                )
                entity_btn = gr.Button("π·οΈ Extract Entities", variant="primary")
            with gr.Column():
                entity_output = gr.Textbox(
                    label="Extracted Entities",
                    lines=10
                )

        def extract_entities_wrapper(text):
            """Extract entities from ``text``; a lone URL is scraped first."""
            return extract_entities(_resolve_context(text))

        entity_btn.click(
            extract_entities_wrapper,
            inputs=entity_input,
            outputs=entity_output
        )

    with gr.Tab("Research Question Generator"):
        with gr.Row():
            with gr.Column():
                rq_topic = gr.Textbox(
                    label="Research Topic",
                    placeholder="Enter your research topic",
                    lines=2
                )
                rq_context = gr.Textbox(
                    label="Additional Context (optional)",
                    placeholder="Any specific focus areas or constraints",
                    lines=4
                )
                rq_btn = gr.Button("π‘ Generate Questions", variant="primary")
            with gr.Column():
                rq_output = gr.Textbox(
                    label="Research Questions",
                    lines=12
                )

        rq_btn.click(
            generate_research_questions,
            inputs=[rq_topic, rq_context],
            outputs=rq_output
        )

    with gr.Tab("API Integration"):
        gr.Markdown(_API_MARKDOWN)

    # NOTE(review): nesting was reconstructed from syntax; the tips footer is
    # assumed to sit below the tabs rather than inside the API tab -- confirm.
    gr.Markdown(_TIPS_MARKDOWN)
if __name__ == "__main__":
    # Serve on all interfaces at port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)