""" | |
Jan v1 Research Assistant - WITH REAL WEB SEARCH | |
""" | |
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import requests
from bs4 import BeautifulSoup
import json
import urllib.parse
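
# Assumed environment notes (not pinned by the original script): BeautifulSoup's
# 'xml' parser used for arXiv below needs lxml, load_in_4bit=True needs
# bitsandbytes, and device_map="auto" needs accelerate.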

# Initialize model with error handling
print("Loading Jan v1...")
model_name = "janhq/Jan-v1-4B"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_4bit=True,  # 4-bit quantization; needs bitsandbytes (a failure here triggers the except fallback)
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )
    print("Jan v1 loaded!")
    model_loaded = True
except Exception as e:
    print(f"Error loading Jan v1: {e}")
    print("Using simplified fallback...")
    # Simple fallback: research_with_sources() returns raw sources when the model is unavailable
    tokenizer = None
    model = None
    model_loaded = False

class RealWebSearch:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
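
    # NOTE: scraping Google/Bing result pages is brittle - the CSS classes
    # targeted below ('g', 'aCOpRe', 'b_algo', ...) change without notice,
    # which is why several fallback methods are tried in order.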
    def search_web(self, query, num_results=3):
        """Real web search using multiple methods"""
        results = []

        # Method 1: Try Google Search (via scraping)
        try:
            search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
            response = self.session.get(search_url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find search results
            search_divs = soup.find_all('div', class_='g')[:num_results]
            for div in search_divs:
                title_elem = div.find('h3')
                link_elem = div.find('a')
                snippet_elem = div.find('span', class_='aCOpRe') or div.find('span', class_='st')

                if title_elem and link_elem:
                    results.append({
                        'title': title_elem.get_text(),
                        'body': snippet_elem.get_text() if snippet_elem else "No snippet available",
                        'url': link_elem.get('href', '#')
                    })

            if results:
                print(f"Found {len(results)} real Google results")
                return results
        except Exception as e:
            print(f"Google search failed: {e}")
        # Method 2: Try Bing Search
        try:
            bing_url = f"https://www.bing.com/search?q={urllib.parse.quote(query)}"
            response = self.session.get(bing_url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find Bing results
            for li in soup.find_all('li', class_='b_algo')[:num_results]:
                h2 = li.find('h2')
                if h2:
                    link = h2.find('a')
                    snippet = li.find('p')
                    if link:
                        results.append({
                            'title': link.get_text(),
                            'body': snippet.get_text() if snippet else "No description",
                            'url': link.get('href', '#')
                        })

            if results:
                print(f"Found {len(results)} real Bing results")
                return results
        except Exception as e:
            print(f"Bing search failed: {e}")
        # Method 3: Try Wikipedia API
        try:
            wiki_url = f"https://en.wikipedia.org/w/api.php?action=opensearch&search={urllib.parse.quote(query)}&limit={num_results}&format=json"
            response = self.session.get(wiki_url, timeout=5)
            data = response.json()

            # opensearch returns [query, titles, descriptions, urls]
            if len(data) >= 4:
                titles = data[1]
                descriptions = data[2]
                urls = data[3]
                for i in range(min(len(titles), num_results)):
                    results.append({
                        'title': titles[i],
                        'body': descriptions[i] if i < len(descriptions) else "Wikipedia article",
                        'url': urls[i] if i < len(urls) else f"https://en.wikipedia.org/wiki/{titles[i].replace(' ', '_')}"
                    })

            if results:
                print(f"Found {len(results)} real Wikipedia results")
                return results
        except Exception as e:
            print(f"Wikipedia search failed: {e}")
        # Method 4: Try arXiv for academic queries
        if "research" in query.lower() or "paper" in query.lower() or "study" in query.lower():
            try:
                arxiv_url = f"http://export.arxiv.org/api/query?search_query=all:{urllib.parse.quote(query)}&max_results={num_results}"
                response = self.session.get(arxiv_url, timeout=5)
                # The 'xml' parser requires lxml to be installed
                soup = BeautifulSoup(response.text, 'xml')

                for entry in soup.find_all('entry')[:num_results]:
                    title = entry.find('title')
                    summary = entry.find('summary')
                    link = entry.find('id')
                    if title and link:
                        results.append({
                            'title': title.get_text().strip(),
                            'body': summary.get_text()[:200].strip() if summary else "Academic paper",
                            'url': link.get_text().strip()
                        })

                if results:
                    print(f"Found {len(results)} real arXiv results")
                    return results
            except Exception as e:
                print(f"arXiv search failed: {e}")
        # If all methods fail, return a message
        print("All search methods failed, returning fallback")
        return [{
            'title': f"Search for: {query}",
            'body': "Unable to fetch real-time results. Please try a different query or check your connection.",
            'url': f"https://www.google.com/search?q={urllib.parse.quote(query)}"
        }]
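
# A minimal standalone sketch of RealWebSearch usage (hypothetical, not part
# of the Gradio app - the query text is illustrative only):
#
#     engine = RealWebSearch()
#     for hit in engine.search_web("quantum computing breakthroughs", num_results=2):
#         print(hit['title'], '->', hit['url'])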

def research_with_sources(query, temperature=0.5):
    """Research with REAL web sources"""
    if not query:
        return "Please enter a research query"

    print(f"Researching: {query}")

    # Get REAL search results
    search_engine = RealWebSearch()
    results = search_engine.search_web(query, 3)

    # Build context from real sources
    sources_text = ""
    citations = []
    for i, result in enumerate(results):
        sources_text += f"[{i+1}] {result['title']}: {result['body']}\n"
        citations.append(f"[{i+1}] {result['title']}\n {result['url']}")
    # Fall back gracefully when Jan v1 could not be loaded (tokenizer/model are None)
    if not model_loaded:
        return "Jan v1 is unavailable - showing raw sources only:\n\n" + "\n\n".join(citations)

    # Generate analysis with Jan v1
    prompt = f"""Based on these sources, analyze: {query}
Sources:
{sources_text}
Provide comprehensive analysis with key findings and implications:"""

    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=400,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    analysis = response.replace(prompt, "").strip()

    # Format with REAL sources
    result = f"{analysis}\n\n" + "=" * 50 + "\nREAL SOURCES:\n\n"
    for citation in citations:
        result += citation + "\n\n"
    return result
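
# Shape of the returned text (illustrative, based on the formatting above):
#     <model analysis>
#     ==================================================
#     REAL SOURCES:
#     [1] Source title
#      https://source.url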

# Create interface
with gr.Blocks(title="Jan v1 Research - REAL Sources", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Jan v1 Research Assistant - WITH REAL WEB SEARCH

    **Now with REAL sources from Google, Bing, Wikipedia, and arXiv!**

    Powered by Jan v1 (4B params) - Like Perplexity but FREE
    """)
    with gr.Row():
        with gr.Column():
            query_input = gr.Textbox(
                label="Research Query",
                placeholder="Enter any topic to research with real sources...",
                lines=2
            )
            temp_slider = gr.Slider(0.1, 0.9, value=0.5, label="Temperature")
            search_btn = gr.Button("Research with REAL Sources", variant="primary")

        with gr.Column():
            output = gr.Textbox(
                label="Analysis with Real Sources",
                lines=20,
                show_copy_button=True
            )

    search_btn.click(
        research_with_sources,
        inputs=[query_input, temp_slider],
        outputs=output
    )
    gr.Examples(
        examples=[
            ["latest AI developments 2024", 0.5],
            ["quantum computing breakthroughs", 0.6],
            ["climate change solutions", 0.5],
            ["Chinese microdrama trends", 0.6]
        ],
        inputs=[query_input, temp_slider]
    )

if __name__ == "__main__":
    demo.launch()
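
# Hypothetical local run (outside Hugging Face Spaces), assuming the file is
# saved as app.py:
#     pip install gradio transformers torch requests beautifulsoup4 lxml accelerate bitsandbytes
#     python app.py   # Gradio serves the UI on http://127.0.0.1:7860 by default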