Spaces:
🔍 REAL WEB SEARCH: Google, Bing, Wikipedia, arXiv - No more fake URLs!
app.py
CHANGED
@@ -1,5 +1,5 @@
@@ -7,52 +7,180 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -61,24 +189,55 @@ def fast_research(query, temperature=0.4):

"""
Jan v1 Research Assistant - WITH REAL WEB SEARCH
"""

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import requests
from bs4 import BeautifulSoup
import json
import urllib.parse

# Initialize model
print("🚀 Loading Jan v1...")
model_name = "janhq/Jan-v1-4B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_4bit=True,
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

print("✅ Jan v1 loaded!")
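
# Note: recent transformers releases deprecate passing load_in_4bit directly
# to from_pretrained; a minimal sketch of the BitsAndBytesConfig equivalent,
# assuming the bitsandbytes package is installed on the Space:
#
#   from transformers import BitsAndBytesConfig
#
#   quant_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_compute_dtype=torch.float16
#   )
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name,
#       quantization_config=quant_config,
#       device_map="auto",
#       trust_remote_code=True
#   )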

class RealWebSearch:
    def __init__(self):
        # One shared session with a browser User-Agent so the search
        # engines serve regular HTML instead of rejecting the request
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def search_web(self, query, num_results=3):
        """Real web search using multiple methods"""
        results = []

        # Method 1: Try Google Search (via scraping)
        # (Google often blocks scrapers or serves a consent page, which is
        # why the Bing/Wikipedia/arXiv fallbacks below exist)
        try:
            search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
            response = self.session.get(search_url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find search results
            search_divs = soup.find_all('div', class_='g')[:num_results]

            for div in search_divs:
                title_elem = div.find('h3')
                link_elem = div.find('a')
                snippet_elem = div.find('span', class_='aCOpRe') or div.find('span', class_='st')

                if title_elem and link_elem:
                    results.append({
                        'title': title_elem.get_text(),
                        'body': snippet_elem.get_text() if snippet_elem else "No snippet available",
                        'url': link_elem.get('href', '#')
                    })

            if results:
                print(f"✅ Found {len(results)} real Google results")
                return results
        except Exception as e:
            print(f"Google search failed: {e}")

        # Method 2: Try Bing Search
        try:
            bing_url = f"https://www.bing.com/search?q={urllib.parse.quote(query)}"
            response = self.session.get(bing_url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find Bing results
            for li in soup.find_all('li', class_='b_algo')[:num_results]:
                h2 = li.find('h2')
                if h2:
                    link = h2.find('a')
                    snippet = li.find('p')

                    if link:
                        results.append({
                            'title': link.get_text(),
                            'body': snippet.get_text() if snippet else "No description",
                            'url': link.get('href', '#')
                        })

            if results:
                print(f"✅ Found {len(results)} real Bing results")
                return results
        except Exception as e:
            print(f"Bing search failed: {e}")

        # Method 3: Try Wikipedia API
        # (opensearch responses have the shape [query, [titles], [descriptions], [urls]])
        try:
            wiki_url = f"https://en.wikipedia.org/w/api.php?action=opensearch&search={urllib.parse.quote(query)}&limit={num_results}&format=json"
            response = self.session.get(wiki_url, timeout=5)
            data = response.json()

            if len(data) >= 4:
                titles = data[1]
                descriptions = data[2]
                urls = data[3]

                for i in range(min(len(titles), num_results)):
                    results.append({
                        'title': titles[i],
                        'body': descriptions[i] if i < len(descriptions) else "Wikipedia article",
                        'url': urls[i] if i < len(urls) else f"https://en.wikipedia.org/wiki/{titles[i].replace(' ', '_')}"
                    })

            if results:
                print(f"✅ Found {len(results)} real Wikipedia results")
                return results
        except Exception as e:
            print(f"Wikipedia search failed: {e}")

        # Method 4: Try arXiv for academic queries
        if "research" in query.lower() or "paper" in query.lower() or "study" in query.lower():
            try:
                arxiv_url = f"http://export.arxiv.org/api/query?search_query=all:{urllib.parse.quote(query)}&max_results={num_results}"
                response = self.session.get(arxiv_url, timeout=5)
                # Parsing the Atom feed with the 'xml' feature requires lxml
                soup = BeautifulSoup(response.text, 'xml')

                for entry in soup.find_all('entry')[:num_results]:
                    title = entry.find('title')
                    summary = entry.find('summary')
                    link = entry.find('id')

                    if title and link:
                        results.append({
                            'title': title.get_text().strip(),
                            'body': summary.get_text()[:200].strip() if summary else "Academic paper",
                            'url': link.get_text().strip()
                        })

                if results:
                    print(f"✅ Found {len(results)} real arXiv results")
                    return results
            except Exception as e:
                print(f"arXiv search failed: {e}")

        # If all methods fail, return a message
        print("❌ All search methods failed, returning fallback")
        return [{
            'title': f"Search for: {query}",
            'body': "Unable to fetch real-time results. Please try a different query or check your connection.",
            'url': f"https://www.google.com/search?q={urllib.parse.quote(query)}"
        }]
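
# Quick sanity check of the fallback chain (hypothetical query; the engines
# are tried in order and the first one returning results wins):
#
#   searcher = RealWebSearch()
#   for r in searcher.search_web("transformer language models", num_results=3):
#       print(r['title'], '->', r['url'])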

def research_with_sources(query, temperature=0.5):
    """Research with REAL web sources"""
    if not query:
        return "Please enter a research query"

    print(f"🔍 Researching: {query}")

    # Get REAL search results
    search_engine = RealWebSearch()
    results = search_engine.search_web(query, 3)

    # Build context from real sources
    sources_text = ""
    citations = []

    for i, result in enumerate(results):
        sources_text += f"[{i+1}] {result['title']}: {result['body']}\n"
        citations.append(f"[{i+1}] {result['title']}\n    {result['url']}")

    # Generate analysis with Jan v1
    prompt = f"""Based on these sources, analyze: {query}

Sources:
{sources_text}

Provide comprehensive analysis with key findings and implications:"""

    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=400,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    analysis = response.replace(prompt, "").strip()

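    # The replace() above assumes the decoded text repeats the prompt
    # verbatim; after truncation it may not, leaking the prompt into the
    # output. A more robust sketch slices off the prompt tokens instead:
    #
    #   prompt_len = inputs["input_ids"].shape[1]
    #   analysis = tokenizer.decode(outputs[0][prompt_len:],
    #                               skip_special_tokens=True).strip()
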
    # Format with REAL sources
    result = f"{analysis}\n\n" + "="*50 + "\n📚 REAL SOURCES:\n\n"
    for citation in citations:
        result += citation + "\n\n"

    return result

# Create interface
with gr.Blocks(title="Jan v1 Research - REAL Sources", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔍 Jan v1 Research Assistant - WITH REAL WEB SEARCH

    **Now with REAL sources from Google, Bing, Wikipedia, and arXiv!**

    Powered by Jan v1 (4B params) - Like Perplexity but FREE
    """)

    with gr.Row():
        with gr.Column():
            query_input = gr.Textbox(
                label="Research Query",
                placeholder="Enter any topic to research with real sources...",
                lines=2
            )
            temp_slider = gr.Slider(0.1, 0.9, value=0.5, label="Temperature")
            search_btn = gr.Button("🔍 Research with REAL Sources", variant="primary")

        with gr.Column():
            output = gr.Textbox(
                label="Analysis with Real Sources",
                lines=20,
                show_copy_button=True
            )

    search_btn.click(
        research_with_sources,
        inputs=[query_input, temp_slider],
        outputs=output
    )

    gr.Examples(
        examples=[
            ["latest AI developments 2024", 0.5],
            ["quantum computing breakthroughs", 0.6],
            ["climate change solutions", 0.5],
            ["Chinese microdrama trends", 0.6]
        ],
        inputs=[query_input, temp_slider]
    )
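
# Generation on a 4B model can take a while per request; enabling Gradio's
# built-in request queue is one option (max_size here is illustrative):
#
#   demo.queue(max_size=16)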

if __name__ == "__main__":
    demo.launch()