darwincb committed on
Commit
dcc141f
Β·
1 Parent(s): 8334178

πŸ” REAL WEB SEARCH: Google, Bing, Wikipedia, arXiv - No more fake URLs!

Browse files
Files changed (1) hide show
  1. app.py +199 -40
app.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Jan v1 Research Assistant - OPTIMIZED for speed
3
  """
4
 
5
  import gradio as gr
@@ -7,52 +7,180 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
7
  import torch
8
  import requests
9
  from bs4 import BeautifulSoup
10
- import re
 
11
 
12
# ---- Model initialisation -------------------------------------------------
print("πŸš€ Loading Jan v1 optimized...")

model_name = "janhq/Jan-v1-4B"

# fp16 weights with 4-bit quantisation and automatic device placement.
# NOTE(review): passing load_in_4bit directly is deprecated in newer
# transformers releases in favour of a BitsAndBytesConfig -- confirm the
# pinned transformers version before changing.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_4bit=True,  # 4-bit is faster than 8-bit
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

print("βœ… Model loaded!")
28
 
29
def quick_search(query):
    """Return three canned placeholder results for *query*.

    No network I/O is performed; each entry is a dict with 'title',
    'body' and 'url' keys and a '#' placeholder URL.
    """
    snippets = [
        'Recent developments and findings...',
        'Expert analysis shows...',
        'Current research indicates...',
    ]
    return [
        {'title': f'Result {n} for {query}', 'body': snippet, 'url': '#'}
        for n, snippet in enumerate(snippets, start=1)
    ]
 
37
def fast_research(query, temperature=0.4):
    """Generate a short Jan-v1 analysis of ``query`` from canned sources.

    Parameters
    ----------
    query : str
        Topic to analyse; empty input returns a usage hint string.
    temperature : float
        Sampling temperature forwarded to ``model.generate``.

    Returns
    -------
    str
        Brief model analysis followed by a numbered source list.
    """
    if not query:
        return "Enter a query"

    # Placeholder search (no network) keeps the round-trip fast.
    results = quick_search(query)
    sources = "\n".join(f"[{i+1}] {r['title']}: {r['body']}" for i, r in enumerate(results))

    # Short prompt keeps the token budget small for speed.
    prompt = f"Query: {query}\nSources: {sources}\n\nProvide brief analysis:"

    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    # FIX: with device_map="auto" the model may live on GPU while the
    # tokenizer output stays on CPU, which makes generate() fail with a
    # device mismatch.  Move the tensors to the model's device first.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,  # cap output length for speed
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the echoed prompt so only the generated analysis remains.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    analysis = response.replace(prompt, "").strip()

    # Append the numbered source titles.
    source_lines = "\n".join(f"[{i+1}] {r['title']}" for i, r in enumerate(results))
    result = f"{analysis}\n\nπŸ“š SOURCES:\n" + source_lines + "\n"
    return result
 
71
# ---- Minimal single-function UI for the fast research pipeline ------------
_query_box = gr.Textbox(label="Research Query", lines=2)
_temp_slider = gr.Slider(0.1, 0.9, value=0.4, label="Temperature")
_answer_box = gr.Textbox(label="Analysis", lines=15)

demo = gr.Interface(
    fn=fast_research,
    inputs=[_query_box, _temp_slider],
    outputs=_answer_box,
    title="Jan v1 Research - FAST VERSION",
    description="Optimized for speed - 30 second responses",
)

if __name__ == "__main__":
    demo.launch()
 
1
  """
2
+ Jan v1 Research Assistant - WITH REAL WEB SEARCH
3
  """
4
 
5
  import gradio as gr
 
7
  import torch
8
  import requests
9
  from bs4 import BeautifulSoup
10
+ import json
11
+ import urllib.parse
12
 
13
# ---- Model initialisation -------------------------------------------------
print("πŸš€ Loading Jan v1...")

model_name = "janhq/Jan-v1-4B"

# fp16 weights with 4-bit quantisation and automatic device placement.
# NOTE(review): load_in_4bit as a direct kwarg is deprecated in newer
# transformers releases (BitsAndBytesConfig is preferred) -- confirm the
# pinned transformers version before changing.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_4bit=True,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

print("βœ… Jan v1 loaded!")
28
 
29
class RealWebSearch:
    """Best-effort live web search over several public backends.

    Backends are tried in order -- Google scrape, Bing scrape, Wikipedia
    opensearch API, then arXiv (academic-sounding queries only) -- and the
    first backend that yields any hits wins.  Every result is a dict with
    'title', 'body' and 'url' keys.  If every backend fails, a single
    fallback entry linking to a Google search page is returned, so callers
    always receive at least one item.
    """

    def __init__(self):
        # One shared session gives connection pooling; the browser-style
        # User-Agent keeps the scraped engines from rejecting us outright.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def search_web(self, query, num_results=3):
        """Return up to ``num_results`` result dicts for ``query``."""
        results = []

        # Method 1: Google HTML scraping.  NOTE(review): the 'g'/'aCOpRe'/'st'
        # class names track Google's markup and break whenever it changes.
        try:
            search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
            response = self.session.get(search_url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')

            for div in soup.find_all('div', class_='g')[:num_results]:
                title_elem = div.find('h3')
                link_elem = div.find('a')
                snippet_elem = div.find('span', class_='aCOpRe') or div.find('span', class_='st')

                if title_elem and link_elem:
                    results.append({
                        'title': title_elem.get_text(),
                        'body': snippet_elem.get_text() if snippet_elem else "No snippet available",
                        'url': link_elem.get('href', '#')
                    })

            if results:
                print(f"βœ… Found {len(results)} real Google results")
                return results
        except Exception as e:  # deliberate best-effort: fall through to Bing
            print(f"Google search failed: {e}")

        # Method 2: Bing HTML scraping.
        try:
            bing_url = f"https://www.bing.com/search?q={urllib.parse.quote(query)}"
            response = self.session.get(bing_url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')

            for li in soup.find_all('li', class_='b_algo')[:num_results]:
                h2 = li.find('h2')
                if h2:
                    link = h2.find('a')
                    snippet = li.find('p')

                    if link:
                        results.append({
                            'title': link.get_text(),
                            'body': snippet.get_text() if snippet else "No description",
                            'url': link.get('href', '#')
                        })

            if results:
                print(f"βœ… Found {len(results)} real Bing results")
                return results
        except Exception as e:  # best-effort: fall through to Wikipedia
            print(f"Bing search failed: {e}")

        # Method 3: Wikipedia opensearch API.
        # FIX: the query used to be interpolated into the URL unencoded
        # (unlike the other backends, which quote it), so spaces, '&', '#'
        # or '?' in a query produced a malformed request.  Passing it via
        # ``params`` lets requests URL-encode every value correctly.
        try:
            response = self.session.get(
                "https://en.wikipedia.org/w/api.php",
                params={
                    'action': 'opensearch',
                    'search': query,
                    'limit': num_results,
                    'format': 'json',
                },
                timeout=5,
            )
            data = response.json()

            # opensearch returns [query, titles, descriptions, urls]
            if len(data) >= 4:
                titles = data[1]
                descriptions = data[2]
                urls = data[3]

                for i in range(min(len(titles), num_results)):
                    results.append({
                        'title': titles[i],
                        'body': descriptions[i] if i < len(descriptions) else "Wikipedia article",
                        'url': urls[i] if i < len(urls) else f"https://en.wikipedia.org/wiki/{titles[i].replace(' ', '_')}"
                    })

            if results:
                print(f"βœ… Found {len(results)} real Wikipedia results")
                return results
        except Exception as e:  # best-effort: fall through to arXiv/fallback
            print(f"Wikipedia search failed: {e}")

        # Method 4: arXiv Atom API, only for academic-sounding queries.
        if "research" in query.lower() or "paper" in query.lower() or "study" in query.lower():
            try:
                arxiv_url = f"http://export.arxiv.org/api/query?search_query=all:{urllib.parse.quote(query)}&max_results={num_results}"
                response = self.session.get(arxiv_url, timeout=5)
                # NOTE(review): the 'xml' feature requires lxml to be
                # installed; otherwise BeautifulSoup raises FeatureNotFound.
                soup = BeautifulSoup(response.text, 'xml')

                for entry in soup.find_all('entry')[:num_results]:
                    title = entry.find('title')
                    summary = entry.find('summary')
                    link = entry.find('id')

                    if title and link:
                        results.append({
                            'title': title.get_text().strip(),
                            'body': summary.get_text()[:200].strip() if summary else "Academic paper",
                            'url': link.get_text().strip()
                        })

                if results:
                    print(f"βœ… Found {len(results)} real arXiv results")
                    return results
            except Exception as e:
                print(f"arXiv search failed: {e}")

        # All backends failed: return one explanatory fallback entry.
        print("❌ All search methods failed, returning fallback")
        return [{
            'title': f"Search for: {query}",
            'body': "Unable to fetch real-time results. Please try a different query or check your connection.",
            'url': f"https://www.google.com/search?q={urllib.parse.quote(query)}"
        }]
149
 
150
def research_with_sources(query, temperature=0.5):
    """Run ``query`` through the live web search, then ask Jan v1 to analyse it.

    Parameters
    ----------
    query : str
        Topic to research; empty input short-circuits with a hint message.
    temperature : float
        Sampling temperature forwarded to ``model.generate``.

    Returns
    -------
    str
        Model analysis followed by a numbered list of the real source URLs.
    """
    if not query:
        return "Please enter a research query"

    print(f"πŸ” Researching: {query}")

    # Fetch up to three live results from the first backend that responds.
    results = RealWebSearch().search_web(query, 3)

    # Build the prompt context and the citation list from the same results.
    sources_text = "".join(
        f"[{n}] {item['title']}: {item['body']}\n"
        for n, item in enumerate(results, start=1)
    )
    citations = [
        f"[{n}] {item['title']}\n    {item['url']}"
        for n, item in enumerate(results, start=1)
    ]

    prompt = f"""Based on these sources, analyze: {query}

Sources:
{sources_text}

Provide comprehensive analysis with key findings and implications:"""

    # Tokenise (truncated to the context budget) and move the tensors to
    # whatever device the quantised model was placed on.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=400,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the echoed prompt so only the newly generated analysis remains.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    analysis = response.replace(prompt, "").strip()

    # Append the citation block, one blank line between entries.
    header = f"{analysis}\n\n" + "="*50 + "\nπŸ“š REAL SOURCES:\n\n"
    return header + "\n\n".join(citations) + "\n\n"
198
 
199
# ---- Gradio UI ------------------------------------------------------------
with gr.Blocks(title="Jan v1 Research - REAL Sources", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # πŸš€ Jan v1 Research Assistant - WITH REAL WEB SEARCH

    **Now with REAL sources from Google, Bing, Wikipedia, and arXiv!**

    Powered by Jan v1 (4B params) - Like Perplexity but FREE
    """)

    # Two-column layout: query controls on the left, output on the right.
    with gr.Row():
        with gr.Column():
            query_box = gr.Textbox(
                label="Research Query",
                placeholder="Enter any topic to research with real sources...",
                lines=2,
            )
            temperature_slider = gr.Slider(0.1, 0.9, value=0.5, label="Temperature")
            run_button = gr.Button("πŸ” Research with REAL Sources", variant="primary")

        with gr.Column():
            analysis_box = gr.Textbox(
                label="Analysis with Real Sources",
                lines=20,
                show_copy_button=True,
            )

    # Wire the button to the research pipeline.
    run_button.click(
        research_with_sources,
        inputs=[query_box, temperature_slider],
        outputs=analysis_box,
    )

    # Clickable example queries with matching temperatures.
    gr.Examples(
        examples=[
            ["latest AI developments 2024", 0.5],
            ["quantum computing breakthroughs", 0.6],
            ["climate change solutions", 0.5],
            ["Chinese microdrama trends", 0.6],
        ],
        inputs=[query_box, temperature_slider],
    )

if __name__ == "__main__":
    demo.launch()