Spaces:
🔍 REAL WEB SEARCH: Google, Bing, Wikipedia, arXiv - No more fake URLs!
app.py
CHANGED
@@ -1,5 +1,5 @@
@@ -7,52 +7,180 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -61,24 +189,55 @@ def fast_research(query, temperature=0.4):

"""
Jan v1 Research Assistant - WITH REAL WEB SEARCH
"""

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import requests
from bs4 import BeautifulSoup
import json
import urllib.parse

# Initialize model
print("🚀 Loading Jan v1...")
model_name = "janhq/Jan-v1-4B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_4bit=True,
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

print("✅ Jan v1 loaded!")
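
# Note: recent transformers releases deprecate passing load_in_4bit directly
# to from_pretrained; a minimal sketch of the BitsAndBytesConfig equivalent,
# assuming the bitsandbytes package is installed on the Space:
#
#   from transformers import BitsAndBytesConfig
#
#   quant_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_compute_dtype=torch.float16
#   )
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name,
#       quantization_config=quant_config,
#       device_map="auto",
#       trust_remote_code=True
#   )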

class RealWebSearch:
    def __init__(self):
        # One shared session with a browser User-Agent so the search
        # engines serve regular HTML instead of rejecting the request
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def search_web(self, query, num_results=3):
        """Real web search using multiple methods"""
        results = []

        # Method 1: Try Google Search (via scraping)
        # (Google often blocks scrapers or serves a consent page, which is
        # why the Bing/Wikipedia/arXiv fallbacks below exist)
        try:
            search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
            response = self.session.get(search_url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find search results
            search_divs = soup.find_all('div', class_='g')[:num_results]

            for div in search_divs:
                title_elem = div.find('h3')
                link_elem = div.find('a')
                snippet_elem = div.find('span', class_='aCOpRe') or div.find('span', class_='st')

                if title_elem and link_elem:
                    results.append({
                        'title': title_elem.get_text(),
                        'body': snippet_elem.get_text() if snippet_elem else "No snippet available",
                        'url': link_elem.get('href', '#')
                    })

            if results:
                print(f"✅ Found {len(results)} real Google results")
                return results
        except Exception as e:
            print(f"Google search failed: {e}")

        # Method 2: Try Bing Search
        try:
            bing_url = f"https://www.bing.com/search?q={urllib.parse.quote(query)}"
            response = self.session.get(bing_url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find Bing results
            for li in soup.find_all('li', class_='b_algo')[:num_results]:
                h2 = li.find('h2')
                if h2:
                    link = h2.find('a')
                    snippet = li.find('p')

                    if link:
                        results.append({
                            'title': link.get_text(),
                            'body': snippet.get_text() if snippet else "No description",
                            'url': link.get('href', '#')
                        })

            if results:
                print(f"✅ Found {len(results)} real Bing results")
                return results
        except Exception as e:
            print(f"Bing search failed: {e}")

        # Method 3: Try Wikipedia API
        # (opensearch responses have the shape [query, [titles], [descriptions], [urls]])
        try:
            wiki_url = f"https://en.wikipedia.org/w/api.php?action=opensearch&search={urllib.parse.quote(query)}&limit={num_results}&format=json"
            response = self.session.get(wiki_url, timeout=5)
            data = response.json()

            if len(data) >= 4:
                titles = data[1]
                descriptions = data[2]
                urls = data[3]

                for i in range(min(len(titles), num_results)):
                    results.append({
                        'title': titles[i],
                        'body': descriptions[i] if i < len(descriptions) else "Wikipedia article",
                        'url': urls[i] if i < len(urls) else f"https://en.wikipedia.org/wiki/{titles[i].replace(' ', '_')}"
                    })

            if results:
                print(f"✅ Found {len(results)} real Wikipedia results")
                return results
        except Exception as e:
            print(f"Wikipedia search failed: {e}")

        # Method 4: Try arXiv for academic queries
        if "research" in query.lower() or "paper" in query.lower() or "study" in query.lower():
            try:
                arxiv_url = f"http://export.arxiv.org/api/query?search_query=all:{urllib.parse.quote(query)}&max_results={num_results}"
                response = self.session.get(arxiv_url, timeout=5)
                # Parsing the Atom feed with the 'xml' feature requires lxml
                soup = BeautifulSoup(response.text, 'xml')

                for entry in soup.find_all('entry')[:num_results]:
                    title = entry.find('title')
                    summary = entry.find('summary')
                    link = entry.find('id')

                    if title and link:
                        results.append({
                            'title': title.get_text().strip(),
                            'body': summary.get_text()[:200].strip() if summary else "Academic paper",
                            'url': link.get_text().strip()
                        })

                if results:
                    print(f"✅ Found {len(results)} real arXiv results")
                    return results
            except Exception as e:
                print(f"arXiv search failed: {e}")

        # If all methods fail, return a message
        print("❌ All search methods failed, returning fallback")
        return [{
            'title': f"Search for: {query}",
            'body': "Unable to fetch real-time results. Please try a different query or check your connection.",
            'url': f"https://www.google.com/search?q={urllib.parse.quote(query)}"
        }]
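
# Quick sanity check of the fallback chain (hypothetical query; the engines
# are tried in order and the first one returning results wins):
#
#   searcher = RealWebSearch()
#   for r in searcher.search_web("transformer language models", num_results=3):
#       print(r['title'], '->', r['url'])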

def research_with_sources(query, temperature=0.5):
    """Research with REAL web sources"""
    if not query:
        return "Please enter a research query"

    print(f"🔍 Researching: {query}")

    # Get REAL search results
    search_engine = RealWebSearch()
    results = search_engine.search_web(query, 3)

    # Build context from real sources
    sources_text = ""
    citations = []

    for i, result in enumerate(results):
        sources_text += f"[{i+1}] {result['title']}: {result['body']}\n"
        citations.append(f"[{i+1}] {result['title']}\n    {result['url']}")

    # Generate analysis with Jan v1
    prompt = f"""Based on these sources, analyze: {query}

Sources:
{sources_text}

Provide comprehensive analysis with key findings and implications:"""

    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=400,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    analysis = response.replace(prompt, "").strip()

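    # The replace() above assumes the decoded text repeats the prompt
    # verbatim; after truncation it may not, leaking the prompt into the
    # output. A more robust sketch slices off the prompt tokens instead:
    #
    #   prompt_len = inputs["input_ids"].shape[1]
    #   analysis = tokenizer.decode(outputs[0][prompt_len:],
    #                               skip_special_tokens=True).strip()
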
    # Format with REAL sources
    result = f"{analysis}\n\n" + "="*50 + "\n📚 REAL SOURCES:\n\n"
    for citation in citations:
        result += citation + "\n\n"

    return result

# Create interface
with gr.Blocks(title="Jan v1 Research - REAL Sources", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔍 Jan v1 Research Assistant - WITH REAL WEB SEARCH

    **Now with REAL sources from Google, Bing, Wikipedia, and arXiv!**

    Powered by Jan v1 (4B params) - Like Perplexity but FREE
    """)

    with gr.Row():
        with gr.Column():
            query_input = gr.Textbox(
                label="Research Query",
                placeholder="Enter any topic to research with real sources...",
                lines=2
            )
            temp_slider = gr.Slider(0.1, 0.9, value=0.5, label="Temperature")
            search_btn = gr.Button("🔍 Research with REAL Sources", variant="primary")

        with gr.Column():
            output = gr.Textbox(
                label="Analysis with Real Sources",
                lines=20,
                show_copy_button=True
            )

    search_btn.click(
        research_with_sources,
        inputs=[query_input, temp_slider],
        outputs=output
    )

    gr.Examples(
        examples=[
            ["latest AI developments 2024", 0.5],
            ["quantum computing breakthroughs", 0.6],
            ["climate change solutions", 0.5],
            ["Chinese microdrama trends", 0.6]
        ],
        inputs=[query_input, temp_slider]
    )
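
# Generation on a 4B model can take a while per request; enabling Gradio's
# built-in request queue is one option (max_size here is illustrative):
#
#   demo.queue(max_size=16)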

if __name__ == "__main__":
    demo.launch()