|
import html
import time
from typing import Dict, List, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from googlesearch import search

from config import Config
|
|
|
|
|
|
class GoogleSearchTool:
    """Google Search tool for legal questions with insufficient information.

    Runs several Vietnamese-law-oriented Google queries, downloads each hit,
    and condenses the page into a title plus a short snippet.  The results can
    be rendered as plain text (LLM context) or as an HTML fragment (UI).
    """

    # Sent with every page download.
    _REQUEST_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "vi-VN,vi;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }

    # Elements removed from the parsed page before text extraction.
    _NOISE_TAGS = ["script", "style", "nav", "header", "footer", "aside", "iframe"]

    # CSS selectors tried in order to locate the main content container.
    _CONTENT_SELECTORS = (
        "article",
        "main",
        ".content",
        ".post-content",
        ".entry-content",
        ".article-content",
        ".news-content",
        "#content",
        ".main-content",
    )

    # A sentence containing any of these (case-insensitively) is "relevant".
    _LEGAL_KEYWORDS = (
        "luật",
        "điều",
        "khoản",
        "quy định",
        "nghị định",
        "thông tư",
        "quyền",
        "nghĩa vụ",
    )

    def __init__(self):
        # Seconds to sleep after each page download.
        self.search_delay = 1

    def search_legal_info(
        self, query: str, num_results: Optional[int] = None
    ) -> List[Dict[str, str]]:
        """Search Google for *query* and return condensed page summaries.

        Args:
            query: The user's question; it is embedded in several
                law-oriented query variants.
            num_results: Maximum number of results to return.  ``None`` means
                ``Config.GOOGLE_SEARCH_RESULTS_COUNT``.

        Returns:
            At most ``num_results`` dicts with the keys ``url``, ``title``,
            ``snippet`` and ``domain``.  Pages without a usable snippet are
            skipped.  Search and fetch errors are printed and swallowed; an
            empty list is returned if the search fails as a whole.
        """
        if num_results is None:
            num_results = Config.GOOGLE_SEARCH_RESULTS_COUNT

        try:
            enhanced_queries = [
                f"{query} luật pháp Việt Nam site:thuvienphapluat.vn",
                f"{query} pháp luật Việt Nam site:moj.gov.vn",
                f"{query} quy định pháp luật Việt Nam",
                f"{query} luật việt nam điều khoản",
            ]

            all_results = []
            seen_urls = set()

            for enhanced_query in enhanced_queries:
                if len(all_results) >= num_results:
                    break

                try:
                    search_results = search(enhanced_query, num_results=3, lang="vi")

                    for url in search_results:
                        if len(all_results) >= num_results:
                            break

                        # The query variants overlap, so the same page can
                        # come back more than once.
                        if url in seen_urls:
                            continue
                        seen_urls.add(url)

                        try:
                            content = self._get_page_content(url)
                            if content and content.get("snippet"):
                                all_results.append(
                                    {
                                        "url": url,
                                        "title": content.get(
                                            "title", "Không có tiêu đề"
                                        ),
                                        "snippet": content.get(
                                            "snippet", "Không có nội dung"
                                        ),
                                        "domain": self._extract_domain(url),
                                    }
                                )

                            # Pause between page downloads.
                            time.sleep(self.search_delay)

                        except Exception as e:
                            print(f"Error fetching content from {url}: {e}")
                            continue

                except Exception as e:
                    # One failing query variant must not abort the others.
                    print(f"Error with search query '{enhanced_query}': {e}")
                    continue

            return all_results[:num_results]

        except Exception as e:
            print(f"Error performing Google search: {e}")
            return []

    def _extract_domain(self, url: str) -> str:
        """Return the network location of *url*, or ``"Unknown"`` if unparsable."""
        if not isinstance(url, str):
            return "Unknown"
        try:
            return urlparse(url).netloc
        except ValueError:
            # urlparse raises ValueError for malformed netlocs,
            # e.g. an unbalanced IPv6 bracket.
            return "Unknown"

    def _get_page_content(self, url: str) -> Dict[str, str]:
        """Download *url* and reduce it to a title and a snippet.

        Returns:
            ``{"title": ..., "snippet": ...}`` on success, or ``{}`` if the
            download or parsing fails (the error is printed, not raised).
        """
        try:
            response = requests.get(url, headers=self._REQUEST_HEADERS, timeout=15)
            response.raise_for_status()

            # The parser is fed the raw bytes, so the header-declared
            # ``response.encoding`` (which may be None) is never consulted.
            soup = BeautifulSoup(response.content, "html.parser")

            title_tag = soup.find("title")
            title = title_tag.get_text().strip() if title_tag else ""
            if not title:
                title = "Không có tiêu đề"

            # Strip navigation/boilerplate before extracting text.
            for element in soup(self._NOISE_TAGS):
                element.decompose()

            main_content = None
            for selector in self._CONTENT_SELECTORS:
                main_content = soup.select_one(selector)
                if main_content:
                    break
            if not main_content:
                main_content = soup.find("body")

            raw_text = main_content.get_text() if main_content else soup.get_text()
            text = self._normalize_text(raw_text)

            return {"title": title, "snippet": self._build_snippet(text)}

        except Exception as e:
            print(f"Error extracting content from {url}: {e}")
            return {}

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Collapse page text into one line, dropping tiny fragments.

        Lines are stripped and split on double spaces into phrases; phrases of
        three characters or fewer are discarded.  Splitting on double spaces
        keeps the words inside a phrase intact, so short words survive.
        """
        lines = (line.strip() for line in text.splitlines())
        phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
        return " ".join(phrase for phrase in phrases if len(phrase) > 3)

    def _build_snippet(self, text: str) -> str:
        """Pick a short snippet from normalized page *text*.

        Sentences (split on ``"."``) containing a legal keyword are collected
        until their joined length exceeds 400 characters; the first three are
        returned joined with ``". "``.  If no sentence matches, the first 600
        characters of *text* are returned, with ``"..."`` appended when
        truncated.
        """
        relevant_sentences = []
        for sentence in text.split("."):
            lowered = sentence.lower()
            if any(keyword in lowered for keyword in self._LEGAL_KEYWORDS):
                relevant_sentences.append(sentence.strip())
                if len(" ".join(relevant_sentences)) > 400:
                    break

        if relevant_sentences:
            return ". ".join(relevant_sentences[:3])
        return text[:600] + "..." if len(text) > 600 else text

    def format_search_results(self, results: List[Dict[str, str]]) -> str:
        """Format search results as plain text for LLM context.

        Each result dict must contain ``title``, ``snippet`` and ``url``;
        ``domain`` is optional and defaults to ``"Unknown"``.
        """
        if not results:
            return "Không tìm thấy thông tin liên quan."

        return "".join(
            f"**Nguồn {i}: {result['title']}**\n"
            f"Website: {result.get('domain', 'Unknown')}\n"
            f"Nội dung: {result['snippet']}\n"
            f"Link: {result['url']}\n\n"
            for i, result in enumerate(results, 1)
        )

    def format_search_results_for_display(self, results: List[Dict[str, str]]) -> str:
        """Format search results as an HTML fragment with clickable links.

        Title, snippet, domain and URL come from scraped pages, so each is
        HTML-escaped before interpolation.  Snippets are cut to 200 characters
        before escaping (so an entity is never split) and get ``"..."``
        appended when truncated.
        """
        if not results:
            return "Không tìm thấy thông tin tham khảo từ web."

        parts = [
            '<div style="background-color: #f8f9fa; padding: 15px; border-radius: 8px; margin: 10px 0;">',
            '<h4 style="color: #1e40af; margin-bottom: 15px;">🌐 Nguồn tham khảo từ web:</h4>',
        ]

        for i, result in enumerate(results, 1):
            raw_snippet = result["snippet"]
            title_escaped = html.escape(result["title"])
            snippet_escaped = html.escape(raw_snippet[:200])
            if len(raw_snippet) > 200:
                snippet_escaped += "..."
            # quote=True keeps the URL from breaking out of the href attribute.
            url_escaped = html.escape(result["url"], quote=True)
            domain_escaped = html.escape(result.get("domain", "Unknown"))

            parts.append(
                f"""<div style="background-color: white; padding: 12px; margin-bottom: 10px; border-radius: 6px; border-left: 4px solid #3b82f6;">
    <h5 style="margin: 0; color: #1e40af;">
        <a href="{url_escaped}" target="_blank" rel="noopener noreferrer" style="text-decoration: none; color: #1e40af;">
            {i}. {title_escaped}
        </a>
    </h5>
    <p style="color: #6b7280; font-size: 0.9em; margin: 5px 0;">
        📄 {domain_escaped}
    </p>
    <p style="margin: 8px 0; color: #374151; line-height: 1.5;">
        {snippet_escaped}
    </p>
    <a href="{url_escaped}" target="_blank" rel="noopener noreferrer" style="color: #3b82f6; text-decoration: none; font-size: 0.9em;">
        🔗 Xem chi tiết →
    </a>
</div>"""
            )

        parts.append("</div>")
        return "".join(parts)
|
|
|