# NOTE(review): removed non-Python paste artifacts ("Spaces:" / "Runtime error" banners)
import logging
import os
import re
from typing import Any, Dict, List, Optional

from tantivy import Index
class TantivySearch:
    """Search agent over a Tantivy full-text index.

    Wraps index opening, lenient query parsing, searching, and
    highlight-snippet extraction. Expected index schema fields (as read
    below): text, reference, topics, title, filePath, segment, isPdf.
    """

    def __init__(self, index_path: str):
        """Initialize the Tantivy search agent with the index path.

        Args:
            index_path: Filesystem path of an existing Tantivy index.

        Raises:
            Exception: re-raises whatever ``Index.open`` raises when the
                index cannot be opened (logged first).
        """
        self.index_path = index_path
        self.logger = logging.getLogger(__name__)
        try:
            self.index = Index.open(index_path)
            self.logger.info(f"Successfully opened Tantivy index at {index_path}")
        except Exception as e:
            self.logger.error(f"Failed to open Tantivy index: {e}")
            raise

    def get_query_instructions(self) -> str:
        """Return instructions for the LLM on how to parse and construct Tantivy queries."""
        return """
        Instructions for generating a query:

        1. Boolean Operators:
           - AND: term1 AND term2 (both required)
           - OR: term1 OR term2 (either term)
           - Multiple words default to OR operation (cloud network = cloud OR network)
           - AND takes precedence over OR
           - Example: Shabath AND (walk OR go)

        2. Field-specific Terms:
           - Field-specific terms: field:term
           - Example: text:讗讚诐 AND reference:讘专讗砖讬转
           - available fields: text, reference, topics
           - text contains the text of the document
           - reference contains the citation of the document, e.g. 讘专讗砖讬转, 驻专拽 讗
           - topics contains the topics of the document. available topics includes: 转谞讱, 讛诇讻讛, 诪讚专砖, etc.

        3. Required/Excluded Terms:
           - Required (+): +term (must contain)
           - Excluded (-): -term (must not contain)
           - Example: +security cloud -deprecated
           - Equivalent to: security AND cloud AND NOT deprecated

        4. Phrase Search:
           - Use quotes: "exact phrase"
           - Both single/double quotes work
           - Escape quotes with \\"
           - Slop operator: "term1 term2"~N
           - Example: "cloud security"~2
           - the above will find "cloud framework and security "
           - Prefix matching: "start of phrase"*

        5. Wildcards:
           - ? for single character
           - * for any number of characters
           - Example: sec?rity cloud*

        6. Special Features:
           - All docs: *
           - Boost terms: term^2.0 (positive numbers only)
           - Example: security^2.0 cloud
           - the above will boost security by 2.0

        Query Examples:
        1. Basic: +砖讘转 +讞讜诇讛 +讗住讜专
        2. Field-specific: text:住讬谞讬 AND topics:转谞讱
        3. Phrase with slop: "security framework"~2
        4. Complex: +reference:讘专讗砖讬转 +text:"讛讘诇"^2.0 +(讚诪讬 OR 讚诪讬诐) -讛讘诇讬诐
        6. Mixed: (text:"专讘谞讜 诪砖讛"^2.0 OR reference:"诪砖谞讛 转讜专讛") AND topics:讛诇讻讛) AND text:"转讜专讛 讛诪诇讱"~3 AND NOT topics:诪讚专砖

        Tips:
        - Group complex expressions with parentheses
        - Use quotes for exact phrases
        - Add + for required terms, - for excluded terms
        - Boost important terms with ^N
        - use field-specific terms for better results.
        """

    @staticmethod
    def _extract_highlights(text: str, pattern: Optional[str]) -> List[str]:
        """Return context snippets around each match of *pattern* in *text*.

        Each snippet carries up to 50 characters of context on either side
        of a match, with "..." marking truncation. When *pattern* is None
        (no usable query terms) or nothing matches, falls back to a
        100-character preview of the document text.
        """
        if pattern:
            matches = list(re.finditer(pattern, text, re.IGNORECASE))
            if matches:
                highlights = []
                for match in matches:
                    start = max(0, match.start() - 50)
                    end = min(len(text), match.end() + 50)
                    snippet = text[start:end]
                    if start > 0:
                        snippet = f"...{snippet}"
                    if end < len(text):
                        snippet = f"{snippet}..."
                    highlights.append(snippet)
                return highlights
        # Fallback preview when there is no pattern or no match.
        return [text[:100] + "..." if len(text) > 100 else text]

    def search(self, query: str, num_results: int = 10) -> List[Dict[str, Any]]:
        """Search the Tantivy index with the given query using Tantivy's query syntax.

        Args:
            query: A query string in Tantivy syntax (see get_query_instructions).
            num_results: Maximum number of hits to return.

        Returns:
            A list of result dicts (score, title, reference, topics,
            file_path, line_number, is_pdf, text, highlights); empty list
            on any parsing or search failure (errors are logged, not raised).
        """
        try:
            searcher = self.index.searcher()
            try:
                # parse_query_lenient returns (query, errors); use the query part.
                parsed = self.index.parse_query_lenient(query)
                search_results = searcher.search(parsed[0], num_results).hits
            except Exception as query_error:
                # BUG FIX: the original only logged here and fell through,
                # leaving search_results unbound (NameError). Bail out instead.
                self.logger.error(f"Lenient query parsing failed: {query_error}")
                return []

            # Hoisted out of the per-hit loop: strip query syntax (operators,
            # field prefixes, quotes, boosts) to get plain highlight terms,
            # preserving Hebrew characters.
            stripped = re.sub(
                r'[:"()[\]{}^~*\\]|\b(AND|OR|NOT|TO|IN)\b|[-+]',
                ' ',
                query
            ).strip()
            highlight_terms = [term for term in stripped.split() if len(term) > 1]
            pattern = (
                '|'.join(re.escape(term) for term in highlight_terms)
                if highlight_terms else None
            )

            results = []
            for score, doc_address in search_results:
                doc = searcher.doc(doc_address)
                # Guard: get_first may return None for a missing field.
                text = doc.get_first("text") or ""
                results.append({
                    "score": float(score),
                    "title": doc.get_first("title") or os.path.basename(doc.get_first("filePath") or ""),
                    "reference": doc.get_first("reference"),
                    "topics": doc.get_first("topics"),
                    "file_path": doc.get_first("filePath"),
                    "line_number": doc.get_first("segment"),
                    "is_pdf": doc.get_first("isPdf"),
                    "text": text,
                    "highlights": self._extract_highlights(text, pattern),
                })

            self.logger.info(f"Found {len(results)} results for query: {query}")
            return results
        except Exception as e:
            self.logger.error(f"Error during search: {str(e)}")
            return []

    def validate_index(self) -> bool:
        """Validate that the index exists and is accessible.

        Returns:
            True when a trivial match-all search succeeds, False otherwise
            (the failure is logged).
        """
        try:
            # Try to create a searcher and perform a simple search.
            searcher = self.index.searcher()
            query_parser = self.index.parse_query("*")
            searcher.search(query_parser, 1)
            return True
        except Exception as e:
            self.logger.error(f"Index validation failed: {e}")
            return False