Spaces:
Runtime error
Runtime error
| from langchain_core.tools import tool | |
| from sefaria import get_text as sefaria_get_text, get_commentaries as sefaria_get_commentaries | |
| from tantivy_search import TantivySearch | |
| from typing import Optional | |
| from pydantic import BaseModel, Field | |
| import os | |
| import gdown | |
| import zipfile | |
| from app import INDEX_PATH | |
# Pydantic argument schema exposing a single `reference` field (the same
# parameter name that `read_text` takes). `#` comments are used instead of a
# class docstring so the generated schema does not gain a description.
class ReadTextArgs(BaseModel):
    # Citation of the passage to fetch. The description carries one Hebrew and
    # one English example; the Hebrew example must stay valid UTF-8 Hebrew.
    reference: str = Field(description="The reference to retrieve the text for. examples: בראשית א פרק א, Genesis 1:1")
# Pydantic argument schema with the two parameters that `search` takes:
# `query` and `num_results`. The long `query` description is a query-syntax
# guide embedded in the schema; its Hebrew examples must be real Hebrew text
# and every example query must have balanced parentheses.
class SearchArgs(BaseModel):
    # Query string; the description documents the supported query syntax.
    query: str = Field(description="""the query for the search.
Instructions for generating a query:
1. Boolean Operators:
- AND: term1 AND term2 (both required)
- OR: term1 OR term2 (either term)
- Multiple words default to OR operation (cloud network = cloud OR network)
- AND takes precedence over OR
- Example: Shabath AND (walk OR go)
2. Field-specific Terms:
- Field-specific terms: field:term
- Example: text:אדם AND reference:בראשית
- available fields: text, reference, topics
- text contains the text of the document
- reference contains the citation of the document, e.g. בראשית, פרק א
- topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.
3. Required/Excluded Terms:
- Required (+): +term (must contain)
- Excluded (-): -term (must not contain)
- Example: +security cloud -deprecated
- Equivalent to: security AND cloud AND NOT deprecated
4. Phrase Search:
- Use quotes: "exact phrase"
- Both single/double quotes work
- Escape quotes with \\"
- Slop operator: "term1 term2"~N
- Example: "cloud security"~2
- the above will find "cloud framework and security "
- Prefix matching: "start of phrase"*
5. Wildcards:
- ? for single character
- * for any number of characters
- Example: sec?rity cloud*
6. Special Features:
- All docs: *
- Boost terms: term^2.0 (positive numbers only)
- Example: security^2.0 cloud
- the above will boost security by 2.0
Query Examples:
1. Basic: +שבת +חולה +אסור
2. Field-specific: text:סיני AND topics:תנך
3. Phrase with slop: "security framework"~2
4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
5. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש
Tips:
- Group complex expressions with parentheses
- Use quotes for exact phrases
- Add + for required terms, - for excluded terms
- Boost important terms with ^N
- use field-specific terms for better results.
- the corpus to search in is an ancient Hebrew corpus: Tora and Talmud. so Try to use ancient Hebrew terms and or Talmudic expressions and prevent modern words that are not common in talmudic texts
""")
    # Upper bound on the number of hits; defaults to 10 when omitted.
    num_results: int = Field(description="the maximum number of results to return. Default: 10", default=10)
# Location of the search index on disk, taken from the app module.
index_path = INDEX_PATH


def _download_index(gdrive_index_id, zip_path="index.zip"):
    """Download the zipped index from Google Drive and unpack it into the CWD.

    Args:
        gdrive_index_id: Google Drive file id of the index archive.
        zip_path: Temporary local path for the downloaded archive.

    Raises:
        ValueError: If no Drive file id was supplied.
    """
    if not gdrive_index_id:
        raise ValueError("GDRIVE_INDEX_ID environment variable is not set")
    url = f"https://drive.google.com/uc?id={gdrive_index_id}"
    try:
        gdown.download(url, zip_path, quiet=False)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # NOTE(review): assumes the archive unpacks to `index_path`
            # relative to the current directory; confirm against the archive.
            zip_ref.extractall(".")
    finally:
        # Remove the archive whether or not extraction succeeded.
        if os.path.exists(zip_path):
            os.remove(zip_path)


if not os.path.exists(index_path):
    try:
        # This runs at module scope, so there is no `self` to read the Drive
        # id from; it is taken from the environment instead.
        # NOTE(review): GDRIVE_INDEX_ID is an assumed variable name; confirm
        # where the deployment actually stores the id.
        _download_index(os.environ.get("GDRIVE_INDEX_ID"))
    except Exception as e:
        raise Exception(f"failed to download index: {e}") from e

try:
    # Module-level searcher shared by `search`.
    tantivy = TantivySearch(index_path)
    tantivy.validate_index()
except Exception as e:
    raise Exception(f"failed to create index: {e}") from e
def search(query: str, num_results: int = 10):
    """Run ``query`` against the module-level Tantivy searcher.

    Args:
        query: Query string passed through unchanged to ``tantivy.search``.
        num_results: Passed through as the second argument of
            ``tantivy.search``.

    Returns:
        A list with one dict per hit, keeping only the ``'text'`` and
        ``'reference'`` entries; an entry missing from a hit is reported
        as ``'N/A'``.
    """
    hits = tantivy.search(query, num_results)
    return [
        {
            'text': hit.get('text', 'N/A'),
            'reference': hit.get('reference', 'N/A'),
        }
        for hit in hits
    ]
def read_text(reference: str) -> dict:
    """Retrieve the text for a given reference.

    Args:
        reference: Citation to look up; passed unchanged to
            ``sefaria_get_text``.

    Returns:
        A dict with ``'text'`` (the lookup result converted with ``str``)
        and ``'reference'`` (the citation as supplied).
    """
    text = sefaria_get_text(reference)
    return {
        'text': str(text),
        'reference': reference,
    }
def get_commentaries(reference: str, num_results: int = 10) -> dict:
    """Retrieve references to all available commentaries on the given verse.

    Args:
        reference: Citation to look up; passed unchanged to
            ``sefaria_get_commentaries``.
        num_results: Accepted for interface compatibility but not applied;
            every commentary returned by the lookup is included.
            NOTE(review): decide whether this should cap the result list.

    Returns:
        A dict with ``'text'`` (list results joined one per line, anything
        else converted with ``str``) and ``'reference'`` (a
        ``"Commentaries on ..."`` label for the citation).
    """
    commentaries = sefaria_get_commentaries(reference)
    if isinstance(commentaries, list):
        # Coerce items so a non-string entry cannot make join() raise.
        text = '\n'.join(str(item) for item in commentaries)
    else:
        text = str(commentaries)
    return {
        'text': text,
        'reference': f"Commentaries on {reference}",
    }