|
import json
|
|
import pandas as pd
|
|
from typing import List, Dict, Any
|
|
from config import Config
|
|
from tqdm.auto import tqdm
|
|
|
|
|
|
class LegalDataLoader:
    """Load a legal corpus from a JSON file and expose it for indexing and lookup.

    The corpus is held in ``self.legal_corpus`` as a list of law dicts. The
    methods below read the keys ``law_id`` and ``articles`` from each law, and
    ``article_id``, ``title`` and ``text`` from each article.
    """

    def __init__(self, corpus_path: "str | None" = None):
        """Create a loader without touching the file system.

        Args:
            corpus_path: Optional path to the corpus JSON file. When ``None``
                (the default), ``Config.CORPUS_PATH`` is read at load time.
        """
        self.corpus_path = corpus_path
        self.legal_corpus = None

    def load_legal_corpus(self) -> List[Dict[str, Any]]:
        """Load the legal corpus from the JSON file.

        A top-level JSON object is treated as a single law and wrapped in a
        list so that callers always receive a list.

        Returns:
            The list of law dicts, or ``[]`` if the file is missing, is not
            valid JSON, or does not contain a list/object at the top level.
            On failure ``self.legal_corpus`` is left unchanged.
        """
        path = Config.CORPUS_PATH if self.corpus_path is None else self.corpus_path

        # Keep the try body minimal: only opening and parsing can raise the
        # exceptions handled here.
        try:
            with open(path, "r", encoding="utf-8") as f:
                corpus = json.load(f)
        except FileNotFoundError:
            print(f"Legal corpus file not found at {path}")
            return []
        except json.JSONDecodeError as e:
            # JSONDecodeError (not JSONEncoder, which is not an exception
            # class) is what json.load raises on malformed input.
            print(f"Error parsing JSON file: {e}")
            return []

        if isinstance(corpus, list):
            print(f"Loaded {len(corpus)} legal documents")
        elif isinstance(corpus, dict):
            print(f"Loaded legal document: {corpus.get('law_id', 'Unknown')}")
            corpus = [corpus]
        else:
            # A bare string/number/null cannot be interpreted as a law.
            print(
                "Error parsing JSON file: expected a list or object, "
                f"got {type(corpus).__name__}"
            )
            return []

        self.legal_corpus = corpus
        return corpus

    def _get_corpus(self) -> List[Dict[str, Any]]:
        """Return the cached corpus, loading it on first use.

        Uses the loader's return value so that a failed load yields ``[]``
        instead of leaving callers to iterate over ``None``.
        """
        if self.legal_corpus is None:
            return self.load_legal_corpus()
        return self.legal_corpus

    @staticmethod
    def _combined_id(law_id: Any, article_id: Any) -> Any:
        """Return ``"<law_id>_<article_id>"``, or ``article_id`` if either part is empty."""
        return f"{law_id}_{article_id}" if law_id and article_id else article_id

    @classmethod
    def _article_record(cls, law_id: Any, article: Dict[str, Any]) -> Dict[str, Any]:
        """Build the lookup/search result dict for one article."""
        article_id = article.get("article_id", "")
        return {
            "law_id": law_id,
            "article_id": article_id,
            "title": article.get("title", ""),
            "text": article.get("text", ""),
            "combined_id": cls._combined_id(law_id, article_id),
        }

    def prepare_documents_for_indexing(self) -> List[Dict[str, Any]]:
        """Flatten the corpus into one document per article for vector indexing.

        Articles whose ``text`` is missing, empty or whitespace-only are
        skipped.

        Returns:
            A list of dicts with keys ``id``, ``title``, ``content`` and
            ``metadata`` (``law_id``, ``article_id``, ``title``, ``source``).
        """
        documents = []
        for law in tqdm(self._get_corpus()):
            law_id = law.get("law_id", "")
            for article in law.get("articles", []):
                content = article.get("text", "")
                if not (content and content.strip()):
                    continue

                article_id = article.get("article_id", "")
                title = article.get("title", "")
                documents.append(
                    {
                        "id": self._combined_id(law_id, article_id),
                        "title": title,
                        "content": content,
                        "metadata": {
                            "law_id": law_id,
                            "article_id": article_id,
                            "title": title,
                            "source": "legal_corpus",
                        },
                    }
                )

        print(f"Prepared {len(documents)} documents for indexing")
        return documents

    def get_document_by_id(self, doc_id: str) -> Dict[str, Any]:
        """Return the first article matching ``doc_id``.

        Args:
            doc_id: Either the combined ``"<law_id>_<article_id>"`` form or a
                bare ``article_id``.

        Returns:
            A dict with ``law_id``, ``article_id``, ``title``, ``text`` and
            ``combined_id``, or ``{}`` when nothing matches.
        """
        for law in self._get_corpus():
            law_id = law.get("law_id", "")
            for article in law.get("articles", []):
                record = self._article_record(law_id, article)
                if doc_id in (record["combined_id"], record["article_id"]):
                    return record
        return {}

    def search_documents_by_keyword(self, keyword: str) -> List[Dict[str, Any]]:
        """Return every article whose text or title contains ``keyword``.

        Matching is a case-insensitive substring test.

        Args:
            keyword: The substring to look for.

        Returns:
            A list of dicts with ``law_id``, ``article_id``, ``title``,
            ``text`` and ``combined_id``, in corpus order.
        """
        needle = keyword.lower()
        results = []
        for law in self._get_corpus():
            law_id = law.get("law_id", "")
            for article in law.get("articles", []):
                # ``or ""`` guards against JSON null values for text/title.
                content = (article.get("text") or "").lower()
                title = (article.get("title") or "").lower()
                if needle in content or needle in title:
                    results.append(self._article_record(law_id, article))
        return results
|
|
|