import json

import faiss
import numpy as np
from langchain.schema import Document
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

from data_processing import embedding_model

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


def retrieve_documents_hybrid(query, q_dataset, top_k=5):
    """Hybrid retrieval: merge dense (FAISS) and sparse (BM25) candidates, then rerank."""
    with open(f"data_local/{q_dataset}_chunked_docs.json", "r") as f:
        chunked_documents = json.load(f)  # All chunked documents for this dataset

    faiss_index_path = f"data_local/{q_dataset}_quantized.faiss"
    index = faiss.read_index(faiss_index_path)

    # Tokenize documents for BM25
    tokenized_docs = [doc.split() for doc in chunked_documents]
    bm25 = BM25Okapi(tokenized_docs)

    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    query_embedding = query_embedding.reshape(1, -1)  # FAISS expects a 2-D array

    # FAISS (dense) search
    _, nearest_indices = index.search(query_embedding, top_k)
    faiss_docs = [chunked_documents[i] for i in nearest_indices[0]]

    # BM25 (sparse) search
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
    bm25_docs = [chunked_documents[i] for i in bm25_top_indices]

    # Merge FAISS + BM25 results, deduplicating while preserving order
    # (a plain set() would reorder the candidates non-deterministically)
    retrieved_docs = list(dict.fromkeys(faiss_docs + bm25_docs))

    # Rerank the full merged pool, then keep the top_k best
    reranked_docs = rerank_documents(query, retrieved_docs, top_k=top_k)
    return reranked_docs


# Legacy dense-only retrieval, kept for reference:
# def retrieve_documents(query, top_k=5):
#     query_dataset = find_query_dataset(query)
#     with open(f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
#         documents = json.load(f)  # All chunked documents for this dataset
#     index = faiss.read_index(f"data_local/{query_dataset}_quantized.faiss")
#     query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
#     _, nearest_indices = index.search(query_embedding, top_k)
#     return [Document(page_content=documents[i]) for i in nearest_indices[0]]


def remove_duplicate_documents(documents):
    """Deduplicate Document objects by page_content, preserving order."""
    unique_documents = []
    seen_documents = set()  # Page contents seen so far
    for doc in documents:
        # Use page_content as the unique identifier for deduplication
        if doc.page_content not in seen_documents:
            unique_documents.append(doc)
            seen_documents.add(doc.page_content)
    return unique_documents


def find_query_dataset(query):
    """Route a query to the dataset whose indexed questions are nearest in embedding space."""
    index = faiss.read_index("data_local/question_quantized.faiss")
    with open("data_local/dataset_mapping.json", "r") as f:
        dataset_names = json.load(f)

    question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    _, nearest_index = index.search(question_embedding, 1)
    best_dataset = dataset_names[nearest_index[0][0]]
    return best_dataset


def rerank_documents(query, retrieved_docs, top_k=5):
    """Score (query, document) pairs with the cross-encoder and return the top_k documents."""
    scores = reranker.predict([(query, doc) for doc in retrieved_docs])
    # Sort by score only; sorting raw (score, doc) tuples would fall back to
    # comparing the documents themselves on tied scores
    ranked = sorted(zip(scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked][:top_k]  # Top_k most relevant documents
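
# Minimal usage sketch (an assumption, not part of the original module): it
# presumes the data_local/ artifacts referenced above (the *_chunked_docs.json
# files, the *_quantized.faiss indexes, and dataset_mapping.json) have already
# been built by the data-processing pipeline. The query string and dataset
# routing below are illustrative only.
if __name__ == "__main__":
    query = "Who wrote the novel the 2012 film adaptation was based on?"
    dataset = find_query_dataset(query)  # Route the query to its nearest dataset
    docs = retrieve_documents_hybrid(query, dataset, top_k=5)
    for rank, doc in enumerate(docs, start=1):
        print(f"[{rank}] {doc[:200]}")  # Each result is a raw text chunk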