Upload Hindi embeddings model and all associated files

Files changed:
- hindi-rag-system.py (+117 -137)
- hindi-rag-system.py.amltmp (+117 -137)

hindi-rag-system.py
CHANGED
Deletions:

@@ -744,35 +744,71 @@ def load_llama_model(model_name="unsloth/Llama-3.2-1B-Instruct", device="cuda"):

    return model, tokenizer

-def setup_qa_system(model, tokenizer, vector_store):
    """
-    Set up …
    """
    # Create retriever
    retriever = vector_store.as_retriever(
        search_type="similarity",
-        search_kwargs={"k": …
    )

-    # Create a function to generate answers
-    def …
-        # …
-
-            docs = retriever.invoke(query)
-        except:
-            # Fallback to older method if invoke isn't available
-            docs = retriever.get_relevant_documents(query)

-        # …
-

-        # Create prompt
        prompt = f"""
आपको निम्नलिखित संदर्भ से जानकारी के आधार पर एक प्रश्न का उत्तर देना है।
-यदि आप उत्तर नहीं जानते हैं, तो बस "मुझे नहीं पता" कहें।

संदर्भ:
-{…

प्रश्न: {query}

@@ -797,7 +833,7 @@ def setup_qa_system(model, tokenizer, vector_store):

                do_sample=True
            )
        except Exception as e:
-            return f"Error generating response: {str(e)}"

        # Decode the generated text
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -805,9 +841,9 @@ def setup_qa_system(model, tokenizer, vector_store):

        # Extract just the answer part (after the prompt)
        answer = full_response.split("उत्तर:")[-1].strip()

-        return answer

-    return …

# Main RAG functions
def index_text_files(model, tokenizer, data_dir, output_dir, device="cuda", chunk_size=500):
@@ -858,54 +894,10 @@ def query_text_corpus(model, tokenizer, vector_store_path, query, k=6, device="c

    # Perform similarity search
    results = perform_similarity_search(vector_store, query, k=k)

-
-    processed_results = []
-    seen_chunks = set()
-
-    for doc, score in results:
-        chunk_id = doc.metadata["chunk_id"]
-        source = doc.metadata["source"]
-
-        # Skip if we've already included this chunk
-        if (source, chunk_id) in seen_chunks:
-            continue
-
-        seen_chunks.add((source, chunk_id))
-
-        # Try to find adjacent chunks and combine them
-        combined_content = doc.page_content
-
-        # Look for adjacent chunks in results (both previous and next)
-        for adj_id in [chunk_id-1, chunk_id+1]:
-            for other_doc, _ in results:
-                if (other_doc.metadata["source"] == source and
-                    other_doc.metadata["chunk_id"] == adj_id and
-                    (source, adj_id) not in seen_chunks):
-
-                    # Add the adjacent chunk content
-                    if adj_id < chunk_id:  # Previous chunk
-                        combined_content = other_doc.page_content + " " + combined_content
-                    else:  # Next chunk
-                        combined_content = combined_content + " " + other_doc.page_content
-
-                    seen_chunks.add((source, adj_id))
-
-        # Create a new document with combined content
-        combined_doc = Document(
-            page_content=combined_content,
-            metadata={
-                "source": source,
-                "chunk_id": chunk_id,
-                "is_combined": True if combined_content != doc.page_content else False
-            }
-        )
-
-        processed_results.append((combined_doc, score))
-
-    return processed_results, vector_store

def main():
-    parser = argparse.ArgumentParser(description="Hindi RAG System with …
    parser.add_argument("--model_dir", type=str, default="/home/ubuntu/output/hindi-embeddings-custom-tokenizer/final",
                        help="Directory containing the model and tokenizer")
    parser.add_argument("--tokenizer_dir", type=str, default="/home/ubuntu/hindi_tokenizer",
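The per-result adjacent-chunk merging deleted from query_text_corpus in the hunk above is superseded by the new combine_top_results and setup_enhanced_qa_system functions in the additions further down, which instead merge the top-scoring results into a single context before prompting the LLM.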
@@ -928,10 +920,12 @@ def main():

                        help="Run in interactive mode for querying")
    parser.add_argument("--reindex", action="store_true",
                        help="Force reindexing even if index exists")
-    parser.add_argument("--qa", action="store_true",
-                        help="Use LLM for question answering instead of just retrieval")
    parser.add_argument("--llm_name", type=str, default="unsloth/Llama-3.2-1B-Instruct",
                        help="HuggingFace model name for the LLM")
    args = parser.parse_args()

    # Load embedding model and tokenizer
@@ -943,20 +937,15 @@ def main():

    # Create vector store path
    vector_store_path = os.path.join(args.output_dir, "faiss_index")

-    # Load LLM
-
-
-
-
-
-
-
-
-        print("LLM loaded successfully for QA")
-    except Exception as e:
-        print(f"Error loading LLM: {e}")
-        print("Falling back to retrieval-only mode")
-        args.qa = False

    if args.index or args.reindex:
        # Index text files
@@ -964,47 +953,43 @@ def main():

            embed_model, embed_tokenizer, args.data_dir, args.output_dir, args.device, args.chunk_size
        )
        print(f"Indexing complete. Vector store saved to {vector_store_path}")
-
-
-
-

    if args.query:
-        # …
-
-            embed_model, embed_tokenizer, vector_store_path, args.query, args.top_k, args.device
-        )

-        # …
-
-
-
-

-
-
-
-
-
-
-
-

-
-
-        try:
-            answer = qa_generator(args.query)
-            print("\nLLM Answer:")
-            print(answer)
-        except Exception as e:
-            print(f"Error generating answer: {e}")

    if args.interactive:
        print("\nInteractive mode. Enter queries (or type 'quit' to exit).")

-        # For the first query, load vector store
-        vector_store = None
-
        while True:
            print("\nEnter query:")
            query = input()
@@ -1015,33 +1000,28 @@ def main():

            if query.lower() == 'quit':
                break

-            # …
-
-
-
-
-            # Print retrieval results
-            print("\nSearch Results:")
-            for i, (doc, score) in enumerate(results):
-                print(f"\nResult {i+1} (Score: {score:.4f}):")
-                print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")

-
-
-

-
-
-
-                qa_generator = setup_qa_system(llm_model, llm_tokenizer, vector_store)

-
-
-                answer = qa_generator(query)
-                print("\nLLM Answer:")
-                print(answer)
-            except Exception as e:
-                print(f"Error generating answer: {e}")

    # Clean up GPU memory
    if args.device == "cuda":
Additions:

@@ -744,35 +744,71 @@ def load_llama_model(model_name="unsloth/Llama-3.2-1B-Instruct", device="cuda"):

    return model, tokenizer

+# NEW FUNCTIONS FOR COMBINED RESULTS APPROACH
+
+def combine_top_results(results, query, max_results=4):
+    """
+    Combine the top search results into a single coherent context
+
+    Args:
+        results: List of (Document, score) tuples from retrieval
+        query: Original user query
+        max_results: Maximum number of results to combine
+
+    Returns:
+        String containing combined context from top results
+    """
+    # Sort results by score (highest first) and take top N
+    sorted_results = sorted(results, key=lambda x: x[1], reverse=True)[:max_results]
+
+    combined_texts = []
+    seen_content = set()  # To avoid duplicates
+
+    for doc, score in sorted_results:
+        # Extract relevant sentences to keep context focused
+        relevant_text = extract_relevant_sentences(doc.page_content, query, window_size=3)
+
+        # Skip if this exact text has been seen before
+        if relevant_text in seen_content:
+            continue
+
+        # Add source information to the text
+        source_name = os.path.basename(doc.metadata["source"])
+        text_with_source = f"{relevant_text} [Source: {source_name}]"
+
+        combined_texts.append(text_with_source)
+        seen_content.add(relevant_text)
+
+    # Combine all texts with clear separation
+    combined_context = "\n\n".join(combined_texts)
+
+    return combined_context
+
+def setup_enhanced_qa_system(model, tokenizer, vector_store):
    """
+    Set up an enhanced QA system using the model and retriever with result combination
    """
    # Create retriever
    retriever = vector_store.as_retriever(
        search_type="similarity",
+        search_kwargs={"k": 6}  # Get more results than we'll use to filter better
    )

+    # Create a function to generate answers with combined context
+    def generate_enhanced_answer(query):
+        # Get raw documents and scores
+        docs = vector_store.similarity_search_with_score(query, k=6)

+        # Combine the top results into a single context
+        combined_context = combine_top_results(docs, query, max_results=4)

+        # Create prompt with the combined context
        prompt = f"""
आपको निम्नलिखित संदर्भ से जानकारी के आधार पर एक प्रश्न का उत्तर देना है।
+यदि आप उत्तर नहीं जानते हैं, तो बस "मुझे नहीं पता" कहें। अपने उत्तर में सभी प्रासंगिक जानकारी का उपयोग करें।

संदर्भ:
+{combined_context}

प्रश्न: {query}

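The Hindi prompt above instructs the model to answer the question from the given context, to reply "मुझे नहीं पता" ("I don't know") if the answer is not in the context, and to use all relevant information in its answer. Note that combine_top_results calls an extract_relevant_sentences helper whose implementation is not shown in this diff; the sketch below is only an assumption of what such a helper might look like, keeping a window of sentences around the sentence with the most query-word overlap. The actual function in the repository may differ.

import re

def extract_relevant_sentences(text, query, window_size=3):
    # Split on Devanagari danda or Western sentence punctuation (assumed splitter).
    sentences = [s.strip() for s in re.split(r"(?<=[।?!.])\s+", text) if s.strip()]
    if not sentences:
        return text

    # Score each sentence by how many query words it shares with the query.
    query_words = set(query.split())
    overlaps = [len(query_words & set(s.split())) for s in sentences]
    best = max(range(len(sentences)), key=lambda i: overlaps[i])

    # Keep a window of sentences around the best-matching sentence.
    start = max(0, best - window_size)
    end = min(len(sentences), best + window_size + 1)
    return " ".join(sentences[start:end])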
@@ -797,7 +833,7 @@ def setup_qa_system(model, tokenizer, vector_store):

                do_sample=True
            )
        except Exception as e:
+            return f"Error generating response: {str(e)}", None

        # Decode the generated text
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

@@ -805,9 +841,9 @@ def setup_qa_system(model, tokenizer, vector_store):

        # Extract just the answer part (after the prompt)
        answer = full_response.split("उत्तर:")[-1].strip()

+        return answer, combined_context

+    return generate_enhanced_answer

# Main RAG functions
def index_text_files(model, tokenizer, data_dir, output_dir, device="cuda", chunk_size=500):

@@ -858,54 +894,10 @@ def query_text_corpus(model, tokenizer, vector_store_path, query, k=6, device="c

    # Perform similarity search
    results = perform_similarity_search(vector_store, query, k=k)

+    return results, vector_store
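setup_enhanced_qa_system now returns a closure that yields both the generated answer and the combined context it was built from, with the context set to None when generation fails. A minimal sketch of how a caller might consume that contract, mirroring what main() does below (llm_model, llm_tokenizer, and vector_store are the objects main() loads; the sample question is purely illustrative):

qa_generator = setup_enhanced_qa_system(llm_model, llm_tokenizer, vector_store)

answer, context = qa_generator("भारत की राजधानी क्या है?")
if context is None:
    # Generation failed; `answer` holds the error message.
    print(answer)
else:
    print("Context used:\n", context)
    print("Answer:\n", answer)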

def main():
+    parser = argparse.ArgumentParser(description="Hindi RAG System with Combined Results")
    parser.add_argument("--model_dir", type=str, default="/home/ubuntu/output/hindi-embeddings-custom-tokenizer/final",
                        help="Directory containing the model and tokenizer")
    parser.add_argument("--tokenizer_dir", type=str, default="/home/ubuntu/hindi_tokenizer",

@@ -928,10 +920,12 @@ def main():

                        help="Run in interactive mode for querying")
    parser.add_argument("--reindex", action="store_true",
                        help="Force reindexing even if index exists")
    parser.add_argument("--llm_name", type=str, default="unsloth/Llama-3.2-1B-Instruct",
                        help="HuggingFace model name for the LLM")
+    parser.add_argument("--show_context", action="store_true",
+                        help="Show the combined context sent to the LLM")
+    parser.add_argument("--show_raw_results", action="store_true",
+                        help="Show the raw search results before combination")
    args = parser.parse_args()

    # Load embedding model and tokenizer
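Note that the old --qa flag is removed: with this change the LLM-based QA path is always active, and the new --show_context / --show_raw_results flags only control what gets printed. Assuming the defaults above, a typical invocation might look like python hindi-rag-system.py --query "..." --show_raw_results --show_context.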
@@ -943,20 +937,15 @@ def main():

    # Create vector store path
    vector_store_path = os.path.join(args.output_dir, "faiss_index")

+    # Load LLM
+    try:
+        # Load LLM
+        llm_model, llm_tokenizer = load_llama_model(args.llm_name, args.device)
+        print("LLM loaded successfully for QA")
+    except Exception as e:
+        print(f"Error loading LLM: {e}")
+        print("Cannot proceed without LLM for this combined results approach")
+        return

    if args.index or args.reindex:
        # Index text files

@@ -964,47 +953,43 @@ def main():

            embed_model, embed_tokenizer, args.data_dir, args.output_dir, args.device, args.chunk_size
        )
        print(f"Indexing complete. Vector store saved to {vector_store_path}")
+
+    # Load vector store for querying
+    embeddings = HindiSentenceEmbeddings(embed_model, embed_tokenizer, device=args.device)
+    vector_store = load_vector_store(vector_store_path, embeddings)
+
+    # Set up enhanced QA system
+    qa_generator = setup_enhanced_qa_system(llm_model, llm_tokenizer, vector_store)

    if args.query:
+        # Process the query with the enhanced system
+        print(f"\nProcessing query: {args.query}")

+        # Show raw results if requested
+        if args.show_raw_results:
+            results, _ = query_text_corpus(
+                embed_model, embed_tokenizer, vector_store_path, args.query, args.top_k, args.device
+            )

+            print("\nRaw Search Results:")
+            for i, (doc, score) in enumerate(results):
+                print(f"\nResult {i+1} (Score: {score:.4f}):")
+                print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")
+                print(f"Content: {doc.page_content[:200]}...")
+
+        # Generate enhanced answer
+        answer, context = qa_generator(args.query)
+
+        if args.show_context:
+            print("\nCombined Context:")
+            print(context)

+        print("\nEnhanced LLM Answer:")
+        print(answer)

    if args.interactive:
        print("\nInteractive mode. Enter queries (or type 'quit' to exit).")

        while True:
            print("\nEnter query:")
            query = input()

@@ -1015,33 +1000,28 @@ def main():

            if query.lower() == 'quit':
                break

+            # Show raw results if requested
+            if args.show_raw_results:
+                results, _ = query_text_corpus(
+                    embed_model, embed_tokenizer, vector_store_path, query, args.top_k, args.device
+                )

+                print("\nRaw Search Results:")
+                for i, (doc, score) in enumerate(results):
+                    print(f"\nResult {i+1} (Score: {score:.4f}):")
+                    print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")
+                    print(f"Content: {doc.page_content[:200]}...")
+
+            # Process the query
+            print(f"\nProcessing query: {query}")
+            answer, context = qa_generator(query)

+            if args.show_context:
+                print("\nCombined Context:")
+                print(context)

+            print("\nEnhanced LLM Answer:")
+            print(answer)

    # Clean up GPU memory
    if args.device == "cuda":
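Taken together, the new main() flow is: load the LLM, optionally (re)index, load the FAISS vector store with the custom Hindi embeddings, build the enhanced QA closure, then answer queries. Below is a condensed sketch of the same flow used as a library rather than via the CLI. The name load_embedding_model is a placeholder for whatever function the script actually uses to load the model from --model_dir, and the output path is assumed; everything else matches functions visible in this diff.

import os

# Assumed paths; adjust to the local setup.
MODEL_DIR = "/home/ubuntu/output/hindi-embeddings-custom-tokenizer/final"
OUTPUT_DIR = "/home/ubuntu/output/rag"

# Hypothetical loader name for the custom Hindi embedding model and tokenizer.
embed_model, embed_tokenizer = load_embedding_model(MODEL_DIR, device="cuda")

# LLM used for answer generation (same default as the script).
llm_model, llm_tokenizer = load_llama_model("unsloth/Llama-3.2-1B-Instruct", device="cuda")

# Reuse an existing FAISS index previously built by index_text_files().
embeddings = HindiSentenceEmbeddings(embed_model, embed_tokenizer, device="cuda")
vector_store = load_vector_store(os.path.join(OUTPUT_DIR, "faiss_index"), embeddings)

# Build the QA closure and ask a question.
qa_generator = setup_enhanced_qa_system(llm_model, llm_tokenizer, vector_store)
answer, context = qa_generator("हिंदी में प्रश्न यहाँ लिखें")
print(answer)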
hindi-rag-system.py.amltmp
CHANGED

(The diff for hindi-rag-system.py.amltmp is identical to the diff for hindi-rag-system.py shown above.)