albhu committed on
Commit
ade8cf8
·
verified ·
1 Parent(s): 5a37b32
Files changed (1) hide show
  1. search.py +32 -117
search.py CHANGED
@@ -1,23 +1,16 @@
1
- from transformers import AutoTokenizer, AutoModelForCausalLM
2
- from docx import Document
3
  from pdfminer.high_level import extract_text
 
4
  from dataclasses import dataclass
5
- from typing import List
6
- from tqdm import tqdm
7
- import re
8
- import pandas as pd # Import pandas module
9
- from sklearn.feature_extraction.text import TfidfVectorizer
10
- import numpy as np
11
 
12
- tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
13
- model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True)
 
14
 
15
- EMBEDDING_SEG_LEN = 1500
16
- EMBEDDING_MODEL = "gpt-4"
17
-
18
- EMBEDDING_CTX_LENGTH = 8191
19
- EMBEDDING_ENCODING = "cl100k_base"
20
- ENCODING = "gpt2"
21
 
22
  @dataclass
23
  class Paragraph:
@@ -25,109 +18,31 @@ class Paragraph:
25
  paragraph_num: int
26
  content: str
27
 
28
- def read_pdf_pdfminer(file_path) -> List[Paragraph]:
29
  text = extract_text(file_path).replace('\n', ' ').strip()
30
- paragraphs = batched(text, EMBEDDING_SEG_LEN)
31
- paragraphs_objs = []
32
- paragraph_num = 1
33
- for p in paragraphs:
34
- para = Paragraph(0, paragraph_num, p)
35
- paragraphs_objs.append(para)
36
- paragraph_num += 1
37
- return paragraphs_objs
38
 
39
- def read_docx(file) -> List[Paragraph]:
40
  doc = Document(file)
41
- paragraphs = []
42
- for paragraph_num, paragraph in enumerate(doc.paragraphs, start=1):
43
- content = paragraph.text.strip()
44
- if content:
45
- para = Paragraph(1, paragraph_num, content)
46
- paragraphs.append(para)
47
- return paragraphs
48
-
49
- def count_tokens(text, tokenizer):
50
- return len(tokenizer.encode(text))
51
-
52
- def batched(iterable, n):
53
- l = len(iterable)
54
- for ndx in range(0, l, n):
55
- yield iterable[ndx : min(ndx + n, l)]
56
-
57
- def compute_doc_embeddings(df, tokenizer):
58
- embeddings = {}
59
- for index, row in tqdm(df.iterrows(), total=df.shape[0]):
60
- doc = row["content"]
61
- doc_embedding = get_embedding(doc, tokenizer)
62
- embeddings[index] = doc_embedding
63
- return embeddings
64
-
65
- def enhanced_context_extraction(document, keywords, vectorizer, tfidf_scores, top_n=5):
66
- paragraphs = [para for para in document.split("\n") if para]
67
- scores = [sum([para.lower().count(keyword) * tfidf_scores[vectorizer.vocabulary_[keyword]] for keyword in keywords if keyword in para.lower()]) for para in paragraphs]
68
-
69
- top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
70
- relevant_paragraphs = [paragraphs[i] for i in top_indices]
71
-
72
- return " ".join(relevant_paragraphs)
73
-
74
- def targeted_context_extraction(document, keywords, vectorizer, tfidf_scores, top_n=5):
75
- paragraphs = [para for para in document.split("\n") if para]
76
- scores = [sum([para.lower().count(keyword) * tfidf_scores[vectorizer.vocabulary_[keyword]] for keyword in keywords]) for para in paragraphs]
77
-
78
- top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
79
- relevant_paragraphs = [paragraphs[i] for i in top_indices]
80
-
81
- return " ".join(relevant_paragraphs)
82
-
83
-
84
- def extract_page_and_clause_references(paragraph: str) -> str:
85
- page_matches = re.findall(r'Page (\d+)', paragraph)
86
- clause_matches = re.findall(r'Clause (\d+\.\d+)', paragraph)
87
-
88
- page_ref = f"Page {page_matches[0]}" if page_matches else ""
89
- clause_ref = f"Clause {clause_matches[0]}" if clause_matches else ""
90
-
91
- return f"({page_ref}, {clause_ref})".strip(", ")
92
-
93
- def refine_answer_based_on_question(question: str, answer: str) -> str:
94
- if "Does the agreement contain" in question:
95
- if "not" in answer or "No" in answer:
96
- refined_answer = f"No, the agreement does not contain {answer}"
97
- else:
98
- refined_answer = f"Yes, the agreement contains {answer}"
99
- else:
100
- refined_answer = answer
101
-
102
- return refined_answer
103
-
104
- def answer_query_with_context(question: str, df: pd.DataFrame, top_n_paragraphs: int = 5) -> str:
105
- question_words = set(question.split())
106
-
107
- priority_keywords = ["duration", "term", "period", "month", "year", "day", "week", "agreement", "obligation", "effective date"]
108
-
109
- df['relevance_score'] = df['content'].apply(lambda x: len(question_words.intersection(set(x.split()))) + sum([x.lower().count(pk) for pk in priority_keywords]))
110
-
111
- most_relevant_paragraphs = df.sort_values(by='relevance_score', ascending=False).iloc[:top_n_paragraphs]['content'].tolist()
112
-
113
- context = "\n\n".join(most_relevant_paragraphs)
114
- prompt = f"Question: {question}\n\nContext: {context}\n\nAnswer:"
115
-
116
- inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
117
- outputs = model.generate(inputs, max_length=600)
118
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
119
-
120
- references = extract_page_and_clause_references(context)
121
- answer = refine_answer_based_on_question(question, answer) + " " + references
122
-
123
  return answer
124
 
125
- def get_embedding(text, tokenizer):
126
- try:
127
- inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
128
- outputs = model(**inputs)
129
- embedding = outputs.last_hidden_state
130
- except Exception as e:
131
- print("Error obtaining embedding:", e)
132
- embedding = []
133
- return embedding
 
1
+ from transformers import RagTokenizer, RagTokenForGeneration, AutoTokenizer, AutoModelForCausalLM, pipeline
 
2
  from pdfminer.high_level import extract_text
3
+ from docx import Document
4
  from dataclasses import dataclass
5
+ import pandas as pd
 
 
 
 
 
6
 
7
+ # Initialize RAG
8
+ rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
9
+ rag_model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")
10
 
11
+ # Initialize Phi-2
12
+ phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
13
+ phi_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True)
 
 
 
14
 
15
  @dataclass
16
  class Paragraph:
 
18
  paragraph_num: int
19
  content: str
20
 
21
+ def read_pdf_pdfminer(file_path) -> list[Paragraph]:
22
  text = extract_text(file_path).replace('\n', ' ').strip()
23
+ paragraphs = text.split(". ")
24
+ return [Paragraph(0, i, para) for i, para in enumerate(paragraphs, 1)]
 
 
 
 
 
 
25
 
26
+ def read_docx(file) -> list[Paragraph]:
27
  doc = Document(file)
28
+ return [Paragraph(1, i, para.text.strip()) for i, para in enumerate(doc.paragraphs, 1) if para.text.strip()]
29
+
30
+ def generate_context_with_rag(question: str) -> str:
31
+ inputs = rag_tokenizer(question, return_tensors="pt")
32
+ output_ids = rag_model.generate(**inputs)
33
+ context = rag_tokenizer.decode(output_ids[0], skip_special_tokens=True)
34
+ return context
35
+
36
+ def generate_answer_with_phi(question: str, context: str) -> str:
37
+ enhanced_question = f"Question: {question}\nContext: {context}\nAnswer:"
38
+ inputs = phi_tokenizer.encode(enhanced_question, return_tensors="pt", max_length=512, truncation=True)
39
+ outputs = phi_model.generate(inputs, max_length=600)
40
+ answer = phi_tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  return answer
42
 
43
+ def answer_question(question: str, documents_df: pd.DataFrame) -> str:
44
+ # Assuming documents_df contains the text from uploaded files
45
+ combined_text = " ".join(documents_df['content'].tolist())
46
+ context = generate_context_with_rag(combined_text + " " + question)
47
+ answer = generate_answer_with_phi(question, context)
48
+ return answer