Spaces:

albhu
/

tdocaibot

Sleeping

App Files Community

albhu committed on Mar 15, 2024

Commit

a6fe316

verified ·

1 Parent(s): 9d48d5a

Update search.py

Browse files

Files changed (1) hide show

search.py +77 -110

search.py CHANGED Viewed

@@ -1,135 +1,102 @@
-from transformers import AutoTokenizer, GPT2LMHeadModel
 from docx import Document
 from pdfminer.high_level import extract_text
-from transformers import GPT2Tokenizer
 from dataclasses import dataclass
-from typing import List
-from tqdm import tqdm
-import os
-import pandas as pd
-import re
-from sklearn.feature_extraction.text import TfidfVectorizer
-import numpy as np
-tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", trust_remote_code=True)
-model = GPT2LMHeadModel.from_pretrained("impira/layoutlm-document-qa", trust_remote_code=True)
-EMBEDDING_SEG_LEN = 1500
-EMBEDDING_MODEL = "gpt-4"
-EMBEDDING_CTX_LENGTH = 8191
-EMBEDDING_ENCODING = "cl100k_base"
-ENCODING = "gpt2"
 @dataclass
 class Paragraph:
     page_num: int
     paragraph_num: int
     content: str
-def read_pdf_pdfminer(file_path) -> List[Paragraph]:
     text = extract_text(file_path).replace('\n', ' ').strip()
-    paragraphs = batched(text, EMBEDDING_SEG_LEN)
-    paragraphs_objs = []
-    paragraph_num = 1
-    for p in paragraphs:
-        para = Paragraph(0, paragraph_num, p)
-        paragraphs_objs.append(para)
-        paragraph_num += 1
-    return paragraphs_objs
-def read_docx(file) -> List[Paragraph]:
-    doc = Document(file)
-    paragraphs = []
-    for paragraph_num, paragraph in enumerate(doc.paragraphs, start=1):
-        content = paragraph.text.strip()
-        if content:
-            para = Paragraph(1, paragraph_num, content)
-            paragraphs.append(para)
     return paragraphs
-def count_tokens(text, tokenizer):
-    return len(tokenizer.encode(text))
 def batched(iterable, n):
     l = len(iterable)
     for ndx in range(0, l, n):
-        yield iterable[ndx : min(ndx + n, l)]
-def compute_doc_embeddings(df, tokenizer):
-    embeddings = {}
-    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
-        doc = row["content"]
-        doc_embedding = get_embedding(doc, tokenizer)
-        embeddings[index] = doc_embedding
-    return embeddings
-def enhanced_context_extraction(document, keywords, vectorizer, tfidf_scores, top_n=5):
-    paragraphs = [para for para in document.split("\n") if para]
-    scores = [sum([para.lower().count(keyword) * tfidf_scores[vectorizer.vocabulary_[keyword]] for keyword in keywords if keyword in para.lower()]) for para in paragraphs]
-    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
-    relevant_paragraphs = [paragraphs[i] for i in top_indices]
-    return " ".join(relevant_paragraphs)
-def targeted_context_extraction(document, keywords, vectorizer, tfidf_scores, top_n=5):
-    paragraphs = [para for para in document.split("\n") if para]
-    scores = [sum([para.lower().count(keyword) * tfidf_scores[vectorizer.vocabulary_[keyword]] for keyword in keywords]) for para in paragraphs]
-    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
-    relevant_paragraphs = [paragraphs[i] for i in top_indices]
-    return " ".join(relevant_paragraphs)
-def extract_page_and_clause_references(paragraph: str) -> str:
-    page_matches = re.findall(r'Page (\d+)', paragraph)
-    clause_matches = re.findall(r'Clause (\d+\.\d+)', paragraph)
-    page_ref = f"Page {page_matches[0]}" if page_matches else ""
-    clause_ref = f"Clause {clause_matches[0]}" if clause_matches else ""
-    return f"({page_ref}, {clause_ref})".strip(", ")
-def refine_answer_based_on_question(question: str, answer: str) -> str:
-    if "Does the agreement contain" in question:
-        if "not" in answer or "No" in answer:
-            refined_answer = f"No, the agreement does not contain {answer}"
-        else:
-            refined_answer = f"Yes, the agreement contains {answer}"
-    else:
-        refined_answer = answer
-    return refined_answer
-def answer_query_with_context(question: str, df: pd.DataFrame, tokenizer, model, top_n_paragraphs: int = 5) -> str:
-    question_words = set(question.split())
-    priority_keywords = ["duration", "term", "period", "month", "year", "day", "week", "agreement", "obligation", "effective date"]
-    df['relevance_score'] = df['content'].apply(lambda x: len(question_words.intersection(set(x.split()))) + sum([x.lower().count(pk) for pk in priority_keywords]))
-    most_relevant_paragraphs = df.sort_values(by='relevance_score', ascending=False).iloc[:top_n_paragraphs]['content'].tolist()
-    context = "\n\n".join(most_relevant_paragraphs)
-    prompt = f"Question: {question}\n\nContext: {context}\n\nAnswer:"
-    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
-    outputs = model.generate(inputs, max_length=200)
-    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    references = extract_page_and_clause_references(context)
-    answer = refine_answer_based_on_question(question, answer) + " " + references
-    return answer
-def get_embedding(text, tokenizer):
     try:
-        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
         outputs = model(**inputs)
         embedding = outputs.last_hidden_state
     except Exception as e:
         print("Error obtaining embedding:", e)
-        embedding = []
-    return embedding

+from transformers import AutoTokenizer, AutoModelForCausalLM
 from docx import Document
 from pdfminer.high_level import extract_text
+from typing import List, Union
 from dataclasses import dataclass
+# Initialize the tokenizer and model.
+# Both calls run at module level, so importing this file loads the
+# "microsoft/phi-2" tokenizer and causal-LM model before any function is called.
+# The resulting `tokenizer` and `model` globals are read by process_paragraph()
+# and get_embedding() further down.
+# NOTE(review): trust_remote_code=True presumably permits checkpoint-supplied
+# code to execute on load — confirm it is still needed for this model.
+tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True)
+# Define the Paragraph data class
 @dataclass
 class Paragraph:
+    """One chunk of document text, optionally paired with its embedding."""
+    # Page marker: create_paragraphs() stores 0 here and read_docx() stores 1;
+    # neither derives it from the document's real pagination.
     page_num: int
+    # 1-based position of the chunk/paragraph within its source document.
     paragraph_num: int
+    # The chunk's text.
     content: str
+    # Set by process_document() to the result of process_paragraph();
+    # stays None when no embedding was obtained.
+    # NOTE(review): annotated as a list, but get_embedding() returns an
+    # attribute of the model output — presumably a tensor; confirm and adjust.
+    embedding: Union[list, None] = None
+# Function to read text from a PDF file
+def read_pdf(file_path: str) -> List[Paragraph]:
+    """Extract the text of a PDF and split it into fixed-size Paragraph chunks.
+
+    Newlines in the extracted text are replaced by spaces and surrounding
+    whitespace is stripped before the text is handed to create_paragraphs().
+    """
+    raw_text = extract_text(file_path)
+    flattened = raw_text.replace('\n', ' ')
+    return create_paragraphs(flattened.strip())
+# Function to read text from a DOCX file
+def read_docx(file_path: str) -> List[Paragraph]:
+    """Return one Paragraph per non-blank paragraph of a DOCX file.
+
+    Blank paragraphs are skipped but still counted, so ``paragraph_num``
+    reflects the 1-based position in the original document. ``page_num`` is
+    always 1.
+    """
+    paragraphs = []
+    for position, para in enumerate(Document(file_path).paragraphs, start=1):
+        stripped = para.text.strip()
+        if stripped:
+            paragraphs.append(Paragraph(1, position, stripped))
     return paragraphs
+# Helper function to split text into paragraphs
+def create_paragraphs(text: str, max_length: int = 1500) -> List[Paragraph]:
+    """Cut *text* into consecutive chunks of at most *max_length* characters.
+
+    Each chunk becomes a Paragraph with ``page_num`` 0 and a 1-based
+    ``paragraph_num``; an empty *text* yields an empty list.
+    """
+    return [
+        Paragraph(0, number, chunk)
+        for number, chunk in enumerate(batched(text, max_length), start=1)
+    ]
+# Helper function to batch an iterable
 def batched(iterable, n):
+    """Yield consecutive slices of *iterable*, each at most *n* items long.
+
+    The argument must support len() and slicing (e.g. str or list); the final
+    slice may be shorter than *n*.
+    """
+    total = len(iterable)
+    for start in range(0, total, n):
+        # Slicing a built-in sequence clamps at its end, so no explicit
+        # upper bound is needed for the last, possibly shorter, slice.
+        yield iterable[start:start + n]
+# Function to obtain embeddings for a given text
+def get_embedding(text: str, tokenizer, max_length: int = 512) -> Union[list, None]:
+    """Return the final-layer hidden states of the module-level model for *text*.
+
+    Args:
+        text: Text to embed; truncated to ``max_length`` tokens.
+        tokenizer: Callable tokenizer whose PyTorch-tensor output is passed
+            to the module-level ``model`` as keyword arguments.
+        max_length: Maximum number of tokens kept after truncation.
+
+    Returns:
+        The last entry of the model's ``hidden_states`` output, or None when
+        tokenization or the forward pass fails (the error is printed).
+    """
     try:
+        import torch  # local import: the file has no top-level torch import
+
+        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
+        # A causal-LM output exposes logits, not `last_hidden_state`, so the
+        # hidden states must be requested explicitly and the final layer taken.
+        with torch.no_grad():  # inference only: do not keep an autograd graph
+            outputs = model(**inputs, output_hidden_states=True)
+        return outputs.hidden_states[-1]
     except Exception as e:
         print("Error obtaining embedding:", e)
+        return None
+# Function to process a single paragraph and obtain its embedding
+def process_paragraph(paragraph: Paragraph) -> Union[list, None]:
+    """Embed ``paragraph.content`` with the module-level tokenizer.
+
+    Returns whatever get_embedding() returns; if that call raises, the error
+    is printed together with the paragraph number and None is returned.
+    """
+    try:
+        return get_embedding(paragraph.content, tokenizer)
+    except Exception as err:
+        print(f"Error processing paragraph {paragraph.paragraph_num}: {err}")
+    return None
+# Main function to process a document and obtain embeddings for each paragraph
+def process_document(file_path: str, file_type: str = None) -> List[Paragraph]:
+    """Read a PDF or DOCX file and attach an embedding to each paragraph.
+
+    Args:
+        file_path: Path of the document to read.
+        file_type: 'pdf' or 'docx' (case-insensitive, a leading dot is
+            ignored). When omitted, the type is taken from the extension of
+            ``file_path``.
+
+    Returns:
+        The document's Paragraph objects, each with ``embedding`` set when one
+        could be obtained. An empty list is returned (after printing a
+        message) for an unsupported type or a document without paragraphs.
+    """
+    supported_types = ['pdf', 'docx']
+    if file_type is None:
+        # The default used to be rejected unconditionally; fall back to the
+        # file extension so callers may omit the argument.
+        file_type = file_path.rsplit(".", 1)[-1]
+    if isinstance(file_type, str):
+        file_type = file_type.lower().lstrip(".")
+    if file_type not in supported_types:
+        print(f"Unsupported file type. Please provide one of the following supported types: {', '.join(supported_types)}")
+        return []
+    if file_type == 'pdf':
+        paragraphs = read_pdf(file_path)
+    else:
+        paragraphs = read_docx(file_path)
+    if not paragraphs:
+        print("No paragraphs found in the document.")
+        return []
+    # Process each paragraph and obtain embeddings
+    for number, paragraph in enumerate(paragraphs, start=1):
+        print(f"Processing paragraph {number}...")
+        embedding = process_paragraph(paragraph)
+        # Compare with None explicitly: truth-testing a multi-element tensor
+        # raises instead of answering "was an embedding returned?".
+        if embedding is not None:
+            paragraph.embedding = embedding
+        else:
+            print(f"Embedding for paragraph {number} could not be obtained.")
+    return paragraphs
+# Example usage
+if __name__ == "__main__":
+    # Embed a sample PDF and print each chunk followed by its embedding.
+    file_path = "example.pdf"
+    file_type = file_path.split(".")[-1]
+    paragraphs = process_document(file_path, file_type)
+    for para in paragraphs:
+        print(para.content)
+        embedding = getattr(para, 'embedding', None)
+        if embedding is None:
+            print("Embedding could not be obtained.")
+        else:
+            print("Embedding:", embedding)