jedick committed
Commit: 5cdd81a (parent: 00c763e)

Add LLM retrieval

Files changed (2):
  1. app.py +33 -19
  2. llm_retrieval.py +237 -0
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
 from transformers import pipeline
 import nltk
 from retrieval import retrieve_from_pdf
+from llm_retrieval import retrieve_from_pdf_llm, retrieve_from_pdf_llm_fast
 import os
 import json
 from datetime import datetime
@@ -93,7 +94,9 @@ with gr.Blocks(theme=my_theme, css=custom_css, head=font_awesome_html) as demo:
 with gr.Column(scale=3):
     with gr.Row():
         gr.Markdown("# AI4citations")
-        gr.Markdown("## *AI-powered citation verification*")
+        gr.Markdown(
+            "## *AI-powered citation verification* ([more info](https://github.com/jedick/AI4citations))"
+        )
     claim = gr.Textbox(
         label="Claim",
         info="aka hypothesis",
@@ -105,6 +108,13 @@ with gr.Blocks(theme=my_theme, css=custom_css, head=font_awesome_html) as demo:
 pdf_file = gr.File(
     label="Upload PDF", type="filepath", height=120
 )
+with gr.Row():
+    retrieval_method = gr.Radio(
+        choices=["BM25S", "LLM (Large)", "LLM (Fast)"],
+        value="BM25S",
+        label="Retrieval Method",
+        info="Choose between keyword-based (BM25S) or AI-based (LLM) evidence retrieval",
+    )
 get_evidence = gr.Button(value="Get Evidence")
 top_k = gr.Slider(
     1,
@@ -193,7 +203,7 @@ with gr.Blocks(theme=my_theme, css=custom_css, head=font_awesome_html) as demo:
 ### Usage:

 - Input a **Claim**, then:
-  - Upload a PDF and click **Get Evidence** OR
+  - Upload a PDF, select retrieval method, and click **Get Evidence** OR
   - Input **Evidence** statements yourself
 """
 )
@@ -232,24 +242,15 @@ with gr.Blocks(theme=my_theme, css=custom_css, head=font_awesome_html) as demo:
 #### *Capstone project*
 - <i class="fa-brands fa-github"></i> [jedick/MLE-capstone-project](https://github.com/jedick/MLE-capstone-project) (project repo)
 - <i class="fa-brands fa-github"></i> [jedick/AI4citations](https://github.com/jedick/AI4citations) (app repo)
-"""
-)
-gr.Markdown(
-"""
-#### *Models*
+#### *Claim Verification Models (text classification)*
 - <img src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg" style="height: 1.2em; display: inline-block;"> [jedick/DeBERTa-v3-base-mnli-fever-anli-scifact-citint](https://huggingface.co/jedick/DeBERTa-v3-base-mnli-fever-anli-scifact-citint) (fine-tuned)
 - <img src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg" style="height: 1.2em; display: inline-block;"> [MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli](https://huggingface.co/MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli) (base)
+#### *Evidence Retrieval Models (question answering)*
+- <img src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg" style="height: 1.2em; display: inline-block;"> [deepset/deberta-v3-large-squad2](https://huggingface.co/deepset/deberta-v3-large-squad2) (Large)
+- <img src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg" style="height: 1.2em; display: inline-block;"> [distilbert-base-cased-distilled-squad](https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad) (Fast)
 #### *Datasets for fine-tuning*
 - <i class="fa-brands fa-github"></i> [allenai/SciFact](https://github.com/allenai/scifact) (SciFact)
 - <i class="fa-brands fa-github"></i> [ScienceNLP-Lab/Citation-Integrity](https://github.com/ScienceNLP-Lab/Citation-Integrity) (CitInt)
-"""
-)
-gr.Markdown(
-"""
 #### *Other sources*
 - <i class="fa-brands fa-github"></i> [xhluca/bm25s](https://github.com/xhluca/bm25s) (evidence retrieval)
 - <img src="https://plos.org/wp-content/uploads/2020/01/logo-color-blue.svg" style="height: 1.4em; display: inline-block;"> [Medicine](https://doi.org/10.1371/journal.pmed.0030197), <i class="fa-brands fa-wikipedia-w"></i> [CRISPR](https://en.wikipedia.org/wiki/CRISPR) (evidence retrieval examples)
@@ -335,6 +336,19 @@ with gr.Blocks(theme=my_theme, css=custom_css, head=font_awesome_html) as demo:
     pdf_file = f"examples/retrieval/{pdf_file}"
     return pdf_file, claim

+def retrieve_evidence_with_method(pdf_file, claim, top_k, method):
+    """
+    Retrieve evidence using the selected method
+    """
+    if method == "BM25S":
+        return retrieve_from_pdf(pdf_file, claim, k=top_k)
+    elif method == "LLM (Large)":
+        return retrieve_from_pdf_llm(pdf_file, claim, k=top_k)
+    elif method == "LLM (Fast)":
+        return retrieve_from_pdf_llm_fast(pdf_file, claim, k=top_k)
+    else:
+        return f"Unknown retrieval method: {method}"
+
 def append_feedback(
     claim: str, evidence: str, model: str, label: str, user_label: str
 ) -> None:
@@ -405,8 +419,8 @@ with gr.Blocks(theme=my_theme, css=custom_css, head=font_awesome_html) as demo:
 # Get evidence from PDF and run the model
 gr.on(
     triggers=[get_evidence.click],
-    fn=retrieve_from_pdf,
-    inputs=[pdf_file, claim, top_k],
+    fn=retrieve_evidence_with_method,
+    inputs=[pdf_file, claim, top_k, retrieval_method],
     outputs=evidence,
 ).then(
     fn=query_model,
@@ -465,8 +479,8 @@ with gr.Blocks(theme=my_theme, css=custom_css, head=font_awesome_html) as demo:
     outputs=[pdf_file, claim],
     api_name=False,
 ).then(
-    fn=retrieve_from_pdf,
-    inputs=[pdf_file, claim, top_k],
+    fn=retrieve_evidence_with_method,
+    inputs=[pdf_file, claim, top_k, retrieval_method],
     outputs=evidence,
     api_name=False,
 ).then(
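Note (not part of the commit): the if/elif dispatch in retrieve_evidence_with_method mirrors the gr.Radio choices by hand. A minimal dict-based sketch, using only functions that exist in this commit, shows one way to keep the choices and the retrieval functions defined in a single place:

# Sketch (not part of the commit): dict-based dispatch keeps the Radio choices
# and the retrieval functions in one mapping.
from retrieval import retrieve_from_pdf
from llm_retrieval import retrieve_from_pdf_llm, retrieve_from_pdf_llm_fast

RETRIEVERS = {
    "BM25S": retrieve_from_pdf,
    "LLM (Large)": retrieve_from_pdf_llm,
    "LLM (Fast)": retrieve_from_pdf_llm_fast,
}

def retrieve_evidence_with_method(pdf_file, claim, top_k, method):
    # Fall back to an error message for unknown methods, as in the committed code
    fn = RETRIEVERS.get(method)
    if fn is None:
        return f"Unknown retrieval method: {method}"
    return fn(pdf_file, claim, k=top_k)

# The gr.Radio choices could then be built as list(RETRIEVERS), so adding a
# retrieval method requires touching only one place.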
llm_retrieval.py ADDED
@@ -0,0 +1,237 @@
import re
import fitz  # pip install pymupdf
from unidecode import unidecode
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoTokenizer
import torch
from typing import List, Tuple, Optional
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class LLMEvidenceRetriever:
    """
    LLM-based evidence retrieval using extractive question answering
    """

    def __init__(self, model_name: str = "deepset/deberta-v3-large-squad2"):
        """
        Initialize the LLM evidence retriever

        Args:
            model_name: HuggingFace model for question answering
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.qa_pipeline = pipeline(
            "question-answering",
            model=model_name,
            tokenizer=self.tokenizer,
            device=0 if torch.cuda.is_available() else -1,
        )
        # Maximum context length for the model
        self.max_length = self.tokenizer.model_max_length
        logger.info(f"Initialized LLM retriever with model: {model_name}")

    def _extract_and_clean_text(self, pdf_file: str) -> str:
        """
        Extract and clean text from PDF file

        Args:
            pdf_file: Path to PDF file

        Returns:
            Cleaned text from PDF
        """
        # Get PDF file as binary
        with open(pdf_file, mode="rb") as f:
            pdf_file_bytes = f.read()

        # Extract text from the PDF
        pdf_doc = fitz.open(stream=pdf_file_bytes, filetype="pdf")
        pdf_text = ""
        for page_num in range(pdf_doc.page_count):
            page = pdf_doc.load_page(page_num)
            pdf_text += page.get_text("text")

        # Clean text
        # Remove hyphens at end of lines
        clean_text = re.sub("-\n", "", pdf_text)
        # Replace remaining newline characters with space
        clean_text = re.sub("\n", " ", clean_text)
        # Replace unicode with ascii
        clean_text = unidecode(clean_text)

        return clean_text

    def _chunk_text(self, text: str, max_chunk_size: int = 3000) -> List[str]:
        """
        Split text into chunks that fit within model context window

        Args:
            text: Input text to chunk
            max_chunk_size: Maximum size per chunk

        Returns:
            List of text chunks
        """
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Check if adding this sentence would exceed the limit
            if len(current_chunk) + len(sentence) + 1 <= max_chunk_size:
                current_chunk += " " + sentence if current_chunk else sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence

        # Add the last chunk
        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def _format_claim_as_question(self, claim: str) -> str:
        """
        Convert a claim into a question format for better QA performance

        Args:
            claim: Input claim

        Returns:
            Question formatted for QA model
        """
        # Simple heuristics to convert claims to questions
        claim = claim.strip()

        # If already a question, return as is
        if claim.endswith("?"):
            return claim

        # Convert common claim patterns to questions
        if claim.lower().startswith(("the ", "a ", "an ")):
            return f"What evidence supports that {claim.lower()}?"
        elif "is" in claim.lower() or "are" in claim.lower():
            return f"Is it true that {claim.lower()}?"
        elif "can" in claim.lower() or "could" in claim.lower():
            return f"{claim}?"
        else:
            return f"What evidence supports the claim that {claim.lower()}?"

    def retrieve_evidence(self, pdf_file: str, claim: str, k: int = 5) -> str:
        """
        Retrieve evidence from PDF using LLM-based question answering

        Args:
            pdf_file: Path to PDF file
            claim: Claim to find evidence for
            k: Number of evidence passages to retrieve

        Returns:
            Combined evidence text
        """
        try:
            # Extract and clean text from PDF
            clean_text = self._extract_and_clean_text(pdf_file)

            # Convert claim to question format
            question = self._format_claim_as_question(claim)

            # Split text into manageable chunks
            chunks = self._chunk_text(clean_text)

            # Get answers from each chunk
            answers = []
            for i, chunk in enumerate(chunks):
                try:
                    result = self.qa_pipeline(
                        question=question, context=chunk, max_answer_len=200, top_k=1
                    )

                    # Handle both single answer and list of answers
                    if isinstance(result, list):
                        result = result[0]

                    if result["score"] > 0.1:  # Confidence threshold
                        # Extract surrounding context for better evidence
                        answer_text = result["answer"]
                        start_idx = max(0, chunk.find(answer_text) - 100)
                        end_idx = min(
                            len(chunk), chunk.find(answer_text) + len(answer_text) + 100
                        )
                        context = chunk[start_idx:end_idx].strip()

                        answers.append(
                            {"text": context, "score": result["score"], "chunk_idx": i}
                        )

                except Exception as e:
                    logger.warning(f"Error processing chunk {i}: {str(e)}")
                    continue

            # Sort by confidence score and take top k
            answers.sort(key=lambda x: x["score"], reverse=True)
            top_answers = answers[:k]

            # Combine evidence passages
            if top_answers:
                evidence_texts = [answer["text"] for answer in top_answers]
                combined_evidence = " ".join(evidence_texts)
                return combined_evidence
            else:
                logger.warning("No evidence found with sufficient confidence")
                return "No relevant evidence found in the document."

        except Exception as e:
            logger.error(f"Error in LLM evidence retrieval: {str(e)}")
            return f"Error retrieving evidence: {str(e)}"


def retrieve_from_pdf_llm(pdf_file: str, query: str, k: int = 5) -> str:
    """
    Wrapper function for LLM-based evidence retrieval
    Compatible with the existing BM25S interface

    Args:
        pdf_file: Path to PDF file
        query: Query/claim to find evidence for
        k: Number of evidence passages to retrieve

    Returns:
        Retrieved evidence text
    """
    # Initialize retriever (in production, this should be cached)
    retriever = LLMEvidenceRetriever()
    return retriever.retrieve_evidence(pdf_file, query, k)


# Alternative lightweight model for faster inference
class LightweightLLMRetriever(LLMEvidenceRetriever):
    """
    Lightweight version using smaller, faster models
    """

    def __init__(self):
        super().__init__(model_name="distilbert-base-cased-distilled-squad")


def retrieve_from_pdf_llm_fast(pdf_file: str, query: str, k: int = 5) -> str:
    """
    Fast LLM-based evidence retrieval using lightweight model

    Args:
        pdf_file: Path to PDF file
        query: Query/claim to find evidence for
        k: Number of evidence passages to retrieve

    Returns:
        Retrieved evidence text
    """
    retriever = LightweightLLMRetriever()
    return retriever.retrieve_evidence(pdf_file, query, k)
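Note (not part of the commit): retrieve_from_pdf_llm and retrieve_from_pdf_llm_fast construct a new retriever, and therefore reload the QA model, on every call; the in-code comment already flags that production use should cache it. A minimal sketch of one possible caching layer on top of the committed classes:

# Sketch (not part of the commit): cache retriever instances so each QA model
# is loaded only once per process instead of on every "Get Evidence" click.
from functools import lru_cache

from llm_retrieval import LLMEvidenceRetriever, LightweightLLMRetriever


@lru_cache(maxsize=2)
def get_retriever(fast: bool = False) -> LLMEvidenceRetriever:
    # First call loads the model; later calls reuse the same instance
    return LightweightLLMRetriever() if fast else LLMEvidenceRetriever()


def retrieve_from_pdf_llm_cached(pdf_file: str, query: str, k: int = 5, fast: bool = False) -> str:
    # Hypothetical wrapper name; mirrors the committed function signatures
    return get_retriever(fast).retrieve_evidence(pdf_file, query, k)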