Spaces:

Sawon2023
/

llm-pdf-qa

Runtime error

Sawon2023 committed on Sep 20, 2023

Commit

b998950

1 Parent(s): 97c6a38

Updated pdf parsing

https://huggingface.co/spaces/fffiloni/langchain-chat-with-pdf/blob/main/app.py

Files changed (1) hide show

pdftoqa_generator.py CHANGED Viewed

@@ -5,7 +5,7 @@ import statistics
 import gradio as gr
 import pandas as pd
-from langchain.document_loaders import PyPDFLoader
 from langchain.text_splitter import (
     CharacterTextSplitter,
     RecursiveCharacterTextSplitter,
@@ -18,28 +18,30 @@ os.environ["OPENAI_API_KEY"] = "sk-"
 def pdf_parser(uploaded_file):
     bytes_data = uploaded_file.read()
     with NamedTemporaryFile(delete=False) as tmp:  # open a named temporary file
         tmp.write(bytes_data)                      # Write data from the uploaded file into it
         pdf_loader = PyPDFLoader(tmp.name)        # <---- now it works!
-        #pdf_loader = PyPDFLoader(file_path) only for file path offline
-        documents = pdf_loader.load()
-        documents_text = [d.page_content for d in documents]
-        text_splitter = RecursiveCharacterTextSplitter(
-            # Set a really small chunk size, just to show.
-            chunk_size=600,
-            chunk_overlap=200,
-            length_function=len,
-            is_separator_regex=False,
-        )
-        # Split the text into chunks
-        texts = text_splitter.create_documents(documents_text)
-        os.remove(tmp.name)                            # remove temp file
-        return texts
 def qa_generator(texts):

 import gradio as gr
 import pandas as pd
+from langchain.document_loaders import OnlinePDFLoader
 from langchain.text_splitter import (
     CharacterTextSplitter,
     RecursiveCharacterTextSplitter,
 def pdf_parser(uploaded_file):
+    '''
     bytes_data = uploaded_file.read()
     with NamedTemporaryFile(delete=False) as tmp:  # open a named temporary file
         tmp.write(bytes_data)                      # Write data from the uploaded file into it
         pdf_loader = PyPDFLoader(tmp.name)        # <---- now it works!
+    '''
+    #pdf_loader = PyPDFLoader(file_path) only for file path offline
+    pdf_loader=OnlinePDFLoader(uploaded_file.name) #https://huggingface.co/spaces/fffiloni/langchain-chat-with-pdf/blob/main/app.py
+    documents = pdf_loader.load()
+    documents_text = [d.page_content for d in documents]
+    text_splitter = RecursiveCharacterTextSplitter(
+        # Set a really small chunk size, just to show.
+        chunk_size=600,
+        chunk_overlap=200,
+        length_function=len,
+        is_separator_regex=False,
+    )
+    # Split the text into chunks
+    texts = text_splitter.create_documents(documents_text)
+    #os.remove(tmp.name)                            # remove temp file
+    return texts
 def qa_generator(texts):