Sawon2023 committed on
Commit
b998950
·
1 Parent(s): 97c6a38

Updated pdf parsing

Browse files

https://huggingface.co/spaces/fffiloni/langchain-chat-with-pdf/blob/main/app.py

Files changed (1) hide show
  1. pdftoqa_generator.py +20 -18
pdftoqa_generator.py CHANGED
@@ -5,7 +5,7 @@ import statistics
5
 
6
  import gradio as gr
7
  import pandas as pd
8
- from langchain.document_loaders import PyPDFLoader
9
  from langchain.text_splitter import (
10
  CharacterTextSplitter,
11
  RecursiveCharacterTextSplitter,
@@ -18,28 +18,30 @@ os.environ["OPENAI_API_KEY"] = "sk-"
18
 
19
 
20
  def pdf_parser(uploaded_file):
 
21
  bytes_data = uploaded_file.read()
22
  with NamedTemporaryFile(delete=False) as tmp: # open a named temporary file
23
  tmp.write(bytes_data) # Write data from the uploaded file into it
24
  pdf_loader = PyPDFLoader(tmp.name) # <---- now it works!
25
-
26
- #pdf_loader = PyPDFLoader(file_path) only for file path offline
27
-
28
- documents = pdf_loader.load()
29
- documents_text = [d.page_content for d in documents]
30
-
31
- text_splitter = RecursiveCharacterTextSplitter(
32
- # Set a really small chunk size, just to show.
33
- chunk_size=600,
34
- chunk_overlap=200,
35
- length_function=len,
36
- is_separator_regex=False,
37
- )
38
 
39
- # Split the text into chunks
40
- texts = text_splitter.create_documents(documents_text)
41
- os.remove(tmp.name) # remove temp file
42
- return texts
 
 
 
 
 
 
 
 
 
 
 
43
 
44
 
45
  def qa_generator(texts):
 
5
 
6
  import gradio as gr
7
  import pandas as pd
8
+ from langchain.document_loaders import OnlinePDFLoader
9
  from langchain.text_splitter import (
10
  CharacterTextSplitter,
11
  RecursiveCharacterTextSplitter,
 
18
 
19
 
20
  def pdf_parser(uploaded_file):
21
+ '''
22
  bytes_data = uploaded_file.read()
23
  with NamedTemporaryFile(delete=False) as tmp: # open a named temporary file
24
  tmp.write(bytes_data) # Write data from the uploaded file into it
25
  pdf_loader = PyPDFLoader(tmp.name) # <---- now it works!
26
+ '''
27
+ #pdf_loader = PyPDFLoader(file_path) only for file path offline
28
+ pdf_loader=OnlinePDFLoader(uploaded_file.name) #https://huggingface.co/spaces/fffiloni/langchain-chat-with-pdf/blob/main/app.py
 
 
 
 
 
 
 
 
 
 
29
 
30
+ documents = pdf_loader.load()
31
+ documents_text = [d.page_content for d in documents]
32
+
33
+ text_splitter = RecursiveCharacterTextSplitter(
34
+ # Set a really small chunk size, just to show.
35
+ chunk_size=600,
36
+ chunk_overlap=200,
37
+ length_function=len,
38
+ is_separator_regex=False,
39
+ )
40
+
41
+ # Split the text into chunks
42
+ texts = text_splitter.create_documents(documents_text)
43
+ #os.remove(tmp.name) # remove temp file
44
+ return texts
45
 
46
 
47
  def qa_generator(texts):