Gary committed on
Commit
1b9a516
·
1 Parent(s): c384c23

Fetch data from Pinecone

Browse files
Files changed (3) hide show
  1. app.py +1 -4
  2. indexer.py +17 -6
  3. requirements.txt +3 -1
app.py CHANGED
@@ -31,11 +31,8 @@ class CustomRAG:
31
 
32
 
33
  def answer_question(query):
34
- docs = load_raw_dataset()
35
  llm = get_llm("google/flan-t5-base")
36
- vector_database = create_vector_database(
37
- docs, "sentence-transformers/all-MiniLM-L6-v2"
38
- )
39
  prompt_template = get_prompt_template()
40
  rag = CustomRAG(
41
  vector_database,
 
31
 
32
 
33
  def answer_question(query):
 
34
  llm = get_llm("google/flan-t5-base")
35
+ vector_database = create_vector_database("sentence-transformers/all-MiniLM-L6-v2")
 
 
36
  prompt_template = get_prompt_template()
37
  rag = CustomRAG(
38
  vector_database,
indexer.py CHANGED
@@ -1,12 +1,20 @@
 
1
  from datasets import load_dataset
2
  import pandas as pd
3
  from langchain.schema import Document
4
  from langchain.embeddings import HuggingFaceEmbeddings
5
- from langchain.vectorstores import FAISS
6
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 
 
 
7
  from langchain.llms import HuggingFacePipeline
8
  from langchain.prompts import PromptTemplate
 
9
 
 
 
 
10
 
11
  def load_raw_dataset():
12
  dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
@@ -15,8 +23,6 @@ def load_raw_dataset():
15
 
16
  df["combined"] = df["input"] + " " + df["output"]
17
 
18
- df = df.sample(n=min(5000, len(df)), random_state=42)
19
-
20
  docs = [
21
  Document(
22
  page_content=row["combined"],
@@ -28,9 +34,14 @@ def load_raw_dataset():
28
  return docs
29
 
30
 
31
- def create_vector_database(docs, model_name):
 
 
 
32
  embedding_model = HuggingFaceEmbeddings(model_name=model_name)
33
- vectorstore = FAISS.from_documents(docs, embedding_model)
 
 
34
  return vectorstore
35
 
36
 
 
1
+ from pinecone import Pinecone
2
  from datasets import load_dataset
3
  import pandas as pd
4
  from langchain.schema import Document
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
+ from transformers import (
7
+ AutoTokenizer,
8
+ pipeline,
9
+ AutoModelForSeq2SeqLM,
10
+ )
11
  from langchain.llms import HuggingFacePipeline
12
  from langchain.prompts import PromptTemplate
13
+ import os
14
 
15
+ api_key = os.environ["PINECONE_API_KEY"]
16
+
17
+ from langchain_pinecone import PineconeVectorStore
18
 
19
  def load_raw_dataset():
20
  dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
 
23
 
24
  df["combined"] = df["input"] + " " + df["output"]
25
 
 
 
26
  docs = [
27
  Document(
28
  page_content=row["combined"],
 
34
  return docs
35
 
36
 
37
+ def create_vector_database(model_name):
38
+ PINECONE_INDEX_NAME = "medical-rag-index"
39
+ pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
40
+
41
  embedding_model = HuggingFaceEmbeddings(model_name=model_name)
42
+
43
+ index = pc.Index(PINECONE_INDEX_NAME)
44
+ vectorstore = PineconeVectorStore(index=index, embedding=embedding_model)
45
  return vectorstore
46
 
47
 
requirements.txt CHANGED
@@ -7,4 +7,6 @@ faiss-cpu
7
  huggingface-hub
8
  praw
9
  langchain-community
10
- accelerate
 
 
 
7
  huggingface-hub
8
  praw
9
  langchain-community
10
+ accelerate
11
+ langchain-pinecone
12
+ pinecone