Gary
committed on
Commit
·
1b9a516
1
Parent(s):
c384c23
Fetch data from Pinecone
Browse files- app.py +1 -4
- indexer.py +17 -6
- requirements.txt +3 -1
app.py
CHANGED
@@ -31,11 +31,8 @@ class CustomRAG:
|
|
31 |
|
32 |
|
33 |
def answer_question(query):
|
34 |
-
docs = load_raw_dataset()
|
35 |
llm = get_llm("google/flan-t5-base")
|
36 |
-
vector_database = create_vector_database(
|
37 |
-
docs, "sentence-transformers/all-MiniLM-L6-v2"
|
38 |
-
)
|
39 |
prompt_template = get_prompt_template()
|
40 |
rag = CustomRAG(
|
41 |
vector_database,
|
|
|
31 |
|
32 |
|
33 |
def answer_question(query):
|
|
|
34 |
llm = get_llm("google/flan-t5-base")
|
35 |
+
vector_database = create_vector_database("sentence-transformers/all-MiniLM-L6-v2")
|
|
|
|
|
36 |
prompt_template = get_prompt_template()
|
37 |
rag = CustomRAG(
|
38 |
vector_database,
|
indexer.py
CHANGED
@@ -1,12 +1,20 @@
|
|
|
|
1 |
from datasets import load_dataset
|
2 |
import pandas as pd
|
3 |
from langchain.schema import Document
|
4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
-
from
|
6 |
-
|
|
|
|
|
|
|
7 |
from langchain.llms import HuggingFacePipeline
|
8 |
from langchain.prompts import PromptTemplate
|
|
|
9 |
|
|
|
|
|
|
|
10 |
|
11 |
def load_raw_dataset():
|
12 |
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
|
@@ -15,8 +23,6 @@ def load_raw_dataset():
|
|
15 |
|
16 |
df["combined"] = df["input"] + " " + df["output"]
|
17 |
|
18 |
-
df = df.sample(n=min(5000, len(df)), random_state=42)
|
19 |
-
|
20 |
docs = [
|
21 |
Document(
|
22 |
page_content=row["combined"],
|
@@ -28,9 +34,14 @@ def load_raw_dataset():
|
|
28 |
return docs
|
29 |
|
30 |
|
31 |
-
def create_vector_database(
|
|
|
|
|
|
|
32 |
embedding_model = HuggingFaceEmbeddings(model_name=model_name)
|
33 |
-
|
|
|
|
|
34 |
return vectorstore
|
35 |
|
36 |
|
|
|
1 |
+
from pinecone import Pinecone
|
2 |
from datasets import load_dataset
|
3 |
import pandas as pd
|
4 |
from langchain.schema import Document
|
5 |
from langchain.embeddings import HuggingFaceEmbeddings
|
6 |
+
from transformers import (
|
7 |
+
AutoTokenizer,
|
8 |
+
pipeline,
|
9 |
+
AutoModelForSeq2SeqLM,
|
10 |
+
)
|
11 |
from langchain.llms import HuggingFacePipeline
|
12 |
from langchain.prompts import PromptTemplate
|
13 |
+
import os
|
14 |
|
15 |
+
api_key = os.environ["PINECONE_API_KEY"]
|
16 |
+
|
17 |
+
from langchain_pinecone import PineconeVectorStore
|
18 |
|
19 |
def load_raw_dataset():
|
20 |
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
|
|
|
23 |
|
24 |
df["combined"] = df["input"] + " " + df["output"]
|
25 |
|
|
|
|
|
26 |
docs = [
|
27 |
Document(
|
28 |
page_content=row["combined"],
|
|
|
34 |
return docs
|
35 |
|
36 |
|
37 |
+
def create_vector_database(model_name):
|
38 |
+
PINECONE_INDEX_NAME = "medical-rag-index"
|
39 |
+
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
|
40 |
+
|
41 |
embedding_model = HuggingFaceEmbeddings(model_name=model_name)
|
42 |
+
|
43 |
+
index = pc.Index(PINECONE_INDEX_NAME)
|
44 |
+
vectorstore = PineconeVectorStore(index=index, embedding=embedding_model)
|
45 |
return vectorstore
|
46 |
|
47 |
|
requirements.txt
CHANGED
@@ -7,4 +7,6 @@ faiss-cpu
|
|
7 |
huggingface-hub
|
8 |
praw
|
9 |
langchain-community
|
10 |
-
accelerate
|
|
|
|
|
|
7 |
huggingface-hub
|
8 |
praw
|
9 |
langchain-community
|
10 |
+
accelerate
|
11 |
+
langchain-pinecone
|
12 |
+
pinecone
|