import os

import pandas as pd
from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    pipeline,
)

api_key = os.environ["PINECONE_API_KEY"]

def load_raw_dataset():
    """Load the HealthCareMagic-100k Q&A dataset and wrap each row as a LangChain Document."""
    dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

    df = pd.DataFrame(dataset["train"])

    # Concatenate the patient question and the doctor's answer so both are embedded together.
    df["combined"] = df["input"] + " " + df["output"]

    docs = [
        Document(
            page_content=row["combined"],
            metadata={"question": row["input"], "answer": row["output"]},
        )
        for _, row in df.iterrows()
    ]

    return docs

def create_vector_database(model_name):
    """Connect to the existing Pinecone index and wrap it in a LangChain vector store."""
    PINECONE_INDEX_NAME = "medical-rag-index"
    pc = Pinecone(api_key=api_key)

    # The same embedding model must be used at query time as was used to index the documents.
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    index = pc.Index(PINECONE_INDEX_NAME)
    vectorstore = PineconeVectorStore(index=index, embedding=embedding_model)
    return vectorstore
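
# A minimal sketch of how the index could be populated the first time, assuming the
# "medical-rag-index" index already exists in Pinecone with a dimension matching the
# chosen embedding model (e.g. 384 for sentence-transformers/all-MiniLM-L6-v2). The
# helper name and batch size are illustrative, not part of the original pipeline.
def populate_vector_database(model_name, batch_size=100):
    docs = load_raw_dataset()
    vectorstore = create_vector_database(model_name)
    # add_documents embeds each Document with the HuggingFaceEmbeddings model and
    # upserts the vectors, together with their metadata, into the Pinecone index.
    for start in range(0, len(docs), batch_size):
        vectorstore.add_documents(docs[start : start + batch_size])
    return vectorstore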

def get_llm(model_name):
    """Build a Hugging Face text2text-generation pipeline and expose it as a LangChain LLM."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Load the model from the same checkpoint as the tokenizer; flan-t5-style
    # checkpoints are encoder-decoder models, hence AutoModelForSeq2SeqLM.
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name, torch_dtype="auto", device_map="auto"
    )

    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=1,
        do_sample=True,
    )

    llm = HuggingFacePipeline(pipeline=pipe)
    return llm

def get_prompt_template():
    """Prompt that injects the retrieved reference cases as context for the LLM."""
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""Based on the following references and your medical knowledge, provide a detailed response:

References:
{context}

Question: {question}

Consider:
1. The key medical concepts in the question.
2. How the reference cases relate to this question.
3. What medical principles should be applied.
4. Any potential complications or considerations.

Give the final response:
""",
    )

    return prompt_template
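
# A minimal sketch of how these pieces could be wired together, assuming LangChain's
# RetrievalQA chain with the "stuff" chain type (all retrieved reference cases are
# stuffed into {context}). The model names, k value, and sample question below are
# illustrative choices, not part of the original code.
if __name__ == "__main__":
    from langchain.chains import RetrievalQA

    vectorstore = create_vector_database("sentence-transformers/all-MiniLM-L6-v2")
    llm = get_llm("google/flan-t5-base")
    prompt = get_prompt_template()

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        chain_type_kwargs={"prompt": prompt},
    )

    result = qa_chain.invoke({"query": "What could cause persistent lower back pain?"})
    print(result["result"])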