"""Helpers for a medical RAG pipeline: dataset loading, vector store, LLM, prompt."""

import os

import pandas as pd
from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

# Read eagerly so a missing key fails at import time with a KeyError.
api_key = os.environ["PINECONE_API_KEY"]

# Name of the Pinecone index that create_vector_database() connects to.
PINECONE_INDEX_NAME = "medical-rag-index"

# Hugging Face dataset id used by load_raw_dataset().
_DATASET_ID = "lavita/ChatDoctor-HealthCareMagic-100k"

# Prompt text handed to PromptTemplate. The wording and whitespace (including
# the single line break after "By ") are kept exactly as authored because this
# is runtime text, not documentation.
_PROMPT_TEXT = (
    "Based on the following references and your medical knowledge, "
    "provide a detailed response: "
    "References: {context} "
    "Question: {question} "
    "By \n"
    "considering: "
    "1. The key medical concepts in the question. "
    "2. How the reference cases relate to this question. "
    "3. What medical principles should be applied. "
    "4. Any potential complications or considerations. "
    "Give the final response: "
)


def load_raw_dataset() -> list:
    """Load the ChatDoctor HealthCareMagic train split as LangChain documents.

    Each row becomes one ``Document`` whose ``page_content`` is the row's
    ``input`` and ``output`` joined by a single space, and whose metadata
    carries the two parts separately under ``"question"`` and ``"answer"``.

    Returns:
        A list of ``Document`` objects, one per row of the train split.
    """
    dataset = load_dataset(_DATASET_ID)
    df = pd.DataFrame(dataset["train"])
    df["combined"] = df["input"] + " " + df["output"]

    # Walk the three columns in lockstep instead of DataFrame.iterrows():
    # iterrows() builds a Series per row, which is needlessly slow here.
    return [
        Document(
            page_content=combined,
            metadata={"question": question, "answer": answer},
        )
        for combined, question, answer in zip(
            df["combined"], df["input"], df["output"]
        )
    ]


def create_vector_database(model_name: str) -> PineconeVectorStore:
    """Build a Pinecone-backed vector store using a Hugging Face embedder.

    This only opens a handle to the index named ``PINECONE_INDEX_NAME``; it
    does not create the index or upsert any documents.

    Args:
        model_name: Name of the embedding model passed to
            ``HuggingFaceEmbeddings``.

    Returns:
        A ``PineconeVectorStore`` bound to the index and the embedding model.

    Raises:
        KeyError: If ``PINECONE_API_KEY`` is not set in the environment.
    """
    client = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return PineconeVectorStore(
        index=client.Index(PINECONE_INDEX_NAME),
        embedding=embedder,
    )


def get_llm(model_name: str) -> HuggingFacePipeline:
    """Wrap a seq2seq Hugging Face checkpoint as a LangChain LLM.

    The tokenizer and the model weights are both loaded from ``model_name``.
    Previously the weights were hard-coded to ``"google/flan-t5-base"`` while
    the tokenizer followed ``model_name``, so any other argument paired a
    tokenizer with a model it was not trained with. Passing
    ``"google/flan-t5-base"`` behaves exactly as before.

    Args:
        model_name: Hub id or local path of an encoder-decoder (seq2seq)
            checkpoint.

    Returns:
        A ``HuggingFacePipeline`` running ``text2text-generation`` with
        sampling enabled (``temperature=1``) and up to 512 new tokens.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name, torch_dtype="auto", device_map="auto"
    )
    # NOTE: a decoder-only checkpoint would need AutoModelForCausalLM and the
    # "text-generation" task instead; an earlier draft of this function used
    # max_new_tokens=1024 and temperature=0.7 for that variant.
    generator = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=1,
        do_sample=True,
    )
    return HuggingFacePipeline(pipeline=generator)


def get_prompt_template() -> PromptTemplate:
    """Return the prompt used to answer a question from retrieved references.

    Returns:
        A ``PromptTemplate`` with the input variables ``context`` and
        ``question``.
    """
    return PromptTemplate(
        input_variables=["context", "question"],
        template=_PROMPT_TEXT,
    )