import os

import pandas as pd
from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from transformers import (
    AutoTokenizer,
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
)

api_key = os.environ["PINECONE_API_KEY"]


def load_raw_dataset():
    """Load the HealthCareMagic-100k Q&A dataset as LangChain Documents."""
    dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

    df = pd.DataFrame(dataset["train"])

    # Concatenate each patient question with its doctor answer so the full
    # Q&A pair is embedded and retrievable as a single document.
    df["combined"] = df["input"] + " " + df["output"]

    docs = [
        Document(
            page_content=row["combined"],
            metadata={"question": row["input"], "answer": row["output"]},
        )
        for _, row in df.iterrows()
    ]

    return docs


def create_vector_database(model_name):
    """Wrap an existing Pinecone index in a LangChain vector store.

    Assumes the index has already been created with a dimension matching
    the chosen embedding model.
    """
    PINECONE_INDEX_NAME = "medical-rag-index"
    pc = Pinecone(api_key=api_key)

    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    index = pc.Index(PINECONE_INDEX_NAME)
    vectorstore = PineconeVectorStore(index=index, embedding=embedding_model)
    return vectorstore
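

# --- Hypothetical indexing step (not in the original file) -------------------
# create_vector_database() wraps an existing index, but nothing in this file
# ever writes the documents from load_raw_dataset() into it. A minimal sketch
# of that step, assuming the "medical-rag-index" index exists with a dimension
# matching the embedding model; index_documents is an illustrative helper:
def index_documents(vectorstore, docs, batch_size=100):
    # add_documents() embeds each Document's page_content with the store's
    # embedding model and upserts the vectors, metadata included. Batching
    # keeps individual upsert requests small.
    for i in range(0, len(docs), batch_size):
        vectorstore.add_documents(docs[i : i + batch_size])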


def get_llm(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Load the model from the same checkpoint as the tokenizer; pairing a
    # tokenizer from model_name with a hard-coded model gives mismatched
    # vocabularies.
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name, torch_dtype="auto", device_map="auto"
    )

    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=1,
        do_sample=True,
    )

    # Alternative path for decoder-only (causal) models:
    # model = AutoModelForCausalLM.from_pretrained(
    #     model_name, torch_dtype="auto", device_map="auto"
    # )
    # pipe = pipeline(
    #     "text-generation",
    #     model=model,
    #     tokenizer=tokenizer,
    #     max_new_tokens=1024,
    #     temperature=0.7,
    #     do_sample=True,
    # )

    llm = HuggingFacePipeline(pipeline=pipe)
    return llm


def get_prompt_template():
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""Based on the following references and your medical knowledge, provide a detailed response.

        References:
        {context}

        Question: {question}

        Consider:
        1. The key medical concepts in the question.
        2. How the reference cases relate to this question.
        3. Which medical principles apply.
        4. Any potential complications or considerations.

        Then give the final response:
        """,
    )

    return prompt_template
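

# --- Hypothetical end-to-end wiring (not in the original file) ---------------
# A minimal sketch of how the functions above could be combined. The embedding
# model name and the sample question are assumptions; "google/flan-t5-base"
# is the seq2seq checkpoint referenced in the original code.
if __name__ == "__main__":
    docs = load_raw_dataset()
    # Assumes the index dimension matches this model (all-MiniLM-L6-v2 is 384).
    vectorstore = create_vector_database("sentence-transformers/all-MiniLM-L6-v2")
    llm = get_llm("google/flan-t5-base")
    prompt = get_prompt_template()

    question = "What could cause persistent lower back pain?"
    # Retrieve the 3 most similar Q&A pairs and stuff them into the prompt.
    retrieved = vectorstore.similarity_search(question, k=3)
    context = "\n\n".join(doc.page_content for doc in retrieved)
    answer = llm.invoke(prompt.format(context=context, question=question))
    print(answer)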