Prakhar Bhandari committed
Commit · b77d203
Parent(s): babec93

First attempt at incorporating multiple graphs
Files changed:
- kg_builder/src/__pycache__/api_connections.cpython-39.pyc +0 -0
- kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc +0 -0
- kg_builder/src/__pycache__/models.cpython-39.pyc +0 -0
- kg_builder/src/__pycache__/utils.cpython-39.pyc +0 -0
- kg_builder/src/api_connections.py +25 -42
- kg_builder/src/graph_creation.py +90 -0
- kg_builder/src/knowledge_graph_builder.py +4 -1
- kg_builder/src/main.py +31 -21
- kg_builder/src/query_graph.py +1 -1
kg_builder/src/__pycache__/api_connections.cpython-39.pyc CHANGED
Binary files a/kg_builder/src/__pycache__/api_connections.cpython-39.pyc and b/kg_builder/src/__pycache__/api_connections.cpython-39.pyc differ

kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc CHANGED
Binary files a/kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc and b/kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc differ

kg_builder/src/__pycache__/models.cpython-39.pyc CHANGED
Binary files a/kg_builder/src/__pycache__/models.cpython-39.pyc and b/kg_builder/src/__pycache__/models.cpython-39.pyc differ

kg_builder/src/__pycache__/utils.cpython-39.pyc CHANGED
Binary files a/kg_builder/src/__pycache__/utils.cpython-39.pyc and b/kg_builder/src/__pycache__/utils.cpython-39.pyc differ
kg_builder/src/api_connections.py CHANGED
@@ -11,17 +11,21 @@ from typing import Optional, List
 
 load_dotenv()  # This loads the variables from .env into os.environ
 
-
-
-
-
-
+def get_graph_connection(category):
+    if category == "Chemotherapy":
+        url = os.getenv("CHEMO_NEO4J_URL")
+        username = os.getenv("CHEMO_NEO4J_USERNAME")
+        password = os.getenv("CHEMO_NEO4J_PASSWORD")
+    elif category == "Traffic Law":
+        url = os.getenv("TRAFFIC_NEO4J_URL")
+        username = os.getenv("TRAFFIC_NEO4J_USERNAME")
+        password = os.getenv("TRAFFIC_NEO4J_PASSWORD")
+    else:
+        raise ValueError(f"Unknown category: {category}")
 
-
-        url=url,
-        username=username,
-        password=password
-    )
+    return Neo4jGraph(url=url, username=username, password=password)
+
+openai_api_key = os.getenv("OPENAI_API_KEY")
 
 def get_llm():
     api_key = os.getenv("OPENAI_API_KEY")
@@ -30,44 +34,23 @@ def get_llm():
     return ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
 
 def get_extraction_chain(
+    category,
     allowed_nodes: Optional[List[str]] = None,
     allowed_rels: Optional[List[str]] = None
 ):
+    if category == "Chemotherapy":
+        # Chemotherapy-specific prompt
+        prompt_text = ""
+    elif category == "Traffic Law":
+        # Traffic Law-specific prompt
+        prompt_text = "[Traffic Law-specific instructions]"
+    else:
+        raise ValueError("Unknown category")
+
     llm = get_llm()
     prompt = ChatPromptTemplate.from_messages(
         [(
-        "system",
-        f"""# Knowledge Graph Instructions for GPT-4
-## 1. Overview
-You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
-- **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
-- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.
-
-## 2. Labeling Nodes
-- **Consistency**: Utilize uniform labels for node types to maintain clarity.
-  - For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**.
-- **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
-
-{'- **Allowed Node Labels:**' + ", ".join(['Drug', 'Symptom', 'Treatment', 'MedicalCondition', 'ResearchStudy']) if allowed_nodes else ""}
-{'- **Allowed Relationship Types**:' + ", ".join(['Treats', 'Causes', 'Researches', 'Recommends']) if allowed_rels else ""}
-
-## 3. Handling Numerical Data and Dates
-- Integrate numerical data and dates as attributes of the corresponding nodes.
-- **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
-- **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.
-
-## 4. Coreference Resolution
-- **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
-  - For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.
-
-## 5. Relationship Naming Conventions
-- **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
-  - For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT" and "CAN_RESULT_FROM" instead of "CANRESULTFROM"; relationships keep being stored without the "_" between words, and further errors of this kind will cause the output to be discarded.
-- **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.
-
-## 6. Strict Compliance
-Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
-"""),
+        "system", prompt_text),
         ("human", "Use the given format to extract information from the following input: {input}"),
         ("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
     ])
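Note that the new get_graph_connection assumes one Neo4j instance per category, configured entirely through environment variables. A minimal .env sketch covering the names this diff reads — the values are placeholders, not part of the commit:

    # .env — placeholder values; only the variable names come from this diff
    CHEMO_NEO4J_URL=bolt://localhost:7687
    CHEMO_NEO4J_USERNAME=neo4j
    CHEMO_NEO4J_PASSWORD=<chemo-db-password>
    TRAFFIC_NEO4J_URL=bolt://localhost:7688
    TRAFFIC_NEO4J_USERNAME=neo4j
    TRAFFIC_NEO4J_PASSWORD=<traffic-db-password>
    OPENAI_API_KEY=<openai-key>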
kg_builder/src/graph_creation.py ADDED
@@ -0,0 +1,90 @@
+from langchain_community.document_loaders import WikipediaLoader
+from langchain.text_splitter import TokenTextSplitter
+from knowledge_graph_builder import extract_and_store_graph
+from langchain.schema import Document
+from dotenv import load_dotenv
+from tqdm import tqdm
+import os
+
+# Load environment variables
+load_dotenv()
+
+# Define articles to load
+articles = {
+    "Chemotherapy": "Chemotherapy",
+    "Traffic Law": "Traffic laws in the United States"
+}
+
+def build_graph_for_article(article_name, category):
+    print(f"Loading documents for: {article_name}")
+    # Load and process the Wikipedia article
+    raw_documents = WikipediaLoader(query=article_name).load()
+    if not raw_documents:
+        print(f"Failed to load content for {article_name}")
+        return
+
+    text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)
+    documents = text_splitter.split_documents(raw_documents[:5])  # Only process the first 5 documents
+
+    print("Building the knowledge graph...")
+    for i, document in tqdm(enumerate(documents), total=len(documents)):
+        extract_and_store_graph(document, category)
+
+def main():
+    for category, title in articles.items():
+        build_graph_for_article(title, category)
+
+if __name__ == "__main__":
+    main()
+
+# import os
+# from openai import OpenAI
+# from api_connections import get_graph_connection
+# from knowledge_graph_builder import extract_and_store_graph
+# from query_graph import query_knowledge_graph
+# from langchain_community.document_loaders import WikipediaLoader
+# from langchain.text_splitter import TokenTextSplitter
+# from tqdm import tqdm
+
+# def get_llm():
+#     api_key = os.getenv("OPENAI_API_KEY")
+#     if not api_key:
+#         raise ValueError("No OpenAI API key found in environment variables.")
+#     return OpenAI(api_key=api_key)
+
+# def classify_query(query):
+#     llm = get_llm()
+#     response = llm.Completion.create(
+#         model="text-davinci-003",  # Consider updating to the latest model as necessary
+#         prompt=f"Classify the following query into 'Chemotherapy' or 'Traffic Law': {query}",
+#         max_tokens=60
+#     )
+#     return response.choices[0].text.strip()
+
+# def main():
+#     print("Starting the script...")
+#     # Take Wikipedia article name as input
+#     article_name = input("Enter the Wikipedia article name: ")
+
+#     print(f"Loading documents for: {article_name}")
+#     # Load and process the Wikipedia article
+#     raw_documents = WikipediaLoader(query=article_name).load()
+#     text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)
+#     documents = text_splitter.split_documents(raw_documents[:5])  # Only process the first 5 documents
+
+#     print("Building the knowledge graph...")
+#     # Build the knowledge graph from the documents
+#     for i, d in tqdm(enumerate(documents), total=len(documents)):
+#         extract_and_store_graph(d)
+
+#     print("Graph construction complete. Please enter your query.")
+#     # Take a query related to the graph
+#     user_query = input("Enter your query related to the graph: ")
+
+#     print(f"Querying the graph with: {user_query}")
+#     # Query the graph and print the answer
+#     answer = query_knowledge_graph(user_query)
+#     print("Answer to your query:", answer)
+
+# if __name__ == "__main__":
+#     main()
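Since build_graph_for_article is importable, a single category can also be rebuilt without looping over the whole articles dict. A hypothetical usage sketch, using only names defined in graph_creation.py above:

    # Hypothetical one-off rebuild of just the traffic-law graph
    from graph_creation import build_graph_for_article

    build_graph_for_article("Traffic laws in the United States", "Traffic Law")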
kg_builder/src/knowledge_graph_builder.py CHANGED
@@ -1,5 +1,5 @@
 
-from api_connections import
+from api_connections import get_graph_connection
 
 from langchain_community.graphs.graph_document import (
     Node as BaseNode,
@@ -22,8 +22,11 @@ from langchain.chains.openai_functions import (
 
 def extract_and_store_graph(
     document: Document,
+    category: str,
     nodes: Optional[List[str]] = None,
     rels: Optional[List[str]] = None) -> None:
+
+    graph = get_graph_connection(category)
     # Extract graph data using OpenAI functions
     extract_chain = get_extraction_chain(nodes, rels)
     data = extract_chain.invoke(document.page_content)['function']
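One thing to note: the unchanged context line above still calls get_extraction_chain(nodes, rels), but after this commit the function's first parameter is category, so nodes would be bound to it. A sketch of the presumably intended call — an assumption, not something this commit contains (and get_extraction_chain would also need to be imported from api_connections if it is not already):

    # Assumed follow-up fix: thread the category through to the extraction chain
    extract_chain = get_extraction_chain(category, nodes, rels)
    data = extract_chain.invoke(document.page_content)['function']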
kg_builder/src/main.py CHANGED
@@ -1,33 +1,43 @@
+import os
+from openai import OpenAI
+from api_connections import get_graph_connection
 from knowledge_graph_builder import extract_and_store_graph
 from query_graph import query_knowledge_graph
 from langchain_community.document_loaders import WikipediaLoader
 from langchain.text_splitter import TokenTextSplitter
 from tqdm import tqdm
 
-def
-
-
-
-
-    print(f"Loading documents for: {article_name}")
-    # Load and process the Wikipedia article
-    raw_documents = WikipediaLoader(query=article_name).load()
-    text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)
-    documents = text_splitter.split_documents(raw_documents[:5])  # Only process the first 5 documents
+def get_llm():
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        raise ValueError("No OpenAI API key found in environment variables.")
+    return OpenAI(api_key=api_key)
 
-
-
-
-
+def classify_query(query):
+    llm = get_llm()
+    response = llm.Completion.create(
+        model="text-davinci-003",  # Consider updating to the latest model as necessary
+        prompt=f"Classify the following query into 'Chemotherapy' or 'Traffic Law': {query}",
+        max_tokens=60
+    )
+    return response.choices[0].text.strip()
 
-
-
-    user_query = input("Enter your query related to the graph: ")
+def main():
+    print("Starting the script...")
 
-
-
-
-
+    # Get user query
+    query = input("Please enter your query: ")
+
+    # Classify the query
+    category = classify_query(query)
+    print(f"Query classified into category: {category}")
+
+    # Get the correct graph connection
+    graph = get_graph_connection(category)
+
+    # Query the correct graph
+    result = query_knowledge_graph(graph, query)
+    print(f"Query result: {result}")
 
 if __name__ == "__main__":
     main()
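As committed, classify_query calls llm.Completion.create on an openai.OpenAI client, but the v1 Python SDK exposes no Completion attribute (that spelling belongs to the pre-1.0 module API), and text-davinci-003 has since been retired. A minimal sketch of the same classification step against the v1 chat API — the model name is an assumption, chosen to match the chat models used elsewhere in this repo:

    # Sketch assuming the OpenAI v1 SDK; not part of this commit.
    def classify_query(query):
        client = get_llm()
        # v1 SDK: chat completions live under client.chat.completions
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",  # assumption; any current chat model works
            messages=[{
                "role": "user",
                "content": f"Classify the following query into 'Chemotherapy' or 'Traffic Law': {query}",
            }],
            max_tokens=60,
        )
        return response.choices[0].message.content.strip()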
kg_builder/src/query_graph.py CHANGED
@@ -2,7 +2,7 @@ from langchain.chains import GraphCypherQAChain
 from langchain_openai import ChatOpenAI
 from api_connections import graph  # Importing 'graph' from 'api_connections.py'
 
-def query_knowledge_graph(query):
+def query_knowledge_graph(graph, query):
     print("Refreshing the graph schema...")
     # Refresh the graph schema before querying
     graph.refresh_schema()
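The context line from api_connections import graph is now stale: this commit removes the module-level graph from api_connections.py, so the import will fail even though graph now arrives as a parameter. A sketch of the presumably intended module head — an assumption, with the chain construction modeled on GraphCypherQAChain's documented usage rather than on code shown in this diff:

    # Assumed cleanup, not in this commit.
    from langchain.chains import GraphCypherQAChain
    from langchain_openai import ChatOpenAI

    def query_knowledge_graph(graph, query):
        # 'graph' is supplied by the caller (see main.py); the stale
        # module-level import can simply be deleted.
        print("Refreshing the graph schema...")
        graph.refresh_schema()
        # Depending on the installed langchain version,
        # allow_dangerous_requests=True may also be required here.
        chain = GraphCypherQAChain.from_llm(
            ChatOpenAI(temperature=0), graph=graph, verbose=True
        )
        return chain.invoke({"query": query})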