Master-Thesis-Prakhar
/

GraphRAG

Model card Files Files and versions Community

GraphRAG / kg_builder /src /api_connections.py

Prakhar Bhandari

updated prompt and function for traffic law wiki

0beb8e1 about 1 year ago

7.39 kB


	from langchain_community.graphs import Neo4jGraph
	from dotenv import load_dotenv
	import os
	from langchain.chains.openai_functions import create_structured_output_chain
	from langchain_openai import ChatOpenAI
	from langchain.prompts import ChatPromptTemplate
	from models import KnowledgeGraph
	from typing import Optional, List


	# Load variables from a .env file into os.environ at import time, before any of
	# the os.getenv() lookups in this module run.
	load_dotenv() # This loads the variables from .env into os.environ

	def get_graph_connection(category):
	    """Create a Neo4jGraph client for the knowledge graph of ``category``.

	    Connection settings are read from environment variables named
	    ``<PREFIX>_NEO4J_URL``, ``<PREFIX>_NEO4J_USERNAME`` and
	    ``<PREFIX>_NEO4J_PASSWORD``, where the prefix is ``CHEMO`` for
	    "Chemotherapy" and ``TRAFFIC`` for "Traffic Law".

	    Args:
	        category: Either "Chemotherapy" or "Traffic Law".

	    Returns:
	        The object produced by ``Neo4jGraph(url=..., username=..., password=...)``.
	        A variable that is not set is passed through as ``None``.

	    Raises:
	        ValueError: If ``category`` is not one of the two supported names.
	    """
	    env_prefixes = (("Chemotherapy", "CHEMO"), ("Traffic Law", "TRAFFIC"))
	    prefix = next(
	        (candidate for name, candidate in env_prefixes if category == name),
	        None,
	    )
	    if prefix is None:
	        raise ValueError(f"Unknown category: {category}")

	    # Field names double as the Neo4jGraph keyword arguments.
	    credentials = {
	        field: os.getenv(f"{prefix}_NEO4J_{field.upper()}")
	        for field in ("url", "username", "password")
	    }
	    return Neo4jGraph(**credentials)

	# Module-level snapshot of OPENAI_API_KEY taken at import time (None when unset).
	# Nothing in the visible part of this file reads it; get_llm() queries the
	# environment again on each call.
	# NOTE(review): possibly imported by other modules - confirm before removing.
	openai_api_key = os.getenv("OPENAI_API_KEY")

	def get_llm():
	    """Return the chat model used for knowledge-graph extraction.

	    The OPENAI_API_KEY environment variable is only checked for presence
	    here; it is not passed to the model constructor explicitly.
	    NOTE(review): presumably ChatOpenAI picks the key up from the
	    environment on its own - confirm against the installed version.

	    Returns:
	        ``ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)``.

	    Raises:
	        ValueError: If OPENAI_API_KEY is unset or empty.
	    """
	    if os.getenv("OPENAI_API_KEY"):
	        return ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
	    raise ValueError("No OpenAI API key found in environment variables.")

	def _format_allowed(header: str, values: Optional[List[str]]) -> str:
	    """Render one "allowed ..." prompt line, or "" when no values are given."""
	    if not values:
	        return ""
	    # The text ends up inside a ChatPromptTemplate, where single braces mark
	    # template variables; double them so a label is always treated literally.
	    listing = ", ".join(values).replace("{", "{{").replace("}", "}}")
	    return header + listing


	def get_extraction_chain(
	    category,
	    allowed_nodes: Optional[List[str]] = None,
	    allowed_rels: Optional[List[str]] = None
	):
	    """Build the chain that extracts a KnowledgeGraph from a piece of text.

	    A category-specific system prompt is combined with two human messages
	    (the first carries the ``input`` template variable) and handed, together
	    with the model from ``get_llm()``, to ``create_structured_output_chain``.

	    Args:
	        category: "Chemotherapy" or "Traffic Law"; selects the system prompt.
	        allowed_nodes: Optional node labels to list in the prompt as the
	            allowed labels. When empty or None the line is left blank.
	        allowed_rels: Optional relationship types to list in the prompt as
	            the allowed types. When empty or None the line is left blank.

	    Returns:
	        The result of ``create_structured_output_chain(KnowledgeGraph, llm, prompt)``.

	    Raises:
	        ValueError: If ``category`` is not supported, or (from ``get_llm``)
	            if no OpenAI API key is configured.
	    """
	    # Previously these arguments only switched a hard-coded list on or off, so
	    # callers could not actually constrain the schema. Use their values.
	    node_line = _format_allowed("- Allowed Node Labels:", allowed_nodes)
	    rel_line = _format_allowed("- Allowed Relationship Types:", allowed_rels)

	    if category == "Chemotherapy":
	        # Chemotherapy-specific prompt
	        prompt_text = f"""# Knowledge Graph Instructions for GPT-4
	## 1. Overview
	You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
	- Nodes symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
	- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.

	## 2. Labeling Nodes
	- Consistency: Utilize uniform labels for node types to maintain clarity.
	- For instance, consistently label drugs as "Drug", symptoms as "Symptom", and treatments as "Treatment".
	- Node IDs: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
	{node_line}
	{rel_line}

	## 3. Handling Numerical Data and Dates
	- Integrate numerical data and dates as attributes of the corresponding nodes.
	- No Isolated Nodes for Dates/Numbers: Directly associate dates and numerical figures as attributes with pertinent nodes.
	- Property Format: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.

	## 4. Coreference Resolution
	- Entity Consistency: Guarantee uniform identification of each entity across the graph.
	- For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.

	## 5. Relationship Naming Conventions
	- Clarity and Standardization: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
	- For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination.
	- Relevance and Specificity: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.

	## 6. Strict Compliance
	Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
	"""

	    elif category == "Traffic Law":
	        # Traffic Law-specific prompt
	        prompt_text = f"""# Knowledge Graph Instructions for GPT-4
	## 1. Overview
	You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about traffic laws and regulations in the United States.
	- Nodes symbolize entities such as types of traffic violations, penalties, driving regulations, and relevant legal statutes.
	- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for legal professionals, law enforcement agencies, and the general public.

	## 2. Labeling Nodes
	- Consistency: Utilize uniform labels for node types to maintain clarity.
	- For instance, consistently label violations as "Violation", penalties as "Penalty", and statutes as "Statute".
	- Node IDs: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
	{node_line}
	{rel_line}

	## 3. Handling Numerical Data and Dates
	- Integrate numerical data and dates as attributes of the corresponding nodes.
	- No Isolated Nodes for Dates/Numbers: Directly associate dates and numerical figures as attributes with pertinent nodes.
	- Property Format: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `fineAmount`, `lawEffectiveDate`.

	## 4. Coreference Resolution
	- Entity Consistency: Guarantee uniform identification of each entity across the graph.
	- For example, if "Vehicle Code 22350" and "Speed Law" reference the same statute, uniformly apply "Vehicle Code 22350" as the node ID.

	## 5. Relationship Naming Conventions
	- Clarity and Standardization: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
	- For instance, use "IS_PENALIZED_BY" instead of "ISPENALIZEDBY", use "IS_GOVERNED_BY" instead of "ISGOVERNEDBY" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination.
	- Relevance and Specificity: Choose relationship names that accurately reflect the connection between nodes, such as "REQUIRES" or "PROHIBITS" for legal requirements or prohibitions.

	## 6. Strict Compliance
	Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
	"""

	    else:
	        raise ValueError("Unknown category")

	    llm = get_llm()
	    prompt = ChatPromptTemplate.from_messages(
	        [
	            ("system", prompt_text),
	            ("human", "Use the given format to extract information from the following input: {input}"),
	            ("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
	        ]
	    )
	    # NOTE(review): create_structured_output_chain appears to be deprecated in
	    # newer LangChain releases - confirm against the pinned version before upgrading.
	    return create_structured_output_chain(KnowledgeGraph, llm, prompt)