from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from knowledge_graph_builder import extract_and_store_graph
from dotenv import load_dotenv
from tqdm import tqdm

# Load environment variables from a local .env file
load_dotenv()
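# The .env file is expected to provide the credentials used downstream. The
# variable names below are assumed from a typical OpenAI + Neo4j setup; only
# OPENAI_API_KEY is confirmed by the code in this file:
#   OPENAI_API_KEY=sk-...
#   NEO4J_URI=bolt://localhost:7687
#   NEO4J_USERNAME=neo4j
#   NEO4J_PASSWORD=...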
# Map each category label to the Wikipedia article title used to populate it
articles = {
    "Chemotherapy": "Chemotherapy",
    "Traffic Law": "Traffic laws in the United States",
}

def build_graph_for_article(article_name, category):
    """Load a Wikipedia article, split it into chunks, and store each chunk in the graph."""
    print(f"Loading documents for: {article_name}")

    # Load and process the Wikipedia article
    raw_documents = WikipediaLoader(query=article_name).load()
    if not raw_documents:
        print(f"Failed to load content for {article_name}")
        return

    # chunk_size and chunk_overlap are measured in tokens, not characters
    text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)
    documents = text_splitter.split_documents(raw_documents[:5])  # Only process the first 5 documents

    print("Building the knowledge graph...")
    for document in tqdm(documents):
        extract_and_store_graph(document, category)
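
# For reference, a minimal sketch of what knowledge_graph_builder.extract_and_store_graph
# could look like, assuming a Neo4j backend and LangChain's LLMGraphTransformer. The real
# module is not shown in this file, and the function name below is hypothetical so it does
# not shadow the import above.
def extract_and_store_graph_sketch(document, category):
    from langchain_community.graphs import Neo4jGraph
    from langchain_experimental.graph_transformers import LLMGraphTransformer
    from langchain_openai import ChatOpenAI

    # Neo4jGraph reads NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD from the environment
    graph = Neo4jGraph()
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    # Let the LLM extract nodes and relationships from this chunk
    transformer = LLMGraphTransformer(llm=llm)
    graph_documents = transformer.convert_to_graph_documents([document])
    graph.add_graph_documents(graph_documents)

    # Tag newly created nodes with the article category (assumed schema)
    graph.query(
        "MATCH (n) WHERE n.category IS NULL SET n.category = $category",
        params={"category": category},
    )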
def main():
    for category, title in articles.items():
        build_graph_for_article(title, category)

if __name__ == "__main__":
    main()
# --- Legacy single-article, interactive version (kept commented out for reference) ---
# import os
# from openai import OpenAI
# from api_connections import get_graph_connection
# from knowledge_graph_builder import extract_and_store_graph
# from query_graph import query_knowledge_graph
# from langchain_community.document_loaders import WikipediaLoader
# from langchain.text_splitter import TokenTextSplitter
# from tqdm import tqdm
# def get_llm():
#     api_key = os.getenv("OPENAI_API_KEY")
#     if not api_key:
#         raise ValueError("No OpenAI API key found in environment variables.")
#     return OpenAI(api_key=api_key)
# def classify_query(query):
#     llm = get_llm()
#     # The v1 OpenAI client has no `Completion.create`; use the chat completions API
#     response = llm.chat.completions.create(
#         model="gpt-3.5-turbo",  # Consider updating to the latest model as necessary
#         messages=[{
#             "role": "user",
#             "content": f"Classify the following query into 'Chemotherapy' or 'Traffic Law': {query}",
#         }],
#         max_tokens=60,
#     )
#     return response.choices[0].message.content.strip()
# def main():
#     print("Starting the script...")
#     # Take a Wikipedia article name as input
#     article_name = input("Enter the Wikipedia article name: ")
#     print(f"Loading documents for: {article_name}")
#     # Load and process the Wikipedia article
#     raw_documents = WikipediaLoader(query=article_name).load()
#     text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)
#     documents = text_splitter.split_documents(raw_documents[:5])  # Only process the first 5 documents
#     print("Building the knowledge graph...")
#     # Build the knowledge graph from the documents
#     for d in tqdm(documents):
#         extract_and_store_graph(d)
#     print("Graph construction complete. Please enter your query.")
#     # Take a query related to the graph
#     user_query = input("Enter your query related to the graph: ")
#     print(f"Querying the graph with: {user_query}")
#     # Query the graph and print the answer
#     answer = query_knowledge_graph(user_query)
#     print("Answer to your query:", answer)

# if __name__ == "__main__":
#     main()
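
# For reference, a minimal sketch of what query_graph.query_knowledge_graph might contain,
# assuming the graph lives in Neo4j and LangChain's GraphCypherQAChain translates natural-
# language questions into Cypher. The actual module is not shown in this file, and the
# function name below is hypothetical.
def query_knowledge_graph_sketch(user_query):
    from langchain.chains import GraphCypherQAChain
    from langchain_community.graphs import Neo4jGraph
    from langchain_openai import ChatOpenAI

    graph = Neo4jGraph()  # connection details come from NEO4J_* environment variables
    chain = GraphCypherQAChain.from_llm(
        llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
        graph=graph,
        verbose=True,
        allow_dangerous_requests=True,  # required by recent LangChain releases; drop on older ones
    )
    result = chain.invoke({"query": user_query})
    return result["result"]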