import logging

from langchain_community.document_loaders import WikipediaLoader, UnstructuredHTMLLoader
from langchain.text_splitter import TokenTextSplitter
from knowledge_graph_builder import extract_and_store_graph
from dotenv import load_dotenv
from tqdm import tqdm

logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Load environment variables
load_dotenv()

# IMPORTANT: Make sure the data source names match the values inside api_connections.py

# Define articles / topics to load. Dicts named articlesDISABLED are inactive
# configurations kept for reference; only `articles` is used.
articlesDISABLED = {
    "Chemotherapy": "Chemotherapy",
    "Traffic Law": "Traffic laws in the United States"
}

# Switzerland: https://www.fedlex.admin.ch/eli/cc/1962/1364_1409_1420/de
# Connecticut: https://en.wikipedia.org/wiki/Transportation_in_Connecticut#Rules_of_the_road
articles = {
    "Traffic Law": "Traffic laws in the United States"
}

articlesDISABLED = {
    "SquirroDocs": "https://docs.squirro.com/en/latest/technical/getting-started.html"
}

articlesDISABLED = {
    "SquirroDocs": "/Users/michaelwechner/Desktop/docs.squirro.com_en_latest_technical_getting-started.html"
}


def build_graph_for_article(query, data_source_name):
    """
    Build a knowledge graph from the loaded articles / documents of a particular topic.

    :param query: The query string to search on Wikipedia, e.g. "Traffic laws in the United States"
    :param data_source_name: Data source name, e.g. "Traffic Law"
    :return:
    """
    load_max_documents = 5

    # chunk_size = 4096
    # chunk_overlap = 96
    chunk_size = 400
    chunk_overlap = 10

    if data_source_name == "SquirroDocs":
        logger.info(f"Loading HTML document(s) from {query} ...")
        loader = UnstructuredHTMLLoader(query)
        raw_documents = loader.load()
    else:
        logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
        raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()

    if not raw_documents:
        logger.error(f"Failed to load content for data source '{data_source_name}'!")
        return

    logger.info(f"{len(raw_documents)} document(s) loaded.")
    for doc in raw_documents:
        logger.info(f"Document: {doc.metadata['source']}")
        # print(f"Document: {doc.page_content}")

    logger.info(f"Splitting document(s) into chunks (chunk size: {chunk_size}, chunk overlap: {chunk_overlap}) ...")
    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunk_docs = text_splitter.split_documents(raw_documents[:load_max_documents])  # Only process the first load_max_documents documents
    logger.info(f"{len(raw_documents)} document(s) split into {len(chunk_docs)} chunk(s)")

    logger.info(f"Building the knowledge graph for document(s) found by query '{query}' ...")
    for i, chunk_doc in tqdm(enumerate(chunk_docs), total=len(chunk_docs)):
        logger.info(f"Extracting data from chunk {i} ...")
        # print(f"Extracting data from chunk {i}: {chunk_doc.page_content}")
        extract_and_store_graph(chunk_doc, data_source_name)


def main():
    for data_source_name, query in articles.items():
        build_graph_for_article(query, data_source_name)


if __name__ == "__main__":
    main()
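
# extract_and_store_graph() lives in knowledge_graph_builder.py and is not
# shown in this file. A minimal sketch of the contract this script relies on,
# inferred from the call above (the signature comes from this file; the
# docstring is an assumption about its behavior, not the real implementation):
#
#   def extract_and_store_graph(document, data_source_name):
#       """Extract entities and relationships from one LangChain Document
#       chunk and persist them to the graph store, tagged with
#       data_source_name."""
#       ...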
variables.") # return OpenAI(api_key=api_key) # def classify_query(query): # llm = get_llm() # response = llm.Completion.create( # model="text-davinci-003", # Consider updating to the latest model as necessary # prompt=f"Classify the following query into 'Chemotherapy' or 'Traffic Law': {query}", # max_tokens=60 # ) # return response.choices[0].text.strip() # def main(): # print("Starting the script...") # # Take Wikipedia article name as input # article_name = input("Enter the Wikipedia article name: ") # print(f"Loading documents for: {article_name}") # # Load and process the Wikipedia article # raw_documents = WikipediaLoader(query=article_name).load() # text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96) # documents = text_splitter.split_documents(raw_documents[:5]) # Only process the first 5 documents # print("Building the knowledge graph...") # # Build the knowledge graph from the documents # for i, d in tqdm(enumerate(documents), total=len(documents)): # extract_and_store_graph(d) # print("Graph construction complete. Please enter your query.") # # Take a query related to the graph # user_query = input("Enter your query related to the graph: ") # print(f"Querying the graph with: {user_query}") # # Query the graph and print the answer # answer = query_knowledge_graph(user_query) # print("Answer to your query:", answer) # if __name__ == "__main__": # main()