import logging

from dotenv import load_dotenv
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import UnstructuredHTMLLoader, WikipediaLoader
from tqdm import tqdm

from knowledge_graph_builder import extract_and_store_graph

logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Load environment variables (e.g. API keys and database credentials) from a local .env file
load_dotenv()

# Data sources to process: maps a data source name to a Wikipedia query or to the
# location of an HTML document.
articles = {
    "Traffic Law": "Traffic laws in the United States"
}

# Disabled alternatives, kept for reference; swap one in as `articles` to use it.
# Note: UnstructuredHTMLLoader reads local HTML files, so the URL variant would
# likely need a web-based loader instead.
# articles = {
#     "Chemotherapy": "Chemotherapy",
#     "Traffic Law": "Traffic laws in the United States"
# }
# articles = {
#     "SquirroDocs": "https://docs.squirro.com/en/latest/technical/getting-started.html"
# }
# articles = {
#     "SquirroDocs": "/Users/michaelwechner/Desktop/docs.squirro.com_en_latest_technical_getting-started.html"
# }


def build_graph_for_article(query, data_source_name):
    """
    Build a knowledge graph from the loaded articles/documents on a particular topic.

    :param query: The query string to search on Wikipedia, e.g. "Traffic laws in the United States",
                  or the location of the HTML document for the "SquirroDocs" data source
    :param data_source_name: Data source name, e.g. "Traffic Law"
    :return: None
    """
    load_max_documents = 5

    chunk_size = 400
    chunk_overlap = 10
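    # Note: TokenTextSplitter counts tokens rather than characters, and the overlap keeps
    # a little shared context between consecutive chunks, so entities and relations that
    # straddle a chunk boundary are less likely to be lost during extraction.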
    if data_source_name == "SquirroDocs":
        logger.info(f"Loading HTML document(s) from {query} ...")
        loader = UnstructuredHTMLLoader(query)
        raw_documents = loader.load()
    else:
        logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
        raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()

    if not raw_documents:
        logger.error(f"Failed to load content for data source '{data_source_name}'!")
        return

    logger.info(f"{len(raw_documents)} document(s) loaded.")
    for doc in raw_documents:
        logger.info(f"Document: {doc.metadata['source']}")

logger.info(f"Split document(s) into chunk(s) (Chunk size: {chunk_size}, Chunk overlap: {chunk_overlap}) ...") |
|
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) |
|
chunkDocs = text_splitter.split_documents(raw_documents[:load_max_documents]) |
|
logger.info(f"{str(len(raw_documents))} document(s) split into {str(len(chunkDocs))} chunk(s)") |
|
|
|
logger.info(f"Building the knowledge graph for document(s) found by query '{query}' ...") |
|
for i, chunkDoc in tqdm(enumerate(chunkDocs), total=len(chunkDocs)): |
|
logger.info(f"Extract data from chunk {str(i)} ...") |
|
|
|
extract_and_store_graph(chunkDoc, data_source_name) |
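        # extract_and_store_graph (imported from knowledge_graph_builder) is expected to
        # extract entities/relations from the chunk and persist them under the given data
        # source name; see the hedged sketch at the bottom of this file.
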
def main():
    for data_source_name, query in articles.items():
        build_graph_for_article(query, data_source_name)


if __name__ == "__main__":
    main()
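
# For reference only: a minimal, hypothetical sketch of what
# knowledge_graph_builder.extract_and_store_graph could look like, built on LangChain's
# experimental LLMGraphTransformer and Neo4jGraph. This is an assumption for illustration,
# not this project's actual implementation; the real module may use a different LLM,
# prompt, or storage backend.
#
#   from langchain_community.graphs import Neo4jGraph
#   from langchain_experimental.graph_transformers import LLMGraphTransformer
#   from langchain_openai import ChatOpenAI
#
#   # Neo4jGraph() reads NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD from the environment
#   graph = Neo4jGraph()
#   llm_transformer = LLMGraphTransformer(llm=ChatOpenAI(temperature=0))
#
#   def extract_and_store_graph(document, data_source_name):
#       # Have the LLM extract nodes and relationships from the chunk ...
#       graph_documents = llm_transformer.convert_to_graph_documents([document])
#       # ... and persist them in Neo4j, keeping a link back to the source chunk.
#       graph.add_graph_documents(graph_documents, include_source=True)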