import pandas as pd
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter


def load_and_split_markdown(
    filepath: str = 'https://drive.google.com/u/0/uc?id=1JQswhvNz6yNKKzJW0nrXU7AmUQaGevxA&export=download',
) -> "list[LangchainDocument]":
    """Load a CSV of places and return it as Markdown-aware text chunks.

    The CSV is read with ``pandas.read_csv`` and must provide the columns
    ``City``, ``Name``, ``description``, ``Lon``, ``Lat`` and ``image``.
    Every row becomes one document whose text is
    ``"<City> | <Name> | <description>"`` and whose metadata carries the
    ``longitude``, ``latitude`` and ``image`` values of that row. The
    documents are then split with a ``RecursiveCharacterTextSplitter``.

    Args:
        filepath: Anything ``pandas.read_csv`` accepts (local path or URL).

    Returns:
        The list of chunked documents, in the same order as the CSV rows.
    """
    # Load the source table.
    data_cities = pd.read_csv(filepath)

    # Build one document per row (no progress bar).
    raw_knowledge_base = [
        LangchainDocument(
            page_content=f"{row['City']} | {row['Name']} | {row['description']}",
            metadata={
                "longitude": row['Lon'],
                "latitude": row['Lat'],
                "image": row['image'],
                # "english_description": row['en_txt']  (currently disabled)
            },
        )
        for _, row in data_cities.iterrows()
    ]

    # Separators tried in order, from the coarsest Markdown structure
    # (headings, code fences, horizontal rules) down to single characters.
    # Several entries are regular expressions, not literal strings.
    markdown_separators = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        add_start_index=True,
        strip_whitespace=True,
        separators=markdown_separators,
        # The separators above are regex patterns; without this flag the
        # splitter escapes them and matches them literally, so the heading
        # and horizontal-rule separators would never apply.
        is_separator_regex=True,
    )

    # split_documents handles the whole list at once and keeps input order.
    return text_splitter.split_documents(raw_knowledge_base)