import os
import json
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

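# all-MiniLM-L6-v2 is a compact sentence-transformers model (384-dimensional embeddings).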
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

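# Resolve paths relative to this file so the script works from any working directory.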
script_dir = os.path.dirname(os.path.abspath(__file__))
data_folder = os.path.join(script_dir, 'data', 'raw_documents')

# Assumes each raw document is a .json file; skip anything else (e.g. hidden files).
files = [f for f in os.listdir(data_folder) if f.endswith('.json')]

db_path = os.path.join(script_dir, 'data', 'chroma_db')

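# Build the vector store on the first run; on later runs, load the persisted copy.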
if not os.path.exists(db_path):
    documents_to_store = []
    for file in files:
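        # Each JSON file holds one article: the body under 'text', all other keys as metadata.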
        with open(os.path.join(data_folder, file), 'r', encoding='utf-8') as f:
            json_dict = json.load(f)
            content = json_dict['text']
            metadata = {key: value for key, value in json_dict.items() if key != 'text'}
            document = Document(page_content=content,
                                metadata=metadata)
            documents_to_store.append(document)

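    # Split into ~1000-character chunks with 200 characters of overlap, then drop
    # fragments of 50 characters or fewer, which are too short to embed usefully.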
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents_to_store)
    min_chunk_size = 50
    long_texts = [doc for doc in texts if len(doc.page_content) > min_chunk_size]
    print(f"Original number of chunks: {len(texts)}")
    print(f"Number of chunks after filtering: {len(long_texts)}")

    # Create the vector database from the filtered chunks.
    print('Creating the vector database...')
    db = Chroma.from_documents(long_texts,
                               embedding_function,
                               persist_directory=db_path)

    print('Finished creating the vector database.')

else:
    print('Vector database already exists. Loading...')
    db = Chroma(
        persist_directory=db_path,
        embedding_function=embedding_function,
    )
    print('Vector database loaded.')

print("Checking titles in the database...")

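# Fetch metadata for every stored chunk; with no limit, Chroma returns all entries.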
retrieved_items = db.get(include=['metadatas'])

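# Each article was split into many chunks, so deduplicate on (title, id) pairs.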
unique_titles = set()
for metadata in retrieved_items['metadatas']:
    if 'title' in metadata:
        unique_titles.add((metadata['title'], metadata.get('id')))

print(f"\n--- {len(unique_titles)} Unique Article Titles Found ---")
for title in sorted(list(unique_titles)):
    print(title)
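
# Optional smoke test: run an arbitrary example query to confirm the store
# answers similarity searches (the query string here is just a placeholder).
results = db.similarity_search("example query", k=3)
for doc in results:
    print(doc.metadata.get('title', '<untitled>'), '->', doc.page_content[:80])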
