from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import GPT4AllEmbeddings
from langchain.vectorstores import Chroma
import chromadb


class Vectorstore_client:
    """Builds and exposes a persistent Chroma vector store.

    For each election year, the cleaned manifesto text files are loaded,
    split into overlapping chunks and stored in a dedicated Chroma
    collection (one collection per election year).
    """

    def __init__(self):
        self.persist_directory = "data/vectorstore"
        self.client = chromadb.PersistentClient(path=self.persist_directory)
        elections = ["2013", "2017", "2021"]
        for election in elections:
            # load all cleaned text files belonging to this election year
            glob = "*" + election + ".txt"
            loader = DirectoryLoader(
                'data/clean/', glob=glob, use_multithreading=True, loader_cls=TextLoader)
            docs_list = loader.load()
            # split documents into overlapping chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=200)
            all_splits = text_splitter.split_documents(docs_list)
            all_texts = [split.page_content for split in all_splits]
            # generate one id per chunk
            ids_list = ["id{}".format(i)
                        for i in range(1, len(all_texts) + 1)]
            # store the chunks only if the collection is still empty
            collection = self.client.get_or_create_collection(
                name=election)
            if collection.count() == 0:
                collection.add(
                    documents=all_texts,
                    ids=ids_list
                )

    def get_client(self):
        """Return the underlying chromadb client."""
        return self.client


# class Vectorstore:
#     def __init__(self) -> None:
#         self.persist_directory = "/home/phisinger/Programmieren/wahlprogramm_analyse/data/vectorstore"
#         if False:
#             # load data from data persist_directory
#             print("use persisted db.")
#             self.vectordb = Chroma(persist_directory=persist_directory,
#                                    embedding_function=GPT4AllEmbeddings())
#         else:
#             print("Build new vector DB")
#             self.build_vectorstore()

#         return self.vectordb

#     def build_vectorstore(self):
#         elections = ["2013", "2017", "2021"]
#         for election in elections:
#             # load all files from cleaned data set
#             glob = "*" + election + ".txt"
#             loader = DirectoryLoader(
#                 '../data/clean/', glob=glob, use_multithreading=True, loader_cls=TextLoader)
#             docs_list = loader.load()
#             # split documents
#             text_splitter = RecursiveCharacterTextSplitter(
#                 chunk_size=1000, chunk_overlap=200)
#             all_splits = text_splitter.split_documents(docs_list)
#             # store documents in vector store
#             self.vectordb = Chroma.from_documents(
#                 documents=all_splits, embedding=GPT4AllEmbeddings(), persist_directory=self.persist_directory)
#             self.vectordb.persist()

#         def get(self):
#             return self.vectordb
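

# Illustrative usage sketch (not part of the original module): build or reuse the
# persisted collections and run a similarity query against one election year.
# The query text and n_results value are placeholders; chromadb embeds the query
# with the collection's default embedding function, matching how the documents
# were added above (no explicit embeddings passed).
if __name__ == "__main__":
    store = Vectorstore_client()
    client = store.get_client()

    # fetch the 2021 collection and retrieve the three closest chunks
    collection = client.get_collection(name="2021")
    results = collection.query(query_texts=["Klimaschutz"], n_results=3)
    for doc_id, doc in zip(results["ids"][0], results["documents"][0]):
        print(doc_id, doc[:120])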