heikowagner committed on
Commit
98639ab
·
1 Parent(s): 3fc85ef

move elements

Browse files
Files changed (9) hide show
  1. .dockerignore +12 -0
  2. Dockerfile +4 -2
  3. app/app.py +3 -1
  4. app/elements.py +67 -0
  5. app/load_model.py +2 -2
  6. app/load_vectors.py +18 -1
  7. app/run.py +16 -5
  8. app/utils.py +2 -59
  9. docker-compose.yaml +4 -1
.dockerignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ./docker/zeppelin/logs/*
2
+ *.openaiapikey*
3
+ *.log
4
+ *.log.*
5
+ *__pycache__*
6
+ root
7
+ *.ipynb_checkpoints*
8
+ .vscode
9
+ /app/mymodels
10
+ /app/.cache
11
+ /app/VectorStore
12
+ *chroma-embeddings.parquet*
Dockerfile CHANGED
@@ -30,8 +30,9 @@ COPY ./requirements.txt requirements.txt
30
  RUN pip install -r requirements.txt
31
  COPY ./app .
32
  #RUN python load_docs.py
33
- RUN --mount=type=secret,id=OPENAI_API_KEY \
34
- cat /run/secrets/OPENAI_API_KEY > .openaiapikey
 
35
  RUN mkdir /.cache
36
  RUN mkdir /nltk_data
37
  RUN mkdir /VectorStore
@@ -39,6 +40,7 @@ RUN mkdir /app/.cache
39
  RUN ls -la
40
  RUN python run.py
41
  RUN chmod 777 /VectorStore
 
42
  RUN chmod 777 /nltk_data
43
  RUN chmod 777 /.cache
44
  RUN chmod 777 /app/.cache
 
30
  RUN pip install -r requirements.txt
31
  COPY ./app .
32
  #RUN python load_docs.py
33
+ #RUN --mount=type=secret,id=OPENAI_API_KEY \
34
+ # cat /run/secrets/OPENAI_API_KEY > .openaiapikey
35
+ RUN echo "" > .openaiapikey
36
  RUN mkdir /.cache
37
  RUN mkdir /nltk_data
38
  RUN mkdir /VectorStore
 
40
  RUN ls -la
41
  RUN python run.py
42
  RUN chmod 777 /VectorStore
43
+ RUN chmod 777 /mymodels
44
  RUN chmod 777 /nltk_data
45
  RUN chmod 777 /.cache
46
  RUN chmod 777 /app/.cache
app/app.py CHANGED
@@ -1,7 +1,9 @@
1
  import streamlit as st
2
  import load_model
3
  import utils as ut
 
4
  import os
 
5
 
6
  persist_directory = load_model.persist_directory
7
  st.title('myRetrievalGPT')
@@ -11,7 +13,7 @@ st.markdown('*Let $\phi$ be a word embedding mapping $W$ → $\mathbb{R}^n$ w
11
 
12
  agree = st.checkbox('Load new Documents')
13
  if agree:
14
- ut.load_files()
15
  else:
16
 
17
  import torch
 
1
  import streamlit as st
2
  import load_model
3
  import utils as ut
4
+ import elements as el
5
  import os
6
+ import torch
7
 
8
  persist_directory = load_model.persist_directory
9
  st.title('myRetrievalGPT')
 
13
 
14
  agree = st.checkbox('Load new Documents')
15
  if agree:
16
+ el.load_files()
17
  else:
18
 
19
  import torch
app/elements.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit as st
3
+ from langchain.docstore.document import Document
4
+ from chromadb.config import Settings
5
+ from load_model import load_embedding
6
+ from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
7
+ from utils import retrieve_collections, get_chroma_client
8
+
9
def llm_module():
    """Placeholder element; intentionally does nothing yet."""
    return None
11
+
12
def _parse_urls(raw):
    """Split a comma-separated string of URLs into a list of clean URLs."""
    # Quotes are dropped first, then the commas are used as separators; the
    # previous code removed the commas before splitting, so the input was
    # never split at all.
    parts = raw.replace('"', "").split(",")
    return [part.strip() for part in parts if part.strip()]


def _invalidate_collections_cache():
    """Clear the cached result of ``retrieve_collections`` if it is cached."""
    # ``retrieve_collections`` is wrapped in ``st.cache_data``; without a
    # clear, created/deleted collections would not show up in the selectbox.
    clear = getattr(retrieve_collections, "clear", None)
    if callable(clear):
        clear()


def load_files():
    """Render the Streamlit controls for managing collections and documents.

    Two modes are offered through a radio button:

    * "Add Documents": pick an existing collection, optionally delete it, and
      upload documents into it either from local files or from a
      comma-separated list of URLs. Documents are split with
      ``load_and_split`` using the chunk size typed by the user and stored
      with ``create_and_add``.
    * "Start new collection": create a collection with the given name and
      embedding model (only when the name is longer than three characters).

    Returns:
        None. All results are rendered through Streamlit widgets.

    Raises:
        ValueError: If the chunk size entered by the user is not an integer.
    """
    client = get_chroma_client()

    option = st.radio(
        "",
        options=["Add Documents", "Start new collection"],
    )

    if option == "Add Documents":
        collections = retrieve_collections()
        selected_collection = st.selectbox(
            'Add to exsisting collection or create a new one',
            collections)
        if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
            # The selectbox yields None when there are no collections.
            if selected_collection:
                client.delete_collection(name=selected_collection["name"])
                _invalidate_collections_cache()
            # Do not offer uploads into the collection that was just removed.
            selected_collection = None

        if selected_collection:
            st.write("Selected Vectorstore:", selected_collection)
            source = st.radio(
                "",
                options=["Upload Files from Local", "Upload Files from Web"],
            )
            if source == "Upload Files from Local":
                st.write('Source Documents:')
                uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
                chunk_size = st.text_area('chunk Size:', 1000)

                if st.button('Upload'):
                    docs = load_from_file(uploaded_files)
                    sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
                    create_and_add(
                        selected_collection["name"],
                        sub_docs,
                        selected_collection['model_name'],
                        selected_collection['metadata'],
                    )
                    st.write("Upload succesful")
            else:
                st.write('Urls of Source Documents (Comma separated):')
                urls = _parse_urls(st.text_area('Urls:', ''))
                chunk_size = st.text_area('chunk Size:', 1000)

                if st.button('Upload'):
                    docs = load_from_web(urls)
                    sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
                    create_and_add(
                        selected_collection["name"],
                        sub_docs,
                        selected_collection['model_name'],
                        selected_collection['metadata'],
                    )
                    st.write("Upload succesful")
    else:
        collection = st.text_area('Name of your new collection:', '')
        model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
        if st.button('Create'):
            if len(collection) > 3:
                ef = load_embedding(model_name)
                metadata = {"loaded_docs": [], "Subject": "Terms Example", "model_name": ef.model_name}
                client.create_collection(collection, embedding_function=ef, metadata=metadata)
                # Make the new collection visible to the cached listing.
                _invalidate_collections_cache()
                st.write("Collection " + collection + " succesfully created.")
app/load_model.py CHANGED
@@ -82,8 +82,8 @@ def load_gpu_model(used_model):
82
  return llm
83
 
84
  #@st.cache_resource
85
- def load_openai_model():
86
- return OpenAI(temperature=0.9)
87
 
88
  @st.cache_resource
89
  def load_openai_embedding():
 
82
  return llm
83
 
84
  #@st.cache_resource
85
def load_openai_model(temperature=0.9):
    """Construct and return an ``OpenAI`` instance with the given temperature.

    Args:
        temperature: Value forwarded as the ``temperature`` keyword argument.

    Returns:
        The newly created ``OpenAI`` object.
    """
    llm = OpenAI(temperature=temperature)
    return llm
87
 
88
  @st.cache_resource
89
  def load_openai_embedding():
app/load_vectors.py CHANGED
@@ -101,4 +101,21 @@ def load_from_web(urls, cache=True):
101
  def load_and_split(docs, chunk_size=700):
102
  text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
103
  sub_docs = text_splitter.split_documents(docs)
104
- return sub_docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def load_and_split(docs, chunk_size=700):
102
  text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
103
  sub_docs = text_splitter.split_documents(docs)
104
+ return sub_docs
105
+
106
def metadata_generator(doc, llm, max_token=4000):
    """Ask an LLM to categorise a document and explain its reasoning.

    Args:
        doc: Object exposing the document text as ``page_content``.
        llm: Callable that receives the prompt string and returns the answer.
        max_token: Maximum number of characters of ``doc.page_content`` that
            are embedded in the prompt (a character count, not model tokens).

    Returns:
        Whatever ``llm`` returns for the generated prompt.
    """
    # Slice from index 0: the earlier ``[1:max_token]`` dropped the first
    # character of every document.
    excerpt = doc.page_content[:max_token]
    query = f"""
Cluster the following Input document into topic categories based on patterns seen within the text. Also mention reasoning behind how these categories were defined.
Output format:
{{
"DOCUMENT TYPE": "",
"SUMMARY": [],
"REASONING": ""
}}

Input document:
{excerpt}
Output:
"""
    return llm(query)
app/run.py CHANGED
@@ -5,7 +5,7 @@ import pathlib
5
 
6
  from load_model import load_embedding
7
  from utils import get_chroma_client
8
- from load_vectors import load_from_web, create_and_add, load_and_split
9
 
10
  current_path = str( pathlib.Path(__file__).parent.resolve() )
11
  with open(current_path+'/.openaiapikey', 'r') as reader:
@@ -21,12 +21,12 @@ llm= load_model.load_openai_model()
21
  client = get_chroma_client()
22
  client.reset()
23
  ef = load_embedding("hkunlp/instructor-large")
24
- collection_name="axaterms"
25
- metadata= {"loaded_docs":[], "Subject":"AXA Terms", "model_name": ef.model_name}
26
  selected_collection = client.create_collection(collection_name, embedding_function=ef, metadata=metadata)
27
 
28
  docs_tarifs= [
29
- "https://www.axa.de/site/axa-de/get/documents_E1883536226/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-L.pdf",
30
  ]
31
 
32
  # %%
@@ -40,7 +40,18 @@ docs = load_from_web(docs_tarifs)
40
  sub_docs = load_and_split(docs, chunk_size=1000)
41
  create_and_add(collection_name, sub_docs, model_name, metadata)
42
 
 
 
43
  # %%
44
- chain = load_model.create_chain(llm, collection=collection_name, model_name=model_name, metadata=metadata)
45
  #result = chain({"query": "Ist mein Kinderwagen bei einem Leitungswasserschaden mitversichert?"})
46
  #print(result)
 
 
 
 
 
 
 
 
 
 
5
 
6
  from load_model import load_embedding
7
  from utils import get_chroma_client
8
+ from load_vectors import load_from_web, create_and_add, load_and_split, metadata_generator
9
 
10
  current_path = str( pathlib.Path(__file__).parent.resolve() )
11
  with open(current_path+'/.openaiapikey', 'r') as reader:
 
21
  client = get_chroma_client()
22
  client.reset()
23
  ef = load_embedding("hkunlp/instructor-large")
24
+ collection_name="papers"
25
+ metadata= {"loaded_docs":[], "Subject":"Heikos Papers", "model_name": ef.model_name}
26
  selected_collection = client.create_collection(collection_name, embedding_function=ef, metadata=metadata)
27
 
28
  docs_tarifs= [
29
+ "https://edoc.hu-berlin.de/bitstream/handle/18452/5294/33.pdf",
30
  ]
31
 
32
  # %%
 
40
  sub_docs = load_and_split(docs, chunk_size=1000)
41
  create_and_add(collection_name, sub_docs, model_name, metadata)
42
 
43
+
44
+
45
  # %%
46
+ #chain = load_model.create_chain(llm, collection=collection_name, model_name=model_name, metadata=metadata)
47
  #result = chain({"query": "Ist mein Kinderwagen bei einem Leitungswasserschaden mitversichert?"})
48
  #print(result)
49
+ #llm= load_model.load_openai_model(temperature=0.1)
50
+
51
+ #llm= load_model.load_cpu_model()
52
+
53
+ #meta= metadata_generator(docs[0], llm)
54
+ # %%
55
+ #print(meta)
56
+
57
+ # %%
app/utils.py CHANGED
@@ -27,66 +27,9 @@ def get_chroma_client():
27
  return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
28
  persist_directory=persist_directory
29
  ))
30
- #@st.cache_data
31
  def retrieve_collections():
32
  client = get_chroma_client()
33
  all_collections = client.list_collections()
34
  collections = tuple( [{'name': collection.name, 'model_name': collection.metadata['model_name'], "metadata": collection.metadata} for collection in all_collections] )
35
- return collections
36
-
37
- def load_files():
38
-
39
- client = get_chroma_client()
40
-
41
- option = st.radio(
42
- "",
43
- options=["Add Documents", "Start new collection"],
44
- )
45
-
46
- if option == "Add Documents":
47
- collections = retrieve_collections()
48
- selected_collection = st.selectbox(
49
- 'Add to exsisting collection or create a new one',
50
- collections )
51
- if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
52
- client.delete_collection(name=selected_collection["name"])
53
- #retrieve_collections.clear()
54
- collections = retrieve_collections()
55
-
56
- if selected_collection:
57
- st.write("Selected Vectorstore:", selected_collection)
58
- option = st.radio(
59
- "",
60
- options=["Upload Files from Local", "Upload Files from Web"],
61
- )
62
- if option == "Upload Files from Local":
63
- st.write('Source Documents:')
64
- uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
65
- chunk_size = st.text_area('chunk Size:', 1000)
66
-
67
- if st.button('Upload'):
68
- docs = load_from_file(uploaded_files)
69
- sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
70
- vec1 = create_and_add(selected_collection["name"], sub_docs, selected_collection['model_name'], selected_collection['metadata'])
71
- st.write("Upload succesful")
72
- else:
73
- st.write('Urls of Source Documents (Comma separated):')
74
- urls = chunk_size = st.text_area('Urls:', '')
75
- chunk_size = st.text_area('chunk Size:', 1000)
76
- urls = urls.replace(",", "" ).replace('"', "" ).split(',')
77
-
78
- if st.button('Upload'):
79
- docs = load_from_web(urls)
80
- sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
81
- vec2 = create_and_add(selected_collection["name"], sub_docs, selected_collection['model_name'], selected_collection['metadata'])
82
- st.write("Upload succesful")
83
- else:
84
- collection = st.text_area('Name of your new collection:', '')
85
- model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
86
- if st.button('Create'):
87
- if len(collection)>3:
88
- ef = load_embedding(model_name)
89
- metadata= {"loaded_docs":[], "Subject":"Terms Example", "model_name": ef.model_name}
90
- client.create_collection(collection, embedding_function=ef, metadata=metadata)
91
- # retrieve_collections.clear()
92
- st.write("Collection " +collection+" succesfully created.")
 
27
  return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
28
  persist_directory=persist_directory
29
  ))
30
@st.cache_data
def retrieve_collections():
    """List all collections known to the Chroma client.

    Returns:
        A tuple of dicts, one per collection, each with the keys ``name``,
        ``model_name`` (read from the collection metadata) and ``metadata``.
    """
    summaries = []
    for entry in get_chroma_client().list_collections():
        summaries.append({
            'name': entry.name,
            'model_name': entry.metadata['model_name'],
            "metadata": entry.metadata,
        })
    return tuple(summaries)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docker-compose.yaml CHANGED
@@ -1,6 +1,7 @@
1
  version: "3.9"
2
  services:
3
  streamlit_app:
 
4
  build: .
5
  tty: true
6
  ports:
@@ -11,11 +12,13 @@ services:
11
  devices:
12
  - capabilities: [gpu]
13
  dev_app:
14
- image: tensorflow/tensorflow:latest-gpu
15
  tty: true
16
  volumes:
17
  - ./app:/app
18
  - ./root:/root
 
 
19
  deploy:
20
  resources:
21
  reservations:
 
1
  version: "3.9"
2
  services:
3
  streamlit_app:
4
+ image: myretrievalgpt
5
  build: .
6
  tty: true
7
  ports:
 
12
  devices:
13
  - capabilities: [gpu]
14
  dev_app:
15
+ image: myretrievalgpt
16
  tty: true
17
  volumes:
18
  - ./app:/app
19
  - ./root:/root
20
+ depends_on:
21
+ - streamlit_app
22
  deploy:
23
  resources:
24
  reservations: