Spaces:
Build error
Build error
heikowagner
commited on
Commit
·
40c3ade
0
Parent(s):
Duplicate from heikowagner/GPT-Docker
Browse files- .dockerignore +12 -0
- .gitattributes +34 -0
- .gitignore +12 -0
- Dockerfile +52 -0
- README.md +11 -0
- app/app.py +65 -0
- app/elements.py +67 -0
- app/exploration.py +51 -0
- app/load_model.py +122 -0
- app/load_test.py +29 -0
- app/load_vectors.py +121 -0
- app/playground/load_docs.py +155 -0
- app/playground/result.pkl +3 -0
- app/playground/st_render_doc.py +9 -0
- app/run.py +59 -0
- app/utils.py +35 -0
- docker-compose.yaml +26 -0
- requirements.txt +15 -0
.dockerignore
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
./docker/zeppelin/logs/*
|
2 |
+
*.openaiapikey*
|
3 |
+
*.log
|
4 |
+
*.log.*
|
5 |
+
*__pycache__*
|
6 |
+
root
|
7 |
+
*.ipynb_checkpoints*
|
8 |
+
.vscode
|
9 |
+
/app/mymodels
|
10 |
+
/app/.cache
|
11 |
+
/app/VectorStore
|
12 |
+
*chroma-embeddings.parquet*
|
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
./docker/zeppelin/logs/*
|
2 |
+
*.openaiapikey*
|
3 |
+
*.log
|
4 |
+
*.log.*
|
5 |
+
*__pycache__*
|
6 |
+
root
|
7 |
+
*.ipynb_checkpoints*
|
8 |
+
.vscode
|
9 |
+
/app/mymodels
|
10 |
+
/app/.cache
|
11 |
+
/app/VectorStore
|
12 |
+
*chroma-embeddings.parquet*
|
Dockerfile
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Navigate to your user folder: cd $env:USERPROFILE\AppData\Local\Docker\wsl\data
|
2 |
+
# Enter the following command: resize-vhd -Path .\ext4.vhdx -SizeBytes 300GB. After that, I was able to continue building with docker-compose!
|
3 |
+
|
4 |
+
FROM python:latest AS builder
|
5 |
+
RUN apt update -y
|
6 |
+
RUN apt install -y git git-lfs make gcc g++ libgmp-dev libmpfr-dev libmpc-dev
|
7 |
+
RUN git lfs install
|
8 |
+
RUN git clone https://github.com/ggerganov/llama.cpp
|
9 |
+
RUN cd llama.cpp && make
|
10 |
+
RUN git clone https://huggingface.co/nyanko7/LLaMA-7B
|
11 |
+
RUN ls -la
|
12 |
+
RUN cp -r ./LLaMA-7B ./llama.cpp/models
|
13 |
+
RUN ls -la ./llama.cpp/models/LLaMA-7B
|
14 |
+
# convert the 7B model to ggml FP16 format
|
15 |
+
WORKDIR llama.cpp
|
16 |
+
RUN python3 -m pip install -r requirements.txt
|
17 |
+
RUN python3 convert.py ./models/LLaMA-7B
|
18 |
+
# quantize the model to 4-bits (using q4_0 method)
|
19 |
+
RUN mkdir ./models/7B/
|
20 |
+
RUN ./quantize ./models/LLaMA-7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
|
21 |
+
|
22 |
+
FROM tensorflow/tensorflow:latest-gpu
|
23 |
+
WORKDIR /app
|
24 |
+
COPY --from=builder /llama.cpp//models/7B/ ./mymodels/LLaMA-7B
|
25 |
+
# RUN apt-get upgrade -y
|
26 |
+
RUN apt update -y
|
27 |
+
RUN apt install -y git git-lfs
|
28 |
+
RUN apt install -y make wget git gcc g++ lhasa libgmp-dev libmpfr-dev libmpc-dev flex bison gettext texinfo ncurses-dev autoconf rsync
|
29 |
+
COPY ./requirements.txt requirements.txt
|
30 |
+
RUN pip install -r requirements.txt
|
31 |
+
COPY ./app .
|
32 |
+
#RUN python load_docs.py
|
33 |
+
#RUN --mount=type=secret,id=OPENAI_API_KEY \
|
34 |
+
# cat /run/secrets/OPENAI_API_KEY > .openaiapikey
|
35 |
+
RUN echo "" > .openaiapikey
|
36 |
+
RUN mkdir /.cache
|
37 |
+
RUN mkdir /nltk_data
|
38 |
+
RUN mkdir /VectorStore
|
39 |
+
RUN mkdir /app/.cache
|
40 |
+
RUN mkdir /mymodels
|
41 |
+
RUN ls -la
|
42 |
+
RUN python run.py
|
43 |
+
RUN chmod 777 /VectorStore
|
44 |
+
RUN chmod 777 /mymodels
|
45 |
+
RUN chmod 777 /nltk_data
|
46 |
+
RUN chmod 777 /.cache
|
47 |
+
RUN chmod 777 /app/.cache
|
48 |
+
RUN chmod 777 /app/mymodels
|
49 |
+
RUN chmod 777 /app/VectorStore/
|
50 |
+
CMD ["streamlit", "run", "app.py", "--server.port=7860"]
|
51 |
+
#CMD ls -la
|
52 |
+
EXPOSE 7860
|
README.md
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: myRetrievalGPT
|
3 |
+
emoji: 🔥
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: purple
|
6 |
+
sdk: docker
|
7 |
+
pinned: true
|
8 |
+
duplicated_from: heikowagner/GPT-Docker
|
9 |
+
---
|
10 |
+
|
11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app/app.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import load_model
|
3 |
+
import utils as ut
|
4 |
+
import elements as el
|
5 |
+
import os
|
6 |
+
import torch
|
7 |
+
import psutil
|
8 |
+
|
9 |
+
persist_directory = load_model.persist_directory
st.title('myRetrievalGPT')
st.header('An GPT Retrieval example brought to you by Heiko Wagner')

# Raw string: the LaTeX backslashes (\phi, \mathbb) are not Python escapes.
st.markdown(r'*Let $\phi$ be a word embedding mapping $W$ → $\mathbb{R}^n$ where $W$ is the word space and $\mathbb{R}^n$ is an $n$-dimensional vector space then: $\phi(king)-\phi(man)+\phi(woman)=\phi(queen)$.* ')

agree = st.checkbox('Load new Documents')
if agree:
    # Document-management UI (add documents / create or delete collections).
    el.load_files()
else:
    torch.cuda.empty_cache()

    # Show whether CUDA is usable and the current memory statistics.
    st.write(str(torch.cuda.is_available()) + str(psutil.virtual_memory()))
    model_type = st.selectbox(
        'Select the model to be used to answer your question',
        ('OpenAI', 'decapoda-research/llama-7b-hf (gpu+cpu)', 'llama-7b 4bit (cpu only)',))

    if model_type == 'OpenAI':
        # Ask for the key until a non-empty one has been entered; an empty
        # value must not be stored, otherwise the prompt would never reappear.
        if not st.session_state.get('openai_key'):
            openai_key = st.text_area('OpenAI Key:', '').strip()
            if not openai_key:
                st.stop()
            st.session_state['openai_key'] = openai_key
        os.environ["OPENAI_API_KEY"] = st.session_state['openai_key']
        llm = load_model.load_openai_model()
    elif model_type == 'decapoda-research/llama-7b-hf (gpu+cpu)':
        # Add more models here
        # Without CUDA the model is only loaded when roughly 17 GiB of RAM are free.
        if not torch.cuda.is_available() and psutil.virtual_memory().available < 18254768640:
            st.write('You do not have enough memory to use this model:' + str(psutil.virtual_memory().available))
            # No model was loaded, so nothing below can run.
            st.stop()
        llm = load_model.load_gpu_model("decapoda-research/llama-7b-hf")
    else:
        llm = load_model.load_cpu_model()

    collections = ut.retrieve_collections()
    option = st.selectbox(
        'Select the Documents to be used to answer your question',
        collections)

    # The selectbox yields None when there is nothing to choose from.
    if option is None:
        st.write('No collection available. Tick "Load new Documents" to create one.')
        st.stop()

    st.write('You selected:', option['name'])

    chain = load_model.create_chain(llm, collection=option['name'], model_name=option['model_name'], metadata=option['metadata'])
    query = st.text_area('Ask a question:', 'Hallo how are you today?')
    result = chain({"query": query + " Add a Score of the propability that your answer is correct to your answer"})
    ut.format_result_set(result)
|
58 |
+
|
59 |
+
#from langchain.chains import ConversationChain
|
60 |
+
#from langchain.memory import ConversationBufferMemory
|
61 |
+
|
62 |
+
#conversation = ConversationChain(
|
63 |
+
# llm=chat,
|
64 |
+
# memory=ConversationBufferMemory()
|
65 |
+
#)
|
app/elements.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import streamlit as st
|
3 |
+
from langchain.docstore.document import Document
|
4 |
+
from chromadb.config import Settings
|
5 |
+
from load_model import load_embedding
|
6 |
+
from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
|
7 |
+
from utils import retrieve_collections, get_chroma_client
|
8 |
+
|
9 |
+
def llm_module():
    """Placeholder with no behavior; always returns None."""
    return None
|
11 |
+
|
12 |
+
def load_files():
    """Render the Streamlit UI for managing collections and their documents.

    Two modes are offered through a radio button:

    * "Add Documents": pick an existing collection, optionally delete it, and
      upload PDFs either from local files or from a comma separated URL list.
    * "Start new collection": create a collection with a chosen embedding
      model.
    """
    client = get_chroma_client()

    mode = st.radio(
        "",
        options=["Add Documents", "Start new collection"],
    )

    if mode == "Add Documents":
        collections = retrieve_collections()
        selected_collection = st.selectbox(
            'Add to exsisting collection or create a new one',
            collections)
        # The button is always rendered, but deleting needs an actual selection
        # (the selectbox yields None when there are no collections).
        if st.button('Delete Collection (⚠️ This is destructive and not reversible)') and selected_collection:
            client.delete_collection(name=selected_collection["name"])
            #retrieve_collections.clear()
            collections = retrieve_collections()

        if selected_collection:
            st.write("Selected Vectorstore:", selected_collection)
            source = st.radio(
                "",
                options=["Upload Files from Local", "Upload Files from Web"],
            )
            if source == "Upload Files from Local":
                st.write('Source Documents:')
                uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
                chunk_size = st.text_area('chunk Size:', 1000)

                if st.button('Upload'):
                    docs = load_from_file(uploaded_files)
                    sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
                    create_and_add(selected_collection["name"], sub_docs, selected_collection['model_name'], selected_collection['metadata'])
                    st.write("Upload succesful")
            else:
                st.write('Urls of Source Documents (Comma separated):')
                raw_urls = st.text_area('Urls:', '')
                chunk_size = st.text_area('chunk Size:', 1000)
                # Drop quote characters, then split on the commas; the commas
                # must survive until the split or all URLs merge into one.
                urls = [
                    url.strip()
                    for url in raw_urls.replace('"', "").split(',')
                    if url.strip()
                ]

                if st.button('Upload'):
                    docs = load_from_web(urls)
                    sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
                    create_and_add(selected_collection["name"], sub_docs, selected_collection['model_name'], selected_collection['metadata'])
                    st.write("Upload succesful")
    else:
        collection = st.text_area('Name of your new collection:', '')
        model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
        if st.button('Create'):
            # Names of three characters or fewer are ignored.
            if len(collection) > 3:
                ef = load_embedding(model_name)
                metadata = {"loaded_docs": [], "Subject": "Terms Example", "model_name": ef.model_name}
                client.create_collection(collection, embedding_function=ef, metadata=metadata)
                # retrieve_collections.clear()
                st.write("Collection " + collection + " succesfully created.")
|
app/exploration.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
|
3 |
+
from utils import retrieve_collections, get_chroma_client
|
4 |
+
|
5 |
+
|
6 |
+
from load_model import load_embedding
|
7 |
+
|
8 |
+
#retrieve_collections()
|
9 |
+
|
10 |
+
client = get_chroma_client()
|
11 |
+
|
12 |
+
# %%
|
13 |
+
client.reset()
|
14 |
+
# %%
|
15 |
+
collections = tuple( [collection.name for collection in client.list_collections()] ) ##Keine Embedding function in der Collection angelegt...
|
16 |
+
|
17 |
+
ef = load_embedding("hkunlp/instructor-large")
|
18 |
+
collection="heikostest2"
|
19 |
+
client.create_collection(collection, embedding_function=ef, metadata={"loaded_docs":[]})
|
20 |
+
|
21 |
+
|
22 |
+
# %%
|
23 |
+
my_col = client.list_collections()
|
24 |
+
|
25 |
+
# %%
|
26 |
+
my_col.embedding_function
|
27 |
+
|
28 |
+
# %%
|
29 |
+
from langchain.vectorstores import Chroma
|
30 |
+
import load_model
|
31 |
+
|
32 |
+
from load_model import load_embedding
|
33 |
+
|
34 |
+
persist_directory = load_model.persist_directory
|
35 |
+
|
36 |
+
ef = load_embedding("hkunlp/instructor-large")
|
37 |
+
vectorstore = Chroma(
|
38 |
+
collection_name="papers",
|
39 |
+
embedding_function=ef,
|
40 |
+
persist_directory=persist_directory,
|
41 |
+
)
|
42 |
+
|
43 |
+
# %%
|
44 |
+
query = "What did the president say about Ketanji Brown Jackson"
|
45 |
+
docs = vectorstore.similarity_search(query)
|
46 |
+
|
47 |
+
|
48 |
+
# %%
|
49 |
+
docs
|
50 |
+
# %%
|
51 |
+
vectorstore.similarity_search_with_score(query)
|
app/load_model.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
# git clone https://huggingface.co/nyanko7/LLaMA-7B
|
3 |
+
# python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu117/torch2.00/index.html
|
4 |
+
# apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
|
5 |
+
from transformers import LlamaForCausalLM, LlamaTokenizer
|
6 |
+
from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings, OpenAIEmbeddings
|
7 |
+
from langchain.llms import LlamaCpp, HuggingFacePipeline
|
8 |
+
from langchain.vectorstores import Chroma
|
9 |
+
from transformers import pipeline
|
10 |
+
import torch
|
11 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
12 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
13 |
+
import streamlit as st
|
14 |
+
import cloudpickle
|
15 |
+
import os
|
16 |
+
from langchain.chains import RetrievalQA
|
17 |
+
from langchain.indexes import VectorstoreIndexCreator
|
18 |
+
from langchain.llms import OpenAI
|
19 |
+
import multiprocessing
|
20 |
+
|
21 |
+
from chromadb.config import Settings
|
22 |
+
import chromadb
|
23 |
+
|
24 |
+
import pathlib
|
25 |
+
|
26 |
+
current_path = str( pathlib.Path(__file__).parent.resolve() )
|
27 |
+
print(current_path)
|
28 |
+
persist_directory = current_path + "/VectorStore"
|
29 |
+
|
30 |
+
# %%
|
31 |
+
@st.cache_resource
def load_cpu_model():
    """Load the 4-bit quantized LLaMA-7B model through llama.cpp.

    Does not work atm, bc cpu model is not persisted.

    Returns:
        A ``LlamaCpp`` LLM reading ``./mymodels/LLaMA-7B/ggml-model-q4_0.bin``
        and using one thread per CPU reported by ``multiprocessing``.
    """
    model_path = "./mymodels/LLaMA-7B/ggml-model-q4_0.bin"
    # Only the LLM is returned, so the model file is opened exactly once; no
    # separate embedding instance is created here.
    return LlamaCpp(
        model_path=model_path,
        n_ctx=6000,
        n_threads=multiprocessing.cpu_count(),
        temperature=0.6,
        top_p=0.95,
    )
|
46 |
+
|
47 |
+
@st.cache_resource(max_entries=1)
def load_gpu_model(used_model):
    """Wrap a Hugging Face LLaMA checkpoint as a LangChain text-generation LLM.

    Args:
        used_model: Model identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``HuggingFacePipeline`` around a ``text-generation`` pipeline.
    """
    torch.cuda.empty_cache()
    tokenizer = LlamaTokenizer.from_pretrained(used_model)

    # With CUDA the placement is left to "auto"; otherwise everything is
    # pinned to the CPU.
    device_map = "auto" if torch.cuda.is_available() else {"": "cpu"}
    # The same 8-bit configuration is used on both paths.
    # NOTE(review): 8-bit loading is also requested on the CPU-only path -
    # confirm this is supported by the installed bitsandbytes/transformers.
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_enable_fp32_cpu_offload=True,
    )

    base_model = LlamaForCausalLM.from_pretrained(
        used_model,
        device_map=device_map,
        offload_folder=current_path + "/models_gpt/",
        low_cpu_mem_usage=True,
        quantization_config=quantization_config,
        cache_dir=current_path + "/mymodels/",
    )
    text_generator = pipeline(
        "text-generation",
        model=base_model,
        tokenizer=tokenizer,
        max_length=8000,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.2,
    )
    return HuggingFacePipeline(pipeline=text_generator)
|
83 |
+
|
84 |
+
#@st.cache_resource
|
85 |
+
def load_openai_model(temperature=0.9):
    """Create a LangChain ``OpenAI`` LLM with the given sampling temperature."""
    openai_llm = OpenAI(temperature=temperature)
    return openai_llm
|
87 |
+
|
88 |
+
@st.cache_resource
def load_openai_embedding():
    """Create a default ``OpenAIEmbeddings`` instance (cached by Streamlit)."""
    openai_embeddings = OpenAIEmbeddings()
    return openai_embeddings
|
91 |
+
|
92 |
+
#@st.cache_resource
|
93 |
+
def load_embedding(model_name):
    """Create instructor embeddings for ``model_name``.

    Model files are cached in the ``mymodels`` folder next to this module.
    """
    embedding_kwargs = {
        "query_instruction": "Represent the query for retrieval: ",
        "model_name": model_name,
        "cache_folder": current_path + "/mymodels/",
    }
    return HuggingFaceInstructEmbeddings(**embedding_kwargs)
|
100 |
+
|
101 |
+
def load_vectorstore(model_name, collection, metadata):
    """Open the persisted Chroma collection ``collection``.

    Args:
        model_name: Embedding model name forwarded to ``load_embedding``.
        collection: Name of the Chroma collection.
        metadata: Passed to Chroma as ``collection_metadata``.

    Returns:
        A LangChain ``Chroma`` vector store using ``persist_directory``.
    """
    return Chroma(
        collection_name=collection,
        embedding_function=load_embedding(model_name),
        client_settings=Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=persist_directory,
            anonymized_telemetry=False,
        ),
        persist_directory=persist_directory,
        collection_metadata=metadata,
    )
|
116 |
+
|
117 |
+
def create_chain(_llm, collection, model_name, metadata):
    """Build a "stuff" RetrievalQA chain over ``collection``.

    The retriever fetches four documents per query (``k=4``) and the chain
    returns the source documents alongside the answer.
    """
    store = load_vectorstore(model_name, collection, metadata=metadata)
    return RetrievalQA.from_chain_type(
        llm=_llm,
        chain_type="stuff",
        retriever=store.as_retriever(search_kwargs={"k": 4}),
        return_source_documents=True,
    )
|
122 |
+
# %%
|
app/load_test.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
# %%
|
3 |
+
import os
|
4 |
+
import pathlib
|
5 |
+
|
6 |
+
from load_model import load_embedding
|
7 |
+
from utils import get_chroma_client
|
8 |
+
from load_vectors import load_from_web, create_and_add, load_and_split
|
9 |
+
|
10 |
+
collection="axaterms"
|
11 |
+
client = get_chroma_client()
|
12 |
+
# Load collection to get metadata
|
13 |
+
loaded_collection = client.get_collection(collection)
|
14 |
+
|
15 |
+
# %%
|
16 |
+
model_name = loaded_collection.metadata['model_name']
|
17 |
+
|
18 |
+
# %%
|
19 |
+
print( loaded_collection.json() )
|
20 |
+
|
21 |
+
|
22 |
+
# %%
|
23 |
+
client.get_collection(collection).json() #add documents destroys the metadata... maybe :)
|
24 |
+
# %%
|
25 |
+
|
26 |
+
#loaded_collection.modify(metadata={"Test":99})
|
27 |
+
|
28 |
+
# %%
|
29 |
+
loaded_collection.json()
|
app/load_vectors.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
import nltk
|
3 |
+
from langchain.indexes import VectorstoreIndexCreator
|
4 |
+
from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
|
5 |
+
from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
|
6 |
+
from langchain.vectorstores import Chroma
|
7 |
+
from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
|
8 |
+
from chromadb.config import Settings
|
9 |
+
import chromadb
|
10 |
+
from chromadb.utils import embedding_functions
|
11 |
+
from hashlib import sha256
|
12 |
+
import cloudpickle
|
13 |
+
import logging
|
14 |
+
import os
|
15 |
+
from load_model import load_embedding, load_vectorstore
|
16 |
+
import torch
|
17 |
+
import re
|
18 |
+
import pathlib
|
19 |
+
import tempfile
|
20 |
+
|
21 |
+
|
22 |
+
current_path = str( pathlib.Path(__file__).parent.resolve() )
|
23 |
+
|
24 |
+
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
|
25 |
+
nltk.download('punkt')
|
26 |
+
|
27 |
+
persist_directory = current_path + "/VectorStore"
|
28 |
+
logger = logging.getLogger()
|
29 |
+
|
30 |
+
|
31 |
+
# %%
|
32 |
+
|
33 |
+
def create_collection(collection_name, model_name, client):
    """Get or create ``collection_name`` with an instructor embedding function.

    Not used atm. The embedding function runs on CUDA when available and on
    the CPU otherwise. Always returns True.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    instructor_ef = embedding_functions.InstructorEmbeddingFunction(
        model_name=model_name,
        device=device,
    )
    client.get_or_create_collection(collection_name, embedding_function=instructor_ef)
    return True
|
43 |
+
|
44 |
+
def create_and_add(collection_name, sub_docs, model_name, metadata):
    """Add ``sub_docs`` to a collection, persist it and run a smoke query.

    Args:
        collection_name: Target Chroma collection.
        sub_docs: Documents to add.
        model_name: Embedding model name.
        metadata: Collection metadata forwarded to ``load_vectorstore``.

    Returns:
        Always True.
    """
    logging.info(f"Adding documents to {collection_name}")
    embedder = load_embedding(model_name)
    store = load_vectorstore(model_name, collection_name, metadata=metadata)
    store.add_documents(documents=sub_docs, embedding=embedder)
    store.persist()

    # Test Vectorstore: reopen it and print the result of a sample search.
    reopened = load_vectorstore(model_name, collection_name, metadata=metadata)
    hits = reopened.similarity_search_with_score(query="What is a transformer llm?", k=4)
    print(hits)

    return True
|
56 |
+
|
57 |
+
def load_from_file(files):
    """Load uploaded PDF files into documents.

    Each upload is written to a temporary directory under its own name and
    parsed with ``UnstructuredPDFLoader``.

    Args:
        files: Iterable of uploaded file objects exposing ``name`` and
            ``read()``.

    Returns:
        List of all loaded documents.
    """
    saved_files = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        temp_dir = pathlib.Path(tmpdirname)
        for upload in files:
            target = os.path.join(temp_dir, upload.name)
            saved_files.append(target)
            with open(target, mode='wb') as handle:
                handle.write(upload.read())

        print(saved_files)
        loaders = [UnstructuredPDFLoader(path) for path in saved_files]
        print(loaders)
        # Parse while the temporary directory still exists; the files are
        # removed as soon as the ``with`` block is left.
        docs = [doc for loader in loaders for doc in loader.load()]
    return docs
|
75 |
+
|
76 |
+
def load_from_web(urls, cache=True):
    """Download PDFs from ``urls`` and return them as documents.

    Results are pickled into ``<module dir>/.cache`` under a file name derived
    from the SHA-256 of ``str(urls)``, so the same URL list is only downloaded
    once.

    Args:
        urls: Sequence of PDF URLs.
        cache: When true, reuse an existing cache file for this URL list.

    Returns:
        List of documents whose metadata holds ``source``, ``url`` and
        ``owner``.
    """

    def _source_metadata(url):
        return {'source': url, 'url': url, 'owner': 'Heiko Wagner'}

    filename = f"{current_path}/.cache/{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"

    if cache and os.path.isfile(filename):
        logger.info("Using Cache")
        # NOTE(security): unpickling can execute arbitrary code - the cache
        # directory must only ever contain files written by this function.
        with open(filename, "rb") as cached:
            docs = cloudpickle.load(cached)
        # Cache files written before documents were tagged at load time carry
        # no 'url' entry; tag those by position as before.
        for doc, url in zip(docs, urls):
            if 'url' not in doc.metadata:
                doc.metadata = _source_metadata(url)
    else:
        docs = []
        for url in urls:
            loaded = OnlinePDFLoader(url).load()
            # Tag per URL so a URL yielding several documents stays correct.
            for doc in loaded:
                doc.metadata = _source_metadata(url)
            docs.extend(loaded)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'wb') as output:
            cloudpickle.dump(docs, output)

    return docs
|
100 |
+
|
101 |
+
def load_and_split(docs, chunk_size=700):
    """Split ``docs`` into chunks with an NLTK splitter and no overlap.

    Args:
        docs: Documents to split.
        chunk_size: Chunk size handed to ``NLTKTextSplitter``.

    Returns:
        The list of split documents.
    """
    splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    return splitter.split_documents(docs)
|
105 |
+
|
106 |
+
def metadata_generator(doc, llm,max_token=4000):
    """Ask ``llm`` to classify and summarize a document.

    Args:
        doc: Object exposing the document text as ``page_content``.
        llm: Callable taking the prompt string and returning the answer.
        max_token: Number of leading characters of the text put into the
            prompt (a character count, despite the name).

    Returns:
        Whatever ``llm`` returns for the prompt.
    """
    #query = f"Document = {doc.page_content[1:max_token]} -> Respond a python code using a dict filling xxxx like {{'document_type': xxxx, 'summary (max. 30 letters)':'xxxx'}} resond at leat 10 letter"
    # Slice from the very beginning: a start index of 1 would silently drop
    # the first character of the document.
    excerpt = doc.page_content[:max_token]
    query = f"""
Cluster the following Input document into topic categories based on patterns seen within the text. Also mention reasoning behind how these categories were defined.
Output format:
{{
"DOCUMENT TYPE": "",
"SUMMARY": [],
"REASONING": ""
}}

Input document:
{excerpt}
Output:
"""
    return llm(query)
|
app/playground/load_docs.py
ADDED
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
from load_vectors import load_from_web, load_and_split, create_and_add
|
3 |
+
|
4 |
+
docs = [
|
5 |
+
"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/85ec0278-bf2f-4392-94b9-c086717fa8f6_axa_urd2022_accessible_va.pdf"
|
6 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/d97a94ff-a848-474b-b802-c22afc8311cd_axa_half_year_2022_financial_report.pdf"
|
7 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/51954d53-c0cf-4f90-84f7-53ee27dbe4e6_axa_ri2021_va_accessible.pdf"
|
8 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/e3f52b5e-d4aa-4fc8-8bcd-f432df86e804_axa_urd_2021_en_accessible.pdf"
|
9 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/4f303cec-a12d-480b-accb-7b56f706f60e_axa-ri2020-en-accessible.pdf"
|
10 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/d6aef906-e41f-40c7-ac9c-29044e98939d_AXA_URD_2020_EN_accessible_b.pdf"
|
11 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F47b47783-ddd1-47c3-912f-bc6e318ebbb3_axa_half_year_2020_financial_report.pdf"
|
12 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ffd5a8bd8-9ef1-40eb-b953-c268c0ab4bf9_axa-ri2019-en-accessible.pdf"
|
13 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F90abd6c7-80c4-48ef-84bf-1d038670d9b7_axa-urd2019-en.pdf"
|
14 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F3ef6a9cc-6215-4e58-83b5-756774ef5b73_axa_half_year_2019_financial_report2.pdf"
|
15 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F0a5e0bd9-78f2-4ef8-b32c-1d3d35ddce80_axa-ri2018-en-accessible.pdf"
|
16 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F913d1869-3d11-4eb2-b013-4caedb747fab_axa-ddr2018b-en.pdf"
|
17 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F476f79c9-c0c7-4ce3-88ed-4f99b3d22259_axa_half_year_2018_financial_report.pdf"
|
18 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F01f6966b-c26c-4935-91dc-1b296511ba8c_axa_ri2017_gb_planche.pdf"
|
19 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fec440dc9-69df-41b5-a3af-5b5f4fc29670_axa_reference_document_2017c.pdf"
|
20 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F72c59a61-8124-4066-a86d-bece5f41ce53_axa_us_statutory_statements_fy17.pdf"
|
21 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F9237d78f-c1ac-43ca-9623-d0382a5aaaec_axa_us_statutory_statements_3q17.pdf"
|
22 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ffdd639e0-2ea6-4c3f-8a42-8bca4359e858_axa_us_statutory_statements_2q17.pdf"
|
23 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F877e30a9-df72-480f-ac25-edcfcd4049c2_axa_us_statutory_statements_1q17.pdf"
|
24 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F6f3108fd-fabc-4dc6-a984-23eb0dca7a19_axa-ri2016-en_01.pdf"
|
25 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F268bab7a-2e78-4843-844a-fd3ad2d340bc_axa_reference_document_2016.pdf"
|
26 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fd2f66d05-e6ad-47a2-ab72-9bc727bd49c2_axa_half_year_2016_financial_report.pdf"
|
27 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F7a5f0af2-03c3-4a82-a077-46fdc52e5685_axa_us_statutory_statements_fy16.pdf"
|
28 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fdd643342-e975-473d-af54-c64491252a19_axa_us_statutory_statements_3q16.pdf"
|
29 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F53e10a7a-9348-40dc-935e-01fb0a1d0441_axa_us_statutory_statements_2q16.pdf"
|
30 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F8906bad6-14cb-4594-b7c0-029f8fc2172d_axa_us_statutory_statements_1q16.pdf"
|
31 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F2d8e525a-1161-453a-a14f-817f0f070f79_axa_activity_cr_report_2015_accessible.pdf"
|
32 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F73719a96-c3b1-456b-abaf-63b80c06968c_axa_reference_document_2015.pdf"
|
33 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fe2936c1a-65f0-40db-b34b-bef9c27e91c0_axa_2015_half_year_financial_report.pdf"
|
34 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fdaac2a30-a3b8-4839-9331-041805836a6f_axa_us_statutory_statements_fy15.pdf"
|
35 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F61a6c98a-08fb-4cb1-b6c0-4d1ef0f72aa9_axa_us_statutory_statements_3q15.pdf"
|
36 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fe0689ffc-5aec-4388-a10e-26d1d1a7eb9a_axa_us_statutory_statements_2q15.pdf"
|
37 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fbfa8ef5b-6533-4773-8502-5170a51735c9_axa_us_statutory_statements_1q15.pdf"
|
38 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fbbb94857-f5d4-4afd-81d0-e85666883936_axa_annual+financial+report_2014.pdf"
|
39 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fb826839c-76c9-48c7-b8c1-9eda7fe3b032_axa_activity_csr_report_2014_va_b.pdf"
|
40 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fac63e0f9-60ba-47c2-9e23-f1d25731c7ee_axa_2014_half_year_financial_report.pdf"
|
41 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fb7db2a55-8eb6-4131-bc03-698e4bc756d6_axa_us_statutory_statements_fy2014.pdf"
|
42 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F417b48df-c585-4cb6-9d10-719d81228756_axa_us_statutory_statements_3q14.pdf"
|
43 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F4586d978-6fb8-4c44-b934-e15c14143b6d_axa_us_statutory_statements_2q14.pdf"
|
44 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F29cc016e-aff9-49c5-bb04-d55598aab844_axa_us_statutory_statements_1q14.pdf"
|
45 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F25fee379-c187-40e7-bf3a-5fe1423cec0f_axa_annual+financial+report_2013.pdf"
|
46 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F37614ed4-1fe0-483e-a0eb-0acefdedd065_axa_2013_half_year_financial_report.pdf"
|
47 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Febb51afc-af0e-4aff-9494-5b852b3233e5_axa_us_statutory_statements_fy2013.pdf"
|
48 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fdd4cd68e-710e-4e00-ba96-c7560d738a43_axa_us_statutory_statements_3q13.pdf"
|
49 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Feab93a81-859a-487c-941c-11e4ce08d5f0_axa_us_statutory_statements_2q13.pdf"
|
50 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F879d09d7-8ff7-4c43-9a24-7ee44ee55404_axa_us_statutory_statements_1q13.pdf"
|
51 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F9224097f-d703-4efd-8050-6553ef4336f8_axa_annual+financial+report_2012b.pdf"
|
52 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fcb9d1279-948a-4238-ab8f-754e9e10f2a5_axa_activity_csr_report_2012b_va.pdf"
|
53 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fbed64ef2-5078-425a-a616-ffb1947e0b65_axa_2012_half_year_financial_report.pdf"
|
54 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fa148165a-b818-4ea1-b7ee-7949cc86ff9a_axa_us_statutory_statements_fy2012.pdf"
|
55 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F601ed5e8-189d-4e59-b0d4-d1c1eedb2ffe_axa_us_statutory_statements_3q12.pdf"
|
56 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F9637b674-c740-4115-9c90-3a8827516cc0_axa_us_statutory_statements_2q12.pdf"
|
57 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F319a5964-ea51-4d51-96c8-cf6838047b72_axa_us_statutory_statements_1q12.pdf"
|
58 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F0b75d1fe-4b11-4462-9883-4e3bc7532bf4_axa_annual+financial+report_2011.pdf"
|
59 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F17e098ab-3335-4ee1-ade7-058517a952c4_axa_activity_csr_report_2011_vab.pdf"
|
60 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F03996908-7e75-465e-8082-b44f02da326a_axa_us_statutory_statements_fy2011.pdf"
|
61 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fea813e84-7d08-4cf2-bea1-3a01fd4bdf62_axa_us_statutory_statements_3q11.pdf"
|
62 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fda28b496-275b-451d-bffd-108714eb2c39_axa_us_statutory_statements_2q11.pdf"
|
63 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fd6aa2b39-896e-47cf-9882-9985c8d44276_axa_us_statutory_statements_1q11.pdf"
|
64 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fc76f47f4-0917-4fb1-b1ae-78e2a4fbcef5_axa_annual+financial+report_2010c+%281%29.pdf"
|
65 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F68c2771e-5ed8-41d9-bb59-f37f6403b4bf_axa_activity_csr_report_2010_vac.pdf"
|
66 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F9b7812a1-a1a2-4e17-9bf2-88c11aac4e08_axa_2010_half_year_financial_report.pdf.pdf"
|
67 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F3f4cc3bd-6823-4ccf-a918-f0c9d9063c2a_axa_us_statutory_statements_fy2010.pdf"
|
68 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F5a8a399f-9a0a-4475-8fbd-5bc0ca1dffe6_axa_us_statutory_statements_3q10.pdf"
|
69 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F67ba6c6d-7063-41d4-ad4e-75d86b15da43_axa_us_statutory_statements_1q10.pdf"
|
70 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fa151a532-da4f-4d12-8b3b-9867df4f9724_axa_annual+financial+report_2009.pdf"
|
71 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F5f89c4dd-d935-47fe-ac69-23fada9bfc96_axa_2009_half_year_financial_report.pdf"
|
72 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ff322c77a-e2a2-4cd7-88a0-edd8ad4cd021_axa_annual+financial+report_2008.pdf"
|
73 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fb7f88f05-053a-460b-aa4d-6163d3644cfc_axa_activity_csr_report_2008_vad.pdf"
|
74 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ff657a419-e066-485a-a58e-1d2870a6a035_axa_2008_half_year_financial_report.pdf"
|
75 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F820b669d-b3b5-4c14-986d-2223e2bcbcfb_axa_annual+financial+report_2007.pdf"
|
76 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F2741b55e-9349-47ef-9704-3cbca0853b76_axa_activity_csr_report_2007.pdf"
|
77 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F42159571-a3f1-4d36-b4b9-a5493fcc95e3_axa_2007_half_year_financial_report.pdf"
|
78 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F40f9da2a-1bcb-4e5e-9380-18f64b3ce86e_axa_annual+financial+report_2006b.pdf"
|
79 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fa6a14e0c-62cd-4812-a2d0-3a0aae8c862d_axa_activity_csr_report_2006b.pdf"
|
80 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Faf242b66-1308-4331-829f-fa91bd0db43e_axa_annual+financial+report_2005.pdf"
|
81 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F6b3313d1-3b72-4f28-bc7b-f445b9b3190c_axa_activity_csr_report_2005.pdf"
|
82 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F048b0d90-b28f-4fc3-bc30-b02cf8e0d6fc_axa_annual+financial+report_2004_ci.pdf"
|
83 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F02acbd05-712f-4b73-93f0-dffa37e2faa2_axa_annual+financial+report_2004_ci.pdf"
|
84 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fcf0b84a5-6da9-499d-985f-530559940494_axa_activity_csr_report_2004.pdf"
|
85 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fafa397b5-d613-40f3-a28f-81bde0d461e2_axa_annual+financial+report_2003.pdf"
|
86 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F2a31ebb9-ba04-4998-982e-9dd336abca1f_axa_annual+financial+report_2002.pdf"
|
87 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F51e5f017-954b-4f81-84f9-15a086bf1e33_axa_annual+financial+report_2002_ci01.pdf"
|
88 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F05fea38c-c626-4aaf-9ead-10e9c8f849c1_axa_annual+financial+report_2002_ci02.pdf"
|
89 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F3e41d00d-42b3-4bfd-babc-8b9f76b73d95_axa_activity_csr_report_2002.pdf"
|
90 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F4f2676f4-d36c-4d2e-b088-ef26878ff28b_axa_annual+financial+report_2001.pdf"
|
91 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ffdfa0941-6fb5-4ce8-9f42-3b0152e72ce2_axa_activity_csr_report_2001.pdf"
|
92 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F98922150-f1c5-4df4-9006-a8ef17a514cd_axa_annual+financial+report_2000.pdf"
|
93 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F1a645a94-1c56-43be-9a5a-94495e902a23_axa_activity_csr_report_2000.pdf"
|
94 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F51c109ca-2bba-45b3-a03b-78fdd16faeca_axa_annual+financial+report_1999.pdf"
|
95 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F21cdedc6-c082-4ae6-abb3-4c57f0cf9dd8_axa_annual+financial+report_1998.pdf"
|
96 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fd3132d9d-b656-470d-ba4f-fe8d51586e4b_axa_activity_csr_report_1998.pdf"
|
97 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F746d88d3-a4f7-4126-b539-a5da353f53d7_axa_annual+financial+report_1997.pdf"
|
98 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F97097956-6cd5-4fb4-a6ea-9aeb32fd9023_axa_activity_csr_report_1997.pdf"
|
99 |
+
]
|
100 |
+
|
101 |
+
|
102 |
+
docs_tarifs= [
|
103 |
+
"https://www.axa.de/site/axa-de/get/documents_E1805589786/axade/medien/privatkunden/fahrzeugversicherungen/kfz-versicherung/start-and-drive/start-and-drive-versicherungsbedingungen.pdf",
|
104 |
+
"https://www.axa.de/site/axa-de/get/documents_E-298610932/axade/medien/privatkunden/haftpflicht-und-recht/rechtsschutz/versicherungsbedingungen-roland-rechtsschutz.pdf",
|
105 |
+
"https://www.axa.de/site/axa-de/get/documents_E101690225/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-S-5-mio.pdf",
|
106 |
+
"https://www.axa.de/site/axa-de/get/documents_E-1067805129/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-S-10-mio.pdf",
|
107 |
+
"https://www.axa.de/site/axa-de/get/documents_E1026401604/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-M.pdf",
|
108 |
+
"https://www.axa.de/site/axa-de/get/documents_E1450059874/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-L.pdf",
|
109 |
+
"https://www.axa.de/site/axa-de/get/documents_E1636759799/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-S.pdf",
|
110 |
+
"https://www.axa.de/site/axa-de/get/documents_E1147682774/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-M-20%25.pdf",
|
111 |
+
"https://www.axa.de/site/axa-de/get/documents_E1642308493/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-M-40%25.pdf",
|
112 |
+
"https://www.axa.de/site/axa-de/get/documents_E1883536226/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-L.pdf",
|
113 |
+
]
|
114 |
+
|
115 |
+
docs_list = [
|
116 |
+
"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/85ec0278-bf2f-4392-94b9-c086717fa8f6_axa_urd2022_accessible_va.pdf"
|
117 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/e3f52b5e-d4aa-4fc8-8bcd-f432df86e804_axa_urd_2021_en_accessible.pdf"
|
118 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/d6aef906-e41f-40c7-ac9c-29044e98939d_AXA_URD_2020_EN_accessible_b.pdf"
|
119 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ffd5a8bd8-9ef1-40eb-b953-c268c0ab4bf9_axa-ri2019-en-accessible.pdf"
|
120 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F0a5e0bd9-78f2-4ef8-b32c-1d3d35ddce80_axa-ri2018-en-accessible.pdf"
|
121 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F01f6966b-c26c-4935-91dc-1b296511ba8c_axa_ri2017_gb_planche.pdf"
|
122 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F6f3108fd-fabc-4dc6-a984-23eb0dca7a19_axa-ri2016-en_01.pdf"
|
123 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fe2936c1a-65f0-40db-b34b-bef9c27e91c0_axa_2015_half_year_financial_report.pdf"
|
124 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fbbb94857-f5d4-4afd-81d0-e85666883936_axa_annual+financial+report_2014.pdf"
|
125 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F25fee379-c187-40e7-bf3a-5fe1423cec0f_axa_annual+financial+report_2013.pdf"
|
126 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F9224097f-d703-4efd-8050-6553ef4336f8_axa_annual+financial+report_2012b.pdf"
|
127 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F0b75d1fe-4b11-4462-9883-4e3bc7532bf4_axa_annual+financial+report_2011.pdf"
|
128 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fc76f47f4-0917-4fb1-b1ae-78e2a4fbcef5_axa_annual+financial+report_2010c+%281%29.pdf"
|
129 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fa151a532-da4f-4d12-8b3b-9867df4f9724_axa_annual+financial+report_2009.pdf"
|
130 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Ff322c77a-e2a2-4cd7-88a0-edd8ad4cd021_axa_annual+financial+report_2008.pdf"
|
131 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F820b669d-b3b5-4c14-986d-2223e2bcbcfb_axa_annual+financial+report_2007.pdf"
|
132 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F40f9da2a-1bcb-4e5e-9380-18f64b3ce86e_axa_annual+financial+report_2006b.pdf"
|
133 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Faf242b66-1308-4331-829f-fa91bd0db43e_axa_annual+financial+report_2005.pdf"
|
134 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F02acbd05-712f-4b73-93f0-dffa37e2faa2_axa_annual+financial+report_2004_ci.pdf"
|
135 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2Fafa397b5-d613-40f3-a28f-81bde0d461e2_axa_annual+financial+report_2003.pdf"
|
136 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F2a31ebb9-ba04-4998-982e-9dd336abca1f_axa_annual+financial+report_2002.pdf"
|
137 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F4f2676f4-d36c-4d2e-b088-ef26878ff28b_axa_annual+financial+report_2001.pdf"
|
138 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F98922150-f1c5-4df4-9006-a8ef17a514cd_axa_annual+financial+report_2000.pdf"
|
139 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F51c109ca-2bba-45b3-a03b-78fdd16faeca_axa_annual+financial+report_1999.pdf"
|
140 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F21cdedc6-c082-4ae6-abb3-4c57f0cf9dd8_axa_annual+financial+report_1998.pdf"
|
141 |
+
,"https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com%2F746d88d3-a4f7-4126-b539-a5da353f53d7_axa_annual+financial+report_1997.pdf"
|
142 |
+
]
|
143 |
+
|
144 |
+
|
145 |
+
# Both collections below are embedded with the same model.
EMBEDDING_MODEL = "hkunlp/instructor-large"

# Insurance terms: fetch the PDFs listed in docs_tarifs and split them.
docs = load_from_web(docs_tarifs)
sub_docs = load_and_split(docs, chunk_size=700)

# %%
create_and_add("axa_terms", sub_docs, EMBEDDING_MODEL)

# Reports listed in docs_list: fetch and split with the default chunk size.
docs = load_from_web(docs_list)
sub_docs = load_and_split(docs)

# %%
create_and_add("axa_gpt", sub_docs, EMBEDDING_MODEL)
|
app/playground/result.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d5b61c0f601cb65f2779f18fdbe5bf47f88d61f23dfbe2afdafb64c951207da8
|
3 |
+
size 429
|
app/playground/st_render_doc.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import utils as ut
|
3 |
+
import cloudpickle
|
4 |
+
|
5 |
+
# Pickled result set; the path is relative to the current working directory.
filename = "./result.pkl"

# NOTE(review): unpickling can execute arbitrary code, so only load files
# that were produced by a trusted source.
# The context manager closes the file handle deterministically, instead of
# relying on ``del`` to drop the last reference.
with open(filename, "rb") as pickle_file:
    result = dict(cloudpickle.load(pickle_file))

# Render the loaded result with the shared Streamlit formatter.
ut.format_result_set(result)
|
app/run.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This script inits the models and adds an example collection to the Vectorstore
|
2 |
+
# %%
|
3 |
+
import os
|
4 |
+
import pathlib
|
5 |
+
|
6 |
+
from load_model import load_embedding
|
7 |
+
from utils import get_chroma_client
|
8 |
+
from load_vectors import load_from_web, create_and_add, load_and_split, metadata_generator
|
9 |
+
|
10 |
+
current_path = str(pathlib.Path(__file__).parent.resolve())
# Read the OpenAI key from the file next to this script. The content is
# stripped because key files usually end with a newline, which would
# otherwise become part of the OPENAI_API_KEY value.
with open(pathlib.Path(current_path) / '.openaiapikey', 'r') as reader:
    os.environ['OPENAI_API_KEY'] = reader.read().strip()
# Binds the module name used by the commented-out calls further down.
import load_model

# %%
#load_model.load_gpu_model("decapoda-research/llama-7b-hf") #Download local model
#llm= load_model.load_openai_model()

# %%
# Load example data.
client = get_chroma_client()
# NOTE: reset() is called on the client before the example collection is
# created; presumably this wipes existing collections - confirm before
# running against a store you want to keep.
client.reset()
ef = load_embedding("hkunlp/instructor-large")
collection_name = "papers"
metadata = {"loaded_docs": [], "Subject": "Heikos Papers", "model_name": ef.model_name}
selected_collection = client.create_collection(collection_name, embedding_function=ef, metadata=metadata)

# Example PDFs that are loaded into the "papers" collection.
docs_tarifs = [
    "https://edoc.hu-berlin.de/bitstream/handle/18452/5294/33.pdf",
    "https://arxiv.org/pdf/1702.03556v3.pdf",
    "https://arxiv.org/pdf/1706.03762"
]

# %%
# Load the collection again to read the model name back from its metadata.
loaded_collection = client.get_collection(collection_name)
model_name = loaded_collection.metadata['model_name']

# %%

docs = load_from_web(docs_tarifs)
sub_docs = load_and_split(docs, chunk_size=1000)
create_and_add(collection_name, sub_docs, model_name, metadata)



# %%
#chain = load_model.create_chain(llm, collection=collection_name, model_name=model_name, metadata=metadata)
#result = chain({"query": "Ist mein Kinderwagen bei einem Leitungswasserschaden mitversichert?"})
#print(result)
#llm= load_model.load_openai_model(temperature=0.1)

#llm= load_model.load_cpu_model()

#meta= metadata_generator(docs[0], llm)
# %%
#print(meta)

# %%
|
app/utils.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import latex2markdown
|
3 |
+
from langchain.docstore.document import Document
|
4 |
+
import chromadb
|
5 |
+
from chromadb.config import Settings
|
6 |
+
import load_model
|
7 |
+
from load_model import load_embedding
|
8 |
+
from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
|
9 |
+
persist_directory = load_model.persist_directory
|
10 |
+
|
11 |
+
def format_document(document: Document):
    """Return the representation of ``document`` that is shown to the user.

    Currently this is just the result of ``document.dict()``.

    TODO: Implement a nice style
    """
    as_dict = document.dict()
    return as_dict
|
14 |
+
|
15 |
+
def format_result_set(result):
    """Render a result set in the Streamlit page.

    The text under ``result["result"]`` is converted with ``LaTeX2Markdown``
    and written out. A checkbox then controls whether the entries of
    ``result["source_documents"]`` are listed below it.

    Args:
        result: Mapping with the keys ``"result"`` and ``"source_documents"``.
    """
    answer_markdown = latex2markdown.LaTeX2Markdown(result["result"]).to_markdown()
    st.write(answer_markdown)

    show_sources = st.checkbox('Show source documents')
    # Looked up unconditionally, so a missing key fails even when the
    # checkbox is not ticked.
    source_documents = result["source_documents"]
    if not show_sources:
        return

    st.write('Source Documents:')
    for document in source_documents:
        st.write(format_document(document))
|
24 |
+
|
25 |
+
#@st.cache_resource
|
26 |
+
def get_chroma_client():
    """Create a Chroma client backed by the module-level ``persist_directory``.

    Returns:
        The object returned by ``chromadb.Client`` for the
        ``duckdb+parquet`` implementation.
    """
    settings = Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_directory,
    )
    return chromadb.Client(settings)
|
30 |
+
#@st.cache_data
|
31 |
+
def retrieve_collections():
    """Describe every collection known to the Chroma client.

    Returns:
        A tuple with one dict per collection, holding its ``name``, the
        ``model_name`` entry of its metadata, and the full ``metadata``.
    """
    def _describe(collection):
        # The 'model_name' lookup fails if a collection's metadata lacks it.
        return {
            'name': collection.name,
            'model_name': collection.metadata['model_name'],
            "metadata": collection.metadata,
        }

    all_collections = get_chroma_client().list_collections()
    return tuple(_describe(collection) for collection in all_collections)
|
docker-compose.yaml
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
version: "3.9"
|
2 |
+
services:
|
3 |
+
streamlit_app:
|
4 |
+
image: myretrievalgpt
|
5 |
+
build: .
|
6 |
+
tty: true
|
7 |
+
ports:
|
8 |
+
- 7860:7860
|
9 |
+
deploy:
|
10 |
+
resources:
|
11 |
+
reservations:
|
12 |
+
devices:
|
13 |
+
- capabilities: [gpu]
|
14 |
+
dev_app:
|
15 |
+
image: myretrievalgpt
|
16 |
+
tty: true
|
17 |
+
volumes:
|
18 |
+
- ./app:/app
|
19 |
+
- ./root:/root
|
20 |
+
depends_on:
|
21 |
+
- streamlit_app
|
22 |
+
deploy:
|
23 |
+
resources:
|
24 |
+
reservations:
|
25 |
+
devices:
|
26 |
+
- capabilities: [gpu]
|
requirements.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#git+https://github.com/hwchase17/langchain.git
|
2 |
+
langchain==0.0.154
|
3 |
+
git+https://github.com/huggingface/transformers.git
|
4 |
+
#git+https://github.com/chroma-core/chroma.git
|
5 |
+
chromadb
|
6 |
+
accelerate
|
7 |
+
bitsandbytes
|
8 |
+
InstructorEmbedding
|
9 |
+
cloudpickle
|
10 |
+
streamlit
|
11 |
+
requests==2.28.0
|
12 |
+
latex2markdown
|
13 |
+
openai
|
14 |
+
unstructured[local-inference]
|
15 |
+
llama-cpp-python
|