Spaces:
Sleeping
Sleeping
Luca Foppiano
committed on
Commit
•
0b28b48
1
Parent(s):
6551eca
Setuptools build, PyPI release, GitHub Actions improvements (#5)
Browse files- .github/workflows/ci-build.yml +5 -14
- .github/workflows/ci-release.yml +80 -0
- CHANGELOG.md +18 -0
- README.md +16 -0
- document_qa_engine.py → document_qa/document_qa_engine.py +0 -0
- grobid_processors.py → document_qa/grobid_processors.py +0 -0
- pyproject.toml +35 -0
- pytest.ini +2 -0
- requirements.txt +14 -15
- streamlit_app.py +2 -2
.github/workflows/ci-build.yml
CHANGED
@@ -14,9 +14,12 @@ jobs:
|
|
14 |
steps:
|
15 |
- uses: actions/checkout@v2
|
16 |
- name: Set up Python 3.9
|
17 |
-
uses: actions/setup-python@
|
18 |
with:
|
19 |
python-version: "3.9"
|
|
|
|
|
|
|
20 |
- name: Install dependencies
|
21 |
run: |
|
22 |
python -m pip install --upgrade pip
|
@@ -30,16 +33,4 @@ jobs:
|
|
30 |
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
31 |
# - name: Test with pytest
|
32 |
# run: |
|
33 |
-
# pytest
|
34 |
-
|
35 |
-
docker-build-documentqa:
|
36 |
-
needs: [build]
|
37 |
-
|
38 |
-
runs-on: ubuntu-latest
|
39 |
-
|
40 |
-
steps:
|
41 |
-
- uses: actions/checkout@v2
|
42 |
-
- name: Build the Docker image
|
43 |
-
run: docker build . --file Dockerfile --tag lfoppiano/documentqa:develop
|
44 |
-
- name: Cleanup older than 24h images and containers
|
45 |
-
run: docker system prune --filter "until=24h" --force
|
|
|
14 |
steps:
|
15 |
- uses: actions/checkout@v2
|
16 |
- name: Set up Python 3.9
|
17 |
+
uses: actions/setup-python@v4
|
18 |
with:
|
19 |
python-version: "3.9"
|
20 |
+
cache: 'pip'
|
21 |
+
- name: Cleanup more disk space
|
22 |
+
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
23 |
- name: Install dependencies
|
24 |
run: |
|
25 |
python -m pip install --upgrade pip
|
|
|
33 |
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
34 |
# - name: Test with pytest
|
35 |
# run: |
|
36 |
+
# pytest
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.github/workflows/ci-release.yml
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Build release
|
2 |
+
|
3 |
+
on:
|
4 |
+
workflow_dispatch:
|
5 |
+
push:
|
6 |
+
tags:
|
7 |
+
- 'v*'
|
8 |
+
|
9 |
+
concurrency:
|
10 |
+
group: docker
|
11 |
+
cancel-in-progress: true
|
12 |
+
|
13 |
+
jobs:
|
14 |
+
build:
|
15 |
+
runs-on: ubuntu-latest
|
16 |
+
steps:
|
17 |
+
- uses: actions/checkout@v2
|
18 |
+
- name: Set up Python 3.9
|
19 |
+
uses: actions/setup-python@v4
|
20 |
+
with:
|
21 |
+
python-version: "3.9"
|
22 |
+
cache: 'pip'
|
23 |
+
- name: Cleanup more disk space
|
24 |
+
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
25 |
+
- name: Install dependencies
|
26 |
+
run: |
|
27 |
+
python -m pip install --upgrade pip
|
28 |
+
pip install --upgrade flake8 pytest pycodestyle
|
29 |
+
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
30 |
+
- name: Lint with flake8
|
31 |
+
run: |
|
32 |
+
# stop the build if there are Python syntax errors or undefined names
|
33 |
+
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
34 |
+
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
35 |
+
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
36 |
+
# - name: Test with pytest
|
37 |
+
# run: |
|
38 |
+
# pytest
|
39 |
+
|
40 |
+
- name: Build and Publish to PyPI
|
41 |
+
uses: conchylicultor/pypi-build-publish@v1
|
42 |
+
with:
|
43 |
+
pypi-token: ${{ secrets.PYPI_API_TOKEN }}
|
44 |
+
|
45 |
+
|
46 |
+
docker-build:
|
47 |
+
needs: [build]
|
48 |
+
runs-on: ubuntu-latest
|
49 |
+
|
50 |
+
steps:
|
51 |
+
- name: Set tags
|
52 |
+
id: set_tags
|
53 |
+
run: |
|
54 |
+
DOCKER_IMAGE=lfoppiano/document-insights-qa
|
55 |
+
VERSION=""
|
56 |
+
if [[ $GITHUB_REF == refs/tags/v* ]]; then
|
57 |
+
VERSION=${GITHUB_REF#refs/tags/v}
|
58 |
+
fi
|
59 |
+
if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
|
60 |
+
TAGS="${VERSION}"
|
61 |
+
else
|
62 |
+
TAGS="latest"
|
63 |
+
fi
|
64 |
+
echo "TAGS=${TAGS}"
|
65 |
+
echo ::set-output name=tags::${TAGS}
|
66 |
+
- name: Create more disk space
|
67 |
+
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
68 |
+
- uses: actions/checkout@v2
|
69 |
+
- name: Build and push
|
70 |
+
id: docker_build
|
71 |
+
uses: mr-smithers-excellent/docker-build-push@v5
|
72 |
+
with:
|
73 |
+
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
74 |
+
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
75 |
+
image: lfoppiano/document-insights-qa
|
76 |
+
registry: docker.io
|
77 |
+
pushImage: ${{ github.event_name != 'pull_request' }}
|
78 |
+
tags: ${{ steps.set_tags.outputs.tags }}
|
79 |
+
- name: Image digest
|
80 |
+
run: echo ${{ steps.docker_build.outputs.digest }}
|
CHANGELOG.md
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Changelog
|
2 |
+
|
3 |
+
All notable changes to this project will be documented in this file.
|
4 |
+
|
5 |
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
6 |
+
|
7 |
+
## [0.0.1] – 2023-05-13
|
8 |
+
|
9 |
+
### Added
|
10 |
+
|
11 |
+
+ Kick off application
|
12 |
+
+ Support for GPT-3.5
|
13 |
+
+ Support for Mistral + SentenceTransformer
|
14 |
+
+ Streamlit application
|
15 |
+
+ Docker image
|
16 |
+
+ PyPI package
|
17 |
+
|
18 |
+
<!-- markdownlint-disable-file MD024 MD033 -->
|
README.md
CHANGED
@@ -43,6 +43,22 @@ Allow to change the number of embedding chunks that are considered for respondin
|
|
43 |
By default, the mode is set to LLM (Language Model) which enables question/answering. You can directly ask questions related to the document content, and the system will answer the question using content from the document.
|
44 |
If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete.
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
## Acknowledgement
|
48 |
|
|
|
43 |
By default, the mode is set to LLM (Language Model) which enables question/answering. You can directly ask questions related to the document content, and the system will answer the question using content from the document.
|
44 |
If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete.
|
45 |
|
46 |
+
## Development notes
|
47 |
+
|
48 |
+
To release a new version:
|
49 |
+
|
50 |
+
- `bump-my-version bump patch`
|
51 |
+
- `git push --tags`
|
52 |
+
|
53 |
+
To use docker:
|
54 |
+
|
55 |
+
- `docker run lfoppiano/document-insights-qa:latest`
|
56 |
+
|
57 |
+
To install the library from PyPI:
|
58 |
+
|
59 |
+
- `pip install document-qa-engine`
|
60 |
+
|
61 |
+
|
62 |
|
63 |
## Acknowledgement
|
64 |
|
document_qa_engine.py → document_qa/document_qa_engine.py
RENAMED
File without changes
|
grobid_processors.py → document_qa/grobid_processors.py
RENAMED
File without changes
|
pyproject.toml
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = ["setuptools", "setuptools-scm"]
|
3 |
+
build-backend = "setuptools.build_meta"
|
4 |
+
|
5 |
+
[tool.bumpversion]
|
6 |
+
current_version = "0.0.1"
|
7 |
+
commit = "true"
|
8 |
+
tag = "true"
|
9 |
+
tag_name = "v{new_version}"
|
10 |
+
|
11 |
+
#[[tool.bumpversion.files]]
|
12 |
+
#filename = "version.txt"
|
13 |
+
#search = "{current_version}"
|
14 |
+
#replace = "{new_version}"
|
15 |
+
|
16 |
+
[project]
|
17 |
+
name = "document-qa-engine"
|
18 |
+
license = { file = "LICENSE" }
|
19 |
+
authors = [
|
20 |
+
{ name = "Luca Foppiano", email = "[email protected]" },
|
21 |
+
]
|
22 |
+
maintainers = [
|
23 |
+
{ name = "Luca Foppiano", email = "[email protected]" }
|
24 |
+
]
|
25 |
+
description = "Scientific Document Insight Q/A"
|
26 |
+
readme = "README.md"
|
27 |
+
|
28 |
+
dynamic = ['version']
|
29 |
+
|
30 |
+
[tool.setuptools_scm]
|
31 |
+
|
32 |
+
[project.urls]
|
33 |
+
Homepage = "https://document-insights.streamlit.app"
|
34 |
+
Repository = "https://github.com/lfoppiano/document-qa"
|
35 |
+
Changelog = "https://github.com/lfoppiano/document-qa/blob/main/CHANGELOG.md"
|
pytest.ini
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
[pytest]
|
2 |
+
testpaths = tests
|
requirements.txt
CHANGED
@@ -1,15 +1,13 @@
|
|
1 |
# Grobid
|
2 |
-
grobid-quantities-client
|
3 |
-
grobid-client-python
|
4 |
-
grobid_tei_xml
|
5 |
|
6 |
# Utils
|
7 |
tqdm
|
8 |
-
|
9 |
-
pyyaml
|
10 |
-
dateparser
|
11 |
pytest
|
12 |
-
streamlit
|
13 |
lxml
|
14 |
Beautifulsoup4
|
15 |
python-dotenv
|
@@ -17,11 +15,12 @@ watchdog
|
|
17 |
|
18 |
# LLM
|
19 |
chromadb==0.3.25
|
20 |
-
tiktoken
|
21 |
-
openai
|
22 |
-
langchain==0.0.
|
23 |
-
promptlayer
|
24 |
-
typing-inspect==0.
|
25 |
-
typing_extensions==4.
|
26 |
-
pydantic==
|
27 |
-
sentence_transformers
|
|
|
|
1 |
# Grobid
|
2 |
+
grobid-quantities-client==0.4.0
|
3 |
+
grobid-client-python==0.0.5
|
4 |
+
grobid_tei_xml==0.1.3
|
5 |
|
6 |
# Utils
|
7 |
tqdm
|
8 |
+
pyyaml==6.0
|
|
|
|
|
9 |
pytest
|
10 |
+
streamlit==1.27.1
|
11 |
lxml
|
12 |
Beautifulsoup4
|
13 |
python-dotenv
|
|
|
15 |
|
16 |
# LLM
|
17 |
chromadb==0.3.25
|
18 |
+
tiktoken==0.4.0
|
19 |
+
openai==0.27.7
|
20 |
+
langchain==0.0.314
|
21 |
+
promptlayer==0.2.4
|
22 |
+
typing-inspect==0.9.0
|
23 |
+
typing_extensions==4.8.0
|
24 |
+
pydantic==2.4.2
|
25 |
+
sentence_transformers==2.2.2
|
26 |
+
bump-my-version
|
streamlit_app.py
CHANGED
@@ -13,8 +13,8 @@ import streamlit as st
|
|
13 |
from langchain.chat_models import PromptLayerChatOpenAI
|
14 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
|
15 |
|
16 |
-
from document_qa_engine import DocumentQAEngine
|
17 |
-
from grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
|
18 |
from grobid_client_generic import GrobidClientGeneric
|
19 |
|
20 |
if 'rqa' not in st.session_state:
|
|
|
13 |
from langchain.chat_models import PromptLayerChatOpenAI
|
14 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
|
15 |
|
16 |
+
from document_qa.document_qa_engine import DocumentQAEngine
|
17 |
+
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
|
18 |
from grobid_client_generic import GrobidClientGeneric
|
19 |
|
20 |
if 'rqa' not in st.session_state:
|