Spaces:
Sleeping
Sleeping
Merge branch 'main' of https://huggingface.co/spaces/yeastcoast/polars-documentation-rag
Browse files- .gitattributes +35 -0
- README.md +12 -0
- src/data_processing/embeddings.py +37 -0
- src/data_processing/upload_to_qdrant.py +61 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Polars Documentation Rag
|
3 |
+
emoji: 🐨
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: blue
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.25.2
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
src/data_processing/embeddings.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
from torch import Tensor
from torch import functional as F
from transformers import AutoModel, AutoTokenizer

from src.config import EMBEDDING_MODEL
from src.utils import batched
|
7 |
+
|
8 |
+
|
9 |
+
class TextEmbedder:
    """Embed text with a Hugging Face encoder using masked mean pooling.

    Tokenizes inputs in batches, runs the encoder, and average-pools the
    last hidden states over non-padding tokens to produce one embedding
    vector per input string.
    """

    def __init__(self, modelname=EMBEDDING_MODEL, max_length=512):
        """Load tokenizer and model.

        Args:
            modelname: Hugging Face model id or local path.
            max_length: maximum token length; longer inputs are truncated.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(modelname)
        self.model = AutoModel.from_pretrained(modelname)
        self.max_length = max_length

    @staticmethod
    def average_pool(last_hidden_states: Tensor,
                     attention_mask: Tensor) -> Tensor:
        """Mean-pool hidden states over real (non-padding) tokens.

        Padding positions are zeroed via the attention mask before summing;
        the sum is then divided by the count of real tokens per sequence.
        """
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def embed_text(self, text: str | list[str], batch_size=128) -> list[list[float]]:
        """Return one embedding (as a list of floats) per input string.

        Args:
            text: a single string or a list of strings.
            batch_size: number of strings encoded per forward pass.
        """
        if isinstance(text, str):
            text = [text]

        outputs: list[list[float]] = []

        for batch in batched(text, n=batch_size):
            batch_dict = self.tokenizer(batch, max_length=self.max_length,
                                        padding=True, truncation=True,
                                        return_tensors='pt')
            # Inference only: disable autograd so no computation graph is
            # built and activations are not retained in memory.
            with torch.no_grad():
                output = self.model(**batch_dict)
            embeddings = self.average_pool(output.last_hidden_state,
                                           batch_dict['attention_mask'])
            outputs += embeddings.tolist()
        return outputs
|
src/data_processing/upload_to_qdrant.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any
|
2 |
+
|
3 |
+
from qdrant_client import QdrantClient, models
|
4 |
+
from uuid import uuid4
|
5 |
+
|
6 |
+
from src.config import QDRANT_COLLECTION_NAME
|
7 |
+
|
8 |
+
|
9 |
+
class QdrantStore:
    """Thin wrapper around a QdrantClient for collection and point management."""

    def __init__(self, client: QdrantClient, collection_config: dict | None = None):
        """Wrap *client* and optionally ensure a collection exists.

        Args:
            client: a connected QdrantClient.
            collection_config: kwargs for ``QdrantClient.create_collection``;
                must contain ``"collection_name"``. Passed to
                :meth:`create_collection` when given.
        """
        self.client = client
        # Local cache of known collection names for cheap existence checks.
        self.collection_names = {c.name for c in client.get_collections().collections}

        if collection_config is not None:
            self.create_collection(collection_config)

    def create_collection(self, collection_config: dict):
        """Create the described collection if it does not already exist.

        Args:
            collection_config: kwargs for ``QdrantClient.create_collection``,
                including ``"collection_name"``.
        """
        collection_name = collection_config["collection_name"]
        if not self.client.collection_exists(collection_name):
            self.client.create_collection(**collection_config)
        # Cache the name unconditionally: the collection exists either way.
        self.collection_names.add(collection_name)

    def _check_collection_name(self, collection_name):
        """Raise ValueError if *collection_name* is not a known collection."""
        if collection_name not in self.collection_names:
            raise ValueError(f"Collection: {collection_name} does not exist.")

    def upsert_points(self,
                      vectors: Any | list[Any],
                      payloads: dict | list[dict],
                      collection_name: str):
        """Upsert points with freshly generated UUID string ids.

        Accepts either a single vector/payload pair or parallel lists.

        Raises:
            ValueError: if the collection is unknown.
        """
        self._check_collection_name(collection_name)

        # Normalize the single-point case so the declared dict | list[dict]
        # hint actually works: iterating a bare dict would yield its keys,
        # generating one id per payload key instead of one per point.
        if isinstance(payloads, dict):
            payloads = [payloads]
            vectors = [vectors]

        ids = [str(uuid4()) for _ in payloads]

        self.client.upsert(
            collection_name=collection_name,
            points=models.Batch(
                ids=ids,
                payloads=payloads,
                vectors=vectors
            )
        )

    def delete_points(self,
                      filters: dict[str, list[models.FieldCondition]],
                      collection_name: str):
        """Delete all points matching *filters* (kwargs for ``models.Filter``).

        Raises:
            ValueError: if the collection is unknown.
        """
        self._check_collection_name(collection_name)

        self.client.delete(
            collection_name=collection_name,
            points_selector=models.Filter(**filters)
        )

    def delete_points_by_match(self,
                               key_value: tuple[str, list[str] | str],
                               collection_name: str):
        """Delete points whose payload *key* matches any of the given values.

        Args:
            key_value: ``(payload_key, value_or_values)``; a lone string
                value is wrapped into a one-element list.
            collection_name: target collection.
        """
        key, values = key_value
        if isinstance(values, str):
            values = [values]
        # Named to avoid shadowing the builtin `filter`.
        match_filter = {"must": [models.FieldCondition(key=key, match=models.MatchAny(any=values))]}
        self.delete_points(match_filter, collection_name)
|