GitHub Actions Bot committed on
Commit
4c50f10
·
2 Parent(s): c379a6e 34f9ef1

Merge branch 'main' of https://huggingface.co/spaces/yeastcoast/polars-documentation-rag

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Polars Documentation Rag
3
+ emoji: 🐨
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.25.2
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
src/data_processing/embeddings.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModel, AutoTokenizer
2
+ from torch import Tensor
3
+ from torch import functional as F
4
+
5
+ from src.config import EMBEDDING_MODEL
6
+ from src.utils import batched
7
+
8
+
9
class TextEmbedder:
    """Embed text with a Hugging Face encoder followed by masked mean pooling.

    The tokenizer and model are loaded once at construction; ``embed_text``
    then converts strings into plain Python lists of floats.
    """

    def __init__(self, modelname=EMBEDDING_MODEL, max_length=512):
        """Load the tokenizer and model identified by ``modelname``.

        Args:
            modelname: Identifier handed to ``from_pretrained`` for both the
                tokenizer and the model.
            max_length: Token limit handed to the tokenizer; longer inputs
                are truncated.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(modelname)
        self.model = AutoModel.from_pretrained(modelname)
        self.max_length = max_length

    @staticmethod
    def average_pool(last_hidden_states: Tensor,
                     attention_mask: Tensor) -> Tensor:
        """Average the hidden states over the positions kept by the mask.

        Args:
            last_hidden_states: Model output to pool; assumed to be shaped
                (batch, sequence, hidden) -- TODO confirm against the model.
            attention_mask: Mask with non-zero entries at positions to keep;
                assumed to be shaped (batch, sequence).

        Returns:
            One pooled vector per row of the batch.
        """
        # Zero out masked positions so they do not contribute to the sum.
        masked = last_hidden_states.masked_fill(
            ~attention_mask[..., None].bool(), 0.0
        )
        # Clamp the divisor so a fully masked row yields zeros instead of NaN.
        token_counts = attention_mask.sum(dim=1).clamp(min=1)[..., None]
        return masked.sum(dim=1) / token_counts

    def embed_text(self, text: str | list[str], batch_size=128):
        """Embed one string or a list of strings.

        Args:
            text: A single string (treated as a one-element list) or a list
                of strings.
            batch_size: Number of texts tokenized and encoded per forward
                pass.

        Returns:
            A list with one embedding (a list of floats) per input text, in
            input order. An empty input list yields an empty list.
        """
        # Function-scope import: only ``Tensor`` is imported at file level.
        import torch

        if isinstance(text, str):
            text = [text]

        outputs = []

        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            for batch in batched(text, n=batch_size):
                batch_dict = self.tokenizer(
                    batch,
                    max_length=self.max_length,
                    padding=True,
                    truncation=True,
                    return_tensors='pt',
                )
                output = self.model(**batch_dict)
                embeddings = self.average_pool(
                    output.last_hidden_state, batch_dict['attention_mask']
                )
                outputs.extend(embeddings.tolist())

        return outputs
src/data_processing/upload_to_qdrant.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ from qdrant_client import QdrantClient, models
4
+ from uuid import uuid4
5
+
6
+ from src.config import QDRANT_COLLECTION_NAME
7
+
8
+
9
class QdrantStore:
    """Thin wrapper around a ``QdrantClient`` for upserting and deleting points.

    Collection names are cached locally: they are read from the server once
    at construction and extended by ``create_collection``. Operations on a
    name missing from that cache raise ``ValueError``.
    """

    def __init__(self, client: QdrantClient, collection_config=None):
        """Store the client and snapshot the existing collection names.

        Args:
            client: Connected Qdrant client used for every operation.
            collection_config: Optional keyword arguments for
                ``client.create_collection``; when given, the collection is
                created if it does not exist yet.
        """
        self.client = client
        self.collection_names = {
            collection.name
            for collection in client.get_collections().collections
        }

        if collection_config is not None:
            self.create_collection(collection_config)

    def create_collection(self, collection_config: dict):
        """Create the configured collection unless it already exists.

        Args:
            collection_config: Keyword arguments for
                ``client.create_collection``; must contain
                ``"collection_name"``.
        """
        collection_name = collection_config["collection_name"]
        if not self.client.collection_exists(collection_name):
            self.client.create_collection(**collection_config)
        # Record the name either way so later operations accept it.
        self.collection_names.add(collection_name)

    def _check_collection_name(self, collection_name):
        """Raise ``ValueError`` if ``collection_name`` is not in the local cache."""
        if collection_name not in self.collection_names:
            raise ValueError(f"Collection: {collection_name} does not exist.")

    def upsert_points(self,
                      vectors: Any | list[Any],
                      payloads: dict | list[dict],
                      collection_name: str):
        """Upsert vectors with their payloads under freshly generated UUIDs.

        Args:
            vectors: One vector, or a list of vectors aligned with
                ``payloads``.
            payloads: One payload dict, or a list of payload dicts.
            collection_name: Target collection; must be known to this store.

        Raises:
            ValueError: If ``collection_name`` is not a known collection.
        """
        self._check_collection_name(collection_name)

        # A single payload means a single point: wrap both arguments so the
        # batch receives lists and exactly one id is generated (iterating a
        # bare dict would otherwise produce one id per key).
        if isinstance(payloads, dict):
            payloads = [payloads]
            vectors = [vectors]

        ids = [str(uuid4()) for _ in payloads]

        self.client.upsert(
            collection_name=collection_name,
            points=models.Batch(
                ids=ids,
                payloads=payloads,
                vectors=vectors,
            ),
        )

    def delete_points(self,
                      filters: dict[str, list[models.FieldCondition]],
                      collection_name: str):
        """Delete every point matching the given filter clauses.

        Args:
            filters: Keyword arguments for ``models.Filter`` (for example
                ``{"must": [...]}``).
            collection_name: Target collection; must be known to this store.

        Raises:
            ValueError: If ``collection_name`` is not a known collection.
        """
        self._check_collection_name(collection_name)

        self.client.delete(
            collection_name=collection_name,
            points_selector=models.Filter(**filters),
        )

    def delete_points_by_match(self,
                               key_value: tuple[str, list[str] | str],
                               collection_name: str):
        """Delete points whose payload field matches any of the given values.

        Args:
            key_value: ``(key, values)`` pair; ``values`` may be a single
                string or a list of strings.
            collection_name: Target collection; must be known to this store.

        Raises:
            ValueError: If ``collection_name`` is not a known collection.
        """
        key, values = key_value
        if isinstance(values, str):
            values = [values]
        # Named ``match_filter`` to avoid shadowing the ``filter`` builtin.
        match_filter = {
            "must": [
                models.FieldCondition(key=key, match=models.MatchAny(any=values))
            ]
        }
        self.delete_points(match_filter, collection_name)