Update ingest.py
ingest.py (CHANGED)
@@ -1,92 +1,52 @@
 """Load html from files, clean up, split, ingest into Weaviate."""
 import os
 from pathlib import Path
+from markdown import markdown
 
-import weaviate
+import pickle
 from bs4 import BeautifulSoup
 from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from InstructorEmbedding import INSTRUCTOR
 
+print(os.environ["HUGGINFACE_APIKEY"])
 
 def clean_data(data):
-    soup = BeautifulSoup(data)
-    text = soup.find_all("main", {"id": "main-content"})[0].get_text()
+    html = markdown(data)
+    soup = BeautifulSoup(html, "html.parser")
+    text = ''.join(soup.findAll(text=True))
     return "\n".join([t for t in text.split("\n") if t])
 
-
 docs = []
 metadatas = []
-for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
+for p in Path("docs").rglob("*"):
     if p.is_dir():
         continue
-    with open(p) as f:
-        docs.append(clean_data(f.read()))
-        metadatas.append({"source": p})
-
+    if str(p).lower().endswith(('.md', '.mdx')):
+        with open(p) as f:
+            print(p)
+            filename = os.path.splitext(p)[0]
+            docs.append(clean_data(f.read()))
+            metadatas.append({"source": filename})
 
 text_splitter = CharacterTextSplitter(
     separator="\n",
-    chunk_size=1000,
-    chunk_overlap=200,
+    chunk_size=512,
+    chunk_overlap=64,
     length_function=len,
 )
 
 documents = text_splitter.create_documents(docs, metadatas=metadatas)
 
+print("making embedding")
+embedding = HuggingFaceEmbeddings()
 
-WEAVIATE_URL = os.environ["WEAVIATE_URL"]
-client = weaviate.Client(
-    url=WEAVIATE_URL,
-    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
-)
-
-client.schema.delete_class("Paragraph")
-client.schema.get()
-schema = {
-    "classes": [
-        {
-            "class": "Paragraph",
-            "description": "A written paragraph",
-            "vectorizer": "text2vec-openai",
-            "moduleConfig": {
-                "text2vec-openai": {
-                    "model": "ada",
-                    "modelVersion": "002",
-                    "type": "text",
-                }
-            },
-            "properties": [
-                {
-                    "dataType": ["text"],
-                    "description": "The content of the paragraph",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": False,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "content",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "source",
-                },
-            ],
-        },
-    ]
-}
+print("beginning construction of faiss")
+search_index = FAISS.from_documents(documents, embedding)
 
-client.schema.create(schema)
+print("beginning pickle")
+with open("docs.pkl", 'wb') as f:
+    pickle.dump(search_index, f)
 
-with client.batch as batch:
-    for text in documents:
-        batch.add_data_object(
-            {"content": text.page_content, "source": str(text.metadata["source"])},
-            "Paragraph",
-        )
+print("Pickle complete")
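After this commit the Space no longer writes to a hosted Weaviate class: it embeds the markdown chunks locally with HuggingFaceEmbeddings and pickles a FAISS index to docs.pkl. A minimal sketch of how the query side of the app might load and search that pickle follows; the query string and k value are illustrative, not part of this commit, while similarity_search is LangChain's standard FAISS method:

import pickle

# Unpickling needs the same langchain / sentence-transformers environment
# used at ingest time, because the embedding object is serialized
# inside the index.
with open("docs.pkl", "rb") as f:
    search_index = pickle.load(f)

# Return the k chunks whose embeddings are closest to the query,
# along with the source filenames stored in the metadata above.
for doc in search_index.similarity_search("How do I split documents?", k=4):
    print(doc.metadata["source"], doc.page_content[:100])

Since HuggingFaceEmbeddings() is constructed with no arguments, it falls back to its default sentence-transformers model; the OpenAIEmbeddings and INSTRUCTOR imports added in this revision are not actually used yet.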