enoreyes committed
Commit b00f9c3 · 1 Parent(s): a3c3fcf

Update ingest.py

Files changed (1):
  1. ingest.py +26 -66
ingest.py CHANGED
@@ -1,92 +1,52 @@
 """Load html from files, clean up, split, ingest into Weaviate."""
 import os
 from pathlib import Path
+from markdown import markdown
 
-import weaviate
+import pickle
 from bs4 import BeautifulSoup
 from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from InstructorEmbedding import INSTRUCTOR
 
+print(os.environ["HUGGINFACE_APIKEY"])
 
 def clean_data(data):
-    soup = BeautifulSoup(data)
-    text = soup.find_all("main", {"id": "main-content"})[0].get_text()
+    html = markdown(data)
+    soup = BeautifulSoup(html, "html.parser")
+    text = ''.join(soup.findAll(text=True))
     return "\n".join([t for t in text.split("\n") if t])
 
-
 docs = []
 metadatas = []
-for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
+for p in Path("docs").rglob("*"):
     if p.is_dir():
         continue
-    with open(p) as f:
-        docs.append(clean_data(f.read()))
-    metadatas.append({"source": p})
-
+    if str(p).lower().endswith(('.md', '.mdx')):
+        with open(p) as f:
+            print(p)
+            filename = os.path.splitext(p)[0]
+            docs.append(clean_data(f.read()))
+            metadatas.append({"source": filename})
 
 text_splitter = CharacterTextSplitter(
     separator="\n",
-    chunk_size=1000,
-    chunk_overlap=200,
+    chunk_size=512,
+    chunk_overlap=64,
     length_function=len,
 )
 
 documents = text_splitter.create_documents(docs, metadatas=metadatas)
 
+print("making embedding")
+embedding = HuggingFaceEmbeddings()
 
-WEAVIATE_URL = os.environ["WEAVIATE_URL"]
-client = weaviate.Client(
-    url=WEAVIATE_URL,
-    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
-)
-
-client.schema.delete_class("Paragraph")
-client.schema.get()
-schema = {
-    "classes": [
-        {
-            "class": "Paragraph",
-            "description": "A written paragraph",
-            "vectorizer": "text2vec-openai",
-            "moduleConfig": {
-                "text2vec-openai": {
-                    "model": "ada",
-                    "modelVersion": "002",
-                    "type": "text",
-                }
-            },
-            "properties": [
-                {
-                    "dataType": ["text"],
-                    "description": "The content of the paragraph",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": False,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "content",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "source",
-                },
-            ],
-        },
-    ]
-}
-
-client.schema.create(schema)
-
-with client.batch as batch:
-    for text in documents:
-        batch.add_data_object(
-            {"content": text.page_content, "source": str(text.metadata["source"])},
-            "Paragraph",
-        )
+print("beginning construction of faiss")
+search_index = FAISS.from_documents(documents, embedding)
+
+print("beginning pickle")
+with open("docs.pkl", 'wb') as f:
+    pickle.dump(search_index, f)
+
+print("Pickle complete")