enoreyes committed
Commit b00f9c3 · 1 Parent(s): a3c3fcf

Update ingest.py

Files changed (1):
  1. ingest.py +26 -66
ingest.py CHANGED
@@ -1,92 +1,52 @@
 """Load html from files, clean up, split, ingest into Weaviate."""
 import os
 from pathlib import Path
+from markdown import markdown
 
-import weaviate
+import pickle
 from bs4 import BeautifulSoup
 from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from InstructorEmbedding import INSTRUCTOR
 
+print(os.environ["HUGGINFACE_APIKEY"])
 
 def clean_data(data):
-    soup = BeautifulSoup(data)
-    text = soup.find_all("main", {"id": "main-content"})[0].get_text()
+    html = markdown(data)
+    soup = BeautifulSoup(html, "html.parser")
+    text = ''.join(soup.findAll(text=True))
     return "\n".join([t for t in text.split("\n") if t])
 
-
 docs = []
 metadatas = []
-for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
+for p in Path("docs").rglob("*"):
     if p.is_dir():
         continue
-    with open(p) as f:
-        docs.append(clean_data(f.read()))
-    metadatas.append({"source": p})
-
+    if str(p).lower().endswith(('.md', '.mdx')):
+        with open(p) as f:
+            print(p)
+            filename = os.path.splitext(p)[0]
+            docs.append(clean_data(f.read()))
+            metadatas.append({"source": filename})
 
 text_splitter = CharacterTextSplitter(
     separator="\n",
-    chunk_size=1000,
-    chunk_overlap=200,
+    chunk_size=512,
+    chunk_overlap=64,
     length_function=len,
 )
 
 documents = text_splitter.create_documents(docs, metadatas=metadatas)
 
+print("making embedding")
+embedding = HuggingFaceEmbeddings()
 
-WEAVIATE_URL = os.environ["WEAVIATE_URL"]
-client = weaviate.Client(
-    url=WEAVIATE_URL,
-    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
-)
-
-client.schema.delete_class("Paragraph")
-client.schema.get()
-schema = {
-    "classes": [
-        {
-            "class": "Paragraph",
-            "description": "A written paragraph",
-            "vectorizer": "text2vec-openai",
-            "moduleConfig": {
-                "text2vec-openai": {
-                    "model": "ada",
-                    "modelVersion": "002",
-                    "type": "text",
-                }
-            },
-            "properties": [
-                {
-                    "dataType": ["text"],
-                    "description": "The content of the paragraph",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": False,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "content",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "source",
-                },
-            ],
-        },
-    ]
-}
-
-client.schema.create(schema)
-
-with client.batch as batch:
-    for text in documents:
-        batch.add_data_object(
-            {"content": text.page_content, "source": str(text.metadata["source"])},
-            "Paragraph",
-        )
+print("beginning construction of faiss")
+search_index = FAISS.from_documents(documents, embedding)
+
+print("beginning pickle")
+with open("docs.pkl", 'wb') as f:
+    pickle.dump(search_index, f)
+
+print("Pickle complete")