Spaces:

jerpint
/

buster

Runtime error

App Files Files Community

buster / generate_embeddings.py

jerpint

First commit

d60948f about 1 year ago

raw

history blame contribute delete

2.63 kB

	import os
	import zipfile
	import requests
	import pandas as pd
	import time

	from buster.documents_manager import DeepLakeDocumentsManager

	from buster.docparser import get_all_documents
	from buster.parser import HuggingfaceParser

	hf_transformers_zip_url = "https://huggingface.co/datasets/hf-doc-build/doc-build/resolve/main/transformers/main.zip"


	def download_and_unzip(zip_url, target_dir, overwrite=False):
	"""Download a zip file from zip_url and unzip it to target_dir.

	# Example usage
	zip_url = "https://example.com/example.zip"
	target_dir = "downloaded_files"
	download_and_unzip(zip_url, target_dir, overwrite=True)

	ChatGPT generated.
	"""
	# Create the target directory if it doesn't exist
	if not os.path.exists(target_dir):
	os.makedirs(target_dir)

	# Get the filename from the zip_url
	zip_filename = os.path.basename(zip_url)
	target_path = os.path.join(target_dir, zip_filename)

	# Check if the file already exists
	if os.path.exists(target_path) and not overwrite:
	print(f"{zip_filename} already exists in the target directory.")
	return

	# Download the zip file
	response = requests.get(zip_url, stream=True)
	if response.status_code == 200:
	with open(target_path, "wb") as file:
	for chunk in response.iter_content(chunk_size=8192):
	file.write(chunk)
	print(f"{zip_filename} downloaded successfully.")

	# Unzip the file
	with zipfile.ZipFile(target_path, "r") as zip_ref:
	zip_ref.extractall(target_dir)
	print(f"{zip_filename} extracted successfully.")
	else:
	print(f"Failed to download {zip_filename}. Status code: {response.status_code}")


	# Download the tranformers html pages and unzip it
	download_and_unzip(zip_url=hf_transformers_zip_url, target_dir=".")

	# Extract all documents from the html into a dataframe
	df = get_all_documents(
	root_dir="transformers/main/en/",
	base_url="https://huggingface.co/docs/transformers/main/en/",
	parser_cls=HuggingfaceParser,
	min_section_length=100,
	max_section_length=1000,
	)

	# Add the source column
	df["source"] = "hf_transformers"

	# Save the .csv with chunks to disk
	df.to_csv("hf_transformers.csv")

	# Initialize the vector store
	dm = DeepLakeDocumentsManager(
	vector_store_path="deeplake_store",
	overwrite=True,
	required_columns=["url", "content", "source", "title"],
	)

	# Add all embeddings to the vector store
	dm.batch_add(
	df=df,
	batch_size=3000,
	min_time_interval=60,
	num_workers=32,
	csv_filename="embeddings.csv",
	csv_overwrite=False,
	)