update all documentation
data/scraping_scripts/github_to_markdown_ai_docs.py
CHANGED
@@ -142,16 +142,33 @@ def download_file(file_url: str, file_path: str, retries: int = 0):
     else:
         print(f"Failed to download file after {MAX_RETRIES} retries: {e}")
 
-
-def convert_ipynb_to_md(ipynb_path: str, md_path: str):
-    with open(ipynb_path, "r", encoding="utf-8") as f:
-        notebook = nbformat.read(f, as_version=4)
-
-    exporter = MarkdownExporter()
-    markdown, _ = exporter.from_notebook_node(notebook)
-
-    with open(md_path, "w", encoding="utf-8") as f:
-        f.write(markdown)
+# def convert_ipynb_to_md(ipynb_path: str, md_path: str):
+#     with open(ipynb_path, "r", encoding="utf-8") as f:
+#         notebook = nbformat.read(f, as_version=4)
+
+#     exporter = MarkdownExporter()
+#     markdown, _ = exporter.from_notebook_node(notebook)
+
+#     with open(md_path, "w", encoding="utf-8") as f:
+#         f.write(markdown)
+
+
+def convert_ipynb_to_md(ipynb_path: str, md_path: str):
+    try:
+        with open(ipynb_path, "r", encoding="utf-8") as f:
+            notebook = nbformat.read(f, as_version=4)
+
+        exporter = MarkdownExporter()
+        markdown, _ = exporter.from_notebook_node(notebook)
+
+        with open(md_path, "w", encoding="utf-8") as f:
+            f.write(markdown)
+    except (json.JSONDecodeError, nbformat.reader.NotJSONError) as e:
+        print(f"Error converting notebook {ipynb_path}: {str(e)}")
+        print("Skipping this file and continuing with others...")
+    except Exception as e:
+        print(f"Unexpected error converting notebook {ipynb_path}: {str(e)}")
+        print("Skipping this file and continuing with others...")
 
 
 def fetch_files(api_url: str, local_dir: str):
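The new convert_ipynb_to_md wraps notebook conversion in try/except so one malformed .ipynb no longer aborts the whole documentation scrape. Below is a minimal, self-contained sketch of how that skip-and-continue pattern behaves in use; it is not part of the commit, the input directory is hypothetical, and it assumes nbformat plus the nbconvert dependency added to requirements.txt below are installed.

# Standalone sketch of the conversion-with-skip pattern introduced above.
# Hypothetical paths; requires nbformat and nbconvert to be installed.
import json
from pathlib import Path

import nbformat
from nbconvert import MarkdownExporter


def convert_ipynb_to_md(ipynb_path: str, md_path: str):
    try:
        with open(ipynb_path, "r", encoding="utf-8") as f:
            notebook = nbformat.read(f, as_version=4)
        markdown, _ = MarkdownExporter().from_notebook_node(notebook)
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(markdown)
    except (json.JSONDecodeError, nbformat.reader.NotJSONError) as e:
        # A broken notebook is reported and skipped; the run keeps going.
        print(f"Error converting notebook {ipynb_path}: {e}")


for nb in Path("data/notebooks").glob("*.ipynb"):  # hypothetical input directory
    convert_ipynb_to_md(str(nb), str(nb.with_suffix(".md")))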
data/scraping_scripts/process_md_files.py
CHANGED
@@ -381,7 +381,7 @@ SOURCE_CONFIGS = {
         "url_extension": ".ipynb",
     },
     "langchain": {
-        "base_url": "https://python.langchain.com/
+        "base_url": "https://python.langchain.com/docs/",
         "input_directory": "data/langchain_md_files",
         "output_file": "data/langchain_data.jsonl",
         "source_name": "langchain",
@@ -460,11 +460,7 @@ def should_include_file(file_path: str, config: Dict) -> bool:
 
 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(
-        encoding.encode(
-            string, disallowed_special=(encoding.special_tokens_set - {"<|endoftext|>"})
-        )
-    )
+    num_tokens = len(encoding.encode(string, disallowed_special=()))
     return num_tokens
 
 
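The rewritten num_tokens_from_string passes disallowed_special=() so scraped text that happens to contain special-token markers (for example a literal <|endoftext|>) is simply encoded and counted, rather than triggering the ValueError that tiktoken raises by default. A minimal sketch of that behavior, assuming tiktoken is installed; the sample string is made up.

# Sketch: with disallowed_special=(), no substring is rejected as a special token,
# so text containing "<|endoftext|>" is counted instead of raising ValueError.
import tiktoken


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string, disallowed_special=()))


sample = "Some docs embed markers like <|endoftext|> verbatim."  # hypothetical sample
print(num_tokens_from_string(sample, "cl100k_base"))
# With tiktoken's default disallowed_special="all", the same encode call would raise ValueError.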
requirements.txt
CHANGED
@@ -17,4 +17,5 @@ google-generativeai
 llama-index-llms-gemini
 gradio
 pymongo
-huggingface_hub
+huggingface_hub
+nbconvert
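nbconvert backs the MarkdownExporter used in github_to_markdown_ai_docs.py; declaring it in requirements.txt keeps fresh environments from failing on import. A quick, hypothetical sanity check (not part of the commit) after installing the requirements:

# Hypothetical check: confirm the newly declared nbconvert dependency resolved.
from importlib.metadata import version

from nbconvert import MarkdownExporter  # exporter used by github_to_markdown_ai_docs.py

print(f"nbconvert {version('nbconvert')} provides {MarkdownExporter.__name__}")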