Update process_documents.py
Browse files
- process_documents.py +3 -1
process_documents.py
CHANGED
|
@@ -36,14 +36,16 @@ def process_documents(urls):
|
|
| 36 |
def process_web(url, source_id):
|
| 37 |
data = WebBaseLoader(f"https://r.jina.ai/{url}").load()[0]
|
| 38 |
try:
|
|
|
|
| 39 |
page_content = data.page_content[data.page_content.index("Markdown Content:") + len("Markdown Content:"):].strip()
|
| 40 |
except Exception as e:
|
|
|
|
| 41 |
page_content = data.page_content.strip()
|
| 42 |
document_snippets = [
|
| 43 |
Document(
|
| 44 |
page_content=page_content,
|
| 45 |
metadata={
|
| 46 |
-
"header":
|
| 47 |
"source_url": url,
|
| 48 |
"source_type": "web",
|
| 49 |
"chunk_id": source_id,
|
|
|
|
| 36 |
def process_web(url, source_id):
|
| 37 |
data = WebBaseLoader(f"https://r.jina.ai/{url}").load()[0]
|
| 38 |
try:
|
| 39 |
+
header = re.search(r"Title: (.*)?", data.page_content).group(1)
|
| 40 |
page_content = data.page_content[data.page_content.index("Markdown Content:") + len("Markdown Content:"):].strip()
|
| 41 |
except Exception as e:
|
| 42 |
+
header = ""
|
| 43 |
page_content = data.page_content.strip()
|
| 44 |
document_snippets = [
|
| 45 |
Document(
|
| 46 |
page_content=page_content,
|
| 47 |
metadata={
|
| 48 |
+
"header": header,
|
| 49 |
"source_url": url,
|
| 50 |
"source_type": "web",
|
| 51 |
"chunk_id": source_id,
|