Spaces:

PabloVD
/

CAMELSDocBot

Sleeping

App Files Files Community

PabloVD committed on Nov 7, 2024

Commit

35d69cc

1 Parent(s): 3ad9a49

Revert to using local file for urls since requesting urls in HuggingFace spaces does not work properly

Browse files

Files changed (2) hide show

app.py +4 -27
urls.txt +42 -0

app.py CHANGED Viewed

@@ -9,11 +9,8 @@ from langchain_core.runnables import RunnablePassthrough
 from langchain_community.embeddings import HuggingFaceInstructEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_mistralai import ChatMistralAI
-import requests
 from langchain_community.document_loaders import WebBaseLoader
-import bs4
 from langchain_core.rate_limiters import InMemoryRateLimiter
-from urllib.parse import urljoin
 # Define a limiter to avoid rate limit issues with MistralAI
 rate_limiter = InMemoryRateLimiter(
@@ -22,31 +19,11 @@ rate_limiter = InMemoryRateLimiter(
     max_bucket_size=10,  # Controls the maximum burst size.
 )
-# Function to get all the subpages from a base url
-def get_subpages(base_url):
-    visited_urls = []
-    urls_to_visit = [base_url]
-    while urls_to_visit:
-        url = urls_to_visit.pop(0)
-        if url in visited_urls:
-            continue
-        visited_urls.append(url)
-        response = requests.get(url)
-        soup = bs4.BeautifulSoup(response.content, "html.parser")
-        for link in soup.find_all("a", href=True):
-            full_url = urljoin(base_url, link['href'])
-            if base_url in full_url and full_url.endswith(".html") and full_url not in visited_urls:
-                urls_to_visit.append(full_url)
-    visited_urls = visited_urls[1:]
-    return visited_urls
 # Get urls
-base_url = "https://camels.readthedocs.io/en/latest/"
-urls = get_subpages(base_url)
 # Load, chunk and index the contents of the blog.
 loader = WebBaseLoader(urls)

 from langchain_community.embeddings import HuggingFaceInstructEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_mistralai import ChatMistralAI
 from langchain_community.document_loaders import WebBaseLoader
 from langchain_core.rate_limiters import InMemoryRateLimiter
 # Define a limiter to avoid rate limit issues with MistralAI
 rate_limiter = InMemoryRateLimiter(
     max_bucket_size=10,  # Controls the maximum burst size.
 )
 # Get urls
+# Read the documentation URLs from the local list: one URL per line, blank lines ignored.
+with open("urls.txt", encoding="utf-8") as urlsfile:
+    stripped = (line.strip() for line in urlsfile)
+    urls = [url for url in stripped if url]
 # Load, chunk and index the contents of the blog.
 loader = WebBaseLoader(urls)

urls.txt ADDED Viewed

	@@ -0,0 +1,42 @@

+https://camels.readthedocs.io/en/latest/
+https://camels.readthedocs.io/en/latest/news.html
+https://camels.readthedocs.io/en/latest/goals.html
+https://camels.readthedocs.io/en/latest/publications.html
+https://camels.readthedocs.io/en/latest/data_access.html
+https://camels.readthedocs.io/en/latest/citation.html
+https://camels.readthedocs.io/en/latest/description.html
+https://camels.readthedocs.io/en/latest/suites_sets.html
+https://camels.readthedocs.io/en/latest/codes.html
+https://camels.readthedocs.io/en/latest/parameters.html
+https://camels.readthedocs.io/en/latest/organization.html
+https://camels.readthedocs.io/en/latest/snapshots.html
+https://camels.readthedocs.io/en/latest/subfind.html
+https://camels.readthedocs.io/en/latest/SubLink.html
+https://camels.readthedocs.io/en/latest/rockstar.html
+https://camels.readthedocs.io/en/latest/ahf.html
+https://camels.readthedocs.io/en/latest/caesar.html
+https://camels.readthedocs.io/en/latest/Pk.html
+https://camels.readthedocs.io/en/latest/Bk.html
+https://camels.readthedocs.io/en/latest/pdf.html
+https://camels.readthedocs.io/en/latest/VIDE.html
+https://camels.readthedocs.io/en/latest/Lya.html
+https://camels.readthedocs.io/en/latest/Xrays.html
+https://camels.readthedocs.io/en/latest/Profiles.html
+https://camels.readthedocs.io/en/latest/CMD.html
+https://camels.readthedocs.io/en/latest/SAM.html
+https://camels.readthedocs.io/en/latest/zoomGZ.html
+https://camels.readthedocs.io/en/latest/tutorials.html
+https://camels.readthedocs.io/en/latest/images.html
+https://camels.readthedocs.io/en/latest/camels_library.html
+https://camels.readthedocs.io/en/latest/pylians3.html
+https://camels.readthedocs.io/en/latest/team.html
+https://camels.readthedocs.io/en/latest/contact.html
+https://camels.readthedocs.io/en/latest/logo.html
+https://camels.readthedocs.io/en/latest/examples/Reading_Manipulating_Snapshots.html
+https://camels.readthedocs.io/en/latest/examples/Pk.html
+https://camels.readthedocs.io/en/latest/examples/Images.html
+https://camels.readthedocs.io/en/latest/examples/particles_subhalos.html
+https://camels.readthedocs.io/en/latest/index.html
+https://camels.readthedocs.io/en/latest/Images.html
+https://camels.readthedocs.io/en/latest/particles_subhalos.html
+https://camels.readthedocs.io/en/latest/Reading_Manipulating_Snapshots.html