demo_obsei/obsei_module/obsei/source/website_crawler_source.py
import json
import logging
from abc import abstractmethod
from typing import List, Optional, Dict, Any

import mmh3

from obsei.payload import TextPayload
from obsei.source.base_source import BaseSource, BaseSourceConfig

logger = logging.getLogger(__name__)


class BaseCrawlerConfig(BaseSourceConfig):
    TYPE: str = "BaseCrawler"

    @abstractmethod
    def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        pass

    @abstractmethod
    def find_urls(self, url: str) -> List[str]:
        pass


class TrafilaturaCrawlerConfig(BaseCrawlerConfig):
    # For details on these configuration params, refer to:
    # https://trafilatura.readthedocs.io/
    _output_format: str = "json"
    TYPE: str = "Crawler"
    urls: List[str]
    include_comments: bool = False
    include_tables: bool = True
    no_fallback: bool = False
    include_images: bool = False
    include_formatting: bool = False
    deduplicate: bool = True
    no_ssl: bool = False
    is_feed: bool = False
    is_sitemap: bool = False
    include_links: bool = True
    target_language: Optional[str] = None
    url_blacklist: Optional[List[str]] = None
    def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        try:
            from trafilatura import extract, fetch_url
        except ImportError:
            logger.error("Trafilatura is not installed, install as follows: pip install trafilatura")
            return {}

        # Derive a stable identifier for the page when none is supplied
        url_id = url_id or "{:02x}".format(mmh3.hash(url, signed=False))

        url_content = fetch_url(
            url=url,
            no_ssl=self.no_ssl,
        )

        extracted_dict: Dict[str, Any] = {}
        if url_content is not None:
            extracted_data = extract(
                filecontent=url_content,
                record_id=url_id,
                no_fallback=self.no_fallback,
                output_format=self._output_format,
                include_comments=self.include_comments,
                include_tables=self.include_tables,
                include_images=self.include_images,
                include_formatting=self.include_formatting,
                include_links=self.include_links,
                deduplicate=self.deduplicate,
                url_blacklist=self.url_blacklist,
                target_language=self.target_language,
            )

            # extract() returns a JSON string; parse it and drop the bulky raw text
            if extracted_data:
                extracted_dict = json.loads(extracted_data)
                if "raw-text" in extracted_dict:
                    del extracted_dict["raw-text"]

        return extracted_dict

    def find_urls(self, url: str) -> List[str]:
        try:
            from trafilatura import feeds, sitemaps
        except ImportError:
            logger.error("Trafilatura is not installed, install as follows: pip install trafilatura")
            return []

        urls: List[str] = []
        if self.is_sitemap:
            urls = sitemaps.sitemap_search(url=url, target_lang=self.target_language)
        elif self.is_feed:
            urls = feeds.find_feed_urls(url=url, target_lang=self.target_language)

        return urls
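

# Illustrative sketch of using the config on its own: with is_sitemap=True,
# find_urls() expands a sitemap URL into individual page URLs, which can then be
# passed to extract_url(). The sitemap URL below is a placeholder, not part of
# this module.
#   config = TrafilaturaCrawlerConfig(urls=["https://example.com/sitemap.xml"], is_sitemap=True)
#   page_urls = config.find_urls(url=config.urls[0])
#   data = config.extract_url(url=page_urls[0]) if page_urls else {}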


class TrafilaturaCrawlerSource(BaseSource):
    NAME: Optional[str] = "Crawler"

    def lookup(  # type: ignore[override]
        self, config: TrafilaturaCrawlerConfig, **kwargs: Any
    ) -> List[TextPayload]:
        source_responses: List[TextPayload] = []

        final_urls = []
        if config.is_sitemap or config.is_feed:
            for url in config.urls:
                final_urls.extend(config.find_urls(url=url))
        else:
            final_urls = config.urls

        for url in final_urls:
            extracted_data = config.extract_url(url=url)
            # extract_url() returns an empty dict when fetching or extraction fails
            if not extracted_data:
                logger.warning(f"Unable to crawl {url}, hence skipping it")
                continue

            comments = (
                "" if "comments" not in extracted_data else extracted_data["comments"]
            )

            source_responses.append(
                TextPayload(
                    processed_text=f"{extracted_data['text']}. {comments}",
                    meta=extracted_data,
                    source_name=self.NAME,
                )
            )

        return source_responses
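

if __name__ == "__main__":
    # Minimal usage sketch. Assumes trafilatura is installed; the URL below is a
    # placeholder and not part of the library.
    example_config = TrafilaturaCrawlerConfig(urls=["https://example.com"])
    source = TrafilaturaCrawlerSource()
    for payload in source.lookup(config=example_config):
        # Each payload carries the extracted text plus trafilatura's metadata dict.
        print(payload.source_name, payload.processed_text[:200])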