demo_obsei/obsei_module/obsei/source/website_crawler_source.py
import json
import logging
from abc import abstractmethod
from typing import List, Optional, Dict, Any

import mmh3

from obsei.payload import TextPayload
from obsei.source.base_source import BaseSource, BaseSourceConfig

logger = logging.getLogger(__name__)


class BaseCrawlerConfig(BaseSourceConfig):
    TYPE: str = "BaseCrawler"

    @abstractmethod
    def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        pass

    @abstractmethod
    def find_urls(self, url: str) -> List[str]:
        pass


class TrafilaturaCrawlerConfig(BaseCrawlerConfig):
    # For details on these configuration params, refer to:
    # https://trafilatura.readthedocs.io/
    _output_format: str = "json"
    TYPE: str = "Crawler"
    urls: List[str]
    include_comments: bool = False
    include_tables: bool = True
    no_fallback: bool = False
    include_images: bool = False
    include_formatting: bool = False
    deduplicate: bool = True
    no_ssl: bool = False
    is_feed: bool = False
    is_sitemap: bool = False
    include_links: bool = True
    target_language: Optional[str] = None
    url_blacklist: Optional[List[str]] = None
    def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        try:
            from trafilatura import extract, fetch_url
        except ImportError:
            logger.error("Trafilatura is not installed, install as follows: pip install trafilatura")
            return {}

        # Derive a stable identifier for the page when none is supplied
        url_id = url_id or "{:02x}".format(mmh3.hash(url, signed=False))

        url_content = fetch_url(
            url=url,
            no_ssl=self.no_ssl,
        )

        extracted_dict: Dict[str, Any] = {}
        if url_content is not None:
            extracted_data = extract(
                filecontent=url_content,
                record_id=url_id,
                no_fallback=self.no_fallback,
                output_format=self._output_format,
                include_comments=self.include_comments,
                include_tables=self.include_tables,
                include_images=self.include_images,
                include_formatting=self.include_formatting,
                include_links=self.include_links,
                deduplicate=self.deduplicate,
                url_blacklist=self.url_blacklist,
                target_language=self.target_language,
            )

            # extract() returns a JSON string; parse it and drop the bulky raw text
            if extracted_data:
                extracted_dict = json.loads(extracted_data)
                if "raw-text" in extracted_dict:
                    del extracted_dict["raw-text"]

        return extracted_dict

    def find_urls(self, url: str) -> List[str]:
        try:
            from trafilatura import feeds, sitemaps
        except ImportError:
            logger.error("Trafilatura is not installed, install as follows: pip install trafilatura")
            return []

        urls: List[str] = []
        if self.is_sitemap:
            urls = sitemaps.sitemap_search(url=url, target_lang=self.target_language)
        elif self.is_feed:
            urls = feeds.find_feed_urls(url=url, target_lang=self.target_language)

        return urls
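

# Illustrative sketch of using the config on its own: with is_sitemap=True,
# find_urls() expands a sitemap URL into individual page URLs, which can then be
# passed to extract_url(). The sitemap URL below is a placeholder, not part of
# this module.
#   config = TrafilaturaCrawlerConfig(urls=["https://example.com/sitemap.xml"], is_sitemap=True)
#   page_urls = config.find_urls(url=config.urls[0])
#   data = config.extract_url(url=page_urls[0]) if page_urls else {}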


class TrafilaturaCrawlerSource(BaseSource):
    NAME: Optional[str] = "Crawler"

    def lookup(  # type: ignore[override]
        self, config: TrafilaturaCrawlerConfig, **kwargs: Any
    ) -> List[TextPayload]:
        source_responses: List[TextPayload] = []

        final_urls = []
        if config.is_sitemap or config.is_feed:
            for url in config.urls:
                final_urls.extend(config.find_urls(url=url))
        else:
            final_urls = config.urls

        for url in final_urls:
            extracted_data = config.extract_url(url=url)
            # extract_url() returns an empty dict when fetching or extraction fails
            if not extracted_data:
                logger.warning(f"Unable to crawl {url}, hence skipping it")
                continue

            comments = (
                "" if "comments" not in extracted_data else extracted_data["comments"]
            )

            source_responses.append(
                TextPayload(
                    processed_text=f"{extracted_data['text']}. {comments}",
                    meta=extracted_data,
                    source_name=self.NAME,
                )
            )

        return source_responses
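

if __name__ == "__main__":
    # Minimal usage sketch. Assumes trafilatura is installed; the URL below is a
    # placeholder and not part of the library.
    example_config = TrafilaturaCrawlerConfig(urls=["https://example.com"])
    source = TrafilaturaCrawlerSource()
    for payload in source.lookup(config=example_config):
        # Each payload carries the extracted text plus trafilatura's metadata dict.
        print(payload.source_name, payload.processed_text[:200])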