from pydantic import BaseModel
from typing import List, Dict, Self
from bs4 import BeautifulSoup
import re
import feedparser
from tqdm import tqdm
import requests
import time

# RSS feed URLs that ScrapedDeal.fetch() iterates over, in this order.
feeds = [
    "https://www.dealnews.com/c142/Electronics/?rss=1",
    "https://www.dealnews.com/c39/Computers/?rss=1",
    "https://www.dealnews.com/c238/Automotive/?rss=1",
    "https://www.dealnews.com/f1912/Smart-Home/?rss=1",
    "https://www.dealnews.com/c196/Home-Garden/?rss=1",
]


def extract(html_snippet: str) -> str:
    """
    Use Beautiful Soup to clean up this HTML snippet and extract useful text.

    When the snippet contains a div whose class is 'snippet summary', the
    text of that div is returned with any remaining tags removed. Otherwise
    the snippet itself is returned. In both cases newlines are replaced by
    single spaces.
    """
    soup = BeautifulSoup(html_snippet, 'html.parser')
    snippet_div = soup.find('div', class_='snippet summary')

    if snippet_div:
        description = snippet_div.get_text(strip=True)
        # The extracted text is parsed a second time and then regex-stripped,
        # so anything that still looks like a tag is removed as well.
        description = BeautifulSoup(description, 'html.parser').get_text()
        description = re.sub(r'<[^<]+?>', '', description)
        result = description.strip()
    else:
        # No summary div found: fall back to the raw snippet.
        result = html_snippet
    return result.replace('\n', ' ')


class ScrapedDeal:
    """
    A class to represent a Deal retrieved from an RSS feed
    """
    # NOTE(review): `category` is declared here but never assigned in
    # __init__, so reading it on an instance raises AttributeError unless a
    # caller sets it first.
    category: str
    title: str
    summary: str
    url: str
    details: str
    features: str

    def __init__(self, entry: Dict[str, str]):
        """
        Populate this instance based on the provided dict.

        Reads entry['title'], entry['summary'] and entry['links'][0]['href'],
        then downloads the linked page and takes the text of its
        'content-section' div. Text before the first occurrence of
        "Features" becomes `details`; text after it becomes `features`
        (empty when the marker is absent).

        Raises AttributeError if the downloaded page has no
        <div class="content-section">, and propagates whatever
        requests.get raises on network failure.
        """
        self.title = entry['title']
        self.summary = extract(entry['summary'])
        self.url = entry['links'][0]['href']
        # NOTE(review): no timeout is passed, so a stalled server blocks this
        # constructor (and therefore fetch()) indefinitely.
        stuff = requests.get(self.url).content
        soup = BeautifulSoup(stuff, 'html.parser')
        content = soup.find('div', class_='content-section').get_text()
        content = content.replace('\nmore', '').replace('\n', ' ')
        # partition() cuts at the FIRST "Features" only, so a page that
        # mentions the word more than once no longer breaks the two-way
        # unpacking; when the marker is missing it yields (content, '', ''),
        # i.e. details=content and features="".
        self.details, _, self.features = content.partition("Features")

    def __repr__(self):
        """
        Return a string to describe this deal
        """
        return f"<{self.title}>"

    def describe(self):
        """
        Return a longer string to describe this deal for use in calling a model
        """
        return f"Title: {self.title}\nDetails: {self.details.strip()}\nFeatures: {self.features.strip()}\nURL: {self.url}"

    @classmethod
    def fetch(cls, show_progress: bool = False) -> List[Self]:
        """
        Retrieve all deals from the selected RSS feeds.

        Parses every URL in the module-level `feeds` list and builds one
        instance per entry, using at most the first 10 entries of each feed.
        When `show_progress` is true the feed loop is wrapped in a tqdm
        progress bar.
        """
        deals = []
        feed_iter = tqdm(feeds) if show_progress else feeds
        for feed_url in feed_iter:
            feed = feedparser.parse(feed_url)
            for entry in feed.entries[:10]:
                deals.append(cls(entry))
                # Pause after each page download to space out the requests.
                time.sleep(0.5)
        return deals


# NOTE(review): the docstrings of the pydantic models below are kept verbatim
# on purpose; pydantic may expose a model's docstring as its JSON-schema
# description, which would make the text runtime-visible. Confirm before
# rewording them.
class Deal(BaseModel):
    """
    A class to Represent a Deal with a summary description
    """
    product_description: str
    price: float
    url: str


class DealSelection(BaseModel):
    """
    A class to Represent a list of Deals
    """
    deals: List[Deal]


class Opportunity(BaseModel):
    """
    A class to represent a possible opportunity: a Deal where we estimate it should cost more than it's being offered
    """
    deal: Deal
    estimate: float
    discount: float