"""
crawl_news.py – Crawls a list of RSS feeds, fetches full text when needed,
merges the results with any existing Parquet in Filebase S3, and uploads the refreshed file.

✱ 2025-07-11 – switched backend to Filebase S3
  • Uses boto3 pointed at Filebase's S3-compatible endpoint
  • No local caching of seen URLs: state lives in S3 under seen_urls.txt

Requirements:
  • FILEBASE_ENDPOINT env var, e.g. https://s3.filebase.com
  • FILEBASE_ACCESS_KEY and FILEBASE_SECRET_KEY env vars
  • FILEBASE_BUCKET env var with your bucket name
"""

import os
import sys
import asyncio
import tempfile
from datetime import datetime
from io import BytesIO

from dotenv import load_dotenv
import feedparser
import trafilatura
import pandas as pd
import rich.console
from crawl4ai import AsyncWebCrawler

# Make the repository root (two levels up) importable so data_cloud.cloud_utils resolves
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
from data_cloud.cloud_utils import StorageHandler

# ─── Configuration ────────────────────────────────────────────────────────────
load_dotenv()

FEED_URLS = [
    "https://www.marketwatch.com/rss/topstories",
    "https://thedefiant.io/feed/",
    "https://www.coindesk.com/arc/outboundfeeds/rss/?outputType=xml",
    "https://cointelegraph.com/rss",
    "https://cryptopotato.com/feed/",
    "https://cryptoslate.com/feed/",
    "https://cryptonews.com/news/feed/",
    "https://smartliquidity.info/feed/",
    "https://www.cnbc.com/id/10000664/device/rss/rss.html",
    "https://time.com/nextadvisor/feed/",
]
MAX_AGE_DAYS      = 1    # keep only articles newer than this many days
MIN_SUMMARY_LEN   = 200  # minimum characters for an RSS summary to be used as-is
MIN_CRAWL_LEN     = 100  # minimum characters of article text worth keeping
CRAWL_CONCURRENCY = 4    # concurrent pages for the selective crawl

S3_NEWS_PATH      = "news/crawled_news/news-latest.parquet"
S3_SEEN_PATH      = "news/crawled_news/seen_urls.txt"

# Filebase S3 creds & endpoint ------------------------------------------------
FILEBASE_ENDPOINT    = os.getenv("FILEBASE_ENDPOINT")
FILEBASE_ACCESS_KEY  = os.getenv("FILEBASE_ACCESS_KEY")
FILEBASE_SECRET_KEY  = os.getenv("FILEBASE_SECRET_KEY")
FILEBASE_BUCKET      = os.getenv("FILEBASE_BUCKET")

if not (FILEBASE_ENDPOINT and FILEBASE_ACCESS_KEY and FILEBASE_SECRET_KEY and FILEBASE_BUCKET):
    print("[ERROR] FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, and FILEBASE_BUCKET must be set")
    sys.exit(1)

# Silence logs ----------------------------------------------------------------
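# Monkey-patch rich's Console.print to a no-op so noisy console output from
# dependencies (e.g. crawl4ai) is suppressed.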
rich.console.Console.print = lambda *a, **k: None
os.environ.update({
    "RICH_NO_COLOR": "1",
    "RICH_DISABLE": "1",
    "CRAWL4AI_LOG_LEVEL": "CRITICAL",
})

# ─── Main routine ─────────────────────────────────────────────────────────────
async def main() -> None:
    # Set up the storage handler
    storage = StorageHandler(
        endpoint_url=FILEBASE_ENDPOINT,
        access_key=FILEBASE_ACCESS_KEY,
        secret_key=FILEBASE_SECRET_KEY,
        bucket_name=FILEBASE_BUCKET,
        local_base="data"
    )

    # Load the seen-URL cache from S3 only; do not fall back to local storage or create it locally
    seen_urls: set[str] = set()
    try:
        seen_data = storage.s3.get_object(Bucket=storage.bucket, Key=S3_SEEN_PATH)['Body'].read()
        text = seen_data.decode()
        seen_urls = {line.strip() for line in text.splitlines() if line.strip()}
        print(f"[INFO] Loaded {len(seen_urls)} seen URLs from S3")
    except Exception:
        print(f"[INFO] No seen URLs found in S3. Treating as empty.")
        seen_urls = set()

    # Fetch & parse RSS feeds -------------------------------------------------
    to_crawl, immediate = [], []
    now_utc = datetime.utcnow()
    for url in FEED_URLS:
        feed = feedparser.parse(url)
        new_count = 0
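        # Triage each entry: use full feed content when it is long enough, fall back to a
        # sufficiently long summary, and otherwise queue the URL for a full-page crawl below.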
        for e in feed.entries:
            ts = e.get("published_parsed") or e.get("updated_parsed")
            if not ts:
                continue
            link = e.link
            if link in seen_urls:
                continue
            new_count += 1

            content = e.get("content")
            if content:
                txt = "".join(p.value for p in content).strip()
                if len(txt) >= MIN_CRAWL_LEN:
                    immediate.append({"url": link, "text": txt, "timestamp": now_utc.isoformat()})
                    seen_urls.add(link)
                    continue

            summ = e.get("summary", "").strip()
            if len(summ) >= MIN_SUMMARY_LEN:
                immediate.append({"url": link, "text": summ, "timestamp": now_utc.isoformat()})
                seen_urls.add(link)
            else:
                to_crawl.append(link)

        print(f"β€’ Feed {url} -> {new_count} new items")

    # Selective crawl for short summaries ------------------------------------
    crawled = []
    if to_crawl:
        print(f"[INFO] Crawling {len(to_crawl)} pages…")
        async with AsyncWebCrawler(
            seeds=to_crawl,
            max_pages=len(to_crawl),
            concurrency=CRAWL_CONCURRENCY,
            obey_robots_txt=True,
        ) as crawler:
            pages = await asyncio.gather(*(crawler.arun(u) for u in to_crawl))
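            # Each arun() call yields the crawl result(s) for one seed URL; extract readable
            # text with trafilatura and keep pages whose text clears MIN_CRAWL_LEN.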
            for sub in pages:
                for page in sub:
                    if page.url not in seen_urls:
                        txt = trafilatura.extract(page.html, favor_recall=True)
                        if txt and len(txt.strip()) >= MIN_CRAWL_LEN:
                            crawled.append({"url": page.url, "text": txt.strip(), "timestamp": now_utc.isoformat()})
                            seen_urls.add(page.url)

    # Merge, filter & dedupe --------------------------------------------------
    new_results = immediate + crawled
    if not new_results:
        print("[WARNING] No new articles to process")
        return

    df_new = pd.DataFrame(new_results)
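    # Normalize timestamps to tz-aware UTC so the age-cutoff comparison below is well-defined.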
    df_new["timestamp"] = pd.to_datetime(df_new["timestamp"], utc=True)

    # Load existing Parquet (cloud or local)
    df_old = pd.DataFrame()
    try:
        parquet_bytes = storage.download(S3_NEWS_PATH)
        with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
            tmp.write(parquet_bytes)
            tmp_path = tmp.name
        df_old = pd.read_parquet(tmp_path)
        os.remove(tmp_path)
        print(f"[INFO] Loaded {len(df_old)} existing articles from {storage.get_last_mode()}")
    except Exception:
        print(f"[INFO] No existing Parquet found in cloud or local storage.")

    df = pd.concat([df_old, df_new], ignore_index=True)
    cutoff = pd.Timestamp.utcnow() - pd.Timedelta(days=MAX_AGE_DAYS)
    df = df[df.timestamp >= cutoff]
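    # Sorting by timestamp before drop_duplicates(keep="last") keeps the newest row per URL.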
    df = df.sort_values("timestamp").drop_duplicates("url", keep="last")
    print(f"[DEBUG] old rows: {len(df_old)}, new rows: {len(df_new)}, merged: {len(df)}")

    # Upload updated Parquet to S3 only
    parquet_buf = BytesIO()
    df.to_parquet(parquet_buf, index=False)
    data = parquet_buf.getvalue()
    if not data:
        raise RuntimeError("Refusing to upload empty Parquet")
    storage.s3.put_object(Bucket=storage.bucket, Key=S3_NEWS_PATH, Body=data, ContentType="application/octet-stream")
    print(f"[OK] Parquet updated: S3:{S3_NEWS_PATH}")

    # Persist seen URLs to S3 only
    seen_body = "\n".join(sorted(seen_urls)) + "\n"
    storage.s3.put_object(Bucket=storage.bucket, Key=S3_SEEN_PATH, Body=seen_body.encode(), ContentType="text/plain")
    print(f"[OK] Seen URLs updated: S3:{S3_SEEN_PATH}")

    # Upload all files in data/crawled-news to S3 under news/crawled_news/ (no local fallback)
    local_news_dir = os.path.join("data", "crawled-news")
    s3_news_prefix = "news/crawled_news/"
    for root, _, files in os.walk(local_news_dir):
        for fname in files:
            local_path = os.path.join(root, fname)
            rel_path = os.path.relpath(local_path, local_news_dir)
            s3_key = s3_news_prefix + rel_path.replace("\\", "/")
            with open(local_path, "rb") as f:
                file_bytes = f.read()
            storage.s3.put_object(Bucket=storage.bucket, Key=s3_key, Body=file_bytes, ContentType="application/octet-stream")
            print(f"[OK] Uploaded {local_path} -> S3:{s3_key}")

if __name__ == "__main__":
    asyncio.run(main())