MANOJSEQ committed
Commit 93f1fb4 · verified · 1 Parent(s): f75d93a

Upload main.py

Files changed (1): main.py (+216 -60)
main.py CHANGED
@@ -30,6 +30,7 @@ from starlette.middleware.gzip import GZipMiddleware
 from transformers import pipeline as hf_pipeline
 os.environ.setdefault("OMP_NUM_THREADS", "1")
 from fastapi.responses import PlainTextResponse, JSONResponse
+from datetime import datetime, timezone

 # ----------------- Torch Runtime Settings -----------------
 import torch
@@ -309,13 +310,14 @@ def _extract_desc_from_html(html: str) -> Optional[str]:
 def _desc_cache_get(url: str) -> Optional[str]:
     if not url:
         return None
-    entry = DESC_CACHE.get(url)
-    if not entry:
-        return None
-    if _now_mono() - entry["t"] > DESC_CACHE_TTL:
-        DESC_CACHE.pop(url, None)
-        return None
-    return entry["text"]
+    with DESC_CACHE_LOCK:
+        entry = DESC_CACHE.get(url)
+        if not entry:
+            return None
+        if _now_mono() - entry["t"] > DESC_CACHE_TTL:
+            DESC_CACHE.pop(url, None)
+            return None
+        return entry["text"]

 def _desc_cache_put(url: str, text: str):
     if url and text:
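Note on the hunk above: taking DESC_CACHE_LOCK makes the get / TTL-check / pop sequence atomic across threads. The lock and cache objects themselves are not part of this diff; a minimal sketch of what the read path assumes they look like (names follow the code above, the TTL value and the time source are illustrative):

import threading
from time import monotonic as _now_mono

DESC_CACHE: dict = {}               # url -> {"t": <monotonic seconds>, "text": <description>}
DESC_CACHE_TTL = 6 * 3600           # illustrative TTL in seconds
DESC_CACHE_LOCK = threading.Lock()  # shared by every reader and writer of DESC_CACHE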
@@ -991,7 +993,9 @@ def fetch_gdelt_articles(
     language=None,
     timespan="3d",
     category=None,
-    extra_tokens: Optional[List[str]] = None
+    extra_tokens: Optional[List[str]] = None,
+    start_utc: Optional[datetime] = None,
+    end_utc: Optional[datetime] = None,
 ):
     q = _gdelt_safe_query(query, language)
     if extra_tokens:
@@ -1003,8 +1007,12 @@ def fetch_gdelt_articles(
         "format": "json",
         "sort": "DateDesc",
         "maxrecords": int(min(250, max(1, limit))),
-        "timespan": timespan,
     }
+    if start_utc and end_utc:
+        params["startdatetime"] = _gdelt_fmt(start_utc)
+        params["enddatetime"] = _gdelt_fmt(end_utc)
+    else:
+        params["timespan"] = timespan
     headers = {
         "User-Agent": "Mozilla/5.0 (compatible; NewsGlobe/1.0; +mailto:[email protected])",
         "Accept": "application/json",
@@ -1056,10 +1064,18 @@ def fetch_gdelt_articles(
     log.info(f"GDELT returned {len(results)}")
     return results

-def fetch_gdelt_multi(limit=120, query=None, language=None, timespan="48h", category=None, speed: Speed = Speed.balanced):
+def fetch_gdelt_multi(
+    limit=120, query=None, language=None, timespan="48h",
+    category=None, speed: Speed = Speed.balanced,
+    start_utc: Optional[datetime] = None, end_utc: Optional[datetime] = None
+):
     if language:
-        primary = fetch_gdelt_articles(limit=limit, query=query, language=language, timespan=timespan, category=category)
-        booster = fetch_gdelt_articles(limit=max(10, limit // 6), query=query, language="en", timespan=timespan, category=category)
+        primary = fetch_gdelt_articles(limit=limit, query=query, language=language,
+                                       timespan=timespan, category=category,
+                                       start_utc=start_utc, end_utc=end_utc)
+        booster = fetch_gdelt_articles(limit=max(10, limit // 6), query=query, language="en",
+                                       timespan=timespan, category=category,
+                                       start_utc=start_utc, end_utc=end_utc)
         return primary + booster
     if speed == Speed.fast:
         langs = LANG_ROTATION[:3]
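The startdatetime/enddatetime parameters wired through above follow the GDELT DOC 2.0 API's 14-digit YYYYMMDDHHMMSS timestamp format, which is exactly what the _gdelt_fmt helper added later in this commit emits. A quick standalone illustration, using the helper as it appears in the diff:

from datetime import datetime, timezone

def _gdelt_fmt(dt: datetime) -> str:
    return dt.astimezone(timezone.utc).strftime("%Y%m%d%H%M%S")

# 2024-05-01T12:30:00Z -> "20240501123000"
print(_gdelt_fmt(datetime(2024, 5, 1, 12, 30, tzinfo=timezone.utc)))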
@@ -1073,20 +1089,18 @@ def fetch_gdelt_multi(limit=120, query=None, language=None, timespan="48h", cate
         per_lang = max(8, math.ceil(limit / len(langs)))
     out = []
     for lg in langs:
-        out.extend(fetch_gdelt_articles(limit=per_lang, query=query, language=lg, timespan=timespan, category=category))
+        out.extend(fetch_gdelt_articles(limit=per_lang, query=query, language=lg,
+                                        timespan=timespan, category=category,
+                                        start_utc=start_utc, end_utc=end_utc))
     if speed != Speed.fast:
         per_cc = max(4, limit // 30) if speed == Speed.max else max(2, limit // 40)
         for cc in COUNTRY_SEEDS[: (8 if speed == Speed.balanced else 16)]:
-            out.extend(
-                fetch_gdelt_articles(
-                    limit=per_cc,
-                    query=query,
-                    language="en",
-                    timespan=timespan,
-                    category=category,
-                    extra_tokens=[f"sourcecountry:{cc}"]
-                )
-            )
+            out.extend(fetch_gdelt_articles(
+                limit=per_cc, query=query, language="en",
+                timespan=timespan, category=category,
+                extra_tokens=[f"sourcecountry:{cc}"],
+                start_utc=start_utc, end_utc=end_utc
+            ))
     return out


@@ -1377,36 +1391,109 @@ CACHE_TTL_SECS = 900
 SIM_THRESHOLD = 0.6
 _events_cache: Dict[Tuple, Dict[str, Any]] = {}

-def cache_key_for(q, category, language, limit_each, translate=False, target_lang=None, speed=Speed.balanced):
-    return (q or "", category or "", language or "", int(limit_each or 50),
-            bool(translate), (target_lang or "").lower(), speed.value)
+# -------- Date parsing helpers (Option B) --------
+ISO_BASIC_RE = re.compile(r'^(\d{4})(\d{2})(\d{2})(?:[T ]?(\d{2})(\d{2})(\d{2}))?(Z|[+-]\d{2}:?\d{2})?$')
+
+def _parse_user_dt(s: Optional[str], which: str) -> Optional[datetime]:
+    """Parse query 'start'/'end' into UTC-aware datetimes."""
+    if not s:
+        return None
+    s = s.strip()
+    try:
+        # Normalize Z
+        if s.endswith("Z"):
+            s = s[:-1] + "+00:00"
+        # Date-only
+        if re.match(r'^\d{4}-\d{2}-\d{2}$', s):
+            s = s + ("T00:00:00+00:00" if which == "start" else "T23:59:59+00:00")
+        dt = datetime.fromisoformat(s)
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc)
+    except Exception:
+        m = ISO_BASIC_RE.match(s)
+        if m:
+            yyyy, MM, dd, hh, mm, ss, tz = m.groups()
+            hh = hh or ("00" if which == "start" else "23")
+            mm = mm or ("00" if which == "start" else "59")
+            ss = ss or ("00" if which == "start" else "59")
+            return datetime(int(yyyy), int(MM), int(dd), int(hh), int(mm), int(ss), tzinfo=timezone.utc)
+        return None
+
+def _gdelt_fmt(dt: datetime) -> str:
+    return dt.astimezone(timezone.utc).strftime("%Y%m%d%H%M%S")
+
+def _parse_any_pubdate(s: Optional[str]) -> Optional[datetime]:
+    """Best-effort parse of provider publishedAt strings to UTC."""
+    if not s:
+        return None
+    try:
+        t = s.strip()
+        if t.endswith("Z"):
+            t = t[:-1] + "+00:00"
+        return datetime.fromisoformat(t).astimezone(timezone.utc)
+    except Exception:
+        m = ISO_BASIC_RE.match(s)
+        if m:
+            yyyy, MM, dd, hh, mm, ss, tz = m.groups()
+            hh = hh or "00"; mm = mm or "00"; ss = ss or "00"
+            return datetime(int(yyyy), int(MM), int(dd), int(hh), int(mm), int(ss), tzinfo=timezone.utc)
+        return None
+
+
+def cache_key_for(
+    q, category, language, limit_each,
+    translate=False, target_lang=None,
+    start_utc: Optional[datetime] = None,
+    end_utc: Optional[datetime] = None,
+    speed: Speed = Speed.balanced
+):
+    return (
+        q or "", category or "", language or "", int(limit_each or 50),
+        bool(translate), (target_lang or "").lower(),
+        (start_utc and _gdelt_fmt(start_utc)) or "",
+        (end_utc and _gdelt_fmt(end_utc)) or "",
+        speed.value,
+    )
+

 _first_real_build = True

-def get_or_build_events_cache(q, category, language, translate, target_lang, limit_each, speed=Speed.balanced):
+def get_or_build_events_cache(
+    q, category, language, translate, target_lang, limit_each,
+    start_utc: Optional[datetime] = None,
+    end_utc: Optional[datetime] = None,
+    speed: Speed = Speed.balanced
+):
     global _first_real_build
-    key = cache_key_for(q, category, language, limit_each, translate, target_lang, speed)
+    key = cache_key_for(q, category, language, limit_each, translate, target_lang, start_utc, end_utc, speed)
     now = monotonic()
+
     if speed == Speed.fast:
         use_timespan, use_limit = "24h", min(limit_each, 20)
     elif speed == Speed.balanced:
         use_timespan, use_limit = "48h", min(limit_each, 100)
     else:
         use_timespan, use_limit = "3d", limit_each
+
     entry = _events_cache.get(key)
     if entry and now - entry["t"] < CACHE_TTL_SECS:
         log.info(f"CACHE HIT for {key}")
         return key, entry["enriched"], entry["clusters"]
+
     lock = _get_inflight_lock(key)
     with lock:
         entry = _events_cache.get(key)
         if entry and now - entry["t"] < CACHE_TTL_SECS:
             log.info(f"CACHE HIT (post-lock) for {key}")
             return key, entry["enriched"], entry["clusters"]
-        if _first_real_build:
+
+        if _first_real_build and not (start_utc and end_utc):
             use_timespan = "24h" if use_timespan != "24h" else use_timespan
             use_limit = min(use_limit, 100)
-        log.info(f"CACHE MISS for {key} — fetching (timespan={use_timespan}, limit_each={use_limit})")
+
+        log.info(f"CACHE MISS for {key} — fetching (timespan={use_timespan}, limit_each={use_limit}, start={start_utc}, end={end_utc})")
+
         raw = combine_raw_articles(
             category=category,
             query=q,
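To make the parsing rules above concrete, these are the accepted input shapes and their results, worked out from the code and shown as comments rather than new behavior:

# Date-only input is widened to the whole day in UTC:
#   _parse_user_dt("2024-05-01", "start") -> 2024-05-01 00:00:00+00:00
#   _parse_user_dt("2024-05-01", "end")   -> 2024-05-01 23:59:59+00:00
# ISO 8601 with a trailing Z or an offset is normalized to UTC:
#   _parse_user_dt("2024-05-01T08:15:00Z", "start") -> 2024-05-01 08:15:00+00:00
# Compact stamps (e.g. "20240501083000") that fromisoformat rejects are retried
# against ISO_BASIC_RE, with missing time fields defaulted to start or end of day.
# Anything else returns None, and the endpoints then behave as if that bound was omitted.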
@@ -1414,6 +1501,8 @@ def get_or_build_events_cache(q, category, language, translate, target_lang, lim
             limit_each=use_limit,
             timespan=use_timespan,
             speed=speed,
+            start_utc=start_utc,
+            end_utc=end_utc,
         )
         prefetch_descriptions_async(raw, speed)
         enriched_all = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
@@ -1507,13 +1596,25 @@ def fetch_newsapi_headlines_multi(limit=50, language=None):
         time.sleep(0.2)
     return all_[:limit]

-def fetch_newsapi_articles(category=None, limit=20, query=None, language=None):
+def fetch_newsapi_articles(
+    category=None,
+    limit=20,
+    query=None,
+    language=None,
+    start_utc: Optional[datetime] = None,
+    end_utc: Optional[datetime] = None,
+):
     if not _newsapi_enabled():
         return []
     if query:
         url = f"https://newsapi.org/v2/everything?pageSize={limit}&apiKey={NEWSAPI_KEY}&q={requests.utils.quote(query)}"
         if language:
             url += f"&language={language}"
+        # NEW: date range for /everything
+        if start_utc:
+            url += f"&from={start_utc.date().isoformat()}"
+        if end_utc:
+            url += f"&to={end_utc.date().isoformat()}"
     try:
         r = _session_get(url, timeout=12)
         if r.status_code != 200:
@@ -1548,19 +1649,22 @@ def fetch_newsapi_articles(category=None, limit=20, query=None, language=None):

 # ----------------- Provider Combiner / Dedup -----------------
 def combine_raw_articles(category=None, query=None, language=None, limit_each=30,
-                         timespan="3d", speed=Speed.balanced, log_summary: bool = True):
+                         timespan="3d", speed=Speed.balanced, log_summary: bool = True,
+                         start_utc: Optional[datetime] = None, end_utc: Optional[datetime] = None):
     if speed == Speed.fast:
         timespan = "24h"
         limit_each = min(limit_each, 20)
     elif speed == Speed.balanced:
         timespan = "48h"
         limit_each = min(limit_each, 100)
+
     a1 = []
     if USE_NEWSAPI:
         if not query:
             a1 = fetch_newsapi_headlines_multi(limit=limit_each, language=language)
         else:
-            a1 = fetch_newsapi_articles(category=category, limit=limit_each, query=query, language=language)
+            a1 = fetch_newsapi_articles(category=category, limit=limit_each, query=query,
+                                        language=language, start_utc=start_utc, end_utc=end_utc)
     a2 = []
     if USE_NEWSDATA_API:
         a2 = [
@@ -1569,10 +1673,10 @@ def combine_raw_articles(category=None, query=None, language=None, limit_each=30
             if a.get("link")
         ]
     a3 = fetch_gnews_articles(limit=limit_each, query=query, language=language) if USE_GNEWS_API else []
-    gdelt_limit = limit_each
     a4 = fetch_gdelt_multi(
         limit=limit_each, query=query, language=language,
-        timespan=timespan, category=category, speed=speed
+        timespan=timespan, category=category, speed=speed,
+        start_utc=start_utc, end_utc=end_utc
     ) if USE_GDELT_API else []

     seen, merged = set(), []
@@ -1583,6 +1687,23 @@ def combine_raw_articles(category=None, query=None, language=None, limit_each=30
         if url not in seen:
             seen.add(url)
             merged.append(a)
+
+    #Apply date filter locally (for providers that can’t filter server-side)
+    if start_utc or end_utc:
+        s_ts = start_utc.timestamp() if start_utc else None
+        e_ts = end_utc.timestamp() if end_utc else None
+
+        def _in_range(row):
+            dt = _parse_any_pubdate(row.get("publishedAt") or "")
+            if not dt:
+                return False
+            t = dt.timestamp()
+            if s_ts and t < s_ts: return False
+            if e_ts and t > e_ts: return False
+            return True
+
+        merged = [a for a in merged if _in_range(a)]
+
     if log_summary:
         fetch_log.info("----- Article Fetch Summary -----")
         fetch_log.info(f"📊 NewsAPI returned: {len(a1)} articles")
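One consequence of the in-memory filter above: once either bound is set, articles whose publishedAt cannot be parsed are dropped, because _in_range returns False for them. A standalone sketch of the same idea (not code from main.py; the field name mirrors the provider rows used above):

from datetime import datetime, timezone

start_utc = datetime(2024, 5, 1, tzinfo=timezone.utc)
end_utc = datetime(2024, 5, 2, 23, 59, 59, tzinfo=timezone.utc)
s_ts, e_ts = start_utc.timestamp(), end_utc.timestamp()

rows = [
    {"title": "in range", "publishedAt": "2024-05-01T10:00:00Z"},
    {"title": "too old", "publishedAt": "2024-04-20T10:00:00Z"},
    {"title": "no date", "publishedAt": ""},
]

def in_range(row):
    try:
        t = datetime.fromisoformat(row["publishedAt"].replace("Z", "+00:00")).timestamp()
    except ValueError:
        return False  # unparseable dates are excluded, as in _in_range above
    return (s_ts is None or t >= s_ts) and (e_ts is None or t <= e_ts)

print([r["title"] for r in rows if in_range(r)])  # ['in range']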
@@ -1593,6 +1714,7 @@ def combine_raw_articles(category=None, query=None, language=None, limit_each=30
     fetch_log.info("---------------------------------")
     return merged

+
 # ----------------- API: /events -----------------
 @app.get("/events")
 def get_events(
@@ -1606,9 +1728,17 @@ def get_events(
     min_countries: int = Query(2, ge=1, le=50),
     min_articles: int = Query(2, ge=1, le=200),
     speed: Speed = Query(Speed.balanced),
+    start: Optional[str] = Query(None),
+    end: Optional[str] = Query(None),
 ):
+    start_dt = _parse_user_dt(start, "start")
+    end_dt = _parse_user_dt(end, "end")
+    if start_dt and end_dt and start_dt > end_dt:
+        start_dt, end_dt = end_dt, start_dt  # swap
+
     cache_key, enriched, clusters = get_or_build_events_cache(
-        q, category, language, False, None, limit_each, speed=speed
+        q, category, language, False, None, limit_each,
+        start_utc=start_dt, end_utc=end_dt, speed=speed
     )
     view = enriched
     if translate and target_lang:
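On the client side the new bounds are ordinary query parameters. A hedged usage sketch (the base URL is a placeholder for wherever this Space is served; the parameter names map directly onto the signature above):

import requests

resp = requests.get(
    "http://localhost:8000/events",
    params={
        "q": "elections",
        "start": "2024-05-01",          # widened to 00:00:00 UTC by _parse_user_dt
        "end": "2024-05-03T18:00:00Z",  # normalized to UTC
    },
    timeout=30,
)
resp.raise_for_status()
payload = resp.json()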
@@ -1635,28 +1765,47 @@ def get_event_details(
     target_lang: Optional[str] = Query(None),
     limit_each: int = Query(150, ge=5, le=250),
     max_samples: int = Query(5, ge=0, le=1000),
+    start: Optional[str] = Query(None),
+    end: Optional[str] = Query(None),
 ):
+    start_dt = _parse_user_dt(start, "start")
+    end_dt = _parse_user_dt(end, "end")
+
     if cache_key:
         parts = cache_key.split("|")
-        if len(parts) != 7:
+        if len(parts) == 9:
+            speed_str = parts[8]
+            try:
+                speed_obj = Speed(speed_str)
+            except ValueError:
+                speed_obj = Speed.balanced
+            key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
+                         parts[4] == "True", parts[5].lower(),
+                         parts[6], parts[7], speed_str)
+        elif len(parts) == 7:  # backwards compat
+            speed_str = parts[6]
+            try:
+                speed_obj = Speed(speed_str)
+            except ValueError:
+                speed_obj = Speed.balanced
+            key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
+                         parts[4] == "True", parts[5].lower(), "", "", speed_str)
+        else:
             raise HTTPException(status_code=400, detail="Bad cache_key")
-        speed_str = parts[6]
-        try:
-            speed_obj = Speed(speed_str)
-        except ValueError:
-            speed_obj = Speed.balanced
-        key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
-                     parts[4] == "True", parts[5].lower(), speed_str)
     else:
         speed_obj = Speed.balanced
-        key_tuple = cache_key_for(q, category, language, limit_each, translate, target_lang, speed=speed_obj)
+        key_tuple = cache_key_for(q, category, language, limit_each, translate, target_lang,
+                                  start_utc=start_dt, end_utc=end_dt, speed=speed_obj)
+
     entry = _events_cache.get(key_tuple)
     if not entry:
         _, enriched, clusters = get_or_build_events_cache(
-            q, category, language, False, None, limit_each, speed=speed_obj
+            q, category, language, False, None, limit_each,
+            start_utc=start_dt, end_utc=end_dt, speed=speed_obj
         )
     else:
         enriched, clusters = entry["enriched"], entry["clusters"]
+
     eview = enriched
     if translate and target_lang:
         eview = [dict(i) for i in enriched]
@@ -1688,25 +1837,32 @@ def get_news(
     speed: Speed = Query(Speed.balanced),
     page: int = Query(1, ge=1),
     page_size: int = Query(120, ge=5, le=300),
+    start: Optional[str] = Query(None),
+    end: Optional[str] = Query(None),
 ):
+    start_dt = _parse_user_dt(start, "start")
+    end_dt = _parse_user_dt(end, "end")
+
     enriched: List[Dict[str, Any]] = []
     if cache_key:
         parts = cache_key.split("|")
-        if len(parts) == 7:
-            key_tuple = (
-                parts[0],
-                parts[1],
-                parts[2],
-                int(parts[3]),
-                parts[4] == "True",
-                parts[5].lower(),
-                parts[6],
-            )
+        if len(parts) == 9:
+            key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
+                         parts[4] == "True", parts[5].lower(), parts[6], parts[7], parts[8])
             entry = _events_cache.get(key_tuple)
             if entry:
                 enriched = entry["enriched"]
+        elif len(parts) == 7:  # backwards compat
+            key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
+                         parts[4] == "True", parts[5].lower(), "", "", parts[6])
+            entry = _events_cache.get(key_tuple)
+            if entry:
+                enriched = entry["enriched"]
+
     if not enriched:
-        raw = combine_raw_articles(category=category, query=q, language=language, limit_each=limit_each, speed=speed)
+        raw = combine_raw_articles(category=category, query=q, language=language,
+                                   limit_each=limit_each, speed=speed,
+                                   start_utc=start_dt, end_utc=end_dt)
         prefetch_descriptions_async(raw, speed)
         enriched_all = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
         if category:
@@ -1733,9 +1889,9 @@ def get_news(
         s = sentiment.strip().lower()
         enriched = [i for i in enriched if i.get("sentiment", "").lower() == s]
     total = len(enriched)
-    start = (page - 1) * page_size
-    end = start + page_size
-    items = [dict(i) for i in enriched[start:end]]
+    offset = (page - 1) * page_size
+    end_idx = offset + page_size
+    items = [dict(i) for i in enriched[offset:end_idx]]
     if lite:
         drop = {"_ml_text"}
         for i in items:
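The renamed pagination variables avoid clashing with the new start/end query parameters; the slice itself is unchanged. For example, with page=2 and page_size=120:

page, page_size = 2, 120
offset = (page - 1) * page_size   # 120
end_idx = offset + page_size      # 240
# items = enriched[120:240], i.e. the second page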
 