Upload main.py
main.py
CHANGED
@@ -30,6 +30,7 @@ from starlette.middleware.gzip import GZipMiddleware
 from transformers import pipeline as hf_pipeline
 os.environ.setdefault("OMP_NUM_THREADS", "1")
 from fastapi.responses import PlainTextResponse, JSONResponse
+from datetime import datetime, timezone
 
 # ----------------- Torch Runtime Settings -----------------
 import torch
@@ -309,13 +310,14 @@ def _extract_desc_from_html(html: str) -> Optional[str]:
 def _desc_cache_get(url: str) -> Optional[str]:
     if not url:
         return None
+    with DESC_CACHE_LOCK:
+        entry = DESC_CACHE.get(url)
+        if not entry:
+            return None
+        if _now_mono() - entry["t"] > DESC_CACHE_TTL:
+            DESC_CACHE.pop(url, None)
+            return None
+        return entry["text"]
 
 def _desc_cache_put(url: str, text: str):
     if url and text:
@@ -991,7 +993,9 @@ def fetch_gdelt_articles(
     language=None,
     timespan="3d",
     category=None,
-    extra_tokens: Optional[List[str]] = None
+    extra_tokens: Optional[List[str]] = None,
+    start_utc: Optional[datetime] = None,
+    end_utc: Optional[datetime] = None,
 ):
     q = _gdelt_safe_query(query, language)
     if extra_tokens:
@@ -1003,8 +1007,12 @@ def fetch_gdelt_articles(
         "format": "json",
         "sort": "DateDesc",
         "maxrecords": int(min(250, max(1, limit))),
-        "timespan": timespan,
     }
+    if start_utc and end_utc:
+        params["startdatetime"] = _gdelt_fmt(start_utc)
+        params["enddatetime"] = _gdelt_fmt(end_utc)
+    else:
+        params["timespan"] = timespan
     headers = {
         "User-Agent": "Mozilla/5.0 (compatible; NewsGlobe/1.0; +mailto:[email protected])",
         "Accept": "application/json",
@@ -1056,10 +1064,18 @@ def fetch_gdelt_articles(
     log.info(f"GDELT returned {len(results)}")
     return results
 
+def fetch_gdelt_multi(
+    limit=120, query=None, language=None, timespan="48h",
+    category=None, speed: Speed = Speed.balanced,
+    start_utc: Optional[datetime] = None, end_utc: Optional[datetime] = None
+):
     if language:
+        primary = fetch_gdelt_articles(limit=limit, query=query, language=language,
+                                       timespan=timespan, category=category,
+                                       start_utc=start_utc, end_utc=end_utc)
+        booster = fetch_gdelt_articles(limit=max(10, limit // 6), query=query, language="en",
+                                       timespan=timespan, category=category,
+                                       start_utc=start_utc, end_utc=end_utc)
         return primary + booster
     if speed == Speed.fast:
         langs = LANG_ROTATION[:3]
@@ -1073,20 +1089,18 @@ def fetch_gdelt_multi(limit=120, query=None, language=None, timespan="48h", cate
     per_lang = max(8, math.ceil(limit / len(langs)))
     out = []
     for lg in langs:
+        out.extend(fetch_gdelt_articles(limit=per_lang, query=query, language=lg,
+                                        timespan=timespan, category=category,
+                                        start_utc=start_utc, end_utc=end_utc))
     if speed != Speed.fast:
         per_cc = max(4, limit // 30) if speed == Speed.max else max(2, limit // 40)
         for cc in COUNTRY_SEEDS[: (8 if speed == Speed.balanced else 16)]:
+            out.extend(fetch_gdelt_articles(
+                limit=per_cc, query=query, language="en",
+                timespan=timespan, category=category,
+                extra_tokens=[f"sourcecountry:{cc}"],
+                start_utc=start_utc, end_utc=end_utc
+            ))
     return out
 
 
@@ -1377,36 +1391,109 @@ CACHE_TTL_SECS = 900
 SIM_THRESHOLD = 0.6
 _events_cache: Dict[Tuple, Dict[str, Any]] = {}
 
+# -------- Date parsing helpers (Option B) --------
+ISO_BASIC_RE = re.compile(r'^(\d{4})(\d{2})(\d{2})(?:[T ]?(\d{2})(\d{2})(\d{2}))?(Z|[+-]\d{2}:?\d{2})?$')
+
+def _parse_user_dt(s: Optional[str], which: str) -> Optional[datetime]:
+    """Parse query 'start'/'end' into UTC-aware datetimes."""
+    if not s:
+        return None
+    s = s.strip()
+    try:
+        # Normalize Z
+        if s.endswith("Z"):
+            s = s[:-1] + "+00:00"
+        # Date-only
+        if re.match(r'^\d{4}-\d{2}-\d{2}$', s):
+            s = s + ("T00:00:00+00:00" if which == "start" else "T23:59:59+00:00")
+        dt = datetime.fromisoformat(s)
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc)
+    except Exception:
+        m = ISO_BASIC_RE.match(s)
+        if m:
+            yyyy, MM, dd, hh, mm, ss, tz = m.groups()
+            hh = hh or ("00" if which == "start" else "23")
+            mm = mm or ("00" if which == "start" else "59")
+            ss = ss or ("00" if which == "start" else "59")
+            return datetime(int(yyyy), int(MM), int(dd), int(hh), int(mm), int(ss), tzinfo=timezone.utc)
+        return None
+
+def _gdelt_fmt(dt: datetime) -> str:
+    return dt.astimezone(timezone.utc).strftime("%Y%m%d%H%M%S")
+
+def _parse_any_pubdate(s: Optional[str]) -> Optional[datetime]:
+    """Best-effort parse of provider publishedAt strings to UTC."""
+    if not s:
+        return None
+    try:
+        t = s.strip()
+        if t.endswith("Z"):
+            t = t[:-1] + "+00:00"
+        return datetime.fromisoformat(t).astimezone(timezone.utc)
+    except Exception:
+        m = ISO_BASIC_RE.match(s)
+        if m:
+            yyyy, MM, dd, hh, mm, ss, tz = m.groups()
+            hh = hh or "00"; mm = mm or "00"; ss = ss or "00"
+            return datetime(int(yyyy), int(MM), int(dd), int(hh), int(mm), int(ss), tzinfo=timezone.utc)
+        return None
+
+
+def cache_key_for(
+    q, category, language, limit_each,
+    translate=False, target_lang=None,
+    start_utc: Optional[datetime] = None,
+    end_utc: Optional[datetime] = None,
+    speed: Speed = Speed.balanced
+):
+    return (
+        q or "", category or "", language or "", int(limit_each or 50),
+        bool(translate), (target_lang or "").lower(),
+        (start_utc and _gdelt_fmt(start_utc)) or "",
+        (end_utc and _gdelt_fmt(end_utc)) or "",
+        speed.value,
+    )
+
 
 _first_real_build = True
 
+def get_or_build_events_cache(
+    q, category, language, translate, target_lang, limit_each,
+    start_utc: Optional[datetime] = None,
+    end_utc: Optional[datetime] = None,
+    speed: Speed = Speed.balanced
+):
     global _first_real_build
-    key = cache_key_for(q, category, language, limit_each, translate, target_lang, speed)
+    key = cache_key_for(q, category, language, limit_each, translate, target_lang, start_utc, end_utc, speed)
     now = monotonic()
+
     if speed == Speed.fast:
         use_timespan, use_limit = "24h", min(limit_each, 20)
     elif speed == Speed.balanced:
         use_timespan, use_limit = "48h", min(limit_each, 100)
     else:
         use_timespan, use_limit = "3d", limit_each
+
     entry = _events_cache.get(key)
     if entry and now - entry["t"] < CACHE_TTL_SECS:
         log.info(f"CACHE HIT for {key}")
         return key, entry["enriched"], entry["clusters"]
+
     lock = _get_inflight_lock(key)
     with lock:
         entry = _events_cache.get(key)
         if entry and now - entry["t"] < CACHE_TTL_SECS:
             log.info(f"CACHE HIT (post-lock) for {key}")
             return key, entry["enriched"], entry["clusters"]
+
+        if _first_real_build and not (start_utc and end_utc):
             use_timespan = "24h" if use_timespan != "24h" else use_timespan
             use_limit = min(use_limit, 100)
+
+        log.info(f"CACHE MISS for {key} — fetching (timespan={use_timespan}, limit_each={use_limit}, start={start_utc}, end={end_utc})")
+
         raw = combine_raw_articles(
             category=category,
             query=q,
@@ -1414,6 +1501,8 @@ def get_or_build_events_cache(q, category, language, translate, target_lang, lim
             limit_each=use_limit,
             timespan=use_timespan,
             speed=speed,
+            start_utc=start_utc,
+            end_utc=end_utc,
         )
         prefetch_descriptions_async(raw, speed)
         enriched_all = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
@@ -1507,13 +1596,25 @@ def fetch_newsapi_headlines_multi(limit=50, language=None):
         time.sleep(0.2)
     return all_[:limit]
 
-def fetch_newsapi_articles(category=None, limit=20, query=None, language=None):
+def fetch_newsapi_articles(
+    category=None,
+    limit=20,
+    query=None,
+    language=None,
+    start_utc: Optional[datetime] = None,
+    end_utc: Optional[datetime] = None,
+):
     if not _newsapi_enabled():
         return []
     if query:
         url = f"https://newsapi.org/v2/everything?pageSize={limit}&apiKey={NEWSAPI_KEY}&q={requests.utils.quote(query)}"
         if language:
             url += f"&language={language}"
+        # NEW: date range for /everything
+        if start_utc:
+            url += f"&from={start_utc.date().isoformat()}"
+        if end_utc:
+            url += f"&to={end_utc.date().isoformat()}"
    try:
        r = _session_get(url, timeout=12)
        if r.status_code != 200:
@@ -1548,19 +1649,22 @@ def fetch_newsapi_articles(category=None, limit=20, query=None, language=None):
 
 # ----------------- Provider Combiner / Dedup -----------------
 def combine_raw_articles(category=None, query=None, language=None, limit_each=30,
+                         timespan="3d", speed=Speed.balanced, log_summary: bool = True,
+                         start_utc: Optional[datetime] = None, end_utc: Optional[datetime] = None):
     if speed == Speed.fast:
         timespan = "24h"
         limit_each = min(limit_each, 20)
     elif speed == Speed.balanced:
         timespan = "48h"
         limit_each = min(limit_each, 100)
+
     a1 = []
     if USE_NEWSAPI:
         if not query:
             a1 = fetch_newsapi_headlines_multi(limit=limit_each, language=language)
         else:
+            a1 = fetch_newsapi_articles(category=category, limit=limit_each, query=query,
+                                        language=language, start_utc=start_utc, end_utc=end_utc)
     a2 = []
     if USE_NEWSDATA_API:
         a2 = [
@@ -1569,10 +1673,10 @@ def combine_raw_articles(category=None, query=None, language=None, limit_each=30
             if a.get("link")
         ]
     a3 = fetch_gnews_articles(limit=limit_each, query=query, language=language) if USE_GNEWS_API else []
-    gdelt_limit = limit_each
     a4 = fetch_gdelt_multi(
         limit=limit_each, query=query, language=language,
-        timespan=timespan, category=category, speed=speed
+        timespan=timespan, category=category, speed=speed,
+        start_utc=start_utc, end_utc=end_utc
    ) if USE_GDELT_API else []
 
    seen, merged = set(), []
@@ -1583,6 +1687,23 @@ def combine_raw_articles(category=None, query=None, language=None, limit_each=30
         if url not in seen:
             seen.add(url)
             merged.append(a)
+
+    # Apply date filter locally (for providers that can’t filter server-side)
+    if start_utc or end_utc:
+        s_ts = start_utc.timestamp() if start_utc else None
+        e_ts = end_utc.timestamp() if end_utc else None
+
+        def _in_range(row):
+            dt = _parse_any_pubdate(row.get("publishedAt") or "")
+            if not dt:
+                return False
+            t = dt.timestamp()
+            if s_ts and t < s_ts: return False
+            if e_ts and t > e_ts: return False
+            return True
+
+        merged = [a for a in merged if _in_range(a)]
+
     if log_summary:
         fetch_log.info("----- Article Fetch Summary -----")
         fetch_log.info(f"📊 NewsAPI returned: {len(a1)} articles")
@@ -1593,6 +1714,7 @@ def combine_raw_articles(category=None, query=None, language=None, limit_each=30
         fetch_log.info("---------------------------------")
     return merged
 
+
 # ----------------- API: /events -----------------
 @app.get("/events")
 def get_events(
@@ -1606,9 +1728,17 @@ def get_events(
     min_countries: int = Query(2, ge=1, le=50),
     min_articles: int = Query(2, ge=1, le=200),
     speed: Speed = Query(Speed.balanced),
+    start: Optional[str] = Query(None),
+    end: Optional[str] = Query(None),
 ):
+    start_dt = _parse_user_dt(start, "start")
+    end_dt = _parse_user_dt(end, "end")
+    if start_dt and end_dt and start_dt > end_dt:
+        start_dt, end_dt = end_dt, start_dt  # swap
+
     cache_key, enriched, clusters = get_or_build_events_cache(
+        q, category, language, False, None, limit_each,
+        start_utc=start_dt, end_utc=end_dt, speed=speed
     )
     view = enriched
     if translate and target_lang:
@@ -1635,28 +1765,47 @@ def get_event_details(
     target_lang: Optional[str] = Query(None),
     limit_each: int = Query(150, ge=5, le=250),
     max_samples: int = Query(5, ge=0, le=1000),
+    start: Optional[str] = Query(None),
+    end: Optional[str] = Query(None),
 ):
+    start_dt = _parse_user_dt(start, "start")
+    end_dt = _parse_user_dt(end, "end")
+
     if cache_key:
         parts = cache_key.split("|")
+        if len(parts) == 9:
+            speed_str = parts[8]
+            try:
+                speed_obj = Speed(speed_str)
+            except ValueError:
+                speed_obj = Speed.balanced
+            key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
+                         parts[4] == "True", parts[5].lower(),
+                         parts[6], parts[7], speed_str)
+        elif len(parts) == 7:  # backwards compat
+            speed_str = parts[6]
+            try:
+                speed_obj = Speed(speed_str)
+            except ValueError:
+                speed_obj = Speed.balanced
+            key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
+                         parts[4] == "True", parts[5].lower(), "", "", speed_str)
+        else:
            raise HTTPException(status_code=400, detail="Bad cache_key")
     else:
         speed_obj = Speed.balanced
+        key_tuple = cache_key_for(q, category, language, limit_each, translate, target_lang,
+                                  start_utc=start_dt, end_utc=end_dt, speed=speed_obj)
+
     entry = _events_cache.get(key_tuple)
     if not entry:
         _, enriched, clusters = get_or_build_events_cache(
+            q, category, language, False, None, limit_each,
+            start_utc=start_dt, end_utc=end_dt, speed=speed_obj
        )
     else:
        enriched, clusters = entry["enriched"], entry["clusters"]
+
    eview = enriched
    if translate and target_lang:
        eview = [dict(i) for i in enriched]
@@ -1688,25 +1837,32 @@ def get_news(
     speed: Speed = Query(Speed.balanced),
     page: int = Query(1, ge=1),
     page_size: int = Query(120, ge=5, le=300),
+    start: Optional[str] = Query(None),
+    end: Optional[str] = Query(None),
 ):
+    start_dt = _parse_user_dt(start, "start")
+    end_dt = _parse_user_dt(end, "end")
+
     enriched: List[Dict[str, Any]] = []
     if cache_key:
         parts = cache_key.split("|")
+        if len(parts) == 9:
+            key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
+                         parts[4] == "True", parts[5].lower(), parts[6], parts[7], parts[8])
            entry = _events_cache.get(key_tuple)
            if entry:
                enriched = entry["enriched"]
+        elif len(parts) == 7:  # backwards compat
+            key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
+                         parts[4] == "True", parts[5].lower(), "", "", parts[6])
+            entry = _events_cache.get(key_tuple)
+            if entry:
+                enriched = entry["enriched"]
+
     if not enriched:
+        raw = combine_raw_articles(category=category, query=q, language=language,
+                                   limit_each=limit_each, speed=speed,
+                                   start_utc=start_dt, end_utc=end_dt)
        prefetch_descriptions_async(raw, speed)
        enriched_all = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
        if category:
@@ -1733,9 +1889,9 @@ def get_news(
         s = sentiment.strip().lower()
         enriched = [i for i in enriched if i.get("sentiment", "").lower() == s]
     total = len(enriched)
+    offset = (page - 1) * page_size
+    end_idx = offset + page_size
+    items = [dict(i) for i in enriched[offset:end_idx]]
     if lite:
         drop = {"_ml_text"}
         for i in items:
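For reference, the new _parse_user_dt helper accepts either a bare date or a full ISO-8601 timestamp and always returns a UTC-aware datetime; a bare date expands to the start or the end of that day depending on which bound it parses. A minimal standalone sketch (the helper and its regex are reproduced from the diff above; the example dates are invented):

import re
from datetime import datetime, timezone
from typing import Optional

ISO_BASIC_RE = re.compile(r'^(\d{4})(\d{2})(\d{2})(?:[T ]?(\d{2})(\d{2})(\d{2}))?(Z|[+-]\d{2}:?\d{2})?$')

def _parse_user_dt(s: Optional[str], which: str) -> Optional[datetime]:
    """Parse query 'start'/'end' into UTC-aware datetimes."""
    if not s:
        return None
    s = s.strip()
    try:
        # Normalize Z
        if s.endswith("Z"):
            s = s[:-1] + "+00:00"
        # Date-only
        if re.match(r'^\d{4}-\d{2}-\d{2}$', s):
            s = s + ("T00:00:00+00:00" if which == "start" else "T23:59:59+00:00")
        dt = datetime.fromisoformat(s)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(timezone.utc)
    except Exception:
        m = ISO_BASIC_RE.match(s)
        if m:
            yyyy, MM, dd, hh, mm, ss, tz = m.groups()
            hh = hh or ("00" if which == "start" else "23")
            mm = mm or ("00" if which == "start" else "59")
            ss = ss or ("00" if which == "start" else "59")
            return datetime(int(yyyy), int(MM), int(dd), int(hh), int(mm), int(ss), tzinfo=timezone.utc)
        return None

# Example calls (illustrative values):
print(_parse_user_dt("2024-05-01", "start"))          # 2024-05-01 00:00:00+00:00
print(_parse_user_dt("2024-05-01", "end"))            # 2024-05-01 23:59:59+00:00
print(_parse_user_dt("2024-05-03T18:30:00Z", "end"))  # 2024-05-03 18:30:00+00:00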
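On the GDELT side, an absolute window (both bounds present) now takes precedence over the relative timespan. A small sketch of just that parameter-building decision, using the _gdelt_fmt helper from the diff; build_time_params is a hypothetical name, since the real code builds the dict inline inside fetch_gdelt_articles:

from datetime import datetime, timezone
from typing import Optional

def _gdelt_fmt(dt: datetime) -> str:
    # Same formatting as in the diff: UTC collapsed to YYYYMMDDHHMMSS
    return dt.astimezone(timezone.utc).strftime("%Y%m%d%H%M%S")

def build_time_params(timespan: str = "3d",
                      start_utc: Optional[datetime] = None,
                      end_utc: Optional[datetime] = None) -> dict:
    # Hypothetical helper mirroring the branch added to fetch_gdelt_articles
    params = {}
    if start_utc and end_utc:
        params["startdatetime"] = _gdelt_fmt(start_utc)
        params["enddatetime"] = _gdelt_fmt(end_utc)
    else:
        params["timespan"] = timespan
    return params

print(build_time_params(
    start_utc=datetime(2024, 5, 1, tzinfo=timezone.utc),
    end_utc=datetime(2024, 5, 3, 23, 59, 59, tzinfo=timezone.utc),
))
# {'startdatetime': '20240501000000', 'enddatetime': '20240503235959'}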
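The cache key also grows from 7 to 9 fields: the formatted start and end bounds slot in between the target language and the speed value, which is why /event_details and /news now accept both the 7-part and the 9-part cache_key. A sketch of the two shapes with made-up values (the "|"-joined string form is inferred from the cache_key.split("|") calls in the diff, not shown in these hunks):

# Old 7-field key: q, category, language, limit_each, translate, target_lang, speed
old_key = ("climate", "", "en", 100, False, "", "balanced")

# New 9-field key: same fields plus GDELT-formatted start/end bounds before speed
new_key = ("climate", "", "en", 100, False, "", "20240501000000", "20240503235959", "balanced")

# The endpoints receive the key as a "|"-joined string and branch on its length:
for key in (old_key, new_key):
    parts = "|".join(map(str, key)).split("|")
    print(len(parts), parts[-1])   # prints "7 balanced" then "9 balanced"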
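Taken together, /events, /event_details, and /news all gain optional start and end query parameters. A hedged example of how a client might call the updated endpoints (the base URL is a placeholder and the query values are invented; the parameter names come from the diff):

import requests

BASE = "http://localhost:7860"  # placeholder; substitute the running Space's URL

# Date-only bounds are expanded server-side to 00:00:00 / 23:59:59 UTC,
# and /events swaps the bounds if they arrive out of order.
events = requests.get(f"{BASE}/events", params={
    "q": "elections",        # invented query
    "speed": "balanced",
    "start": "2024-05-01",
    "end": "2024-05-03",
}, timeout=120).json()

# Full ISO timestamps are accepted as well.
news = requests.get(f"{BASE}/news", params={
    "start": "2024-05-01T00:00:00Z",
    "end": "2024-05-03T23:59:59Z",
    "page": 1,
    "page_size": 50,
}, timeout=120).json()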