Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
# ----------------- Imports (Stdlib + Typing) -----------------
|
2 |
from fastapi import FastAPI, Query, HTTPException, Body
|
3 |
-
from fastapi import FastAPI, Query, HTTPException, Body
|
4 |
from typing import Optional, List, Dict, Any, Tuple, Set
|
5 |
import os
|
6 |
import time
|
@@ -451,18 +450,59 @@ SECTION_HINTS = {
|
|
451 |
|
452 |
KEYWORDS = {
|
453 |
"sports": r"\b(NBA|NFL|MLB|NHL|Olympic|goal|match|tournament|coach|transfer)\b",
|
454 |
-
"business": r"\b(stocks?|earnings|IPO|merger|acquisition|revenue|inflation|market)\b",
|
455 |
"technology": r"\b(AI|software|chip|semiconductor|app|startup|cyber|hack|quantum|robot)\b",
|
456 |
"science": r"\b(researchers?|study|physics|astronomy|genome|spacecraft|telescope)\b",
|
457 |
-
"health": r"\b(virus|vaccine|disease|hospital|doctor|public health|covid)\b",
|
458 |
-
"entertainment": r"\b(movie|film|box office|celebrity|series|show|album|music)\b",
|
459 |
"crime": r"\b(arrested|charged|police|homicide|fraud|theft|court|lawsuit)\b",
|
460 |
"weather": r"\b(hurricane|storm|flood|heatwave|blizzard|tornado|forecast)\b",
|
461 |
"environment": r"\b(climate|emissions|wildfire|deforestation|biodiversity)\b",
|
462 |
"travel": r"\b(flight|airline|airport|tourism|visa|cruise|hotel)\b",
|
463 |
-
"politics": r"\b(president|parliament|congress|minister|policy|campaign|election)\b",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
464 |
}
|
465 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
466 |
def get_news_clf():
|
467 |
# Lazy-init topic classifier
|
468 |
global _news_clf
|
@@ -496,29 +536,25 @@ def _infer_category_from_text(text: str) -> Optional[str]:
|
|
496 |
|
497 |
def infer_category(article_url, title, description, provided):
|
498 |
if provided:
|
499 |
-
|
500 |
-
if
|
501 |
-
return
|
502 |
try:
|
503 |
p = urlparse(article_url).path or ""
|
504 |
cat = _infer_category_from_url_path(p)
|
505 |
if cat:
|
506 |
-
return cat
|
507 |
except Exception:
|
508 |
pass
|
509 |
text = f"{title or ''} {description or ''}".strip()
|
510 |
cat = _infer_category_from_text(text)
|
511 |
if cat:
|
512 |
-
return cat
|
513 |
try:
|
514 |
preds = get_news_clf()(text[:512])
|
515 |
-
if isinstance(preds[0], list)
|
516 |
-
|
517 |
-
|
518 |
-
label = preds[0]["label"]
|
519 |
-
return label.lower()
|
520 |
-
except Exception as e:
|
521 |
-
log.warning(f"ML category failed: {e}")
|
522 |
return "general"
|
523 |
|
524 |
# ----------------- Language Detection / Embeddings -----------------
|
@@ -710,25 +746,48 @@ def opus_model_for(src2: str, tgt2: str) -> Optional[str]:
|
|
710 |
SUPPORTED = {"en", "fr", "de", "es", "it", "hi", "ar", "ru", "ja", "ko", "pt", "zh"}
|
711 |
LIBRETRANSLATE_URL = os.getenv("LIBRETRANSLATE_URL")
|
712 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
713 |
def _translate_via_libre(text: str, src: str, tgt: str) -> Optional[str]:
|
714 |
url = LIBRETRANSLATE_URL
|
715 |
if not url or not text or src == tgt:
|
716 |
return None
|
717 |
-
|
718 |
-
|
719 |
-
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
|
727 |
-
|
728 |
-
|
729 |
-
|
730 |
-
|
731 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
732 |
|
733 |
def _hf_call(model_id: str, payload: dict) -> Optional[str]:
|
734 |
if not (HUGGINGFACE_API_TOKEN and ALLOW_HF_REMOTE):
|
@@ -886,10 +945,16 @@ def _warm_once():
|
|
886 |
|
887 |
@app.on_event("startup")
|
888 |
def warm():
|
|
|
|
|
|
|
|
|
|
|
889 |
get_sbert()
|
890 |
get_news_clf()
|
891 |
threading.Thread(target=_warm_once, daemon=True).start()
|
892 |
|
|
|
893 |
# ----------------- GDELT Query Helpers -----------------
|
894 |
_GDELT_LANG = {
|
895 |
"en": "english",
|
@@ -1026,8 +1091,8 @@ def fetch_gdelt_multi(limit=120, query=None, language=None, timespan="48h", cate
|
|
1026 |
|
1027 |
|
1028 |
# ----------------- Provider Flags / Keys / Logging -----------------
|
1029 |
-
USE_GNEWS_API =
|
1030 |
-
USE_NEWSDATA_API =
|
1031 |
USE_GDELT_API = True
|
1032 |
USE_NEWSAPI = False
|
1033 |
|
@@ -1157,7 +1222,11 @@ def enrich_article(a, language=None, translate=False, target_lang=None):
|
|
1157 |
sentiment = classify_sentiment(f"{orig_title} {orig_description}")
|
1158 |
seed = f"{source_name}|{article_url}|{title}"
|
1159 |
uid = hashlib.md5(seed.encode("utf-8")).hexdigest()[:12]
|
1160 |
-
|
|
|
|
|
|
|
|
|
1161 |
return {
|
1162 |
"id": uid,
|
1163 |
"title": title,
|
@@ -1232,7 +1301,7 @@ def event_payload_from_cluster(cluster, enriched_articles):
|
|
1232 |
"sample_urls": [a["url"] for a in arts[:3] if a.get("url")],
|
1233 |
}
|
1234 |
|
1235 |
-
def aggregate_event_by_country(cluster, enriched_articles):
|
1236 |
idxs = cluster["indices"]
|
1237 |
arts = [enriched_articles[i] for i in idxs]
|
1238 |
by_country: Dict[str, Dict[str, Any]] = {}
|
@@ -1251,6 +1320,7 @@ def aggregate_event_by_country(cluster, enriched_articles):
|
|
1251 |
avg_sent = "positive" if avg > 0.15 else "negative" if avg < -0.15 else "neutral"
|
1252 |
top_sources = [s for s, _ in Counter([a["source"] for a in arr]).most_common(3)]
|
1253 |
summary = " • ".join([a["title"] for a in arr[:2]])
|
|
|
1254 |
results.append(
|
1255 |
{
|
1256 |
"country": c,
|
@@ -1270,7 +1340,8 @@ def aggregate_event_by_country(cluster, enriched_articles):
|
|
1270 |
"sentiment": a["sentiment"],
|
1271 |
"detected_lang": a.get("detected_lang"),
|
1272 |
}
|
1273 |
-
for a in arr[:5]
|
|
|
1274 |
],
|
1275 |
}
|
1276 |
)
|
@@ -1500,13 +1571,10 @@ def combine_raw_articles(category=None, query=None, language=None, limit_each=30
|
|
1500 |
a3 = fetch_gnews_articles(limit=limit_each, query=query, language=language) if USE_GNEWS_API else []
|
1501 |
gdelt_limit = limit_each
|
1502 |
a4 = fetch_gdelt_multi(
|
1503 |
-
limit=
|
1504 |
-
|
1505 |
-
|
1506 |
-
|
1507 |
-
category=category,
|
1508 |
-
speed=speed,
|
1509 |
-
)
|
1510 |
seen, merged = set(), []
|
1511 |
for a in a1 + a3 + a2 + a4:
|
1512 |
if a.get("url"):
|
@@ -1566,6 +1634,7 @@ def get_event_details(
|
|
1566 |
translate: Optional[bool] = Query(False),
|
1567 |
target_lang: Optional[str] = Query(None),
|
1568 |
limit_each: int = Query(150, ge=5, le=250),
|
|
|
1569 |
):
|
1570 |
if cache_key:
|
1571 |
parts = cache_key.split("|")
|
@@ -1600,7 +1669,7 @@ def get_event_details(
|
|
1600 |
if not cluster:
|
1601 |
raise HTTPException(status_code=404, detail="Event not found with current filters")
|
1602 |
payload = event_payload_from_cluster(cluster, eview)
|
1603 |
-
countries = aggregate_event_by_country(cluster, eview)
|
1604 |
payload["articles_in_event"] = sum(c["count"] for c in countries)
|
1605 |
return {"event": payload, "countries": countries}
|
1606 |
|
@@ -1746,14 +1815,45 @@ def client_metric(payload: Dict[str, Any] = Body(...)):
|
|
1746 |
|
1747 |
# ----------------- Diagnostics: Translation Health -----------------
|
1748 |
@app.get("/diag/translate")
|
1749 |
-
def diag_translate(
|
1750 |
-
|
1751 |
-
|
1752 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1753 |
return {
|
1754 |
-
"
|
|
|
|
|
|
|
1755 |
"remote_ok": bool(remote),
|
1756 |
"local_ok": bool(local),
|
1757 |
-
"
|
1758 |
-
"
|
|
|
|
|
1759 |
}
|
|
|
1 |
# ----------------- Imports (Stdlib + Typing) -----------------
|
2 |
from fastapi import FastAPI, Query, HTTPException, Body
|
|
|
3 |
from typing import Optional, List, Dict, Any, Tuple, Set
|
4 |
import os
|
5 |
import time
|
|
|
450 |
|
451 |
# Regex keyword heuristics for category inference (first matching pattern wins).
# NOTE: patterns are case-sensitive as written (e.g. "AI", "NBA"); keep new
# keywords consistent with that.
KEYWORDS = {
    "sports": r"\b(NBA|NFL|MLB|NHL|Olympic|goal|match|tournament|coach|transfer)\b",
    # de-duplicated: "inflation" and "revenue" appeared twice
    "business": r"\b(stocks?|earnings|IPO|merger|acquisition|revenue|inflation|market|tax|budget|deficit)\b",
    "technology": r"\b(AI|software|chip|semiconductor|app|startup|cyber|hack|quantum|robot)\b",
    "science": r"\b(researchers?|study|physics|astronomy|genome|spacecraft|telescope)\b",
    "health": r"\b(virus|vaccine|disease|hospital|doctor|public health|covid|recall|FDA|contamination|disease outbreak)\b",
    # bug fix: a trailing "|" left an empty alternation, so the pattern matched
    # at every word boundary and classified virtually all text as entertainment
    "entertainment": r"\b(movie|film|box office|celebrity|series|show|album|music)\b",
    "crime": r"\b(arrested|charged|police|homicide|fraud|theft|court|lawsuit)\b",
    "weather": r"\b(hurricane|storm|flood|heatwave|blizzard|tornado|forecast)\b",
    "environment": r"\b(climate|emissions|wildfire|deforestation|biodiversity)\b",
    "travel": r"\b(flight|airline|airport|tourism|visa|cruise|hotel)\b",
    "politics": r"\b(president|parliament|congress|minister|policy|campaign|election|rally|protest|demonstration)\b",
}
|
464 |
+
|
465 |
+
# ----------------- Category normalization to frontend set -----------------
# The fixed set of category slugs the frontend knows how to render.
FRONTEND_CATS = {
    "politics", "technology", "sports", "business", "entertainment",
    "science", "health", "crime", "weather", "environment", "travel",
    "viral", "general",
}

# Maps labels emitted by the ML topic classifier onto the frontend set.
ML_TO_FRONTEND = {
    "arts_&_culture": "entertainment",
    "business": "business",
    "business_&_entrepreneurs": "business",
    "celebrity_&_pop_culture": "entertainment",
    "crime": "crime",
    "diaries_&_daily_life": "viral",
    "entertainment": "entertainment",
    "environment": "environment",
    "fashion_&_style": "entertainment",
    "film_tv_&_video": "entertainment",
    "fitness_&_health": "health",
    "food_&_dining": "entertainment",
    "general": "general",
    "learning_&_educational": "science",
    "news_&_social_concern": "politics",
    "politics": "politics",
    "science_&_technology": "science",
    "sports": "sports",
    "technology": "technology",
    "travel_&_adventure": "travel",
    "other_hobbies": "viral",
}


def normalize_category(c: Optional[str]) -> str:
    """Normalize a raw provider/ML category label to a frontend slug.

    Empty/None input and unknown labels both map to "general".
    """
    label = (c or "").strip().lower()
    if not label:
        return "general"
    return label if label in FRONTEND_CATS else ML_TO_FRONTEND.get(label, "general")
|
503 |
+
|
504 |
+
|
505 |
+
|
506 |
def get_news_clf():
|
507 |
# Lazy-init topic classifier
|
508 |
global _news_clf
|
|
|
536 |
|
537 |
def infer_category(article_url, title, description, provided):
    """Infer a frontend category for an article.

    Resolution order: provider-supplied label -> URL-path heuristics ->
    keyword heuristics on title/description -> ML topic classifier.
    Always returns a normalized frontend slug; falls back to "general".
    """
    if provided:
        got = normalize_category(provided)
        if got:
            return got
    try:
        p = urlparse(article_url).path or ""
        cat = _infer_category_from_url_path(p)
        if cat:
            return normalize_category(cat)
    except Exception:
        # Malformed URLs are expected occasionally; fall through to text.
        pass
    text = f"{title or ''} {description or ''}".strip()
    cat = _infer_category_from_text(text)
    if cat:
        return normalize_category(cat)
    try:
        preds = get_news_clf()(text[:512])
        # Pipelines return [[{...}]] when top_k is set, else [{...}];
        # take the top label either way.
        label = preds[0][0]["label"] if isinstance(preds[0], list) else preds[0]["label"]
        return normalize_category(label)
    except Exception as e:
        # Don't fail enrichment if the classifier is unavailable — but do log,
        # so silent classifier outages remain diagnosable.
        log.warning(f"ML category failed: {e}")
        return "general"
|
559 |
|
560 |
# ----------------- Language Detection / Embeddings -----------------
|
|
|
746 |
SUPPORTED = {"en", "fr", "de", "es", "it", "hi", "ar", "ru", "ja", "ko", "pt", "zh"}
|
747 |
LIBRETRANSLATE_URL = os.getenv("LIBRETRANSLATE_URL")
|
748 |
|
749 |
+
def _lt_lang(code: str) -> str:
|
750 |
+
if not code:
|
751 |
+
return code
|
752 |
+
c = code.lower()
|
753 |
+
# LibreTranslate uses zh-Hans; normalize zh* to zh-Hans
|
754 |
+
if c.startswith("zh"):
|
755 |
+
return "zh-Hans"
|
756 |
+
return c
|
757 |
+
|
758 |
def _translate_via_libre(text: str, src: str, tgt: str) -> Optional[str]:
    """Translate *text* through a self-hosted LibreTranslate instance.

    Returns the translated string, or None when LibreTranslate is not
    configured, the input is empty, source equals target, or the call fails.
    """
    base = LIBRETRANSLATE_URL
    if not base or not text or src == tgt:
        return None

    body = {
        "q": text,
        "source": _lt_lang(src),
        "target": _lt_lang(tgt),
        "format": "text",
    }
    endpoint = f"{base.rstrip('/')}/translate"

    # The first call can be slow while LT warms its models; retry once.
    for attempt in (1, 2):
        try:
            resp = SESSION.post(endpoint, json=body, timeout=15)  # was 6
            if resp.status_code != 200:
                log.warning("LibreTranslate HTTP %s: %s", resp.status_code, resp.text[:200])
                return None
            translated = resp.json().get("translatedText")
            return translated if isinstance(translated, str) and translated else None
        except Exception as e:
            if attempt == 2:
                log.warning("LibreTranslate failed: %s", e)
                return None
            time.sleep(0.5)
|
790 |
+
|
791 |
|
792 |
def _hf_call(model_id: str, payload: dict) -> Optional[str]:
|
793 |
if not (HUGGINGFACE_API_TOKEN and ALLOW_HF_REMOTE):
|
|
|
945 |
|
946 |
@app.on_event("startup")
def warm():
    """Warm heavy models at startup; deeper warming continues in background."""
    # Drop any translation results cached before this process started.
    try:
        _translate_cached.cache_clear()
    except Exception:
        pass
    get_sbert()
    get_news_clf()
    warmer = threading.Thread(target=_warm_once, daemon=True)
    warmer.start()
|
956 |
|
957 |
+
|
958 |
# ----------------- GDELT Query Helpers -----------------
|
959 |
_GDELT_LANG = {
|
960 |
"en": "english",
|
|
|
1091 |
|
1092 |
|
1093 |
# ----------------- Provider Flags / Keys / Logging -----------------
|
1094 |
+
USE_GNEWS_API = False
|
1095 |
+
USE_NEWSDATA_API = False
|
1096 |
USE_GDELT_API = True
|
1097 |
USE_NEWSAPI = False
|
1098 |
|
|
|
1222 |
sentiment = classify_sentiment(f"{orig_title} {orig_description}")
|
1223 |
seed = f"{source_name}|{article_url}|{title}"
|
1224 |
uid = hashlib.md5(seed.encode("utf-8")).hexdigest()[:12]
|
1225 |
+
provided = a.get("category")
|
1226 |
+
if provided and normalize_category(provided) != "general":
|
1227 |
+
cat = normalize_category(provided)
|
1228 |
+
else:
|
1229 |
+
cat = infer_category(article_url, orig_title, orig_description, provided)
|
1230 |
return {
|
1231 |
"id": uid,
|
1232 |
"title": title,
|
|
|
1301 |
"sample_urls": [a["url"] for a in arts[:3] if a.get("url")],
|
1302 |
}
|
1303 |
|
1304 |
+
def aggregate_event_by_country(cluster, enriched_articles, max_samples: int | None = 5):
|
1305 |
idxs = cluster["indices"]
|
1306 |
arts = [enriched_articles[i] for i in idxs]
|
1307 |
by_country: Dict[str, Dict[str, Any]] = {}
|
|
|
1320 |
avg_sent = "positive" if avg > 0.15 else "negative" if avg < -0.15 else "neutral"
|
1321 |
top_sources = [s for s, _ in Counter([a["source"] for a in arr]).most_common(3)]
|
1322 |
summary = " • ".join([a["title"] for a in arr[:2]])
|
1323 |
+
use = arr if (max_samples in (None, 0) or max_samples < 0) else arr[:max_samples]
|
1324 |
results.append(
|
1325 |
{
|
1326 |
"country": c,
|
|
|
1340 |
"sentiment": a["sentiment"],
|
1341 |
"detected_lang": a.get("detected_lang"),
|
1342 |
}
|
1343 |
+
# for a in arr[:5]
|
1344 |
+
for a in use
|
1345 |
],
|
1346 |
}
|
1347 |
)
|
|
|
1571 |
a3 = fetch_gnews_articles(limit=limit_each, query=query, language=language) if USE_GNEWS_API else []
|
1572 |
gdelt_limit = limit_each
|
1573 |
a4 = fetch_gdelt_multi(
|
1574 |
+
limit=limit_each, query=query, language=language,
|
1575 |
+
timespan=timespan, category=category, speed=speed
|
1576 |
+
) if USE_GDELT_API else []
|
1577 |
+
|
|
|
|
|
|
|
1578 |
seen, merged = set(), []
|
1579 |
for a in a1 + a3 + a2 + a4:
|
1580 |
if a.get("url"):
|
|
|
1634 |
translate: Optional[bool] = Query(False),
|
1635 |
target_lang: Optional[str] = Query(None),
|
1636 |
limit_each: int = Query(150, ge=5, le=250),
|
1637 |
+
max_samples: int = Query(5, ge=0, le=1000),
|
1638 |
):
|
1639 |
if cache_key:
|
1640 |
parts = cache_key.split("|")
|
|
|
1669 |
if not cluster:
|
1670 |
raise HTTPException(status_code=404, detail="Event not found with current filters")
|
1671 |
payload = event_payload_from_cluster(cluster, eview)
|
1672 |
+
countries = aggregate_event_by_country(cluster, eview, max_samples=max_samples)
|
1673 |
payload["articles_in_event"] = sum(c["count"] for c in countries)
|
1674 |
return {"event": payload, "countries": countries}
|
1675 |
|
|
|
1815 |
|
1816 |
# ----------------- Diagnostics: Translation Health -----------------
|
1817 |
@app.get("/diag/translate")
def diag_translate(
    src: str = Query("pt"),
    tgt: str = Query("en"),
    text: str = Query("Olá mundo")
):
    """Diagnostic endpoint: exercise each translation backend once.

    Reports per-backend success flags, the first non-empty translation,
    and whether its detected language matches the requested target.
    """
    # Try each path explicitly (same order the runtime uses).
    libre = _translate_via_libre(text, src, tgt)

    remote = None
    opus_id = opus_model_for(src, tgt)
    if opus_id:
        remote = _hf_call(opus_id, {"inputs": text})
    local = _translate_local(text, src, tgt)

    # Optionally exercise the primary NLLB model when configured.
    nllb = None
    if HF_MODEL_PRIMARY and (src in NLLB_CODES) and (tgt in NLLB_CODES):
        nllb = _hf_call(
            HF_MODEL_PRIMARY,
            {
                "inputs": text,
                "parameters": {"src_lang": NLLB_CODES[src], "tgt_lang": NLLB_CODES[tgt]},
                "options": {"wait_for_model": True},
            },
        )

    sample_out = libre or remote or local or nllb
    out_lang = detect_lang(sample_out or "") or None

    return {
        "src": src, "tgt": tgt, "text": text,
        "libre_url": LIBRETRANSLATE_URL,
        "token_present": bool(HUGGINGFACE_API_TOKEN),
        "libre_ok": bool(libre),
        "remote_ok": bool(remote),
        "local_ok": bool(local),
        "nllb_ok": bool(nllb),
        "sample_out": sample_out,
        "sample_out_lang_detected": out_lang,
        "lang_match": (out_lang == tgt)
    }
|