Spaces:
Sleeping
Sleeping
Upload main.py
Browse files
main.py
CHANGED
@@ -28,9 +28,9 @@ import threading
|
|
28 |
import difflib
|
29 |
from starlette.middleware.gzip import GZipMiddleware
|
30 |
from transformers import pipeline as hf_pipeline
|
31 |
-
import os
|
32 |
os.environ.setdefault("OMP_NUM_THREADS", "1")
|
33 |
-
from fastapi import
|
|
|
34 |
|
35 |
import torch
|
36 |
torch.set_num_threads(2)
|
@@ -53,6 +53,17 @@ _local_pipes = {}
|
|
53 |
_news_clf = None
|
54 |
_sbert = None
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
# --- Translation runtime flags / caches ---
|
57 |
ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1" # default OFF
|
58 |
_hf_bad_models: Set[str] = set()
|
@@ -796,10 +807,21 @@ def cluster_id(cluster, enriched_articles):
|
|
796 |
|
797 |
|
798 |
# ----------------- NLTK / VADER -----------------
|
|
|
|
|
|
|
|
|
|
|
|
|
799 |
try:
|
800 |
nltk.data.find("sentiment/vader_lexicon")
|
801 |
except LookupError:
|
802 |
-
|
|
|
|
|
|
|
|
|
|
|
803 |
|
804 |
try:
|
805 |
_vader = SentimentIntensityAnalyzer()
|
@@ -875,7 +897,7 @@ def geocode_source(source_text: str, domain: str = "", do_network: bool = False)
|
|
875 |
if cache_key in domain_geo_cache:
|
876 |
return domain_geo_cache[cache_key]
|
877 |
|
878 |
-
ext =
|
879 |
fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
|
880 |
|
881 |
# 0) Major outlets / domain map
|
@@ -1456,7 +1478,7 @@ def enrich_article(a, language=None, translate=False, target_lang=None):
|
|
1456 |
# Canonicalize URL & derive domain
|
1457 |
article_url = _canonical_url(a.get("url") or "")
|
1458 |
try:
|
1459 |
-
ext =
|
1460 |
domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
|
1461 |
except Exception:
|
1462 |
domain = ""
|
@@ -2020,3 +2042,11 @@ def diag_translate():
|
|
2020 |
"libre_ok": bool(libre),
|
2021 |
"sample": libre or remote or local
|
2022 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
import difflib
|
29 |
from starlette.middleware.gzip import GZipMiddleware
|
30 |
from transformers import pipeline as hf_pipeline
|
|
|
31 |
os.environ.setdefault("OMP_NUM_THREADS", "1")
|
32 |
+
from fastapi.responses import PlainTextResponse
|
33 |
+
|
34 |
|
35 |
import torch
|
36 |
torch.set_num_threads(2)
|
|
|
53 |
_news_clf = None
|
54 |
_sbert = None
|
55 |
|
56 |
+
|
57 |
+
# set a writable cache for tldextract and avoid network PSL fetches
|
58 |
+
_TLD_CACHE = os.getenv("TLDEXTRACT_CACHE", "/data/tld_cache")
|
59 |
+
try:
|
60 |
+
# suffix_list_urls=None => never fetch the public suffix list over HTTP; use the on-disk cache, else tldextract's bundled snapshot
|
61 |
+
_tld = tldextract.TLDExtract(cache_dir=_TLD_CACHE, suffix_list_urls=None)
|
62 |
+
except Exception:
|
63 |
+
# fallback: tldextract's default extractor still parses domains, but it may try to fetch the PSL over HTTP on first use
|
64 |
+
_tld = tldextract.extract
|
65 |
+
|
66 |
+
|
67 |
# --- Translation runtime flags / caches ---
|
68 |
ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1" # default OFF
|
69 |
_hf_bad_models: Set[str] = set()
|
|
|
807 |
|
808 |
|
809 |
# ----------------- NLTK / VADER -----------------
|
810 |
+
NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/app/nltk_data")
|
811 |
+
|
812 |
+
# Make sure NLTK looks in the baked, writable dir first
|
813 |
+
if NLTK_DATA_DIR not in nltk.data.path:
|
814 |
+
nltk.data.path.insert(0, NLTK_DATA_DIR)
|
815 |
+
|
816 |
try:
|
817 |
nltk.data.find("sentiment/vader_lexicon")
|
818 |
except LookupError:
|
819 |
+
# As a fallback, try downloading into the writable dir (won't run if already baked)
|
820 |
+
try:
|
821 |
+
os.makedirs(NLTK_DATA_DIR, exist_ok=True)
|
822 |
+
nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)
|
823 |
+
except Exception:
|
824 |
+
pass # don't crash if download is blocked
|
825 |
|
826 |
try:
|
827 |
_vader = SentimentIntensityAnalyzer()
|
|
|
897 |
if cache_key in domain_geo_cache:
|
898 |
return domain_geo_cache[cache_key]
|
899 |
|
900 |
+
ext = _tld(domain or "")
|
901 |
fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
|
902 |
|
903 |
# 0) Major outlets / domain map
|
|
|
1478 |
# Canonicalize URL & derive domain
|
1479 |
article_url = _canonical_url(a.get("url") or "")
|
1480 |
try:
|
1481 |
+
ext = _tld(article_url)
|
1482 |
domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
|
1483 |
except Exception:
|
1484 |
domain = ""
|
|
|
2042 |
"libre_ok": bool(libre),
|
2043 |
"sample": libre or remote or local
|
2044 |
}
|
2045 |
+
|
2046 |
+
@app.get("/", include_in_schema=False)
|
2047 |
+
def root():
|
2048 |
+
return {"ok": True, "service": "newsglobe-backend"}
|
2049 |
+
|
2050 |
+
@app.get("/favicon.ico", include_in_schema=False)
|
2051 |
+
def favicon():
|
2052 |
+
return PlainTextResponse("", status_code=204)
|