import os

# ---------- Force writable cache locations (must match Dockerfile) ----------
# These must be set BEFORE importing nltk/transformers, which resolve their
# cache directories at import time.
os.environ.setdefault("NLTK_DATA", "/data/nltk_data")
os.environ.setdefault("HF_HOME", "/data/huggingface")
os.environ.setdefault("TRANSFORMERS_CACHE", "/data/huggingface/transformers")
os.environ.setdefault("HF_DATASETS_CACHE", "/data/huggingface/datasets")
os.environ.setdefault("TMPDIR", "/data/tmp")

import tempfile

import torch
import nltk
import speech_recognition as sr
from fastapi import FastAPI, UploadFile, File, HTTPException
from pydantic import BaseModel
from transformers import pipeline, BertForSequenceClassification, BertTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Also make sure nltk searches the writable path first.
nltk.data.path = [os.environ["NLTK_DATA"]] + nltk.data.path

# ---------- NLTK VADER ----------
try:
    nltk.data.find("sentiment/vader_lexicon")
except LookupError:
    # download into /data/nltk_data (writable)
    nltk.download("vader_lexicon", download_dir=os.environ["NLTK_DATA"])

vader = SentimentIntensityAnalyzer()

# ---------- Models ----------
# The "emotion" score reuses a multilingual sentiment pipeline; "tone" uses FinBERT.
emotion_model = pipeline("sentiment-analysis", model="tabularisai/multilingual-sentiment-analysis")
finbert = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone", num_labels=3)
finbert_tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
tone_labels = ["Neutral", "Positive", "Negative"]
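# Index order follows finbert-tone's id2label mapping (0=Neutral, 1=Positive, 2=Negative).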

app = FastAPI(title="Sentiment • Emotion • Tone API", version="2.0.1")


# ---------------- HELPERS ----------------
def _label3(label_str: str) -> str:
    """Collapse any model label to Positive/Negative/Neutral via substring match."""
    l = label_str.lower()
    if "pos" in l:
        return "Positive"
    if "neg" in l:
        return "Negative"
    return "Neutral"

def _signed_score(label: str, score01: float) -> float:
    """Map (label, confidence in [0, 1]) to a signed score in [-1, 1]."""
    if label == "Positive":
        return abs(float(score01))
    if label == "Negative":
        return -abs(float(score01))
    return 0.0

def score_sentiment(text: str) -> float:
    # VADER's "compound" score lies in [-1, 1]; +/-0.05 is the neutral band
    # recommended by the VADER authors.
    c = vader.polarity_scores(text)["compound"]
    if c >= 0.05:
        return _signed_score("Positive", abs(c))
    if c <= -0.05:
        return _signed_score("Negative", abs(c))
    return 0.0

def score_emotion(text: str) -> float:
    out = emotion_model(text)[0]
    lab = _label3(out["label"])
    return _signed_score(lab, float(out["score"]))

def score_tone(text: str) -> float:
    inputs = finbert_tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = finbert(**inputs).logits
        probs = torch.softmax(logits, dim=1).squeeze()
        idx = torch.argmax(probs).item()
        lab = tone_labels[idx]
        scr = float(probs[idx].item())
    return _signed_score(lab, scr)

def analyze_text_core(text: str):
    return [{
        "sentiment": round(score_sentiment(text), 4),
        "emotion":   round(score_emotion(text),   4),
        "tone":      round(score_tone(text),      4),
    }]
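
# Example response shape (numbers are illustrative only):
#   [{"sentiment": 0.6369, "emotion": 0.9871, "tone": -0.8402}]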


# ---------------- SCHEMAS ----------------
class TextIn(BaseModel):
    text: str


# ---------------- ROUTES ----------------
@app.get("/")
def root():
    return {"ok": True, "endpoints": ["/analyze-text", "/analyze-voice"]}

@app.post("/analyze-text")
def analyze_text(payload: TextIn):
    text = (payload.text or "").strip()
    if not text:
        raise HTTPException(status_code=400, detail="Text cannot be empty.")
    return analyze_text_core(text)
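
# Illustrative request (assuming the server listens on localhost:7860, the
# Hugging Face Spaces default; adjust host/port to your deployment):
#   curl -X POST http://localhost:7860/analyze-text \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Great quarter, revenue up!"}'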

@app.post("/analyze-voice")
async def analyze_voice(file: UploadFile = File(...)):
    # Save uploaded audio temporarily
    fname = (file.filename or "audio").lower()
    if not any(fname.endswith(ext) for ext in (".wav", ".aiff", ".aif")):
        raise HTTPException(status_code=400, detail="Please upload WAV/AIFF file (MP3 not supported by speech_recognition without ffmpeg).")

    data = await file.read()
    tmp_path = f"/tmp/{fname}"
    with open(tmp_path, "wb") as f:
        f.write(data)

    # SpeechRecognition with Google Web Speech API (free, no key)
    recognizer = sr.Recognizer()
    with sr.AudioFile(tmp_path) as source:
        audio = recognizer.record(source)

    try:
        transcript = recognizer.recognize_google(audio, language="en-US")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")

    return analyze_text_core(transcript)
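

# Local dev entry point: a minimal sketch, assuming uvicorn is installed.
# Port 7860 is the Hugging Face Spaces convention; in the container the
# server is normally started by the Dockerfile CMD instead.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)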