GitHub Actions committed on
Commit 27c8444 · 1 Parent(s): b97094d

Sync from GitHub repo

Files changed (7):
  1. app.py +1673 -26
  2. migrate.py +58 -4
  3. migrate_consumed_sentences.py +52 -0
  4. models.py +127 -13
  5. requirements.txt +2 -1
  6. templates/arena.html +6 -11
  7. tts.py +0 -2
app.py CHANGED
@@ -1,33 +1,1680 @@
- from flask import Flask, render_template_string

  app = Flask(__name__)

- HTML = """
- <!DOCTYPE html>
- <html lang="en">
- <head>
-     <meta charset="UTF-8">
-     <title>Maintenance</title>
-     <meta name="viewport" content="width=device-width, initial-scale=1">
-     <script src="https://cdn.tailwindcss.com"></script>
- </head>
- <body class="bg-gray-100 flex items-center justify-center h-screen">
-     <div class="bg-white p-8 rounded-2xl shadow-lg text-center max-w-md">
-         <svg class="mx-auto mb-4 w-16 h-16 text-yellow-500" fill="none" stroke="currentColor" stroke-width="1.5"
-              viewBox="0 0 24 24">
-             <path stroke-linecap="round" stroke-linejoin="round"
-                   d="M12 9v2m0 4h.01M4.93 4.93a10 10 0 0114.14 0 10 10 0 010 14.14 10 10 0 01-14.14 0 10 10 0 010-14.14z"/>
-         </svg>
-         <h1 class="text-2xl font-bold text-gray-800 mb-2">We'll be back soon!</h1>
-         <p class="text-gray-600">The TTS Arena is temporarily undergoing maintenance.<br>Thank you for your patience.</p>
-     </div>
- </body>
- </html>
- """

  @app.route("/")
- def maintenance():
-     return render_template_string(HTML)

  if __name__ == "__main__":
-     app.run(host="0.0.0.0", port=7860)
+ import os
+ from huggingface_hub import HfApi, hf_hub_download
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from concurrent.futures import ThreadPoolExecutor
+ from datetime import datetime
+ import threading  # Added for locking
+ from sqlalchemy import or_  # Added for vote counting query
+ from datasets import load_dataset
+
+ year = datetime.now().year
+ month = datetime.now().month
+
+ # Check if running in a Hugging Face Space
+ IS_SPACES = False
+ if os.getenv("SPACE_REPO_NAME"):
+     print("Running in a Hugging Face Space 🤗")
+     IS_SPACES = True
+
+     # Setup database sync for HF Spaces
+     if not os.path.exists("instance/tts_arena.db"):
+         os.makedirs("instance", exist_ok=True)
+         try:
+             print("Database not found, downloading from HF dataset...")
+             hf_hub_download(
+                 repo_id="TTS-AGI/database-arena-v2",
+                 filename="tts_arena.db",
+                 repo_type="dataset",
+                 local_dir="instance",
+                 token=os.getenv("HF_TOKEN"),
+             )
+             print("Database downloaded successfully ✅")
+         except Exception as e:
+             print(f"Error downloading database from HF dataset: {str(e)} ⚠️")
+
+ from flask import (
+     Flask,
+     render_template,
+     g,
+     request,
+     jsonify,
+     send_file,
+     redirect,
+     url_for,
+     session,
+     abort,
+ )
+ from flask_login import LoginManager, current_user
+ from models import *
+ from models import (
+     hash_sentence, is_sentence_consumed, mark_sentence_consumed,
+     get_unconsumed_sentences, get_consumed_sentences_count, get_random_unconsumed_sentence
+ )
+ from auth import auth, init_oauth, is_admin
+ from admin import admin
+ from security import is_vote_allowed, check_user_security_score, detect_coordinated_voting
+ import os
+ from dotenv import load_dotenv
+ from flask_limiter import Limiter
+ from flask_limiter.util import get_remote_address
+ import uuid
+ import tempfile
+ import shutil
+ from tts import predict_tts
+ import random
+ import json
+ from datetime import datetime, timedelta
+ from flask_migrate import Migrate
+ import requests
+ import functools
+ import time  # Added for potential retries
+
+
+ def get_client_ip():
+     """Get the client's IP address, handling proxies and load balancers."""
+     # Check for forwarded headers first (common with reverse proxies)
+     if request.headers.get('X-Forwarded-For'):
+         # X-Forwarded-For can contain multiple IPs, take the first one
+         return request.headers.get('X-Forwarded-For').split(',')[0].strip()
+     elif request.headers.get('X-Real-IP'):
+         return request.headers.get('X-Real-IP')
+     elif request.headers.get('CF-Connecting-IP'):  # Cloudflare
+         return request.headers.get('CF-Connecting-IP')
+     else:
+         return request.remote_addr
+
+
+ # Load environment variables
+ if not IS_SPACES:
+     load_dotenv()  # Only load .env if not running in a Hugging Face Space

  app = Flask(__name__)
+ app.config["SECRET_KEY"] = os.getenv("SECRET_KEY", os.urandom(24))
+ app.config["SQLALCHEMY_DATABASE_URI"] = os.getenv(
+     "DATABASE_URI", "sqlite:///tts_arena.db"
+ )
+ app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
+ app.config["SESSION_COOKIE_SECURE"] = True
+ app.config["SESSION_COOKIE_SAMESITE"] = (
+     "None" if IS_SPACES else "Lax"
+ )  # HF Spaces uses iframes to load the app, so we need to set SAMESITE to None
+ app.config["PERMANENT_SESSION_LIFETIME"] = timedelta(days=30)  # Set to desired duration
+
+ # Force HTTPS when running in HuggingFace Spaces
+ if IS_SPACES:
+     app.config["PREFERRED_URL_SCHEME"] = "https"
+
+ # Cloudflare Turnstile settings
+ app.config["TURNSTILE_ENABLED"] = (
+     os.getenv("TURNSTILE_ENABLED", "False").lower() == "true"
+ )
+ app.config["TURNSTILE_SITE_KEY"] = os.getenv("TURNSTILE_SITE_KEY", "")
+ app.config["TURNSTILE_SECRET_KEY"] = os.getenv("TURNSTILE_SECRET_KEY", "")
+ app.config["TURNSTILE_VERIFY_URL"] = (
+     "https://challenges.cloudflare.com/turnstile/v0/siteverify"
+ )
+
+ migrate = Migrate(app, db)
+
+ # Initialize extensions
+ db.init_app(app)
+ login_manager = LoginManager()
+ login_manager.init_app(app)
+ login_manager.login_view = "auth.login"
+
+ # Initialize OAuth
+ init_oauth(app)
+
+ # Configure rate limits
+ limiter = Limiter(
+     app=app,
+     key_func=get_remote_address,
+     default_limits=["2000 per day", "50 per minute"],
+     storage_uri="memory://",
+ )
+
+ # TTS Cache Configuration - Read from environment
+ TTS_CACHE_SIZE = int(os.getenv("TTS_CACHE_SIZE", "10"))
+ CACHE_AUDIO_SUBDIR = "cache"
+ tts_cache = {}  # sentence -> {model_a, model_b, audio_a, audio_b, created_at}
+ tts_cache_lock = threading.Lock()
+ SMOOTHING_FACTOR_MODEL_SELECTION = 500  # For weighted random model selection
+ # Increased max_workers to 8 for concurrent generation/refill
+ cache_executor = ThreadPoolExecutor(max_workers=8, thread_name_prefix='CacheReplacer')
+ all_harvard_sentences = []  # Keep the full list available
+
+ # Create temp directories
+ TEMP_AUDIO_DIR = os.path.join(tempfile.gettempdir(), "tts_arena_audio")
+ CACHE_AUDIO_DIR = os.path.join(TEMP_AUDIO_DIR, CACHE_AUDIO_SUBDIR)
+ os.makedirs(TEMP_AUDIO_DIR, exist_ok=True)
+ os.makedirs(CACHE_AUDIO_DIR, exist_ok=True)  # Ensure cache subdir exists
+
+
+ # Store active TTS sessions
+ app.tts_sessions = {}
+ tts_sessions = app.tts_sessions
+
+ # Store active conversational sessions
+ app.conversational_sessions = {}
+ conversational_sessions = app.conversational_sessions
+
+ # Register blueprints
+ app.register_blueprint(auth, url_prefix="/auth")
+ app.register_blueprint(admin)
+
+
+ @login_manager.user_loader
+ def load_user(user_id):
+     return User.query.get(int(user_id))
+
+
+ @app.before_request
+ def before_request():
+     g.user = current_user
+     g.is_admin = is_admin(current_user)
+
+     # Ensure HTTPS for HuggingFace Spaces environment
+     if IS_SPACES and request.headers.get("X-Forwarded-Proto") == "http":
+         url = request.url.replace("http://", "https://", 1)
+         return redirect(url, code=301)
+
+     # Check if Turnstile verification is required
+     if app.config["TURNSTILE_ENABLED"]:
+         # Exclude verification routes
+         excluded_routes = ["verify_turnstile", "turnstile_page", "static"]
+         if request.endpoint not in excluded_routes:
+             # Check if user is verified
+             if not session.get("turnstile_verified"):
+                 # Save original URL for redirect after verification
+                 redirect_url = request.url
+                 # Force HTTPS in HuggingFace Spaces
+                 if IS_SPACES and redirect_url.startswith("http://"):
+                     redirect_url = redirect_url.replace("http://", "https://", 1)
+
+                 # If it's an API request, return a JSON response
+                 if request.path.startswith("/api/"):
+                     return jsonify({"error": "Turnstile verification required"}), 403
+                 # For regular requests, redirect to verification page
+                 return redirect(url_for("turnstile_page", redirect_url=redirect_url))
+             else:
+                 # Check if verification has expired (default: 24 hours)
+                 verification_timeout = (
+                     int(os.getenv("TURNSTILE_TIMEOUT_HOURS", "24")) * 3600
+                 )  # Convert hours to seconds
+                 verified_at = session.get("turnstile_verified_at", 0)
+                 current_time = datetime.utcnow().timestamp()
+
+                 if current_time - verified_at > verification_timeout:
+                     # Verification expired, clear status and redirect to verification page
+                     session.pop("turnstile_verified", None)
+                     session.pop("turnstile_verified_at", None)
+
+                     redirect_url = request.url
+                     # Force HTTPS in HuggingFace Spaces
+                     if IS_SPACES and redirect_url.startswith("http://"):
+                         redirect_url = redirect_url.replace("http://", "https://", 1)
+
+                     if request.path.startswith("/api/"):
+                         return jsonify({"error": "Turnstile verification expired"}), 403
+                     return redirect(
+                         url_for("turnstile_page", redirect_url=redirect_url)
+                     )
+
+
+ @app.route("/turnstile", methods=["GET"])
+ def turnstile_page():
+     """Display Cloudflare Turnstile verification page"""
+     redirect_url = request.args.get("redirect_url", url_for("arena", _external=True))
+
+     # Force HTTPS in HuggingFace Spaces
+     if IS_SPACES and redirect_url.startswith("http://"):
+         redirect_url = redirect_url.replace("http://", "https://", 1)
+
+     return render_template(
+         "turnstile.html",
+         turnstile_site_key=app.config["TURNSTILE_SITE_KEY"],
+         redirect_url=redirect_url,
+     )

+
+ @app.route("/verify-turnstile", methods=["POST"])
+ def verify_turnstile():
+     """Verify Cloudflare Turnstile token"""
+     token = request.form.get("cf-turnstile-response")
+     redirect_url = request.form.get("redirect_url", url_for("arena", _external=True))
+
+     # Force HTTPS in HuggingFace Spaces
+     if IS_SPACES and redirect_url.startswith("http://"):
+         redirect_url = redirect_url.replace("http://", "https://", 1)
+
+     if not token:
+         # If AJAX request, return JSON error
+         if request.headers.get("X-Requested-With") == "XMLHttpRequest":
+             return (
+                 jsonify({"success": False, "error": "Missing verification token"}),
+                 400,
+             )
+         # Otherwise redirect back to turnstile page
+         return redirect(url_for("turnstile_page", redirect_url=redirect_url))
+
+     # Verify token with Cloudflare
+     data = {
+         "secret": app.config["TURNSTILE_SECRET_KEY"],
+         "response": token,
+         "remoteip": request.remote_addr,
+     }
+
+     try:
+         response = requests.post(app.config["TURNSTILE_VERIFY_URL"], data=data)
+         result = response.json()
+
+         if result.get("success"):
+             # Set verification status in session
+             session["turnstile_verified"] = True
+             session["turnstile_verified_at"] = datetime.utcnow().timestamp()
+
+             # Determine response type based on request
+             is_xhr = request.headers.get("X-Requested-With") == "XMLHttpRequest"
+             accepts_json = "application/json" in request.headers.get("Accept", "")
+
+             # If AJAX or JSON request, return success JSON
+             if is_xhr or accepts_json:
+                 return jsonify({"success": True, "redirect": redirect_url})
+
+             # For regular form submissions, redirect to the target URL
+             return redirect(redirect_url)
+         else:
+             # Verification failed
+             app.logger.warning(f"Turnstile verification failed: {result}")
+
+             # If AJAX request, return JSON error
+             if request.headers.get("X-Requested-With") == "XMLHttpRequest":
+                 return jsonify({"success": False, "error": "Verification failed"}), 403
+
+             # Otherwise redirect back to turnstile page
+             return redirect(url_for("turnstile_page", redirect_url=redirect_url))
+
+     except Exception as e:
+         app.logger.error(f"Turnstile verification error: {str(e)}")
+
+         # If AJAX request, return JSON error
+         if request.headers.get("X-Requested-With") == "XMLHttpRequest":
+             return (
+                 jsonify(
+                     {"success": False, "error": "Server error during verification"}
+                 ),
+                 500,
+             )
+
+         # Otherwise redirect back to turnstile page
+         return redirect(url_for("turnstile_page", redirect_url=redirect_url))
+
+ # Load sentences from the TTS-AGI/arena-prompts dataset
+ print("Loading TTS-AGI/arena-prompts dataset...")
+ dataset = load_dataset("TTS-AGI/arena-prompts", split="train")
+ # Extract the text column and clean up
+ all_harvard_sentences = [item['text'].strip() for item in dataset if item['text'] and item['text'].strip()]
+ print(f"Loaded {len(all_harvard_sentences)} sentences from dataset")
+
+ # Initialize initial_sentences as empty - will be populated with unconsumed sentences only
+ initial_sentences = []

  @app.route("/")
+ def arena():
+     # Pass a subset of sentences for the random button fallback
+     return render_template("arena.html", harvard_sentences=json.dumps(initial_sentences))
+
+
+ @app.route("/leaderboard")
+ def leaderboard():
+     tts_leaderboard = get_leaderboard_data(ModelType.TTS)
+     conversational_leaderboard = get_leaderboard_data(ModelType.CONVERSATIONAL)
+     top_voters = get_top_voters(10)  # Get top 10 voters
+
+     # Initialize personal leaderboard data
+     tts_personal_leaderboard = None
+     conversational_personal_leaderboard = None
+     user_leaderboard_visibility = None
+
+     # If user is logged in, get their personal leaderboard and visibility setting
+     if current_user.is_authenticated:
+         tts_personal_leaderboard = get_user_leaderboard(current_user.id, ModelType.TTS)
+         conversational_personal_leaderboard = get_user_leaderboard(
+             current_user.id, ModelType.CONVERSATIONAL
+         )
+         user_leaderboard_visibility = current_user.show_in_leaderboard
+
+     # Get key dates for the timeline
+     tts_key_dates = get_key_historical_dates(ModelType.TTS)
+     conversational_key_dates = get_key_historical_dates(ModelType.CONVERSATIONAL)
+
+     # Format dates for display in the dropdown
+     formatted_tts_dates = [date.strftime("%B %Y") for date in tts_key_dates]
+     formatted_conversational_dates = [
+         date.strftime("%B %Y") for date in conversational_key_dates
+     ]
+
+     return render_template(
+         "leaderboard.html",
+         tts_leaderboard=tts_leaderboard,
+         conversational_leaderboard=conversational_leaderboard,
+         tts_personal_leaderboard=tts_personal_leaderboard,
+         conversational_personal_leaderboard=conversational_personal_leaderboard,
+         tts_key_dates=tts_key_dates,
+         conversational_key_dates=conversational_key_dates,
+         formatted_tts_dates=formatted_tts_dates,
+         formatted_conversational_dates=formatted_conversational_dates,
+         top_voters=top_voters,
+         user_leaderboard_visibility=user_leaderboard_visibility
+     )
+
+
+ @app.route("/api/historical-leaderboard/<model_type>")
+ def historical_leaderboard(model_type):
+     """Get historical leaderboard data for a specific date"""
+     if model_type not in [ModelType.TTS, ModelType.CONVERSATIONAL]:
+         return jsonify({"error": "Invalid model type"}), 400
+
+     # Get date from query parameter
+     date_str = request.args.get("date")
+     if not date_str:
+         return jsonify({"error": "Date parameter is required"}), 400
+
+     try:
+         # Parse date from URL parameter (format: YYYY-MM-DD)
+         target_date = datetime.strptime(date_str, "%Y-%m-%d")
+
+         # Get historical leaderboard data
+         leaderboard_data = get_historical_leaderboard_data(model_type, target_date)
+
+         return jsonify(
+             {"date": target_date.strftime("%B %d, %Y"), "leaderboard": leaderboard_data}
+         )
+     except ValueError:
+         return jsonify({"error": "Invalid date format. Use YYYY-MM-DD"}), 400
+
+
+ @app.route("/about")
+ def about():
+     return render_template("about.html")
+
+
+ # --- TTS Caching Functions ---
+
+ def generate_and_save_tts(text, model_id, output_dir):
+     """Generates TTS and saves it to a specific directory, returning the full path."""
+     temp_audio_path = None  # Initialize to None
+     try:
+         app.logger.debug(f"[TTS Gen {model_id}] Starting generation for: '{text[:30]}...'")
+         # If predict_tts saves file itself and returns path:
+         temp_audio_path = predict_tts(text, model_id)
+         app.logger.debug(f"[TTS Gen {model_id}] predict_tts returned: {temp_audio_path}")
+
+         if not temp_audio_path or not os.path.exists(temp_audio_path):
+             app.logger.warning(f"[TTS Gen {model_id}] predict_tts failed or returned invalid path: {temp_audio_path}")
+             raise ValueError("predict_tts did not return a valid path or file does not exist")
+
+         file_uuid = str(uuid.uuid4())
+         dest_path = os.path.join(output_dir, f"{file_uuid}.wav")
+         app.logger.debug(f"[TTS Gen {model_id}] Moving {temp_audio_path} to {dest_path}")
+         # Move the file generated by predict_tts to the target cache directory
+         shutil.move(temp_audio_path, dest_path)
+         app.logger.debug(f"[TTS Gen {model_id}] Move successful. Returning {dest_path}")
+         return dest_path
+
+     except Exception as e:
+         app.logger.error(f"Error generating/saving TTS for model {model_id} and text '{text[:30]}...': {str(e)}")
+         # Ensure temporary file from predict_tts (if any) is cleaned up
+         if temp_audio_path and os.path.exists(temp_audio_path):
+             try:
+                 app.logger.debug(f"[TTS Gen {model_id}] Cleaning up temporary file {temp_audio_path} after error.")
+                 os.remove(temp_audio_path)
+             except OSError:
+                 pass  # Ignore error if file couldn't be removed
+         return None
+
+
+ def _generate_cache_entry_task(sentence):
+     """Task function to generate audio for a sentence and add to cache."""
+     # Wrap the entire task in an application context
+     with app.app_context():
+         if not sentence:
+             # Select a new sentence if not provided (for replacement)
+             with tts_cache_lock:
+                 cached_keys = set(tts_cache.keys())
+                 # Get unconsumed sentences that are also not already cached
+                 unconsumed_sentences = get_unconsumed_sentences(all_harvard_sentences)
+                 available_sentences = [s for s in unconsumed_sentences if s not in cached_keys]
+                 if not available_sentences:
+                     app.logger.warning("No more unconsumed sentences available for caching. All sentences have been consumed.")
+                     return
+                 sentence = random.choice(available_sentences)
+
+         # app.logger.info removed duplicate log
+         print(f"[Cache Task] Querying models for: '{sentence[:50]}...'")
+         available_models = Model.query.filter_by(
+             model_type=ModelType.TTS, is_active=True
+         ).all()
+
+         if len(available_models) < 2:
+             app.logger.error("Not enough active TTS models to generate cache entry.")
+             return
+
+         try:
+             models = get_weighted_random_models(available_models, 2, ModelType.TTS)
+             model_a_id = models[0].id
+             model_b_id = models[1].id
+
+             # Generate audio concurrently using a local executor for clarity within the task
+             with ThreadPoolExecutor(max_workers=2, thread_name_prefix='AudioGen') as audio_executor:
+                 future_a = audio_executor.submit(generate_and_save_tts, sentence, model_a_id, CACHE_AUDIO_DIR)
+                 future_b = audio_executor.submit(generate_and_save_tts, sentence, model_b_id, CACHE_AUDIO_DIR)
+
+                 timeout_seconds = 120
+                 audio_a_path = future_a.result(timeout=timeout_seconds)
+                 audio_b_path = future_b.result(timeout=timeout_seconds)
+
+             if audio_a_path and audio_b_path:
+                 with tts_cache_lock:
+                     # Only add if the sentence isn't already back in the cache
+                     # And ensure cache size doesn't exceed limit
+                     if sentence not in tts_cache and len(tts_cache) < TTS_CACHE_SIZE:
+                         tts_cache[sentence] = {
+                             "model_a": model_a_id,
+                             "model_b": model_b_id,
+                             "audio_a": audio_a_path,
+                             "audio_b": audio_b_path,
+                             "created_at": datetime.utcnow(),
+                         }
+                         # Mark sentence as consumed for cache usage
+                         mark_sentence_consumed(sentence, usage_type='cache')
+                         app.logger.info(f"Successfully cached entry for: '{sentence[:50]}...'")
+                     elif sentence in tts_cache:
+                         app.logger.warning(f"Sentence '{sentence[:50]}...' already re-cached. Discarding new generation.")
+                         # Clean up the newly generated files if not added
+                         if os.path.exists(audio_a_path): os.remove(audio_a_path)
+                         if os.path.exists(audio_b_path): os.remove(audio_b_path)
+                     else:  # Cache is full
+                         app.logger.warning(f"Cache is full ({len(tts_cache)} entries). Discarding new generation for '{sentence[:50]}...'.")
+                         # Clean up the newly generated files if not added
+                         if os.path.exists(audio_a_path): os.remove(audio_a_path)
+                         if os.path.exists(audio_b_path): os.remove(audio_b_path)
+
+             else:
+                 app.logger.error(f"Failed to generate one or both audio files for cache: '{sentence[:50]}...'")
+                 # Clean up whichever file might have been created
+                 if audio_a_path and os.path.exists(audio_a_path): os.remove(audio_a_path)
+                 if audio_b_path and os.path.exists(audio_b_path): os.remove(audio_b_path)
+
+         except Exception as e:
+             # Log the exception within the app context
+             app.logger.error(f"Exception in _generate_cache_entry_task for '{sentence[:50]}...': {str(e)}", exc_info=True)
+
+
+ def update_initial_sentences():
+     """Update initial sentences to only include unconsumed ones."""
+     global initial_sentences
+     try:
+         unconsumed_for_initial = get_unconsumed_sentences(all_harvard_sentences)
+         if unconsumed_for_initial:
+             initial_sentences = random.sample(unconsumed_for_initial, min(len(unconsumed_for_initial), 500))
+             print(f"Updated initial sentences with {len(initial_sentences)} unconsumed sentences")
+         else:
+             print("Warning: No unconsumed sentences available for initial selection, disabling fallback")
+             initial_sentences = []  # No fallback to consumed sentences
+     except Exception as e:
+         print(f"Error updating initial sentences: {e}, disabling fallback for security")
+         initial_sentences = []  # No fallback to consumed sentences
+
+
+ def initialize_tts_cache():
+     """Selects initial sentences and starts generation tasks."""
+     print("Initializing TTS cache")
+     with app.app_context():  # Ensure access to models
+         if not all_harvard_sentences:
+             app.logger.error("Harvard sentences not loaded. Cannot initialize cache.")
+             return
+
+         # Update initial sentences with unconsumed ones
+         update_initial_sentences()
+
+         # Only use unconsumed sentences for initial cache population
+         unconsumed_sentences = get_unconsumed_sentences(all_harvard_sentences)
+         if not unconsumed_sentences:
+             app.logger.error("No unconsumed sentences available for cache initialization. Cache will remain empty.")
+             app.logger.warning("WARNING: All sentences from the dataset have been consumed. No new TTS generations will be possible.")
+             return
+         initial_selection = random.sample(unconsumed_sentences, min(len(unconsumed_sentences), TTS_CACHE_SIZE))
+         app.logger.info(f"Initializing TTS cache with {len(initial_selection)} sentences...")
+
+         for sentence in initial_selection:
+             # Use the main cache_executor for initial population too
+             cache_executor.submit(_generate_cache_entry_task, sentence)
+         app.logger.info("Submitted initial cache generation tasks.")
+
+ # --- End TTS Caching Functions ---
+
+
+ @app.route("/api/tts/generate", methods=["POST"])
+ @limiter.limit("10 per minute")  # Keep limit, cached responses are still requests
+ def generate_tts():
+     # If verification not setup, handle it first
+     if app.config["TURNSTILE_ENABLED"] and not session.get("turnstile_verified"):
+         return jsonify({"error": "Turnstile verification required"}), 403
+
+     # Require user to be logged in to generate audio
+     if not current_user.is_authenticated:
+         return jsonify({"error": "You must be logged in to generate audio"}), 401
+
+     data = request.json
+     text = data.get("text", "").strip()  # Ensure text is stripped
+
+     if not text or len(text) > 1000:
+         return jsonify({"error": "Invalid or too long text"}), 400
+
+     # Check if sentence has already been consumed
+     if is_sentence_consumed(text):
+         remaining_count = len(get_unconsumed_sentences(all_harvard_sentences))
+         if remaining_count == 0:
+             return jsonify({"error": "This sentence has already been used and no unconsumed sentences remain. All sentences from the dataset have been consumed."}), 400
+         else:
+             return jsonify({"error": f"This sentence has already been used. Please select a different sentence. {remaining_count} sentences remain available."}), 400
+
+     # --- Cache Check ---
+     cache_hit = False
+     session_data_from_cache = None
+     with tts_cache_lock:
+         if text in tts_cache:
+             cache_hit = True
+             cached_entry = tts_cache.pop(text)  # Remove from cache immediately
+             app.logger.info(f"TTS Cache HIT for: '{text[:50]}...'")
+
+             # Prepare session data using cached info
+             session_id = str(uuid.uuid4())
+             session_data_from_cache = {
+                 "model_a": cached_entry["model_a"],
+                 "model_b": cached_entry["model_b"],
+                 "audio_a": cached_entry["audio_a"],  # Paths are now from cache_dir
+                 "audio_b": cached_entry["audio_b"],
+                 "text": text,
+                 "created_at": datetime.utcnow(),
+                 "expires_at": datetime.utcnow() + timedelta(minutes=30),
+                 "voted": False,
+                 "cache_hit": True,
+             }
+             app.tts_sessions[session_id] = session_data_from_cache
+
+             # Note: Sentence was already marked as consumed when it was cached
+             # No need to mark it again here
+
+             # --- Trigger background tasks to refill the cache ---
+             # Calculate how many slots need refilling
+             current_cache_size = len(tts_cache)  # Size *before* adding potentially new items
+             needed_refills = TTS_CACHE_SIZE - current_cache_size
+             # Limit concurrent refills to 8 or the actual need
+             refills_to_submit = min(needed_refills, 8)
+
+             if refills_to_submit > 0:
+                 app.logger.info(f"Cache hit: Submitting {refills_to_submit} background task(s) to refill cache (current size: {current_cache_size}, target: {TTS_CACHE_SIZE}).")
+                 for _ in range(refills_to_submit):
+                     # Pass None to signal replacement selection within the task
+                     cache_executor.submit(_generate_cache_entry_task, None)
+             else:
+                 app.logger.info(f"Cache hit: Cache is already full or at target size ({current_cache_size}/{TTS_CACHE_SIZE}). No refill tasks submitted.")
+             # --- End Refill Trigger ---
+
+     if cache_hit and session_data_from_cache:
+         # Return response using cached data
+         # Note: The files are now managed by the session lifecycle (cleanup_session)
+         return jsonify(
+             {
+                 "session_id": session_id,
+                 "audio_a": f"/api/tts/audio/{session_id}/a",
+                 "audio_b": f"/api/tts/audio/{session_id}/b",
+                 "expires_in": 1800,  # 30 minutes in seconds
+                 "cache_hit": True,
+             }
+         )
+     # --- End Cache Check ---
+
+     # --- Cache Miss: Generate on the fly ---
+     app.logger.info(f"TTS Cache MISS for: '{text[:50]}...'. Generating on the fly.")
+     available_models = Model.query.filter_by(
+         model_type=ModelType.TTS, is_active=True
+     ).all()
+     if len(available_models) < 2:
+         return jsonify({"error": "Not enough TTS models available"}), 500
+
+     selected_models = get_weighted_random_models(available_models, 2, ModelType.TTS)
+
+     try:
+         audio_files = []
+         model_ids = []
+
+         # Function to process a single model (generate directly to TEMP_AUDIO_DIR, not cache subdir)
+         def process_model_on_the_fly(model):
+             # Generate and save directly to the main temp dir
+             # Assume predict_tts handles saving temporary files
+             temp_audio_path = predict_tts(text, model.id)
+             if not temp_audio_path or not os.path.exists(temp_audio_path):
+                 raise ValueError(f"predict_tts failed for model {model.id}")
+
+             # Create a unique name in the main TEMP_AUDIO_DIR for the session
+             file_uuid = str(uuid.uuid4())
+             dest_path = os.path.join(TEMP_AUDIO_DIR, f"{file_uuid}.wav")
+             shutil.move(temp_audio_path, dest_path)  # Move from predict_tts's temp location
+
+             return {"model_id": model.id, "audio_path": dest_path}
+
+         # Use ThreadPoolExecutor to process models concurrently
+         with ThreadPoolExecutor(max_workers=2) as executor:
+             results = list(executor.map(process_model_on_the_fly, selected_models))
+
+         # Extract results
+         for result in results:
+             model_ids.append(result["model_id"])
+             audio_files.append(result["audio_path"])
+
+         # Create session
+         session_id = str(uuid.uuid4())
+         app.tts_sessions[session_id] = {
+             "model_a": model_ids[0],
+             "model_b": model_ids[1],
+             "audio_a": audio_files[0],  # Paths are now from TEMP_AUDIO_DIR directly
+             "audio_b": audio_files[1],
+             "text": text,
+             "created_at": datetime.utcnow(),
+             "expires_at": datetime.utcnow() + timedelta(minutes=30),
+             "voted": False,
+             "cache_hit": False,
+         }
+
+         # Mark sentence as consumed for direct usage
+         mark_sentence_consumed(text, session_id=session_id, usage_type='direct')
+
+         # Return audio file paths and session
+         return jsonify(
+             {
+                 "session_id": session_id,
+                 "audio_a": f"/api/tts/audio/{session_id}/a",
+                 "audio_b": f"/api/tts/audio/{session_id}/b",
+                 "expires_in": 1800,
+                 "cache_hit": False,
+             }
+         )
+
+     except Exception as e:
+         app.logger.error(f"TTS on-the-fly generation error: {str(e)}", exc_info=True)
+         # Cleanup any files potentially created during the failed attempt
+         if 'results' in locals():
+             for res in results:
+                 if 'audio_path' in res and os.path.exists(res['audio_path']):
+                     try:
+                         os.remove(res['audio_path'])
+                     except OSError:
+                         pass
+         return jsonify({"error": "Failed to generate TTS"}), 500
+     # --- End Cache Miss ---
+
+
+ @app.route("/api/tts/audio/<session_id>/<model_key>")
+ def get_audio(session_id, model_key):
+     # If verification not setup, handle it first
+     if app.config["TURNSTILE_ENABLED"] and not session.get("turnstile_verified"):
+         return jsonify({"error": "Turnstile verification required"}), 403
+
+     if session_id not in app.tts_sessions:
+         return jsonify({"error": "Invalid or expired session"}), 404
+
+     session_data = app.tts_sessions[session_id]
+
+     # Check if session expired
+     if datetime.utcnow() > session_data["expires_at"]:
+         cleanup_session(session_id)
+         return jsonify({"error": "Session expired"}), 410
+
+     if model_key == "a":
+         audio_path = session_data["audio_a"]
+     elif model_key == "b":
+         audio_path = session_data["audio_b"]
+     else:
+         return jsonify({"error": "Invalid model key"}), 400
+
+     # Check if file exists
+     if not os.path.exists(audio_path):
+         return jsonify({"error": "Audio file not found"}), 404
+
+     return send_file(audio_path, mimetype="audio/wav")
+
+
+ @app.route("/api/tts/vote", methods=["POST"])
+ @limiter.limit("30 per minute")
+ def submit_vote():
+     # If verification not setup, handle it first
+     if app.config["TURNSTILE_ENABLED"] and not session.get("turnstile_verified"):
+         return jsonify({"error": "Turnstile verification required"}), 403
+
+     # Require user to be logged in to vote
+     if not current_user.is_authenticated:
+         return jsonify({"error": "You must be logged in to vote"}), 401
+
+     # Security checks for vote manipulation prevention
+     client_ip = get_client_ip()
+     vote_allowed, security_reason, security_score = is_vote_allowed(current_user.id, client_ip)
+
+     if not vote_allowed:
+         app.logger.warning(f"Vote blocked for user {current_user.username} (ID: {current_user.id}): {security_reason} (Score: {security_score})")
+         return jsonify({"error": f"Vote not allowed: {security_reason}"}), 403
+
+     data = request.json
+     session_id = data.get("session_id")
+     chosen_model_key = data.get("chosen_model")  # "a" or "b"
+
+     if not session_id or session_id not in app.tts_sessions:
+         return jsonify({"error": "Invalid or expired session"}), 404
+
+     if not chosen_model_key or chosen_model_key not in ["a", "b"]:
+         return jsonify({"error": "Invalid chosen model"}), 400
+
+     session_data = app.tts_sessions[session_id]
+
+     # Check if session expired
+     if datetime.utcnow() > session_data["expires_at"]:
+         cleanup_session(session_id)
+         return jsonify({"error": "Session expired"}), 410
+
+     # Check if already voted
+     if session_data["voted"]:
+         return jsonify({"error": "Vote already submitted for this session"}), 400
+
+     # Get model IDs and audio paths
+     chosen_id = (
+         session_data["model_a"] if chosen_model_key == "a" else session_data["model_b"]
+     )
+     rejected_id = (
+         session_data["model_b"] if chosen_model_key == "a" else session_data["model_a"]
+     )
+     chosen_audio_path = (
+         session_data["audio_a"] if chosen_model_key == "a" else session_data["audio_b"]
+     )
+     rejected_audio_path = (
+         session_data["audio_b"] if chosen_model_key == "a" else session_data["audio_a"]
+     )
+
+     # Calculate session duration and gather analytics data
+     vote_time = datetime.utcnow()
+     session_duration = (vote_time - session_data["created_at"]).total_seconds()
+     client_ip = get_client_ip()
+     user_agent = request.headers.get('User-Agent')
+     cache_hit = session_data.get("cache_hit", False)
+
+     # Record vote in database with analytics data
+     vote, error = record_vote(
+         current_user.id,
+         session_data["text"],
+         chosen_id,
+         rejected_id,
+         ModelType.TTS,
+         session_duration=session_duration,
+         ip_address=client_ip,
+         user_agent=user_agent,
+         generation_date=session_data["created_at"],
+         cache_hit=cache_hit,
+         all_dataset_sentences=all_harvard_sentences
+     )
+
+     if error:
+         return jsonify({"error": error}), 500
+
+     # Mark sentence as consumed AFTER successful vote recording (only for dataset sentences)
+     if vote and vote.sentence_origin == 'dataset' and vote.counts_for_public_leaderboard:
+         try:
+             mark_sentence_consumed(session_data["text"], session_id=session_id, usage_type='voted')
+             app.logger.info(f"Marked dataset sentence as consumed after vote: '{session_data['text'][:50]}...'")
+         except Exception as e:
+             app.logger.error(f"Error marking sentence as consumed after vote: {str(e)}")
+
+     # --- Save preference data ---
+     try:
+         vote_uuid = str(uuid.uuid4())
+         vote_dir = os.path.join("./votes", vote_uuid)
+         os.makedirs(vote_dir, exist_ok=True)
+
+         # Copy audio files
+         shutil.copy(chosen_audio_path, os.path.join(vote_dir, "chosen.wav"))
+         shutil.copy(rejected_audio_path, os.path.join(vote_dir, "rejected.wav"))
+
+         # Create metadata
+         chosen_model_obj = Model.query.get(chosen_id)
+         rejected_model_obj = Model.query.get(rejected_id)
+         metadata = {
+             "text": session_data["text"],
+             "chosen_model": chosen_model_obj.name if chosen_model_obj else "Unknown",
+             "chosen_model_id": chosen_model_obj.id if chosen_model_obj else "Unknown",
+             "rejected_model": rejected_model_obj.name if rejected_model_obj else "Unknown",
+             "rejected_model_id": rejected_model_obj.id if rejected_model_obj else "Unknown",
+             "session_id": session_id,
+             "timestamp": datetime.utcnow().isoformat(),
+             "username": current_user.username,
+             "model_type": "TTS"
+         }
+         with open(os.path.join(vote_dir, "metadata.json"), "w") as f:
+             json.dump(metadata, f, indent=2)
+
+     except Exception as e:
+         app.logger.error(f"Error saving preference data for vote {session_id}: {str(e)}")
+         # Continue even if saving preference data fails, vote is already recorded
+
+     # Mark session as voted
+     session_data["voted"] = True
+
+     # Check for coordinated voting campaigns (async to not slow down response)
+     try:
+         from threading import Thread
+         campaign_check_thread = Thread(target=check_for_coordinated_campaigns)
+         campaign_check_thread.daemon = True
+         campaign_check_thread.start()
+     except Exception as e:
+         app.logger.error(f"Error starting coordinated campaign check thread: {str(e)}")
+
+     # Return updated models (use previously fetched objects)
+     return jsonify(
+         {
+             "success": True,
+             "chosen_model": {"id": chosen_id, "name": chosen_model_obj.name if chosen_model_obj else "Unknown"},
+             "rejected_model": {
+                 "id": rejected_id,
+                 "name": rejected_model_obj.name if rejected_model_obj else "Unknown",
+             },
+             "names": {
+                 "a": (
+                     chosen_model_obj.name if chosen_model_key == "a" else rejected_model_obj.name
+                     if chosen_model_obj and rejected_model_obj else "Unknown"
+                 ),
+                 "b": (
+                     rejected_model_obj.name if chosen_model_key == "a" else chosen_model_obj.name
+                     if chosen_model_obj and rejected_model_obj else "Unknown"
+                 ),
+             },
+         }
+     )
+
+
+ def cleanup_session(session_id):
+     """Remove session and its audio files"""
+     if session_id in app.tts_sessions:
+         session = app.tts_sessions[session_id]
+
+         # Remove audio files
+         for audio_file in [session["audio_a"], session["audio_b"]]:
+             if os.path.exists(audio_file):
+                 try:
+                     os.remove(audio_file)
+                 except Exception as e:
+                     app.logger.error(f"Error removing audio file: {str(e)}")
+
+         # Remove session
+         del app.tts_sessions[session_id]
+
+
+ @app.route("/api/conversational/generate", methods=["POST"])
+ @limiter.limit("5 per minute")
+ def generate_podcast():
+     # If verification not setup, handle it first
+     if app.config["TURNSTILE_ENABLED"] and not session.get("turnstile_verified"):
+         return jsonify({"error": "Turnstile verification required"}), 403
+
+     # Require user to be logged in to generate audio
+     if not current_user.is_authenticated:
+         return jsonify({"error": "You must be logged in to generate audio"}), 401
+
+     data = request.json
+     script = data.get("script")
+
+     if not script or not isinstance(script, list) or len(script) < 2:
+         return jsonify({"error": "Invalid script format or too short"}), 400
+
+     # Validate script format
+     for line in script:
+         if not isinstance(line, dict) or "text" not in line or "speaker_id" not in line:
+             return (
+                 jsonify(
+                     {
+                         "error": "Invalid script line format. Each line must have text and speaker_id"
+                     }
+                 ),
+                 400,
+             )
+         if (
+             not line["text"]
+             or not isinstance(line["speaker_id"], int)
+             or line["speaker_id"] not in [0, 1]
+         ):
+             return (
+                 jsonify({"error": "Invalid script content. Speaker ID must be 0 or 1"}),
+                 400,
+             )
+
+     # Get two conversational models (currently only CSM and PlayDialog)
+     available_models = Model.query.filter_by(
+         model_type=ModelType.CONVERSATIONAL, is_active=True
+     ).all()
+
+     if len(available_models) < 2:
+         return jsonify({"error": "Not enough conversational models available"}), 500
+
+     selected_models = get_weighted_random_models(available_models, 2, ModelType.CONVERSATIONAL)
+
+     try:
+         # Generate audio for both models concurrently
+         audio_files = []
+         model_ids = []
+
+         # Function to process a single model
+         def process_model(model):
+             # Call conversational TTS service
+             audio_content = predict_tts(script, model.id)
+
+             # Save to temp file with unique name
+             file_uuid = str(uuid.uuid4())
+             dest_path = os.path.join(TEMP_AUDIO_DIR, f"{file_uuid}.wav")
+
+             with open(dest_path, "wb") as f:
+                 f.write(audio_content)
+
+             return {"model_id": model.id, "audio_path": dest_path}
+
+         # Use ThreadPoolExecutor to process models concurrently
+         with ThreadPoolExecutor(max_workers=2) as executor:
+             results = list(executor.map(process_model, selected_models))
+
+         # Extract results
+         for result in results:
+             model_ids.append(result["model_id"])
+             audio_files.append(result["audio_path"])
+
+         # Create session
+         session_id = str(uuid.uuid4())
+         script_text = " ".join([line["text"] for line in script])
+         app.conversational_sessions[session_id] = {
+             "model_a": model_ids[0],
+             "model_b": model_ids[1],
+             "audio_a": audio_files[0],
+             "audio_b": audio_files[1],
+             "text": script_text[:1000],  # Limit text length
+             "created_at": datetime.utcnow(),
+             "expires_at": datetime.utcnow() + timedelta(minutes=30),
+             "voted": False,
+             "script": script,
+             "cache_hit": False,  # Conversational is always generated on-demand
+         }
+
+         # Return audio file paths and session
+         return jsonify(
+             {
+                 "session_id": session_id,
+                 "audio_a": f"/api/conversational/audio/{session_id}/a",
+                 "audio_b": f"/api/conversational/audio/{session_id}/b",
+                 "expires_in": 1800,  # 30 minutes in seconds
+             }
+         )
+
+     except Exception as e:
+         app.logger.error(f"Conversational generation error: {str(e)}")
+         return jsonify({"error": f"Failed to generate podcast: {str(e)}"}), 500
+
+
+ @app.route("/api/conversational/audio/<session_id>/<model_key>")
+ def get_podcast_audio(session_id, model_key):
+     # If verification not setup, handle it first
+     if app.config["TURNSTILE_ENABLED"] and not session.get("turnstile_verified"):
+         return jsonify({"error": "Turnstile verification required"}), 403
+
+     if session_id not in app.conversational_sessions:
+         return jsonify({"error": "Invalid or expired session"}), 404
+
+     session_data = app.conversational_sessions[session_id]
+
+     # Check if session expired
+     if datetime.utcnow() > session_data["expires_at"]:
+         cleanup_conversational_session(session_id)
+         return jsonify({"error": "Session expired"}), 410
+
+     if model_key == "a":
+         audio_path = session_data["audio_a"]
+     elif model_key == "b":
+         audio_path = session_data["audio_b"]
+     else:
+         return jsonify({"error": "Invalid model key"}), 400
+
+     # Check if file exists
+     if not os.path.exists(audio_path):
+         return jsonify({"error": "Audio file not found"}), 404
+
+     return send_file(audio_path, mimetype="audio/wav")
+
+
+ @app.route("/api/conversational/vote", methods=["POST"])
+ @limiter.limit("30 per minute")
+ def submit_podcast_vote():
+     # If verification not setup, handle it first
+     if app.config["TURNSTILE_ENABLED"] and not session.get("turnstile_verified"):
+         return jsonify({"error": "Turnstile verification required"}), 403
+
+     # Require user to be logged in to vote
+     if not current_user.is_authenticated:
+         return jsonify({"error": "You must be logged in to vote"}), 401
+
+     # Security checks for vote manipulation prevention
+     client_ip = get_client_ip()
+     vote_allowed, security_reason, security_score = is_vote_allowed(current_user.id, client_ip)
+
+     if not vote_allowed:
+         app.logger.warning(f"Conversational vote blocked for user {current_user.username} (ID: {current_user.id}): {security_reason} (Score: {security_score})")
+         return jsonify({"error": f"Vote not allowed: {security_reason}"}), 403
+
+     data = request.json
+     session_id = data.get("session_id")
+     chosen_model_key = data.get("chosen_model")  # "a" or "b"
+
+     if not session_id or session_id not in app.conversational_sessions:
+         return jsonify({"error": "Invalid or expired session"}), 404
+
+     if not chosen_model_key or chosen_model_key not in ["a", "b"]:
+         return jsonify({"error": "Invalid chosen model"}), 400
+
+     session_data = app.conversational_sessions[session_id]
+
+     # Check if session expired
+     if datetime.utcnow() > session_data["expires_at"]:
+         cleanup_conversational_session(session_id)
+         return jsonify({"error": "Session expired"}), 410
+
+     # Check if already voted
+     if session_data["voted"]:
+         return jsonify({"error": "Vote already submitted for this session"}), 400
+
+     # Get model IDs and audio paths
+     chosen_id = (
+         session_data["model_a"] if chosen_model_key == "a" else session_data["model_b"]
+     )
+     rejected_id = (
+         session_data["model_b"] if chosen_model_key == "a" else session_data["model_a"]
+     )
+     chosen_audio_path = (
+         session_data["audio_a"] if chosen_model_key == "a" else session_data["audio_b"]
+     )
+     rejected_audio_path = (
+         session_data["audio_b"] if chosen_model_key == "a" else session_data["audio_a"]
+     )
+
+     # Calculate session duration and gather analytics data
+     vote_time = datetime.utcnow()
+     session_duration = (vote_time - session_data["created_at"]).total_seconds()
+     client_ip = get_client_ip()
+     user_agent = request.headers.get('User-Agent')
+     cache_hit = session_data.get("cache_hit", False)
+
+     # Record vote in database with analytics data
+     vote, error = record_vote(
+         current_user.id,
+         session_data["text"],
+         chosen_id,
+         rejected_id,
+         ModelType.CONVERSATIONAL,
+         session_duration=session_duration,
+         ip_address=client_ip,
+         user_agent=user_agent,
+         generation_date=session_data["created_at"],
+         cache_hit=cache_hit,
+         all_dataset_sentences=all_harvard_sentences  # Note: conversational uses scripts, not sentences
+     )
+
+     if error:
+         return jsonify({"error": error}), 500
+
+     # Mark sentence as consumed AFTER successful vote recording (only for dataset sentences)
+     # Note: Conversational votes typically use custom scripts, not dataset sentences
+     if vote and vote.sentence_origin == 'dataset' and vote.counts_for_public_leaderboard:
+         try:
+             mark_sentence_consumed(session_data["text"], session_id=session_id, usage_type='voted')
+             app.logger.info(f"Marked dataset sentence as consumed after conversational vote: '{session_data['text'][:50]}...'")
+         except Exception as e:
+             app.logger.error(f"Error marking sentence as consumed after conversational vote: {str(e)}")
+
+     # --- Save preference data ---
+     try:
+         vote_uuid = str(uuid.uuid4())
+         vote_dir = os.path.join("./votes", vote_uuid)
+         os.makedirs(vote_dir, exist_ok=True)
+
+         # Copy audio files
+         shutil.copy(chosen_audio_path, os.path.join(vote_dir, "chosen.wav"))
+         shutil.copy(rejected_audio_path, os.path.join(vote_dir, "rejected.wav"))
+
+         # Create metadata
+         chosen_model_obj = Model.query.get(chosen_id)
+         rejected_model_obj = Model.query.get(rejected_id)
+         metadata = {
+             "script": session_data["script"],  # Save the full script
+             "chosen_model": chosen_model_obj.name if chosen_model_obj else "Unknown",
+             "chosen_model_id": chosen_model_obj.id if chosen_model_obj else "Unknown",
+             "rejected_model": rejected_model_obj.name if rejected_model_obj else "Unknown",
+             "rejected_model_id": rejected_model_obj.id if rejected_model_obj else "Unknown",
+             "session_id": session_id,
+             "timestamp": datetime.utcnow().isoformat(),
+             "username": current_user.username,
+             "model_type": "CONVERSATIONAL"
+         }
+         with open(os.path.join(vote_dir, "metadata.json"), "w") as f:
+             json.dump(metadata, f, indent=2)
+
+     except Exception as e:
+         app.logger.error(f"Error saving preference data for conversational vote {session_id}: {str(e)}")
+         # Continue even if saving preference data fails, vote is already recorded
+
+     # Mark session as voted
+     session_data["voted"] = True
+
+     # Check for coordinated voting campaigns (async to not slow down response)
+     try:
+         from threading import Thread
+         campaign_check_thread = Thread(target=check_for_coordinated_campaigns)
+         campaign_check_thread.daemon = True
+         campaign_check_thread.start()
+     except Exception as e:
+         app.logger.error(f"Error starting coordinated campaign check thread: {str(e)}")
+
+     # Return updated models (use previously fetched objects)
+     return jsonify(
+         {
+             "success": True,
+             "chosen_model": {"id": chosen_id, "name": chosen_model_obj.name if chosen_model_obj else "Unknown"},
+             "rejected_model": {
+                 "id": rejected_id,
+                 "name": rejected_model_obj.name if rejected_model_obj else "Unknown",
+             },
+             "names": {
+                 "a": Model.query.get(session_data["model_a"]).name,
+                 "b": Model.query.get(session_data["model_b"]).name,
+             },
+         }
+     )
1204
+
1205
+
1206
+ def cleanup_conversational_session(session_id):
1207
+ """Remove conversational session and its audio files"""
1208
+ if session_id in app.conversational_sessions:
1209
+ session = app.conversational_sessions[session_id]
1210
+
1211
+ # Remove audio files
1212
+ for audio_file in [session["audio_a"], session["audio_b"]]:
1213
+ if os.path.exists(audio_file):
1214
+ try:
1215
+ os.remove(audio_file)
1216
+ except Exception as e:
1217
+ app.logger.error(
1218
+ f"Error removing conversational audio file: {str(e)}"
1219
+ )
1220
+
1221
+ # Remove session
1222
+ del app.conversational_sessions[session_id]
1223
+
1224
+
1225
+ # Schedule periodic cleanup
1226
+ def setup_cleanup():
1227
+ def cleanup_expired_sessions():
1228
+ with app.app_context(): # Ensure app context for logging
1229
+ current_time = datetime.utcnow()
1230
+ # Cleanup TTS sessions
1231
+ expired_tts_sessions = [
1232
+ sid
1233
+ for sid, session_data in app.tts_sessions.items()
1234
+ if current_time > session_data["expires_at"]
1235
+ ]
1236
+ for sid in expired_tts_sessions:
1237
+ cleanup_session(sid)
1238
+
1239
+ # Cleanup conversational sessions
1240
+ expired_conv_sessions = [
1241
+ sid
1242
+ for sid, session_data in app.conversational_sessions.items()
1243
+ if current_time > session_data["expires_at"]
1244
+ ]
1245
+ for sid in expired_conv_sessions:
1246
+ cleanup_conversational_session(sid)
1247
+ app.logger.info(f"Cleaned up {len(expired_tts_sessions)} TTS and {len(expired_conv_sessions)} conversational sessions.")
1248
+
1249
+ # Also cleanup potentially expired cache entries (e.g., > 1 hour old)
1250
+ # This prevents stale cache entries if generation is slow or failing
1251
+ # cleanup_stale_cache_entries()
1252
+
1253
+ # Run cleanup every 15 minutes
1254
+ scheduler = BackgroundScheduler(daemon=True) # Run scheduler as daemon thread
1255
+ scheduler.add_job(cleanup_expired_sessions, "interval", minutes=15)
1256
+ scheduler.start()
1257
+ print("Cleanup scheduler started") # Use print for startup messages
+
+
+# Schedule periodic tasks (database sync and preference upload)
+def setup_periodic_tasks():
+    """Set up periodic database synchronization and preference data upload for Spaces"""
+    if not IS_SPACES:
+        return
+
+    db_path = app.config["SQLALCHEMY_DATABASE_URI"].replace("sqlite:///", "instance/")  # Get relative path
+    preferences_repo_id = "TTS-AGI/arena-v2-preferences"
+    database_repo_id = "TTS-AGI/database-arena-v2"
+    votes_dir = "./votes"
+
+    def sync_database():
+        """Uploads the database to the HF dataset"""
+        with app.app_context():  # Ensure app context for logging
+            try:
+                if not os.path.exists(db_path):
+                    app.logger.warning(f"Database file not found at {db_path}, skipping sync.")
+                    return
+
+                api = HfApi(token=os.getenv("HF_TOKEN"))
+                api.upload_file(
+                    path_or_fileobj=db_path,
+                    path_in_repo="tts_arena.db",
+                    repo_id=database_repo_id,
+                    repo_type="dataset",
+                )
+                app.logger.info(f"Database uploaded to {database_repo_id} at {datetime.utcnow()}")
+            except Exception as e:
+                app.logger.error(f"Error uploading database to {database_repo_id}: {str(e)}")
+
+    def sync_preferences_data():
+        """Zips and uploads preference data folders in batches to the HF dataset"""
+        with app.app_context():  # Ensure app context for logging
+            if not os.path.isdir(votes_dir):
+                return  # Don't log every 5 mins if dir doesn't exist yet
+
+            temp_batch_dir = None  # Initialize to manage cleanup
+            temp_individual_zip_dir = None  # Initialize for individual zips
+            local_batch_zip_path = None  # Initialize for batch zip path
+            upload_failed = False  # Track failure explicitly: `except ... as e` names are unbound again by the time `finally` runs
+
+            try:
+                api = HfApi(token=os.getenv("HF_TOKEN"))
+                vote_uuids = [d for d in os.listdir(votes_dir) if os.path.isdir(os.path.join(votes_dir, d))]
+
+                if not vote_uuids:
+                    return  # No data to process
+
+                app.logger.info(f"Found {len(vote_uuids)} vote directories to process.")
+
+                # Create temporary directories
+                temp_batch_dir = tempfile.mkdtemp(prefix="hf_batch_")
+                temp_individual_zip_dir = tempfile.mkdtemp(prefix="hf_indiv_zips_")
+                app.logger.debug(f"Created temp directories: {temp_batch_dir}, {temp_individual_zip_dir}")
+
+                processed_vote_dirs = []
+                individual_zips_in_batch = []
+
+                # 1. Create individual zips and move them to the batch directory
+                for vote_uuid in vote_uuids:
+                    dir_path = os.path.join(votes_dir, vote_uuid)
+                    individual_zip_base_path = os.path.join(temp_individual_zip_dir, vote_uuid)
+                    individual_zip_path = f"{individual_zip_base_path}.zip"
+
+                    try:
+                        shutil.make_archive(individual_zip_base_path, 'zip', dir_path)
+                        app.logger.debug(f"Created individual zip: {individual_zip_path}")
+
+                        # Move the created zip into the batch directory
+                        final_individual_zip_path = os.path.join(temp_batch_dir, f"{vote_uuid}.zip")
+                        shutil.move(individual_zip_path, final_individual_zip_path)
+                        app.logger.debug(f"Moved individual zip to batch dir: {final_individual_zip_path}")
+
+                        processed_vote_dirs.append(dir_path)  # Mark original dir for later cleanup
+                        individual_zips_in_batch.append(final_individual_zip_path)
+
+                    except Exception as zip_err:
+                        app.logger.error(f"Error creating or moving zip for {vote_uuid}: {str(zip_err)}")
+                        # Clean up partial zip if it exists
+                        if os.path.exists(individual_zip_path):
+                            try:
+                                os.remove(individual_zip_path)
+                            except OSError:
+                                pass
+                        # Continue processing other votes
+
+                # Clean up the temporary dir used for creating individual zips
+                shutil.rmtree(temp_individual_zip_dir)
+                temp_individual_zip_dir = None  # Mark as cleaned
+                app.logger.debug("Cleaned up temporary individual zip directory.")
+
+                if not individual_zips_in_batch:
+                    app.logger.warning("No individual zips were successfully created for batching.")
+                    # Clean up batch dir if it's empty or only contains failed attempts
+                    if temp_batch_dir and os.path.exists(temp_batch_dir):
+                        shutil.rmtree(temp_batch_dir)
+                        temp_batch_dir = None
+                    return
+
+                # 2. Create the batch zip file
+                now = datetime.utcnow()
+                batch_timestamp = now.strftime("%Y%m%d_%H%M%S")
+                batch_uuid_short = str(uuid.uuid4())[:8]
+                batch_zip_filename = f"{batch_timestamp}_batch_{batch_uuid_short}.zip"
+                # Create batch zip in a standard temp location first
+                local_batch_zip_base = os.path.join(tempfile.gettempdir(), batch_zip_filename.replace('.zip', ''))
+                local_batch_zip_path = f"{local_batch_zip_base}.zip"
+
+                app.logger.info(f"Creating batch zip: {local_batch_zip_path} with {len(individual_zips_in_batch)} individual zips.")
+                shutil.make_archive(local_batch_zip_base, 'zip', temp_batch_dir)
+                app.logger.info(f"Batch zip created successfully: {local_batch_zip_path}")
+
+                # 3. Upload the batch zip file (year/month derived from the same timestamp as the filename)
+                hf_repo_path = f"votes/{now.year}/{now.month:02d}/{batch_zip_filename}"
+                app.logger.info(f"Uploading batch zip to HF Hub: {preferences_repo_id}/{hf_repo_path}")
+
+                api.upload_file(
+                    path_or_fileobj=local_batch_zip_path,
+                    path_in_repo=hf_repo_path,
+                    repo_id=preferences_repo_id,
+                    repo_type="dataset",
+                    commit_message=f"Add batch preference data {batch_zip_filename} ({len(individual_zips_in_batch)} votes)"
+                )
+                app.logger.info(f"Successfully uploaded batch {batch_zip_filename} to {preferences_repo_id}")
+
+                # 4. Cleanup after successful upload
+                app.logger.info("Cleaning up local files after successful upload.")
+                # Remove original vote directories that were successfully zipped and uploaded
+                for dir_path in processed_vote_dirs:
+                    try:
+                        shutil.rmtree(dir_path)
+                        app.logger.debug(f"Removed original vote directory: {dir_path}")
+                    except OSError as e:
+                        app.logger.error(f"Error removing processed vote directory {dir_path}: {str(e)}")
+
+                # Remove the temporary batch directory (containing the individual zips)
+                shutil.rmtree(temp_batch_dir)
+                temp_batch_dir = None
+                app.logger.debug("Removed temporary batch directory.")
+
+                # Remove the local batch zip file
+                os.remove(local_batch_zip_path)
+                local_batch_zip_path = None
+                app.logger.debug("Removed local batch zip file.")
+
+                app.logger.info(f"Finished preference data sync. Uploaded batch {batch_zip_filename}.")
+
+            except Exception as e:
+                upload_failed = True
+                app.logger.error(f"Error during preference data batch sync: {str(e)}", exc_info=True)
+                # If the upload failed, the local batch zip might still exist; clean it up.
+                if local_batch_zip_path and os.path.exists(local_batch_zip_path):
+                    try:
+                        os.remove(local_batch_zip_path)
+                        app.logger.debug("Cleaned up local batch zip after failed upload.")
+                    except OSError as clean_err:
+                        app.logger.error(f"Error cleaning up batch zip after failed upload: {clean_err}")
+                # Do NOT remove temp_batch_dir if it exists; its contents will be retried next time.
+                # Do NOT remove original vote directories if upload failed.
+
+            finally:
+                # Final cleanup for temporary directories in case of unexpected exits
+                if temp_individual_zip_dir and os.path.exists(temp_individual_zip_dir):
+                    try:
+                        shutil.rmtree(temp_individual_zip_dir)
+                    except Exception as final_clean_err:
+                        app.logger.error(f"Error in final cleanup (indiv zips): {final_clean_err}")
+                # Only clean up the batch dir here if it wasn't kept intentionally after an upload failure
+                if temp_batch_dir and os.path.exists(temp_batch_dir):
+                    if not upload_failed:
+                        try:
+                            shutil.rmtree(temp_batch_dir)
+                        except Exception as final_clean_err:
+                            app.logger.error(f"Error in final cleanup (batch dir): {final_clean_err}")
+                    else:
+                        app.logger.warning("Keeping temporary batch directory due to upload failure for next attempt.")
+
+    # Schedule periodic tasks
+    scheduler = BackgroundScheduler()
+    # Sync database less frequently if needed, e.g., every 15 minutes
+    scheduler.add_job(sync_database, "interval", minutes=15, id="sync_db_job")
+    # Sync preferences more frequently
+    scheduler.add_job(sync_preferences_data, "interval", minutes=5, id="sync_pref_job")
+    scheduler.start()
+    print("Periodic tasks scheduler started (DB sync and Preferences upload)")  # Use print for startup
+
+
+@app.cli.command("init-db")
+def init_db():
+    """Initialize the database."""
+    with app.app_context():
+        db.create_all()
+        print("Database initialized!")
+
+
+@app.route("/api/toggle-leaderboard-visibility", methods=["POST"])
+def toggle_leaderboard_visibility():
+    """Toggle whether the current user appears in the top voters leaderboard"""
+    if not current_user.is_authenticated:
+        return jsonify({"error": "You must be logged in to change this setting"}), 401
+
+    new_status = toggle_user_leaderboard_visibility(current_user.id)
+    if new_status is None:
+        return jsonify({"error": "User not found"}), 404
+
+    return jsonify({
+        "success": True,
+        "visible": new_status,
+        "message": "You are now visible in the voters leaderboard" if new_status else "You are now hidden from the voters leaderboard"
+    })
+
+
+@app.route("/api/tts/cached-sentences")
+def get_cached_sentences():
+    """Returns a list of unconsumed sentences available for random selection."""
+    # Get unconsumed sentences from the full pool (not just cached ones)
+    unconsumed_sentences = get_unconsumed_sentences(all_harvard_sentences)
+
+    # Limit the response size to avoid overwhelming the frontend
+    max_sentences = 1000
+    if len(unconsumed_sentences) > max_sentences:
+        import random
+        unconsumed_sentences = random.sample(unconsumed_sentences, max_sentences)
+
+    return jsonify(unconsumed_sentences)
+
+
+@app.route("/api/tts/sentence-stats")
+def get_sentence_stats():
+    """Returns statistics about sentence consumption."""
+    total_sentences = len(all_harvard_sentences)
+    consumed_count = get_consumed_sentences_count()
+    remaining_count = total_sentences - consumed_count
+
+    return jsonify({
+        "total_sentences": total_sentences,
+        "consumed_sentences": consumed_count,
+        "remaining_sentences": remaining_count,
+        "consumption_percentage": round((consumed_count / total_sentences) * 100, 2) if total_sentences > 0 else 0
+    })
+
+
+@app.route("/api/tts/random-sentence")
+def get_random_sentence():
+    """Returns a random unconsumed sentence."""
+    random_sentence = get_random_unconsumed_sentence(all_harvard_sentences)
+    if random_sentence:
+        return jsonify({"sentence": random_sentence})
+    else:
+        total_sentences = len(all_harvard_sentences)
+        consumed_count = get_consumed_sentences_count()
+        return jsonify({
+            "error": "No unconsumed sentences available",
+            "details": f"All {total_sentences} sentences have been consumed ({consumed_count} total consumed)"
+        }), 404
+
+
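A quick smoke test of these endpoints; the host and port are assumptions (local dev server on 5000):

    import requests

    BASE = "http://localhost:5000"  # assumption: local dev server

    stats = requests.get(f"{BASE}/api/tts/sentence-stats").json()
    print(f"{stats['remaining_sentences']} of {stats['total_sentences']} sentences left "
          f"({stats['consumption_percentage']}% consumed)")

    resp = requests.get(f"{BASE}/api/tts/random-sentence")
    print(resp.json()["sentence"] if resp.ok else resp.json()["details"])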
+def get_weighted_random_models(
+    applicable_models: list[Model], num_to_select: int, model_type: ModelType
+) -> list[Model]:
+    """
+    Selects a specified number of models randomly from a list of applicable_models,
+    weighting models with fewer votes higher. A smoothing factor is used to ensure
+    the preference is slight and to prevent models with zero votes from being
+    overwhelmingly favored. Models are selected without replacement.
+
+    Assumes len(applicable_models) >= num_to_select, which should be checked by the caller.
+    """
+    model_votes_counts = {}
+    for model in applicable_models:
+        votes = (
+            Vote.query.filter(Vote.model_type == model_type)
+            .filter(or_(Vote.model_chosen == model.id, Vote.model_rejected == model.id))
+            .count()
+        )
+        model_votes_counts[model.id] = votes
+
+    weights = [
+        1.0 / (model_votes_counts[model.id] + SMOOTHING_FACTOR_MODEL_SELECTION)
+        for model in applicable_models
+    ]
+
+    selected_models_list = []
+    # Create copies to modify during the selection process
+    current_candidates = list(applicable_models)
+    current_weights = list(weights)
+
+    # Assumes num_to_select is positive and less than or equal to len(current_candidates);
+    # callers should ensure this (e.g., len(available_models) >= 2).
+    for _ in range(num_to_select):
+        if not current_candidates:  # Safety break
+            app.logger.warning("Not enough candidates left for weighted selection.")
+            break
+
+        chosen_model = random.choices(current_candidates, weights=current_weights, k=1)[0]
+        selected_models_list.append(chosen_model)
+
+        try:
+            idx_to_remove = current_candidates.index(chosen_model)
+            current_candidates.pop(idx_to_remove)
+            current_weights.pop(idx_to_remove)
+        except ValueError:
+            # This should not happen if chosen_model came from current_candidates.
+            app.logger.error(f"Error removing model {chosen_model.id} from weighted selection candidates.")
+            break  # Avoid potential issues
+
+    return selected_models_list
+
+
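The effect of the smoothing is easiest to see with concrete numbers. A self-contained sketch (the vote counts and smoothing value are illustrative; the app's actual constant is SMOOTHING_FACTOR_MODEL_SELECTION):

    import random

    votes = {"new_model": 0, "mid_model": 100, "old_model": 1000}
    SMOOTHING = 50  # illustrative value

    weights = [1.0 / (v + SMOOTHING) for v in votes.values()]
    # new_model: 1/50 = 0.0200, mid_model: 1/150 ≈ 0.0067, old_model: 1/1050 ≈ 0.0010
    # -> the unproven model is ~3x likelier than mid_model and ~21x likelier than
    #    old_model per draw, but established models are still selected regularly.
    print(random.choices(list(votes), weights=weights, k=1)[0])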
+def check_for_coordinated_campaigns():
+    """Check all active models for potential coordinated voting campaigns"""
+    try:
+        from security import detect_coordinated_voting
+        from models import Model, ModelType
+
+        # Check TTS models
+        tts_models = Model.query.filter_by(model_type=ModelType.TTS, is_active=True).all()
+        for model in tts_models:
+            try:
+                detect_coordinated_voting(model.id)
+            except Exception as e:
+                app.logger.error(f"Error checking coordinated voting for TTS model {model.id}: {str(e)}")
+
+        # Check conversational models
+        conv_models = Model.query.filter_by(model_type=ModelType.CONVERSATIONAL, is_active=True).all()
+        for model in conv_models:
+            try:
+                detect_coordinated_voting(model.id)
+            except Exception as e:
+                app.logger.error(f"Error checking coordinated voting for conversational model {model.id}: {str(e)}")
+
+    except Exception as e:
+        app.logger.error(f"Error in coordinated campaign check: {str(e)}")
+
 
 if __name__ == "__main__":
+    with app.app_context():
+        # Ensure ./instance and ./votes directories exist
+        os.makedirs("instance", exist_ok=True)
+        os.makedirs("./votes", exist_ok=True)  # Create votes directory if it doesn't exist
+        os.makedirs(CACHE_AUDIO_DIR, exist_ok=True)  # Ensure cache audio dir exists
+
+        # Clean up old cache audio files on startup
+        try:
+            app.logger.info(f"Clearing old cache audio files from {CACHE_AUDIO_DIR}")
+            for filename in os.listdir(CACHE_AUDIO_DIR):
+                file_path = os.path.join(CACHE_AUDIO_DIR, filename)
+                try:
+                    if os.path.isfile(file_path) or os.path.islink(file_path):
+                        os.unlink(file_path)
+                    elif os.path.isdir(file_path):
+                        shutil.rmtree(file_path)
+                except Exception as e:
+                    app.logger.error(f'Failed to delete {file_path}. Reason: {e}')
+        except Exception as e:
+            app.logger.error(f"Error clearing cache directory {CACHE_AUDIO_DIR}: {e}")
+
+        # Download database if it doesn't exist (only on initial Space start)
+        if IS_SPACES and not os.path.exists(app.config["SQLALCHEMY_DATABASE_URI"].replace("sqlite:///", "")):
+            try:
+                print("Database not found, downloading from HF dataset...")
+                hf_hub_download(
+                    repo_id="TTS-AGI/database-arena-v2",
+                    filename="tts_arena.db",
+                    repo_type="dataset",
+                    local_dir="instance",  # download to instance/
+                    token=os.getenv("HF_TOKEN"),
+                )
+                print("Database downloaded successfully ✅")
+            except Exception as e:
+                print(f"Error downloading database from HF dataset: {str(e)} ⚠️")
+
+        db.create_all()  # Create tables if they don't exist
+        insert_initial_models()
+        # Set up background tasks
+        initialize_tts_cache()  # Start populating the cache
+        setup_cleanup()
+        setup_periodic_tasks()
+
+    # Configure Flask to recognize HTTPS when behind a reverse proxy.
+    # ProxyFix reads X-Forwarded-Proto (and X-Forwarded-Host), so Flask
+    # generates correct URLs with the https scheme.
+    from werkzeug.middleware.proxy_fix import ProxyFix
+
+    app.wsgi_app = ProxyFix(app.wsgi_app, x_proto=1, x_host=1)
+
+    # Force Flask to prefer HTTPS for generated URLs
+    app.config["PREFERRED_URL_SCHEME"] = "https"
+
+    from waitress import serve
+
+    # Configuration for 2 vCPUs:
+    # - threads: typically 4-8 threads per CPU core is a good balance
+    # - connection_limit: maximum concurrent connections
+    # - channel_timeout: prevent hanging connections
+    threads = 12  # 6 threads per vCPU is a good balance for mixed IO/CPU workloads
+
+    if IS_SPACES:
+        serve(
+            app,
+            host="0.0.0.0",
+            port=int(os.environ.get("PORT", 7860)),
+            threads=threads,
+            connection_limit=100,
+            channel_timeout=30,
+            url_scheme='https'
+        )
+    else:
+        print(f"Starting Waitress server with {threads} threads")
+        serve(
+            app,
+            host="0.0.0.0",
+            port=5000,
+            threads=threads,
+            connection_limit=100,
+            channel_timeout=30,
+            url_scheme='https'  # Keep https for local dev if using proxy/tunnel
+        )
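A minimal, self-contained check of what ProxyFix does here, assuming a single trusted proxy hop that sets X-Forwarded-Proto (this is a sketch, not part of the app):

    from flask import Flask, request
    from werkzeug.middleware.proxy_fix import ProxyFix

    demo = Flask(__name__)
    demo.wsgi_app = ProxyFix(demo.wsgi_app, x_proto=1, x_host=1)

    @demo.route("/scheme")
    def scheme():
        return request.scheme

    with demo.test_client() as client:
        # The forwarded header is honored, so Flask sees https, not the internal http
        assert client.get("/scheme", headers={"X-Forwarded-Proto": "https"}).data == b"https"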
migrate.py CHANGED
@@ -103,6 +103,25 @@ def create_timeout_and_campaign_tables(cursor):
     else:
         click.echo("⏭️  Table 'user_timeout' already exists, skipping")
 
+    # Create consumed_sentence table
+    if not check_table_exists(cursor, "consumed_sentence"):
+        cursor.execute("""
+            CREATE TABLE consumed_sentence (
+                id INTEGER PRIMARY KEY,
+                sentence_hash VARCHAR(64) UNIQUE NOT NULL,
+                sentence_text TEXT NOT NULL,
+                consumed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+                session_id VARCHAR(100),
+                usage_type VARCHAR(20) NOT NULL
+            )
+        """)
+        # Create index on sentence_hash for performance
+        cursor.execute("CREATE INDEX IF NOT EXISTS ix_consumed_sentence_sentence_hash ON consumed_sentence (sentence_hash)")
+        tables_created.append("consumed_sentence")
+        click.echo("✅ Created table 'consumed_sentence' with index")
+    else:
+        click.echo("⏭️  Table 'consumed_sentence' already exists, skipping")
+
     return tables_created
 
 
@@ -129,12 +148,16 @@ def add_analytics_columns_and_tables(db_path):
         ("ip_address_partial", "VARCHAR(20)"),
         ("user_agent", "VARCHAR(500)"),
         ("generation_date", "DATETIME"),
-        ("cache_hit", "BOOLEAN")
+        ("cache_hit", "BOOLEAN"),
+        ("sentence_hash", "VARCHAR(64)"),
+        ("sentence_origin", "VARCHAR(20)"),
+        ("counts_for_public_leaderboard", "BOOLEAN DEFAULT 1")
     ]
 
     # Define the columns to add to user table
     user_columns_to_add = [
-        ("hf_account_created", "DATETIME")
+        ("hf_account_created", "DATETIME"),
+        ("show_in_leaderboard", "BOOLEAN DEFAULT 1")
     ]
 
     added_columns = []
@@ -176,6 +199,15 @@ def add_analytics_columns_and_tables(db_path):
     click.echo("🔒 Creating security and timeout management tables...")
     tables_created = create_timeout_and_campaign_tables(cursor)
 
+    # Create indexes for new columns
+    click.echo("📊 Creating indexes for performance...")
+    try:
+        # Index on vote.sentence_hash for origin tracking queries
+        cursor.execute("CREATE INDEX IF NOT EXISTS ix_vote_sentence_hash ON vote (sentence_hash)")
+        click.echo("✅ Created index on vote.sentence_hash")
+    except sqlite3.Error as e:
+        click.echo(f"⚠️  Note: Could not create vote.sentence_hash index: {e}")
+
     # Commit the changes
     conn.commit()
     conn.close()
@@ -206,10 +238,17 @@ def add_analytics_columns_and_tables(db_path):
     click.echo("\n🚨 New Security Features Enabled:")
     click.echo(" • Automatic coordinated voting campaign detection")
     click.echo(" • User timeout management")
+    click.echo(" • Sentence consumption tracking (no reuse)")
+    click.echo(" • Vote origin tracking (dataset vs custom)")
+    click.echo(" • Public leaderboard integrity protection")
     click.echo(" • Admin panels for security monitoring")
     click.echo("\nNew admin panel sections:")
     click.echo(" • /admin/timeouts - Manage user timeouts")
     click.echo(" • /admin/campaigns - View coordinated voting campaigns")
+    click.echo("\nLeaderboard Changes:")
+    click.echo(" • Public leaderboard: Only unconsumed dataset sentences count")
+    click.echo(" • Personal leaderboard: All votes (dataset + custom) included")
+    click.echo(" • Each sentence can only be used once for public rankings")
 
     return True
 
@@ -229,11 +268,18 @@ def migrate(database_path, dry_run, backup):
     """
     Add analytics columns and security tables to the TTS Arena database.
 
+    This migration adds:
+    - Vote analytics (session duration, IP, user agent, etc.)
+    - Sentence origin tracking (dataset vs custom)
+    - Sentence consumption tracking (prevent reuse)
+    - Security features (coordinated voting detection, user timeouts)
+    - Leaderboard integrity protection
+
     DATABASE_PATH: Path to the SQLite database file (e.g., instance/tts_arena.db)
     """
     click.echo("🚀 TTS Arena Migration Tool")
-    click.echo("Analytics + Security Features")
-    click.echo("=" * 40)
+    click.echo("Analytics + Security + Vote Origin Tracking")
+    click.echo("=" * 50)
 
     # Resolve the database path
     db_path = Path(database_path).resolve()
@@ -262,12 +308,20 @@ def migrate(database_path, dry_run, backup):
         click.echo(" • user_agent (VARCHAR(500))")
         click.echo(" • generation_date (DATETIME)")
        click.echo(" • cache_hit (BOOLEAN)")
+        click.echo(" • sentence_hash (VARCHAR(64))")
+        click.echo(" • sentence_origin (VARCHAR(20))")
+        click.echo(" • counts_for_public_leaderboard (BOOLEAN DEFAULT 1)")
         click.echo("\nThe following columns would be added to the 'user' table:")
         click.echo(" • hf_account_created (DATETIME)")
+        click.echo(" • show_in_leaderboard (BOOLEAN DEFAULT 1)")
         click.echo("\nThe following security tables would be created:")
         click.echo(" • coordinated_voting_campaign - Track detected voting campaigns")
         click.echo(" • campaign_participant - Track users involved in campaigns")
         click.echo(" • user_timeout - Manage user timeouts/bans")
+        click.echo(" • consumed_sentence - Track sentence usage for security")
+        click.echo("\nIndexes would be created:")
+        click.echo(" • ix_vote_sentence_hash - For vote origin tracking")
+        click.echo(" • ix_consumed_sentence_sentence_hash - For sentence consumption queries")
         click.echo("\nRun without --dry-run to apply changes.")
         return
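After the migration runs, the new table and indexes can be verified directly against SQLite (the path is the default suggested in the CLI help; adjust for your setup):

    import sqlite3

    conn = sqlite3.connect("instance/tts_arena.db")  # assumption: default DB location
    cur = conn.cursor()
    cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='consumed_sentence'")
    print("table:", cur.fetchone())
    cur.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'ix_%sentence_hash'")
    print("indexes:", cur.fetchall())
    conn.close()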
migrate_consumed_sentences.py ADDED
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+"""
+Migration script to add the ConsumedSentence table for tracking used sentences.
+Run this script once to update existing databases.
+"""
+
+import os
+import sys
+from flask import Flask
+from models import db, ConsumedSentence
+
+def create_app():
+    app = Flask(__name__)
+    app.config["SQLALCHEMY_DATABASE_URI"] = os.getenv(
+        "DATABASE_URI", "sqlite:///tts_arena.db"
+    )
+    app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
+
+    db.init_app(app)
+    return app
+
+def migrate():
+    app = create_app()
+
+    with app.app_context():
+        try:
+            # Create the ConsumedSentence table
+            db.create_all()
+            print("✅ Successfully created ConsumedSentence table")
+
+            # Check that the table was created
+            inspector = db.inspect(db.engine)
+            tables = inspector.get_table_names()
+
+            if 'consumed_sentence' in tables:
+                print("✅ ConsumedSentence table confirmed in database")
+            else:
+                print("❌ ConsumedSentence table not found after creation")
+
+        except Exception as e:
+            print(f"❌ Error during migration: {e}")
+            return False
+
+    return True
+
+if __name__ == "__main__":
+    print("Running ConsumedSentence table migration...")
+    if migrate():
+        print("Migration completed successfully!")
+    else:
+        print("Migration failed!")
+        sys.exit(1)
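Intended usage is a one-off run against an existing database, e.g. (the explicit DATABASE_URI value is an example; the script defaults to sqlite:///tts_arena.db):

    python migrate_consumed_sentences.py
    DATABASE_URI="sqlite:///instance/tts_arena.db" python migrate_consumed_sentences.py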
models.py CHANGED
@@ -4,6 +4,7 @@ from datetime import datetime, timedelta
 import math
 from sqlalchemy import func, text
 import logging
+import hashlib
 
 db = SQLAlchemy()
 
@@ -72,6 +73,11 @@ class Vote(db.Model):
     user_agent = db.Column(db.String(500), nullable=True)  # Browser/device info
     generation_date = db.Column(db.DateTime, nullable=True)  # When audio was generated
     cache_hit = db.Column(db.Boolean, nullable=True)  # Whether generation was from cache
+
+    # Sentence origin tracking
+    sentence_hash = db.Column(db.String(64), nullable=True, index=True)  # SHA-256 hash of the sentence
+    sentence_origin = db.Column(db.String(20), nullable=True)  # 'dataset', 'custom', 'unknown'
+    counts_for_public_leaderboard = db.Column(db.Boolean, default=True)  # Whether this vote counts for public leaderboard
 
     chosen = db.relationship(
         "Model",
@@ -174,6 +180,19 @@ class UserTimeout(db.Model):
         return f"<UserTimeout {self.user_id}: {self.timeout_type} until {self.expires_at}>"
 
 
+class ConsumedSentence(db.Model):
+    """Track sentences that have been used to ensure each sentence is only used once"""
+    id = db.Column(db.Integer, primary_key=True)
+    sentence_hash = db.Column(db.String(64), unique=True, nullable=False, index=True)  # SHA-256 hash
+    sentence_text = db.Column(db.Text, nullable=False)  # Store original text for debugging/admin purposes
+    consumed_at = db.Column(db.DateTime, default=datetime.utcnow)
+    session_id = db.Column(db.String(100), nullable=True)  # Track which session consumed it
+    usage_type = db.Column(db.String(20), nullable=False)  # 'cache', 'direct', 'random'
+
+    def __repr__(self):
+        return f"<ConsumedSentence {self.sentence_hash[:8]}...({self.usage_type})>"
+
+
 def calculate_elo_change(winner_elo, loser_elo, k_factor=32):
     """Calculate Elo rating changes for a match."""
     expected_winner = 1 / (1 + math.pow(10, (loser_elo - winner_elo) / 400))
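Plugging numbers into this formula shows the scale of a single public vote (standard Elo with the default K=32; a small worked example, not app code):

    import math

    def expected(winner_elo, loser_elo):
        return 1 / (1 + math.pow(10, (loser_elo - winner_elo) / 400))

    K = 32
    print(round(K * (1 - expected(1500, 1500))))  # evenly matched: winner gains 16
    print(round(K * (1 - expected(1400, 1600))))  # upset win over a stronger model: ~24
    print(round(K * (1 - expected(1600, 1400))))  # expected win over a weaker model: ~8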
@@ -214,8 +233,23 @@
 
 def record_vote(user_id, text, chosen_model_id, rejected_model_id, model_type,
                 session_duration=None, ip_address=None, user_agent=None,
-                generation_date=None, cache_hit=None):
+                generation_date=None, cache_hit=None, all_dataset_sentences=None):
     """Record a vote and update Elo ratings."""
+
+    # Determine sentence origin and whether it should count for public leaderboard
+    sentence_hash = hash_sentence(text)
+    sentence_origin = 'unknown'
+    counts_for_public = True
+
+    if all_dataset_sentences and text in all_dataset_sentences:
+        sentence_origin = 'dataset'
+        # Only count for public leaderboard if sentence was unconsumed when used
+        # Check if it was consumed BEFORE this vote (don't consume yet)
+        counts_for_public = not is_sentence_consumed(text)
+    else:
+        sentence_origin = 'custom'
+        counts_for_public = False  # Custom sentences never count for public leaderboard
+
     # Create the vote
     vote = Vote(
         user_id=user_id,  # Required - user must be logged in to vote
@@ -228,6 +262,9 @@ def record_vote(user_id, text, chosen_model_id, rejected_model_id, model_type,
         user_agent=user_agent[:500] if user_agent else None,  # Truncate if too long
         generation_date=generation_date,
         cache_hit=cache_hit,
+        sentence_hash=sentence_hash,
+        sentence_origin=sentence_origin,
+        counts_for_public_leaderboard=counts_for_public,
     )
     db.session.add(vote)
     db.session.flush()  # Get the vote ID without committing
@@ -244,18 +281,24 @@
         db.session.rollback()
         return None, "One or both models not found for the specified model type"
 
-    # Calculate new Elo ratings
-    new_chosen_elo, new_rejected_elo = calculate_elo_change(
-        chosen_model.current_elo, rejected_model.current_elo
-    )
+    # Only update Elo ratings and public stats if this vote counts for public leaderboard
+    if counts_for_public:
+        # Calculate new Elo ratings
+        new_chosen_elo, new_rejected_elo = calculate_elo_change(
+            chosen_model.current_elo, rejected_model.current_elo
+        )
 
-    # Update model stats
-    chosen_model.current_elo = new_chosen_elo
-    chosen_model.win_count += 1
-    chosen_model.match_count += 1
+        # Update model stats
+        chosen_model.current_elo = new_chosen_elo
+        chosen_model.win_count += 1
+        chosen_model.match_count += 1
 
-    rejected_model.current_elo = new_rejected_elo
-    rejected_model.match_count += 1
+        rejected_model.current_elo = new_rejected_elo
+        rejected_model.match_count += 1
+    else:
+        # For votes that don't count for public leaderboard, keep current Elo
+        new_chosen_elo = chosen_model.current_elo
+        new_rejected_elo = rejected_model.current_elo
 
     # Record Elo history
     chosen_history = EloHistory(
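The origin/eligibility decision above reduces to a small truth table; a DB-free sketch of the same logic (classify is a hypothetical stand-in, not an app function):

    def classify(text, dataset_sentences, already_consumed):
        """DB-free stand-in for the origin logic in record_vote."""
        if dataset_sentences and text in dataset_sentences:
            return "dataset", not already_consumed
        return "custom", False  # custom text never moves public Elo

    assert classify("A", {"A", "B"}, already_consumed=False) == ("dataset", True)
    assert classify("A", {"A", "B"}, already_consumed=True) == ("dataset", False)
    assert classify("Z", {"A", "B"}, already_consumed=False) == ("custom", False)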
@@ -281,6 +324,7 @@
 def get_leaderboard_data(model_type):
     """
     Get leaderboard data for the specified model type.
+    Only includes votes that count for the public leaderboard.
 
     Args:
         model_type (str): The model type ('tts' or 'conversational')
@@ -291,6 +335,7 @@
     query = Model.query.filter_by(model_type=model_type)
 
     # Get models with >1k votes ordered by ELO score
+    # Note: Model.match_count now only includes votes that count for public leaderboard
     models = query.filter(Model.match_count > 1000).order_by(Model.current_elo.desc()).all()
 
     result = []
@@ -325,6 +370,7 @@
 def get_user_leaderboard(user_id, model_type):
     """
     Get personalized leaderboard data for a specific user.
+    Includes ALL votes (both dataset and custom sentences).
 
     Args:
         user_id (int): The user ID
@@ -336,7 +382,7 @@
     # Get all models of the specified type
     models = Model.query.filter_by(model_type=model_type).all()
 
-    # Get user's votes
+    # Get user's votes (includes both public and custom sentence votes)
     user_votes = Vote.query.filter_by(user_id=user_id, model_type=model_type).all()
 
     # Calculate win counts and match counts for each model based on user's votes
@@ -415,17 +461,19 @@ def get_historical_leaderboard_data(model_type, target_date=None):
         if not elo_entry:
             continue
 
-        # Count wins and matches up to the target date
+        # Count wins and matches up to the target date (only public leaderboard votes)
         match_count = Vote.query.filter(
             db.or_(Vote.model_chosen == model.id, Vote.model_rejected == model.id),
             Vote.model_type == model_type,
             Vote.vote_date <= target_date,
+            Vote.counts_for_public_leaderboard == True,
         ).count()
 
         win_count = Vote.query.filter(
             Vote.model_chosen == model.id,
             Vote.model_type == model_type,
             Vote.vote_date <= target_date,
+            Vote.counts_for_public_leaderboard == True,
        ).count()
 
         # Calculate win rate
@@ -823,3 +871,69 @@ def resolve_campaign(campaign_id, resolved_by, status, admin_notes=None):
 
     db.session.commit()
     return True, "Campaign resolved successfully"
+
+
+def hash_sentence(sentence_text):
+    """Generate a SHA-256 hash for a sentence"""
+    return hashlib.sha256(sentence_text.strip().encode('utf-8')).hexdigest()
+
+
+def is_sentence_consumed(sentence_text):
+    """Check if a sentence has already been consumed"""
+    sentence_hash = hash_sentence(sentence_text)
+    return ConsumedSentence.query.filter_by(sentence_hash=sentence_hash).first() is not None
+
+
+def mark_sentence_consumed(sentence_text, session_id=None, usage_type='direct'):
+    """Mark a sentence as consumed"""
+    sentence_hash = hash_sentence(sentence_text)
+
+    # Check if already consumed
+    existing = ConsumedSentence.query.filter_by(sentence_hash=sentence_hash).first()
+    if existing:
+        return existing  # Already consumed
+
+    consumed_sentence = ConsumedSentence(
+        sentence_hash=sentence_hash,
+        sentence_text=sentence_text,
+        session_id=session_id,
+        usage_type=usage_type
+    )
+
+    db.session.add(consumed_sentence)
+    db.session.commit()
+    return consumed_sentence
+
+
+def get_unconsumed_sentences(sentence_pool):
+    """Filter a list of sentences to only include unconsumed ones"""
+    if not sentence_pool:
+        return []
+
+    # Get all consumed sentence hashes
+    consumed_hashes = set(
+        row[0] for row in db.session.query(ConsumedSentence.sentence_hash).all()
+    )
+
+    # Filter out consumed sentences
+    unconsumed = []
+    for sentence in sentence_pool:
+        if hash_sentence(sentence) not in consumed_hashes:
+            unconsumed.append(sentence)
+
+    return unconsumed
+
+
+def get_consumed_sentences_count():
+    """Get the total count of consumed sentences"""
+    return ConsumedSentence.query.count()
+
+
+def get_random_unconsumed_sentence(sentence_pool):
+    """Get a random unconsumed sentence from the pool"""
+    unconsumed = get_unconsumed_sentences(sentence_pool)
+    if not unconsumed:
+        return None
+
+    import random
+    return random.choice(unconsumed)
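A database-free sketch of the hash-based consumption check these helpers implement (the two sample strings are Harvard sentences; the set stands in for the ConsumedSentence table):

    import hashlib

    def hash_sentence(s):
        return hashlib.sha256(s.strip().encode("utf-8")).hexdigest()

    pool = [
        "The birch canoe slid on the smooth planks.",
        "Glue the sheet to the dark blue background.",
    ]
    consumed_hashes = {hash_sentence(pool[0])}  # stand-in for the ConsumedSentence table

    assert [s for s in pool if hash_sentence(s) not in consumed_hashes] == pool[1:]
    # strip() means padded resubmissions hash identically and stay consumed:
    assert hash_sentence("  same text ") == hash_sentence("same text")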
requirements.txt CHANGED
@@ -11,4 +11,5 @@ flask-migrate
 gunicorn
 waitress
 fal-client
-git+https://github.com/playht/pyht
+git+https://github.com/playht/pyht
+datasets
templates/arena.html CHANGED
@@ -1467,19 +1467,14 @@
         function handleRandom() {
             let selectedText = '';
             if (cachedSentences && cachedSentences.length > 0) {
-                // Select a random text from the cache
+                // Select a random text from the unconsumed sentences
                 selectedText = cachedSentences[Math.floor(Math.random() * cachedSentences.length)];
-                console.log("Using random sentence from cache.");
+                console.log("Using random sentence from unconsumed sentences.");
             } else {
-                // Fallback to the initial list if cache is empty or failed to load
-                console.log("Cache empty or unavailable, using random sentence from fallback list.");
-                if (fallbackRandomTexts && fallbackRandomTexts.length > 0) {
-                    selectedText = fallbackRandomTexts[Math.floor(Math.random() * fallbackRandomTexts.length)];
-                } else {
-                    // If fallback list is also empty, do nothing. Log an error.
-                    console.error("Both cached sentences and fallback sentences are unavailable.");
-                    return;
-                }
+                // No fallback to consumed sentences for security reasons
+                console.error("No unconsumed sentences available. All sentences may have been used.");
+                openToast("No unused sentences available. All sentences from the dataset may have been consumed.", "error");
+                return;
             }
             textInput.value = selectedText;
             textInput.focus();
  function handleRandom() {
1468
  let selectedText = '';
1469
  if (cachedSentences && cachedSentences.length > 0) {
1470
+ // Select a random text from the unconsumed sentences
1471
  selectedText = cachedSentences[Math.floor(Math.random() * cachedSentences.length)];
1472
+ console.log("Using random sentence from unconsumed sentences.");
1473
  } else {
1474
+ // No fallback to consumed sentences for security reasons
1475
+ console.error("No unconsumed sentences available. All sentences may have been used.");
1476
+ openToast("No unused sentences available. All sentences from the dataset may have been consumed.", "error");
1477
+ return;
 
 
 
 
 
1478
  }
1479
  textInput.value = selectedText;
1480
  textInput.focus();
tts.py CHANGED
@@ -165,7 +165,6 @@ def predict_dia(script):
     else:
         # If it's already a string, use as is
         text = script
-    print(text)
     # Make a POST request to initiate the dialogue generation
     headers = {
         # "Content-Type": "application/json",
@@ -219,7 +218,6 @@ def predict_tts(text, model):
             }
         ),
     )
-
     response_json = result.json()
 
     audio_data = response_json["audio_data"]  # base64 encoded audio data
  else:
166
  # If it's already a string, use as is
167
  text = script
 
168
  # Make a POST request to initiate the dialogue generation
169
  headers = {
170
  # "Content-Type": "application/json",
 
218
  }
219
  ),
220
  )
 
221
  response_json = result.json()
222
 
223
  audio_data = response_json["audio_data"] # base64 encoded audio data