import gradio as gr import os import sys import json import random import hashlib import requests import tempfile from datetime import datetime from openai import OpenAI from huggingface_hub import upload_file, list_repo_files, create_repo, hf_hub_download MODEL = "gpt-4.1-mini" def get_env_bool(key, default="False"): value = os.getenv(key, default) if isinstance(value, bool): return value return str(value).lower() in ('true', '1', 'yes', 'on') def get_env_list(key, default=""): value = os.getenv(key, default) if not value or value == "": return [] if value.startswith('[') and value.endswith(']'): try: parsed = json.loads(value) if isinstance(parsed, list): return [str(item).strip() for item in parsed if str(item).strip()] except json.JSONDecodeError: pass return [item.strip() for item in str(value).split(',') if item.strip()] DISABLED = get_env_bool("DISABLED", "False") OPENAI_API_KEYS = get_env_list("OPENAI_API_KEYS", "") NUM_THREADS = int(os.getenv("NUM_THREADS", "4")) IP_SALT = os.getenv("IP_SALT", "latamgpt-default-salt-2025") HF_TOKEN = os.getenv("HF_TOKEN") DATASET_REPO = os.getenv("DATASET_REPO", "latam-gpt/copuchat-conversations") def exception_handler(exception_type, exception, traceback): print(f"{exception_type.__name__}: {exception}") sys.excepthook = exception_handler sys.tracebacklimit = 0 def get_user_fingerprint(request): real_ip = ( request.headers.get('x-forwarded-for', '').split(',')[0].strip() or request.headers.get('x-real-ip', '') or getattr(request, 'client', {}).get('host', 'unknown') ) fingerprint_data = f"{real_ip}:{IP_SALT}" user_fingerprint = hashlib.sha256(fingerprint_data.encode()).hexdigest()[:16] return real_ip, user_fingerprint def get_country_from_ip(ip): try: response = requests.get(f"http://ip-api.com/json/{ip}", timeout=2) if response.status_code == 200: data = response.json() return { "country": data.get('country', 'Unknown'), "country_code": data.get('countryCode', 'UN'), "region": data.get('regionName', 'Unknown') } except: pass return {"country": "Unknown", "country_code": "UN", "region": "Unknown"} def generate_conversation_hash(session_id, user_fingerprint): return hashlib.sha256(f"{session_id}:{user_fingerprint}".encode()).hexdigest()[:12] def generate_conversation_filename(session_id, user_fingerprint, timestamp): conversation_hash = generate_conversation_hash(session_id, user_fingerprint) timestamp_str = timestamp.strftime('%Y%m%d_%H%M%S_%f') return f"conversations/{timestamp_str}_{conversation_hash}.jsonl" def get_conversation_files(): if not HF_TOKEN: return [] try: files = list_repo_files(repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN) return sorted([f for f in files if f.startswith("conversations/") and f.endswith(".jsonl")]) except: return [] def get_global_chat_counter(): conversation_files = get_conversation_files() return len(conversation_files) + 1 def find_existing_conversation(session_id, user_fingerprint): conversation_hash = generate_conversation_hash(session_id, user_fingerprint) conversation_files = get_conversation_files() matching_files = [f for f in conversation_files if f.endswith(f"_{conversation_hash}.jsonl")] if matching_files: try: latest_file = matching_files[-1] local_file = hf_hub_download( repo_id=DATASET_REPO, repo_type="dataset", filename=latest_file, token=HF_TOKEN ) with open(local_file, 'r') as f: return json.load(f) except: pass return None def upload_conversation(conversation_data, session_id, user_fingerprint): if not HF_TOKEN: return try: try: create_repo( repo_id=DATASET_REPO, repo_type="dataset", private=True, exist_ok=True, token=HF_TOKEN ) except: pass conversation_hash = generate_conversation_hash(session_id, user_fingerprint) conversation_files = get_conversation_files() matching_files = [f for f in conversation_files if f.endswith(f"_{conversation_hash}.jsonl")] if matching_files: filename = matching_files[-1] else: filename = generate_conversation_filename(session_id, user_fingerprint, datetime.now()) with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f: json.dump(conversation_data, f) temp_path = f.name upload_file( path_or_fileobj=temp_path, path_in_repo=filename, repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN ) os.unlink(temp_path) except Exception as e: print(f"Upload failed: {e}") GLOBAL_CHAT_COUNTER = get_global_chat_counter() print(f"Starting global chat counter at: {GLOBAL_CHAT_COUNTER}") def predict(inputs, top_p, temperature, chat_counter, chatbot, history, request: gr.Request): global GLOBAL_CHAT_COUNTER if not OPENAI_API_KEYS or not OPENAI_API_KEYS[0]: yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "No API keys configured", gr.update(interactive=True), gr.update(interactive=True) return api_key = random.choice(OPENAI_API_KEYS) client = OpenAI(api_key=api_key) session_id = getattr(request, 'session_hash', 'unknown') real_ip, user_fingerprint = get_user_fingerprint(request) geo_info = get_country_from_ip(real_ip) headers_dict = {key.decode('utf-8'): value.decode('utf-8') for key, value in request.headers.raw} existing_conversation = find_existing_conversation(session_id, user_fingerprint) if chat_counter == 0 else None if existing_conversation: history = existing_conversation['messages_history'] chat_counter = existing_conversation['chat_counter'] chatbot = [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)] messages = [] for i, data in enumerate(history): role = 'user' if i % 2 == 0 else 'assistant' messages.append({"role": role, "content": data}) messages.append({"role": "user", "content": inputs}) GLOBAL_CHAT_COUNTER += 1 global_counter = GLOBAL_CHAT_COUNTER chat_counter += 1 history.append(inputs) token_counter = 0 partial_words = "" try: stream = client.chat.completions.create( model=MODEL, messages=messages, temperature=temperature, top_p=top_p, stream=True, presence_penalty=0, frequency_penalty=0, max_tokens=4096 ) for chunk in stream: if chunk.choices[0].delta.content is not None: partial_words += chunk.choices[0].delta.content if token_counter == 0: history.append(" " + partial_words) else: history[-1] = partial_words token_counter += 1 yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "200", gr.update(interactive=False), gr.update(interactive=False) yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "200", gr.update(interactive=True), gr.update(interactive=True) except Exception as e: print(f'Error API OpenAI: {e}') yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, str(e), gr.update(interactive=True), gr.update(interactive=True) conversation_data = { "session_id": session_id, "user_fingerprint": user_fingerprint, "conversation_id": f"{session_id}_{datetime.now().strftime('%Y%m%d_%H')}", "conversation_hash": generate_conversation_hash(session_id, user_fingerprint), "country": geo_info["country"], "country_code": geo_info["country_code"], "region": geo_info["region"], "chat_counter": chat_counter, "global_chat_counter": global_counter, "model": MODEL, "messages": messages, "messages_history": history, "response": partial_words, "headers": headers_dict, "temperature": temperature, "top_p": top_p, "token_counter": token_counter, "timestamp": datetime.now().isoformat(), "last_updated": datetime.now().isoformat() } print(json.dumps({k: v for k, v in conversation_data.items() if k != 'messages_history'})) upload_conversation(conversation_data, session_id, user_fingerprint) def reset_textbox(): return gr.update(value='', interactive=False), gr.update(interactive=False) title = """

Copuchat: Recolección de datos para LatamGPT

""" if DISABLED: title = """

Esta app alcanzó su límite de uso. Porfavor intenta reingresar mañana.

""" description = """Language models can be conditioned to act like dialogue agents through a conversational prompt that typically takes the form: ``` User: Assistant: User: Assistant: ... ``` In this app, you can explore the outputs of GPT-4.1 mini while contributing to LatamGPT research. """ with gr.Blocks(css=""" #col_container { margin-left: auto; margin-right: auto; max-width: 1200px; width: 95%; } #chatbot { height: 1200px; overflow: auto; } .gradio-container { max-width: unset !important; } #component-0 { max-width: unset; } """) as demo: gr.HTML(title) with gr.Column(elem_id="col_container", visible=False) as main_block: chatbot = gr.Chatbot(elem_id='chatbot') inputs = gr.Textbox( placeholder="", label="Escribe tu mensaje y presiona Enter", lines=3, max_lines=8, scale=1 ) state = gr.State([]) with gr.Row(): with gr.Column(scale=7): b1 = gr.Button(visible=not DISABLED) with gr.Column(scale=3): server_status_code = gr.Textbox(label="Código de estado del servidor") with gr.Accordion("Parámetros", open=False): top_p = gr.Slider(minimum=0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p (muestreo de núcleo)") temperature = gr.Slider(minimum=0, maximum=2.0, value=0.2, step=0.1, interactive=True, label="Temperatura") chat_counter = gr.Number(value=0, visible=False, precision=0) with gr.Column(elem_id="user_consent_container") as user_consent_block: accept_checkbox = gr.Checkbox(visible=False) js = "(x) => confirm('Al hacer clic en \"Acepto\", acepto que mis datos pueden ser publicados o compartidos para investigación.')" with gr.Accordion("Consentimiento de Usuario para Recolección, Uso y Compartición de Datos", open=True): gr.HTML("""

Al usar nuestra aplicación, que funciona con la API de OpenAI, reconoces y aceptas los siguientes términos sobre los datos que proporcionas:

  1. Recolección: Podemos recopilar información, incluyendo las entradas que escribes en nuestra aplicación, las salidas generadas por la API de OpenAI, y ciertos detalles técnicos sobre tu dispositivo y conexión (como tipo de navegador, sistema operativo y ubicación geográfica) proporcionados por los headers de solicitud de tu dispositivo. Estos datos pasaran por un proceso de anonimización para evitar la recolección de información privada.
  2. Uso: Podemos usar los datos recopilados para propósitos de investigación y desarrollo de LatamGPT.
  3. Compartición y Publicación: Los datos recolectados, incluyendo los detalles técnicos recopilados de los headers de solicitud de tu dispositivo, pueden ser publicados, compartidos con terceros, o usados para análisis y propósitos de reportes.
  4. Retención de Datos: Podemos retener tus datos anonimizados, incluyendo los detalles técnicos recopilados de los headers de solicitud de tu dispositivo, por el tiempo que sea necesario.

Al continuar usando nuestra aplicación, proporcionas tu consentimiento explícito para la recolección, uso y potencial compartición de tus datos como se describe arriba. Si no estás de acuerdo con nuestras prácticas de recolección, uso y compartición de datos, por favor no uses nuestra aplicación.

Este proyecto contribuye al desarrollo de LatamGPT, un modelo de lenguaje para América Latina.

""") accept_button = gr.Button("Acepto / I Agree") def enable_inputs(): return gr.update(visible=False), gr.update(visible=True) accept_button.click(None, None, accept_checkbox, js=js, queue=False) accept_checkbox.change(fn=enable_inputs, inputs=[], outputs=[user_consent_block, main_block], queue=False) inputs.submit(reset_textbox, [], [inputs, b1], queue=False) inputs.submit(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1]) b1.click(reset_textbox, [], [inputs, b1], queue=False) b1.click(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1]) if __name__ == "__main__": demo.launch()