File size: 14,386 Bytes
c8c8070
558e9ac
 
1c05f09
f930069
45a9874
 
43ce33c
45a9874
1c05f09
93c46ef
f930069
1c05f09
957627e
 
 
 
 
85d7c4d
957627e
 
 
 
 
66924fd
 
 
 
 
 
 
 
 
85d7c4d
957627e
 
 
977e95b
45a9874
43ce33c
23c4caf
b15c3f8
558e9ac
85d7c4d
1c05f09
558e9ac
f0321ce
1c05f09
45a9874
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93c46ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43ce33c
 
93c46ef
43ce33c
1836e90
 
 
 
 
 
 
 
 
 
 
65d4c90
 
 
 
 
 
 
 
1836e90
43ce33c
93c46ef
43ce33c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1836e90
 
 
1c05f09
1836e90
 
977e95b
 
 
 
1c05f09
 
34ab564
45a9874
 
 
f930069
1c05f09
93c46ef
 
 
 
 
 
 
1c05f09
93c46ef
 
 
1c05f09
 
 
1836e90
 
72553ca
c8c8070
1c05f09
 
 
72553ca
1c05f09
 
 
 
 
 
 
 
23c4caf
1c05f09
72553ca
1c05f09
 
 
 
 
 
 
 
 
8324cfa
 
1c05f09
72553ca
1836e90
1c05f09
 
93c46ef
45a9874
 
 
93c46ef
45a9874
 
 
1c05f09
1836e90
1c05f09
 
93c46ef
1c05f09
 
 
 
45a9874
93c46ef
 
1c05f09
93c46ef
 
 
c8c8070
 
72553ca
c8c8070
1836e90
8e7ba9f
1836e90
1c05f09
c8c8070
 
 
 
 
 
 
 
1c05f09
c8c8070
 
65d4c90
 
 
 
 
 
 
 
7764644
65d4c90
 
 
 
 
 
 
 
 
85d7c4d
 
 
 
65d4c90
 
 
 
 
 
 
85d7c4d
1c05f09
85d7c4d
 
 
 
1836e90
1c05f09
1836e90
 
23c4caf
85d7c4d
66924fd
85d7c4d
 
 
 
 
 
 
 
 
d8ed522
1836e90
d8ed522
 
85d7c4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
import gradio as gr
import os
import sys
import json
import random
import hashlib
import requests
import tempfile
from datetime import datetime
from openai import OpenAI
from huggingface_hub import upload_file, list_repo_files, create_repo, hf_hub_download

# OpenAI model used for every completion request.
MODEL = "gpt-4.1-mini"

def get_env_bool(key, default="False"):
    """Read environment variable *key* and interpret it as a boolean.

    Accepts the usual truthy spellings ('true', '1', 'yes', 'on',
    case-insensitive); anything else is False. A bool passed as *default*
    is returned as-is when the variable is unset.
    """
    raw = os.getenv(key, default)
    if isinstance(raw, bool):
        return raw
    return str(raw).lower() in {'true', '1', 'yes', 'on'}

def get_env_list(key, default=""):
    """Read environment variable *key* as a list of strings.

    Accepts either a JSON array (e.g. '["a", "b"]') or a comma-separated
    string ('a, b, c'). Items are whitespace-trimmed and blank items are
    dropped. Returns [] when the variable is unset or empty.
    """
    value = os.getenv(key, default)
    if not value:  # covers both missing and empty-string values
        return []

    # Prefer JSON when the value looks like an array; fall back to CSV
    # parsing on any decode failure or non-list JSON.
    if value.startswith('[') and value.endswith(']'):
        try:
            parsed = json.loads(value)
            if isinstance(parsed, list):
                return [str(item).strip() for item in parsed if str(item).strip()]
        except json.JSONDecodeError:
            pass

    return [item.strip() for item in str(value).split(',') if item.strip()]

# Runtime configuration, all sourced from environment variables at import time.
DISABLED = get_env_bool("DISABLED", "False")
# One key is chosen at random per request in predict() for naive load balancing.
OPENAI_API_KEYS = get_env_list("OPENAI_API_KEYS", "")
NUM_THREADS = int(os.getenv("NUM_THREADS", "4"))
# Salt mixed into IP hashes so raw IPs are never stored; override in production.
IP_SALT = os.getenv("IP_SALT", "latamgpt-default-salt-2025")
HF_TOKEN = os.getenv("HF_TOKEN")
DATASET_REPO = os.getenv("DATASET_REPO", "latam-gpt/copuchat-conversations")

def exception_handler(exception_type, exception, traceback):
    # Print only "ExceptionType: message" for uncaught exceptions; combined
    # with tracebacklimit = 0 below this keeps stack traces out of the
    # public-facing logs of the hosted app.
    print(f"{exception_type.__name__}: {exception}")

sys.excepthook = exception_handler
sys.tracebacklimit = 0

def get_user_fingerprint(request, salt=None):
    """Extract the client IP and derive a salted, anonymized fingerprint.

    Prefers proxy-supplied headers (first entry of x-forwarded-for, then
    x-real-ip) and falls back to the transport-level client host. *salt*
    defaults to the module-level IP_SALT; it is exposed as a parameter for
    testability and reuse.

    Returns (real_ip, fingerprint) where the fingerprint is the first 16 hex
    chars of sha256(ip + ':' + salt).
    """
    if salt is None:
        salt = IP_SALT
    # request.client may be a dict or an object exposing a .host attribute,
    # depending on the server stack; the previous dict-only .get() call
    # raised AttributeError for object-style clients.
    client = getattr(request, 'client', None)
    if isinstance(client, dict):
        fallback_host = client.get('host', 'unknown')
    else:
        fallback_host = getattr(client, 'host', None) or 'unknown'
    real_ip = (
        request.headers.get('x-forwarded-for', '').split(',')[0].strip() or
        request.headers.get('x-real-ip', '') or
        fallback_host
    )
    fingerprint_data = f"{real_ip}:{salt}"
    user_fingerprint = hashlib.sha256(fingerprint_data.encode()).hexdigest()[:16]
    return real_ip, user_fingerprint

def get_country_from_ip(ip):
    """Geolocate *ip* via ip-api.com (best effort, 2 s timeout).

    Returns a dict with 'country', 'country_code' and 'region', falling back
    to 'Unknown'/'UN' placeholders on any network failure, parse failure, or
    non-200 response.
    """
    try:
        # NOTE(review): plain HTTP — the free ip-api.com tier does not serve
        # HTTPS; the IP is therefore sent in clear text to a third party.
        response = requests.get(f"http://ip-api.com/json/{ip}", timeout=2)
        if response.status_code == 200:
            data = response.json()
            return {
                "country": data.get('country', 'Unknown'),
                "country_code": data.get('countryCode', 'UN'),
                "region": data.get('regionName', 'Unknown')
            }
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding failures; geolocation is strictly
        # best-effort, so fall through to the placeholder result.
        pass
    return {"country": "Unknown", "country_code": "UN", "region": "Unknown"}

def generate_conversation_hash(session_id, user_fingerprint):
    """Return a stable 12-hex-char identifier for a (session, user) pair."""
    digest = hashlib.sha256(f"{session_id}:{user_fingerprint}".encode()).hexdigest()
    return digest[:12]

def generate_conversation_filename(session_id, user_fingerprint, timestamp):
    """Build the repo-relative JSONL path for a conversation.

    The microsecond timestamp prefix makes filenames sort chronologically;
    the trailing hash ties the file to its (session, user) pair.
    """
    stamp = timestamp.strftime('%Y%m%d_%H%M%S_%f')
    chash = generate_conversation_hash(session_id, user_fingerprint)
    return f"conversations/{stamp}_{chash}.jsonl"

def get_conversation_files():
    """List stored conversation JSONL paths in the dataset repo, sorted ascending.

    Returns [] when no HF token is configured or the repo cannot be listed
    (e.g. it does not exist yet or the network is unavailable).
    """
    if not HF_TOKEN:
        return []
    try:
        files = list_repo_files(repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
        return sorted(f for f in files if f.startswith("conversations/") and f.endswith(".jsonl"))
    except Exception as e:
        # Best-effort: a missing repo or network error just means "no files yet",
        # but log it instead of swallowing silently as before.
        print(f"Could not list conversation files: {e}")
        return []

def get_global_chat_counter():
    """Return the next global chat number: one past the stored-file count."""
    return len(get_conversation_files()) + 1

def find_existing_conversation(session_id, user_fingerprint):
    """Return the most recently stored conversation dict for this (session, user).

    Used to resume a conversation after a page reload within the same session.
    Returns None when no matching file exists or it cannot be fetched/parsed.
    """
    conversation_hash = generate_conversation_hash(session_id, user_fingerprint)
    conversation_files = get_conversation_files()

    matching_files = [f for f in conversation_files if f.endswith(f"_{conversation_hash}.jsonl")]

    if not matching_files:
        return None

    try:
        # Filenames sort lexicographically by timestamp prefix, so the last
        # match is the newest record.
        latest_file = matching_files[-1]
        local_file = hf_hub_download(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            filename=latest_file,
            token=HF_TOKEN
        )
        # NOTE: despite the .jsonl suffix, each file holds a single JSON object
        # (see upload_conversation), so json.load is correct here.
        with open(local_file, 'r') as f:
            return json.load(f)
    except Exception as e:
        # Best-effort resume: any download/parse failure just starts a fresh
        # chat, but log it instead of swallowing silently as before.
        print(f"Could not restore conversation: {e}")
        return None

def upload_conversation(conversation_data, session_id, user_fingerprint):
    """Serialize *conversation_data* to JSON and upload it to the HF dataset repo.

    Reuses the existing file for this (session, user) pair when one exists so a
    conversation keeps a single, continuously updated record. No-op when no
    HF token is configured; all failures are logged, never raised.
    """
    if not HF_TOKEN:
        return

    try:
        # exist_ok=True makes repo creation idempotent.
        create_repo(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            private=True,
            exist_ok=True,
            token=HF_TOKEN
        )
    except Exception:
        # Creation failures (e.g. permissions) shouldn't block the upload attempt.
        pass

    try:
        conversation_hash = generate_conversation_hash(session_id, user_fingerprint)
        matching_files = [
            f for f in get_conversation_files()
            if f.endswith(f"_{conversation_hash}.jsonl")
        ]

        if matching_files:
            filename = matching_files[-1]  # overwrite the newest existing record
        else:
            filename = generate_conversation_filename(session_id, user_fingerprint, datetime.now())

        with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
            json.dump(conversation_data, f)
            temp_path = f.name

        try:
            upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=filename,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
        finally:
            # Always remove the temp file, even when the upload raises
            # (the previous version leaked it on failure).
            os.unlink(temp_path)

    except Exception as e:
        print(f"Upload failed: {e}")

# Seed the process-wide chat counter from the number of files already stored
# in the dataset repo; predict() increments it on every turn.
GLOBAL_CHAT_COUNTER = get_global_chat_counter()
print(f"Starting global chat counter at: {GLOBAL_CHAT_COUNTER}")

def predict(inputs, top_p, temperature, chat_counter, chatbot, history, request: gr.Request):
    """Stream one chat turn through the OpenAI API and record the conversation.

    Generator wired to the Gradio events below: each yield is a tuple of
    (chatbot message pairs, history, chat_counter, status string,
    textbox update, button update). `history` is a flat list of alternating
    user/assistant strings; pairs are rebuilt from it for the Chatbot widget.
    """
    global GLOBAL_CHAT_COUNTER
    
    # No keys configured: surface an error in the status box and re-enable inputs.
    if not OPENAI_API_KEYS or not OPENAI_API_KEYS[0]:
        yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "No API keys configured", gr.update(interactive=True), gr.update(interactive=True)
        return
        
    # Naive load balancing: pick one of the configured keys at random.
    api_key = random.choice(OPENAI_API_KEYS)
    client = OpenAI(api_key=api_key)
    
    session_id = getattr(request, 'session_hash', 'unknown')
    real_ip, user_fingerprint = get_user_fingerprint(request)
    geo_info = get_country_from_ip(real_ip)
    # NOTE(review): assumes request.headers.raw yields (bytes, bytes) pairs —
    # confirm against the Gradio/Starlette version in use.
    headers_dict = {key.decode('utf-8'): value.decode('utf-8') for key, value in request.headers.raw}
    
    # On the first turn of a session, try to resume a previously stored conversation.
    existing_conversation = find_existing_conversation(session_id, user_fingerprint) if chat_counter == 0 else None
    
    if existing_conversation:
        history = existing_conversation['messages_history']
        chat_counter = existing_conversation['chat_counter']
        chatbot = [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)]
    
    # Rebuild the OpenAI message list: even indices are user turns, odd assistant.
    messages = []
    for i, data in enumerate(history):
        role = 'user' if i % 2 == 0 else 'assistant'
        messages.append({"role": role, "content": data})
    
    messages.append({"role": "user", "content": inputs})
    
    GLOBAL_CHAT_COUNTER += 1
    global_counter = GLOBAL_CHAT_COUNTER
    chat_counter += 1
    history.append(inputs)
    token_counter = 0
    partial_words = ""
    
    try:
        stream = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            stream=True,
            presence_penalty=0,
            frequency_penalty=0,
            max_tokens=4096
        )
        
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                partial_words += chunk.choices[0].delta.content
                if token_counter == 0:
                    # First chunk: append a new assistant slot to history.
                    # NOTE(review): the leading " " shows only on the very first
                    # partial render; later chunks overwrite it without the space.
                    history.append(" " + partial_words)
                else:
                    history[-1] = partial_words
                token_counter += 1
                # Keep the textbox and button disabled while streaming.
                yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "200", gr.update(interactive=False), gr.update(interactive=False)
        
        # Final yield re-enables the inputs once the stream is exhausted.
        yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "200", gr.update(interactive=True), gr.update(interactive=True)
                
    except Exception as e:
        print(f'Error API OpenAI: {e}')
        yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, str(e), gr.update(interactive=True), gr.update(interactive=True)
    
    # Persist the full turn; this runs even when the API call failed above,
    # in which case partial_words may be empty.
    conversation_data = {
        "session_id": session_id,
        "user_fingerprint": user_fingerprint,
        "conversation_id": f"{session_id}_{datetime.now().strftime('%Y%m%d_%H')}",
        "conversation_hash": generate_conversation_hash(session_id, user_fingerprint),
        "country": geo_info["country"],
        "country_code": geo_info["country_code"],
        "region": geo_info["region"],
        "chat_counter": chat_counter,
        "global_chat_counter": global_counter,
        "model": MODEL,
        "messages": messages,
        "messages_history": history,
        "response": partial_words,
        "headers": headers_dict,
        "temperature": temperature,
        "top_p": top_p,
        "token_counter": token_counter,
        "timestamp": datetime.now().isoformat(),
        "last_updated": datetime.now().isoformat()
    }
    
    # Log everything except the bulky message history, then upload to HF.
    print(json.dumps({k: v for k, v in conversation_data.items() if k != 'messages_history'}))
    upload_conversation(conversation_data, session_id, user_fingerprint)

def reset_textbox():
    """Clear the textbox and disable both input widgets while a reply streams."""
    cleared_box = gr.update(value='', interactive=False)
    disabled_button = gr.update(interactive=False)
    return cleared_box, disabled_button

# Page header HTML; swapped for a red rate-limit notice when the DISABLED
# kill switch is set. (Fixes mojibake-encoded accents and the "Porfavor" typo
# in the user-facing Spanish text.)
title = """<h1 align="center">Copuchat: Recolección de datos para LatamGPT</h1>"""
if DISABLED:
    title = """<h1 align="center" style="color:red">Esta app alcanzó su límite de uso. Por favor intenta reingresar mañana.</h1>"""

# NOTE(review): `description` is not referenced by the visible layout below —
# kept for reference/possible reuse; verify before deleting.
description = """Language models can be conditioned to act like dialogue agents through a conversational prompt that typically takes the form:
```
User: <utterance>
Assistant: <utterance>
User: <utterance>
Assistant: <utterance>
...
```
In this app, you can explore the outputs of GPT-4.1 mini while contributing to LatamGPT research.
"""

# UI layout: a hidden main chat column that is revealed only after the user
# accepts the consent dialog defined below.
with gr.Blocks(css="""
    #col_container { 
        margin-left: auto; 
        margin-right: auto;
        max-width: 1200px;
        width: 95%;
    }
    #chatbot {
        height: 1200px; 
        overflow: auto;
    }
    .gradio-container {
        max-width: unset !important;
    }
    #component-0 {
        max-width: unset;
    }
""") as demo:
    gr.HTML(title)
    
    # Main chat UI — starts hidden until consent is given.
    # NOTE(review): several Spanish strings below appear mojibake-encoded
    # (UTF-8 bytes read under a wrong codec); verify the file's encoding.
    with gr.Column(elem_id="col_container", visible=False) as main_block:
        chatbot = gr.Chatbot(elem_id='chatbot')
        inputs = gr.Textbox(
            placeholder="", 
            label="Escribe tu mensaje y presiona Enter",
            lines=3,
            max_lines=8,
            scale=1
        )
        # Flat alternating user/assistant history consumed by predict().
        state = gr.State([])
        
        with gr.Row():
            with gr.Column(scale=7):
                b1 = gr.Button(visible=not DISABLED)
            with gr.Column(scale=3):
                server_status_code = gr.Textbox(label="C贸digo de estado del servidor")
        
        with gr.Accordion("Par谩metros", open=False):
            top_p = gr.Slider(minimum=0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p (muestreo de n煤cleo)")
            temperature = gr.Slider(minimum=0, maximum=2.0, value=0.2, step=0.1, interactive=True, label="Temperatura")
            chat_counter = gr.Number(value=0, visible=False, precision=0)
    
    # Consent flow: the hidden checkbox is set from JS after the confirm()
    # dialog; its change event then swaps the consent block for the chat UI.
    with gr.Column(elem_id="user_consent_container") as user_consent_block:
        accept_checkbox = gr.Checkbox(visible=False)
        js = "(x) => confirm('Al hacer clic en \"Acepto\", acepto que mis datos pueden ser publicados o compartidos para investigaci贸n.')"
        
        with gr.Accordion("Consentimiento de Usuario para Recolecci贸n, Uso y Compartici贸n de Datos", open=True):
            gr.HTML("""
            <div>
                <p>Al usar nuestra aplicaci贸n, que funciona con la API de OpenAI, reconoces y aceptas los siguientes t茅rminos sobre los datos que proporcionas:</p>
                <ol>
                    <li><strong>Recolecci贸n:</strong> Podemos recopilar informaci贸n, incluyendo las entradas que escribes en nuestra aplicaci贸n, las salidas generadas por la API de OpenAI, y ciertos detalles t茅cnicos sobre tu dispositivo y conexi贸n (como tipo de navegador, sistema operativo y ubicaci贸n geogr谩fica) proporcionados por los headers de solicitud de tu dispositivo. Estos datos pasaran por un proceso de anonimizaci贸n para evitar la recolecci贸n de informaci贸n privada.</li>
                    <li><strong>Uso:</strong> Podemos usar los datos recopilados para prop贸sitos de investigaci贸n y desarrollo de LatamGPT.</li>
                    <li><strong>Compartici贸n y Publicaci贸n:</strong> Los datos recolectados, incluyendo los detalles t茅cnicos recopilados de los headers de solicitud de tu dispositivo, pueden ser publicados, compartidos con terceros, o usados para an谩lisis y prop贸sitos de reportes.</li>
                    <li><strong>Retenci贸n de Datos:</strong> Podemos retener tus datos anonimizados, incluyendo los detalles t茅cnicos recopilados de los headers de solicitud de tu dispositivo, por el tiempo que sea necesario.</li>
                </ol>
                <p>Al continuar usando nuestra aplicaci贸n, proporcionas tu consentimiento expl铆cito para la recolecci贸n, uso y potencial compartici贸n de tus datos como se describe arriba. Si no est谩s de acuerdo con nuestras pr谩cticas de recolecci贸n, uso y compartici贸n de datos, por favor no uses nuestra aplicaci贸n.</p>
                <p><strong>Este proyecto contribuye al desarrollo de LatamGPT, un modelo de lenguaje para Am茅rica Latina.</strong></p>
            </div>
            """)
            accept_button = gr.Button("Acepto / I Agree")

        def enable_inputs():
            # Hide the consent block, reveal the main chat column.
            return gr.update(visible=False), gr.update(visible=True)

    accept_button.click(None, None, accept_checkbox, js=js, queue=False)
    accept_checkbox.change(fn=enable_inputs, inputs=[], outputs=[user_consent_block, main_block], queue=False)

    # Disable/clear inputs first (queue=False runs immediately), then stream
    # the model reply; predict() re-enables the widgets via its yields.
    inputs.submit(reset_textbox, [], [inputs, b1], queue=False)
    inputs.submit(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1])
    b1.click(reset_textbox, [], [inputs, b1], queue=False)
    b1.click(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1])

# Launch the Gradio app when run as a script (default host/port).
if __name__ == "__main__":
    demo.launch()