|
import gradio as gr |
|
import os |
|
import sys |
|
import json |
|
import random |
|
import hashlib |
|
import requests |
|
import tempfile |
|
from datetime import datetime |
|
from openai import OpenAI |
|
from huggingface_hub import upload_file, list_repo_files, create_repo, hf_hub_download |
|
|
|
# OpenAI chat model used for every completion request.
MODEL = "gpt-4.1-mini"
|
|
|
def get_env_bool(key, default="False"):
    """Read an environment variable and interpret it as a boolean.

    Truthy spellings (case-insensitive): 'true', '1', 'yes', 'on'.
    Anything else — including a missing variable with the default
    of "False" — yields False.
    """
    raw = os.getenv(key, default)
    # Callers may pass an actual bool as the default; hand it back untouched.
    if isinstance(raw, bool):
        return raw
    return str(raw).lower() in {'true', '1', 'yes', 'on'}
|
|
|
def get_env_list(key, default=""):
    """Read an environment variable as a list of stripped strings.

    Accepts either a JSON array (e.g. '["a", "b"]') or a plain
    comma-separated value (e.g. 'a, b'). Blank entries are dropped;
    an empty or missing variable yields [].
    """
    raw = os.getenv(key, default)
    if not raw:
        return []

    # Values that look like a JSON array are tried as JSON first; on any
    # parse failure we fall back to comma splitting below.
    if raw.startswith('[') and raw.endswith(']'):
        try:
            decoded = json.loads(raw)
        except json.JSONDecodeError:
            decoded = None
        if isinstance(decoded, list):
            stripped = (str(entry).strip() for entry in decoded)
            return [entry for entry in stripped if entry]

    # Plain comma-separated fallback.
    return [piece.strip() for piece in str(raw).split(',') if piece.strip()]
|
|
|
# --- Runtime configuration, all sourced from environment variables ---
DISABLED = get_env_bool("DISABLED", "False")  # kill-switch: shows a limit-reached banner instead of the app
OPENAI_API_KEYS = get_env_list("OPENAI_API_KEYS", "")  # pool of keys; one is chosen at random per request
NUM_THREADS = int(os.getenv("NUM_THREADS", "4"))
IP_SALT = os.getenv("IP_SALT", "latamgpt-default-salt-2025")  # salt mixed into the client-IP hash
HF_TOKEN = os.getenv("HF_TOKEN")  # Hugging Face token; dataset uploads are skipped when unset
DATASET_REPO = os.getenv("DATASET_REPO", "latam-gpt/copuchat-conversations")  # HF dataset that stores conversations
|
|
|
def exception_handler(exception_type, exception, traceback):
    """Top-level exception hook: print a one-line summary instead of a full traceback."""
    print(f"{exception_type.__name__}: {exception}")


sys.excepthook = exception_handler
# Suppress traceback depth entirely so uncaught errors stay to one line in logs.
sys.tracebacklimit = 0
|
|
|
def get_user_fingerprint(request):
    """Derive the client IP and a salted, truncated hash of it.

    Prefers proxy-supplied headers (x-forwarded-for, then x-real-ip) and
    falls back to the transport-level client address.

    Returns:
        (real_ip, user_fingerprint) where the fingerprint is the first 16
        hex chars of sha256(f"{ip}:{IP_SALT}") — salted so raw IPs are
        never stored in a reversible form.
    """
    # request.client may be a dict-like or an object exposing .host depending
    # on the server stack; the original `.get(...)` call crashed on the
    # object form, so support both shapes here.
    client = getattr(request, 'client', None)
    if isinstance(client, dict):
        client_host = client.get('host')
    else:
        client_host = getattr(client, 'host', None)

    real_ip = (
        request.headers.get('x-forwarded-for', '').split(',')[0].strip() or
        request.headers.get('x-real-ip', '') or
        client_host or
        'unknown'
    )
    fingerprint_data = f"{real_ip}:{IP_SALT}"
    user_fingerprint = hashlib.sha256(fingerprint_data.encode()).hexdigest()[:16]
    return real_ip, user_fingerprint
|
|
|
def get_country_from_ip(ip):
    """Best-effort geolocation of an IP via ip-api.com (2 s timeout).

    Returns a dict with 'country', 'country_code' and 'region'. On any
    network or parse failure — or a non-200 response — falls back to
    'Unknown' values; geolocation must never break the request path.
    """
    try:
        response = requests.get(f"http://ip-api.com/json/{ip}", timeout=2)
        if response.status_code == 200:
            data = response.json()
            return {
                "country": data.get('country', 'Unknown'),
                "country_code": data.get('countryCode', 'UN'),
                "region": data.get('regionName', 'Unknown')
            }
    # Narrowed from a bare `except:` which also swallowed KeyboardInterrupt
    # and SystemExit. ValueError covers a non-JSON response body.
    except (requests.RequestException, ValueError):
        pass
    return {"country": "Unknown", "country_code": "UN", "region": "Unknown"}
|
|
|
def generate_conversation_hash(session_id, user_fingerprint):
    """Stable 12-hex-char identifier for a (session, user) pair."""
    combined = f"{session_id}:{user_fingerprint}".encode()
    return hashlib.sha256(combined).hexdigest()[:12]
|
|
|
def generate_conversation_filename(session_id, user_fingerprint, timestamp):
    """Build the repo path for a conversation file.

    Shape: 'conversations/<YYYYmmdd_HHMMSS_ffffff>_<12-char hash>.jsonl'.
    The timestamp prefix keeps lexicographic order equal to creation order.
    """
    stamp = timestamp.strftime('%Y%m%d_%H%M%S_%f')
    convo_hash = generate_conversation_hash(session_id, user_fingerprint)
    return f"conversations/{stamp}_{convo_hash}.jsonl"
|
|
|
def get_conversation_files():
    """List conversation .jsonl paths in the dataset repo, sorted by name.

    Returns [] when HF_TOKEN is unset or the repo cannot be listed
    (missing repo, auth failure, network error).
    """
    if not HF_TOKEN:
        return []
    try:
        files = list_repo_files(repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
    except Exception as e:
        # Replaced a bare silent `except:` — log so repo/auth problems are visible,
        # but still treat the failure as "no files yet".
        print(f"Could not list conversation files: {e}")
        return []
    return sorted(f for f in files if f.startswith("conversations/") and f.endswith(".jsonl"))
|
|
|
def get_global_chat_counter():
    """Next global conversation number: one past the stored file count."""
    return len(get_conversation_files()) + 1
|
|
|
def find_existing_conversation(session_id, user_fingerprint):
    """Fetch the most recent stored conversation for this (session, user) pair.

    Returns:
        The parsed conversation dict, or None when nothing matches or the
        download/parse fails (failures are logged, never raised).
    """
    conversation_hash = generate_conversation_hash(session_id, user_fingerprint)
    conversation_files = get_conversation_files()

    matching_files = [f for f in conversation_files if f.endswith(f"_{conversation_hash}.jsonl")]
    if not matching_files:
        return None

    try:
        # File names are timestamp-prefixed and the list is sorted,
        # so the last match is the newest.
        latest_file = matching_files[-1]
        local_file = hf_hub_download(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            filename=latest_file,
            token=HF_TOKEN
        )
        # NOTE(review): despite the .jsonl suffix, each file holds a single
        # JSON document (see upload_conversation), hence json.load.
        with open(local_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        # Replaced a bare silent `except:` so download/parse problems surface in logs.
        print(f"Could not load existing conversation: {e}")
        return None
|
|
|
def upload_conversation(conversation_data, session_id, user_fingerprint):
    """Persist the conversation as a single JSON document in the HF dataset repo.

    Re-uses (overwrites) the existing file for this (session, user) pair
    when one exists, otherwise creates a new timestamped file. No-op when
    HF_TOKEN is unset; all failures are logged, never raised.
    """
    if not HF_TOKEN:
        return

    try:
        # exist_ok=True makes this idempotent; log (don't hide) other failures
        # such as bad credentials, but keep trying the upload regardless.
        try:
            create_repo(
                repo_id=DATASET_REPO,
                repo_type="dataset",
                private=True,
                exist_ok=True,
                token=HF_TOKEN
            )
        except Exception as e:
            print(f"create_repo failed (continuing): {e}")

        conversation_hash = generate_conversation_hash(session_id, user_fingerprint)
        conversation_files = get_conversation_files()
        matching_files = [f for f in conversation_files if f.endswith(f"_{conversation_hash}.jsonl")]

        if matching_files:
            # Overwrite the newest existing file for this conversation.
            filename = matching_files[-1]
        else:
            filename = generate_conversation_filename(session_id, user_fingerprint, datetime.now())

        with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
            json.dump(conversation_data, f)
            temp_path = f.name

        try:
            upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=filename,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
        finally:
            # The original only unlinked on success, leaking a temp file per
            # failed upload; always clean up.
            os.unlink(temp_path)

    except Exception as e:
        print(f"Upload failed: {e}")
|
|
|
# Seed the process-wide conversation counter from what is already stored
# in the dataset repo (runs once at import time).
GLOBAL_CHAT_COUNTER = get_global_chat_counter()
print(f"Starting global chat counter at: {GLOBAL_CHAT_COUNTER}")
|
|
|
def predict(inputs, top_p, temperature, chat_counter, chatbot, history, request: gr.Request):
    """Stream a chat completion and persist the conversation.

    Gradio generator callback. `history` is a flat list of alternating
    user/assistant strings. Yields tuples of (chatbot message pairs,
    history, chat_counter, status text, textbox update, button update)
    so the UI refreshes as tokens stream in.
    """
    global GLOBAL_CHAT_COUNTER

    # No usable key: surface the problem in the status box and bail out.
    if not OPENAI_API_KEYS or not OPENAI_API_KEYS[0]:
        yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "No API keys configured", gr.update(interactive=True), gr.update(interactive=True)
        return

    # Pick a key at random per request to spread quota across the pool.
    api_key = random.choice(OPENAI_API_KEYS)
    client = OpenAI(api_key=api_key)

    session_id = getattr(request, 'session_hash', 'unknown')
    real_ip, user_fingerprint = get_user_fingerprint(request)
    geo_info = get_country_from_ip(real_ip)
    # NOTE(review): assumes request.headers.raw yields (bytes, bytes) pairs —
    # confirm against the deployed Gradio/Starlette version.
    headers_dict = {key.decode('utf-8'): value.decode('utf-8') for key, value in request.headers.raw}

    # On the first turn of a fresh session, try to resume a previously
    # stored conversation for this (session, user) pair.
    existing_conversation = find_existing_conversation(session_id, user_fingerprint) if chat_counter == 0 else None

    if existing_conversation:
        history = existing_conversation['messages_history']
        chat_counter = existing_conversation['chat_counter']
        chatbot = [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)]

    # Rebuild the OpenAI message list from the flat history
    # (even indices = user turns, odd indices = assistant turns).
    messages = []
    for i, data in enumerate(history):
        role = 'user' if i % 2 == 0 else 'assistant'
        messages.append({"role": role, "content": data})

    messages.append({"role": "user", "content": inputs})

    GLOBAL_CHAT_COUNTER += 1
    global_counter = GLOBAL_CHAT_COUNTER
    chat_counter += 1
    history.append(inputs)
    token_counter = 0  # number of streamed chunks received so far
    partial_words = ""  # assistant reply accumulated across chunks

    try:
        stream = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            stream=True,
            presence_penalty=0,
            frequency_penalty=0,
            max_tokens=4096
        )

        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                partial_words += chunk.choices[0].delta.content
                # First chunk appends a new assistant entry; later chunks
                # overwrite it with the growing text.
                if token_counter == 0:
                    history.append(" " + partial_words)
                else:
                    history[-1] = partial_words
                token_counter += 1
                # Controls stay disabled while the reply is still streaming.
                yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "200", gr.update(interactive=False), gr.update(interactive=False)

        # Stream finished: re-enable the textbox and button.
        yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "200", gr.update(interactive=True), gr.update(interactive=True)

    except Exception as e:
        print(f'Error API OpenAI: {e}')
        yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, str(e), gr.update(interactive=True), gr.update(interactive=True)

    # Build and upload the full conversation record — this runs even after
    # an API error so partial conversations are still captured.
    conversation_data = {
        "session_id": session_id,
        "user_fingerprint": user_fingerprint,
        "conversation_id": f"{session_id}_{datetime.now().strftime('%Y%m%d_%H')}",
        "conversation_hash": generate_conversation_hash(session_id, user_fingerprint),
        "country": geo_info["country"],
        "country_code": geo_info["country_code"],
        "region": geo_info["region"],
        "chat_counter": chat_counter,
        "global_chat_counter": global_counter,
        "model": MODEL,
        "messages": messages,
        "messages_history": history,
        "response": partial_words,
        "headers": headers_dict,
        "temperature": temperature,
        "top_p": top_p,
        "token_counter": token_counter,
        "timestamp": datetime.now().isoformat(),
        "last_updated": datetime.now().isoformat()
    }

    # Log everything except the (potentially long) raw history.
    print(json.dumps({k: v for k, v in conversation_data.items() if k != 'messages_history'}))
    upload_conversation(conversation_data, session_id, user_fingerprint)
|
|
|
def reset_textbox():
    """Clear the input box and disable both controls while a reply streams."""
    cleared_input = gr.update(value='', interactive=False)
    disabled_button = gr.update(interactive=False)
    return cleared_input, disabled_button
|
|
|
# Page header; replaced by a red limit-reached banner when the DISABLED
# kill-switch is set.
title = """<h1 align="center">Copuchat: Recolecci贸n de datos para LatamGPT</h1>"""
if DISABLED:
    title = """<h1 align="center" style="color:red">Esta app alcanz贸 su l铆mite de uso. Porfavor intenta reingresar ma帽ana.</h1>"""

# NOTE(review): `description` is defined but never rendered in the UI below —
# confirm whether it should be shown (e.g. via gr.HTML) or removed.
description = """Language models can be conditioned to act like dialogue agents through a conversational prompt that typically takes the form:
```
User: <utterance>
Assistant: <utterance>
User: <utterance>
Assistant: <utterance>
...
```
In this app, you can explore the outputs of GPT-4.1 mini while contributing to LatamGPT research.
"""
|
|
|
# --- UI definition: consent screen first, chat UI revealed on acceptance ---
with gr.Blocks(css="""
    #col_container {
        margin-left: auto;
        margin-right: auto;
        max-width: 1200px;
        width: 95%;
    }
    #chatbot {
        height: 1200px;
        overflow: auto;
    }
    .gradio-container {
        max-width: unset !important;
    }
    #component-0 {
        max-width: unset;
    }
""") as demo:
    gr.HTML(title)

    # Main chat column, hidden until the user accepts the consent terms.
    with gr.Column(elem_id="col_container", visible=False) as main_block:
        chatbot = gr.Chatbot(elem_id='chatbot')
        inputs = gr.Textbox(
            placeholder="",
            label="Escribe tu mensaje y presiona Enter",
            lines=3,
            max_lines=8,
            scale=1
        )
        state = gr.State([])  # flat alternating user/assistant history fed to predict

        with gr.Row():
            with gr.Column(scale=7):
                b1 = gr.Button(visible=not DISABLED)
            with gr.Column(scale=3):
                server_status_code = gr.Textbox(label="C贸digo de estado del servidor")

        # Sampling controls plus a hidden per-session turn counter.
        with gr.Accordion("Par谩metros", open=False):
            top_p = gr.Slider(minimum=0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p (muestreo de n煤cleo)")
            temperature = gr.Slider(minimum=0, maximum=2.0, value=0.2, step=0.1, interactive=True, label="Temperatura")
            chat_counter = gr.Number(value=0, visible=False, precision=0)

    # Consent screen shown on load; the hidden checkbox receives the result
    # of a browser-side confirm() dialog.
    with gr.Column(elem_id="user_consent_container") as user_consent_block:
        accept_checkbox = gr.Checkbox(visible=False)
        js = "(x) => confirm('Al hacer clic en \"Acepto\", acepto que mis datos pueden ser publicados o compartidos para investigaci贸n.')"

        with gr.Accordion("Consentimiento de Usuario para Recolecci贸n, Uso y Compartici贸n de Datos", open=True):
            gr.HTML("""
            <div>
                <p>Al usar nuestra aplicaci贸n, que funciona con la API de OpenAI, reconoces y aceptas los siguientes t茅rminos sobre los datos que proporcionas:</p>
                <ol>
                    <li><strong>Recolecci贸n:</strong> Podemos recopilar informaci贸n, incluyendo las entradas que escribes en nuestra aplicaci贸n, las salidas generadas por la API de OpenAI, y ciertos detalles t茅cnicos sobre tu dispositivo y conexi贸n (como tipo de navegador, sistema operativo y ubicaci贸n geogr谩fica) proporcionados por los headers de solicitud de tu dispositivo. Estos datos pasaran por un proceso de anonimizaci贸n para evitar la recolecci贸n de informaci贸n privada.</li>
                    <li><strong>Uso:</strong> Podemos usar los datos recopilados para prop贸sitos de investigaci贸n y desarrollo de LatamGPT.</li>
                    <li><strong>Compartici贸n y Publicaci贸n:</strong> Los datos recolectados, incluyendo los detalles t茅cnicos recopilados de los headers de solicitud de tu dispositivo, pueden ser publicados, compartidos con terceros, o usados para an谩lisis y prop贸sitos de reportes.</li>
                    <li><strong>Retenci贸n de Datos:</strong> Podemos retener tus datos anonimizados, incluyendo los detalles t茅cnicos recopilados de los headers de solicitud de tu dispositivo, por el tiempo que sea necesario.</li>
                </ol>
                <p>Al continuar usando nuestra aplicaci贸n, proporcionas tu consentimiento expl铆cito para la recolecci贸n, uso y potencial compartici贸n de tus datos como se describe arriba. Si no est谩s de acuerdo con nuestras pr谩cticas de recolecci贸n, uso y compartici贸n de datos, por favor no uses nuestra aplicaci贸n.</p>
                <p><strong>Este proyecto contribuye al desarrollo de LatamGPT, un modelo de lenguaje para Am茅rica Latina.</strong></p>
            </div>
            """)
            accept_button = gr.Button("Acepto / I Agree")

        def enable_inputs():
            """Hide the consent column and reveal the main chat column."""
            return gr.update(visible=False), gr.update(visible=True)

        # Run the JS confirm(); its boolean lands in accept_checkbox, whose
        # change event then swaps the visible column.
        accept_button.click(None, None, accept_checkbox, js=js, queue=False)
        accept_checkbox.change(fn=enable_inputs, inputs=[], outputs=[user_consent_block, main_block], queue=False)

    # Both Enter-in-textbox and button click first lock the controls,
    # then start the streaming predict generator.
    inputs.submit(reset_textbox, [], [inputs, b1], queue=False)
    inputs.submit(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1])
    b1.click(reset_textbox, [], [inputs, b1], queue=False)
    b1.click(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1])


if __name__ == "__main__":
    demo.launch()