File size: 14,386 Bytes
c8c8070
558e9ac
 
1c05f09
f930069
45a9874
 
43ce33c
45a9874
1c05f09
93c46ef
f930069
1c05f09
957627e
 
 
 
 
85d7c4d
957627e
 
 
 
 
66924fd
 
 
 
 
 
 
 
 
85d7c4d
957627e
 
 
977e95b
45a9874
43ce33c
23c4caf
b15c3f8
558e9ac
85d7c4d
1c05f09
558e9ac
f0321ce
1c05f09
45a9874
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93c46ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43ce33c
 
93c46ef
43ce33c
1836e90
 
 
 
 
 
 
 
 
 
 
65d4c90
 
 
 
 
 
 
 
1836e90
43ce33c
93c46ef
43ce33c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1836e90
 
 
1c05f09
1836e90
 
977e95b
 
 
 
1c05f09
 
34ab564
45a9874
 
 
f930069
1c05f09
93c46ef
 
 
 
 
 
 
1c05f09
93c46ef
 
 
1c05f09
 
 
1836e90
 
72553ca
c8c8070
1c05f09
 
 
72553ca
1c05f09
 
 
 
 
 
 
 
23c4caf
1c05f09
72553ca
1c05f09
 
 
 
 
 
 
 
 
8324cfa
 
1c05f09
72553ca
1836e90
1c05f09
 
93c46ef
45a9874
 
 
93c46ef
45a9874
 
 
1c05f09
1836e90
1c05f09
 
93c46ef
1c05f09
 
 
 
45a9874
93c46ef
 
1c05f09
93c46ef
 
 
c8c8070
 
72553ca
c8c8070
1836e90
8e7ba9f
1836e90
1c05f09
c8c8070
 
 
 
 
 
 
 
1c05f09
c8c8070
 
65d4c90
 
 
 
 
 
 
 
7764644
65d4c90
 
 
 
 
 
 
 
 
85d7c4d
 
 
 
65d4c90
 
 
 
 
 
 
85d7c4d
1c05f09
85d7c4d
 
 
 
1836e90
1c05f09
1836e90
 
23c4caf
85d7c4d
66924fd
85d7c4d
 
 
 
 
 
 
 
 
d8ed522
1836e90
d8ed522
 
85d7c4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
import gradio as gr
import os
import sys
import json
import random
import hashlib
import requests
import tempfile
from datetime import datetime
from openai import OpenAI
from huggingface_hub import upload_file, list_repo_files, create_repo, hf_hub_download

# OpenAI model used for every completion request.
MODEL = "gpt-4.1-mini"

def get_env_bool(key, default="False"):
    """Read environment variable *key* and interpret it as a boolean.

    Accepts the usual truthy spellings ('true', '1', 'yes', 'on',
    case-insensitive); anything else is False. A bool passed as *default*
    is returned as-is when the variable is unset.
    """
    raw = os.getenv(key, default)
    if isinstance(raw, bool):
        return raw
    return str(raw).lower() in {'true', '1', 'yes', 'on'}

def get_env_list(key, default=""):
    """Read environment variable *key* as a list of strings.

    Accepts either a JSON array (e.g. '["a", "b"]') or a comma-separated
    string ('a, b, c'). Items are whitespace-trimmed and blank items are
    dropped. Returns [] when the variable is unset or empty.
    """
    value = os.getenv(key, default)
    if not value:  # covers both missing and empty-string values
        return []

    # Prefer JSON when the value looks like an array; fall back to CSV
    # parsing on any decode failure or non-list JSON.
    if value.startswith('[') and value.endswith(']'):
        try:
            parsed = json.loads(value)
            if isinstance(parsed, list):
                return [str(item).strip() for item in parsed if str(item).strip()]
        except json.JSONDecodeError:
            pass

    return [item.strip() for item in str(value).split(',') if item.strip()]

# Runtime configuration, all sourced from environment variables at import time.
DISABLED = get_env_bool("DISABLED", "False")
# One key is chosen at random per request in predict() for naive load balancing.
OPENAI_API_KEYS = get_env_list("OPENAI_API_KEYS", "")
NUM_THREADS = int(os.getenv("NUM_THREADS", "4"))
# Salt mixed into IP hashes so raw IPs are never stored; override in production.
IP_SALT = os.getenv("IP_SALT", "latamgpt-default-salt-2025")
HF_TOKEN = os.getenv("HF_TOKEN")
DATASET_REPO = os.getenv("DATASET_REPO", "latam-gpt/copuchat-conversations")

def exception_handler(exception_type, exception, traceback):
    # Print only "ExceptionType: message" for uncaught exceptions; combined
    # with tracebacklimit = 0 below this keeps stack traces out of the
    # public-facing logs of the hosted app.
    print(f"{exception_type.__name__}: {exception}")

sys.excepthook = exception_handler
sys.tracebacklimit = 0

def get_user_fingerprint(request, salt=None):
    """Extract the client IP and derive a salted, anonymized fingerprint.

    Prefers proxy-supplied headers (first entry of x-forwarded-for, then
    x-real-ip) and falls back to the transport-level client host. *salt*
    defaults to the module-level IP_SALT; it is exposed as a parameter for
    testability and reuse.

    Returns (real_ip, fingerprint) where the fingerprint is the first 16 hex
    chars of sha256(ip + ':' + salt).
    """
    if salt is None:
        salt = IP_SALT
    # request.client may be a dict or an object exposing a .host attribute,
    # depending on the server stack; the previous dict-only .get() call
    # raised AttributeError for object-style clients.
    client = getattr(request, 'client', None)
    if isinstance(client, dict):
        fallback_host = client.get('host', 'unknown')
    else:
        fallback_host = getattr(client, 'host', None) or 'unknown'
    real_ip = (
        request.headers.get('x-forwarded-for', '').split(',')[0].strip() or
        request.headers.get('x-real-ip', '') or
        fallback_host
    )
    fingerprint_data = f"{real_ip}:{salt}"
    user_fingerprint = hashlib.sha256(fingerprint_data.encode()).hexdigest()[:16]
    return real_ip, user_fingerprint

def get_country_from_ip(ip):
    """Geolocate *ip* via ip-api.com (best effort, 2 s timeout).

    Returns a dict with 'country', 'country_code' and 'region', falling back
    to 'Unknown'/'UN' placeholders on any network failure, parse failure, or
    non-200 response.
    """
    try:
        # NOTE(review): plain HTTP — the free ip-api.com tier does not serve
        # HTTPS; the IP is therefore sent in clear text to a third party.
        response = requests.get(f"http://ip-api.com/json/{ip}", timeout=2)
        if response.status_code == 200:
            data = response.json()
            return {
                "country": data.get('country', 'Unknown'),
                "country_code": data.get('countryCode', 'UN'),
                "region": data.get('regionName', 'Unknown')
            }
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding failures; geolocation is strictly
        # best-effort, so fall through to the placeholder result.
        pass
    return {"country": "Unknown", "country_code": "UN", "region": "Unknown"}

def generate_conversation_hash(session_id, user_fingerprint):
    """Return a stable 12-hex-char identifier for a (session, user) pair."""
    digest = hashlib.sha256(f"{session_id}:{user_fingerprint}".encode()).hexdigest()
    return digest[:12]

def generate_conversation_filename(session_id, user_fingerprint, timestamp):
    """Build the repo-relative JSONL path for a conversation.

    The microsecond timestamp prefix makes filenames sort chronologically;
    the trailing hash ties the file to its (session, user) pair.
    """
    stamp = timestamp.strftime('%Y%m%d_%H%M%S_%f')
    chash = generate_conversation_hash(session_id, user_fingerprint)
    return f"conversations/{stamp}_{chash}.jsonl"

def get_conversation_files():
    """List stored conversation JSONL paths in the dataset repo, sorted ascending.

    Returns [] when no HF token is configured or the repo cannot be listed
    (e.g. it does not exist yet or the network is unavailable).
    """
    if not HF_TOKEN:
        return []
    try:
        files = list_repo_files(repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
        return sorted(f for f in files if f.startswith("conversations/") and f.endswith(".jsonl"))
    except Exception as e:
        # Best-effort: a missing repo or network error just means "no files yet",
        # but log it instead of swallowing silently as before.
        print(f"Could not list conversation files: {e}")
        return []

def get_global_chat_counter():
    """Return the next global chat number: one past the stored-file count."""
    return len(get_conversation_files()) + 1

def find_existing_conversation(session_id, user_fingerprint):
    """Return the most recently stored conversation dict for this (session, user).

    Used to resume a conversation after a page reload within the same session.
    Returns None when no matching file exists or it cannot be fetched/parsed.
    """
    conversation_hash = generate_conversation_hash(session_id, user_fingerprint)
    conversation_files = get_conversation_files()

    matching_files = [f for f in conversation_files if f.endswith(f"_{conversation_hash}.jsonl")]

    if not matching_files:
        return None

    try:
        # Filenames sort lexicographically by timestamp prefix, so the last
        # match is the newest record.
        latest_file = matching_files[-1]
        local_file = hf_hub_download(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            filename=latest_file,
            token=HF_TOKEN
        )
        # NOTE: despite the .jsonl suffix, each file holds a single JSON object
        # (see upload_conversation), so json.load is correct here.
        with open(local_file, 'r') as f:
            return json.load(f)
    except Exception as e:
        # Best-effort resume: any download/parse failure just starts a fresh
        # chat, but log it instead of swallowing silently as before.
        print(f"Could not restore conversation: {e}")
        return None

def upload_conversation(conversation_data, session_id, user_fingerprint):
    """Serialize *conversation_data* to JSON and upload it to the HF dataset repo.

    Reuses the existing file for this (session, user) pair when one exists so a
    conversation keeps a single, continuously updated record. No-op when no
    HF token is configured; all failures are logged, never raised.
    """
    if not HF_TOKEN:
        return

    try:
        # exist_ok=True makes repo creation idempotent.
        create_repo(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            private=True,
            exist_ok=True,
            token=HF_TOKEN
        )
    except Exception:
        # Creation failures (e.g. permissions) shouldn't block the upload attempt.
        pass

    try:
        conversation_hash = generate_conversation_hash(session_id, user_fingerprint)
        matching_files = [
            f for f in get_conversation_files()
            if f.endswith(f"_{conversation_hash}.jsonl")
        ]

        if matching_files:
            filename = matching_files[-1]  # overwrite the newest existing record
        else:
            filename = generate_conversation_filename(session_id, user_fingerprint, datetime.now())

        with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
            json.dump(conversation_data, f)
            temp_path = f.name

        try:
            upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=filename,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
        finally:
            # Always remove the temp file, even when the upload raises
            # (the previous version leaked it on failure).
            os.unlink(temp_path)

    except Exception as e:
        print(f"Upload failed: {e}")

# Seed the process-wide chat counter from the number of files already stored
# in the dataset repo; predict() increments it on every turn.
GLOBAL_CHAT_COUNTER = get_global_chat_counter()
print(f"Starting global chat counter at: {GLOBAL_CHAT_COUNTER}")

def predict(inputs, top_p, temperature, chat_counter, chatbot, history, request: gr.Request):
    """Stream one chat turn through the OpenAI API and record the conversation.

    Generator wired to the Gradio events below: each yield is a tuple of
    (chatbot message pairs, history, chat_counter, status string,
    textbox update, button update). `history` is a flat list of alternating
    user/assistant strings; pairs are rebuilt from it for the Chatbot widget.
    """
    global GLOBAL_CHAT_COUNTER
    
    # No keys configured: surface an error in the status box and re-enable inputs.
    if not OPENAI_API_KEYS or not OPENAI_API_KEYS[0]:
        yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "No API keys configured", gr.update(interactive=True), gr.update(interactive=True)
        return
        
    # Naive load balancing: pick one of the configured keys at random.
    api_key = random.choice(OPENAI_API_KEYS)
    client = OpenAI(api_key=api_key)
    
    session_id = getattr(request, 'session_hash', 'unknown')
    real_ip, user_fingerprint = get_user_fingerprint(request)
    geo_info = get_country_from_ip(real_ip)
    # NOTE(review): assumes request.headers.raw yields (bytes, bytes) pairs —
    # confirm against the Gradio/Starlette version in use.
    headers_dict = {key.decode('utf-8'): value.decode('utf-8') for key, value in request.headers.raw}
    
    # On the first turn of a session, try to resume a previously stored conversation.
    existing_conversation = find_existing_conversation(session_id, user_fingerprint) if chat_counter == 0 else None
    
    if existing_conversation:
        history = existing_conversation['messages_history']
        chat_counter = existing_conversation['chat_counter']
        chatbot = [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)]
    
    # Rebuild the OpenAI message list: even indices are user turns, odd assistant.
    messages = []
    for i, data in enumerate(history):
        role = 'user' if i % 2 == 0 else 'assistant'
        messages.append({"role": role, "content": data})
    
    messages.append({"role": "user", "content": inputs})
    
    GLOBAL_CHAT_COUNTER += 1
    global_counter = GLOBAL_CHAT_COUNTER
    chat_counter += 1
    history.append(inputs)
    token_counter = 0
    partial_words = ""
    
    try:
        stream = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            stream=True,
            presence_penalty=0,
            frequency_penalty=0,
            max_tokens=4096
        )
        
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                partial_words += chunk.choices[0].delta.content
                if token_counter == 0:
                    # First chunk: append a new assistant slot to history.
                    # NOTE(review): the leading " " shows only on the very first
                    # partial render; later chunks overwrite it without the space.
                    history.append(" " + partial_words)
                else:
                    history[-1] = partial_words
                token_counter += 1
                # Keep the textbox and button disabled while streaming.
                yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "200", gr.update(interactive=False), gr.update(interactive=False)
        
        # Final yield re-enables the inputs once the stream is exhausted.
        yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "200", gr.update(interactive=True), gr.update(interactive=True)
                
    except Exception as e:
        print(f'Error API OpenAI: {e}')
        yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, str(e), gr.update(interactive=True), gr.update(interactive=True)
    
    # Persist the full turn; this runs even when the API call failed above,
    # in which case partial_words may be empty.
    conversation_data = {
        "session_id": session_id,
        "user_fingerprint": user_fingerprint,
        "conversation_id": f"{session_id}_{datetime.now().strftime('%Y%m%d_%H')}",
        "conversation_hash": generate_conversation_hash(session_id, user_fingerprint),
        "country": geo_info["country"],
        "country_code": geo_info["country_code"],
        "region": geo_info["region"],
        "chat_counter": chat_counter,
        "global_chat_counter": global_counter,
        "model": MODEL,
        "messages": messages,
        "messages_history": history,
        "response": partial_words,
        "headers": headers_dict,
        "temperature": temperature,
        "top_p": top_p,
        "token_counter": token_counter,
        "timestamp": datetime.now().isoformat(),
        "last_updated": datetime.now().isoformat()
    }
    
    # Log everything except the bulky message history, then upload to HF.
    print(json.dumps({k: v for k, v in conversation_data.items() if k != 'messages_history'}))
    upload_conversation(conversation_data, session_id, user_fingerprint)

def reset_textbox():
    """Clear the textbox and disable both input widgets while a reply streams."""
    cleared_box = gr.update(value='', interactive=False)
    disabled_button = gr.update(interactive=False)
    return cleared_box, disabled_button

# Page header HTML; swapped for a red rate-limit notice when the DISABLED
# kill switch is set. (Fixes mojibake-encoded accents and the "Porfavor" typo
# in the user-facing Spanish text.)
title = """<h1 align="center">Copuchat: Recolección de datos para LatamGPT</h1>"""
if DISABLED:
    title = """<h1 align="center" style="color:red">Esta app alcanzó su límite de uso. Por favor intenta reingresar mañana.</h1>"""

# NOTE(review): `description` is not referenced by the visible layout below —
# kept for reference/possible reuse; verify before deleting.
description = """Language models can be conditioned to act like dialogue agents through a conversational prompt that typically takes the form:
```
User: <utterance>
Assistant: <utterance>
User: <utterance>
Assistant: <utterance>
...
```
In this app, you can explore the outputs of GPT-4.1 mini while contributing to LatamGPT research.
"""

# UI layout: a hidden main chat column that is revealed only after the user
# accepts the consent dialog defined below.
with gr.Blocks(css="""
    #col_container { 
        margin-left: auto; 
        margin-right: auto;
        max-width: 1200px;
        width: 95%;
    }
    #chatbot {
        height: 1200px; 
        overflow: auto;
    }
    .gradio-container {
        max-width: unset !important;
    }
    #component-0 {
        max-width: unset;
    }
""") as demo:
    gr.HTML(title)
    
    # Main chat UI — starts hidden until consent is given.
    # NOTE(review): several Spanish strings below appear mojibake-encoded
    # (UTF-8 bytes read under a wrong codec); verify the file's encoding.
    with gr.Column(elem_id="col_container", visible=False) as main_block:
        chatbot = gr.Chatbot(elem_id='chatbot')
        inputs = gr.Textbox(
            placeholder="", 
            label="Escribe tu mensaje y presiona Enter",
            lines=3,
            max_lines=8,
            scale=1
        )
        # Flat alternating user/assistant history consumed by predict().
        state = gr.State([])
        
        with gr.Row():
            with gr.Column(scale=7):
                b1 = gr.Button(visible=not DISABLED)
            with gr.Column(scale=3):
                server_status_code = gr.Textbox(label="C贸digo de estado del servidor")
        
        with gr.Accordion("Par谩metros", open=False):
            top_p = gr.Slider(minimum=0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p (muestreo de n煤cleo)")
            temperature = gr.Slider(minimum=0, maximum=2.0, value=0.2, step=0.1, interactive=True, label="Temperatura")
            chat_counter = gr.Number(value=0, visible=False, precision=0)
    
    # Consent flow: the hidden checkbox is set from JS after the confirm()
    # dialog; its change event then swaps the consent block for the chat UI.
    with gr.Column(elem_id="user_consent_container") as user_consent_block:
        accept_checkbox = gr.Checkbox(visible=False)
        js = "(x) => confirm('Al hacer clic en \"Acepto\", acepto que mis datos pueden ser publicados o compartidos para investigaci贸n.')"
        
        with gr.Accordion("Consentimiento de Usuario para Recolecci贸n, Uso y Compartici贸n de Datos", open=True):
            gr.HTML("""
            <div>
                <p>Al usar nuestra aplicaci贸n, que funciona con la API de OpenAI, reconoces y aceptas los siguientes t茅rminos sobre los datos que proporcionas:</p>
                <ol>
                    <li><strong>Recolecci贸n:</strong> Podemos recopilar informaci贸n, incluyendo las entradas que escribes en nuestra aplicaci贸n, las salidas generadas por la API de OpenAI, y ciertos detalles t茅cnicos sobre tu dispositivo y conexi贸n (como tipo de navegador, sistema operativo y ubicaci贸n geogr谩fica) proporcionados por los headers de solicitud de tu dispositivo. Estos datos pasaran por un proceso de anonimizaci贸n para evitar la recolecci贸n de informaci贸n privada.</li>
                    <li><strong>Uso:</strong> Podemos usar los datos recopilados para prop贸sitos de investigaci贸n y desarrollo de LatamGPT.</li>
                    <li><strong>Compartici贸n y Publicaci贸n:</strong> Los datos recolectados, incluyendo los detalles t茅cnicos recopilados de los headers de solicitud de tu dispositivo, pueden ser publicados, compartidos con terceros, o usados para an谩lisis y prop贸sitos de reportes.</li>
                    <li><strong>Retenci贸n de Datos:</strong> Podemos retener tus datos anonimizados, incluyendo los detalles t茅cnicos recopilados de los headers de solicitud de tu dispositivo, por el tiempo que sea necesario.</li>
                </ol>
                <p>Al continuar usando nuestra aplicaci贸n, proporcionas tu consentimiento expl铆cito para la recolecci贸n, uso y potencial compartici贸n de tus datos como se describe arriba. Si no est谩s de acuerdo con nuestras pr谩cticas de recolecci贸n, uso y compartici贸n de datos, por favor no uses nuestra aplicaci贸n.</p>
                <p><strong>Este proyecto contribuye al desarrollo de LatamGPT, un modelo de lenguaje para Am茅rica Latina.</strong></p>
            </div>
            """)
            accept_button = gr.Button("Acepto / I Agree")

        def enable_inputs():
            # Hide the consent block, reveal the main chat column.
            return gr.update(visible=False), gr.update(visible=True)

    accept_button.click(None, None, accept_checkbox, js=js, queue=False)
    accept_checkbox.change(fn=enable_inputs, inputs=[], outputs=[user_consent_block, main_block], queue=False)

    # Disable/clear inputs first (queue=False runs immediately), then stream
    # the model reply; predict() re-enables the widgets via its yields.
    inputs.submit(reset_textbox, [], [inputs, b1], queue=False)
    inputs.submit(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1])
    b1.click(reset_textbox, [], [inputs, b1], queue=False)
    b1.click(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1])

# Launch the Gradio app when run as a script (default host/port).
if __name__ == "__main__":
    demo.launch()