Spaces:

TaquitoTomatoe
/

MojicaPoC

Sleeping

App Files Files Community

Carlos Isael Ramírez González committed on Sep 2

Commit

56ff037

1 Parent(s): a3373fd

Modelo nuevo completado

Browse files

Files changed (10) hide show

app.py +15 -9
config.py +3 -2
examples.json +282 -0
intelligent_question_router.py +25 -0
load_json.py +14 -0
memory.py +23 -44
mojica_agent.py +234 -229
requirements.txt +0 -0
semantic_classifier.py +63 -0
supervised_classifier.py +93 -0

app.py CHANGED Viewed

@@ -9,33 +9,39 @@ app = FastAPI()
 mojica_bot = MojicaAgent(Config)
 # * Esquema de entrada como marshmallow
-class QuestionRequest(BaseModel):
     question: str
 class AnswerResponse(BaseModel):
-    sql: str
     result: Any
 @app.post("/")
 def ask_question(req: QuestionRequest):
     sql, result = mojica_bot.consult(req.question)
     # Si es dataframe lo convertimos a json
-    if isinstance(result, pd.DataFrame):
         result = result.to_dict(orient="records")
     return {"sql": sql, "result": result}
     # return {"sql": "WASA"}
 # @app.post("/", response_model=AnswerResponse)
-# def ask_question(req: QuestionRequest):
 #     sql, result = mojica_bot.consult(req.question)
 #     # * Si es dataframe lo convertimos a json
-#     if isinstance(result, pd.DataFrame):
 #         result = result.to_dict(orient="records")
 #     return {"sql": sql, "result": result}
 # @app.get("/")

 mojica_bot = MojicaAgent(Config)
 # * Esquema de entrada como marshmallow
+class QuestionRequest(BaseModel):
     question: str
+    number: str
 class AnswerResponse(BaseModel):
+    sql: str
     result: Any
 @app.post("/")
 def ask_question(req: QuestionRequest):
     sql, result = mojica_bot.consult(req.question)
     # Si es dataframe lo convertimos a json
+    if isinstance(result, pd.DataFrame):
         result = result.to_dict(orient="records")
     return {"sql": sql, "result": result}
     # return {"sql": "WASA"}
 # @app.post("/", response_model=AnswerResponse)
+# def ask_question(req: QuestionRequest):
 #     sql, result = mojica_bot.consult(req.question)
 #     # * Si es dataframe lo convertimos a json
+#     if isinstance(result, pd.DataFrame):
 #         result = result.to_dict(orient="records")
 #     return {"sql": sql, "result": result}
 # @app.get("/")

config.py CHANGED Viewed

@@ -5,11 +5,12 @@ from huggingface_hub import hf_hub_download
 class Config:
     DB_PATH = '/tmp/dataset.db'
     TABLE_NAME = 'mojica_Ventas'
     MODEL_NAME = "ibm-granite/granite-3b-code-instruct-128k"
     CSV_PATH = "/kaggle/input/mojica-hoja-1/mojica_hoja_1.csv"
     MAX_HISTORY = 3  # Mantener las últimas 3 interacciones (memoria)
-    MAX_TOKENS = 8_000
-    MAX_NEW_TOKENS = 400
     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 if not os.path.exists(Config.DB_PATH):

 class Config:
     DB_PATH = '/tmp/dataset.db'
     TABLE_NAME = 'mojica_Ventas'
+    EXMAPLES_JSON = '/examples.json'
     MODEL_NAME = "ibm-granite/granite-3b-code-instruct-128k"
     CSV_PATH = "/kaggle/input/mojica-hoja-1/mojica_hoja_1.csv"
     MAX_HISTORY = 3  # Mantener las últimas 3 interacciones (memoria)
+    MAX_TOKENS = 4096
+    MAX_NEW_TOKENS = 256
     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 if not os.path.exists(Config.DB_PATH):

examples.json ADDED Viewed

	@@ -0,0 +1,282 @@

+{
+    "PRODUCTOS": [
+        {
+            "pregunta": "Top 5 productos más vendidos este año",
+            "query": "SELECT \"Descripcion\", SUM(\"Cantidad\") AS total_vendido\nFROM \"sells\"\nWHERE \"Descripcion\" IS NOT NULL AND strftime('%Y', \"Fecha\") = '2025'\nGROUP BY \"Descripcion\"\nORDER BY total_vendido DESC\nLIMIT 5;"
+        },
+        {
+            "pregunta": "Productos con mayor margen de ganancia",
+            "query": "SELECT \"Descripcion\", (SUM(\"Neto\") / SUM(\"Cantidad\")) AS margen_unitario\nFROM \"sells\"\nWHERE \"Descripcion\" IS NOT NULL AND \"Cantidad\" > 0\nGROUP BY \"Descripcion\"\nHAVING SUM(\"Cantidad\") > 30\nORDER BY margen_unitario DESC\nLIMIT 10;"
+        },
+        {
+            "pregunta": "Productos con menor rotación en inventario",
+            "query": "SELECT \"Descripcion\", SUM(\"Cantidad\") AS total_vendido\nFROM \"sells\"\nWHERE \"Descripcion\" IS NOT NULL AND \"Fecha\" BETWEEN DATE('now','-6 months') AND DATE('now')\nGROUP BY \"Descripcion\"\nORDER BY total_vendido ASC\nLIMIT 5;"
+        }
+    ],
+    "CLIENTES_CERO": [
+        {
+            "pregunta": "Total de clientes cero actuales",
+            "query": "SELECT COUNT(DISTINCT \"Cliente\") AS clientes_cero\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") > DATE('now', '-28 day')\n);"
+        },
+        {
+            "pregunta": "Clientes cero que fueron VIP anteriormente",
+            "query": "SELECT \"Cliente\", \"Razon Social\", SUM(\"Neto\") AS historial_compra\nFROM \"sells\"\nWHERE \"Cliente\" IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE \"Cliente\" NOT IN (\n        SELECT DISTINCT \"Cliente\"\n        FROM \"sells\"\n        WHERE DATE(\"Fecha\") > DATE('now', '-28 day')\n    )\n) AND \"Cliente\" IS NOT NULL\nGROUP BY \"Cliente\", \"Razon Social\"\nHAVING SUM(\"Neto\") > 5000\nORDER BY historial_compra DESC;"
+        },
+        {
+            "pregunta": "Clientes cero por antigüedad de inactividad",
+            "query": "SELECT \"Cliente\", \"Razon Social\", MAX(DATE(\"Fecha\")) AS ultima_compra,\n(julianday('now') - julianday(MAX(\"Fecha\"))) AS dias_inactivo\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") > DATE('now', '-28 day')\n)\nGROUP BY \"Cliente\", \"Razon Social\"\nORDER BY dias_inactivo DESC\nLIMIT 10;"
+        }
+    ],
+    "CIUDAD_CERO": [
+        {
+            "pregunta": "Clientes cero por ciudad",
+            "query": "SELECT \"Ciudad\", COUNT(DISTINCT \"Cliente\") AS clientes_cero\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") > DATE('now', '-28 day')\n)\nGROUP BY \"Ciudad\"\nORDER BY clientes_cero DESC;"
+        },
+        {
+            "pregunta": "Ciudad con mayor porcentaje de clientes cero",
+            "query": "SELECT \"Ciudad\", \n(COUNT(DISTINCT CASE WHEN \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") > DATE('now', '-28 day')\n) THEN \"Cliente\" END) * 100.0 / COUNT(DISTINCT \"Cliente\")) AS porcentaje_cero\nFROM \"sells\"\nWHERE \"Ciudad\" IS NOT NULL\nGROUP BY \"Ciudad\"\nORDER BY porcentaje_cero DESC\nLIMIT 5;"
+        },
+        {
+            "pregunta": "Clientes cero en Guadalajara con historial de compras",
+            "query": "SELECT \"Cliente\", \"Razon Social\", SUM(\"Neto\") AS historial_compra\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Guadalajara%'\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") > DATE('now', '-28 day')\n)\nGROUP BY \"Cliente\", \"Razon Social\"\nHAVING SUM(\"Neto\") > 1000\nORDER BY historial_compra DESC;"
+        }
+    ],
+    "PRODUCTO_CIUDAD": [
+        {
+            "pregunta": "Producto más vendido por ciudad",
+            "query": "SELECT \"Ciudad\", \"Descripcion\", SUM(\"Cantidad\") AS total_vendido\nFROM \"sells\"\nWHERE \"Ciudad\" IS NOT NULL AND \"Descripcion\" IS NOT NULL\nGROUP BY \"Ciudad\", \"Descripcion\"\nORDER BY \"Ciudad\", total_vendido DESC;"
+        },
+        {
+            "pregunta": "Ciudades donde el queso gouda es más popular",
+            "query": "SELECT \"Ciudad\", SUM(\"Cantidad\") AS total_vendido\nFROM \"sells\"\nWHERE \"Descripcion\" LIKE '%Queso Gouda%' AND \"Ciudad\" IS NOT NULL\nGROUP BY \"Ciudad\"\nORDER BY total_vendido DESC\nLIMIT 5;"
+        },
+        {
+            "pregunta": "Productos exclusivos por ciudad",
+            "query": "SELECT \"Ciudad\", \"Descripcion\"\nFROM \"sells\"\nWHERE \"Descripcion\" IS NOT NULL AND \"Ciudad\" IS NOT NULL\nGROUP BY \"Ciudad\", \"Descripcion\"\nHAVING COUNT(DISTINCT \"Ciudad\") = 1\nORDER BY \"Ciudad\";"
+        }
+    ],
+    "CLIENTE_CIUDAD": [
+        {
+            "pregunta": "Clientes más valiosos por ciudad",
+            "query": "SELECT \"Ciudad\", \"Cliente\", \"Razon Social\", SUM(\"Neto\") AS valor_total\nFROM \"sells\"\nWHERE \"Ciudad\" IS NOT NULL AND \"Cliente\" IS NOT NULL\nGROUP BY \"Ciudad\", \"Cliente\", \"Razon Social\"\nORDER BY \"Ciudad\", valor_total DESC;"
+        },
+        {
+            "pregunta": "Ciudad con clientes más frecuentes",
+            "query": "SELECT \"Ciudad\", AVG(compras_por_cliente) AS frecuencia_promedio\nFROM (\n    SELECT \"Ciudad\", \"Cliente\", COUNT(*) AS compras_por_cliente\n    FROM \"sells\"\n    WHERE \"Ciudad\" IS NOT NULL AND \"Cliente\" IS NOT NULL\n    GROUP BY \"Ciudad\", \"Cliente\"\n)\nGROUP BY \"Ciudad\"\nORDER BY frecuencia_promedio DESC;"
+        },
+        {
+            "pregunta": "Distribución de clientes por ciudad",
+            "query": "SELECT \"Ciudad\", COUNT(DISTINCT \"Cliente\") AS total_clientes\nFROM \"sells\"\nWHERE \"Ciudad\" IS NOT NULL AND \"Cliente\" IS NOT NULL\nGROUP BY \"Ciudad\"\nORDER BY total_clientes DESC;"
+        }
+    ],
+    "PRODUCTO_DINERO": [
+        {
+            "pregunta": "Productos que generan más ingresos",
+            "query": "SELECT \"Descripcion\", SUM(\"Neto\") AS ingresos_totales\nFROM \"sells\"\nWHERE \"Descripcion\" IS NOT NULL\nGROUP BY \"Descripcion\"\nORDER BY ingresos_totales DESC\nLIMIT 10;"
+        },
+        {
+            "pregunta": "Rentabilidad por categoría de producto",
+            "query": "SELECT \nCASE \n    WHEN \"Descripcion\" LIKE '%queso%' THEN 'Lácteos'\n    WHEN \"Descripcion\" LIKE '%pan%' THEN 'Panadería'\n    WHEN \"Descripcion\" LIKE '%bebida%' THEN 'Bebidas'\n    ELSE 'Otros'\nEND AS categoria,\nSUM(\"Neto\") AS ingresos_totales\nFROM \"sells\"\nWHERE \"Descripcion\" IS NOT NULL\nGROUP BY categoria\nORDER BY ingresos_totales DESC;"
+        },
+        {
+            "pregunta": "Productos con mejor margen por unidad",
+            "query": "SELECT \"Descripcion\", (SUM(\"Neto\") / SUM(\"Cantidad\")) AS margen_unitario\nFROM \"sells\"\nWHERE \"Descripcion\" IS NOT NULL AND \"Cantidad\" > 0\nGROUP BY \"Descripcion\"\nHAVING SUM(\"Cantidad\") > 20\nORDER BY margen_unitario DESC\nLIMIT 10;"
+        }
+    ],
+    "CIUDAD_DINERO": [
+        {
+            "pregunta": "Ciudades con mayor facturación",
+            "query": "SELECT \"Ciudad\", SUM(\"Neto\") AS ingresos_totales\nFROM \"sells\"\nWHERE \"Ciudad\" IS NOT NULL\nGROUP BY \"Ciudad\"\nORDER BY ingresos_totales DESC\nLIMIT 5;"
+        },
+        {
+            "pregunta": "Crecimiento de ventas por ciudad",
+            "query": "SELECT \"Ciudad\", \n(SUM(CASE WHEN strftime('%Y-%m', \"Fecha\") = '2025-06' THEN \"Neto\" ELSE 0 END) - \nSUM(CASE WHEN strftime('%Y-%m', \"Fecha\") = '2025-05' THEN \"Neto\" ELSE 0 END)) AS crecimiento_mensual\nFROM \"sells\"\nWHERE \"Ciudad\" IS NOT NULL\nGROUP BY \"Ciudad\"\nORDER BY crecimiento_mensual DESC;"
+        },
+        {
+            "pregunta": "Ticket promedio por ciudad",
+            "query": "SELECT \"Ciudad\", AVG(\"Neto\") AS ticket_promedio\nFROM \"sells\"\nWHERE \"Ciudad\" IS NOT NULL\nGROUP BY \"Ciudad\"\nORDER BY ticket_promedio DESC;"
+        }
+    ],
+    "CLIENTE_DINERO": [
+        {
+            "pregunta": "Top 10 clientes por valor de compras",
+            "query": "SELECT \"Cliente\", \"Razon Social\", SUM(\"Neto\") AS valor_total\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL\nGROUP BY \"Cliente\", \"Razon Social\"\nORDER BY valor_total DESC\nLIMIT 10;"
+        },
+        {
+            "pregunta": "Clientes con mayor frecuencia de compra de alto valor",
+            "query": "SELECT \"Cliente\", \"Razon Social\", COUNT(*) AS compras, AVG(\"Neto\") AS ticket_promedio\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL AND \"Neto\" > 500\nGROUP BY \"Cliente\", \"Razon Social\"\nORDER BY compras DESC\nLIMIT 10;"
+        },
+        {
+            "pregunta": "Clientes con mayor potencial de crecimiento",
+            "query": "SELECT \"Cliente\", \"Razon Social\", \n(MAX(\"Neto\") - AVG(\"Neto\")) AS potencial_crecimiento\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL\nGROUP BY \"Cliente\", \"Razon Social\"\nHAVING COUNT(*) > 3\nORDER BY potencial_crecimiento DESC\nLIMIT 10;"
+        }
+    ],
+    "TIEMPO_DINERO": [
+        {
+            "pregunta": "Ventas mensuales del año actual",
+            "query": "SELECT strftime('%Y-%m', \"Fecha\") AS mes, SUM(\"Neto\") AS ventas_mensuales\nFROM \"sells\"\nWHERE \"Fecha\" IS NOT NULL AND strftime('%Y', \"Fecha\") = '2025'\nGROUP BY mes\nORDER BY mes;"
+        },
+        {
+            "pregunta": "Crecimiento interanual de ventas",
+            "query": "SELECT \nSUM(CASE WHEN strftime('%Y', \"Fecha\") = '2025' THEN \"Neto\" ELSE 0 END) AS ventas_2025,\nSUM(CASE WHEN strftime('%Y', \"Fecha\") = '2024' THEN \"Neto\" ELSE 0 END) AS ventas_2024,\n((SUM(CASE WHEN strftime('%Y', \"Fecha\") = '2025' THEN \"Neto\" ELSE 0 END) - \nSUM(CASE WHEN strftime('%Y', \"Fecha\") = '2024' THEN \"Neto\" ELSE 0 END)) * 100.0 / \nSUM(CASE WHEN strftime('%Y', \"Fecha\") = '2024' THEN \"Neto\" ELSE 0 END)) AS crecimiento_porcentual\nFROM \"sells\";"
+        },
+        {
+            "pregunta": "Ventas por día de la semana",
+            "query": "SELECT \nCASE strftime('%w', \"Fecha\")\n    WHEN '0' THEN 'Domingo'\n    WHEN '1' THEN 'Lunes'\n    WHEN '2' THEN 'Martes'\n    WHEN '3' THEN 'Miércoles'\n    WHEN '4' THEN 'Jueves'\n    WHEN '5' THEN 'Viernes'\n    WHEN '6' THEN 'Sábado'\nEND AS dia_semana,\nSUM(\"Neto\") AS ventas_dia\nFROM \"sells\"\nWHERE \"Fecha\" IS NOT NULL\nGROUP BY dia_semana\nORDER BY ventas_dia DESC;"
+        }
+    ],
+    "CIUDADES": [
+        {
+            "pregunta": "¿Cuántas ciudades diferentes tenemos registradas?",
+            "query": "SELECT COUNT(DISTINCT \"Ciudad\") AS total_ciudades\nFROM \"sells\"\nWHERE \"Ciudad\" IS NOT NULL;"
+        },
+        {
+            "pregunta": "Lista de las 10 ciudades con más transacciones",
+            "query": "SELECT \"Ciudad\", COUNT(*) AS total_transacciones\nFROM \"sells\"\nWHERE \"Ciudad\" IS NOT NULL\nGROUP BY \"Ciudad\"\nORDER BY total_transacciones DESC\nLIMIT 10;"
+        },
+        {
+            "pregunta": "Nombra 15 ciudades al azar de nuestra base",
+            "query": "SELECT DISTINCT \"Ciudad\"\nFROM \"sells\"\nWHERE \"Ciudad\" IS NOT NULL\nORDER BY RANDOM()\nLIMIT 15;"
+        }
+    ],
+    "CLIENTES": [
+        {
+            "pregunta": "¿Cuántos clientes únicos tenemos registrados?",
+            "query": "SELECT COUNT(DISTINCT \"Cliente\") AS total_clientes\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL;"
+        },
+        {
+            "pregunta": "Muestra 10 clientes al azar con su razón social",
+            "query": "SELECT DISTINCT \"Cliente\", \"Razon Social\"\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL\nORDER BY RANDOM()\nLIMIT 10;"
+        },
+        {
+            "pregunta": "Clientes con nombres más largos en la base",
+            "query": "SELECT \"Cliente\", \"Razon Social\", LENGTH(\"Razon Social\") AS longitud_nombre\nFROM \"sells\"\nWHERE \"Razon Social\" IS NOT NULL\nGROUP BY \"Cliente\", \"Razon Social\"\nORDER BY longitud_nombre DESC\nLIMIT 10;"
+        }
+    ],
+    "FECHAS": [
+        {
+            "pregunta": "¿Desde cuándo tenemos registros de ventas?",
+            "query": "SELECT MIN(DATE(\"Fecha\")) AS primera_fecha, MAX(DATE(\"Fecha\")) AS ultima_fecha\nFROM \"sells\"\nWHERE \"Fecha\" IS NOT NULL;"
+        },
+        {
+            "pregunta": "¿Cuántos días diferentes con ventas tenemos?",
+            "query": "SELECT COUNT(DISTINCT DATE(\"Fecha\")) AS dias_con_ventas\nFROM \"sells\"\nWHERE \"Fecha\" IS NOT NULL;"
+        },
+        {
+            "pregunta": "Rango de fechas de nuestro historial",
+            "query": "SELECT MIN(\"Fecha\") AS inicio_historial, MAX(\"Fecha\") AS fin_historial,\nJULIANDAY(MAX(\"Fecha\")) - JULIANDAY(MIN(\"Fecha\")) AS dias_totales\nFROM \"sells\"\nWHERE \"Fecha\" IS NOT NULL;"
+        }
+    ],
+    "VOLUMEN_VENTAS": [
+        {
+            "pregunta": "¿Cuántas transacciones totales tenemos?",
+            "query": "SELECT COUNT(*) AS total_transacciones\nFROM \"sells\";"
+        },
+        {
+            "pregunta": "Promedio de ventas por día",
+            "query": "SELECT AVG(ventas_dia) AS promedio_ventas_diarias\nFROM (\n    SELECT DATE(\"Fecha\"), SUM(\"Neto\") AS ventas_dia\n    FROM \"sells\"\n    WHERE \"Fecha\" IS NOT NULL\n    GROUP BY DATE(\"Fecha\")\n);"
+        },
+        {
+            "pregunta": "Día con mayor número de transacciones",
+            "query": "SELECT DATE(\"Fecha\") AS dia, COUNT(*) AS total_transacciones\nFROM \"sells\"\nWHERE \"Fecha\" IS NOT NULL\nGROUP BY dia\nORDER BY total_transacciones DESC\nLIMIT 1;"
+        }
+    ],
+    "ESTADISTICAS_BASICAS": [
+        {
+            "pregunta": "Valor promedio de una venta",
+            "query": "SELECT AVG(\"Neto\") AS valor_promedio_venta\nFROM \"sells\"\nWHERE \"Neto\" IS NOT NULL;"
+        },
+        {
+            "pregunta": "Cantidad promedio de productos por venta",
+            "query": "SELECT AVG(\"Cantidad\") AS cantidad_promedio\nFROM \"sells\"\nWHERE \"Cantidad\" IS NOT NULL;"
+        },
+        {
+            "pregunta": "Valor máximo y mínimo de una venta",
+            "query": "SELECT MAX(\"Neto\") AS venta_maxima, MIN(\"Neto\") AS venta_minima\nFROM \"sells\"\nWHERE \"Neto\" IS NOT NULL;"
+        }
+    ],
+    "CATEGORIAS_PRODUCTOS": [
+        {
+            "pregunta": "¿Cuáles categorías de productos tenemos?",
+            "query": "SELECT DISTINCT \nCASE \n    WHEN \"Descripcion\" LIKE '%QUESO%' OR \"Descripcion\" LIKE '%QUESO%' THEN 'Quesos'\n    WHEN \"Descripcion\" LIKE '%LECHE%' OR \"Descripcion\" LIKE '%LACTEO%' THEN 'Lácteos'\n    WHEN \"Descripcion\" LIKE '%PAN%' OR \"Descripcion\" LIKE '%BOLLERIA%' THEN 'Panadería'\n    WHEN \"Descripcion\" LIKE '%BEBIDA%' OR \"Descripcion\" LIKE '%REFRESCO%' THEN 'Bebidas'\n    WHEN \"Descripcion\" LIKE '%EMBUTIDO%' OR \"Descripcion\" LIKE '%SALCHICHA%' THEN 'Embutidos'\n    ELSE 'Otros'\nEND AS categoria\nFROM \"sells\"\nWHERE \"Descripcion\" IS NOT NULL\nORDER BY categoria;"
+        },
+        {
+            "pregunta": "Cantidad de productos por categoría",
+            "query": "SELECT \nCASE \n    WHEN \"Descripcion\" LIKE '%QUESO%' OR \"Descripcion\" LIKE '%QUESO%' THEN 'Quesos'\n    WHEN \"Descripcion\" LIKE '%LECHE%' OR \"Descripcion\" LIKE '%LACTEO%' THEN 'Lácteos'\n    WHEN \"Descripcion\" LIKE '%PAN%' OR \"Descripcion\" LIKE '%BOLLERIA%' THEN 'Panadería'\n    WHEN \"Descripcion\" LIKE '%BEBIDA%' OR \"Descripcion\" LIKE '%REFRESCO%' THEN 'Bebidas'\n    WHEN \"Descripcion\" LIKE '%EMBUTIDO%' OR \"Descripcion\" LIKE '%SALCHICHA%' THEN 'Embutidos'\n    ELSE 'Otros'\nEND AS categoria,\nCOUNT(DISTINCT \"Descripcion\") AS cantidad_productos\nFROM \"sells\"\nWHERE \"Descripcion\" IS NOT NULL\nGROUP BY categoria\nORDER BY cantidad_productos DESC;"
+        },
+        {
+            "pregunta": "Categorías con mayor variedad de productos",
+            "query": "SELECT \nCASE \n    WHEN \"Descripcion\" LIKE '%QUESO%' OR \"Descripcion\" LIKE '%QUESO%' THEN 'Quesos'\n    WHEN \"Descripcion\" LIKE '%LECHE%' OR \"Descripcion\" LIKE '%LACTEO%' THEN 'Lácteos'\n    WHEN \"Descripcion\" LIKE '%PAN%' OR \"Descripcion\" LIKE '%BOLLERIA%' THEN 'Panadería'\n    WHEN \"Descripcion\" LIKE '%BEBIDA%' OR \"Descripcion\" LIKE '%REFRESCO%' THEN 'Bebidas'\n    WHEN \"Descripcion\" LIKE '%EMBUTIDO%' OR \"Descripcion\" LIKE '%SALCHICHA%' THEN 'Embutidos'\n    ELSE 'Otros'\nEND AS categoria,\nCOUNT(DISTINCT \"Descripcion\") AS variedad_productos\nFROM \"sells\"\nWHERE \"Descripcion\" IS NOT NULL\nGROUP BY categoria\nORDER BY variedad_productos DESC\nLIMIT 3;"
+        }
+    ],
+    "CLIENTES_CERO_TIEMPO": [
+        {
+            "pregunta": "¿Cuántos clientes cero tuvimos en Marzo 2025?",
+            "query": "SELECT COUNT(DISTINCT \"Cliente\") AS clientes_cero\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL\nAND DATE(\"Fecha\") <= DATE('2025-03-31')\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") BETWEEN DATE('2025-03-31','-27 day') AND DATE('2025-03-31')\n);"
+        },
+        {
+            "pregunta": "¿Cuántos clientes cero tuvimos en el primer trimestre de 2025?",
+            "query": "SELECT COUNT(DISTINCT \"Cliente\") AS clientes_cero\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL\nAND DATE(\"Fecha\") <= DATE('2025-03-31')\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") BETWEEN DATE('2025-03-31','-27 day') AND DATE('2025-03-31')\n);"
+        },
+        {
+            "pregunta": "¿Cuántos clientes cero tuvimos en el último semestre?",
+            "query": "SELECT COUNT(DISTINCT \"Cliente\") AS clientes_cero\nFROM \"sells\"\nWHERE \"Cliente\" IS NOT NULL\nAND DATE(\"Fecha\") <= DATE('now','-6 months')\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") BETWEEN DATE('now','-6 months','-27 day') AND DATE('now','-6 months')\n);"
+        }
+    ],
+    "CIUDAD_CERO_TIEMPO": [
+        {
+            "pregunta": "¿Cuántos clientes cero tuvimos en Guadalajara en Marzo 2025?",
+            "query": "SELECT COUNT(DISTINCT \"Cliente\") AS clientes_cero\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Guadalajara%'\nAND \"Cliente\" IS NOT NULL\nAND DATE(\"Fecha\") <= DATE('2025-03-31')\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") BETWEEN DATE('2025-03-31','-27 day') AND DATE('2025-03-31')\n);"
+        },
+        {
+            "pregunta": "¿Cuántos clientes cero tuvimos en Monterrey en Abril 2025?",
+            "query": "SELECT COUNT(DISTINCT \"Cliente\") AS clientes_cero\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Monterrey%'\nAND \"Cliente\" IS NOT NULL\nAND DATE(\"Fecha\") <= DATE('2025-04-30')\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") BETWEEN DATE('2025-04-30','-27 day') AND DATE('2025-04-30')\n);"
+        },
+        {
+            "pregunta": "¿Cuántos clientes cero tuvimos en Zapopan en el último mes?",
+            "query": "SELECT COUNT(DISTINCT \"Cliente\") AS clientes_cero\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Zapopan%'\nAND \"Cliente\" IS NOT NULL\nAND DATE(\"Fecha\") <= DATE('now','-1 month')\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") BETWEEN DATE('now','-1 month','-27 day') AND DATE('now','-1 month')\n);"
+        }
+    ],
+    "PRODUCTO_CIUDAD_TIEMPO": [
+        {
+            "pregunta": "¿Qué productos se vendieron más en Guadalajara en Marzo?",
+            "query": "SELECT \"Descripcion\", SUM(\"Cantidad\") AS total_vendido\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Guadalajara%'\nAND \"Descripcion\" IS NOT NULL\nAND strftime('%Y-%m', \"Fecha\") = '2025-03'\nGROUP BY \"Descripcion\"\nORDER BY total_vendido DESC\nLIMIT 5;"
+        },
+        {
+            "pregunta": "¿Cuál fue el producto más vendido en Monterrey en Abril?",
+            "query": "SELECT \"Descripcion\", SUM(\"Cantidad\") AS total_vendido\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Monterrey%'\nAND \"Descripcion\" IS NOT NULL\nAND strftime('%Y-%m', \"Fecha\") = '2025-04'\nGROUP BY \"Descripcion\"\nORDER BY total_vendido DESC\nLIMIT 1;"
+        },
+        {
+            "pregunta": "Productos con mayor crecimiento en Zapopan el último trimestre",
+            "query": "SELECT \"Descripcion\", \n(SUM(CASE WHEN strftime('%Y-%m', \"Fecha\") = strftime('%Y-%m','now') THEN \"Cantidad\" ELSE 0 END) - \nSUM(CASE WHEN strftime('%Y-%m', \"Fecha\") = strftime('%Y-%m','now','-3 months') THEN \"Cantidad\" ELSE 0 END)) AS crecimiento\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Zapopan%'\nAND \"Descripcion\" IS NOT NULL\nGROUP BY \"Descripcion\"\nORDER BY crecimiento DESC\nLIMIT 5;"
+        }
+    ],
+    "CLIENTE_CIUDAD_TIEMPO": [
+        {
+            "pregunta": "Clientes más valiosos de Guadalajara en el último trimestre",
+            "query": "SELECT \"Cliente\", \"Razon Social\", SUM(\"Neto\") AS valor_total\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Guadalajara%'\nAND \"Cliente\" IS NOT NULL\nAND \"Fecha\" BETWEEN DATE('now','-3 months') AND DATE('now')\nGROUP BY \"Cliente\", \"Razon Social\"\nORDER BY valor_total DESC\nLIMIT 10;"
+        },
+        {
+            "pregunta": "Clientes nuevos en Monterrey durante Marzo",
+            "query": "SELECT \"Cliente\", \"Razon Social\", MIN(DATE(\"Fecha\")) AS primera_compra\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Monterrey%'\nAND \"Cliente\" IS NOT NULL\nAND strftime('%Y-%m', \"Fecha\") = '2025-03'\nGROUP BY \"Cliente\", \"Razon Social\"\nHAVING MIN(DATE(\"Fecha\")) >= DATE('2025-03-01')\nORDER BY primera_compra;"
+        },
+        {
+            "pregunta": "Clientes inactivos en Zapopan que compraron hace 6 meses",
+            "query": "SELECT \"Cliente\", \"Razon Social\", MAX(DATE(\"Fecha\")) AS ultima_compra\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Zapopan%'\nAND \"Cliente\" IS NOT NULL\nAND \"Cliente\" NOT IN (\n    SELECT DISTINCT \"Cliente\"\n    FROM \"sells\"\n    WHERE DATE(\"Fecha\") > DATE('now','-28 day')\n)\nAND DATE(\"Fecha\") BETWEEN DATE('now','-6 months') AND DATE('now','-5 months')\nGROUP BY \"Cliente\", \"Razon Social\";"
+        }
+    ],
+    "TIEMPO_DINERO_CIUDAD": [
+        {
+            "pregunta": "Ventas totales por mes en Guadalajara",
+            "query": "SELECT strftime('%Y-%m', \"Fecha\") AS mes, SUM(\"Neto\") AS ventas_mensuales\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Guadalajara%'\nAND \"Fecha\" IS NOT NULL\nGROUP BY mes\nORDER BY mes DESC;"
+        },
+        {
+            "pregunta": "Crecimiento de ventas en Monterrey por trimestre",
+            "query": "SELECT \nCASE \n    WHEN strftime('%m', \"Fecha\") BETWEEN '01' AND '03' THEN 'Q1'\n    WHEN strftime('%m', \"Fecha\") BETWEEN '04' AND '06' THEN 'Q2'\n    WHEN strftime('%m', \"Fecha\") BETWEEN '07' AND '09' THEN 'Q3'\n    WHEN strftime('%m', \"Fecha\") BETWEEN '10' AND '12' THEN 'Q4'\nEND AS trimestre,\nSUM(\"Neto\") AS ventas_trimestrales\nFROM \"sells\"\nWHERE \"Ciudad\" LIKE '%Monterrey%'\nAND \"Fecha\" IS NOT NULL\nGROUP BY trimestre\nORDER BY trimestre;"
+        },
+        {
+            "pregunta": "Comparativo de ventas: Guadalajara vs Monterrey por mes",
+            "query": "SELECT strftime('%Y-%m', \"Fecha\") AS mes,\nSUM(CASE WHEN \"Ciudad\" LIKE '%Guadalajara%' THEN \"Neto\" ELSE 0 END) AS ventas_guadalajara,\nSUM(CASE WHEN \"Ciudad\" LIKE '%Monterrey%' THEN \"Neto\" ELSE 0 END) AS ventas_monterrey\nFROM \"sells\"\nWHERE \"Fecha\" IS NOT NULL\nGROUP BY mes\nORDER BY mes DESC;"
+        }
+    ]
+}

intelligent_question_router.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from supervised_classifier import QuestionClassifier
+import torch
+import json
class IntelligentQuestionRouter:
    """Select the few-shot SQL examples that match a user's question.

    A ``QuestionClassifier`` predicts a category for the question, and the
    examples stored under that category in the examples JSON file are
    returned. When the category is unknown, or the classifier fails, the
    examples of ``DEFAULT_CATEGORY`` are served instead.
    """

    # Category used as the fallback for unknown predictions and for errors.
    DEFAULT_CATEGORY = "CLIENTES_CERO"

    def __init__(self, examples_path: str = "examples.json"):
        """Build the classifier and load the examples file.

        Args:
            examples_path: JSON file mapping category names to example
                lists. The default is the relative path that used to be
                hard-coded, so existing callers are unaffected.
        """
        self.examples_path = examples_path
        self.classifier = QuestionClassifier()
        self._initialize_json()

    def _initialize_json(self):
        """Read the examples file into ``self.examples``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(self.examples_path, "r", encoding="utf-8") as f:
            self.examples = json.load(f)

    def _get_examples(self, category):
        """Return the examples for *category*, or the default ones.

        The fallback is looked up lazily. Passing it as the default of
        ``dict.get`` evaluated it on every call, so a perfectly valid
        category raised ``KeyError`` whenever the fallback category was
        missing from the file.

        Returns:
            The examples stored under *category*; otherwise those of
            ``DEFAULT_CATEGORY``; otherwise an empty list.
        """
        if category in self.examples:
            return self.examples[category]
        return self.examples.get(self.DEFAULT_CATEGORY, [])

    def route_question(self, question: str):
        """Classify *question* and return the examples of its category.

        Args:
            question: The user's question.

        Returns:
            The examples of the predicted category, or the fallback
            examples when the prediction is unknown or classification
            raises.
        """
        try:
            ml_category_id = self.classifier.predict(question)
            return self._get_examples(ml_category_id)
        except Exception as e:
            # Deliberate best-effort boundary: a classifier failure must not
            # abort the request, so report it and serve the fallback.
            print("Error in routing question:", e)
            return self._get_examples(self.DEFAULT_CATEGORY)

load_json.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import json
def load_examples(path: str = 'examples.json', flatten: bool = False):
    """Load the few-shot SQL examples stored in a JSON file.

    The file maps each category name to a list of entries, and every entry
    carries a ``"pregunta"`` (question) and a ``"query"`` (SQL text) key.

    Args:
        path: Location of the JSON file to read.
        flatten: When ``False`` (the default) the parsed JSON document is
            returned unchanged, which is what this function has always
            returned. When ``True`` the per-category lists are merged into
            a single flat list of records.

    Returns:
        The parsed JSON document, or, with ``flatten=True``, a list of
        ``{"category", "question", "sql"}`` dictionaries.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: With ``flatten=True``, if an entry lacks ``"pregunta"``
            or ``"query"``.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not flatten:
        return data

    # This flattening used to sit after an unconditional ``return``, so it
    # never ran. It also read an undefined ``data`` and a non-existent
    # ``"sql"`` key; the SQL text lives under ``"query"``.
    return [
        {
            "category": category,
            "question": item["pregunta"],
            "sql": item["query"],
        }
        for category, items in data.items()
        for item in items
    ]

memory.py CHANGED Viewed

@@ -1,47 +1,26 @@
-from collections import deque
-import pandas as pd
-from config import Config
-class ConversationMemory:
-    def __init__(self, max_history: int = Config.MAX_HISTORY):
-        self.history = deque(maxlen=max_history)
-        self.schema_cache = None
-    def add_interaction(self, question: str, sql: str, result: str):
-        self.history.append({
-            "question": question,
-            "sql": sql,
-            "result_summary": self._summarize_result(result)
-        })
-    def _summarize_result(self, result) -> str:
-        """Resumen ejecutivo para memoria de contexto"""
-        if isinstance(result, pd.DataFrame):
-            # Enfocado en datos CLAVE no en metadatos
-            if len(result) == 1:
-                return f"Único resultado: {result.iloc[0].to_dict()}"
-            elif 'Cliente' in result.columns:
-                top = result.nlargest(3, 'Neto') if 'Neto' in result.columns else result.head(3)
-                return f"Top clientes: {top['Cliente'].tolist()}"
-            else:
-                return f"Filas: {len(result)}, Columnas: {list(result.columns)}"
-        return str(result)
-    def get_context(self, current_question: str) -> str:
-        if not self.history:
-            return ""
-        last_relevant = []
-        for interaction in self.history:
-            if "producto" in interaction['question'].lower() and "producto" in current_question.lower():
-                last_relevant.append(interaction)
-            elif "cliente" in interaction['question'].lower() and "cliente" in current_question.lower():
-                last_relevant.append(interaction)
-        context = ""
-        for i, interaction in enumerate(last_relevant[-1:], 1):  # Solo la última relevante
-            context += (
-                f"Interacción #{i}: {interaction['question'][:50]}...\n"
-                f"SQL: {interaction['sql'][:70]}...\n"
-                f"Resultado: {interaction['result_summary']}\n\n"
-            )
-        return context

+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_core.documents import Document
class Memory:
    """Vector-store backed memory of past question/answer/SQL interactions.

    Each interaction is stored as one document in a Chroma collection and
    later retrieved by similarity search against a new question.
    """

    def __init__(
        self,
        persist_directory: str = "./chroma_db",
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    ):
        """Create the embedding model and open the vector store.

        Args:
            persist_directory: Directory handed to Chroma as its
                ``persist_directory``. Defaults to the previously
                hard-coded location.
            model_name: Embedding model name handed to
                ``HuggingFaceEmbeddings``. Defaults to the previously
                hard-coded model.
        """
        self.embedding_model = HuggingFaceEmbeddings(model_name=model_name)
        self.vector_store = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embedding_model,
        )
        self.schema_cache = None

    def add_interaction(self, question: str, answer: str, sql: str):
        """Store one interaction as a single document.

        Args:
            question: The question that was asked.
            answer: The answer that was produced.
            sql: The SQL associated with the answer.
        """
        document = Document(
            page_content=f"Pregunta: {question}\nRespuesta: {answer}\nSQL: {sql}",
            metadata={"source": "interaction"},
        )
        self.vector_store.add_documents([document])

    def get_relevant_interactions(self, question: str, top_k=3):
        """Return the stored interactions most similar to *question*.

        Args:
            question: Text to search for.
            top_k: Maximum number of interactions to retrieve.

        Returns:
            The ``page_content`` of the matches joined by newlines, or an
            empty string when nothing is requested or found.
        """
        # Asking for zero or fewer results cannot match anything, so skip
        # the store (NOTE(review): the backend is believed to reject
        # non-positive counts; confirm).
        if top_k <= 0:
            return ""
        results = self.vector_store.similarity_search(question, k=top_k)
        return "\n".join(d.page_content for d in results)

mojica_agent.py CHANGED Viewed

@@ -1,18 +1,21 @@
-from memory import ConversationMemory
 from config import Config
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch, gc
 import unicodedata
-from typing import Dict, Tuple, Optional
 import re
 import pandas as pd
 import sqlite3
 class MojicaAgent:
     def __init__(self, config: Config):
         self.config = config
-        self.memory = ConversationMemory()
         self.essential_columns = [
             {
                 "name": "Descripcion",
@@ -34,172 +37,111 @@ class MojicaAgent:
             },
             {"name": "Neto", "type": "REAL", "description": "Valor neto de la venta"},
         ]
-        self.schema = self._load_schema()
-        self._safe_initializer_model()
-    def _safe_initializer_model(self):
         def try_load_model():
-            tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_NAME)
-            model = AutoModelForCausalLM.from_pretrained(
                 self.config.MODEL_NAME,
-                trust_remote_code=True,
-                device_map="auto",  # * <- Distribuye mejor entre GPU
                 torch_dtype="auto",
             ).eval()
-            return tokenizer, model
         try:
-            self.tokenizer, self.model = try_load_model()
         except torch.cuda.OutOfMemoryError:
-            # * Libera memoria e intenta de nuevo
             gc.collect()
             torch.cuda.empty_cache()
             torch.cuda.ipc_collect()
-            self.tokenizer, self.model = try_load_model()
-    def _load_schema(self) -> Dict:
-        connection = sqlite3.connect(self.config.DB_PATH)
-        cursor = connection.cursor()
-        cursor.execute(f"PRAGMA table_info({self.config.TABLE_NAME})")
-        columns = [
-            {"name": column[1], "type": column[2]} for column in cursor.fetchall()
         ]
-        schema = {"table_name": self.config.TABLE_NAME, "columns": columns}
-        connection.close()
-        return schema
-    def _handle_speecial_cases(self, question: str) -> Optional[str]:
-        lower_question = question.lower()
-        table = self.schema["table_name"]
-        if any(kw in lower_question for kw in ["clientes 0", "clientes cero", "cliente cero"]):
-            return f"""
-            SELECT DISTINCT "Cliente", "Razon Social"
-            FROM "{table}"
-            WHERE "Cliente" IS NOT NULL
-            AND "Cliente" NOT IN (
-                SELECT DISTINCT "Cliente"
-                FROM "{table}"
-                WHERE DATE("Fecha") >= DATE('now', '-28 day')
-            );
-            """
-        return None
-    def _validate_and_correct_sql(self, sql: str) -> str:
-        def replace_column(match):
-            candidate = match.group(1)  # * Lo que venia entre comillas
-            key = candidate.lower()
-            # * Buscamos si el nombre coincide con alguna columna real del esquema (case-sensitive)
-            # * Si existe devolvemos (solo un error de mayusculas)
-            corrected = column_lower_map.get(key)
-            if corrected:
-                return f'"{corrected}"'
-            # * Si no existe entonces buscamos su alias
-            corrected = alias_map.get(key)
-            if corrected:
-                return f'"{corrected}"'
-            return f'"{candidate}"'
-        connection = sqlite3.connect(self.config.DB_PATH)
-        cursor = connection.cursor()
-        cursor.execute(f"PRAGMA tavle_info({self.config.TABLE_NAME})")
-        real_columns = [row[1] for row in cursor.fetchall()]
-        # * Razon Social -> razon social
-        column_lower_map = {column.lower(): column for column in real_columns}
-        aliases = {
-            "city": "Ciudad",
-            "client": "Cliente",
-            "razon_social": "Razon Social",
-            "razón social": "Razon Social",
-            "Sales": "sells",
-        }
-        # * Opcional (convierte la key en minuscula solo si el valor esta en real_columns)
-        alias_map = {
-            key.lower(): value
-            for key, value in aliases.items()
-            if value in real_columns
-        }
-        pattern = r'"(\w+)"'
-        # * patrón que queremos encontrar, remplazo (puede ser X en este caso es una funcion), el texto en el que se buscará
-        return re.sub(pattern, replace_column, sql)
-    def _build_prompt(self, question: str) -> str:
-        memory_context = self.memory.get_context(question)
         table_name = self.schema["table_name"]
-        # * Detectamos el tipo de pregunta para asignar los ejemplos
-        question_type = (
-            "PRODUCTOS"
-            if "producto" in question.lower()
-            else "CLIENTES" if "cliente" in question.lower() else "GENERAL"
-        )
-        examples = {
-            "PRODUCTOS": (
-                "-- P: 'Top 10 productos más vendidos'\n"
-                'SELECT "Descripcion", SUM("Cantidad") AS total_vendido\n'
-                f'FROM "{table_name}"\n'
-                'WHERE "Descripcion" IS NOT NULL\n'
-                'GROUP BY "Descripcion"\n'
-                "ORDER BY total_vendido DESC\n"
-                "LIMIT 10;\n\n"
-                "-- P: 'Productos con mayor valor neto'\n"
-                'SELECT "Descripcion", SUM("Neto") AS valor_total\n'
-                f'FROM "{table_name}"\n'
-                'WHERE "Descripcion" IS NOT NULL\n'
-                'GROUP BY "Descripcion"\n'
-                "ORDER BY valor_total DESC\n"
-                "LIMIT 5;"
-            ),
-            "CLIENTES": (
-                "-- P: 'Top 5 clientes con mayor valor neto'\n"
-                'SELECT "Cliente", SUM("Neto") AS valor_total\n'
-                f'FROM "{table_name}"\n'
-                "WHERE \"Cliente\" IS NOT NULL AND \"Fecha\" BETWEEN '2025-01-01' AND '2025-12-31'\n"
-                'GROUP BY "Cliente"\n'
-                "ORDER BY valor_total DESC\n"
-                "LIMIT 5;\n\n"
-                "-- P: 'Clientes con más compras en marzo'\n"
-                'SELECT "Cliente", COUNT(*) AS total_compras\n'
-                f'FROM "{table_name}"\n'
-                "WHERE \"Cliente\" IS NOT NULL AND strftime('%m', \"Fecha\") = '03'\n"
-                'GROUP BY "Cliente"\n'
-                "ORDER BY total_compras DESC\n"
-                "LIMIT 10;\n\n"
-                "-- P: 'Clientes de Guadalajara con más compras'\n"
-                'SELECT "Cliente", "Razon Social", COUNT(*) AS total_compras\n'
-                f'FROM "{table_name}"\n'
-                'WHERE "Cliente" IS NOT NULL AND "Ciudad" = \'Guadalajara\'\n'
-                'GROUP BY "Cliente", "Razon Social"\n'
-                "ORDER BY total_compras DESC\n"
-                "LIMIT 10;"
-            ),
-            "GENERAL": (
-                "-- P: 'Ventas totales por mes'\n"
-                'SELECT strftime(\'%m\', "Fecha") AS mes, SUM("Neto") AS ventas\n'
-                f'FROM "{table_name}"\n'
-                "WHERE mes IS NOT NULL\n"
-                "GROUP BY mes\n"
-                "ORDER BY mes;\n\n"
-                "-- P: 'Producto menos vendido en 2025'\n"
-                'SELECT "Descripcion", SUM("Cantidad") AS total_vendido\n'
-                f'FROM "{table_name}"\n'
-                "WHERE \"Descripcion\" IS NOT NULL AND \"Fecha\" BETWEEN '2025-01-01' AND '2025-12-31'\n"
-                'GROUP BY "Descripcion"\n'
-                "ORDER BY total_vendido ASC\n"
-                "LIMIT 1;"
-            ),
-        }
-        # * Retornamos el prompt
         return (
             f"""
-        ### TAREA ###
-        Generar SOLO código SQL para la pregunta, usando EXCLUSIVAMENTE la tabla: "{table_name}"
-        ### COLUMNAS RELEVANTES ###
-        """
             + "\n".join(
                 [
                     f"- {col['name']} ({col['type']}): {col['description']}"
@@ -207,114 +149,161 @@ class MojicaAgent:
                 ]
             )
             + f"""
-        ### CONTEXTO (Últimas interacciones) ###
-        {memory_context if memory_context else "Sin historial relevante"}
-        ### EJEMPLOS ({question_type}) ###
-        {examples[question_type]}
-        ### REGLAS CRÍTICAS ###
-        - Usar siempre nombres exactos de columnas
-        - Usar solo las columnas listadas
-        - Prohibido inventar columnas
-        - Para el nombre del cliente, usar SIEMPRE "Razon Social".
-        - Para un mes específico usar: strftime('%m', "Fecha") = 'MM'
-        - Para cantidades usar SUM("Cantidad"), para dinero usar SUM("Neto")
-        - Para ciudad usar SIEMPRE "Ciudad"
-        - Agrupar por la dimensión principal (producto/cliente)
-        - Ordenar DESC para 'más/mayor', ASC para 'menos/menor'
-        - Usar LIMIT para top N
-        - Contesta siempre en el idioma en el que se te pregunta no traduzcas.
-        - Año actual: 2025
-        - Siempre terminar con un LIMIT = 1 en caso que se indique lo contrario
-        - Para 'más vendido' usar SUM("Cantidad"), para 'mayor valor' usar SUM("Neto")
-        - Usar "Razon Social" cuando pregunten por el nombre del cliente
-        - Usar "Ciudad" para filtrar o agrupar por ubicación
-        - Queda estrictamente prohibido usar acentos
-        - **Siempre excluir valores nulos con 'IS NOT NULL' en las columnas usadas en WHERE, GROUP BY u ORDER BY**
-        ### PREGUNTA ACTUAL ###
-        \"\"\"{question}\"\"\"
-        ### SQL:
-        """
         )
-    def _clean_sql_output(self, output: str) -> Optional[str]:
-        def remove_accents(text: str) -> str:
-            # ? que es lo que hace
-            return "".join(
-                c
-                for c in unicodedata.normalize("NFKD", text)
-                if not unicodedata.combining(c)
-            )
-        # * Encontramos todas las querys del prompt
         sql_matches = list(
             re.finditer(
                 r"(SELECT|WITH|INSERT|UPDATE|DELETE)[\s\S]+?;", output, re.IGNORECASE
             )
         )
-        # * si no existe nada entonces retornamos None
         if not sql_matches:
             return None
-        # * De todas las querys que encontramos tomamos la ultima
-        sql = sql_matches[-1].group(0).strip()  # ? Que hace toda esta linea?
-        # * Protegemos de modificaciones a la bd
         if any(
-            command in sql.upper()
-            for command in ["DROP", "DELETE", "UPDATE", "INSERT", "ALTER"]
         ):
             return None
-        # * Agregamos ; sino tiene
         if not sql.endswith(";"):
             sql += ";"
-        # * Removemos acentos
         sql = remove_accents(sql)
-        # * Agregamos LIMIT 1 sino hay
-        if not re.search(r"\bLIMIT\s+\d+", sql, re.IGNORECASE):
-            sql = sql[:-1] + "LIMIT 1;"
-        # * SQL limpio mas no corregido de erores, corregumos los errores y despues enviamos
         validate_sql = self._validate_and_correct_sql(sql)
         return validate_sql
-    def _execute_sql(self, sql: str):
-        connection = sqlite3.connect(self.config.DB_PATH)
         try:
-            result = pd.read_sql_query(sql, connection)
-            connection.close()
-            return result
         except Exception as e:
-            return f"Error de ejecución: {str(e)}"
-        finally:
-            connection.close()
-    def consult(self, question: str) -> Tuple[str, any]:
-        if self._handle_speecial_cases(question=question) != None:
-            sql_query = self._handle_speecial_cases(question=question)
-            result = self._execute_sql(sql_query)
-            # self.memory.add_interaction(question, sql_query, result)
-            return sql_query, result
-        prompt = self._build_prompt(question)
         tokenized_input = self.tokenizer(
-            prompt,
             return_tensors="pt",
             truncation=True,
             max_length=self.config.MAX_TOKENS,
         ).to(self.config.DEVICE)
-        # * Desactiva el cálculo de gradientes -> Siempre poner cuando se haga prediccion
-        # *  - Reduce consumo de memoria
-        # *   - Acelera inferencia
         with torch.no_grad():
             tokenized_output_model = self.model.generate(
                 **tokenized_input,
@@ -326,17 +315,33 @@ class MojicaAgent:
                 do_sample=True,
                 pad_token_id=self.tokenizer.eos_token_id,
             )
         output_model = self.tokenizer.decode(
             tokenized_output_model[0], skip_special_tokens=True
         )
-        sql_query = self._clean_sql_output(output_model)
-        if not sql_query:
-            return "Error: No se pudo generar SQL válido" + "\n" + output_model, None
-        result = self._execute_sql(sql_query)
-        self.memory.add_interaction(question, sql_query, result)
-        return sql_query, result

+from memory import Memory as VectorMemory
 from config import Config
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch, gc
 import unicodedata
+from typing import Dict, Tuple, Optional, Any
 import re
 import pandas as pd
 import sqlite3
+from intelligent_question_router import IntelligentQuestionRouter
 class MojicaAgent:
     def __init__(self, config: Config):
         self.config = config
+        self.memory = VectorMemory()
+        self.router = IntelligentQuestionRouter()
+        self._load_training_data()
         self.essential_columns = [
             {
                 "name": "Descripcion",
             },
             {"name": "Neto", "type": "REAL", "description": "Valor neto de la venta"},
         ]
+        self._initialize_database()
+        self._initialize_model()
+    def _initialize_model(self):
+        """Load the tokenizer and causal LM named by ``config.MODEL_NAME``.
+
+        When the first attempt raises ``torch.cuda.OutOfMemoryError`` the
+        garbage collector and the CUDA caches are flushed and the load is
+        retried exactly once; a second failure propagates to the caller.
+        """
+        def attempt_load():
+            # Assign onto self directly so both attributes are set by one call.
+            self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_NAME)
+            network = AutoModelForCausalLM.from_pretrained(
+                self.config.MODEL_NAME,
+                device_map="auto",
+                torch_dtype="auto",
+                trust_remote_code=True,
+            )
+            self.model = network.eval()
         try:
+            attempt_load()
         except torch.cuda.OutOfMemoryError:
             gc.collect()
             torch.cuda.empty_cache()
             torch.cuda.ipc_collect()
+            attempt_load()
+    def _load_training_data(self):
+        """Hand a small hard-coded set of labelled questions to the router's classifier.
+
+        Any exception raised by ``train`` is printed and swallowed, so a
+        failure here never stops the agent from starting.
+
+        NOTE(review): ``SemanticClassifier.train`` forwards its first argument
+        to ``load_examples`` and reads "pregunta"/"query" keys, while this list
+        carries "question"/"category" -- confirm that
+        ``router.semantic_classifier`` accepts this shape.
+        """
+        labelled_questions = (
+            ("productos más vendidos", "producto"),
+            ("mejor producto", "producto"),
+            ("clientes que más compran", "cliente"),
+            ("clientes inactivos", "cliente"),
+        )
+        training_examples = [
+            {"question": text, "category": label}
+            for text, label in labelled_questions
+        ]
+        try:
+            self.router.semantic_classifier.train(training_examples)
+        except Exception as e:
+            print(f"Error training semantic classifier: {e}")
+    def _validate_result_existing(self, result):
+        """Tell whether a query result carries usable data.
+
+        Returns False for a string containing "Error", for any object whose
+        ``empty`` attribute is truthy (e.g. an empty DataFrame) and for an
+        empty list. Every other value is treated as a success.
+        """
+        is_error_text = isinstance(result, str) and "Error" in result
+        if is_error_text:
+            return False
+        if getattr(result, "empty", False):
+            return False
+        is_empty_list = isinstance(result, list) and len(result) == 0
+        return not is_empty_list
+    def _initialize_database(self):
+        """Rebuild the SQLite table from the CSV file and cache its schema.
+
+        Opens ``config.DB_PATH``, drops ``config.TABLE_NAME`` if present,
+        reloads it from ``config.CSV_PATH`` and stores the resulting schema in
+        ``self.schema``. Columns declared as REAL in ``essential_columns`` are
+        coerced to numbers; unparsable cells become NaN.
+
+        Side effects: sets ``self.conn`` and ``self.schema`` and changes the
+        process-wide pandas float display format.
+        """
+        # The agent is created once at import time, but queries run inside web
+        # request handlers that may execute on worker threads. sqlite3 rejects
+        # cross-thread use of a connection unless check_same_thread is False.
+        self.conn = sqlite3.connect(self.config.DB_PATH, check_same_thread=False)
+        cursor = self.conn.cursor()
+        cursor.execute(f"DROP TABLE IF EXISTS {self.config.TABLE_NAME}")
+        self.conn.commit()
+        df = pd.read_csv(self.config.CSV_PATH, low_memory=False)
+        real_cols = [
+            col["name"] for col in self.essential_columns if col["type"] == "REAL"
+        ]
+        for col in real_cols:
+            if col in df.columns:
+                # errors="coerce" turns non-numeric cells into NaN instead of raising
+                df[col] = pd.to_numeric(df[col], errors="coerce")
+        df.to_sql(self.config.TABLE_NAME, self.conn, if_exists="replace", index=False)
+        self.schema = self._get_schema_structured()
+        # Global pandas option: render floats with thousands separator and 2 decimals
+        pd.set_option("display.float_format", "{:,.2f}".format)
+    def _get_schema_structured(self) -> Dict:
+        """Return the table schema as ``{"table_name": ..., "columns": [...]}``.
+
+        Each column entry is ``{"name": ..., "type": ...}`` taken from
+        ``PRAGMA table_info``. A truthy ``memory.schema_cache`` is returned
+        as-is; otherwise the schema is read from the database and cached there.
+        """
+        cached = self.memory.schema_cache
+        if cached:
+            return cached
+        cursor = self.conn.cursor()
+        cursor.execute(f"PRAGMA table_info({self.config.TABLE_NAME})")
+        described = []
+        for row in cursor.fetchall():
+            # PRAGMA table_info rows: index 1 is the column name, index 2 its declared type
+            described.append({"name": row[1], "type": row[2]})
+        fresh = {"table_name": self.config.TABLE_NAME, "columns": described}
+        self.memory.schema_cache = fresh
+        return fresh
+    def _generate_sql_prompt(self, question: str) -> str:
+        memory_context = self.memory.get_relevant_memory(question)
         table_name = self.schema["table_name"]
+        # Uso del router
+        try:
+            examples_list = self.router.route_question(question)
+            # Convertir ejemplos a texto para el prompt
+            examples_text = "\n".join(
+                [f"-- P: '{ex['pregunta']}'\n{ex['query']}\n" for ex in examples_list]
+            )
+            question_type = "ROUTED_EXAMPLES"
+        except Exception as e:
+            print(f"Router failed, using manual detection: {e}")
+            # Fallback a detección manual
+            # question_type = self._detect_question_type_manual(question)
+            # examples_text = self.examples.get(question_type, "")
         return (
             f"""
+    ### TAREA ###
+    Generar SOLO código SQL para la pregunta, usando EXCLUSIVAMENTE la tabla: "{table_name}"
+    ### COLUMNAS RELEVANTES ###
+    """
             + "\n".join(
                 [
                     f"- {col['name']} ({col['type']}): {col['description']}"
                 ]
             )
             + f"""
+    ### CONTEXTO (Últimas interacciones) ###
+    {memory_context if memory_context else "Sin historial relevante"}
+    ### EJEMPLOS ###
+    {examples_text}
+    ### REGLAS CRÍTICAS ###
+    - Usar siempre nombres exactos de columnas
+    - Usar solo las columnas listadas
+    - Prohibido inventar columnas
+    - Para el nombre del cliente, usar SIEMPRE "Razon Social".
+    - Para un mes específico usar: strftime('%m', "Fecha") = 'MM'
+    - Para cantidades usar SUM("Cantidad"), para dinero usar SUM("Neto")
+    - Agrupar por la dimensión principal (producto/cliente)
+    - Ordenar DESC para 'más/mayor', ASC para 'menos/menor'
+    - Contesta siempre en el idioma en el que se te pregunta no traduzcas.
+    - Año actual: 2025
+    - No inventes columnas o tablas que no existan
+    - Para preguntas sobre clientes cero, SIEMPRE usar la subconsulta NOT IN con las últimas 4 semanas.
+    - Si se menciona una ciudad, incluir el filtro AND "Ciudad" LIKE '%...%'
+    - Usa LIMIT cuando se te pida un numero finito de datos
+    - Para 'más vendido' usar SUM("Cantidad"), para 'mayor valor' usar SUM("Neto")
+    - Usar "Razon Social" cuando pregunten por el nombre del cliente
+    - Usar "Ciudad" para filtrar o agrupar por ubicación
+    - Queda estrictamente prohibido usar acentos
+    - **Siempre excluir valores nulos con 'IS NOT NULL' en las columnas usadas en WHERE, GROUP BY u ORDER BY**
+    - Para preguntas sobre ciudad SIEMPRE incluir "Ciudad" en la query
+    - Para busquedas por Descripcion siempre usar LIKE
+    - Mandar solo la cantidad de rows que el usuario pide.
+    ### PREGUNTA ACTUAL ###
+    \"\"\"{question}\"\"\"
+    ### SQL:
+    """
         )
+    def _generate_analysis_prompt(self, question: str, result: Any) -> str:
+        """Build the prompt that asks the model to answer ``question`` from ``result`` only.
+
+        The text starts with a newline, every line is indented by eight spaces
+        and the prompt ends with that same indentation.
+        """
+        pad = " " * 8
+        prompt_lines = [
+            f"Basado EXCLUSIVAMENTE en estos datos: {result}",
+            f"Responde esta pregunta: {question}",
+            "Reglas estrictas:",
+            "- Nunca inventes numeros",
+            "- Usa solo datos proporcionados",
+            "- Maximo una oracion",
+        ]
+        return "\n" + "".join(f"{pad}{line}\n" for line in prompt_lines) + pad
+    def _clean_analysis_output(self, ouput: str) -> Optional[str]:
+        """Extract the text that follows the first "Respuesta:" marker.
+
+        Returns that text stripped of surrounding whitespace, or the fallback
+        "Sin análisis" when the marker is missing or nothing follows it.
+        """
+        _, marker, tail = ouput.partition("Respuesta:")
+        if marker and tail:
+            return tail.strip()
+        return "Sin análisis"
+    def _clean_sql_output(self, output: str) -> Optional[str]:
+        """Extract the last SQL statement from the model output and sanitise it.
+
+        Steps: find every ``;``-terminated statement, keep the last one, reject
+        it if it contains a data-modifying keyword, strip accents, then pass it
+        through ``_validate_and_correct_sql``.
+
+        Returns:
+            The cleaned SQL, or None when no terminated statement is found or
+            the statement is not read-only.
+        """
+        # Every candidate statement that ends in ";"
         sql_matches = list(
             re.finditer(
                 r"(SELECT|WITH|INSERT|UPDATE|DELETE)[\s\S]+?;", output, re.IGNORECASE
             )
         )
         if not sql_matches:
             return None
+        # The decoded output may echo example queries; the last match is kept.
+        # The regex guarantees it already ends in ";".
+        sql = sql_matches[-1].group(0).strip()
+        # Safety: refuse anything that could modify the database. Keywords are
+        # matched as whole words so harmless text that merely contains one
+        # (e.g. LIKE '%alternador%' contains "ALTER") is not rejected.
+        if re.search(r"\b(DROP|DELETE|UPDATE|INSERT|ALTER)\b", sql, re.IGNORECASE):
+            return None
+        def remove_accents(text: str) -> str:
+            # NFKD splits accented letters into base letter + combining mark;
+            # dropping the combining marks leaves the plain letter.
+            return "".join(
+                c
+                for c in unicodedata.normalize("NFKD", text)
+                if not unicodedata.combining(c)
+            )
         sql = remove_accents(sql)
+        # No default LIMIT is appended: the prompt asks the model to add one
+        # only when the user requests a finite number of rows.
         validate_sql = self._validate_and_correct_sql(sql)
         return validate_sql
+    def _validate_and_correct_sql(self, sql: str) -> str:
+        """Rewrite identifiers in ``sql`` to the table's real names.
+
+        Each token is looked up case-insensitively, first among the real column
+        names and then among a fixed alias table of known model mistakes.
+        Tokens that match neither are left untouched. Single-quoted string
+        literals are never rewritten, and all backslashes are removed from the
+        final text.
+        """
+        cur = self.conn.cursor()
+        cur.execute(f'PRAGMA table_info("{self.config.TABLE_NAME}")')
+        real_columns = [row[1] for row in cur.fetchall()]
+        column_lower_map = {col.lower(): col for col in real_columns}
+        aliases = {
+            "city": "Ciudad",
+            "client": "Cliente",
+            "razon_social": "Razon Social",
+            "razón social": "Razon Social",
+            "Sales": "sells",
+            '"Date"': "Fecha",
+            "mojica_Clientes": "sells",
+            "value_total": "valor_total",
+            "strstrftime": "strftime",
+        }
+        alias_map = {k.lower(): v for k, v in aliases.items()}
+        # Keys containing quotes or spaces (e.g. '"date"', "razon social") can
+        # never be produced by a plain word pattern, so they are matched as
+        # literal phrases, longest first.
+        phrase_keys = sorted(
+            {
+                key
+                for key in (*column_lower_map, *alias_map)
+                if key and not re.fullmatch(r"\w+", key)
+            },
+            key=len,
+            reverse=True,
+        )
+        # Alternation order matters: string literal, then phrases, then words.
+        pattern = re.compile(
+            "|".join(
+                [r"'(?:[^']|'')*'", *(re.escape(k) for k in phrase_keys), r"\b\w+\b"]
+            ),
+            re.IGNORECASE,
+        )
+        def replace_token(m):
+            token = m.group(0)
+            # String literals are data, not identifiers: keep them verbatim
+            if token.startswith("'"):
+                return token
+            key = token.lower()
+            # Real column name (case correction)?
+            corrected = column_lower_map.get(key)
+            if corrected:
+                return corrected
+            # Known alias?
+            corrected = alias_map.get(key)
+            if corrected is not None:
+                return corrected
+            return token
+        return pattern.sub(replace_token, sql).replace("\\", "")
+    def _execute_sql(self, sql: Optional[str]) -> Any:
+        """Run ``sql`` on the agent's connection and return the rows as a DataFrame.
+
+        Failures are never raised. They are returned as a string starting with
+        "Error", which is also what is returned when ``sql`` is None or empty
+        (``_clean_sql_output`` returns None when no valid query was generated).
+        """
+        if not sql:
+            return "Error: No se pudo generar SQL válido"
         try:
+            return pd.read_sql_query(sql, self.conn)
         except Exception as e:
+            return f"Error: {str(e)}"
+    def consult(self, question: str) -> Tuple[str, Any, str]:
+        sql_prompt = self._generate_sql_prompt(question)
         tokenized_input = self.tokenizer(
+            sql_prompt,
             return_tensors="pt",
             truncation=True,
             max_length=self.config.MAX_TOKENS,
         ).to(self.config.DEVICE)
         with torch.no_grad():
             tokenized_output_model = self.model.generate(
                 **tokenized_input,
                 do_sample=True,
                 pad_token_id=self.tokenizer.eos_token_id,
             )
         output_model = self.tokenizer.decode(
             tokenized_output_model[0], skip_special_tokens=True
         )
+        sql = self._clean_sql_output(output_model)
+        # * Ejecución de SQL y generación de analisis
+        result = self._execute_sql(sql)
+        # * INICIO DE ANALISIS (COMENTADO)
+        # Analisis
+        # analysis_prompt = self._generate_analysis_prompt(question, result)
+        # analyzed_token_input = self.tokenizer(
+        #     analysis_prompt,
+        #     return_tensors="pt",
+        #     truncation=True,
+        #     max_length=self.config.MAX_TOKENS,
+        # ).to(self.config.DEVICE)
+        # with torch.no_grad():
+        #     tokenized_analysis_output_model = self.model.generate(
+        #         **analyzed_token_input,
+        #         max_new_tokens=self.config.MAX_NEW_TOKENS,
+        #         temperature=0.65,
+        #     )
+        # analysis = self.tokenizer.decode(
+        #     tokenized_analysis_output_model[0], skip_special_tokens=True
+        # )
+        # analysis = self._clean_analysis_output(analysis)
+        # analysis <- LE quite ese parametro
+        # * FIN DE ANALISIS (COMENTADO)
+        self.memory.add_interaction(question=question, answer=result, sql=sql)
+        return sql, result

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ

semantic_classifier.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import KMeans
+from config import Config
+from load_json import load_examples
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import KMeans
+class SemanticClassifier:
+    """Groups example questions into KMeans clusters of sentence embeddings.
+
+    ``classify`` embeds a new question, finds its nearest cluster and returns
+    the example dicts ("category", "pregunta", "query") stored in that cluster.
+    """
+    def __init__(self, model_name="paraphrase-multilingual-MiniLM-L12-v2", initialized_train=True):
+        """Load the embedding model and, unless disabled, train on the default examples."""
+        self.model = SentenceTransformer(model_name)
+        self.clusters = {}
+        self.examples_embeddings = None
+        self.kmeans = None
+        if initialized_train:
+            self.train()
+    def train(self, train_data=Config.EXMAPLES_JSON, n_clusters=15):
+        """Fit the clusters from the examples loaded by ``load_examples(train_data)``.
+
+        Replaces any previous training state. ``n_clusters`` is capped at the
+        number of examples because KMeans cannot fit more clusters than samples.
+
+        Raises:
+            ValueError: If no examples are loaded.
+        """
+        examples = load_examples(train_data)
+        # Flatten {category: [items]} into one list of example dicts
+        flat_examples = [
+            {
+                "category": category,
+                "pregunta": item["pregunta"],
+                "query": item["query"],
+            }
+            for category, items in examples.items()
+            for item in items
+        ]
+        if not flat_examples:
+            raise ValueError("No training examples found; cannot fit clusters")
+        questions = [ex["pregunta"] for ex in flat_examples]
+        embeddings = self.model.encode(questions)
+        kmeans = KMeans(
+            n_clusters=min(n_clusters, len(flat_examples)), random_state=12
+        )
+        cluster_ids = kmeans.fit_predict(embeddings)
+        # Build into a fresh dict so a retrain does not mix in examples
+        # assigned by an earlier fit
+        clusters = {}
+        for example, cluster_id in zip(flat_examples, cluster_ids):
+            clusters.setdefault(int(cluster_id), []).append(example)
+        self.kmeans = kmeans
+        self.clusters = clusters
+        self.examples_embeddings = embeddings
+    def classify(self, question: str):
+        """Return the examples of the cluster nearest to ``question``.
+
+        Raises:
+            RuntimeError: If called before ``train``.
+        """
+        if self.kmeans is None:
+            raise RuntimeError("SemanticClassifier.classify() called before train()")
+        question_embedding = self.model.encode([question])
+        cluster_id = int(self.kmeans.predict(question_embedding)[0])
+        return self.clusters.get(cluster_id, [])
+# * FORMA DE USARSE
+# classifier = SemanticClassifier()
+# classifier.train(Config.EXMAPLES_JSON, n_clusters=5)
+# resultado = classifier.classify("¿Cuantas ciudades tenemos registradas?")
+# print(resultado)  # te devuelve ejemplos de ese cluster

supervised_classifier.py ADDED Viewed

	@@ -0,0 +1,93 @@

+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    Trainer,
+    TrainingArguments,
+)
+from config import Config
+import json
+from datasets import Dataset
+import torch
+class QuestionClassifier:
+    """Supervised question-to-category classifier fine-tuned from a pretrained model.
+
+    Categories are the top-level keys of the examples JSON file. Each key maps
+    to a list of items whose "pregunta" text is used as a training sample.
+    """
+    def __init__(
+        self, model_name="distilbert-base-multilingual-cased", initialized_train=True
+    ):
+        """Load the tokenizer and, unless disabled, fine-tune on the default examples."""
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model_name = model_name
+        self.category2id = None
+        self.id2category = None
+        self.model = None
+        if initialized_train:
+            self.train()
+    def train(self, json_path=Config.EXMAPLES_JSON, num_epochs=3):
+        """Fine-tune a fresh sequence classifier on the examples in ``json_path``."""
+        with open(json_path, "r", encoding="utf-8") as f:
+            examples = json.load(f)
+        texts, labels, category2id = self._prepare_supervised_data(examples)
+        self.category2id = category2id
+        self.id2category = {value: key for key, value in category2id.items()}
+        self.model = AutoModelForSequenceClassification.from_pretrained(
+            self.model_name, num_labels=len(category2id)
+        )
+        encodings = self.tokenizer(texts, truncation=True, padding=True)
+        dataset = Dataset.from_dict(
+            {
+                "input_ids": encodings["input_ids"],
+                "attention_mask": encodings["attention_mask"],
+                "labels": labels,
+            }
+        )
+        training_args = TrainingArguments(
+            output_dir="./results",
+            per_device_train_batch_size=8,
+            num_train_epochs=num_epochs,
+            logging_steps=1,
+            report_to="none",
+            save_strategy="no",
+            remove_unused_columns=False,
+            eval_strategy="no",
+        )
+        trainer = Trainer(model=self.model, args=training_args, train_dataset=dataset)
+        trainer.train()
+    def _prepare_supervised_data(self, examples):
+        """Flatten ``{category: [items]}`` into parallel text/label lists.
+
+        Returns:
+            ``(texts, labels, category2id)`` where label ids follow the order
+            in which categories appear in ``examples``.
+        """
+        category2id = {cat: i for i, cat in enumerate(examples.keys())}
+        texts = []
+        labels = []
+        for category, items in examples.items():
+            for item in items:
+                texts.append(item["pregunta"])
+                labels.append(category2id[category])
+        return texts, labels, category2id
+    def predict(self, question: str):
+        """Return the predicted category name for ``question``.
+
+        Raises:
+            RuntimeError: If called before ``train``.
+        """
+        if self.model is None:
+            raise RuntimeError("QuestionClassifier.predict() called before train()")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model.to(device)
+        # Switch to eval mode so dropout is off during inference
+        self.model.eval()
+        inputs = self.tokenizer(
+            question, return_tensors="pt", truncation=True, padding=True
+        )
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            # argmax over the class axis of the single-row batch
+            predicted_class_id = outputs.logits.argmax(dim=-1)[0].item()
+        return self.id2category[predicted_class_id]
+# * FORMA DE USARSE
+# qc = QuestionClassifier()
+# qc.train()
+# categoria = qc.predict("Dame los productos más vendidos")
+# print(categoria)  # → 'PRODUCTOS'