# embedding.py
import logging
import uuid

import pandas as pd
from sentence_transformers import SentenceTransformer

from chroma_setup import initialize_client

# Create an instance of the local sentence-transformers model
# (it is downloaded and cached the first time this runs).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def embed_text_chunks(pages_and_chunks: list[dict]) -> pd.DataFrame:
    """
    Generate an embedding for each text chunk using a local
    sentence-transformers model.
    """
    for item in pages_and_chunks:
        text_chunk = item["sentence_chunk"]
        try:
            # encode() accepts a list of strings and returns an ndarray of embeddings.
            embedding_array = model.encode([text_chunk])
            # For all-MiniLM-L6-v2 the result has shape (1, 384), so take row [0].
            embedding = embedding_array[0].tolist()
            # embedding is now a plain list of floats.
            item["embedding"] = embedding
        except Exception as e:
            logging.error(f"Failed to generate embedding for: {text_chunk}. Error: {e}")
            item["embedding"] = None
    return pd.DataFrame(pages_and_chunks)
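
# Note (suggestion, not in the original file): encode() also accepts the full
# list of chunks in a single batched call, which is usually much faster than
# embedding chunk by chunk:
#
#     texts = [item["sentence_chunk"] for item in pages_and_chunks]
#     vectors = model.encode(texts)  # ndarray of shape (n_chunks, 384)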

def save_to_chroma_db(embeddings_df: pd.DataFrame, user_id: str, document_id: str):
    """
    Store the generated embeddings in ChromaDB.
    """
    client = initialize_client()
    # Create or retrieve the collection. Make sure to use the same
    # name later in your queries.
    collection = client.get_or_create_collection(name=f"text_embeddings_{user_id}")
    combined_key = f"{user_id}_{document_id}"
    ids = [f"{combined_key}_{i}" for i in range(len(embeddings_df))]
    documents = embeddings_df["sentence_chunk"].tolist()
    embeddings = embeddings_df["embedding"].tolist()
    # Verify that no embedding is None.
    for idx, emb in enumerate(embeddings):
        if emb is None:
            raise ValueError(
                f"Chunk with ID {ids[idx]} has no valid embedding (None)."
            )
    # All embeddings are now lists of floats, so they can be added
    # to the collection:
    collection.add(
        documents=documents,
        embeddings=embeddings,
        ids=ids,
        metadatas=[{"combined_key": combined_key} for _ in range(len(embeddings_df))],
    )

def generate_document_id() -> str:
    return str(uuid.uuid4())

def query_chroma_db(user_id: str, document_id: str, query: str):
    client = initialize_client()
    collection = client.get_collection(name=f"text_embeddings_{user_id}")
    combined_key = f"{user_id}_{document_id}"
    results = collection.query(
        query_texts=[query],
        n_results=5,
        where={"combined_key": combined_key},
    )
    documents = results.get("documents") or []
    # Flatten the nested list (Chroma returns one list of documents per query).
    relevant_docs = [doc for sublist in documents for doc in sublist]
    if not relevant_docs:
        return "No documents found"
    return "\n\n".join(relevant_docs)