dewiri committed
Commit eb74260 · verified · 1 Parent(s): 06a3076

Update rag_pipeline.py

Files changed (1)
  1. rag_pipeline.py +11 -9
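Note: the diff below starts at line 17 of rag_pipeline.py, so the imports, the embedding model, and the two download URLs it relies on are out of frame. A minimal sketch of that assumed preamble follows; the requests import, the model checkpoint, and the Drive file IDs are placeholders, not values from the commit.

# Assumed preamble of rag_pipeline.py (lines 1-16, not part of this diff).
import os
import pickle

import faiss
import requests  # assumption: download_if_missing fetches files over HTTP
from sentence_transformers import SentenceTransformer

# Placeholder checkpoint; the actual model name is not visible in the diff.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Google Drive direct-download links; the file IDs are placeholders.
url_index = "https://drive.google.com/uc?export=download&id=<INDEX_FILE_ID>"
url_chunks = "https://drive.google.com/uc?export=download&id=<CHUNKS_FILE_ID>"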
rag_pipeline.py CHANGED

@@ -17,7 +17,7 @@ url_chunks = "https://drive.google.com/uc?export=download&id=1nsrAm_ozsK4GlmMui9
 local_index = "faiss_index.index"
 local_chunks = "chunks_mapping.pkl"

-# === Helper: download a file if it is not already present
+# === Download file on demand
 def download_if_missing(url, local_path):
     if not os.path.exists(local_path):
         print(f"⬇️ Downloading {local_path} from Google Drive...")
@@ -29,11 +29,10 @@ def download_if_missing(url, local_path):
     else:
         raise Exception(f"❌ Download failed for {local_path}")

-# === Download the files
 download_if_missing(url_index, local_index)
 download_if_missing(url_chunks, local_chunks)

-# === Load FAISS index & chunks
+# === Load FAISS & chunks
 print("📂 Loading FAISS index and text chunks...")
 index = faiss.read_index(local_index)

@@ -41,18 +40,21 @@ with open(local_chunks, "rb") as f:
     token_split_texts = pickle.load(f)

 print(f"✅ Loaded chunks: {len(token_split_texts)}")
-print("⚙️ Starting embedding computation...")
-chunk_embeddings = model.encode(token_split_texts, convert_to_numpy=True)
+
+# === Test embeddings on the first 10 chunks only
+print("⚙️ Starting embedding computation on 10 chunks...")
+test_chunks = token_split_texts[:10]
+chunk_embeddings = model.encode(test_chunks, convert_to_numpy=True)
 print("✅ Embeddings encoded")

-# === Retrieval function for similar chunks
+# === Retrieval function
 def retrieve(query, k=5):
     query_embedding = model.encode([query], convert_to_numpy=True)
     distances, indices = index.search(query_embedding, k)
-    retrieved_texts = [token_split_texts[i] for i in indices[0]]
+    retrieved_texts = [test_chunks[i] for i in indices[0] if i < len(test_chunks)]  # index spans all chunks; skip out-of-range hits
     return retrieved_texts

-# === Prompt assembly
+# === Prompt builder
 def build_prompt(query, texts):
     context = "\n\n".join(texts)
     return f"Answer the following question based on the context:\n\nContext:\n{context}\n\nQuestion:\n{query}"
@@ -61,4 +63,4 @@ def build_prompt(query, texts):
 def run_qa_pipeline(query, k=5):
     retrieved = retrieve(query, k)
     prompt = build_prompt(query, retrieved)
-    return f"🔍 Context found:\n\n{prompt}\n\n(optionally insert your LLM answer here)"
+    return f"🔍 Context found:\n\n{prompt}"
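With the change applied, a quick end-to-end check could look like the sketch below. Importing the module runs the downloads, index loading, and test embedding at module scope; the query string is only an example.

# Hypothetical smoke test for the updated pipeline.
from rag_pipeline import run_qa_pipeline

# retrieve() searches the full FAISS index but keeps only hits that fall
# within the 10 test chunks, so fewer than k texts may come back.
print(run_qa_pipeline("What topics do the documents cover?", k=3))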