gaonkarrs committed
Commit 132de7f · 1 Parent(s): 7e0fec3

Add RAG Evaluation Gradio app

Files changed (2)
  1. app.py +497 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,497 @@
+ # -*- coding: utf-8 -*-
+ """Deploy_CapstoneRagBench.ipynb
+ 
+ Automatically generated by Colab.
+ 
+ Original file is located at
+     https://colab.research.google.com/drive/1OG-77VqKwz3509_osgNgSeOMJ9G6RvB4
+ """
+ 
+ # For the legal domain
+ 
+ from datasets import load_from_disk, load_dataset, Dataset, get_dataset_config_names
+ from transformers import AutoTokenizer, AutoModel
+ import faiss
+ import numpy as np
+ import torch
+ import os
+ from groq import Groq
+ from sentence_transformers import CrossEncoder
+ import requests
+ import uuid
+ import re
+ import gradio as gr
+ import json
+ from sklearn.metrics import mean_squared_error, roc_auc_score
+ import io
+ import sys
+ import traceback
+ 
+ 
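+ # Pipeline overview (summary of the code below): embed the query with Legal-BERT and
+ # retrieve top-k chunks from a local FAISS index, rerank them with the BGE cross-encoder,
+ # generate an answer with a Groq-hosted LLM, have a judge LLM grade the answer sentence
+ # by sentence, and turn that judgement into RAGBench-style metrics (context relevance,
+ # context utilization, completeness, adherence) compared against the dataset's ground truth.
+ 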
+ def retrieve_top_k(query, domain='legal', model_name='nlpaueb/legal-bert-base-uncased', k=8):
+     # Load tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModel.from_pretrained(model_name).to(device)
+     model.eval()
+ 
+     #print(f"In retrieve_top_k Query:{query}")
+     # Tokenize and embed the query using mean pooling
+     inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
+     inputs = {key: val.to(device) for key, val in inputs.items()}
+     with torch.no_grad():
+         outputs = model(**inputs)
+     query_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
+ 
+     # Load FAISS index and dataset
+     index_path = f"{domain}_index/faiss.index"
+     dataset_path = f"{domain}_dataset"
+ 
+     faiss_index = faiss.read_index(index_path)
+     dataset = load_from_disk(dataset_path)
+ 
+     # Perform FAISS search
+     D, I = faiss_index.search(query_embedding.astype('float32'), k)
+ 
+     # Retrieve the top-k matching chunks
+     top_chunks = [dataset[int(idx)]['text'] for idx in I[0]]
+     return top_chunks
+ 
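+ # Illustrative call (hypothetical query; assumes ./legal_index/faiss.index and
+ # ./legal_dataset were built beforehand):
+ #   chunks = retrieve_top_k("Which law governs this agreement?", domain='legal', k=8)
+ 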
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ #print(device)
+ 
+ dataset = load_dataset("rungalileo/ragbench", "cuad", split="test")
+ 
+ # Read the Groq API key from the environment (e.g. a Space secret) instead of
+ # hardcoding it in the source.
+ client = Groq(
+     api_key=os.environ.get("GROQ_API_KEY"),
+ )
+ 
+ # Load BGE reranker
+ reranker = CrossEncoder("BAAI/bge-reranker-base", max_length=512)
+ 
+ def rerank_documents_bge(query, documents, top_n=5, return_scores=False):
+     """
+     Rerank documents using the BAAI/bge-reranker-base CrossEncoder.
+ 
+     Args:
+         query (str): The query string.
+         documents (List[str]): List of candidate documents.
+         top_n (int): Number of top results to return.
+         return_scores (bool): Whether to return scores along with documents.
+ 
+     Returns:
+         List[str] or List[Tuple[str, float]]
+     """
+     if not documents:
+         return []
+ 
+     # Prepare (query, doc) pairs
+     pairs = [(query, doc) for doc in documents]
+ 
+     # Predict relevance scores
+     scores = reranker.predict(pairs, batch_size=16)
+ 
+     # Sort by score, descending
+     reranked = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
+ 
+     if return_scores:
+         return reranked[:top_n]
+     else:
+         return [doc for doc, _ in reranked[:top_n]]
+ 
+ 
+ def generate_response_rag(query, model, index_dir="legal_index"):
+     # Step 1: Retrieve top-k context chunks from the FAISS index
+     top_chunks = retrieve_top_k(query, 'legal', "nlpaueb/legal-bert-base-uncased")
+ 
+     # Step 2: Rerank the retrieved documents with the cross-encoder
+     #reranked_chunks = rerank_documents(query, top_chunks, top_n=15)
+     #rerank_and_filter_chunks = filter_by_faithfulness(query, reranked_chunks)
+     #reranked_chunks = rerank_and_filter_chunks
+     reranked_chunks_bge = rerank_documents_bge(query, top_chunks, top_n=5)
+     #sum_context = summarize_context("\n\n".join(reranked_chunks_bge))
+ 
+     final_context = reranked_chunks_bge
+ 
+     # Step 3: Prepare the context and the RAG-style prompt
+     context = "\n\n".join(final_context)
+ 
+     #print(f"Context:{context}")
+     prompt = f"""You are a helpful legal assistant.
+ Answer the question using only the information in the context below. If the answer cannot be derived, say "I don't know." Always prefix your answer with **Answer:**
+ 
+ Context: {context}
+ 
+ Question: {query}
+ Answer:"""
+ 
+     # Step 4: Call the generator LLM (any chat model served by Groq)
+     chat_completion = client.chat.completions.create(
+         messages=[
+             {"role": "user", "content": prompt}
+         ],
+         model=model,  # e.g. "llama3-70b-8192", "gemma2-9b-it", "qwen/qwen3-32b", "deepseek-r1-distill-llama-70b", "mistral-saba-24b"
+         temperature=0.0
+     )
+ 
+     return chat_completion.choices[0].message.content.strip()
+ 
+ # Unused alternative generator using the OpenAI API:
+ '''response = openai.chat.completions.create(
+     model="gpt-3.5-turbo",
+     messages=[
+         {"role": "user", "content": prompt}
+     ],
+     temperature=0.0,
+     max_tokens=1024
+ )
+ 
+ return response.choices[0].message.content'''
+ 
+ # JUDGE LLM
+ 
+ def split_into_keyed_sentences(text, prefix):
+     """Splits text into sentences with keys like '0a.', '0b.' (documents) or 'a.', 'b.' (response)."""
+     # Basic regex sentence tokenizer; keys are prefix + a letter starting at 'a'
+     sentences = re.split(r'(?<=[.?!])\s+', text.strip())
+     keyed = {}
+     for i, s in enumerate(sentences):
+         key = f"{prefix}{chr(97 + i)}"  # 'a', 'b', ... (runs past 'z' into '{', '|', ... for long contexts)
+         if s:
+             keyed[key] = s.strip()
+     return keyed
+ 
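+ # Illustrative example: split_into_keyed_sentences("First. Second.", "0") returns
+ # {"0a": "First.", "0b": "Second."}; with prefix "" the keys are "a", "b", ...
+ 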
+ def judge_response_rag(query, embedder="nlpaueb/legal-bert-base-uncased", domain="legal", k=5):
+ 
+     # Step 1: Retrieve the context chunks for the query
+     top_chunks = retrieve_top_k(query, domain, embedder)
+ 
+     top_chunks = [chunk[0] if isinstance(chunk, tuple) else chunk for chunk in top_chunks]
+ 
+     # Step 2: Prepare the context and split it into keyed sentences
+     context = "\n\n".join(top_chunks)
+     document_keys = split_into_keyed_sentences(context, "0")
+ 
+     #print(f"Query:{query}\n====================================================================")
+     # Step 3: Generate the answer to be judged
+     response = generate_response_rag(query, model="llama3-70b-8192")  # alternatives: deepseek-r1-distill-llama-70b
+     #print(f"\n====================================\nGenerator Response:{response}")
+     # Keep only the text from the "**Answer" marker onward (strips reasoning preambles, e.g. from deepseek)
+     #print("Before Curated:", response)
+     marker = response.find("**Answer")
+     if marker != -1:
+         response = response[marker:].replace("**Answer", "")
+ 
+     print(f"Generator LLM response:{response}")
+ 
+     response_keys = split_into_keyed_sentences(response, "")
+     # Step 4: Rebuild the document and response sections for the judge prompt
+     documents_formatted = "\n".join([f"{k}. {v}" for k, v in document_keys.items()])
+     response_formatted = "\n".join([f"{k}. {v}" for k, v in response_keys.items()])
+ 
+     '''print(f"\n====================================================================")
+     print(f"documents_formatted:{documents_formatted}")
+     print(f"\n====================================================================")
+     print(f"response_formatted:{response_formatted}")
+     print(f"\n====================================================================")'''
+ 
+     prompt = f"""I asked someone to answer a question based on one or more documents.
+ Your task is to review their response and assess whether or not each sentence
+ in that response is supported by text in the documents and, if so, which
+ sentences in the documents provide that support. You will also tell me which
+ of the documents contain useful information for answering the question, and
+ which of the documents the answer was sourced from.
+ Here are the documents, each of which is split into sentences. Alongside each
+ sentence is an associated key, such as ’0a.’ or ’0b.’, that you can use to refer
+ to it:
+ '''
+ {documents_formatted}
+ '''
+ The question was:
+ '''
+ {query}
+ '''
+ Here is their response, split into sentences. Alongside each sentence is an
+ associated key, such as ’a.’ or ’b.’, that you can use to refer to it. Note
+ that these keys are unique to the response, and are not related to the keys
+ in the documents:
+ '''
+ {response_formatted}
+ '''
+ You must respond with a JSON object matching this schema:
+ '''
+ {{
+   "relevance_explanation": string,
+   "all_relevant_sentence_keys": [string],
+   "overall_supported_explanation": string,
+   "overall_supported": boolean,
+   "sentence_support_information": [
+     {{
+       "response_sentence_key": string,
+       "explanation": string,
+       "supporting_sentence_keys": [string],
+       "fully_supported": boolean
+     }},
+   ],
+   "all_utilized_sentence_keys": [string]
+ }}
+ '''
+ The relevance_explanation field is a string explaining which documents
+ contain useful information for answering the question. Provide a step-by-step
+ breakdown of the information provided in the documents and how it is useful for
+ answering the question.
+ The all_relevant_sentence_keys field is a list of all document sentence keys
+ (e.g. ’0a’) that are relevant to the question. Include every sentence that is
+ useful and relevant to the question, even if it was not used in the response,
+ or if only parts of the sentence are useful. Ignore the provided response when
+ making this judgement and base your judgement solely on the provided documents
+ and question. Omit sentences that, if removed from the document, would not
+ impact someone’s ability to answer the question.
+ The overall_supported_explanation field is a string explaining why the response
+ *as a whole* is or is not supported by the documents. In this field, provide a
+ step-by-step breakdown of the claims made in the response and the support (or
+ lack thereof) for those claims in the documents. Begin by assessing each claim
+ separately, one by one; don’t make any remarks about the response as a whole
+ until you have assessed all the claims in isolation.
+ The overall_supported field is a boolean indicating whether the response as a
+ whole is supported by the documents. This value should reflect the conclusion
+ you drew at the end of your step-by-step breakdown in overall_supported_explanation.
+ In the sentence_support_information field, provide information about the support
+ *for each sentence* in the response.
+ The sentence_support_information field is a list of objects, one for each sentence
+ in the response. Each object MUST have the following fields:
+ - response_sentence_key: a string identifying the sentence in the response.
+ This key is the same as the one used in the response above.
+ - explanation: a string explaining why the sentence is or is not supported by the
+ documents.
+ - supporting_sentence_keys: keys (e.g. ’0a’) of sentences from the documents that
+ support the response sentence. If the sentence is not supported, this list MUST
+ be empty. If the sentence is supported, this list MUST contain one or more keys.
+ In special cases where the sentence is supported, but not by any specific sentence,
+ you can use the string "supported_without_sentence" to indicate that the sentence
+ is generally supported by the documents. Consider cases where the sentence is
+ expressing inability to answer the question due to lack of relevant information in
+ the provided context as "supported_without_sentence". In cases where the sentence
+ is making a general statement (e.g. outlining the steps to produce an answer, or
+ summarizing previously stated sentences, or a transition sentence), use the
+ string "general". In cases where the sentence is correctly stating a well-known fact,
+ like a mathematical formula, use the string "well_known_fact". In cases where the
+ sentence is performing numerical reasoning (e.g. addition, multiplication), use
+ the string "numerical_reasoning".
+ - fully_supported: a boolean indicating whether the sentence is fully supported by
+ the documents.
+ - This value should reflect the conclusion you drew at the end of your step-by-step
+ breakdown in explanation.
+ - If supporting_sentence_keys is an empty list, then fully_supported must be false.
+ - Otherwise, use fully_supported to clarify whether everything in the response
+ sentence is fully supported by the document text indicated in supporting_sentence_keys
+ (fully_supported = true), or whether the sentence is only partially or incompletely
+ supported by that document text (fully_supported = false).
+ The all_utilized_sentence_keys field is a list of all sentence keys (e.g. ’0a’) that
+ were used to construct the answer. Include every sentence that either directly supported
+ the answer, or was implicitly used to construct the answer, even if it was not used
+ in its entirety. Omit sentences that were not used, and could have been removed from
+ the documents without affecting the answer.
+ You must respond with a valid JSON string. Use escapes for quotes, e.g. ‘\\"‘, and
+ newlines, e.g. ‘\\n‘. Do not write anything before or after the JSON string. Do not
+ wrap the JSON string in backticks like ‘‘‘ or ‘‘‘json.
+ As a reminder: your task is to review the response and assess which documents contain
+ useful information pertaining to the question, and how each sentence in the response
+ is supported by the text in the documents."""
+ 
+     # Step 5: Call the judge LLM
+     chat_completion = client.chat.completions.create(
+         messages=[
+             {"role": "user", "content": prompt}
+         ],
+         model="meta-llama/llama-4-maverick-17b-128e-instruct",  # alternatives: deepseek-r1-distill-llama-70b, llama3-70b-8192
+     )
+ 
+     return documents_formatted, chat_completion.choices[0].message.content.strip()
+ 
+ # Unused alternative judge call using the OpenAI API:
+ '''chat_completion = openai.chat.completions.create(
+     messages=[
+         {"role": "user",
+          "content": prompt}
+     ],
+     model="gpt-4o",
+     max_tokens=1024,
+ )
+ return documents_formatted, chat_completion.choices[0].message.content'''
+ 
+ def extract_retrieved_sentence_keys(document_text: str) -> list[str]:
+     """
+     Extracts sentence keys like '0a.', '0b.', etc. from a formatted document string.
+ 
+     Parameters:
+     - document_text (str): full text of the document with sentence keys
+ 
+     Returns:
+     - List of unique sentence keys in the order they appear
+     """
+     # Match patterns like 0a., 0b., 0z., plus the characters past 'z' (0{., 0|., ...)
+     # that split_into_keyed_sentences produces for long contexts
+     pattern = r'\b0[\w\{\|\}~€‚]\.'
+ 
+     matches = re.findall(pattern, document_text)
+     return list(dict.fromkeys(matches))  # Remove duplicates while preserving order
+ 
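+ # Illustrative example (hypothetical input): extract_retrieved_sentence_keys("0a. Foo. 0b. Bar.")
+ # returns ['0a.', '0b.'].
+ 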
+ def compute_ragbench_metrics(judge_response: dict, retrieved_sentence_keys: list[str]) -> dict:
+     """
+     Computes RAGBench-style metrics from the judge LLM response.
+ 
+     Parameters:
+     - judge_response (dict): JSON response from the judge LLM
+     - retrieved_sentence_keys (list of str): all sentence keys from the retrieved documents
+ 
+     Returns:
+     - Dictionary with Context Relevance, Context Utilization, Completeness, and Adherence
+     """
+     R = set(judge_response.get("all_relevant_sentence_keys", []))  # Relevant sentences
+     U = set(judge_response.get("all_utilized_sentence_keys", []))  # Utilized sentences
+     intersection_RU = R & U
+ 
+     total_retrieved = len(retrieved_sentence_keys)
+     len_R = len(R)
+     len_U = len(U)
+     len_intersection = len(intersection_RU)
+ 
+     # Context Relevance: fraction of retrieved context that is relevant
+     context_relevance = len_R / total_retrieved if total_retrieved else 0.0
+ 
+     # Context Utilization: fraction of retrieved context that was used
+     context_utilization = len_U / total_retrieved if total_retrieved else 0.0
+ 
+     # Completeness: fraction of relevant content that was used
+     completeness = len_intersection / len_R if len_R else 0.0
+ 
+     # Adherence: 1 if every response sentence is fully supported, else 0
+     is_fully_supported = all(s.get("fully_supported", False)
+                              for s in judge_response.get("sentence_support_information", []))
+     adherence = 1.0 if is_fully_supported and judge_response.get("overall_supported", False) else 0.0
+ 
+     return {
+         "Context Relevance": round(context_relevance, 4),
+         "Context Utilization": round(context_utilization, 4),
+         "Completeness": round(completeness, 4),
+         "Adherence": adherence
+     }
+ 
+ 
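+ # Illustrative arithmetic (hypothetical values): with 10 retrieved sentence keys,
+ # R = {"0a", "0b"} and U = {"0b", "0c"}: relevance = 2/10 = 0.2,
+ # utilization = 2/10 = 0.2, completeness = |R ∩ U| / |R| = 1/2 = 0.5.
+ 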
+ def compute_rmse(gt, pred):
+     return round(np.sqrt(np.mean((np.array(gt) - np.array(pred)) ** 2)), 4)
+ 
+ 
+ def evaluate_rag_pipeline(q_indices):
+ 
+     def safe_append(gt_list, pred_list, gt_val, pred_val):
+         if gt_val is not None and pred_val is not None:
+             gt_list.append(gt_val)
+             pred_list.append(pred_val)
+ 
+     def clean_and_parse_json_block(text):
+         # Strip a markdown-style code block if present
+         code_block_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
+         if code_block_match:
+             text = code_block_match.group(1).strip()
+ 
+         # Remove invalid/control characters that break decoding
+         text = re.sub(r"[^\x20-\x7E\n\t]", "", text)
+ 
+         try:
+             return json.loads(text)
+         except json.JSONDecodeError as e:
+             print("❌ JSON Decode Error:", e)
+             print("⚠️ Cleaned text:\n", text)
+             raise
+ 
+     gt_relevance, pred_relevance = [], []
+     gt_utilization, pred_utilization = [], []
+     gt_completeness, pred_completeness = [], []
+     gt_adherence, pred_adherence = [], []
+ 
+     for i in q_indices:
+         query = dataset[i]['question']
+         print(f"\n\n\nQuery:{i}.{query}\n====================================================================")
+         documents_formatted, response = judge_response_rag(
+             query, embedder="nlpaueb/legal-bert-base-uncased", domain="legal")
+         judge_response = clean_and_parse_json_block(response)
+         print(f"\n======================================================================\nResponse:{judge_response}")
+         retrieved_sentences = extract_retrieved_sentence_keys(documents_formatted)
+         predicted = compute_ragbench_metrics(judge_response, retrieved_sentences)
+ 
+         # Ground-truth values from the RAGBench dataset
+         gt_r = dataset[i].get('relevance_score')
+         gt_u = dataset[i].get('utilization_score')
+         gt_c = dataset[i].get('completeness_score')
+         gt_a = dataset[i].get('gpt3_adherence')
+ 
+         safe_append(gt_relevance, pred_relevance, gt_r, predicted['Context Relevance'])
+         safe_append(gt_utilization, pred_utilization, gt_u, predicted['Context Utilization'])
+         safe_append(gt_completeness, pred_completeness, gt_c, predicted['Completeness'])
+         if gt_a is not None and predicted['Adherence'] is not None:
+             safe_append(gt_adherence, pred_adherence, int(gt_a), int(predicted['Adherence']))
+ 
+     result = {
+         "Context Relevance": compute_rmse(gt_relevance, pred_relevance),
+         "Context Utilization": compute_rmse(gt_utilization, pred_utilization),
+         "Completeness": compute_rmse(gt_completeness, pred_completeness),
+     }
+ 
+     # AUC-ROC is only defined when both adherence classes appear in the ground truth
+     if len(set(gt_adherence)) == 2:
+         result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
+         result["AUC-ROC (Adherence)"] = round(roc_auc_score(gt_adherence, pred_adherence), 4)
+     elif gt_adherence:
+         result["Adherence"] = compute_rmse(gt_adherence, pred_adherence)
+         result["AUC-ROC (Adherence)"] = "N/A - one class only"
+     else:
+         result["Adherence"] = "N/A - no adherence labels"
+         result["AUC-ROC (Adherence)"] = "N/A - no adherence labels"
+ 
+     return result
+ 
+ 
+ # Wrapper that parses the textbox input into a list of ints
+ def evaluate_rag_gradio(q_indices_str):
+     # Capture printed logs
+     log_stream = io.StringIO()
+     sys.stdout = log_stream
+ 
+     try:
+         q_indices = [int(x.strip()) for x in q_indices_str.split(",") if x.strip().isdigit()]
+         results = evaluate_rag_pipeline(q_indices)
+ 
+         # Return metrics and logs
+         logs = log_stream.getvalue()
+         return results, logs
+ 
+     except Exception as e:
+         traceback.print_exc()
+         return {"error": str(e)}, log_stream.getvalue()
+ 
+     finally:
+         sys.stdout = sys.__stdout__
+ 
+ iface = gr.Interface(
+     fn=evaluate_rag_gradio,
+     inputs=gr.Textbox(label="Comma-separated Query Indices (e.g. 89,121,245)", lines=1),
+     outputs=[
+         gr.JSON(label="Evaluation Metrics (RMSE & AUC-ROC)"),
+         gr.Textbox(label="Execution Log", lines=5, interactive=False)
+     ],
+     title="RAG Evaluation Dashboard",
+     description="Evaluate the RAG pipeline on selected RAGBench (CUAD) queries using LLM-based generation and judgment."
+ )
+ 
+ iface.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio
+ transformers
+ sentence-transformers
+ faiss-cpu
+ torch
+ datasets
+ scikit-learn
+ groq
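+ # Note: app.py also imports numpy and requests directly; both are normally pulled in
+ # transitively (e.g. by datasets), but they can be pinned here explicitly if preferred.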