Niklas Hoepner committed
Commit 06fb7ae · 1 Parent(s): 2a3f461

Better error handling in L3Score computation

Files changed (5):
  1. L3Score.py +47 -9
  2. __pycache__/L3Score.cpython-311.pyc +0 -0
  3. app.py +2 -1
  4. requirements.txt +2 -1
  5. tests.py +1 -17
L3Score.py CHANGED
@@ -24,6 +24,8 @@ import evaluate
 import datasets
 import numpy as np
 
+import openai
+
 from langchain.chat_models.base import init_chat_model
 
 
@@ -109,7 +111,7 @@ class L3Score(evaluate.Metric):
        """Optional: download external resources useful to compute the scores"""
        pass
 
-    def _verify_input(self, questions, predictions, references, provider):
+    def _verify_input(self, questions, predictions, references, provider, api_key, model):
        """Verify the input parameters"""
 
        if provider not in PROVIDER_WITH_TOP_LOGPROBS:
@@ -118,7 +120,31 @@
                    PROVIDER_WITH_TOP_LOGPROBS
                )
            )
-
+
+        # Check whether the model is available
+        try:
+            if provider == "openai":
+                client = openai.OpenAI(api_key=api_key)
+                model_names = set([model.id for model in client.models.list()])
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+            elif provider == "deepseek":
+                client = openai.OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
+                model_names = [model.id for model in client.models.list()]
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+            elif provider == "xai":
+                client = openai.OpenAI(api_key=api_key, base_url="https://api.xai.com")
+                model_names = [model.id for model in client.models.list()]
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+        except openai.AuthenticationError as e:
+            message = e.body["message"]
+            return {"error": f"Authentication failed: {message}"}
+
        assert len(questions) == len(predictions) == len(references), "Questions, predictions and references must have the same length"
 
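Note: the three provider branches repeat the same list-models-and-check pattern against an OpenAI-compatible endpoint, and the xAI base URL in the commit ("https://api.xai.com") differs from the commonly documented "https://api.x.ai/v1". A minimal table-driven sketch of the same check; the base URLs and the helper name are assumptions, not values confirmed by this repository:

import openai

# Assumed base URLs; None falls back to the client's default OpenAI endpoint.
BASE_URLS = {
    "openai": None,
    "deepseek": "https://api.deepseek.com",
    "xai": "https://api.x.ai/v1",  # assumption; the commit uses "https://api.xai.com"
}

def check_model_available(provider: str, model: str, api_key: str) -> None:
    """Raise ValueError if `model` is not served by `provider`."""
    client = openai.OpenAI(api_key=api_key, base_url=BASE_URLS[provider])
    available = {m.id for m in client.models.list()}
    if model not in available:
        raise ValueError(
            f"Model {model} not found for provider {provider}, available models: {available}"
        )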
 
@@ -127,6 +153,7 @@
        llm = init_chat_model(model=model, api_key=api_key)
        llm = llm.bind(logprobs=True, top_logprobs=5)
        return llm
+
 
    def _compute(
        self,
@@ -138,16 +165,17 @@
        model="gpt-4o-mini",
    ):
        """Returns the scores"""
-        print(questions,predictions,references)
-
+
        # Check whether llm can be initialized
-        self._verify_input(questions, predictions, references, provider)
+        self._verify_input(questions, predictions, references, provider, api_key, model)
 
        # Initialize the LLM
        llm = self._get_llm(model, api_key)
 
+
        L3Score = 0
        count = 0
+
        for question, prediction, reference in zip(questions, predictions, references):
            try:
                response = llm.invoke(
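Note: _verify_input now returns an {"error": ...} dict when authentication fails, but _compute still discards its return value, so that error would be lost and the run would fail later with a less helpful message. A two-line sketch of propagating it, keeping the commit's dict-on-error convention (the variable name is illustrative):

# Inside _compute, capture and forward the verification result:
error = self._verify_input(questions, predictions, references, provider, api_key, model)
if error is not None:
    return error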
@@ -156,10 +184,20 @@
                        _PROMPT.format(question=question, gt=reference, answer=prediction),
                    )
                )
-            except Exception as e:
-                print(e)
-                continue
 
+            except openai.AuthenticationError as e:
+                message = e.body["message"]
+                return {"error": f"Authentication failed: {message}"}
+            except openai.RateLimitError as e:
+                message = e.body["message"]
+                return {"error": "Rate limit exceeded: {}".format(e)}
+            except openai.BadRequestError as e:
+                message = e.body["message"]
+                return {"error": "Bad request: {}".format(e)}
+            except Exception as e:
+                message = e.body["message"]
+                return {"error": "An error occurred: {}".format(e)}
+
            score = self._calculate_L3Score(
                response.response_metadata["logprobs"]["content"][0]["top_logprobs"]
            )
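Note: three of the four new handlers assign message = e.body["message"] and then format the raw exception e instead, and the final bare except Exception also reads e.body, which would raise AttributeError for non-OpenAI exceptions and mask the original failure. A hedged sketch of tighter handling as a standalone helper (the function names are illustrative; e.body can also be None, hence the guard):

import openai

def _error_message(e: Exception) -> str:
    # OpenAI API errors usually carry a `body` dict; fall back to str(e) otherwise.
    body = getattr(e, "body", None)
    if isinstance(body, dict) and "message" in body:
        return body["message"]
    return str(e)

def invoke_safely(llm, prompt):
    """Return (response, None) on success or (None, error_dict) on failure."""
    try:
        return llm.invoke(prompt), None
    except openai.AuthenticationError as e:
        return None, {"error": f"Authentication failed: {_error_message(e)}"}
    except openai.RateLimitError as e:
        return None, {"error": f"Rate limit exceeded: {_error_message(e)}"}
    except openai.BadRequestError as e:
        return None, {"error": f"Bad request: {_error_message(e)}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {_error_message(e)}"}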
@@ -257,4 +295,4 @@ if __name__ == "__main__":
        provider="deepseek",
        model="deepseek-coder",
    )
-    print(results)
+
 
__pycache__/L3Score.cpython-311.pyc ADDED
Binary file (13.8 kB).
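Note: the compiled bytecode file above is a build artifact; adding __pycache__/ to the repository's .gitignore would keep files like this out of future commits.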
 
app.py CHANGED
@@ -1,7 +1,8 @@
 import gradio as gr
 import evaluate
+from L3Score import L3Score
 
-l3score = evaluate.load("nhop/L3Score")
+l3score = L3Score()
 
 def compute_l3score(api_key, provider, model, questions, predictions, references):
    try:
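Note: instantiating the local L3Score class directly avoids the round-trip through evaluate.load and the Hub while developing the Space. A hedged usage sketch for the callback; the keyword arguments follow this commit's _compute signature, and the key is a placeholder:

from L3Score import L3Score

l3score = L3Score()
results = l3score.compute(
    questions=["What is the capital of France?"],
    predictions=["Paris"],
    references=["Paris"],
    api_key="sk-...",  # placeholder, not a real key
    provider="openai",
    model="gpt-4o-mini",
)
print(results)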
requirements.txt CHANGED
@@ -4,4 +4,5 @@ langchain-deepseek==0.1.3
 langchain-openai==0.3.12
 langchain-community==0.3.21
 langchain-core==0.3.52
-numpy==2.2.4
+numpy==2.2.4
+openai==1.74.0
tests.py CHANGED
@@ -1,17 +1 @@
-test_cases = [
-    {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
-    },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
-]
+test_cases = []
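Note: the removed cases targeted a numeric accuracy-style metric and no longer match L3Score's question/prediction/reference inputs, which is presumably why the list was emptied rather than rewritten. A sketch of what a replacement case could look like; the field names follow the metric's inputs, while the result key and score are assumptions:

test_cases = [
    {
        "questions": ["What is the capital of France?"],
        "predictions": ["Paris"],
        "references": ["Paris"],
        "result": {"L3Score": 1.0},  # assumed key and value
    },
]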