Niklas Hoepner committed
Commit · 06fb7ae
1 Parent(s): 2a3f461

Better error handling in L3Score computation

Browse files
- L3Score.py +47 -9
- __pycache__/L3Score.cpython-311.pyc +0 -0
- app.py +2 -1
- requirements.txt +2 -1
- tests.py +1 -17
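In short, the commit makes `_compute` surface provider failures (invalid key, rate limit, bad request) as a dict with an "error" key instead of printing the exception and skipping the sample. A minimal usage sketch, assuming `compute` forwards the keyword arguments visible in the diff (`questions`, `api_key`, `provider`, `model`) to `_compute`; the example data is purely illustrative:

    from L3Score import L3Score

    l3score = L3Score()

    result = l3score.compute(
        questions=["What is the capital of France?"],
        predictions=["Paris"],
        references=["Paris"],
        api_key="sk-...",        # your provider API key
        provider="openai",
        model="gpt-4o-mini",
    )

    # With this commit, provider failures come back as {"error": "..."}
    # rather than being printed and skipped, so callers should check for it.
    if "error" in result:
        print(result["error"])
    else:
        print(result)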
L3Score.py
CHANGED
@@ -24,6 +24,8 @@ import evaluate
 import datasets
 import numpy as np
 
+import openai
+
 from langchain.chat_models.base import init_chat_model
 
 
@@ -109,7 +111,7 @@ class L3Score(evaluate.Metric):
         """Optional: download external resources useful to compute the scores"""
         pass
 
-    def _verify_input(self, questions, predictions, references, provider):
+    def _verify_input(self, questions, predictions, references, provider, api_key, model):
         """Verify the input parameters"""
 
         if provider not in PROVIDER_WITH_TOP_LOGPROBS:
@@ -118,7 +120,31 @@ class L3Score(evaluate.Metric):
                     PROVIDER_WITH_TOP_LOGPROBS
                 )
             )
-
+
+        # Check whether the model is available
+        try:
+            if provider == "openai":
+                client = openai.OpenAI(api_key=api_key)
+                model_names = set([model.id for model in client.models.list()])
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+            elif provider == "deepseek":
+                client = openai.OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
+                model_names = [model.id for model in client.models.list()]
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+            elif provider == "xai":
+                client = openai.OpenAI(api_key=api_key, base_url="https://api.xai.com")
+                model_names = [model.id for model in client.models.list()]
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+        except openai.AuthenticationError as e:
+            message = e.body["message"]
+            return {"error": f"Authentication failed: {message}"}
+
         assert len(questions) == len(predictions) == len(references), "Questions, predictions and references must have the same length"
 
 
@@ -127,6 +153,7 @@ class L3Score(evaluate.Metric):
         llm = init_chat_model(model=model, api_key=api_key)
         llm = llm.bind(logprobs=True, top_logprobs=5)
         return llm
+
 
     def _compute(
         self,
@@ -138,16 +165,17 @@
         model="gpt-4o-mini",
     ):
         """Returns the scores"""
-
-
+
         # Check whether llm can be initialized
-        self._verify_input(questions, predictions, references, provider)
+        self._verify_input(questions, predictions, references, provider, api_key, model)
 
         # Initialize the LLM
         llm = self._get_llm(model, api_key)
 
+
         L3Score = 0
         count = 0
+
         for question, prediction, reference in zip(questions, predictions, references):
             try:
                 response = llm.invoke(
@@ -156,10 +184,20 @@
                         _PROMPT.format(question=question, gt=reference, answer=prediction),
                     )
                 )
-            except Exception as e:
-                print(e)
-                continue
 
+            except openai.AuthenticationError as e:
+                message = e.body["message"]
+                return {"error": f"Authentication failed: {message}"}
+            except openai.RateLimitError as e:
+                message = e.body["message"]
+                return {"error": "Rate limit exceeded: {}".format(e)}
+            except openai.BadRequestError as e:
+                message = e.body["message"]
+                return {"error": "Bad request: {}".format(e)}
+            except Exception as e:
+                message = e.body["message"]
+                return {"error": "An error occurred: {}".format(e)}
+
             score = self._calculate_L3Score(
                 response.response_metadata["logprobs"]["content"][0]["top_logprobs"]
             )
@@ -257,4 +295,4 @@ if __name__ == "__main__":
         provider="deepseek",
         model="deepseek-coder",
     )
-
+
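The new availability check in `_verify_input` lists models through the OpenAI-compatible client for each provider. A standalone sketch of the same pattern, with base URLs copied from the diff (the xAI URL is used exactly as written there); names like `model_available` and `BASE_URLS` are illustrative only:

    import openai

    # Base URLs as they appear in the committed _verify_input.
    BASE_URLS = {
        "openai": None,                        # default api.openai.com endpoint
        "deepseek": "https://api.deepseek.com",
        "xai": "https://api.xai.com",
    }

    def model_available(provider, model, api_key):
        # List the provider's models and check the requested ID is among them.
        client = openai.OpenAI(api_key=api_key, base_url=BASE_URLS[provider])
        return model in {m.id for m in client.models.list()}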
__pycache__/L3Score.cpython-311.pyc
ADDED
Binary file (13.8 kB).
app.py
CHANGED
@@ -1,7 +1,8 @@
 import gradio as gr
 import evaluate
+from L3Score import L3Score
 
-l3score =
+l3score = L3Score()
 
 def compute_l3score(api_key, provider, model, questions, predictions, references):
     try:
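The rest of `compute_l3score` lies outside this hunk. A hypothetical completion, only to show how the error dict produced by `L3Score._compute` could be passed through to the Gradio interface; this is not the committed code:

    # Hypothetical sketch of the truncated function body.
    def compute_l3score(api_key, provider, model, questions, predictions, references):
        try:
            result = l3score.compute(
                questions=questions,
                predictions=predictions,
                references=references,
                api_key=api_key,
                provider=provider,
                model=model,
            )
            # Error dicts such as {"error": "Authentication failed: ..."} are
            # returned unchanged so the UI can display them.
            return result
        except Exception as e:
            return {"error": str(e)}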
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ langchain-deepseek==0.1.3
 langchain-openai==0.3.12
 langchain-community==0.3.21
 langchain-core==0.3.52
-numpy==2.2.4
+numpy==2.2.4
+openai==1.74.0
tests.py
CHANGED
@@ -1,17 +1 @@
-test_cases = [
-    {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
-    },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
-]
+test_cases = []
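The old fixtures with bare 0/1 predictions are dropped and the list is left empty. A purely hypothetical shape for a future entry, reusing the `metric_score` key from the removed fixtures; neither the key nor the expected value is confirmed by this commit:

    # Hypothetical test-case shape; the committed file leaves test_cases empty.
    test_cases = [
        {
            "questions": ["What is the capital of France?"],
            "predictions": ["Paris"],
            "references": ["Paris"],
            "result": {"metric_score": 1.0},  # assumed expected value, for illustration only
        }
    ]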