Better error handling in L3Score computation
Files changed:

- L3Score.py  +47 -9
- __pycache__/L3Score.cpython-311.pyc  +0 -0
- app.py  +2 -1
- requirements.txt  +2 -1
- tests.py  +1 -17
L3Score.py (CHANGED)

```diff
@@ -24,6 +24,8 @@ import evaluate
 import datasets
 import numpy as np
 
+import openai
+
 from langchain.chat_models.base import init_chat_model
 
 
@@ -109,7 +111,7 @@ class L3Score(evaluate.Metric):
         """Optional: download external resources useful to compute the scores"""
         pass
 
-    def _verify_input(self, questions, predictions, references, provider):
+    def _verify_input(self, questions, predictions, references, provider, api_key, model):
         """Verify the input parameters"""
 
         if provider not in PROVIDER_WITH_TOP_LOGPROBS:
@@ -118,7 +120,31 @@ class L3Score(evaluate.Metric):
                     PROVIDER_WITH_TOP_LOGPROBS
                 )
             )
-
+
+        # Check whether the model is available
+        try:
+            if provider == "openai":
+                client = openai.OpenAI(api_key=api_key)
+                model_names = set([model.id for model in client.models.list()])
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+            elif provider == "deepseek":
+                client = openai.OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
+                model_names = [model.id for model in client.models.list()]
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+            elif provider == "xai":
+                client = openai.OpenAI(api_key=api_key, base_url="https://api.xai.com")
+                model_names = [model.id for model in client.models.list()]
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+        except openai.AuthenticationError as e:
+            message = e.body["message"]
+            return {"error": f"Authentication failed: {message}"}
+
         assert len(questions) == len(predictions) == len(references), "Questions, predictions and references must have the same length"
 
 
@@ -127,6 +153,7 @@ class L3Score(evaluate.Metric):
         llm = init_chat_model(model=model, api_key=api_key)
         llm = llm.bind(logprobs=True, top_logprobs=5)
         return llm
+
 
     def _compute(
         self,
@@ -138,16 +165,17 @@ class L3Score(evaluate.Metric):
         model="gpt-4o-mini",
     ):
         """Returns the scores"""
-
-
+
         # Check whether llm can be initialized
-        self._verify_input(questions, predictions, references, provider)
+        self._verify_input(questions, predictions, references, provider, api_key, model)
 
         # Initialize the LLM
         llm = self._get_llm(model, api_key)
 
+
         L3Score = 0
         count = 0
+
         for question, prediction, reference in zip(questions, predictions, references):
             try:
                 response = llm.invoke(
@@ -156,10 +184,20 @@ class L3Score(evaluate.Metric):
                         _PROMPT.format(question=question, gt=reference, answer=prediction),
                     )
                 )
-            except Exception as e:
-                print(e)
-                continue
 
+            except openai.AuthenticationError as e:
+                message = e.body["message"]
+                return {"error": f"Authentication failed: {message}"}
+            except openai.RateLimitError as e:
+                message = e.body["message"]
+                return {"error": "Rate limit exceeded: {}".format(e)}
+            except openai.BadRequestError as e:
+                message = e.body["message"]
+                return {"error": "Bad request: {}".format(e)}
+            except Exception as e:
+                message = e.body["message"]
+                return {"error": "An error occurred: {}".format(e)}
+
             score = self._calculate_L3Score(
                 response.response_metadata["logprobs"]["content"][0]["top_logprobs"]
             )
@@ -257,4 +295,4 @@ if __name__ == "__main__":
         provider="deepseek",
         model="deepseek-coder",
     )
-
+
```
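With these handlers in place, provider failures inside `_verify_input` and the scoring loop come back as an `{"error": ...}` dictionary rather than escaping as unhandled exceptions. The minimal sketch below is not part of the commit: it assumes the metric is instantiated directly from the module, as app.py now does, and that `compute` forwards the same keyword arguments as `_compute`; the API key is a placeholder.

```python
# Minimal usage sketch of the new error handling (assumptions noted above).
from L3Score import L3Score

l3score = L3Score()

result = l3score.compute(
    questions=["What is the capital of France?"],
    predictions=["Paris"],
    references=["Paris"],
    api_key="sk-invalid",  # placeholder key, triggers openai.AuthenticationError
    provider="openai",
    model="gpt-4o-mini",
)

# After this commit the failure comes back as data, e.g.
# {"error": "Authentication failed: ..."}, instead of a raised exception.
if isinstance(result, dict) and "error" in result:
    print(result["error"])
else:
    print(result)
```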
__pycache__/L3Score.cpython-311.pyc (ADDED)

Binary file (13.8 kB).
app.py (CHANGED)

```diff
@@ -1,7 +1,8 @@
 import gradio as gr
 import evaluate
+from L3Score import L3Score
 
-l3score =
+l3score = L3Score()
 
 def compute_l3score(api_key, provider, model, questions, predictions, references):
     try:
```
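Because app.py now instantiates the metric directly instead of loading it through `evaluate`, the Gradio helper can hand the new error dictionary straight back to the UI. The sketch below is illustrative only: the diff shows just the signature and opening `try:` of `compute_l3score`, so the input parsing and return shape here are assumptions.

```python
# Hypothetical body for compute_l3score; only its signature appears in the diff.
# The pipe-separated input format is an assumption made for the example.
def compute_l3score(api_key, provider, model, questions, predictions, references):
    try:
        result = l3score.compute(
            questions=[q.strip() for q in questions.split("|")],
            predictions=[p.strip() for p in predictions.split("|")],
            references=[r.strip() for r in references.split("|")],
            api_key=api_key,
            provider=provider,
            model=model,
        )
        # Either the score dict or, after this commit, {"error": "..."}.
        return result
    except Exception as e:
        # Anything the metric did not already translate into an error dict
        # still reaches the UI as a readable message.
        return {"error": str(e)}
```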
requirements.txt (CHANGED)

```diff
@@ -4,4 +4,5 @@ langchain-deepseek==0.1.3
 langchain-openai==0.3.12
 langchain-community==0.3.21
 langchain-core==0.3.52
-numpy==2.2.4
+numpy==2.2.4
+openai==1.74.0
```
tests.py (CHANGED)

```diff
@@ -1,17 +1 @@
-test_cases = [
-    {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
-    },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
-]
+test_cases = []
```