Niklas Hoepner committed
Commit · 06fb7ae
1 Parent(s): 2a3f461

Better error handling in L3Score computation

Browse files
- L3Score.py +47 -9
- __pycache__/L3Score.cpython-311.pyc +0 -0
- app.py +2 -1
- requirements.txt +2 -1
- tests.py +1 -17
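In short, the commit makes `_compute` surface provider failures (invalid key, rate limit, bad request) as a dict with an "error" key instead of printing the exception and skipping the sample. A minimal usage sketch, assuming `compute` forwards the keyword arguments visible in the diff (`questions`, `api_key`, `provider`, `model`) to `_compute`; the example data is purely illustrative:

    from L3Score import L3Score

    l3score = L3Score()

    result = l3score.compute(
        questions=["What is the capital of France?"],
        predictions=["Paris"],
        references=["Paris"],
        api_key="sk-...",        # your provider API key
        provider="openai",
        model="gpt-4o-mini",
    )

    # With this commit, provider failures come back as {"error": "..."}
    # rather than being printed and skipped, so callers should check for it.
    if "error" in result:
        print(result["error"])
    else:
        print(result)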
L3Score.py
CHANGED
@@ -24,6 +24,8 @@ import evaluate
 import datasets
 import numpy as np
 
+import openai
+
 from langchain.chat_models.base import init_chat_model
 
 
@@ -109,7 +111,7 @@ class L3Score(evaluate.Metric):
         """Optional: download external resources useful to compute the scores"""
         pass
 
-    def _verify_input(self, questions, predictions, references, provider):
+    def _verify_input(self, questions, predictions, references, provider, api_key, model):
         """Verify the input parameters"""
 
         if provider not in PROVIDER_WITH_TOP_LOGPROBS:
@@ -118,7 +120,31 @@ class L3Score(evaluate.Metric):
                     PROVIDER_WITH_TOP_LOGPROBS
                 )
             )
-
+
+        # Check whether the model is available
+        try:
+            if provider == "openai":
+                client = openai.OpenAI(api_key=api_key)
+                model_names = set([model.id for model in client.models.list()])
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+            elif provider == "deepseek":
+                client = openai.OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
+                model_names = [model.id for model in client.models.list()]
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+            elif provider == "xai":
+                client = openai.OpenAI(api_key=api_key, base_url="https://api.xai.com")
+                model_names = [model.id for model in client.models.list()]
+                if model not in model_names:
+                    raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+
+        except openai.AuthenticationError as e:
+            message = e.body["message"]
+            return {"error": f"Authentication failed: {message}"}
+
         assert len(questions) == len(predictions) == len(references), "Questions, predictions and references must have the same length"
 
 
@@ -127,6 +153,7 @@ class L3Score(evaluate.Metric):
         llm = init_chat_model(model=model, api_key=api_key)
         llm = llm.bind(logprobs=True, top_logprobs=5)
         return llm
+
 
     def _compute(
         self,
@@ -138,16 +165,17 @@
         model="gpt-4o-mini",
     ):
         """Returns the scores"""
-
-
+
         # Check whether llm can be initialized
-        self._verify_input(questions, predictions, references, provider)
+        self._verify_input(questions, predictions, references, provider, api_key, model)
 
         # Initialize the LLM
         llm = self._get_llm(model, api_key)
 
+
         L3Score = 0
         count = 0
+
         for question, prediction, reference in zip(questions, predictions, references):
             try:
                 response = llm.invoke(
@@ -156,10 +184,20 @@
                         _PROMPT.format(question=question, gt=reference, answer=prediction),
                     )
                 )
-            except Exception as e:
-                print(e)
-                continue
 
+            except openai.AuthenticationError as e:
+                message = e.body["message"]
+                return {"error": f"Authentication failed: {message}"}
+            except openai.RateLimitError as e:
+                message = e.body["message"]
+                return {"error": "Rate limit exceeded: {}".format(e)}
+            except openai.BadRequestError as e:
+                message = e.body["message"]
+                return {"error": "Bad request: {}".format(e)}
+            except Exception as e:
+                message = e.body["message"]
+                return {"error": "An error occurred: {}".format(e)}
+
             score = self._calculate_L3Score(
                 response.response_metadata["logprobs"]["content"][0]["top_logprobs"]
             )
@@ -257,4 +295,4 @@ if __name__ == "__main__":
         provider="deepseek",
         model="deepseek-coder",
     )
-
+
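The new availability check in `_verify_input` lists models through the OpenAI-compatible client for each provider. A standalone sketch of the same pattern, with base URLs copied from the diff (the xAI URL is used exactly as written there); names like `model_available` and `BASE_URLS` are illustrative only:

    import openai

    # Base URLs as they appear in the committed _verify_input.
    BASE_URLS = {
        "openai": None,                        # default api.openai.com endpoint
        "deepseek": "https://api.deepseek.com",
        "xai": "https://api.xai.com",
    }

    def model_available(provider, model, api_key):
        # List the provider's models and check the requested ID is among them.
        client = openai.OpenAI(api_key=api_key, base_url=BASE_URLS[provider])
        return model in {m.id for m in client.models.list()}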
__pycache__/L3Score.cpython-311.pyc
ADDED
Binary file (13.8 kB).
app.py
CHANGED
@@ -1,7 +1,8 @@
 import gradio as gr
 import evaluate
+from L3Score import L3Score
 
-l3score =
+l3score = L3Score()
 
 def compute_l3score(api_key, provider, model, questions, predictions, references):
     try:
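The rest of `compute_l3score` lies outside this hunk. A hypothetical completion, only to show how the error dict produced by `L3Score._compute` could be passed through to the Gradio interface; this is not the committed code:

    # Hypothetical sketch of the truncated function body.
    def compute_l3score(api_key, provider, model, questions, predictions, references):
        try:
            result = l3score.compute(
                questions=questions,
                predictions=predictions,
                references=references,
                api_key=api_key,
                provider=provider,
                model=model,
            )
            # Error dicts such as {"error": "Authentication failed: ..."} are
            # returned unchanged so the UI can display them.
            return result
        except Exception as e:
            return {"error": str(e)}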
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ langchain-deepseek==0.1.3
 langchain-openai==0.3.12
 langchain-community==0.3.21
 langchain-core==0.3.52
-numpy==2.2.4
+numpy==2.2.4
+openai==1.74.0
tests.py
CHANGED
@@ -1,17 +1 @@
-test_cases = [
-    {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
-    },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
-]
+test_cases = []
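The old fixtures with bare 0/1 predictions are dropped and the list is left empty. A purely hypothetical shape for a future entry, reusing the `metric_score` key from the removed fixtures; neither the key nor the expected value is confirmed by this commit:

    # Hypothetical test-case shape; the committed file leaves test_cases empty.
    test_cases = [
        {
            "questions": ["What is the capital of France?"],
            "predictions": ["Paris"],
            "references": ["Paris"],
            "result": {"metric_score": 1.0},  # assumed expected value, for illustration only
        }
    ]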