Add R1 and Llama 3.1 70B to g_eval support.
Add tests.

Note: I tested many more models; these are the only ones that work. Fireworks only returns 5 logprobs (not enough). Ollama doesn't support logprobs. Amazon could work, but that can be done later.
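
For context on why the top-logprob count matters: G-Eval-style scoring weights the candidate score tokens by the probabilities reported in the logprobs, so a provider that only returns a handful of top logprobs can miss score tokens entirely. A minimal, self-contained sketch of the idea (the `top_logprobs` values are made up for illustration; this is not the exact Kiln implementation):

```python
import math

# Hypothetical top-logprobs for the judge's score token (token -> logprob),
# e.g. from an OpenAI-style chat completion with logprobs enabled.
top_logprobs = {"5": -0.3, "4": -1.6, "3": -3.2, "1": -6.0, '"': -7.1}

# Candidate score tokens mapped to numeric scores (same idea as
# TOKEN_TO_SCORE_MAP in kiln_ai.adapters.eval.g_eval; values assumed here).
token_to_score = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0}

# Weighted average over whichever score tokens appear in the top logprobs.
weights = {
    tok: math.exp(lp) for tok, lp in top_logprobs.items() if tok in token_to_score
}
score = sum(token_to_score[tok] * w for tok, w in weights.items()) / sum(weights.values())
print(f"Weighted G-Eval-style score: {score:.2f}")  # lands in the 1.0-5.0 range
```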

Note: slightly ugly provider-specific code is leaking into the OAI-compatible adapter. Okay for now, but we should limit this.
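
One way that leakage could be contained later (a hypothetical refactor sketch, not part of this commit): keep per-provider request options in a single registry the adapter consults, instead of branching on provider names inline. The OpenRouter options below mirror the ones the adapter currently sets.

```python
from typing import Any

# Hypothetical registry of provider-specific request options; the OpenRouter
# entry mirrors the options the adapter currently builds inline.
PROVIDER_EXTRA_BODY: dict[str, dict[str, Any]] = {
    "openrouter": {
        "provider": {
            "require_parameters": True,
            "ignore": ["DeepInfra"],  # fp8 quants are awful
        }
    },
}


def build_extra_body(provider_name: str) -> dict[str, Any]:
    """Return a copy of the provider-specific options, or an empty dict."""
    return dict(PROVIDER_EXTRA_BODY.get(provider_name, {}))
```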
scosman committed Feb 24, 2025
1 parent 0af1cdf commit 7d3cccb
Showing 4 changed files with 108 additions and 14 deletions.
69 changes: 63 additions & 6 deletions libs/core/kiln_ai/adapters/eval/test_g_eval.py
@@ -4,7 +4,9 @@
import pytest
from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP, GEval, GEvalTask
from kiln_ai.adapters.eval.test_g_eval_data import serialized_run_output
from kiln_ai.adapters.ml_model_list import built_in_models
from kiln_ai.adapters.model_adapters.base_adapter import RunOutput
from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers
from kiln_ai.datamodel import (
BasePrompt,
DataSource,
@@ -130,15 +132,20 @@ def test_task_run(test_task):
return task_run


@pytest.mark.parametrize(
"config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge]
)
@pytest.mark.paid
async def test_run_g_eval(
test_task, test_eval_config, test_task_run, config_type, test_run_config
async def run_g_eval_test(
test_task,
test_eval_config,
test_task_run,
config_type,
test_run_config,
model_name: str | None = None,
provider_name: str | None = None,
):
# Create G-Eval instance
test_eval_config.config_type = config_type
if model_name is not None and provider_name is not None:
test_eval_config.model.properties["model_name"] = model_name
test_eval_config.model.properties["model_provider"] = provider_name
g_eval = GEval(test_eval_config, test_run_config)

# Run the evaluation
@@ -160,6 +167,18 @@ async def test_run_g_eval(
assert 1.0 <= overall <= 5.0


@pytest.mark.parametrize(
"config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge]
)
@pytest.mark.paid
async def test_run_g_eval(
test_task, test_eval_config, test_task_run, config_type, test_run_config
):
await run_g_eval_test(
test_task, test_eval_config, test_task_run, config_type, test_run_config
)


@pytest.mark.parametrize(
"config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge]
)
@@ -445,3 +464,41 @@ def test_g_eval_system_instruction():
g_eval_task.instruction
== "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
)


def check_supports_logprobs(model_name: str, provider_name: str):
for model in built_in_models:
if model.name != model_name:
continue
for provider in model.providers:
if provider.name != provider_name:
continue
if not provider.supports_logprobs:
pytest.skip(
f"Skipping {model.name} {provider.name} because it does not support logprobs"
)
return
raise RuntimeError(f"No model {model_name} {provider_name} found")


@pytest.mark.paid
@pytest.mark.ollama
@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
async def test_all_built_in_models_logprobs_geval(
model_name,
provider_name,
test_task,
test_eval_config,
test_task_run,
test_run_config,
):
check_supports_logprobs(model_name, provider_name)
await run_g_eval_test(
test_task,
test_eval_config,
test_task_run,
EvalConfigType.g_eval,
test_run_config,
model_name,
provider_name,
)
2 changes: 2 additions & 0 deletions libs/core/kiln_ai/adapters/ml_model_list.py
@@ -245,6 +245,7 @@ class KilnModel(BaseModel):
# No custom parser -- openrouter implemented it themselves
structured_output_mode=StructuredOutputMode.json_instructions,
reasoning_capable=True,
supports_logprobs=True,
),
KilnModelProvider(
name=ModelProviderName.fireworks_ai,
@@ -393,6 +394,7 @@ class KilnModel(BaseModel):
supports_data_gen=False,
structured_output_mode=StructuredOutputMode.function_calling,
provider_options={"model": "meta-llama/llama-3.1-70b-instruct"},
supports_logprobs=True,
),
KilnModelProvider(
name=ModelProviderName.ollama,
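
For reference, a minimal sketch (hypothetical helper, not from this commit) of how the new `supports_logprobs` flag can be consumed, e.g. to enumerate which built-in model/provider pairs are eligible for logprob-based G-Eval:

```python
from kiln_ai.adapters.ml_model_list import built_in_models


def logprob_capable_pairs() -> list[tuple[str, str]]:
    """Collect (model_name, provider_name) pairs whose provider reports logprob support."""
    pairs = []
    for model in built_in_models:
        for provider in model.providers:
            if provider.supports_logprobs:
                pairs.append((model.name, provider.name))
    return pairs


for model_name, provider_name in logprob_capable_pairs():
    print(model_name, provider_name)
```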
22 changes: 16 additions & 6 deletions libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py
@@ -9,7 +9,7 @@
)

import kiln_ai.datamodel as datamodel
from kiln_ai.adapters.ml_model_list import StructuredOutputMode
from kiln_ai.adapters.ml_model_list import ModelProviderName, StructuredOutputMode
from kiln_ai.adapters.model_adapters.base_adapter import (
COT_FINAL_ANSWER_PROMPT,
AdapterConfig,
@@ -115,6 +115,12 @@ async def _run(self, input: Dict | str) -> RunOutput:
# fp8 quants are awful
"ignore": ["DeepInfra"],
}
elif self.model_provider().name == ModelProviderName.openrouter:
# OpenRouter specific options. Bit of a hack but really does improve usability.
extra_body["provider"] = {
"require_parameters": True,
"ignore": ["DeepInfra"],
}

# Main completion call
response_format_options = await self.response_format_options()
@@ -235,15 +241,19 @@ def tool_call_params(self) -> dict[str, Any]:
)
output_schema["additionalProperties"] = False

function_params = {
"name": "task_response",
"parameters": output_schema,
}
# This parameter is only reliable for OpenAI
if self.model_provider().name == ModelProviderName.openai:
function_params["strict"] = True

return {
"tools": [
{
"type": "function",
"function": {
"name": "task_response",
"parameters": output_schema,
"strict": True,
},
"function": function_params,
}
],
"tool_choice": {
@@ -43,7 +43,7 @@ def config():
api_key="test_key",
base_url="https://api.test.com",
model_name="test-model",
provider_name="test-provider",
provider_name="openrouter",
default_headers={"X-Test": "test"},
)

@@ -166,7 +166,32 @@ async def test_response_format_options_json_schema(config, mock_task):
}


def test_tool_call_params(config, mock_task):
def test_tool_call_params_non_openai(config, mock_task):
adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task)

params = adapter.tool_call_params()
expected_schema = mock_task.output_schema()
expected_schema["additionalProperties"] = False

assert params == {
"tools": [
{
"type": "function",
"function": {
"name": "task_response",
"parameters": expected_schema,
},
}
],
"tool_choice": {
"type": "function",
"function": {"name": "task_response"},
},
}


def test_tool_call_params_openai(config, mock_task):
config.provider_name = "openai"
adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task)

params = adapter.tool_call_params()
