Add R1 and Llama 3.1 70B to g_eval support.
Add tests.

Note: I tested many more models; these are the only ones that work. Fireworks only returns 5 logprobs (not enough). Ollama doesn't support logprobs. Amazon could work, but that can be done later.
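
For context on why the top-logprob count matters: G-Eval-style scoring weights the candidate score tokens by the probabilities reported in the logprobs, so a provider that only returns a handful of top logprobs can miss score tokens entirely. A minimal, self-contained sketch of the idea (the `top_logprobs` values are made up for illustration; this is not the exact Kiln implementation):

```python
import math

# Hypothetical top-logprobs for the judge's score token (token -> logprob),
# e.g. from an OpenAI-style chat completion with logprobs enabled.
top_logprobs = {"5": -0.3, "4": -1.6, "3": -3.2, "1": -6.0, '"': -7.1}

# Candidate score tokens mapped to numeric scores (same idea as
# TOKEN_TO_SCORE_MAP in kiln_ai.adapters.eval.g_eval; values assumed here).
token_to_score = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0}

# Weighted average over whichever score tokens appear in the top logprobs.
weights = {
    tok: math.exp(lp) for tok, lp in top_logprobs.items() if tok in token_to_score
}
score = sum(token_to_score[tok] * w for tok, w in weights.items()) / sum(weights.values())
print(f"Weighted G-Eval-style score: {score:.2f}")  # lands in the 1.0-5.0 range
```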

Note: slightly ugly provider-specific code is leaking into the OAI-compatible adapter. Okay for now, but we should limit this.
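
One way that leakage could be contained later (a hypothetical refactor sketch, not part of this commit): keep per-provider request options in a single registry the adapter consults, instead of branching on provider names inline. The OpenRouter options below mirror the ones the adapter currently sets.

```python
from typing import Any

# Hypothetical registry of provider-specific request options; the OpenRouter
# entry mirrors the options the adapter currently builds inline.
PROVIDER_EXTRA_BODY: dict[str, dict[str, Any]] = {
    "openrouter": {
        "provider": {
            "require_parameters": True,
            "ignore": ["DeepInfra"],  # fp8 quants are awful
        }
    },
}


def build_extra_body(provider_name: str) -> dict[str, Any]:
    """Return a copy of the provider-specific options, or an empty dict."""
    return dict(PROVIDER_EXTRA_BODY.get(provider_name, {}))
```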
scosman committed Feb 24, 2025
1 parent 0af1cdf commit 7d3cccb
Showing 4 changed files with 108 additions and 14 deletions.
69 changes: 63 additions & 6 deletions libs/core/kiln_ai/adapters/eval/test_g_eval.py
@@ -4,7 +4,9 @@
import pytest
from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP, GEval, GEvalTask
from kiln_ai.adapters.eval.test_g_eval_data import serialized_run_output
from kiln_ai.adapters.ml_model_list import built_in_models
from kiln_ai.adapters.model_adapters.base_adapter import RunOutput
from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers
from kiln_ai.datamodel import (
BasePrompt,
DataSource,
@@ -130,15 +132,20 @@ def test_task_run(test_task):
return task_run


@pytest.mark.parametrize(
"config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge]
)
@pytest.mark.paid
async def test_run_g_eval(
test_task, test_eval_config, test_task_run, config_type, test_run_config
async def run_g_eval_test(
test_task,
test_eval_config,
test_task_run,
config_type,
test_run_config,
model_name: str | None = None,
provider_name: str | None = None,
):
# Create G-Eval instance
test_eval_config.config_type = config_type
if model_name is not None and provider_name is not None:
test_eval_config.model.properties["model_name"] = model_name
test_eval_config.model.properties["model_provider"] = provider_name
g_eval = GEval(test_eval_config, test_run_config)

# Run the evaluation
@@ -160,6 +167,18 @@ async def test_run_g_eval(
assert 1.0 <= overall <= 5.0


@pytest.mark.parametrize(
"config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge]
)
@pytest.mark.paid
async def test_run_g_eval(
test_task, test_eval_config, test_task_run, config_type, test_run_config
):
await run_g_eval_test(
test_task, test_eval_config, test_task_run, config_type, test_run_config
)


@pytest.mark.parametrize(
"config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge]
)
@@ -445,3 +464,41 @@ def test_g_eval_system_instruction():
g_eval_task.instruction
== "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
)


def check_supports_logprobs(model_name: str, provider_name: str):
for model in built_in_models:
if model.name != model_name:
continue
for provider in model.providers:
if provider.name != provider_name:
continue
if not provider.supports_logprobs:
pytest.skip(
f"Skipping {model.name} {provider.name} because it does not support logprobs"
)
return
raise RuntimeError(f"No model {model_name} {provider_name} found")


@pytest.mark.paid
@pytest.mark.ollama
@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
async def test_all_built_in_models_logprobs_geval(
model_name,
provider_name,
test_task,
test_eval_config,
test_task_run,
test_run_config,
):
check_supports_logprobs(model_name, provider_name)
await run_g_eval_test(
test_task,
test_eval_config,
test_task_run,
EvalConfigType.g_eval,
test_run_config,
model_name,
provider_name,
)
2 changes: 2 additions & 0 deletions libs/core/kiln_ai/adapters/ml_model_list.py
@@ -245,6 +245,7 @@ class KilnModel(BaseModel):
# No custom parser -- openrouter implemented it themselves
structured_output_mode=StructuredOutputMode.json_instructions,
reasoning_capable=True,
supports_logprobs=True,
),
KilnModelProvider(
name=ModelProviderName.fireworks_ai,
@@ -393,6 +394,7 @@ class KilnModel(BaseModel):
supports_data_gen=False,
structured_output_mode=StructuredOutputMode.function_calling,
provider_options={"model": "meta-llama/llama-3.1-70b-instruct"},
supports_logprobs=True,
),
KilnModelProvider(
name=ModelProviderName.ollama,
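
For reference, a minimal sketch (hypothetical helper, not from this commit) of how the new `supports_logprobs` flag can be consumed, e.g. to enumerate which built-in model/provider pairs are eligible for logprob-based G-Eval:

```python
from kiln_ai.adapters.ml_model_list import built_in_models


def logprob_capable_pairs() -> list[tuple[str, str]]:
    """Collect (model_name, provider_name) pairs whose provider reports logprob support."""
    pairs = []
    for model in built_in_models:
        for provider in model.providers:
            if provider.supports_logprobs:
                pairs.append((model.name, provider.name))
    return pairs


for model_name, provider_name in logprob_capable_pairs():
    print(model_name, provider_name)
```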
22 changes: 16 additions & 6 deletions libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py
@@ -9,7 +9,7 @@
)

import kiln_ai.datamodel as datamodel
from kiln_ai.adapters.ml_model_list import StructuredOutputMode
from kiln_ai.adapters.ml_model_list import ModelProviderName, StructuredOutputMode
from kiln_ai.adapters.model_adapters.base_adapter import (
COT_FINAL_ANSWER_PROMPT,
AdapterConfig,
@@ -115,6 +115,12 @@ async def _run(self, input: Dict | str) -> RunOutput:
# fp8 quants are awful
"ignore": ["DeepInfra"],
}
elif self.model_provider().name == ModelProviderName.openrouter:
# OpenRouter specific options. Bit of a hack but really does improve usability.
extra_body["provider"] = {
"require_parameters": True,
"ignore": ["DeepInfra"],
}

# Main completion call
response_format_options = await self.response_format_options()
@@ -235,15 +241,19 @@ def tool_call_params(self) -> dict[str, Any]:
)
output_schema["additionalProperties"] = False

function_params = {
"name": "task_response",
"parameters": output_schema,
}
# This parameter is only reliable for OpenAI
if self.model_provider().name == ModelProviderName.openai:
function_params["strict"] = True

return {
"tools": [
{
"type": "function",
"function": {
"name": "task_response",
"parameters": output_schema,
"strict": True,
},
"function": function_params,
}
],
"tool_choice": {
@@ -43,7 +43,7 @@ def config():
api_key="test_key",
base_url="https://api.test.com",
model_name="test-model",
provider_name="test-provider",
provider_name="openrouter",
default_headers={"X-Test": "test"},
)

@@ -166,7 +166,32 @@ async def test_response_format_options_json_schema(config, mock_task):
}


def test_tool_call_params(config, mock_task):
def test_tool_call_params_non_openai(config, mock_task):
adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task)

params = adapter.tool_call_params()
expected_schema = mock_task.output_schema()
expected_schema["additionalProperties"] = False

assert params == {
"tools": [
{
"type": "function",
"function": {
"name": "task_response",
"parameters": expected_schema,
},
}
],
"tool_choice": {
"type": "function",
"function": {"name": "task_response"},
},
}


def test_tool_call_params_openai(config, mock_task):
config.provider_name = "openai"
adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task)

params = adapter.tool_call_params()
