Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 30 additions & 21 deletions packages/sdk/server-ai/src/ldai/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,38 @@ def judge_config(
key, context, default_value.to_dict(), variables
)

# Extract evaluation_metric_keys from the variation
variation = self._client.variation(key, context, default_value.to_dict())
Comment thread
jsonbailey marked this conversation as resolved.
Outdated
evaluation_metric_keys = variation.get('evaluationMetricKeys', default_value.evaluation_metric_keys or [])

def _extract_evaluation_metric_key(
    variation: Dict[str, Any], default_value: "AIJudgeConfigDefault"
) -> Optional[str]:
    """
    Extract the evaluation metric key with backward compatibility.

    Priority order:
      1. ``evaluationMetricKey`` from the evaluated variation
      2. ``evaluation_metric_key`` from the default config
      3. First entry of ``evaluationMetricKeys`` in the variation (legacy plural field)
      4. First entry of ``evaluation_metric_keys`` in the default (legacy plural field)

    :param variation: Raw variation dictionary returned by flag evaluation.
    :param default_value: Default judge configuration supplied by the caller.
    :return: The resolved metric key, or None when no source provides one.
    """
    # New singular field takes precedence when present and non-empty.
    if evaluation_metric_key := variation.get('evaluationMetricKey'):
        return evaluation_metric_key

    if default_value.evaluation_metric_key:
        return default_value.evaluation_metric_key

    # Legacy plural field: fall back to the first key of a non-empty list.
    variation_keys = variation.get('evaluationMetricKeys')
    if isinstance(variation_keys, list) and variation_keys:
        return variation_keys[0]

    if default_value.evaluation_metric_keys:
        return default_value.evaluation_metric_keys[0]

    return None

evaluation_metric_key = _extract_evaluation_metric_key(variation, default_value)

config = AIJudgeConfig(
key=key,
enabled=bool(enabled),
evaluation_metric_keys=evaluation_metric_keys,
evaluation_metric_key=evaluation_metric_key,
model=model,
messages=messages,
provider=provider,
Expand Down Expand Up @@ -142,7 +166,7 @@ async def create_judge(
enabled=True,
model=ModelConfig("gpt-4"),
provider=ProviderConfig("openai"),
evaluation_metric_keys=['$ld:ai:judge:relevance'],
evaluation_metric_key='$ld:ai:judge:relevance',
messages=[LDMessage(role='system', content='You are a relevance judge.')]
),
variables={'metric': "relevance"}
Expand All @@ -158,33 +182,27 @@ async def create_judge(
self._client.track('$ld:ai:judge:function:createJudge', context, key, 1)

try:
# Warn if reserved variables are provided
if variables:
if 'message_history' in variables:
# Note: Python doesn't have a logger on the client, but we could add one
pass # Would log warning if logger available
pass
if 'response_to_evaluate' in variables:
pass # Would log warning if logger available
pass

# Overwrite reserved variables to ensure they remain as placeholders for judge evaluation
extended_variables = dict(variables) if variables else {}
extended_variables['message_history'] = '{{message_history}}'
extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'

judge_config = self.judge_config(key, context, default_value, extended_variables)

if not judge_config.enabled or not judge_config.tracker:
# Would log info if logger available
return None

# Create AI provider for the judge
provider = await AIProviderFactory.create(judge_config, default_ai_provider)
if not provider:
return None

return Judge(judge_config, judge_config.tracker, provider)
except Exception as error:
# Would log error if logger available
return None

async def _initialize_judges(
Expand Down Expand Up @@ -277,7 +295,6 @@ async def create_chat(
config = self.completion_config(key, context, default_value, variables)

if not config.enabled or not config.tracker:
# Would log info if logger available
return None

provider = await AIProviderFactory.create(config, default_ai_provider)
Expand Down Expand Up @@ -331,7 +348,6 @@ def agent_config(
:param variables: Variables for interpolation.
:return: Configured AIAgentConfig instance.
"""
# Track single agent usage
self._client.track(
"$ld:ai:agent:function:single",
context,
Expand Down Expand Up @@ -397,7 +413,6 @@ def agent_configs(
:param context: The context to evaluate the agent configurations in.
:return: Dictionary mapping agent keys to their AIAgentConfig configurations.
"""
# Track multiple agents usage
agent_count = len(agent_configs)
self._client.track(
"$ld:ai:agent:function:multiple",
Expand Down Expand Up @@ -461,7 +476,6 @@ def __evaluate(
all_variables.update(variables)
all_variables['ldctx'] = context.to_dict()

# Extract messages
messages = None
if 'messages' in variation and isinstance(variation['messages'], list) and all(
isinstance(entry, dict) for entry in variation['messages']
Expand All @@ -476,18 +490,15 @@ def __evaluate(
for entry in variation['messages']
]

# Extract instructions
instructions = None
if 'instructions' in variation and isinstance(variation['instructions'], str):
instructions = self.__interpolate_template(variation['instructions'], all_variables)

# Extract provider config
provider_config = None
if 'provider' in variation and isinstance(variation['provider'], dict):
provider = variation['provider']
provider_config = ProviderConfig(provider.get('name', ''))

# Extract model config
model = None
if 'model' in variation and isinstance(variation['model'], dict):
parameters = variation['model'].get('parameters', None)
Expand All @@ -498,7 +509,6 @@ def __evaluate(
custom=custom
)

# Create tracker
tracker = LDAIConfigTracker(
self._client,
variation.get('_ldMeta', {}).get('variationKey', ''),
Expand All @@ -511,7 +521,6 @@ def __evaluate(

enabled = variation.get('_ldMeta', {}).get('enabled', False)

# Extract judge configuration
judge_configuration = None
if 'judgeConfiguration' in variation and isinstance(variation['judgeConfiguration'], dict):
judge_config = variation['judgeConfiguration']
Expand Down
61 changes: 31 additions & 30 deletions packages/sdk/server-ai/src/ldai/judge/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
from ldai.models import AIJudgeConfig, LDMessage
from ldai.providers.ai_provider import AIProvider
from ldai.providers.types import (ChatResponse, EvalScore, JudgeResponse,
StructuredResponse)
from ldai.providers.types import ChatResponse, EvalScore, JudgeResponse
from ldai.tracker import LDAIConfigTracker


Expand Down Expand Up @@ -38,9 +37,7 @@ def __init__(
self._ai_config = ai_config
self._ai_config_tracker = ai_config_tracker
self._ai_provider = ai_provider
self._evaluation_response_structure = EvaluationSchemaBuilder.build(
ai_config.evaluation_metric_keys
)
self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key)

async def evaluate(
self,
Expand All @@ -57,9 +54,9 @@ async def evaluate(
:return: Evaluation results or None if not sampled
"""
try:
if not self._ai_config.evaluation_metric_keys or len(self._ai_config.evaluation_metric_keys) == 0:
if not self._ai_config.evaluation_metric_key:
log.warn(
'Judge configuration is missing required evaluationMetricKeys'
'Judge configuration is missing required evaluationMetricKey'
)
return None

Expand All @@ -72,8 +69,8 @@ async def evaluate(
return None

messages = self._construct_evaluation_messages(input_text, output_text)
assert self._evaluation_response_structure is not None

# Track metrics of the structured model invocation
response = await self._ai_config_tracker.track_metrics_of(
lambda: self._ai_provider.invoke_structured_model(messages, self._evaluation_response_structure),
lambda result: result.metrics,
Expand All @@ -83,8 +80,8 @@ async def evaluate(

evals = self._parse_evaluation_response(response.data)

if len(evals) != len(self._ai_config.evaluation_metric_keys):
log.warn('Judge evaluation did not return all evaluations')
if self._ai_config.evaluation_metric_key not in evals:
log.warn('Judge evaluation did not return the expected evaluation')
success = False

return JudgeResponse(
Expand Down Expand Up @@ -191,30 +188,34 @@ def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScor

evaluations = data['evaluations']

for metric_key in self._ai_config.evaluation_metric_keys:
evaluation = evaluations.get(metric_key)
metric_key = self._ai_config.evaluation_metric_key
if not metric_key:
log.warn('Evaluation metric key is missing')
return results

if not evaluation or not isinstance(evaluation, dict):
log.warn(f'Missing evaluation for metric key: {metric_key}')
continue
evaluation = evaluations.get(metric_key)

score = evaluation.get('score')
reasoning = evaluation.get('reasoning')
if not evaluation or not isinstance(evaluation, dict):
log.warn(f'Missing evaluation for metric key: {metric_key}')
return results

if not isinstance(score, (int, float)) or score < 0 or score > 1:
log.warn(
f'Invalid score evaluated for {metric_key}: {score}. '
'Score must be a number between 0 and 1 inclusive'
)
continue
score = evaluation.get('score')
reasoning = evaluation.get('reasoning')

if not isinstance(reasoning, str):
log.warn(
f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
'Reasoning must be a string'
)
continue
if not isinstance(score, (int, float)) or score < 0 or score > 1:
log.warn(
f'Invalid score evaluated for {metric_key}: {score}. '
'Score must be a number between 0 and 1 inclusive'
)
return results

if not isinstance(reasoning, str):
log.warn(
f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
'Reasoning must be a string'
)
return results

results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)

return results
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Internal class for building dynamic evaluation response schemas."""

from typing import Any, Dict
from typing import Any, Dict, Optional


class EvaluationSchemaBuilder:
Expand All @@ -10,26 +10,29 @@ class EvaluationSchemaBuilder:
"""

@staticmethod
def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]:
"""
Build an evaluation response schema from evaluation metric keys.
Build an evaluation response schema from evaluation metric key.

:param evaluation_metric_keys: List of evaluation metric keys
:return: Schema dictionary for structured output
:param evaluation_metric_key: Evaluation metric key, or None if not available
:return: Schema dictionary for structured output, or None if evaluation_metric_key is None
"""
if not evaluation_metric_key:
return None

return {
'title': 'EvaluationResponse',
'description': f"Response containing evaluation results for {', '.join(evaluation_metric_keys)} metrics",
'description': f"Response containing evaluation results for {evaluation_metric_key} metric",
'type': 'object',
'properties': {
'evaluations': {
'type': 'object',
'description': (
f"Object containing evaluation results for "
f"{', '.join(evaluation_metric_keys)} metrics"
f"{evaluation_metric_key} metric"
),
'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_keys),
'required': evaluation_metric_keys,
'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_key),
'required': [evaluation_metric_key],
'additionalProperties': False,
},
},
Expand All @@ -38,17 +41,16 @@ def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
}

@staticmethod
def _build_key_properties(evaluation_metric_key: str) -> Dict[str, Any]:
    """
    Build the JSON-schema properties for a single evaluation metric key.

    :param evaluation_metric_key: Evaluation metric key.
    :return: Dictionary mapping the key to its per-key schema.
    """
    # Single-key replacement for the legacy multi-key loop; the schema for
    # the key itself is delegated to _build_key_schema.
    return {
        evaluation_metric_key: EvaluationSchemaBuilder._build_key_schema(evaluation_metric_key)
    }

@staticmethod
def _build_key_schema(key: str) -> Dict[str, Any]:
Expand Down
9 changes: 6 additions & 3 deletions packages/sdk/server-ai/src/ldai/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,16 +285,17 @@ class AIJudgeConfigDefault(AIConfigDefault):
Default Judge-specific AI Config with required evaluation metric key.
"""
messages: Optional[List[LDMessage]] = None
# Deprecated: evaluation_metric_key is used instead
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are at a sub-1.0 release, as long as we can guarantee the API always returns the new single key, we should be able to just drop this and make it a breaking change. The only thing that really makes this breaking is that people will need to update their defaults if they defined them. If you want to drop it now, update the PR title to "feat!: ".

I won't block if you want to leave this in for a little while but it likely isn't necessary. The real question is how long do we want to continue sending the old values in the API as that is what will break older SDKs.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now we want to make sure this is non-breaking, but soon we're going to remove "legacy" support. For keeping this change as minimal and safe as possible I'd err on the side of caution and keep it in for the time being.

evaluation_metric_keys: Optional[List[str]] = None
evaluation_metric_key: Optional[str] = None

def to_dict(self) -> dict:
    """
    Render the given judge config default as a dictionary object.

    Emits the new singular ``evaluationMetricKey`` field; ``messages`` is
    serialized via each message's ``to_dict`` or set to None when absent.
    """
    result = self._base_to_dict()
    result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None
    # Always include the singular key (may be None) so consumers see the new field.
    result['evaluationMetricKey'] = self.evaluation_metric_key
    return result


Expand All @@ -303,16 +304,18 @@ class AIJudgeConfig(AIConfig):
"""
Judge-specific AI Config with required evaluation metric key.
"""
# Deprecated: evaluation_metric_key is used instead
evaluation_metric_keys: List[str] = field(default_factory=list)
messages: Optional[List[LDMessage]] = None
evaluation_metric_key: Optional[str] = None

def to_dict(self) -> dict:
    """
    Render the given judge config as a dictionary object.

    Includes both the deprecated plural ``evaluationMetricKeys`` field (kept
    for backward compatibility with older consumers) and the new singular
    ``evaluationMetricKey`` field.
    """
    result = self._base_to_dict()
    result['evaluationMetricKeys'] = self.evaluation_metric_keys
    result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None
    result['evaluationMetricKey'] = self.evaluation_metric_key
    return result


Expand Down
Loading