# Source code for oumi.core.configs.synthesis_config
# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from oumi.core.configs.base_config import BaseConfig
from oumi.core.configs.environment_config import EnvironmentConfig
from oumi.core.configs.inference_config import InferenceConfig
from oumi.core.configs.params.environment_params import EnvironmentParams
from oumi.core.configs.params.synthesis_params import (
GeneralSynthesisParams,
MultiTurnAttribute,
)
from oumi.core.configs.params.tool_params import ToolParams
from oumi.exceptions import OumiConfigError
class SynthesisStrategy(str, Enum):
    """Enumerates the strategies the synthesis pipeline can be configured with.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    GENERAL = "general"
    """Task-agnostic strategy that is applicable to any synthesis task."""
[docs]
@dataclass
class SynthesisConfig(BaseConfig):
    """Top-level settings for running the synthesis pipeline.

    All validation runs in ``__post_init__``: the strategy and path fields are
    checked, the environment configuration is resolved (either taken inline or
    loaded from ``environment_config_path``), and every environment/tool
    referenced by a multiturn attribute is checked against that configuration.
    """

    output_path: str | None = None
    """Destination file for the generated data.

    When set it must be non-empty and end with ``.jsonl``. When left unset, the
    data is returned as a list of dictionaries instead.
    """

    strategy: SynthesisStrategy = SynthesisStrategy.GENERAL
    """Which synthesis strategy to run."""

    strategy_params: GeneralSynthesisParams = field(
        default_factory=GeneralSynthesisParams
    )
    """Parameters for the selected synthesis strategy."""

    environment_config: EnvironmentConfig | None = None
    """Environment configuration that multiturn attributes resolve against.

    Mutually exclusive with ``environment_config_path``. When only the path is
    given, this field is populated from it during ``__post_init__``.
    """

    environment_config_path: str | None = None
    """Optional location of an ``EnvironmentConfig`` YAML file."""

    inference_config: InferenceConfig = field(default_factory=InferenceConfig)
    """Inference settings; its ``input_path``/``output_path`` must stay unset."""

    num_samples: int = 1
    """How many synthetic samples to generate."""

    def __post_init__(self):
        """Validate the fields and resolve the environment configuration.

        Raises:
            OumiConfigError: If the strategy is unsupported, if the inference
                config carries an input or output path, if ``output_path`` is
                empty or lacks a ``.jsonl`` suffix, or if environment
                resolution / tooling validation fails.
        """
        if self.strategy != SynthesisStrategy.GENERAL:
            raise OumiConfigError(f"Unsupported synthesis strategy: {self.strategy}")

        # The inference config's own I/O paths are rejected outright.
        inference = self.inference_config
        if inference.input_path is not None:
            raise OumiConfigError(
                "Input path is not supported for general synthesis strategy."
            )
        if inference.output_path is not None:
            raise OumiConfigError(
                "Output path is not supported for general synthesis strategy."
            )

        destination = self.output_path
        if destination is not None:
            if destination == "":
                raise OumiConfigError("Output path cannot be empty.")
            if not destination.endswith(".jsonl"):
                raise OumiConfigError("Output path must end with .jsonl.")

        # NOTE(review): environment_config_path is left set after the config is
        # loaded from it, so running __post_init__ again with this instance's
        # field values (e.g. via dataclasses.replace) would trip the
        # "cannot both be set" check -- confirm this is intended.
        self.environment_config = self._resolve_environment_config()
        self._validate_available_tooling()

    def _resolve_environment_config(self) -> EnvironmentConfig | None:
        """Pick the environment configuration from the inline value or the path.

        Returns:
            The inline ``environment_config`` when one was provided, otherwise
            the config loaded from ``environment_config_path``, otherwise
            ``None`` when neither field is set.

        Raises:
            OumiConfigError: If both fields are set, or if the path is an empty
                string or does not exist.
        """
        inline_config = self.environment_config
        yaml_location = self.environment_config_path

        if inline_config is not None and yaml_location is not None:
            raise OumiConfigError(
                "SynthesisConfig.environment_config and "
                "SynthesisConfig.environment_config_path cannot both be set."
            )
        if inline_config is not None:
            return inline_config
        if yaml_location is None:
            return None

        if yaml_location == "":
            raise OumiConfigError(
                "SynthesisConfig.environment_config_path cannot be empty."
            )
        yaml_path = Path(yaml_location)
        if not yaml_path.exists():
            raise OumiConfigError(
                f"Environment config path does not exist: "
                f"{yaml_location}"
            )
        return EnvironmentConfig.from_yaml(yaml_path)

    def resolve_multiturn_environments(
        self, multiturn_attribute: MultiTurnAttribute
    ) -> list[EnvironmentParams]:
        """Return the environments a multiturn attribute is allowed to use.

        Args:
            multiturn_attribute: The attribute whose ``available_environments``
                selection should be resolved.

        Returns:
            An empty list when no environment config is present; a copy of all
            configured environments when the attribute selects none; otherwise
            the selected environments in the order the attribute lists them.

        Raises:
            OumiConfigError: If the attribute names an environment id for which
                the environment config returns ``None``.
        """
        catalog = self.environment_config
        if catalog is None:
            return []

        requested_ids = multiturn_attribute.available_environments
        if not requested_ids:
            # No explicit selection: every configured environment is available.
            return list(catalog.environments)

        matched: list[EnvironmentParams] = []
        for requested_id in requested_ids:
            match = catalog.get_environment(requested_id)
            if match is None:
                known_ids = sorted(env.id for env in catalog.environments)
                raise OumiConfigError(
                    f"MultiTurnAttribute '{multiturn_attribute.id}' references unknown "
                    f"environment '{requested_id}'. Defined environment ids: "
                    f"{known_ids}"
                )
            matched.append(match)
        return matched

    def _validate_available_tooling(self) -> None:
        """Check multiturn environment/tool selections against the environment config.

        Does nothing when there are no multiturn attributes, or when none of
        them reference any tool or environment.

        Raises:
            OumiConfigError: If references exist but no environment config is
                available, if an environment id cannot be resolved, or if a
                referenced tool is not among the tools resolved for the
                attribute's environments.
        """
        multiturn_attrs = self.strategy_params.multiturn_attributes
        if not multiturn_attrs:
            return

        referenced_tools = []
        referenced_environments = []
        for attr in multiturn_attrs:
            referenced_tools.extend(attr.available_tools)
            referenced_environments.extend(attr.available_environments)
        if not referenced_tools and not referenced_environments:
            return

        catalog = self.environment_config
        if catalog is None:
            raise OumiConfigError(
                "Environment or tool references require "
                "SynthesisConfig.environment_config, or "
                "SynthesisConfig.environment_config_path."
            )

        for attr in multiturn_attrs:
            # Tools are validated only against the environments this attribute
            # resolves to, not against the whole configuration.
            environment_ids = {
                env.id for env in self.resolve_multiturn_environments(attr)
            }
            resolved_tools = catalog.resolve_tools(
                environment_ids=list(environment_ids)
            )
            tool_ids = {tool.id for tool in resolved_tools}
            for tool_id in attr.available_tools:
                if tool_id in tool_ids:
                    continue
                raise OumiConfigError(
                    f"MultiTurnAttribute '{attr.id}' references unknown "
                    f"tool '{tool_id}' for environments "
                    f"{sorted(environment_ids)}. Defined tool ids: "
                    f"{sorted(tool_ids)}"
                )