import json
from itertools import product
from typing import Optional
import numpy as np
import yaml
from beartype import beartype
from pydantic import BaseModel, Field, validator
from sequifier.config.train_config import (
DotDict,
ModelSpecModel,
TrainingSpecModel,
TrainModel,
)
from sequifier.helpers import normalize_path
@beartype
def load_hyperparameter_search_config(
    config_path: str, on_unprocessed: bool
) -> "HyperparameterSearch":
    """Load a hyperparameter search configuration from a YAML file.

    This function reads a YAML configuration file, processes it to include
    data-driven configurations if needed, and returns a HyperparameterSearch
    object.

    Args:
        config_path: The path to the hyperparameter search configuration file.
        on_unprocessed: A boolean flag indicating whether the configuration is
            for unprocessed data. If False, it will load and integrate
            data-driven configurations.

    Returns:
        An instance of the HyperparameterSearch class, populated with the
        configuration from the file.
    """
    with open(config_path, "r") as f:
        config_values = yaml.safe_load(f)

    if not on_unprocessed:
        ddconfig_path = config_values.get("ddconfig_path")

        with open(
            normalize_path(ddconfig_path, config_values["project_path"]), "r"
        ) as f:
            dd_config = json.loads(f.read())

        # column_types is a *list* of candidate dicts; default to the single
        # data-driven candidate when the user did not specify any.
        config_values["column_types"] = config_values.get(
            "column_types", [dd_config["column_types"]]
        )

        # .get() so a YAML file that omits the key behaves like one that
        # sets it to null (direct indexing would raise KeyError).
        if config_values.get("selected_columns") is None:
            # Derive one selected-columns candidate per column_types candidate
            # so the two lists stay paired, as the HyperparameterSearch
            # validator requires. (column_types is a list of dicts, so we must
            # take .keys() of each element, not of the list itself.)
            config_values["selected_columns"] = [
                list(column_types.keys())
                for column_types in config_values["column_types"]
            ]

        # Per selected-columns candidate: integer-typed columns are treated as
        # categorical, float-typed columns as real-valued.
        config_values["categorical_columns"] = [
            [
                col
                for col, type_ in dd_config["column_types"].items()
                if type_ == "Int64" and col in selected_columns
            ]
            for selected_columns in config_values["selected_columns"]
        ]
        config_values["real_columns"] = [
            [
                col
                for col, type_ in dd_config["column_types"].items()
                if type_ == "Float64" and col in selected_columns
            ]
            for selected_columns in config_values["selected_columns"]
        ]

        config_values["n_classes"] = config_values.get(
            "n_classes", dd_config["n_classes"]
        )
        config_values["training_data_path"] = normalize_path(
            config_values.get("training_data_path", dd_config["split_paths"][0]),
            config_values["project_path"],
        )
        # Fall back to the second split for validation, or the only split if
        # just one exists.
        config_values["validation_data_path"] = normalize_path(
            config_values.get(
                "validation_data_path",
                dd_config["split_paths"][min(1, len(dd_config["split_paths"]) - 1)],
            ),
            config_values["project_path"],
        )
        config_values["id_maps"] = dd_config["id_maps"]

    return HyperparameterSearch(**config_values)
class TrainingSpecHyperparameterSampling(BaseModel):
    """Pydantic model for training specification hyperparameter sampling.

    Attributes:
        device: The device to train on (e.g., 'cuda', 'cpu').
        epochs: A list of possible numbers of epochs to train for.
        log_interval: The interval in batches for logging.
        class_share_log_columns: Columns for which to log class share.
        early_stopping_epochs: Number of epochs for early stopping.
        iter_save: Interval in epochs for saving model checkpoints.
        batch_size: A list of possible batch sizes.
        lr: A list of possible learning rates.
        criterion: A dictionary mapping target columns to loss functions.
        class_weights: Optional dictionary mapping columns to class weights.
        accumulation_steps: A list of possible gradient accumulation steps.
        dropout: A list of possible dropout rates.
        loss_weights: Optional dictionary mapping columns to loss weights.
        optimizer: A list of possible optimizer configurations.
        scheduler: A list of possible scheduler configurations.
        continue_training: Flag to continue training from a checkpoint.
    """

    device: str
    epochs: list[int]
    log_interval: int = 10
    class_share_log_columns: list[str] = Field(default_factory=list)
    early_stopping_epochs: Optional[int] = None
    iter_save: int
    batch_size: list[int]
    lr: list[float]
    criterion: dict[str, str]
    class_weights: Optional[dict[str, list[float]]] = None
    accumulation_steps: list[int]
    # default_factory avoids a class-level mutable default list.
    dropout: list[float] = Field(default_factory=lambda: [0.0])
    loss_weights: Optional[dict[str, float]] = None
    optimizer: list[DotDict] = Field(
        default_factory=lambda: [DotDict({"name": "Adam"})]
    )
    scheduler: list[DotDict] = Field(
        default_factory=lambda: [
            DotDict({"name": "StepLR", "step_size": 1, "gamma": 0.99})
        ]
    )
    continue_training: bool = True

    def __init__(self, **kwargs):
        """Initialize the TrainingSpecHyperparameterSampling instance.

        This method initializes the Pydantic BaseModel and then processes the
        optimizer and scheduler configurations from the provided keyword
        arguments, converting them into DotDict objects.

        Args:
            **kwargs: Keyword arguments that correspond to the attributes of
                this class. The 'optimizer' and 'scheduler' arguments, if
                provided, are expected to be lists of dictionaries.
        """
        super().__init__(
            **{k: v for k, v in kwargs.items() if k not in ["optimizer", "scheduler"]}
        )
        # Only override the field defaults when the caller actually supplied
        # these keys; indexing kwargs unconditionally would raise KeyError for
        # configs that rely on the defaults declared above.
        if "optimizer" in kwargs:
            self.optimizer = [
                DotDict(optimizer_config) for optimizer_config in kwargs["optimizer"]
            ]
        if "scheduler" in kwargs:
            self.scheduler = [
                DotDict(scheduler_config) for scheduler_config in kwargs["scheduler"]
            ]

    @validator("scheduler")
    def validate_scheduler_pairing(cls, v, values):
        """Ensure lr, epochs, and scheduler candidate lists are index-paired."""
        # values.get(): a field that already failed validation is absent from
        # `values`; direct indexing would raise KeyError and mask that error.
        lr = values.get("lr")
        epochs = values.get("epochs")
        if lr is not None:
            assert (
                len(lr) == len(v)
            ), "lr and scheduler must have the same number of candidate values, that are paired"
        if epochs is not None:
            assert (
                len(epochs) == len(v)
            ), "epochs and scheduler must have the same number of candidate values, that are paired"
        return v

    def random_sample(self):
        """Randomly sample a set of training hyperparameters.

        This method selects a random combination of hyperparameters from the
        defined lists of possibilities. It ensures that learning rates,
        epochs, and schedulers are paired correctly (they share one index).

        Returns:
            A TrainingSpecModel instance populated with a randomly sampled set
            of hyperparameters.
        """
        # lr, epochs, and scheduler candidates are paired by index.
        lr_and_scheduler_index = np.random.randint(len(self.lr))
        optimizer_index = np.random.randint(len(self.optimizer))
        batch_size = np.random.choice(self.batch_size)
        dropout = np.random.choice(self.dropout)
        accumulation_steps = np.random.choice(self.accumulation_steps)
        optimizer = self.optimizer[optimizer_index]
        lr = self.lr[lr_and_scheduler_index]
        print(f"{lr = } - {batch_size = } - {dropout = } - {optimizer = }")
        return TrainingSpecModel(
            device=self.device,
            epochs=self.epochs[lr_and_scheduler_index],
            log_interval=self.log_interval,
            class_share_log_columns=self.class_share_log_columns,
            early_stopping_epochs=self.early_stopping_epochs,
            iter_save=self.iter_save,
            batch_size=batch_size,
            lr=lr,
            criterion=self.criterion,
            class_weights=self.class_weights,
            accumulation_steps=accumulation_steps,
            dropout=dropout,
            loss_weights=self.loss_weights,
            optimizer=optimizer,
            scheduler=self.scheduler[lr_and_scheduler_index],
        )

    def grid_sample(self, i):
        """Select a set of training hyperparameters based on a grid search index.

        This method generates a grid of all possible hyperparameter
        combinations and selects the combination at the given index.

        Args:
            i: The index of the hyperparameter combination to select from the
                grid. Must be in ``range(self.n_combinations())``.

        Returns:
            A TrainingSpecModel instance populated with the selected set of
            hyperparameters.
        """
        # The grid iterates over an *index* into lr, because lr/epochs/
        # scheduler candidates are paired and must be picked together.
        hyperparameter_combinations = list(
            product(
                np.arange(len(self.lr)),
                self.batch_size,
                self.dropout,
                self.optimizer,
                self.accumulation_steps,
            )
        )
        lr_and_scheduler_index, batch_size, dropout, optimizer, accumulation_steps = (
            hyperparameter_combinations[i]
        )
        lr = self.lr[lr_and_scheduler_index]
        print(f"{lr = } - {batch_size = } - {dropout = } - {optimizer = }")
        return TrainingSpecModel(
            device=self.device,
            epochs=self.epochs[lr_and_scheduler_index],
            log_interval=self.log_interval,
            class_share_log_columns=self.class_share_log_columns,
            early_stopping_epochs=self.early_stopping_epochs,
            iter_save=self.iter_save,
            batch_size=batch_size,
            lr=lr,
            criterion=self.criterion,
            class_weights=self.class_weights,
            accumulation_steps=accumulation_steps,
            dropout=dropout,
            loss_weights=self.loss_weights,
            optimizer=optimizer,
            scheduler=self.scheduler[lr_and_scheduler_index],
        )

    def n_combinations(self):
        """Calculate the total number of hyperparameter combinations.

        This method computes the total number of unique hyperparameter sets
        that can be generated by the grid search.

        Returns:
            The total number of possible hyperparameter combinations.
        """
        return (
            len(self.lr)
            * len(self.batch_size)
            * len(self.dropout)
            * len(self.optimizer)
            * len(self.accumulation_steps)
        )
class ModelSpecHyperparameterSampling(BaseModel):
    """Pydantic model for model specification hyperparameter sampling.

    Attributes:
        d_model: A list of possible numbers of expected features in the input.
        d_model_by_column: A list of possible embedding dimensions for each input column.
        nhead: A list of possible numbers of heads in the multi-head attention models.
        d_hid: A list of possible dimensions of the feedforward network model.
        nlayers: A list of possible numbers of layers in the transformer model.
    """

    d_model: list[int]
    d_model_by_column: Optional[list[dict[str, int]]]
    nhead: list[int]
    d_hid: list[int]
    nlayers: list[int]

    @validator("nhead")
    def validate_nhead_pairing(cls, v, values):
        """Ensure d_model, d_model_by_column, and nhead lists are index-paired."""
        # values.get(): a field that already failed validation is absent from
        # `values`; direct indexing would raise KeyError and mask that error.
        d_model = values.get("d_model")
        d_model_by_column = values.get("d_model_by_column")
        if d_model is not None and d_model_by_column is not None:
            assert (
                len(d_model) == len(d_model_by_column)
            ), "d_model and d_model_by_column must have the same number of candidate values, that are paired"
        if d_model is not None:
            assert (
                len(d_model) == len(v)
            ), "d_model and nhead must have the same number of candidate values, that are paired"
        return v

    def random_sample(self):
        """Randomly sample a set of model hyperparameters.

        This method selects a random combination of model hyperparameters from
        the defined lists of possibilities. It ensures that d_model,
        d_model_by_column, and nhead are paired correctly (they share one
        index).

        Returns:
            A ModelSpecModel instance populated with a randomly sampled set of
            hyperparameters.
        """
        # d_model, d_model_by_column, and nhead candidates are paired by index.
        d_model_index = np.random.randint(len(self.d_model))
        d_model = self.d_model[d_model_index]
        d_model_by_column = (
            None
            if self.d_model_by_column is None
            else self.d_model_by_column[d_model_index]
        )
        d_hid = np.random.choice(self.d_hid)
        nlayers = np.random.choice(self.nlayers)
        print(f"{d_model = } - {d_hid = } - {nlayers = }")
        return ModelSpecModel(
            d_model=d_model,
            d_model_by_column=d_model_by_column,
            nhead=self.nhead[d_model_index],
            d_hid=d_hid,
            nlayers=nlayers,
        )

    def grid_sample(self, i):
        """Select a set of model hyperparameters based on a grid search index.

        This method generates a grid of all possible model hyperparameter
        combinations and selects the combination at the given index.

        Args:
            i: The index of the hyperparameter combination to select from the
                grid. Must be in ``range(self.n_combinations())``.

        Returns:
            A ModelSpecModel instance populated with the selected set of
            hyperparameters.
        """
        # The grid iterates over an *index* into d_model, because d_model,
        # d_model_by_column, and nhead candidates must be picked together.
        hyperparameter_combinations = list(
            product(np.arange(len(self.d_model)), self.d_hid, self.nlayers)
        )
        d_model_index, d_hid, nlayers = hyperparameter_combinations[i]
        d_model = self.d_model[d_model_index]
        print(f"{d_model = } - {d_hid = } - {nlayers = }")
        d_model_by_column = (
            None
            if self.d_model_by_column is None
            else self.d_model_by_column[d_model_index]
        )
        return ModelSpecModel(
            d_model=d_model,
            d_model_by_column=d_model_by_column,
            nhead=self.nhead[d_model_index],
            d_hid=d_hid,
            nlayers=nlayers,
        )

    def n_combinations(self):
        """Calculate the total number of model hyperparameter combinations.

        This method computes the total number of unique model hyperparameter
        sets that can be generated by the grid search.

        Returns:
            The total number of possible model hyperparameter combinations.
        """
        return len(self.d_model) * len(self.d_hid) * len(self.nlayers)
class HyperparameterSearch(BaseModel):
    """Pydantic model for hyperparameter search configuration.

    Attributes:
        project_path: The path to the sequifier project directory.
        ddconfig_path: The path to the data-driven configuration file.
        hp_search_name: The name for the hyperparameter search.
        search_strategy: The search strategy, either "sample" or "grid".
        n_samples: The number of samples to draw for the search.
        model_config_write_path: The path to write the model configurations to.
        training_data_path: The path to the training data.
        validation_data_path: The path to the validation data.
        read_format: The file format of the input data.
        selected_columns: A list of lists of columns to be used for training.
        column_types: A list of dictionaries mapping columns to their types.
        categorical_columns: A list of lists of categorical columns.
        real_columns: A list of lists of real-valued columns.
        target_columns: The list of target columns for model training.
        target_column_types: A dictionary mapping target columns to their types.
        id_maps: A dictionary mapping categorical values to their indexed representation.
        seq_length: A list of possible sequence lengths.
        n_classes: The number of classes for each categorical column.
        inference_batch_size: The batch size for inference.
        export_onnx: If True, exports the model in ONNX format.
        export_pt: If True, exports the model using torch.save.
        export_with_dropout: If True, exports the model with dropout enabled.
        model_hyperparameter_sampling: The sampling configuration for model hyperparameters.
        training_hyperparameter_sampling: The sampling configuration for training hyperparameters.
    """

    project_path: str
    ddconfig_path: str
    hp_search_name: str
    search_strategy: str = "sample"  # "sample" or "grid"
    n_samples: Optional[int]
    model_config_write_path: str
    training_data_path: str
    validation_data_path: str
    read_format: str = "parquet"
    selected_columns: list[list[str]]
    column_types: list[dict[str, str]]
    categorical_columns: list[list[str]]
    real_columns: list[list[str]]
    target_columns: list[str]
    target_column_types: dict[str, str]
    id_maps: dict[str, dict[str | int, int]]
    seq_length: list[int]
    n_classes: dict[str, int]
    inference_batch_size: int
    export_onnx: bool = True
    export_pt: bool = False
    export_with_dropout: bool = False
    model_hyperparameter_sampling: ModelSpecHyperparameterSampling
    training_hyperparameter_sampling: TrainingSpecHyperparameterSampling

    @validator("column_types")
    def validate_column_types_pairing(cls, v, values):
        """Ensure selected_columns and column_types candidates are index-paired."""
        # values.get(): a field that already failed validation is absent from
        # `values`; direct indexing would raise KeyError and mask that error.
        selected_columns = values.get("selected_columns")
        if v is not None and selected_columns is not None:
            assert (
                len(selected_columns) == len(v)
            ), "selected_columns and column_types must have the same number of candidate values, that are paired"
        return v

    def random_sample(self, i):
        """Randomly sample a full training configuration.

        This method generates a complete training configuration by randomly
        sampling model and training hyperparameters, as well as selecting a
        column set and sequence length.

        Args:
            i: The index of the sample, used to create a unique model name.

        Returns:
            A TrainModel instance populated with a randomly sampled
            configuration.
        """
        model_spec = self.model_hyperparameter_sampling.random_sample()
        training_spec = self.training_hyperparameter_sampling.random_sample()
        # selected_columns, column_types, categorical_columns, and
        # real_columns candidates are paired by index.
        selected_columns_index = np.random.randint(len(self.selected_columns))
        seq_length = np.random.choice(self.seq_length)
        print(f"{selected_columns_index = } - {seq_length = }")
        return TrainModel(
            project_path=self.project_path,
            ddconfig_path=self.ddconfig_path,
            model_name=self.hp_search_name + f"-run-{i}",
            training_data_path=self.training_data_path,
            validation_data_path=self.validation_data_path,
            read_format=self.read_format,
            selected_columns=self.selected_columns[selected_columns_index],
            column_types=self.column_types[selected_columns_index],
            categorical_columns=self.categorical_columns[selected_columns_index],
            real_columns=self.real_columns[selected_columns_index],
            target_columns=self.target_columns,
            target_column_types=self.target_column_types,
            id_maps=self.id_maps,
            seq_length=seq_length,
            n_classes=self.n_classes,
            inference_batch_size=self.inference_batch_size,
            seed=101,  # fixed seed so runs differ only in hyperparameters
            export_onnx=self.export_onnx,
            export_pt=self.export_pt,
            export_with_dropout=self.export_with_dropout,
            model_spec=model_spec,
            training_spec=training_spec,
        )

    def grid_sample(self, i):
        """Select a full training configuration based on a grid search index.

        This method generates a grid of all possible configurations and
        selects the configuration at the given index. The flat index ``i`` is
        decomposed mixed-radix style: fastest-varying over model combinations,
        then training combinations, then the outer (columns x seq_length)
        grid.

        Args:
            i: The index of the configuration to select from the grid. Must be
                in ``range(self.n_combinations())``.

        Returns:
            A TrainModel instance populated with the selected configuration.
        """
        n_model_combinations = self.model_hyperparameter_sampling.n_combinations()
        n_training_combinations = (
            self.training_hyperparameter_sampling.n_combinations()
        )
        inner_combinations = n_model_combinations * n_training_combinations
        # Decompose the flat index into (model, training, outer) coordinates.
        i_model = i % n_model_combinations
        i_training = (i // n_model_combinations) % n_training_combinations
        i_outer = i // inner_combinations
        model_spec = self.model_hyperparameter_sampling.grid_sample(i_model)
        training_spec = self.training_hyperparameter_sampling.grid_sample(i_training)
        hyperparameter_combinations = list(
            product(np.arange(len(self.selected_columns)), self.seq_length)
        )
        selected_columns_index, seq_length = hyperparameter_combinations[i_outer]
        return TrainModel(
            project_path=self.project_path,
            ddconfig_path=self.ddconfig_path,
            model_name=self.hp_search_name + f"-run-{i}",
            training_data_path=self.training_data_path,
            validation_data_path=self.validation_data_path,
            read_format=self.read_format,
            selected_columns=self.selected_columns[selected_columns_index],
            column_types=self.column_types[selected_columns_index],
            categorical_columns=self.categorical_columns[selected_columns_index],
            real_columns=self.real_columns[selected_columns_index],
            target_columns=self.target_columns,
            target_column_types=self.target_column_types,
            id_maps=self.id_maps,
            seq_length=seq_length,
            n_classes=self.n_classes,
            inference_batch_size=self.inference_batch_size,
            seed=101,  # fixed seed so runs differ only in hyperparameters
            export_onnx=self.export_onnx,
            export_pt=self.export_pt,
            export_with_dropout=self.export_with_dropout,
            model_spec=model_spec,
            training_spec=training_spec,
        )

    def sample(self, i):
        """Sample a configuration based on the specified search strategy.

        This method delegates to either random_sample or grid_sample based on
        the `search_strategy` attribute.

        Args:
            i: The index of the sample or grid combination to generate.

        Returns:
            A TrainModel instance with a generated configuration.

        Raises:
            ValueError: If the search_strategy is not 'sample' or 'grid'.
        """
        if self.search_strategy == "sample":
            return self.random_sample(i)
        elif self.search_strategy == "grid":
            return self.grid_sample(i)
        else:
            # ValueError (a subclass of Exception, so existing handlers still
            # catch it) is the idiomatic signal for an invalid config value.
            raise ValueError(f"{self.search_strategy} invalid")

    def n_combinations(self):
        """Calculate the total number of possible configurations.

        This method computes the total number of unique configurations that
        can be generated by a grid search over all defined hyperparameters.

        Returns:
            The total number of possible hyperparameter configurations.
        """
        return (
            len(self.selected_columns)
            * len(self.seq_length)
            * self.model_hyperparameter_sampling.n_combinations()
            * self.training_hyperparameter_sampling.n_combinations()
        )