Source code for sequifier.config.hyperparameter_search_config

import json
from itertools import product
from typing import Optional

import numpy as np
import yaml
from beartype import beartype
from pydantic import BaseModel, Field, validator

from sequifier.config.train_config import (
    DotDict,
    ModelSpecModel,
    TrainingSpecModel,
    TrainModel,
)
from sequifier.helpers import normalize_path


@beartype
def load_hyperparameter_search_config(
    config_path: str, on_unprocessed: bool
) -> "HyperparameterSearch":
    """Load a hyperparameter search configuration from a YAML file.

    This function reads a YAML configuration file and, unless the data is
    unprocessed, merges in defaults derived from the data-driven config
    (column types, selected columns, class counts, split paths and id maps)
    before constructing a HyperparameterSearch object.

    Args:
        config_path: The path to the hyperparameter search configuration file.
        on_unprocessed: A boolean flag indicating whether the configuration is
            for unprocessed data. If False, it will load and integrate
            data-driven configurations.

    Returns:
        An instance of the HyperparameterSearch class, populated with the
        configuration from the file.
    """
    with open(config_path, "r") as f:
        config_values = yaml.safe_load(f)

    if not on_unprocessed:
        ddconfig_path = config_values.get("ddconfig_path")

        with open(
            normalize_path(ddconfig_path, config_values["project_path"]), "r"
        ) as f:
            dd_config = json.loads(f.read())

        # column_types is a list of candidate mappings (one per candidate
        # column selection); default to the single mapping found in the
        # data-driven config.
        config_values["column_types"] = config_values.get(
            "column_types", [dd_config["column_types"]]
        )

        # BUG FIX: the original accessed config_values["selected_columns"]
        # with a direct key lookup (KeyError when the key is absent) and then
        # called .keys() on the *list* of column-type mappings
        # (AttributeError). Default to one column set per column_types
        # candidate so the paired-length validator on HyperparameterSearch
        # holds.
        if config_values.get("selected_columns") is None:
            config_values["selected_columns"] = [
                list(column_types.keys())
                for column_types in config_values["column_types"]
            ]

        # Int64 columns are treated as categorical, Float64 as real-valued.
        config_values["categorical_columns"] = [
            [
                col
                for col, type_ in dd_config["column_types"].items()
                if type_ == "Int64" and col in selected_columns
            ]
            for selected_columns in config_values["selected_columns"]
        ]

        config_values["real_columns"] = [
            [
                col
                for col, type_ in dd_config["column_types"].items()
                if type_ == "Float64" and col in selected_columns
            ]
            for selected_columns in config_values["selected_columns"]
        ]

        config_values["n_classes"] = config_values.get(
            "n_classes", dd_config["n_classes"]
        )
        config_values["training_data_path"] = normalize_path(
            config_values.get("training_data_path", dd_config["split_paths"][0]),
            config_values["project_path"],
        )
        # Use the second split for validation when it exists, otherwise fall
        # back to the only available split.
        config_values["validation_data_path"] = normalize_path(
            config_values.get(
                "validation_data_path",
                dd_config["split_paths"][min(1, len(dd_config["split_paths"]) - 1)],
            ),
            config_values["project_path"],
        )

        config_values["id_maps"] = dd_config["id_maps"]

    return HyperparameterSearch(**config_values)


class TrainingSpecHyperparameterSampling(BaseModel):
    """Pydantic model for training specification hyperparameter sampling.

    Candidate values for ``lr``, ``epochs`` and ``scheduler`` are paired by
    index (enforced by the validator below); the remaining list-valued
    fields are sampled independently.

    Attributes:
        device: The device to train on (e.g., 'cuda', 'cpu').
        epochs: A list of possible numbers of epochs, paired with lr/scheduler.
        log_interval: The interval in batches for logging.
        class_share_log_columns: Columns for which to log class share.
        early_stopping_epochs: Number of epochs for early stopping.
        iter_save: Interval in epochs for saving model checkpoints.
        batch_size: A list of possible batch sizes.
        lr: A list of possible learning rates.
        criterion: A dictionary mapping target columns to loss functions.
        class_weights: Optional dictionary mapping columns to class weights.
        accumulation_steps: A list of possible gradient accumulation steps.
        dropout: A list of possible dropout rates.
        loss_weights: Optional dictionary mapping columns to loss weights.
        optimizer: A list of possible optimizer configurations.
        scheduler: A list of possible scheduler configurations, paired with lr.
        continue_training: Flag to continue training from a checkpoint.
    """

    device: str
    epochs: list[int]
    log_interval: int = 10
    class_share_log_columns: list[str] = Field(default_factory=list)
    early_stopping_epochs: Optional[int] = None
    iter_save: int
    batch_size: list[int]
    lr: list[float]
    criterion: dict[str, str]
    class_weights: Optional[dict[str, list[float]]] = None
    accumulation_steps: list[int]
    dropout: list[float] = Field(default_factory=lambda: [0.0])
    loss_weights: Optional[dict[str, float]] = None
    optimizer: list[DotDict] = Field(
        default_factory=lambda: [DotDict({"name": "Adam"})]
    )
    scheduler: list[DotDict] = Field(
        default_factory=lambda: [
            DotDict({"name": "StepLR", "step_size": 1, "gamma": 0.99})
        ]
    )
    continue_training: bool = True

    def __init__(self, **kwargs):
        """Initialize the instance, wrapping optimizer/scheduler configs.

        Args:
            **kwargs: Keyword arguments corresponding to the attributes of
                this class. 'optimizer' and 'scheduler', when given, are
                lists of plain dicts that are converted to DotDict objects.
        """
        super().__init__(
            **{k: v for k, v in kwargs.items() if k not in ["optimizer", "scheduler"]}
        )
        # BUG FIX: the original indexed kwargs["optimizer"] and
        # kwargs["scheduler"] unconditionally, raising KeyError when either
        # was omitted even though both fields declare defaults. Only
        # override the field defaults when a value was actually provided.
        if "optimizer" in kwargs:
            self.optimizer = [
                DotDict(optimizer_config) for optimizer_config in kwargs["optimizer"]
            ]
        if "scheduler" in kwargs:
            self.scheduler = [
                DotDict(scheduler_config) for scheduler_config in kwargs["scheduler"]
            ]

    @validator("scheduler")
    def validate_model_spec(cls, v, values):
        # lr, epochs and scheduler candidates are selected by one shared
        # index, so their lists must have equal lengths.
        assert (
            len(values["lr"]) == len(v)
        ), "lr and scheduler must have the same number of candidate values, that are paired"
        assert (
            len(values["epochs"]) == len(v)
        ), "epochs and scheduler must have the same number of candidate values, that are paired"
        return v

    def random_sample(self):
        """Randomly sample a set of training hyperparameters.

        lr, epochs and scheduler are drawn with a single shared index so
        their pairing is preserved; the other fields are drawn
        independently.

        Returns:
            A TrainingSpecModel instance populated with a randomly sampled
            set of hyperparameters.
        """
        lr_and_scheduler_index = np.random.randint(len(self.lr))
        optimizer_index = np.random.randint(len(self.optimizer))
        batch_size = np.random.choice(self.batch_size)
        dropout = np.random.choice(self.dropout)
        accumulation_steps = np.random.choice(self.accumulation_steps)
        optimizer = self.optimizer[optimizer_index]
        lr = self.lr[lr_and_scheduler_index]
        print(f"{lr = } - {batch_size = } - {dropout = } - {optimizer = }")
        return self._to_training_spec(
            lr_and_scheduler_index, batch_size, dropout, optimizer, accumulation_steps
        )

    def grid_sample(self, i):
        """Select a set of training hyperparameters by grid-search index.

        Args:
            i: Flat index into the cartesian product of paired lr indices,
                batch_size, dropout, optimizer and accumulation_steps.

        Returns:
            A TrainingSpecModel instance populated with the selected set of
            hyperparameters.
        """
        hyperparameter_combinations = list(
            product(
                np.arange(len(self.lr)),
                self.batch_size,
                self.dropout,
                self.optimizer,
                self.accumulation_steps,
            )
        )
        lr_and_scheduler_index, batch_size, dropout, optimizer, accumulation_steps = (
            hyperparameter_combinations[i]
        )
        lr = self.lr[lr_and_scheduler_index]
        print(f"{lr = } - {batch_size = } - {dropout = } - {optimizer = }")
        return self._to_training_spec(
            lr_and_scheduler_index, batch_size, dropout, optimizer, accumulation_steps
        )

    def _to_training_spec(
        self,
        lr_and_scheduler_index,
        batch_size,
        dropout,
        optimizer,
        accumulation_steps,
    ):
        """Assemble a TrainingSpecModel from one sampled combination."""
        return TrainingSpecModel(
            device=self.device,
            epochs=self.epochs[lr_and_scheduler_index],
            log_interval=self.log_interval,
            class_share_log_columns=self.class_share_log_columns,
            early_stopping_epochs=self.early_stopping_epochs,
            iter_save=self.iter_save,
            batch_size=batch_size,
            lr=self.lr[lr_and_scheduler_index],
            criterion=self.criterion,
            class_weights=self.class_weights,
            accumulation_steps=accumulation_steps,
            dropout=dropout,
            loss_weights=self.loss_weights,
            optimizer=optimizer,
            scheduler=self.scheduler[lr_and_scheduler_index],
        )

    def n_combinations(self):
        """Return the total number of training hyperparameter combinations."""
        return (
            len(self.lr)
            * len(self.batch_size)
            * len(self.dropout)
            * len(self.optimizer)
            * len(self.accumulation_steps)
        )
class ModelSpecHyperparameterSampling(BaseModel):
    """Candidate values for the transformer model architecture.

    ``d_model``, ``d_model_by_column`` and ``nhead`` are paired by index
    (enforced by the validator); ``d_hid`` and ``nlayers`` vary
    independently.

    Attributes:
        d_model: Candidate sizes for the model embedding dimension.
        d_model_by_column: Optional per-column embedding sizes, paired with
            d_model by index.
        nhead: Candidate attention-head counts, paired with d_model by index.
        d_hid: Candidate feedforward-network dimensions.
        nlayers: Candidate transformer layer counts.
    """

    d_model: list[int]
    d_model_by_column: Optional[list[dict[str, int]]]
    nhead: list[int]
    d_hid: list[int]
    nlayers: list[int]

    @validator("nhead")
    def validate_model_spec(cls, v, values):
        # Paired candidate lists must line up one-to-one.
        if values["d_model_by_column"] is not None:
            assert (
                len(values["d_model"]) == len(values["d_model_by_column"])
            ), "d_model and d_model_by_column must have the same number of candidate values, that are paired"
        assert (
            len(values["d_model"]) == len(v)
        ), "d_model and nhead must have the same number of candidate values, that are paired"
        return v

    def random_sample(self):
        """Draw one random model-architecture candidate.

        A single random index selects the paired (d_model,
        d_model_by_column, nhead) triple; d_hid and nlayers are drawn
        independently.

        Returns:
            A ModelSpecModel built from the sampled values.
        """
        paired_index = np.random.randint(len(self.d_model))
        # NOTE: variable names below feed the `{x = }` debug print.
        d_hid = np.random.choice(self.d_hid)
        nlayers = np.random.choice(self.nlayers)
        d_model = self.d_model[paired_index]
        if self.d_model_by_column is None:
            d_model_by_column = None
        else:
            d_model_by_column = self.d_model_by_column[paired_index]
        print(f"{d_model = } - {d_hid = } - {nlayers = }")
        return ModelSpecModel(
            d_model=d_model,
            d_model_by_column=d_model_by_column,
            nhead=self.nhead[paired_index],
            d_hid=d_hid,
            nlayers=nlayers,
        )

    def grid_sample(self, i):
        """Return the i-th model-architecture candidate from the full grid.

        Args:
            i: Flat index into the cartesian product of paired d_model
                indices, d_hid values and nlayers values.

        Returns:
            A ModelSpecModel for the selected combination.
        """
        grid = list(product(np.arange(len(self.d_model)), self.d_hid, self.nlayers))
        paired_index, d_hid, nlayers = grid[i]
        d_model = self.d_model[paired_index]
        print(f"{d_model = } - {d_hid = } - {nlayers = }")
        if self.d_model_by_column is None:
            d_model_by_column = None
        else:
            d_model_by_column = self.d_model_by_column[paired_index]
        return ModelSpecModel(
            d_model=d_model,
            d_model_by_column=d_model_by_column,
            nhead=self.nhead[paired_index],
            d_hid=d_hid,
            nlayers=nlayers,
        )

    def n_combinations(self):
        """Return the size of the model hyperparameter grid."""
        return len(self.d_model) * len(self.d_hid) * len(self.nlayers)
class HyperparameterSearch(BaseModel):
    """Pydantic model for hyperparameter search configuration.

    Attributes:
        project_path: The path to the sequifier project directory.
        ddconfig_path: The path to the data-driven configuration file.
        hp_search_name: The name for the hyperparameter search.
        search_strategy: The search strategy, either "sample" or "grid".
        n_samples: The number of samples to draw for the search.
        model_config_write_path: The path to write the model configurations to.
        training_data_path: The path to the training data.
        validation_data_path: The path to the validation data.
        read_format: The file format of the input data.
        selected_columns: A list of candidate column sets for training.
        column_types: A list of dictionaries mapping columns to their types,
            paired with selected_columns by index.
        categorical_columns: A list of lists of categorical columns.
        real_columns: A list of lists of real-valued columns.
        target_columns: The list of target columns for model training.
        target_column_types: A dictionary mapping target columns to their types.
        id_maps: A dictionary mapping categorical values to their indexed
            representation.
        seq_length: A list of possible sequence lengths.
        n_classes: The number of classes for each categorical column.
        inference_batch_size: The batch size for inference.
        export_onnx: If True, exports the model in ONNX format.
        export_pt: If True, exports the model using torch.save.
        export_with_dropout: If True, exports the model with dropout enabled.
        model_hyperparameter_sampling: The sampling configuration for model
            hyperparameters.
        training_hyperparameter_sampling: The sampling configuration for
            training hyperparameters.
    """

    project_path: str
    ddconfig_path: str
    hp_search_name: str
    search_strategy: str = "sample"  # "sample" or "grid"
    n_samples: Optional[int]
    model_config_write_path: str
    training_data_path: str
    validation_data_path: str
    read_format: str = "parquet"
    selected_columns: list[list[str]]
    column_types: list[dict[str, str]]
    categorical_columns: list[list[str]]
    real_columns: list[list[str]]
    target_columns: list[str]
    target_column_types: dict[str, str]
    id_maps: dict[str, dict[str | int, int]]
    seq_length: list[int]
    n_classes: dict[str, int]
    inference_batch_size: int
    export_onnx: bool = True
    export_pt: bool = False
    export_with_dropout: bool = False
    model_hyperparameter_sampling: ModelSpecHyperparameterSampling
    training_hyperparameter_sampling: TrainingSpecHyperparameterSampling

    @validator("column_types")
    def validate_model_spec(cls, v, values):
        # selected_columns and column_types candidates are paired by index.
        if v is not None:
            assert (
                len(values["selected_columns"]) == len(v)
            ), "selected_columns and column_types must have the same number of candidate values, that are paired"
        return v

    def _to_train_model(
        self, i, selected_columns_index, seq_length, model_spec, training_spec
    ):
        """Assemble a TrainModel for one configuration (seed fixed at 101)."""
        return TrainModel(
            project_path=self.project_path,
            ddconfig_path=self.ddconfig_path,
            model_name=self.hp_search_name + f"-run-{i}",
            training_data_path=self.training_data_path,
            validation_data_path=self.validation_data_path,
            read_format=self.read_format,
            selected_columns=self.selected_columns[selected_columns_index],
            column_types=self.column_types[selected_columns_index],
            categorical_columns=self.categorical_columns[selected_columns_index],
            real_columns=self.real_columns[selected_columns_index],
            target_columns=self.target_columns,
            target_column_types=self.target_column_types,
            id_maps=self.id_maps,
            seq_length=seq_length,
            n_classes=self.n_classes,
            inference_batch_size=self.inference_batch_size,
            seed=101,
            export_onnx=self.export_onnx,
            export_pt=self.export_pt,
            export_with_dropout=self.export_with_dropout,
            model_spec=model_spec,
            training_spec=training_spec,
        )

    def random_sample(self, i):
        """Randomly sample a full training configuration.

        Model and training hyperparameters, the column set and the sequence
        length are each drawn at random.

        Args:
            i: The index of the sample, used to create a unique model name.

        Returns:
            A TrainModel instance populated with a randomly sampled
            configuration.
        """
        model_spec = self.model_hyperparameter_sampling.random_sample()
        training_spec = self.training_hyperparameter_sampling.random_sample()
        selected_columns_index = np.random.randint(len(self.selected_columns))
        seq_length = np.random.choice(self.seq_length)
        print(f"{selected_columns_index = } - {seq_length = }")
        return self._to_train_model(
            i, selected_columns_index, seq_length, model_spec, training_spec
        )

    def grid_sample(self, i):
        """Select a full training configuration based on a grid search index.

        The flat index is decomposed positionally: the model grid varies
        fastest, then the training grid, then the (column set, seq_length)
        grid.

        Args:
            i: The index of the configuration to select from the grid.

        Returns:
            A TrainModel instance populated with the selected configuration.
        """
        # Typo fix: locals were named *_hyperparamter_sample in the original.
        n_model_combinations = self.model_hyperparameter_sampling.n_combinations()
        n_training_combinations = (
            self.training_hyperparameter_sampling.n_combinations()
        )
        inner_combinations = n_model_combinations * n_training_combinations

        i_model = i % n_model_combinations
        i_training = (i // n_model_combinations) % n_training_combinations
        i_outer = i // inner_combinations

        model_spec = self.model_hyperparameter_sampling.grid_sample(i_model)
        training_spec = self.training_hyperparameter_sampling.grid_sample(i_training)

        outer_grid = list(
            product(np.arange(len(self.selected_columns)), self.seq_length)
        )
        selected_columns_index, seq_length = outer_grid[i_outer]

        return self._to_train_model(
            i, selected_columns_index, seq_length, model_spec, training_spec
        )

    def sample(self, i):
        """Sample a configuration based on the specified search strategy.

        Delegates to random_sample or grid_sample depending on the
        `search_strategy` attribute.

        Args:
            i: The index of the sample or grid combination to generate.

        Returns:
            A TrainModel instance with a generated configuration.

        Raises:
            ValueError: If search_strategy is not 'sample' or 'grid'.
        """
        if self.search_strategy == "sample":
            return self.random_sample(i)
        elif self.search_strategy == "grid":
            return self.grid_sample(i)
        else:
            # BUG FIX: raise ValueError (still caught by `except Exception`)
            # instead of a bare Exception for an invalid configuration value.
            raise ValueError(
                f"search_strategy must be 'sample' or 'grid', got {self.search_strategy!r}"
            )

    def n_combinations(self):
        """Return the total number of grid-search configurations."""
        return (
            len(self.selected_columns)
            * len(self.seq_length)
            * self.model_hyperparameter_sampling.n_combinations()
            * self.training_hyperparameter_sampling.n_combinations()
        )