Source code for clearbox_synthetic.utils.autoconfig.autoconfig

"""
This module provides functionality for automatically configuring a tabular data
engine and searching for its optimal parameters. The main class, ``Autoconfig``,
performs a grid search over various model architectures and batch sizes to find
the best configuration based on reconstruction loss.
"""

import math
import threading
from typing import List

import numpy as np


def learning_rule(training_rows_size: int):
    """
    Pick training hyperparameters from the number of training rows.

    The learning rate is constant; the number of epochs shrinks and the batch
    size grows as the dataset gets larger:

    ==================  ======  ==========
    rows                epochs  batch size
    ==================  ======  ==========
    fewer than 1000     1000    16
    fewer than 10000    500     32
    fewer than 50000    300     128
    50000 or more       100     256
    ==================  ======  ==========

    Parameters
    ----------
    training_rows_size : int
        The number of rows in the training dataset.

    Returns
    -------
    Tuple[float, int, int]
        A tuple containing (learning_rate, epochs, batch_size).
    """
    # Each entry is (exclusive upper bound on rows, epochs, batch size),
    # ordered by ascending bound so the first matching tier wins.
    size_tiers = (
        (1000, 1000, 16),
        (10000, 500, 32),
        (50000, 300, 128),
    )

    # Values used when the dataset is at least as large as every bound.
    epochs, batch_size = 100, 256
    for row_limit, tier_epochs, tier_batch_size in size_tiers:
        if training_rows_size < row_limit:
            epochs, batch_size = tier_epochs, tier_batch_size
            break

    return 0.001, epochs, batch_size
class Autoconfig:
    """
    Holds a train/test split of a dataset plus its feature sizes, as the
    starting point for searching tabular engine parameters.

    Attributes
    ----------
    train_ds : np.ndarray
        The leading ``ceil(0.8 * n_rows)`` rows of the dataset passed in.
    test_ds : np.ndarray
        The remaining trailing rows of the dataset passed in.
    y_train_ds : np.ndarray, optional
        Target values split the same way (leading part), or None when no
        targets were given.
    y_test_ds : np.ndarray, optional
        Target values split the same way (trailing part), or None when no
        targets were given.
    numerical_features_sizes : int
        The size of the numerical features, stored as passed in.
    categorical_features_sizes : List
        The sizes of the categorical features, stored as passed in.
    """

    def __init__(
        self,
        train_ds: np.ndarray,
        numerical_features_sizes: int,
        categorical_features_sizes: List,
        y_train_ds: np.ndarray = None,
    ):
        """
        Split the data into training and test parts and record feature sizes.

        The split is positional (no shuffling): the first
        ``ceil(0.8 * n_rows)`` rows become the training part and the rest
        become the test part. When targets are supplied, they are split by the
        same rule, using their own row count.

        Parameters
        ----------
        train_ds : np.ndarray
            The complete dataset to split.
        numerical_features_sizes : int
            The size of the numerical features.
        categorical_features_sizes : List
            The sizes of the categorical features.
        y_train_ds : np.ndarray, optional
            The target values for the dataset. Defaults to None.
        """

        def _leading_trailing(rows):
            # Cut along axis 0 at 80% of the rows, rounded up.
            boundary = math.ceil(rows.shape[0] * 0.8)
            return np.split(rows, [boundary], axis=0)

        self.train_ds, self.test_ds = _leading_trailing(train_ds)

        # Targets are optional; without them both target attributes stay None.
        self.y_train_ds = None
        self.y_test_ds = None
        if y_train_ds is not None:
            self.y_train_ds, self.y_test_ds = _leading_trailing(y_train_ds)

        self.numerical_features_sizes = numerical_features_sizes
        self.categorical_features_sizes = categorical_features_sizes