Source code for skgrf.ensemble.boosted_regressor

import logging
from abc import ABC
from abc import abstractmethod

import numpy as np
from numpy import random
from scipy import stats as ss
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
from sklearn.base import clone
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted

from skgrf import grf
from skgrf.base import GRFMixin
from skgrf.ensemble.regressor import GRFForestRegressor
from skgrf.utils.validation import check_sample_weight

logger = logging.getLogger(__name__)


class GRFBoostedForestRegressor(GRFMixin, RegressorMixin, BaseEstimator):
    r"""GRF Boosted Regression implementation for scikit-learn.

    Provides a sklearn regressor interface to the GRF C++ library using Cython.

    :param int n_estimators: The number of tree regressors to train.
    :param bool equalize_cluster_weights: Weight the samples such that clusters have
        equal weight. If ``False``, larger clusters will have more weight. If
        ``True``, the number of samples drawn from each cluster is equal to the size
        of the smallest cluster, and sample weights should not be passed on fitting.
    :param float sample_fraction: Fraction of samples used in each tree. If
        ``ci_group_size`` > 1, the max allowed fraction is 0.5.
    :param int mtry: The number of features to split on each node. The default is
        ``sqrt(p) + 20`` where ``p`` is the number of features.
    :param int min_node_size: The minimum number of observations in each tree leaf.
    :param bool honesty: Use honest splitting (subsample splitting).
    :param float honesty_fraction: The fraction of data used for subsample splitting.
    :param bool honesty_prune_leaves: Prune the estimation sample tree such that no
        leaves are empty. If ``False``, trees with empty leaves are skipped.
    :param float alpha: The maximum imbalance of a split.
    :param float imbalance_penalty: Penalty applied to imbalanced splits.
    :param int ci_group_size: The quantity of trees grown on each subsample. At least
        2 is required to provide confidence intervals.
    :param list(str) tune_params: A list of parameter names on which to perform
        tuning. Valid strings are "sample_fraction", "mtry", "min_node_size",
        "honesty_fraction", "honesty_prune_leaves", "alpha", "imbalance_penalty".
    :param int tune_n_estimators: The number of estimators to use in the tuning model.
    :param int tune_n_reps: The number of forests used in the tuning model.
    :param int tune_n_draws: The number of random parameter values for tuning model
        selection.
    :param int boost_steps: The number of boosting iterations.
    :param float boost_error_reduction: The fraction of the previous step's error
        below which a new boosting step's error must fall for boosting to continue.
    :param int boost_max_steps: The maximum number of boosting iterations.
    :param int boost_trees_tune: The number of trees used to test a new boosting step.
    :param int n_jobs: The number of threads. Default is number of CPU cores.
    :param int seed: Random seed value.

    :ivar int n_features_in\_: The number of features (columns) from the fit input
        ``X``.
    :ivar dict boosted_forests\_: The boosted regression forests.
    :ivar int mtry\_: The ``mtry`` value determined by validation.
    :ivar int outcome_index\_: The index of the grf train matrix holding the outcomes.
    :ivar list samples_per_cluster\_: The number of samples to train per cluster.
    :ivar list clusters\_: The cluster labels determined from the fit input
        ``cluster``.
    :ivar int n_clusters\_: The number of unique cluster labels from the fit input
        ``cluster``.
""" def __init__( self, n_estimators=100, equalize_cluster_weights=False, sample_fraction=0.5, mtry=None, min_node_size=5, honesty=True, honesty_fraction=0.5, honesty_prune_leaves=True, alpha=0.05, imbalance_penalty=0, ci_group_size=2, tune_params=None, tune_n_estimators=50, tune_n_reps=100, tune_n_draws=1000, boost_steps=None, boost_error_reduction=0.97, boost_max_steps=5, boost_trees_tune=10, n_jobs=-1, seed=42, ): self.n_estimators = n_estimators self.equalize_cluster_weights = equalize_cluster_weights self.sample_fraction = sample_fraction self.mtry = mtry self.min_node_size = min_node_size self.honesty = honesty self.honesty_fraction = honesty_fraction self.honesty_prune_leaves = honesty_prune_leaves self.alpha = alpha self.imbalance_penalty = imbalance_penalty self.ci_group_size = ci_group_size self.tune_params = tune_params self.tune_n_estimators = tune_n_estimators self.tune_n_reps = tune_n_reps self.tune_n_draws = tune_n_draws self.boost_steps = boost_steps self.boost_error_reduction = boost_error_reduction self.boost_max_steps = boost_max_steps self.boost_trees_tune = boost_trees_tune self.n_jobs = n_jobs self.seed = seed
    def fit(self, X, y, sample_weight=None, cluster=None):
        """Fit the grf forest using training data.

        :param array2d X: training input features
        :param array1d y: training input targets
        :param array1d sample_weight: optional weights for input samples
        :param array1d cluster: optional cluster assignments for input samples
        """
        X, y = self._validate_data(X, y, force_all_finite="allow-nan")
        self._check_n_features(X, reset=True)

        self._check_boost_error_reduction()
        self._check_sample_fraction(oob=True)
        self._check_alpha()

        sample_weight, _ = check_sample_weight(sample_weight, X)

        cluster_ = self._check_cluster(X=X, cluster=cluster)
        self.samples_per_cluster_ = self._check_equalize_cluster_weights(
            cluster=cluster_, sample_weight=sample_weight
        )
        self.mtry_ = self._check_mtry(X=X)

        _ = self._create_train_matrices(X=X, y=y, sample_weight=sample_weight)

        # region tuning a regression forest
        regression_forest = GRFForestRegressor(
            n_estimators=self.tune_n_estimators,
            equalize_cluster_weights=self.equalize_cluster_weights,
            sample_fraction=self.sample_fraction,
            mtry=self.mtry,
            min_node_size=self.min_node_size,
            honesty=self.honesty,
            honesty_fraction=self.honesty_fraction,
            honesty_prune_leaves=self.honesty_prune_leaves,
            alpha=self.alpha,
            imbalance_penalty=self.imbalance_penalty,
            ci_group_size=self.ci_group_size,
            n_jobs=self.n_jobs,
            seed=self.seed,
        )
        if self.tune_params is None:
            logger.debug("not tuning boosted forest")
            regression_forest.fit(
                X=X,
                y=y,
                sample_weight=sample_weight,
                cluster=cluster,
                compute_oob_predictions=True,
            )
            params = regression_forest.get_params(deep=True)
            forest = regression_forest
        else:
            logger.debug("tuning boosted forest")
            tunable_params = (
                "sample_fraction",
                "mtry",
                "min_node_size",
                "honesty_fraction",
                "honesty_prune_leaves",
                "alpha",
                "imbalance_penalty",
            )
            param_distributions = {}
            for param in self.tune_params:
                if param not in tunable_params:
                    raise ValueError(
                        f"tuning param {param} not found in {str(tunable_params)}"
                    )
                param_distributions[param] = PARAM_DISTRIBUTIONS[param](*X.shape)

            uniform_samples = random.uniform(
                size=(self.tune_n_draws, len(self.tune_params))
            )
            param_samples = np.zeros(shape=(self.tune_n_draws, len(self.tune_params)))
            for idx, param in enumerate(self.tune_params):
                param_samples[:, idx] = param_distributions[param].dist(
                    uniform_samples[:, idx]
                )

            errors = []
            for draw in range(self.tune_n_draws):
                params = {
                    p: param_samples[draw, idx]
                    for idx, p in enumerate(self.tune_params)
                }
                regression_forest.set_params(**params)
                regression_forest.fit(
                    X=X,
                    y=y,
                    sample_weight=sample_weight,
                    cluster=cluster,
                    compute_oob_predictions=True,
                )
                errors.append(
                    np.nanmean(regression_forest.grf_forest_["debiased_error"])
                )

            if np.any(np.isnan(errors)):
                raise ValueError(
                    "unable to tune because of NaN-valued forest error estimates; "
                    "consider more trees"
                )
            if np.std(errors) == 0 or np.std(errors) / np.mean(errors) < 1e-10:
                raise ValueError(
                    "unable to tune because of constant errors for forests; "
                    "consider more trees"
                )

            variance_guess = np.var(errors) / 2
            gp = GaussianProcessRegressor(alpha=variance_guess)
            gp.fit(uniform_samples, errors)

            opt_samples = random.uniform(
                size=(self.tune_n_draws, len(self.tune_params))
            )
            model_surface = gp.predict(opt_samples)
            tuned_params = np.zeros(shape=(self.tune_n_draws, len(self.tune_params)))
            for idx, param in enumerate(self.tune_params):
                tuned_params[:, idx] = param_distributions[param].dist(
                    opt_samples[:, idx]
                )
            opt_idx = np.argmin(model_surface)
            params = {
                p: tuned_params[opt_idx, idx]
                for idx, p in enumerate(self.tune_params)
            }
params.update(**{"n_estimators": self.tune_n_estimators * 4}) regression_forest.set_params(**params) regression_forest.fit( X, y, sample_weight=sample_weight, cluster=cluster, compute_oob_predictions=True, ) retrained_error = np.nanmean( regression_forest.grf_forest_["debiased_error"] ) default_params = { "sample_fraction": 0.5, "mtry": min(np.ceil(np.sqrt(X.shape[1]) + 20), X.shape[1]), "min_node_size": 5, "honesty_fraction": 0.5, "honesty_prune_leaves": True, "alpha": 0.05, "imbalance_penalty": 0, } default_forest = clone(regression_forest) default_forest.set_params(**default_params) default_forest.fit( X=X, y=y, sample_weight=sample_weight, cluster=cluster, compute_oob_predictions=True, ) default_error = np.nanmean(default_forest.grf_forest_["debiased_error"]) if default_error < retrained_error: params = default_forest.get_params() forest = default_forest else: params = regression_forest.get_params() forest = regression_forest # endregion # region boosting with the tuned forest logger.debug("boosting forest") current_pred = { "predictions": forest.grf_forest_["predictions"], "debiased_error": forest.grf_forest_["debiased_error"], "excess_error": forest.grf_forest_["excess_error"], } y_hat = np.atleast_1d(np.squeeze(np.array(current_pred["predictions"]))) debiased_error = current_pred["debiased_error"] boosted_forests = { "forest": [forest], "error": [np.mean(debiased_error)], } step = 1 while True: y_residual = y - y_hat if self.boost_steps is not None: if step > self.boost_steps: break elif step > self.boost_max_steps: break else: forest_small = GRFForestRegressor( sample_fraction=params["sample_fraction"], mtry=params["mtry"], n_estimators=self.boost_trees_tune, n_jobs=self.n_jobs, min_node_size=params["min_node_size"], honesty=self.honesty, honesty_fraction=params["honesty_fraction"], honesty_prune_leaves=params["honesty_prune_leaves"], seed=self.seed, ci_group_size=self.ci_group_size, alpha=params["alpha"], imbalance_penalty=params["imbalance_penalty"], equalize_cluster_weights=self.equalize_cluster_weights, ) forest_small.fit( X=X, y=y_residual, sample_weight=sample_weight, cluster=cluster, compute_oob_predictions=True, ) step_error = forest_small.grf_forest_["debiased_error"] if not np.nanmean( step_error ) <= self.boost_error_reduction * np.nanmean(debiased_error): break forest_residual = GRFForestRegressor( sample_fraction=params["sample_fraction"], mtry=params["mtry"], n_estimators=self.n_estimators, n_jobs=self.n_jobs, min_node_size=params["min_node_size"], honesty=self.honesty, honesty_fraction=params["honesty_fraction"], honesty_prune_leaves=params["honesty_prune_leaves"], seed=self.seed, ci_group_size=self.ci_group_size, alpha=params["alpha"], imbalance_penalty=params["imbalance_penalty"], equalize_cluster_weights=self.equalize_cluster_weights, ) forest_residual.fit( X, y_residual, sample_weight=sample_weight, cluster=cluster, compute_oob_predictions=True, ) current_pred = { "predictions": forest_residual.grf_forest_["predictions"], "debiased_error": forest_residual.grf_forest_["debiased_error"], "excess_error": forest_residual.grf_forest_["excess_error"], } y_hat = y_hat + np.atleast_1d( np.squeeze(np.array(current_pred["predictions"])) ) debiased_error = current_pred["debiased_error"] boosted_forests["forest"].append(forest_residual) boosted_forests["error"].append(np.mean(debiased_error)) step += 1 # endregion boosted_forests["predictions"] = y_hat self.boosted_forests_ = boosted_forests return self
    def predict(self, X, boost_predict_steps=None):
        """Predict regression target for X.

        :param array2d X: prediction input features
        :param int boost_predict_steps: number of boost prediction steps
        """
        check_is_fitted(self)
        X = check_array(X, force_all_finite="allow-nan")
        self._check_n_features(X, reset=False)

        num_forests = len(self.boosted_forests_["forest"])
        if boost_predict_steps is None:
            boost_predict_steps = num_forests
        else:
            boost_predict_steps = min(boost_predict_steps, num_forests)

        y_hat = 0
        for forest in self.boosted_forests_["forest"][:boost_predict_steps]:
            forest._ensure_ptr()
            result = grf.regression_predict(
                forest.grf_forest_cpp_,
                np.asfortranarray([[]]),  # train_matrix
                self.outcome_index_,
                np.asfortranarray(X.astype("float64")),  # test_matrix
                self._get_num_threads(),
                False,  # estimate_variance
            )
            y = np.atleast_1d(np.squeeze(np.array(result["predictions"])))
            y_hat = y_hat + y
        return y_hat
    def _more_tags(self):
        return {
            "requires_y": True,
            "_xfail_checks": {
                "check_sample_weights_invariance": (
                    "zero sample_weight is not equivalent to removing samples"
                ),
            },
            "allow_nan": True,
        }
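
# Illustrative sketch (not part of the skgrf source): the class docstring above says
# the default ``mtry`` is ``sqrt(p) + 20`` capped at the number of features ``p``,
# which is also the value used for ``default_params`` in ``fit``. The helper name
# below is hypothetical and only restates that arithmetic.
def _default_mtry_sketch(n_features):
    # e.g. n_features=10  -> min(ceil(sqrt(10) + 20), 10)   = 10
    #      n_features=500 -> min(ceil(sqrt(500) + 20), 500) = 43
    return int(min(np.ceil(np.sqrt(n_features) + 20), n_features))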
# region random parameter tuning functions
class GRFParamDistribution(ABC):
    def __init__(self, X_rows: int, X_cols: int):
        self.X_rows = X_rows
        self.X_cols = X_cols

    @abstractmethod
    def rvs(self, *args, **kwds):  # pragma: no cover
        raise NotImplementedError()


class GRFMinNodeSizeDistribution(GRFParamDistribution):
    def rvs(self, *args, **kwds):  # pragma: no cover
        return self.dist(ss.uniform(*args, **kwds))

    def dist(self, uniform):
        return np.floor(np.exp2(uniform * (np.log(self.X_rows) - np.log(2) - 4)))


class GRFSampleFractionDistribution(GRFParamDistribution):
    def rvs(self, *args, **kwds):  # pragma: no cover
        return self.dist(ss.uniform(*args, **kwds))

    def dist(self, uniform):
        return 0.05 + 0.45 * uniform


class GRFMtryDistribution(GRFParamDistribution):
    def rvs(self, *args, **kwds):  # pragma: no cover
        return self.dist(ss.uniform(*args, **kwds))

    def dist(self, uniform):
        return np.ceil(np.min([self.X_cols, np.sqrt(self.X_cols) + 20]) * uniform)


class GRFAlphaDistribution(GRFParamDistribution):
    def rvs(self, *args, **kwds):  # pragma: no cover
        return self.dist(ss.uniform(*args, **kwds))

    def dist(self, uniform):
        return uniform / 4


class GRFImbalancePenaltyDistribution(GRFParamDistribution):
    def rvs(self, *args, **kwds):  # pragma: no cover
        return self.dist(ss.uniform(*args, **kwds))

    def dist(self, uniform):
        return -np.log(uniform)


class GRFHonestyFractionDistribution(GRFParamDistribution):
    def rvs(self, *args, **kwds):  # pragma: no cover
        return self.dist(ss.uniform(*args, **kwds))

    def dist(self, uniform):
        return 0.5 + 0.3 * uniform


class GRFHonestyPruneLeavesDistribution(GRFParamDistribution):
    def __init__(self, X_rows: int, X_cols: int):
        super().__init__(X_rows, X_cols)
        self.choices = np.array([True, False])

    def rvs(self, *args, **kwds):  # pragma: no cover
        return self.dist(ss.uniform(*args, **kwds))

    def dist(self, uniform):
        return self.choices[(uniform < 0.5).astype(int)]


PARAM_DISTRIBUTIONS = {
    "min_node_size": GRFMinNodeSizeDistribution,
    "sample_fraction": GRFSampleFractionDistribution,
    "mtry": GRFMtryDistribution,
    "alpha": GRFAlphaDistribution,
    "imbalance_penalty": GRFImbalancePenaltyDistribution,
    "honesty_fraction": GRFHonestyFractionDistribution,
    "honesty_prune_leaves": GRFHonestyPruneLeavesDistribution,
}
# endregion
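

# Illustrative usage sketch (not part of the skgrf source). The synthetic data from
# scikit-learn's ``make_regression`` and the specific parameter values below are
# assumptions chosen only for demonstration; the estimator API used here (``fit``,
# ``predict``, ``boost_predict_steps``, ``tune_params``) is the one defined above.
if __name__ == "__main__":
    from sklearn.datasets import make_regression

    X, y = make_regression(n_samples=200, n_features=10, noise=1.0, random_state=42)
    boosted = GRFBoostedForestRegressor(
        n_estimators=100,
        tune_params=["mtry", "min_node_size"],  # omit to skip tuning entirely
        tune_n_draws=50,  # small draw count to keep the demo quick
        boost_max_steps=3,
        seed=42,
    )
    boosted.fit(X, y)
    # Predictions accumulate across the boosted forests; cap the number of
    # boosting steps used at predict time with ``boost_predict_steps``.
    y_hat = boosted.predict(X, boost_predict_steps=2)
    print(y_hat[:5])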