def sigmoid_A(self, x):
    """Return the element-wise log-sigmoid of the scaled decision values.

    Computes ``log(sigmoid(self.estimator_parameter * x))`` through
    ``scipy.special.log_expit``. Despite the method name, the result is the
    *logarithm* of the sigmoid, so every returned value is <= 0.

    Args:
        x: Decision values. A scalar, an ``np.ndarray`` or any array-like
            (for example a list of floats).

    Returns:
        The log-sigmoid values, with the same shape as ``x``.
    """
    # Coerce to an array before scaling: with a plain Python list and an int
    # parameter, ``int * list`` would repeat the list instead of scaling its
    # elements. ndarray inputs (and subclasses) pass through untouched.
    scaled = self.estimator_parameter * np.asanyarray(x)
    return log_expit(scaled)
def tuning_A_by_cross_validation(
    self,
    y: sparse.csr_matrix,
    x: sparse.csr_matrix,
    n_folds: int,
    batch_size: int,
    beamwidth: int,
    metric: list,
    A_candidates: list,
    options: str = "",
    K=100,
    dmax=10,
) -> None:
    """Select ``self.estimator_parameter`` by n-fold cross validation.

    The rows of ``x``/``y`` are cut into ``n_folds`` contiguous splits. For each
    split a fresh tree is trained with ``train_tree`` on the remaining splits,
    and every candidate value is evaluated on the held-out split by setting it
    as the trained model's ``estimator_parameter`` and predicting in batches.
    Metric values are summed over the folds and the candidate with the highest
    total is stored in ``self.estimator_parameter``. The model held by ``self``
    is not retrained.

    Args:
        y (sparse.csr_matrix): Label matrix; rows are instances.
        x (sparse.csr_matrix): Feature matrix; rows are instances.
        n_folds (int): Number of cross-validation folds. Must be at least 2.
        batch_size (int): Number of validation rows predicted per batch.
        beamwidth (int): Passed to ``predict_values`` as ``beam_width``.
        metric (list): Metric names handed to ``metrics.get_metrics``.
        A_candidates (list): Candidate values for ``estimator_parameter``.
            Duplicates are evaluated only once.
        options (str, optional): Forwarded unchanged to ``train_tree``.
        K (optional): Forwarded unchanged to ``train_tree``. Defaults to 100.
        dmax (optional): Forwarded unchanged to ``train_tree``. Defaults to 10.

    Raises:
        ValueError: If ``n_folds`` < 2, ``batch_size`` < 1, ``metric`` or
            ``A_candidates`` is empty, or the evaluator reports no metrics.
    """
    if n_folds < 2:
        # With a single fold there would be no rows left to train on.
        raise ValueError(f"n_folds must be at least 2, got {n_folds}.")
    if batch_size < 1:
        raise ValueError(f"batch_size must be positive, got {batch_size}.")
    if not metric:
        raise ValueError("metric must contain at least one metric name.")
    # Drop duplicates (keeping order) so no candidate accumulates its score twice.
    candidates = list(dict.fromkeys(A_candidates))
    if not candidates:
        raise ValueError("A_candidates must contain at least one candidate.")

    # Contiguous fold boundaries: fold i covers rows [bounds[i], bounds[i + 1]).
    num_total = x.shape[0]
    bounds = [int(np.ceil(i / n_folds * num_total)) for i in range(n_folds + 1)]
    folds = [(x[lo:hi, :], y[lo:hi, :]) for lo, hi in zip(bounds, bounds[1:])]

    score = {m: {A: 0 for A in candidates} for m in metric}
    # Selection uses a single metric: the last one reported by the evaluator.
    selection_metric = None

    for held_out, (fold_x, fold_y) in enumerate(folds):
        train_y = sparse.vstack([fy for j, (_, fy) in enumerate(folds) if j != held_out])
        train_x = sparse.vstack([fx for j, (fx, _) in enumerate(folds) if j != held_out])
        model = train_tree(train_y, train_x, options, K, dmax)

        num_instances = fold_x.shape[0]
        for A in candidates:
            model.estimator_parameter = A
            # A fresh evaluator per candidate keeps the accumulated statistics separate.
            metric_eval = metrics.get_metrics(metric, num_classes=train_y.shape[1])
            for start in range(0, num_instances, batch_size):
                valid_x = fold_x[start : start + batch_size]
                valid_y = fold_y[start : start + batch_size]
                preds = model.predict_values(valid_x, beam_width=beamwidth)
                metric_eval.update(preds, valid_y)

            results = metric_eval.compute()
            for name in results.keys():
                score[name][A] += results[name]
                selection_metric = name

    if selection_metric is None:
        raise ValueError("No metric values were reported; cannot select estimator_parameter.")

    totals = score[selection_metric]
    # On ties, max() keeps the earliest candidate in A_candidates order.
    self.estimator_parameter = max(totals, key=totals.get)