From b6c87c8a2d2829227208c7b638b96dc96ddb5a98 Mon Sep 17 00:00:00 2001 From: Guan-Ting Date: Fri, 11 Jul 2025 11:32:04 +0000 Subject: [PATCH 1/7] Add probability estimation function and parameter to be arguments for tree prediction --- libmultilabel/linear/tree.py | 39 +++++++++++++++++++++++++++++------- linear_trainer.py | 2 ++ main.py | 15 ++++++++++++++ 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index fe6e94b4..4868b8d6 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -6,6 +6,7 @@ import scipy.sparse as sparse import sklearn.cluster import sklearn.preprocessing +from scipy.special import log_expit from tqdm import tqdm import psutil @@ -54,21 +55,42 @@ def __init__( self.node_ptr = node_ptr self.multiclass = False self._model_separated = False # Indicates whether the model has been separated for pruning tree. + self.estimator = self.sigmoid_A + def exp_L2(self, x, A): + return np.square(np.maximum(0, 1 - x)) + + def exp_L1(self, x, A): + return np.maximum(0, 1 - x) + + def sigmoid_A(self, x, A): + return log_expit(A * x) + def predict_values( self, x: sparse.csr_matrix, beam_width: int = 10, + estimation_function: str = "sigmoid_A", + estimation_parameter: int = 3, ) -> np.ndarray: """Calculate the probability estimates associated with x. Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10. + estimation_function (str, optional): The probability estimation function used in beamsearch. Default function is sigmoid-A. + estimation_parameter (int, optional): The extra parameter of probability estimation function if needed. Default value is 3. Returns: np.ndarray: A matrix with dimension number of instances * number of classes. """ + if estimation_function == "exp-L1": + self.estimator = self.exp_L1 + elif estimation_function == "exp-L2": + self.estimator = self.exp_L2 + elif estimation_function == "sigmoid_A": + self.estimator = self.sigmoid_A + if beam_width >= len(self.root.children): # Beam_width is sufficiently large; pruning not applied. # Calculates decision values for all nodes. @@ -78,8 +100,8 @@ def predict_values( if not self._model_separated: self._separate_model_for_pruning_tree() self._model_separated = True - all_preds = self._prune_tree_and_predict_values(x, beam_width) # number of instances * (number of labels + total number of metalabels) - return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])]) + all_preds = self._prune_tree_and_predict_values(x, beam_width, estimation_parameter) # number of instances * (number of labels + total number of metalabels) + return np.vstack([self._beam_search(all_preds[i], beam_width, estimation_parameter) for i in range(all_preds.shape[0])]) def _separate_model_for_pruning_tree(self): """ @@ -110,7 +132,7 @@ def _separate_model_for_pruning_tree(self): ) self.subtree_models.append(subtree_flatmodel) - def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray: + def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, estimation_parameter: int) -> np.ndarray: """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees. 
Only subtrees corresponding to the top beam_width candidates from the root are evaluated, @@ -119,6 +141,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int): Number of top candidate branches considered for prediction. + estimation_parameter (int): The extra parameter of probability estimation function if needed. Returns: np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels). @@ -129,7 +152,8 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) - children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds)) + print(estimation_parameter) + children_scores = 0.0 - self.estimator(root_preds, estimation_parameter) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -156,12 +180,13 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) return all_preds - def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarray: + def _beam_search(self, instance_preds: np.ndarray, beam_width: int, estimation_parameter:int) -> np.ndarray: """Predict with beam search using cached probability estimates for a single instance. Args: instance_preds (np.ndarray): A vector of cached probability estimates of each node, has dimension number of labels + total number of metalabels. beam_width (int): Number of candidates considered. + estimation_parameter (int): The extra parameter of probability estimation function if needed. Returns: np.ndarray: A vector with dimension number of classes. 
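
The hunks above introduce three candidate estimators, and the hunk below applies one inside the beam search. As a minimal standalone sketch of what they compute (module-level functions with toy inputs; the unused A argument of the two hinge variants is dropped for brevity):

import numpy as np
from scipy.special import log_expit

def exp_L1(x):
    # Hinge loss max(0, 1 - x): zero once the decision value clears the margin.
    return np.maximum(0, 1 - x)

def exp_L2(x):
    # Squared hinge loss: the penalty the tree model used before this patch.
    return np.square(np.maximum(0, 1 - x))

def sigmoid_A(x, A=3.0):
    # Negative log-sigmoid of the scaled decision value, i.e. the negative
    # log-probability of following this edge. The version in the hunk above
    # returns log_expit(A * x) without the minus sign; since the beam search
    # subtracts the estimator from the running score, PATCH 2/7 below adds
    # the negation.
    return -log_expit(A * x)

x = np.array([-1.0, 0.0, 2.0])
print(exp_L1(x))     # [2. 1. 0.]
print(exp_L2(x))     # [4. 1. 0.]
print(sigmoid_A(x))  # [3.0486 0.6931 0.0025]

Each function is meant to act as a non-negative per-edge loss that shrinks as the decision value grows, so that subtracting it rewards confident branches.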
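For intuition about the scores being cached and summed here: each edge contributes -estimator(pred) to the running score, so exponentiating at the leaves (as the final hunk of this file does) recovers a product of per-edge sigmoid probabilities along the root-to-leaf path. A toy two-level check, assuming the corrected -log_expit(A * x) convention from PATCH 2/7:

import numpy as np
from scipy.special import expit, log_expit

A = 3.0
root_preds = np.array([1.2, -0.4])   # toy decision values for the root's children
child_preds = np.array([0.8, -2.0])  # toy decision values under the first child

# Each edge contributes -estimator(pred) = log_expit(A * pred) to the score.
root_scores = 0.0 - (-log_expit(A * root_preds))
leaf_scores = root_scores[0] - (-log_expit(A * child_preds))

# exp(sum of log-probabilities) equals the product of sigmoids on the path.
print(np.exp(leaf_scores))                                # [0.8924 0.0024]
print(expit(A * root_preds[0]) * expit(A * child_preds))  # identical
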
@@ -179,7 +204,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = score - np.square(np.maximum(0, 1 - pred)) + children_score = score - self.estimator(pred, estimation_parameter) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -190,7 +215,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) + scores[node.label_map] = np.exp(score - self.estimator(pred, estimation_parameter)) return scores diff --git a/linear_trainer.py b/linear_trainer.py index b0524ee7..2805da1a 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -23,6 +23,8 @@ def linear_test(config, model, datasets, label_mapping): predict_kwargs = {} if model.name == "tree": predict_kwargs["beam_width"] = config.beam_width + predict_kwargs["estimation_function"] = config.estimation_function + predict_kwargs["estimation_parameter"] = config.estimation_parameter for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): slice = np.s_[i * config.eval_batch_size : (i + 1) * config.eval_batch_size] diff --git a/main.py b/main.py index 12564f6b..ad981004 100644 --- a/main.py +++ b/main.py @@ -229,6 +229,21 @@ def add_all_arguments(parser): default=10, help="The width of the beam search (default: %(default)s)", ) + parser.add_argument( + "--estimation_function", + type=str, + default="sigmoid_A", + choices=["exp-L1", "exp-L2", "sigmoid_A"], + help="The function that estimates probability in beam search (default: %(default)s)" + ) + + parser.add_argument( + "--estimation_parameter", + type=float, + default=3, + help="The parameter that probability estimation function may need (default: %(default)s)" + ) + # AttentionXML parser.add_argument( "--cluster_size", From 0554c8ba69abac99dcef7a98372da55d21b67173 Mon Sep 17 00:00:00 2001 From: Guan-Ting Date: Fri, 11 Jul 2025 11:35:11 +0000 Subject: [PATCH 2/7] fix sigmoid function --- libmultilabel/linear/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 4868b8d6..6b8aa8d7 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -64,7 +64,7 @@ def exp_L1(self, x, A): return np.maximum(0, 1 - x) def sigmoid_A(self, x, A): - return log_expit(A * x) + return -log_expit(A * x) def predict_values( self, From e77e9c3f2fe06baacd0bffaa7ea64823b70ce4d3 Mon Sep 17 00:00:00 2001 From: Guan-Ting Date: Fri, 11 Jul 2025 13:10:54 +0000 Subject: [PATCH 3/7] Save probability estimator parameter in class structure --- libmultilabel/linear/tree.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 6b8aa8d7..15ac57a2 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -56,15 +56,16 @@ def __init__( self.multiclass = False self._model_separated = False # Indicates whether the model has been separated for pruning tree. 
self.estimator = self.sigmoid_A + self.estimator_parameter = 3 - def exp_L2(self, x, A): + def exp_L2(self, x): return np.square(np.maximum(0, 1 - x)) - def exp_L1(self, x, A): + def exp_L1(self, x): return np.maximum(0, 1 - x) - def sigmoid_A(self, x, A): - return -log_expit(A * x) + def sigmoid_A(self, x): + return -log_expit(self.estimator_parameter * x) def predict_values( self, @@ -91,6 +92,8 @@ def predict_values( elif estimation_function == "sigmoid_A": self.estimator = self.sigmoid_A + self.estimator_parameter = estimation_parameter + if beam_width >= len(self.root.children): # Beam_width is sufficiently large; pruning not applied. # Calculates decision values for all nodes. @@ -100,8 +103,8 @@ def predict_values( if not self._model_separated: self._separate_model_for_pruning_tree() self._model_separated = True - all_preds = self._prune_tree_and_predict_values(x, beam_width, estimation_parameter) # number of instances * (number of labels + total number of metalabels) - return np.vstack([self._beam_search(all_preds[i], beam_width, estimation_parameter) for i in range(all_preds.shape[0])]) + all_preds = self._prune_tree_and_predict_values(x, beam_width) # number of instances * (number of labels + total number of metalabels) + return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])]) def _separate_model_for_pruning_tree(self): """ @@ -132,7 +135,7 @@ def _separate_model_for_pruning_tree(self): ) self.subtree_models.append(subtree_flatmodel) - def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, estimation_parameter: int) -> np.ndarray: + def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray: """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees. Only subtrees corresponding to the top beam_width candidates from the root are evaluated, @@ -141,7 +144,6 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int): Number of top candidate branches considered for prediction. - estimation_parameter (int): The extra parameter of probability estimation function if needed. Returns: np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels). @@ -152,8 +154,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) - print(estimation_parameter) - children_scores = 0.0 - self.estimator(root_preds, estimation_parameter) + children_scores = 0.0 - self.estimator(root_preds) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -180,13 +181,12 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, return all_preds - def _beam_search(self, instance_preds: np.ndarray, beam_width: int, estimation_parameter:int) -> np.ndarray: + def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarray: """Predict with beam search using cached probability estimates for a single instance. Args: instance_preds (np.ndarray): A vector of cached probability estimates of each node, has dimension number of labels + total number of metalabels. beam_width (int): Number of candidates considered. 
- estimation_parameter (int): The extra parameter of probability estimation function if needed. Returns: np.ndarray: A vector with dimension number of classes. @@ -204,7 +204,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int, estimation_p continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = score - self.estimator(pred, estimation_parameter) + children_score = score - self.estimator(pred) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -215,7 +215,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int, estimation_p for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(score - self.estimator(pred, estimation_parameter)) + scores[node.label_map] = np.exp(score - self.estimator(pred)) return scores From 5808ec04c71bb1f5ed3d2edaae96e35b21fe72a8 Mon Sep 17 00:00:00 2001 From: Guan-Ting Date: Thu, 17 Jul 2025 08:01:49 +0000 Subject: [PATCH 4/7] Remove unnecessary function and revise corresponding notes. --- libmultilabel/linear/tree.py | 25 +++++-------------------- linear_trainer.py | 1 - main.py | 9 +-------- 3 files changed, 6 insertions(+), 29 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 15ac57a2..a3a7aeda 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -55,23 +55,15 @@ def __init__( self.node_ptr = node_ptr self.multiclass = False self._model_separated = False # Indicates whether the model has been separated for pruning tree. - self.estimator = self.sigmoid_A self.estimator_parameter = 3 - def exp_L2(self, x): - return np.square(np.maximum(0, 1 - x)) - - def exp_L1(self, x): - return np.maximum(0, 1 - x) - def sigmoid_A(self, x): - return -log_expit(self.estimator_parameter * x) + return log_expit(self.estimator_parameter * x) def predict_values( self, x: sparse.csr_matrix, beam_width: int = 10, - estimation_function: str = "sigmoid_A", estimation_parameter: int = 3, ) -> np.ndarray: """Calculate the probability estimates associated with x. @@ -79,18 +71,11 @@ def predict_values( Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10. - estimation_function (str, optional): The probability estimation function used in beamsearch. Default function is sigmoid-A. - estimation_parameter (int, optional): The extra parameter of probability estimation function if needed. Default value is 3. + estimation_parameter (int, optional): The tunable parameter of probability estimation function, that is sigmoid(estimation_parameter * preds). Returns: np.ndarray: A matrix with dimension number of instances * number of classes. 
""" - if estimation_function == "exp-L1": - self.estimator = self.exp_L1 - elif estimation_function == "exp-L2": - self.estimator = self.exp_L2 - elif estimation_function == "sigmoid_A": - self.estimator = self.sigmoid_A self.estimator_parameter = estimation_parameter @@ -154,7 +139,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) - children_scores = 0.0 - self.estimator(root_preds) + children_scores = 0.0 + self.sigmoid_A(root_preds) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -204,7 +189,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = score - self.estimator(pred) + children_score = score + self.sigmoid_A(pred) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -215,7 +200,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(score - self.estimator(pred)) + scores[node.label_map] = np.exp(score + self.sigmoid_A(pred)) return scores diff --git a/linear_trainer.py b/linear_trainer.py index 2805da1a..637aa2f9 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -23,7 +23,6 @@ def linear_test(config, model, datasets, label_mapping): predict_kwargs = {} if model.name == "tree": predict_kwargs["beam_width"] = config.beam_width - predict_kwargs["estimation_function"] = config.estimation_function predict_kwargs["estimation_parameter"] = config.estimation_parameter for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): diff --git a/main.py b/main.py index ad981004..0f296ebe 100644 --- a/main.py +++ b/main.py @@ -229,19 +229,12 @@ def add_all_arguments(parser): default=10, help="The width of the beam search (default: %(default)s)", ) - parser.add_argument( - "--estimation_function", - type=str, - default="sigmoid_A", - choices=["exp-L1", "exp-L2", "sigmoid_A"], - help="The function that estimates probability in beam search (default: %(default)s)" - ) parser.add_argument( "--estimation_parameter", type=float, default=3, - help="The parameter that probability estimation function may need (default: %(default)s)" + help="The parameter for probability estimation function (default: %(default)s)" ) # AttentionXML From bcf81c2825d0c59c550ecbf7a7bf301e6b167e26 Mon Sep 17 00:00:00 2001 From: Guan-Ting Date: Wed, 13 Aug 2025 09:47:14 +0000 Subject: [PATCH 5/7] update cv --- libmultilabel/linear/tree.py | 53 ++++++++++++++++++++++++++++++++++-- main.py | 1 - 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index a3a7aeda..cb52ef53 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -11,6 +11,7 @@ import psutil from . import linear +from . import metrics __all__ = ["train_tree", "TreeModel"] @@ -64,7 +65,6 @@ def predict_values( self, x: sparse.csr_matrix, beam_width: int = 10, - estimation_parameter: int = 3, ) -> np.ndarray: """Calculate the probability estimates associated with x. 
@@ -77,8 +77,6 @@ def predict_values( np.ndarray: A matrix with dimension number of instances * number of classes. """ - self.estimator_parameter = estimation_parameter - if beam_width >= len(self.root.children): # Beam_width is sufficiently large; pruning not applied. # Calculates decision values for all nodes. @@ -203,6 +201,55 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra scores[node.label_map] = np.exp(score + self.sigmoid_A(pred)) return scores + def tuning_A_by_cross_validation( + self, + y: sparse.csr_matrix, + x: sparse.csr_matrix, + n_folds: int, + batch_size: int, + beamwidth: int, + metric: list, + A_candidates: list, + options: str = "", + K=100, + dmax=10, + ): + data_splits = [] + for n in range(n_folds): + start = np.ceil(n/n_folds*x.shape[0]).astype(int) + end = np.ceil((n+1)/n_folds*x.shape[0]).astype(int) + data_splits.append({'x':x[start:end, :], 'y':y[start:end ,:]}) + + score = {m:{A:0 for A in A_candidates} for m in metric} + for n in range(n_folds): + data_y = sparse.vstack([data_splits[j]["y"] for j in range(n_folds) if j != n]) + data_x = sparse.vstack([data_splits[j]["x"] for j in range(n_folds) if j != n]) + + model = train_tree( + data_y, + data_x, + options, + K, + dmax, + ) + + for A in A_candidates: + model.estimator_parameter = A + + num_instances = data_splits[n]["x"].shape[0] + num_batch = np.ceil(num_instances/batch_size).astype(int) + metric_eval = metrics.get_metrics(metric ,num_classes = data_y.shape[1]) + for i in range(num_batch): + valid_x = data_splits[n]["x"][i * batch_size : (i+1) * batch_size] + valid_y = data_splits[n]["y"][i * batch_size : (i+1) * batch_size] + preds = model.predict_values(valid_x, beam_width=beamwidth) + metric_eval.update(preds, valid_y) + + eval = metric_eval.compute() + for k in eval.keys(): + score[k][A] += eval[k] + + self.estimator_parameter = max(score[k], key=score[k].get) def train_tree( y: sparse.csr_matrix, diff --git a/main.py b/main.py index 0f296ebe..19469b2f 100644 --- a/main.py +++ b/main.py @@ -229,7 +229,6 @@ def add_all_arguments(parser): default=10, help="The width of the beam search (default: %(default)s)", ) - parser.add_argument( "--estimation_parameter", type=float, From 47339e43ba9379f513682858fe127a4ba389c251 Mon Sep 17 00:00:00 2001 From: Guan-Ting Date: Tue, 19 Aug 2025 19:54:56 +0000 Subject: [PATCH 6/7] remove command line change --- linear_trainer.py | 1 - main.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/linear_trainer.py b/linear_trainer.py index 637aa2f9..b0524ee7 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -23,7 +23,6 @@ def linear_test(config, model, datasets, label_mapping): predict_kwargs = {} if model.name == "tree": predict_kwargs["beam_width"] = config.beam_width - predict_kwargs["estimation_parameter"] = config.estimation_parameter for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): slice = np.s_[i * config.eval_batch_size : (i + 1) * config.eval_batch_size] diff --git a/main.py b/main.py index 120767aa..bfcc8688 100644 --- a/main.py +++ b/main.py @@ -229,12 +229,6 @@ def add_all_arguments(parser): default=10, help="The width of the beam search (default: %(default)s)", ) - parser.add_argument( - "--estimation_parameter", - type=float, - default=3, - help="The parameter for probability estimation function (default: %(default)s)" - ) # AttentionXML parser.add_argument( From acdf0c6e5b74de5536be6759179ebc965d780477 Mon Sep 17 00:00:00 2001 From: Guan-Ting Date: Tue, 19 Aug 2025 19:55:51 +0000 Subject: [PATCH 
7/7] remove extra blank line --- main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/main.py b/main.py index bfcc8688..3a1aa98c 100644 --- a/main.py +++ b/main.py @@ -229,7 +229,6 @@ def add_all_arguments(parser): default=10, help="The width of the beam search (default: %(default)s)", ) - # AttentionXML parser.add_argument( "--cluster_size",
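
Taken together with the removal of the command-line flags in PATCH 6/7, the estimation parameter is now tuned programmatically rather than passed at prediction time. A hypothetical end-to-end sketch of the resulting workflow (the data-loading lines follow LibMultiLabel's documented linear API; the dataset path, fold and batch sizes, metrics, and A_candidates are placeholder choices, not values from this series):

import libmultilabel.linear as linear

datasets = linear.load_dataset("txt", "data/eur-lex/train.txt", "data/eur-lex/test.txt")
preprocessor = linear.Preprocessor()
datasets = preprocessor.fit_transform(datasets)

# Train the tree model, then tune A on the training set by cross-validation.
model = linear.train_tree(datasets["train"]["y"], datasets["train"]["x"])
model.tuning_A_by_cross_validation(
    datasets["train"]["y"],
    datasets["train"]["x"],
    n_folds=3,
    batch_size=256,
    beamwidth=10,
    metric=["P@1", "P@5"],
    A_candidates=[1, 2, 3, 5, 10],
)

# The best A is stored on the model as self.estimator_parameter and is used
# by subsequent predictions.
preds = model.predict_values(datasets["test"]["x"], beam_width=10)

Note that tuning_A_by_cross_validation retrains one tree per fold, so a call with n_folds=3 costs roughly three extra train_tree runs on top of the initial training.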