# Source code for causalnex.discretiser.discretiser_strategy

# Copyright 2019-2020 QuantumBlack Visual Analytics Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
# (either separately or in combination, "QuantumBlack Trademarks") are
# trademarks of QuantumBlack. The License does not grant you any right or
# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
# Trademarks or any confusingly similar mark as a trademark for your product,
#     or use the QuantumBlack Trademarks in any other manner that might cause
# confusion in the marketplace, including but not limited to in advertising,
# on websites, or on software.
#
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tools to help discretise data."""

import logging
from copy import deepcopy
from typing import Any, Dict, List

import pandas as pd
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from causalnex.discretiser.abstract_discretiser import (
    AbstractSupervisedDiscretiserMethod,
)
from causalnex.utils.decision_tree_tools import extract_thresholds_from_dtree

try:
    from mdlp.discretization import MDLP
except ImportError:
    MDLP = None
    logging.warning("MDLP was not imported successfully")


class DecisionTreeSupervisedDiscretiserMethod(AbstractSupervisedDiscretiserMethod):
    """Allows the discretisation of continuous features based on the split thresholds
    of either sklearn's DecisionTreeRegressor or DecisionTreeClassifier.
    DecisionTreeSupervisedDiscretiserMethod is inherited from
    AbstractSupervisedDiscretiserMethod.
    When instantiated, we have an object with .fit method to learn discretisation
    thresholds from data and .transform method to process the input.

    Example:
    ::
        >>> import pandas as pd
        >>> import numpy as np
        >>> from causalnex.discretiser.discretiser_strategy import DecisionTreeSupervisedDiscretiserMethod
        >>> from sklearn.datasets import load_iris
        >>> iris = load_iris()
        >>> X, y = iris["data"], iris["target"]
        >>> names = iris["feature_names"]
        >>> data = pd.DataFrame(X, columns=names)
        >>> data["target"] = y
        >>> dt_multi = DecisionTreeSupervisedDiscretiserMethod(
        >>>     mode="multi", tree_params={"max_depth": 3, "random_state": 2020}
        >>> )
        >>> tree_discretiser = dt_multi.fit(
        >>>     feat_names=[
        >>>         "sepal length (cm)",
        >>>         "sepal width (cm)",
        >>>         "petal length (cm)",
        >>>         "petal width (cm)",
        >>>     ],
        >>>     dataframe=data,
        >>>     target="target",
        >>>     target_continuous=False,
        >>> )
        >>> discretised_data = tree_discretiser.transform(data[["petal width (cm)"]])
        >>> discretised_data.values.ravel()
        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
        2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    """

    def __init__(
        self,
        mode: str = "single",
        split_unselected_feat: bool = False,
        tree_params: Dict[str, Any] = None,
    ):
        """
        This Discretiser Method uses Decision Trees to predict the target.
        The cutting points on the Decision Tree become the chosen discretisation
        thresholds.

        If the target is a continuous variable, we fit a `DecisionTreeRegressor` to
        discretise the data. Otherwise, we fit a Classifier.

        Args:
            mode (str): Either 'single' or 'multi'.
                - if single, Train a univariate decision tree for each continuous
                  variable being discretised. The splitting points of the decision
                  tree become discretiser fixed points
                - if multi, Train a decision tree over all the variables passed.
                  The splitting points of each variable used in the Decision tree
                  become the thresholds for discretisation
            split_unselected_feat (bool): only applicable if self.mode = 'multi'.
                - if True, features not selected by the decision tree will be
                  discretised using 'single' mode with the same tree parameters
                - if False, features not selected by the decision tree will be
                  left unchanged
            tree_params: keyword arguments, which are parameters used for
                `sklearn.tree.DecisionTreeClassifier`/`sklearn.tree.DecisionTreeRegressor`

        Raises:
            KeyError: if an incorrect argument is passed
        """
        super().__init__()
        # Default keeps trees shallow so only a handful of thresholds are learned.
        tree_params = tree_params or {"max_depth": 2}
        self.tree_params = tree_params
        self.feat_names = None
        self.map_thresholds = {}
        if mode not in ["single", "multi"]:
            raise KeyError(
                f"mode, `{mode}` is not valid, please choose in ['single', 'multi']"
            )
        self.mode = mode
        self.split_unselected_feat = split_unselected_feat

    def fit(
        self,
        feat_names: List[str],
        target: str,
        dataframe: pd.DataFrame,
        target_continuous: bool,
    ) -> "DecisionTreeSupervisedDiscretiserMethod":
        """
        The fit method allows DecisionTrees to learn split thresholds from the
        input data.

        Args:
            feat_names (List[str]): a list of features to be discretised
            target (str): name of variable that is going to be used as a target
                for the decision tree
            dataframe (pd.DataFrame): pandas dataframe of input data
            target_continuous (bool): a boolean that indicates if the target
                variable is continuous

        Returns:
            self: DecisionTreeSupervisedDiscretiserMethod object with learned
            split thresholds from the decision tree
        """
        # Unfitted template tree; a fresh copy is fitted per feature (or once in
        # 'multi' mode) so fits never share learned state.
        base_tree = (
            DecisionTreeRegressor(**self.tree_params)
            if target_continuous
            else DecisionTreeClassifier(**self.tree_params)
        )
        self.feat_names = feat_names
        # Reset so repeated fit() calls do not keep thresholds of stale features.
        self.map_thresholds = {}

        if self.mode == "single":
            # One univariate tree per feature: its split points are the thresholds.
            for feat in feat_names:
                dtree = deepcopy(base_tree)
                dtree.fit(dataframe[[feat]], dataframe[[target]])
                self.map_thresholds[feat] = extract_thresholds_from_dtree(dtree, 1)[0]

        elif self.mode == "multi":
            # One multivariate tree over all features: each feature's split points
            # (possibly empty, if the tree never split on it) become thresholds.
            dtree = deepcopy(base_tree)
            dtree.fit(dataframe[feat_names], dataframe[[target]])
            threshold_list = extract_thresholds_from_dtree(dtree, len(feat_names))
            for feat, threshold in zip(feat_names, threshold_list):
                self.map_thresholds[feat] = threshold

            if self.split_unselected_feat:
                # Fall back to 'single'-mode fitting for features the multivariate
                # tree never split on.
                for feat, thres in self.map_thresholds.items():
                    if thres.size == 0:
                        dtree = deepcopy(base_tree)
                        dtree.fit(dataframe[[feat]], dataframe[[target]])
                        self.map_thresholds[feat] = extract_thresholds_from_dtree(
                            dtree, 1
                        )[0]
            else:
                # Drop features without learned thresholds and warn the user.
                no_use = [
                    feat
                    for feat, thres in self.map_thresholds.items()
                    if thres.size == 0
                ]
                for feat in no_use:
                    del self.map_thresholds[feat]
                if no_use:
                    logging.warning(
                        "%s not selected by the decision tree. No discretisation thresholds were learned. "
                        "Consider setting split_unselected_feat = True or discretise them using single mode",
                        no_use,
                    )
        return self
class MDLPSupervisedDiscretiserMethod(AbstractSupervisedDiscretiserMethod):
    """Allows discretisation of continuous features using the MDLP algorithm.

    Example:
    ::
        >>> import pandas as pd
        >>> import numpy as np
        >>> from causalnex.discretiser.discretiser_strategy import MDLPSupervisedDiscretiserMethod
        >>> from sklearn.datasets import load_iris
        >>> iris = load_iris()
        >>> X, y = iris["data"], iris["target"]
        >>> names = iris["feature_names"]
        >>> data = pd.DataFrame(X, columns=names)
        >>> data["target"] = y
        >>> discretiser = MDLPSupervisedDiscretiserMethod(
        >>>     {"min_depth": 0, "random_state": 2020, "min_split": 1e-3, "dtype": int}
        >>> )
        >>> discretiser.fit(
        >>>     feat_names=["sepal length (cm)"],
        >>>     dataframe=data,
        >>>     target="target",
        >>>     target_continuous=False,
        >>> )
        >>> discretised_data = discretiser.transform(data[["sepal length (cm)"]])
        >>> discretised_data.values.ravel()
        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 2, 2, 2, 1, 2,
        1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 0, 2, 2, 2,
        1, 1, 1, 2, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 2, 2, 2, 2, 0, 2, 2, 2,
        2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2])
    """

    def __init__(
        self,
        mdlp_args: Dict[str, Any] = None,
    ):
        """
        This method of discretisation applies MDLP to discretise the data.

        Args:
            mdlp_args: keyword arguments, which are parameters used for
                `mdlp.discretization.MDLP`. Commonly used keys are:
                - min_depth: the minimum depth of the interval splitting
                - min_split: the minimum size to split a bin
                - dtype: the type of the array returned by the `transform()` method

        Raises:
            ImportError: if mdlp-discretization is not installed successfully
        """
        super().__init__()
        # Fail fast: the guarded import at the top of the module sets MDLP to
        # None when mdlp-discretization is unavailable.
        if MDLP is None:
            raise ImportError(
                "mdlp-discretisation was not installed and imported successfully"
            )
        mdlp_args = mdlp_args or {"min_depth": 0, "min_split": 1e-3, "dtype": int}
        self.mdlp_args = mdlp_args
        self.feat_names = None
        self.map_feat_transformer = {}
        # Unfitted template; fit() deep-copies it once per feature.
        self.mdlp = MDLP(**mdlp_args)

    def fit(
        self,
        feat_names: List[str],
        target: str,
        dataframe: pd.DataFrame,
        target_continuous: bool,
    ) -> "MDLPSupervisedDiscretiserMethod":
        """
        The fit method allows MDLP to learn split thresholds from the input data.
        The target feature cannot be continuous.

        Args:
            feat_names (List[str]): a list of features to be discretised
            target (str): name of the variable that is going to be used as a
                target for MDLP
            dataframe (pd.DataFrame): pandas dataframe of input data
            target_continuous (bool): boolean that indicates if target variable
                is continuous

        Returns:
            self: MDLPSupervisedDiscretiserMethod object with learned split
            thresholds from the mdlp algorithm

        Raises:
            ValueError: if the target is continuous
        """
        self.feat_names = feat_names
        self.map_feat_transformer = {}
        # BUG FIX: previously only map_feat_transformer was reset, so thresholds
        # learned by an earlier fit() with different features survived a refit.
        self.map_thresholds = {}
        if target_continuous:
            raise ValueError(
                "Target variable should not be continuous when using MDLP."
            )

        for feat in feat_names:
            # Fit one fresh MDLP transformer per feature; its learned cut points
            # become that feature's discretisation thresholds.
            mdlp = deepcopy(self.mdlp)
            mdlp.fit(dataframe[[feat]], dataframe[[target]])
            self.map_thresholds[feat] = mdlp.cut_points_[0]
        return self