Source code for causalnex.discretiser.discretiser_strategy

# Copyright 2019-2020 QuantumBlack Visual Analytics Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
# (either separately or in combination, "QuantumBlack Trademarks") are
# trademarks of QuantumBlack. The License does not grant you any right or
# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
# Trademarks or any confusingly similar mark as a trademark for your product,
#     or use the QuantumBlack Trademarks in any other manner that might cause
# confusion in the marketplace, including but not limited to in advertising,
# on websites, or on software.
#
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tools to help discretise data."""

import logging
from copy import deepcopy
from typing import Any, Dict, List

import pandas as pd
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from causalnex.discretiser.abstract_discretiser import (
    AbstractSupervisedDiscretiserMethod,
)
from causalnex.utils.decision_tree_tools import extract_thresholds_from_dtree

# mdlp-discretization is treated as an optional dependency: when the import
# fails, MDLP is bound to None and a warning is logged instead of breaking the
# import of this module. MDLPSupervisedDiscretiserMethod checks for None and
# raises ImportError when it is instantiated.
try:
    from mdlp.discretization import MDLP
except ImportError:
    MDLP = None
    logging.warning("MDLP was not imported successfully")


class DecisionTreeSupervisedDiscretiserMethod(AbstractSupervisedDiscretiserMethod):
    """Allows the discretisation of continuous features based on the split thresholds of either
    sklearn's DecisionTreeRegressor or DecisionTreeClassifier.
    DecisionTreeSupervisedDiscretiserMethod is inherited from AbstractSupervisedDiscretiserMethod.
    When instantiated, we have an object with .fit method to learn discretisation thresholds from data
    and .transform method to process the input.


    Example:
    ::
        >>> import pandas as pd
        >>> import numpy as np
        >>> from causalnex.discretiser.discretiser_strategy import DecisionTreeSupervisedDiscretiserMethod
        >>> from sklearn.datasets import load_iris
        >>> iris = load_iris()
        >>> X, y = iris["data"], iris["target"]
        >>> names = iris["feature_names"]
        >>> data = pd.DataFrame(X, columns=names)
        >>> data["target"] = y
        >>> dt_multi = DecisionTreeSupervisedDiscretiserMethod(
        >>>     mode="multi", tree_params={"max_depth": 3, "random_state": 2020}
        >>> )
        >>> tree_discretiser = dt_multi.fit(
        >>>     feat_names=[
        >>>         "sepal length (cm)",
        >>>         "sepal width (cm)",
        >>>         "petal length (cm)",
        >>>         "petal width (cm)",
        >>>     ],
        >>>     dataframe=data,
        >>>     target="target",
        >>>     target_continuous=False,
        >>> )
        >>> discretised_data = tree_discretiser.transform(data[["petal width (cm)"]])
        >>> discretised_data.values.ravel()
        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
           2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
           2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

    """

    def __init__(
        self,
        mode: str = "single",
        split_unselected_feat: bool = False,
        tree_params: Dict[str, Any] = None,
    ):
        """
        This Discretiser Method uses Decision Trees to predict the target.
        The cutting points on the Decision Tree become the chosen discretisation thresholds

        If the target is a continuous variable, we fit a `DecisionTreeRegressor` to discretise the data.
        Otherwise, we fit a Classifier.

        Args:
            mode (str): Either 'single' or 'multi'.
            - if single, Train a univariate decision tree for each continuous variable being discretised.
                The splitting points of the decision tree become discretiser fixed points
            - if multi, Train a decision tree over all the variables passed.
                The splitting points of each variable used in the Decision tree become the thresholds for discretisation
            split_unselected_feat (bool): only applicable if self.mode = 'multi'.
            - if True, features not selected by the decision tree will be discretised using 'single' mode
            with the same tree parameters
            - if False, no thresholds are stored for features not selected by the decision tree
            and a warning listing them is logged
            tree_params: keyword arguments, which are parameters
            used for `sklearn.tree.DecisionTreeClassifier`/`sklearn.tree.DecisionTreeRegressor`.
            When None (or empty), `{"max_depth": 2}` is used.
        Raises:
            KeyError: if `mode` is neither 'single' nor 'multi'
        """

        super().__init__()
        tree_params = tree_params or {"max_depth": 2}
        self.tree_params = tree_params
        self.feat_names = None
        self.map_thresholds = {}
        if mode not in ["single", "multi"]:
            raise KeyError(
                f"mode, `{mode}` is not valid, please choose in ['single', 'multi']"
            )
        self.mode = mode
        self.split_unselected_feat = split_unselected_feat

    def fit(
        self,
        feat_names: List[str],
        target: str,
        dataframe: pd.DataFrame,
        target_continuous: bool,
    ) -> "DecisionTreeSupervisedDiscretiserMethod":
        """
        The fit method allows DecisionTrees to learn split thresholds from the input data

        Any thresholds learned by a previous call are discarded.

        Args:
            feat_names (List[str]): a list of feature to be discretised
            target (str): name of variable that is going to be used a target for the decision tree
            dataframe (pd.DataFrame): pandas dataframe of input data
            target_continuous (bool): a boolean that indicates if the target variable is continuous

        Returns:
            self: DecisionTreeSupervisedDiscretiserMethod object with learned split thresholds from the decision tree
        """
        # Regressor for a continuous target, classifier otherwise; both are
        # configured with the same user-supplied tree parameters.
        dtree = (
            DecisionTreeRegressor(**self.tree_params)
            if target_continuous
            else DecisionTreeClassifier(**self.tree_params)
        )
        self.feat_names = feat_names
        self.map_thresholds = {}

        if self.mode == "single":
            # One univariate tree per feature. Each tree is a copy of the
            # previous one, refitted on the next feature.
            for feat in feat_names:
                dtree = deepcopy(dtree)

                dtree.fit(dataframe[[feat]], dataframe[[target]])
                # NOTE(review): the helper is assumed to return one threshold
                # array per input feature; with a single feature we take entry 0.
                self.map_thresholds[feat] = extract_thresholds_from_dtree(dtree, 1)[0]

        elif self.mode == "multi":
            # A single tree over all requested features at once.
            dtree = deepcopy(dtree)
            dtree.fit(dataframe[feat_names], dataframe[[target]])
            threshold_list = extract_thresholds_from_dtree(dtree, len(feat_names))

            for feat, threshold in zip(feat_names, threshold_list):
                self.map_thresholds[feat] = threshold

            if self.split_unselected_feat:
                # Features with no thresholds from the joint tree get their own
                # univariate tree. Iterate over a snapshot because the values
                # are reassigned inside the loop.
                for feat, thres in list(self.map_thresholds.items()):
                    if thres.size == 0:
                        dtree = deepcopy(dtree)
                        dtree.fit(dataframe[[feat]], dataframe[[target]])
                        self.map_thresholds[feat] = extract_thresholds_from_dtree(
                            dtree, 1
                        )[0]

            else:
                # Drop features for which the joint tree produced no thresholds
                # and tell the user which ones were affected.
                no_use = [
                    feat
                    for feat, thres in self.map_thresholds.items()
                    if thres.size == 0
                ]
                for feat in no_use:
                    del self.map_thresholds[feat]
                if no_use:
                    logging.warning(
                        "%s not selected by the decision tree. No discretisation thresholds were learned. "
                        "Consider setting split_unselected_feat = True or discretise them using single mode",
                        no_use,
                    )

        return self


class MDLPSupervisedDiscretiserMethod(AbstractSupervisedDiscretiserMethod):
    """Allows discretisation of continuous features using mdlp algorithm

    Example:
    ::
        >>> import pandas as pd
        >>> import numpy as np
        >>> from causalnex.discretiser.discretiser_strategy import MDLPSupervisedDiscretiserMethod
        >>> from sklearn.datasets import load_iris
        >>> iris = load_iris()
        >>> X, y = iris["data"], iris["target"]
        >>> names = iris["feature_names"]
        >>> data = pd.DataFrame(X, columns=names)
        >>> data["target"] = y
        >>> discretiser = MDLPSupervisedDiscretiserMethod(
        >>>     {"min_depth": 0, "random_state": 2020, "min_split": 1e-3, "dtype": int}
        >>> )
        >>> discretiser.fit(
        >>>     feat_names=["sepal length (cm)"],
        >>>     dataframe=data,
        >>>     target="target",
        >>>     target_continuous=False,
        >>> )
        >>> discretised_data = discretiser.transform(data[["sepal length (cm)"]])
        >>> discretised_data.values.ravel()

        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 2, 2, 2, 1, 2,
               1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 0, 2, 2, 2,
               1, 1, 1, 2, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 2, 2, 2, 2, 0, 2, 2, 2,
               2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2])

    """

    def __init__(
        self,
        mdlp_args: Dict[str, Any] = None,
    ):
        """
        This method of discretisation applies MDLP to discretise the data

        Args:
            mdlp_args: keyword arguments, which are parameters used for `mdlp.discretization.MDLP`.
                When None (or empty), `{"min_depth": 0, "min_split": 1e-3, "dtype": int}` is used.
                Keys used by the default configuration:
                - min_depth: The minimum depth of the interval splitting.
                - min_split: The minimum size to split a bin
                - dtype: The type of the array returned by the `transform()` method
        Raises:
            ImportError: if mdlp-discretization is not installed successfully
        """
        super().__init__()
        mdlp_args = mdlp_args or {"min_depth": 0, "min_split": 1e-3, "dtype": int}
        self.mdlp_args = mdlp_args
        self.feat_names = None
        # Learned cut points per feature, filled in by fit().
        self.map_thresholds = {}
        # Not written to by this class beyond being reset; kept so existing
        # callers that read the attribute continue to work.
        self.map_feat_transformer = {}
        if MDLP is None:
            raise ImportError(
                "mdlp-discretisation was not installed and imported successfully"
            )
        # Unfitted template; fit() works on copies so this object stays pristine.
        self.mdlp = MDLP(**mdlp_args)

    def fit(
        self,
        feat_names: List[str],
        target: str,
        dataframe: pd.DataFrame,
        target_continuous: bool,
    ) -> "MDLPSupervisedDiscretiserMethod":
        """
        The fit method allows MDLP to learn split thresholds from the input data.
        The target feature cannot be continuous

        Thresholds learned by a previous call are discarded, so after fitting
        only the features in `feat_names` have thresholds. If the target is
        continuous the call fails before any state is modified.

        Args:
            feat_names (List[str]): a list of feature to be discretised
            target (str): name of the variable that is going to be used a target for MDLP
            dataframe (pd.DataFrame): pandas dataframe of input data
            target_continuous (bool): boolean that indicates if target variable is continuous.

        Returns:
            self: MDLPSupervisedDiscretiserMethod object with learned split thresholds from mdlp algorithm

        Raises:
            ValueError: if the target is continuous
        """
        # Validate first so a rejected call leaves the previous fit intact.
        if target_continuous:
            raise ValueError(
                "Target variable should not be continuous when using MDLP."
            )

        self.feat_names = feat_names
        # Reset the mapping that is actually populated below; otherwise
        # thresholds of features from an earlier fit would linger.
        self.map_thresholds = {}
        self.map_feat_transformer = {}

        for feat in feat_names:
            # Fit an independent copy of the template on this single feature.
            mdlp = deepcopy(self.mdlp)

            mdlp.fit(dataframe[[feat]], dataframe[[target]])
            # One feature was fitted, so its cut points are the first entry.
            self.map_thresholds[feat] = mdlp.cut_points_[0]

        return self