# Source code for causalnex.discretiser.discretiser

# Copyright 2019-2020 QuantumBlack Visual Analytics Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
# (either separately or in combination, "QuantumBlack Trademarks") are
# trademarks of QuantumBlack. The License does not grant you any right or
# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
# Trademarks or any confusingly similar mark as a trademark for your product,
#     or use the QuantumBlack Trademarks in any other manner that might cause
# confusion in the marketplace, including but not limited to in advertising,
# on websites, or on software.
#
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tools to help discretise data."""

from typing import List

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class Discretiser(BaseEstimator, TransformerMixin):
    """Allows the discretisation of numeric data.

    Example:
    ::
        >>> import causalnex
        >>> import pandas as pd
        >>>
        >>> df = pd.DataFrame({'Age': [7, 12, 13, 18, 19, 22, 60]})
        >>>
        >>> from causalnex.discretiser import Discretiser
        >>> df["Transformed_Age"] = Discretiser(method="fixed",
        >>> numeric_split_points=[11,18,50]).transform(df["Age"])
        >>> df.to_dict()
        {'Age': {0: 7, 1: 12, 2: 13, 3: 18, 4: 19, 5: 22, 6: 60},
         'Transformed_Age': {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 3}}
    """

    def __init__(
        self,
        method: str = "uniform",
        num_buckets: int = None,
        outlier_percentile: float = None,
        numeric_split_points: List[float] = None,
        percentile_split_points: List[float] = None,
    ):
        """
        Creates a new Discretiser, that provides fit, fit_transform, and transform
        function to discretise data.

        All arguments are validated here, so an inconsistent configuration fails at
        construction time rather than when fit or transform is called.

        Args:
            method (str): can be one of:
                - uniform: discretise data into uniformly spaced buckets. Note,
                  complete uniformity cannot be guaranteed under all circumstances,
                  for example, if 5 data points are to split into 2 buckets, then
                  one will contain 2 points, and the other will contain 3.
                  Provide num_buckets.
                - quantile: discretise data according to the distribution of values.
                  For example, providing num_buckets=4 will discretise data into
                  4 buckets, [0-25th, 25th-50th, 50th-75th, 75th-100th] percentiles.
                  Provide num_buckets.
                - outlier: discretise data into 3 buckets - [low_outliers, normal,
                  high_outliers] based on outliers being below outlier_percentile,
                  or above 1-outlier_percentile. Provide outlier_percentile.
                - fixed: discretise according to pre-defined split points.
                  Provide numeric_split_points.
                - percentiles: discretise data according to the distribution of
                  percentiles values. Provide percentile_split_points.
            num_buckets (int): used by method=uniform and method=quantile.
            outlier_percentile: used by method=outlier. Must satisfy
                0 <= outlier_percentile < 0.5.
            numeric_split_points: used by method=fixed. To split such that values
                below 10 go into bucket 0, 10 to 20 go into bucket 1, and above 20
                go into bucket 2, provide [10, 21]. A value equal to a split point
                falls into the higher bucket. Any sorted sequence (list, tuple,
                1-D array) is accepted.
            percentile_split_points: used by method=percentiles. To split such that
                values below 10th percentiles go into bucket 0, 10th to below 75th
                percentiles go into bucket 1, and 75th percentiles and above go into
                bucket 2, provide [0.1, 0.75]. Any sorted sequence is accepted.

        Raises:
            ValueError: If an incorrect argument is passed.
        """
        self.method = method
        self.num_buckets = num_buckets
        self.outlier_percentile = outlier_percentile
        self.numeric_split_points = numeric_split_points
        self.percentile_split_points = percentile_split_points

        allowed_methods = ["uniform", "quantile", "outlier", "fixed", "percentiles"]

        if self.method not in allowed_methods:
            raise ValueError(
                f"{self.method} is not a recognised method. "
                f"Use one of: {' '.join(allowed_methods)}"
            )

        if self.method in {"uniform", "quantile"} and num_buckets is None:
            raise ValueError(f"{self.method} method expects num_buckets")

        if self.method == "outlier" and outlier_percentile is None:
            raise ValueError(f"{self.method} method expects outlier_percentile")

        if outlier_percentile is not None and not 0 <= outlier_percentile < 0.5:
            raise ValueError("outlier_percentile must be between 0 and 0.5")

        if self.method == "fixed" and numeric_split_points is None:
            raise ValueError(f"{self.method} method expects numeric_split_points")

        if numeric_split_points is not None and not self._is_monotonic(
            numeric_split_points
        ):
            raise ValueError("numeric_split_points must be monotonically increasing")

        if self.method == "percentiles" and percentile_split_points is None:
            raise ValueError(f"{self.method} method expects percentile_split_points")

        if percentile_split_points is not None and not all(
            0 <= p <= 1 for p in percentile_split_points
        ):
            raise ValueError("percentile_split_points must be between 0 and 1")

        if percentile_split_points is not None and not self._is_monotonic(
            percentile_split_points
        ):
            raise ValueError("percentile_split_points must be monotonically increasing")

    @staticmethod
    def _is_monotonic(points) -> bool:
        """Return True when the sequence of split points is in non-decreasing order."""
        # Compare list against list: comparing sorted() output directly with a
        # tuple is always unequal, and with a numpy array is element-wise.
        values = list(points)
        return values == sorted(values)

    def fit(self, data: np.ndarray) -> "Discretiser":
        """
        Fit where split points are based on the input data.

        Args:
            data (np.ndarray): values used to learn where split points exist.
                Any array-like is accepted; it is flattened before use and is
                never modified.

        Returns:
            self

        Raises:
            RuntimeError: If an attempt to fit fixed numeric_split_points is made.
            ValueError: If data contains no values.
        """
        # Only these methods learn split points from data.
        if self.method not in {"uniform", "quantile", "outlier", "percentiles"}:
            raise RuntimeError(f"cannot call fit using method={self.method}")

        # min/max/quantile do not need sorted input, so no copy-and-sort is made.
        x = np.asarray(data).ravel()
        if x.size == 0:
            raise ValueError("cannot fit split points on empty data")

        if self.method == "uniform":
            bucket_width = (np.max(x) - np.min(x)) / self.num_buckets
            self.numeric_split_points = [
                np.min(x) + bucket_width * (n + 1)
                for n in range(self.num_buckets - 1)
            ]
        elif self.method == "quantile":
            bucket_width = 1.0 / self.num_buckets
            quantiles = [bucket_width * (n + 1) for n in range(self.num_buckets - 1)]
            self.numeric_split_points = np.quantile(x, quantiles)
        elif self.method == "outlier":
            self.numeric_split_points = np.quantile(
                x, [self.outlier_percentile, 1 - self.outlier_percentile]
            )
        else:  # percentiles
            # np.percentile expects values in [0, 100]; the arguments are fractions.
            percentiles = [p * 100 for p in self.percentile_split_points]
            self.numeric_split_points = np.percentile(x, percentiles)

        return self

    def transform(self, data: np.ndarray) -> np.ndarray:
        """
        Transform the input data into discretised digits, based on the
        numeric_split_points that were either learned through using fit(),
        or from initialisation if method="fixed".

        Args:
            data (np.ndarray): values that will be transformed into discretised digits.

        Returns:
            input data transformed into discretised digits. Bucket i holds values v
            with split[i-1] <= v < split[i]; values below the first split point map
            to 0 and values at or above the last map to len(split points).

        Raises:
            RuntimeError: If no split points are available because fit() has not
                been called.
        """
        if self.numeric_split_points is None:
            raise RuntimeError(
                "numeric_split_points have not been set; call fit() before transform()"
            )

        return np.digitize(data, self.numeric_split_points, right=False)