Source code for causalnex.estimator.em

# Copyright 2019-2020 QuantumBlack Visual Analytics Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
# (either separately or in combination, "QuantumBlack Trademarks") are
# trademarks of QuantumBlack. The License does not grant you any right or
# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
# Trademarks or any confusingly similar mark as a trademark for your product,
#     or use the QuantumBlack Trademarks in any other manner that might cause
# confusion in the marketplace, including but not limited to in advertising,
# on websites, or on software.
#
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains the implementation of ``EMSingleLatentVariable``.

``EMSingleLatentVariable`` is a class that implements expectation-maximisation (EM) algorithm
for a single latent variable
"""
import logging
import os
from time import time
from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from causalnex.structure import StructureModel
from causalnex.utils.data_utils import chunk_data, count_unique_rows
from causalnex.utils.pgmpy_utils import cpd_multiplication

INITIAL_PARAMS = ["random"]


class EMSingleLatentVariable:  # pylint: disable=too-many-arguments, too-many-instance-attributes
    """
    This class uses Expectation-Maximisation (EM) to learn the parameters of a single latent
    variable (LV) in a Bayesian network. The user may also CONSTRAIN the optimisation:
    constraints help the algorithm find a local optimum closer to the solution we believe to be correct.

    The setting is:

    Input:
        - a StructureModel representing the whole network or any sub-graph containing the
          Markov Blanket of the LV
        - data as a dataframe. The LV must be in the dataframe, with missing values represented
          by ``np.nan``
        - constraints:
            - Box - a hard constraint; forbids the solution from falling outside certain boundaries
            - Priors - establishes Dirichlet priors for every parameter

    Run:
        - using the method ``run``, or by manually alternating E and M steps

    Result:
        - CPTs involving the latent variable, learnt by EM, found in the attribute ``cpds``
        - CPTs not involving the LV are not learned. They must be learned separately by MLE;
          this is faster and the result is the same.

    Example (an end-to-end sketch is also included at the bottom of this module):
        >>> em = EMSingleLatentVariable(sm=sm, data=data, lv_name=lv_name, node_states=node_states)
        >>> em.run(n_runs=20)  # run EM until convergence (at most 20 iterations)
        >>> # or run E and M steps separately
        >>> for i in range(10):  # run EM 10 times
        >>>     em.e_step()
        >>>     em.m_step()
    """
    def __init__(
        self,
        sm: StructureModel,
        data: pd.DataFrame,
        lv_name: str,
        node_states: Dict[str, list],
        initial_params: Union[str, Dict[str, pd.DataFrame]] = "random",
        seed: int = 22,
        box_constraints: Dict[str, Tuple[pd.DataFrame, pd.DataFrame]] = None,
        priors: Dict[str, pd.DataFrame] = None,
        non_missing_data_factor: int = 1,
        n_jobs: int = 1,
    ):
        """
        Args:
            sm: structure. The only requirement is that it contains all edges in the Markov Blanket
                of the latent variable. Note: all variable names must be non-empty strings
            data: dataframe. It must contain all variables in the Markov Blanket of the latent variable.
                Include one column with the latent variable name, filled with ``np.nan`` where the LV is
                not observed. If some observations of the LV are available, include them in that column.
            lv_name: name of the latent variable
            node_states: dictionary mapping each variable name to its list of states
            initial_params: way to initialise parameters. Can be:
                - "random": random values (default)
                - a dictionary of dataframes, which is then used as the initialisation
            seed: seed for the random generator (used if parameters are initialised randomly)
            box_constraints: minimum and maximum values for each model parameter. Specified as a
                dictionary mapping:
                - Node
                - two dataframes, in order: Min(P(Node|Par(Node))) and Max(P(Node|Par(Node)))
            priors: priors, provided as a mapping Node -> dataframe with Dirichlet priors for P(Node|Par(Node))
            non_missing_data_factor: a weight applied to the non-missing data samples. The effect is as if
                the amount of data provided were bigger. Empirically, it helps to set the factor to 10 if
                the non-missing data is ~1% of the dataset
            n_jobs: if -1, all CPUs are used. If 1 is given, no parallel computing code is used at all,
                which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) CPUs are used.
                Thus for n_jobs = -2, all CPUs but one are used.
        """
        np.random.seed(seed)

        self.sm = sm
        self.lv_name = lv_name
        self.node_states = node_states
        self.initial_params = initial_params
        self.seed = seed
        self.non_missing_data_factor = non_missing_data_factor
        self.n_jobs = max(1, n_jobs if n_jobs >= 0 else (os.cpu_count() + 1 + n_jobs))

        self.box_constraints = self._check_box_constraints(box_constraints)
        self.priors = self._check_priors(priors)

        # These are the nodes for which we compute CPDs. We do not care about CPDs of parents of the
        # latent variable or parents of children of the LV, because these CPDs do not depend on the LV
        # and are not affected by the fact that we do not observe it.
        # **IMPORTANT**: The first name in the list MUST BE lv_name (for the multiplication to work correctly)
        self.valid_nodes = [self.lv_name] + list(self.sm.successors(self.lv_name))

        # Initialise CPDs
        self.cpds = self._initialise_network_cpds()
        self._old_cpds = None
        self._mb_partitions = None
        self._sufficient_stats = {}
        self._index_columns_lookup = {}

        # Compute aggregated data based on the Markov blanket
        self._mb_data, self._mb_partitions = self._get_markov_blanket_data(data)

        # Build the index-columns lookup for each valid node
        self._lv_states = self.node_states[self.lv_name]
        self._mb_product = cpd_multiplication(
            [self.cpds[node] for node in self.valid_nodes]
        )

        for node in self.valid_nodes:
            self._mb_partitions[node]["_lookup_"] = self._mb_partitions[node].apply(
                lambda record: self._build_lookup(
                    node, record  # pylint: disable=cell-var-from-loop
                ),
                axis=1,
            )
    @property
    def _logger(self):
        """Obtains a logger for this specific class"""
        return logging.getLogger(self.__class__.__name__)
    def run(self, n_runs: int, stopping_delta: float = 0.0, verbose: int = 0):
        """
        Runs E and M steps until convergence (``stopping_delta``) or until the maximum number of
        iterations (``n_runs``) is reached.

        Args:
            n_runs: max number of EM alternations
            stopping_delta: convergence is reached if the max difference between the current and the
                last iteration's CPDs is smaller than ``stopping_delta``
            verbose: amount of printing
        """
        if verbose:
            self._logger.info(
                "* Iteration 0: likelihood = %.4f",
                self.compute_total_likelihood(),
            )

        for i in range(n_runs):
            t_start = time()
            self.e_step()  # Expectation step
            e_duration = time() - t_start

            t_start = time()
            self.m_step()  # Maximisation step
            m_duration = time() - t_start

            self.apply_box_constraints()  # Apply box constraints
            delta = self._stopping_criteria()  # Compute change in parameters

            if verbose:
                self._logger.info(
                    "* Iteration %d: "
                    "likelihood = %.4f | "
                    "max(|theta - theta_old|) = %.4f | "
                    "duration = %.4fs (E-step), %.4fs (M-step)",
                    (i + 1),
                    self.compute_total_likelihood(),
                    delta,
                    e_duration,
                    m_duration,
                )

            if delta < stopping_delta:
                break
    def e_step(self) -> Dict[str, pd.DataFrame]:
        """
        Performs the Expectation step. This boils down to computing the expected sufficient
        statistics M[X, U] for every "valid" node X, where U = Par(X).

        Returns:
            The expected sufficient statistics of each node X
        """
        # This is a product of elements in the Markov Blanket of the latent variable.
        # NOTE: Convert the product dataframe to a dictionary to speed up the E-step
        self._mb_product = cpd_multiplication(
            [self.cpds[node] for node in self.valid_nodes],
            normalize=True,
        ).to_dict(orient="dict")

        # Get M[X, U] for X being each valid node and U being its parents (Daphne Koller's notation)
        for node in self.valid_nodes:
            node_mb_data = self._mb_partitions[node]

            # Initialise ESS with zeros (or prior values) and then increase from data
            sufficient_stats_df = self._initialize_sufficient_stats(node)
            sufficient_stats = sufficient_stats_df.to_dict(orient="dict")

            # Update ESS based on all data records (observations)
            if self.n_jobs == 1:
                results = self._update_sufficient_stats(node_mb_data["_lookup_"])

                for updates in results:
                    for idx, cols, val in updates:
                        sufficient_stats[cols][idx] += val
            else:
                results = Parallel(n_jobs=self.n_jobs, prefer="threads")(
                    delayed(self._update_sufficient_stats)(chunk_df["_lookup_"])
                    for chunk_df in chunk_data(node_mb_data, self.n_jobs * 2)
                )

                for chunk_results in results:
                    for updates in chunk_results:
                        for idx, cols, val in updates:
                            sufficient_stats[cols][idx] += val

            # Register sufficient statistics as a pandas dataframe
            self._sufficient_stats[node] = pd.DataFrame(
                sufficient_stats,
                index=sufficient_stats_df.index,
                columns=sufficient_stats_df.columns,
            )

        return self._sufficient_stats
    def m_step(self) -> Dict[str, pd.DataFrame]:
        """
        Maximisation step. It boils down to normalising the likelihood table previously created:

        $$ \\theta_{[X | U]} = M[X, U] / M[U] = M[X, U] / \\sum_X M[X, U] $$

        Returns:
            New updated CPDs
        """
        self._old_cpds = self.cpds  # Store old CPDs

        self.cpds = {
            node: self._normalise(self._sufficient_stats[node])
            for node in self.valid_nodes
        }
        return self.cpds
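    # Illustrative worked example (not part of the original source), assuming a hypothetical
    # two-state node X and a single parent configuration u:
    #
    #     M[X, u] = [3.0, 1.0]  ->  theta[X | u] = [3.0 / 4.0, 1.0 / 4.0] = [0.75, 0.25]
    #
    # i.e. the M-step divides each column of the expected-sufficient-statistics table by its
    # column total, which is what `_normalise` implements further below.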
    def compute_total_likelihood(self) -> float:
        """
        Computes the LOG likelihood of the whole dataset (or the MAP objective, if priors are given)
        under the current parameters.

        Returns:
            Total likelihood over the dataset
        """
        cpd_prods = cpd_multiplication(
            [self.cpds[n] for n in self.valid_nodes],
            normalize=False,
        )
        proba_of_row = cpd_prods.sum(axis=0)

        def compute_likelihood_stub(record: Dict) -> float:
            t = tuple(record[el] for el in proba_of_row.index.names)

            if np.isnan(record[self.lv_name]):
                likelihood = proba_of_row.loc[t]
            else:
                likelihood = cpd_prods.loc[record[self.lv_name], t]

            return np.log(likelihood) * record["count"]

        return self._mb_data.apply(compute_likelihood_stub, axis=1).sum()
    def apply_box_constraints(self):
        """
        If the CPDs fall outside the box constraints provided, bring them back inside the constraints.
        """
        if self.box_constraints is None:
            return

        for node in self.valid_nodes:
            min_vals, max_vals = self.box_constraints[node]
            cpd = self.cpds[node]
            # `where` keeps values where the condition is True and replaces them where it is False
            cpd.where(cpd < max_vals, max_vals, inplace=True)
            cpd.where(cpd > min_vals, min_vals, inplace=True)
            self.cpds[node] = self._normalise(cpd)
    @staticmethod
    def get_default_priors(
        sm: StructureModel,
        node_states: Dict[str, list],
        lv_name: str,
    ) -> Dict[str, pd.DataFrame]:
        """
        The default Dirichlet priors (zero values).

        Args:
            sm: model structure
            node_states: node states
            lv_name: name of latent variable

        Returns:
            Dictionary of pandas dataframes initialised with zeros
        """
        valid_node_set = set([lv_name] + list(sm.successors(lv_name)))

        return {
            node: EMSingleLatentVariable._initialise_node_cpd(node, node_states, sm)
            for node in sm.nodes
            if node in valid_node_set
        }
    @staticmethod
    def get_default_box(
        sm: StructureModel,
        node_states: Dict[str, list],
        lv_name: str,
    ) -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]:
        """
        Get boxes with min = 0 and max = 1 for all parameters.

        Args:
            sm: model structure
            node_states: node states
            lv_name: name of latent variable

        Returns:
            Dictionary mapping each node to a tuple of two elements: the first being the lower value
            constraint and the second the maximum value constraint
        """
        valid_node_set = set([lv_name] + list(sm.successors(lv_name)))
        boxes = {}

        for node in sm.nodes:
            if node in valid_node_set:
                cpd = EMSingleLatentVariable._initialise_node_cpd(node, node_states, sm)
                min_vals, max_vals = cpd.copy(), cpd.copy()
                min_vals.loc[:] = 0
                max_vals.loc[:] = 1
                boxes[node] = (min_vals, max_vals)

        return boxes
    def _build_lookup(
        self,
        node: str,
        record: Dict,
    ) -> Tuple[List[Tuple], Tuple]:
        """
        Build a lookup table based on an individual data record/instance

        Args:
            node: Node name
            record: A data record/instance

        Returns:
            List of CPD index-columns-count triplets, and tuple of Markov blanket columns
        """
        count = record["count"]
        node_value = record[node]
        node_cpd = self.cpds[node]
        lv_states = self.node_states[self.lv_name]
        idx_cols_counts = []

        if not np.isnan(record[self.lv_name]):
            mb_cols = None

            if node_cpd.shape[1] == 1:
                # if the probability is unconditional (i.e. P(Z)), the column names are [""]
                idx_cols_counts.append((node_value, "", count))
            else:
                cols = tuple(record[j] for j in node_cpd.columns.names)
                idx_cols_counts.append((node_value, cols, count))
        else:
            mb_cols = tuple(record[j] for j in self._mb_product.columns.names)

            if node_cpd.shape[1] == 1:
                # if the probability is unconditional (i.e. P(Z)), the column names are [""]
                for lv_value in lv_states:
                    index = lv_value if node == self.lv_name else node_value
                    idx_cols_counts.append((index, "", count))
            else:
                if node == self.lv_name:
                    cols = tuple(record[j] for j in node_cpd.columns.names)

                    for lv_value in lv_states:
                        idx_cols_counts.append((lv_value, cols, count))
                else:
                    for lv_value in lv_states:
                        cols = tuple(
                            lv_value if j == self.lv_name else record[j]
                            for j in node_cpd.columns.names
                        )
                        idx_cols_counts.append((node_value, cols, count))

        return idx_cols_counts, mb_cols

    def _update_sufficient_stats(
        self,
        lookup: pd.Series,
    ) -> List[List[Tuple]]:
        """
        Update the expected sufficient statistics based on a given lookup series

        Args:
            lookup: Lookup table for index, columns and count

        Returns:
            List of lists of update tuples
        """
        updates = []

        for idx_cols_counts, mb_cols in lookup.values:
            if mb_cols is None:
                # Update the ESS: increase it by the number of times this row appears in the dataset
                updates.append(idx_cols_counts)
            else:
                # Update the ESS: increase it by:
                #   (number of times this row appears in the dataset) * (probability of Z assuming that value)
                # Because the LV is not observed, we consider all possible values it can assume and,
                # instead of adding 1 to the likelihood M[X=x_i, U=u_i], we add p(Z=z_i|observations)
                prob_lv_given_mb = self._mb_product[mb_cols]

                updates.append(
                    [
                        (idx, cols, count * prob_lv_given_mb[lv_value])
                        for (idx, cols, count), lv_value in zip(
                            idx_cols_counts, self._lv_states
                        )
                    ]
                )

        return updates

    def _initialize_sufficient_stats(self, node: str) -> pd.DataFrame:
        """
        Likelihood of the node and its parents, initialised with zeros (or prior values) and then
        increased from data.

        The likelihood is not a conditional expression (i.e. X|U), but a joint expression (i.e. X,U).
        However, we use the same structure as the CPT to store that likelihood. That structure is:
        a pandas table with the index being X values and the columns being U values as a MultiIndex.

        Args:
            node: Node key

        Returns:
            Dataframe containing the likelihood of the node and its parents
        """
        if self.priors is None:
            sufficient_stats_df = self.cpds[node].copy()
            sufficient_stats_df.loc[:] = 0
        else:
            sufficient_stats_df = self.priors[node].copy()

        return sufficient_stats_df

    def _initialise_network_cpds(self) -> Dict[str, pd.DataFrame]:
        """
        Initialise all the CPDs according to the choice made in the constructor. It can be:
            - filling CPDs with random values,
            - filling CPDs with specific values given by the user,
            - filling CPDs with uniform probabilities (tends to have a bad effect on convergence)

        Returns:
            Dictionary of CPD dataframes

        Raises:
            ValueError: if `initial_params` is neither a dictionary nor one of the supported strings
        """
        if isinstance(self.initial_params, str) and self.initial_params in set(
            INITIAL_PARAMS
        ):
            valid_node_set = set(self.valid_nodes)
            cpds = {}

            for node in self.sm.nodes:
                if node in valid_node_set:
                    cpd = self._initialise_node_cpd(node, self.node_states, self.sm)
                    cpd.loc[:] = 1

                    if self.initial_params == "random":
                        cpd.loc[:] = np.random.random(cpd.shape)

                    cpd = cpd / cpd.sum(axis=0)
                    cpds[node] = cpd  # Update dictionary

        elif isinstance(self.initial_params, dict):
            self._check_initial_params_dict()
            cpds = self.initial_params
        else:
            raise ValueError(
                f"`initial_params` must be a dictionary or one of {INITIAL_PARAMS}"
            )

        return cpds

    @staticmethod
    def _initialise_node_cpd(
        node: str,
        node_states: Dict[str, List],
        sm: StructureModel,
    ) -> pd.DataFrame:
        """
        Initialise the CPD of a specified node

        Args:
            node: Node name
            node_states: States of the node
            sm: Structure model

        Returns:
            CPD dataframe associated with the node

        Raises:
            ValueError: if node is not found in the network
        """
        parents = list(sorted(sm.predecessors(node)))
        columns = [""]

        if len(parents) > 0:
            columns = pd.MultiIndex.from_product(
                [sorted(node_states[p]) for p in parents],
                names=parents,
            )

        indices = pd.Index(data=sorted(node_states[node]), name=node)
        values = np.zeros(shape=(len(indices), len(columns)))
        return pd.DataFrame(index=indices, columns=columns, data=values)

    def _get_markov_blanket_data(
        self,
        df: pd.DataFrame,
    ) -> Tuple[pd.DataFrame, Dict[str, pd.DataFrame]]:
        """
        Keeps only the features that belong to the latent variable's Markov blanket,
        groups and counts identical rows, and multiplies non-missing data counts by a factor.

        Args:
            df: Raw data

        Returns:
            Aggregated data, as well as a data partition for each node
        """
        # Add a column for the latent variable if not already present
        if self.lv_name not in df.columns:
            df[self.lv_name] = np.nan

        # Get the counts of each unique record
        valid_cols = set()

        for node in self.valid_nodes:
            valid_cols.add(node)
            valid_cols.update(self.sm.predecessors(node))

        mb_data = count_unique_rows(df[list(valid_cols)])
        indices = ~mb_data.isna().any(axis=1)
        mb_data.loc[indices, "count"] *= self.non_missing_data_factor

        # Partition data based on the Markov blanket of each node
        mb_product = cpd_multiplication([self.cpds[node] for node in self.valid_nodes])
        mb_partitions = {}

        for node in self.valid_nodes:
            valid_cols = list(
                set(
                    [node, "count"]
                    + list(self.sm.predecessors(node))
                    + mb_product.columns.names
                )
            )
            mb_partitions[node] = count_unique_rows(mb_data[valid_cols])

        return mb_data, mb_partitions

    @staticmethod
    def _normalise(df: pd.DataFrame) -> pd.DataFrame:
        """
        Normalises a dataframe so that each column sums to one

        Args:
            df: Raw dataframe

        Returns:
            Normalised dataframe
        """
        norm_df = df / df.sum(axis=0)
        norm_df.fillna(1.0 / df.shape[0], inplace=True)
        return norm_df

    def _stopping_criteria(self) -> float:
        """
        Maximum change, in absolute value, between the parameters of the last EM iteration and
        the parameters of the current EM iteration

        Returns:
            Maximum absolute difference between new and old CPDs
        """
        return max(
            (
                (self._old_cpds[node] - self.cpds[node]).abs().values.max()
                for node in self.valid_nodes
            ),
            default=-1,  # return -1 if the valid nodes list is empty
        )

    def _check_initial_params_dict(self):
        """
        Checks the initial parameter dictionary

        Raises:
            ValueError: when the initial parameter dictionary keys are different from the valid nodes,
                or when a CPD provided in the initial parameter dictionary has an incorrect format
        """
        if sorted(self.valid_nodes) != sorted(self.initial_params.keys()):
            raise ValueError(
                "If `initial_params` is a dictionary, it has to map `valid nodes` to "
                "corresponding CPTs. A valid node is: L.V. or Successors(L.V.)"
            )

        for node in self.valid_nodes:
            df = self.initial_params[node]

            check = (
                isinstance(df, pd.DataFrame)
                and (df.index.name == node)
                and (list(df.index) == self.node_states[node])
            )
            check = check and (
                (np.all(df.columns == "") and (not self.sm.predecessors(node)))
                or (df.columns.names == list(self.sm.predecessors(node)))
            )

            if not check:  # pragma: no cover
                raise ValueError(
                    "CPTs provided in `initial_params` do not correspond to the expected format"
                )

    @staticmethod
    def _check_priors(priors: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
        """
        Checks whether the priors are passed in the right format and whether they are valid

        Args:
            priors: Prior distributions to check

        Returns:
            Verified priors

        Raises:
            ValueError: when prior distributions are invalid
        """
        if priors is None or isinstance(priors, dict):
            return priors

        raise ValueError(f"Invalid priors {priors}")

    @staticmethod
    def _check_box_constraints(
        box_constraints: Dict[str, Tuple[pd.DataFrame, pd.DataFrame]],
    ) -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]:
        """
        Checks whether the box constraints are passed in the right format and whether they are valid

        Args:
            box_constraints: Box constraints to check

        Returns:
            Verified box constraints

        Raises:
            ValueError: when box constraints are invalid
        """
        if box_constraints is None or isinstance(box_constraints, dict):
            return box_constraints

        raise ValueError(f"Invalid box constraints {box_constraints}")
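

# ---------------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): a minimal end-to-end example of
# how EMSingleLatentVariable might be driven, assuming a hypothetical toy network d -> z -> c in
# which the latent variable "z" is never observed. All variable names, states and the generated
# dataframe below are invented for illustration only.
# ---------------------------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)  # make the verbose EM progress messages visible

    _sm = StructureModel()
    _sm.add_edges_from([("d", "z"), ("z", "c")])

    _rng = np.random.default_rng(0)
    _data = pd.DataFrame(
        {
            "d": _rng.choice(["d0", "d1"], size=500),
            "c": _rng.choice(["c0", "c1"], size=500),
            "z": np.nan,  # the latent variable is entirely unobserved
        }
    )

    _em = EMSingleLatentVariable(
        sm=_sm,
        data=_data,
        lv_name="z",
        node_states={"d": ["d0", "d1"], "z": ["z0", "z1"], "c": ["c0", "c1"]},
    )
    _em.run(n_runs=20, stopping_delta=0.01, verbose=1)

    print(_em.cpds["z"])  # P(z | d), learnt by EM
    print(_em.cpds["c"])  # P(c | z), learnt by EM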