Source code for fairensics.data.synthetic_dataset

"""A synthetic 2D data set with two features and one protected attribute
implemented as AIF360 BinaryLabelDataset.

The code is adopted from: https://github.com/mbilalzafar/fair-classification.

Additionally, a function to scatter plot the points is available.

TODO: pass labels, colors etc. to the plot function
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from scipy.stats import multivariate_normal  # generating synthetic data


[docs]class SyntheticDataset(BinaryLabelDataset):
    """Synthetic data set with two features and one protected attribute.

    The data set is randomly generated from two gaussians each time.
    Both protected attribute and label are binary and features are numerical.
    """

    _UNPRIVILEGED_GROUP_NEGATIVE_LABEL = "Prot. -ve"
    _UNPRIVILEGED_GROUP_POSITIVE_LABEL = "Prot. +ve"
    _PRIVILEGED_GROUP_NEGATIVE_LABEL = "Non-prot. -ve"
    _PRIVILEGED_GROUP_POSITIVE_LABEL = "Non-prot. +ve"

[docs]    def __init__(
        self,
        n_samples=1000,
        label_name="label",
        feature_one_name="feature_1",
        feature_two_name="feature_2",
        favorable_label=1,
        unfavorable_label=0,
        protected_attribute_name="protected_attribute",
        privileged_class=1,
        unprivileged_class=0,
        sd=1122334455,
        mu_1=(2, 2),
        sigma_1=((5, 1), (1, 5)),
        mu_2=(-2, -2),
        sigma_2=((10, 1), (1, 3)),
        initial_discrimination=4.0,
    ):
        """
        Args:
            n_samples (int) : the number of samples to generate
            label_name (str): name of the column storing the target variable
            feature_one_name (str): name of the first unprotected feature
            feature_two_name (str): name of the second unprotected feature
            favorable_label (int): label considered positive
            unfavorable_label (int):  label considered negative
            protected_attribute_name (str): the name of the protected attribute
            privileged_class (int): class of protected attribute considered
                positive
            unprivileged_class (int): class of protected attribute considered
                negative
            sd (int): seed for random generator
            mu_1 (float, float): mean of positive group cluster
            sigma_1 ((float, float), (float, float)): covariance of positive
                group cluster
            mu_2 (float, float): mean of negative group cluster
            sigma_2 ((float, float), (float, float)):  covariance of negative
                group cluster
            initial_discrimination (float): initial discrimination factor
        """
        np.random.seed(sd)

        self.n_samples = n_samples
        self._mu_1 = np.array(mu_1)
        self._sigma_1 = np.array(sigma_1)
        self._mu_2 = np.array(mu_2)
        self._sigma_2 = np.array(sigma_2)
        self._disc_factor = np.pi / initial_discrimination

        X1, X2, X_s, y = self._gen_data(
            favorable_label,
            unfavorable_label,
            privileged_class,
            unprivileged_class,
        )

        df = pd.DataFrame(
            {
                feature_one_name: X1,
                feature_two_name: X2,
                protected_attribute_name: X_s,
                label_name: y,
            }
        )

        super(SyntheticDataset, self).__init__(
            favorable_label=favorable_label,
            unfavorable_label=unfavorable_label,
            df=df,
            label_names=[label_name],
            protected_attribute_names=[protected_attribute_name],
            privileged_protected_attributes=[[privileged_class]],
            unprivileged_protected_attributes=[[unprivileged_class]],
        )

    def _gen_gaussian(self, mean_in, cov_in, class_label):
        """Generates n_samples from gaussian distribution"""
        nv = multivariate_normal(mean=mean_in, cov=cov_in)
        X = nv.rvs(self.n_samples)
        y = np.ones(self.n_samples, dtype=float) * class_label
        return nv, X, y

    def _gen_data(
        self,
        favorable_label,
        unfavorable_label,
        privileged_class,
        unprivileged_class,
    ):
        """Code for generating the synthetic data.

        We will have two non-sensitive features and one sensitive feature.

        Args:
            favorable_label (int): the label considered positive.
            unfavorable_label (int): the label considered negative.
            privileged_class (int): the class in protected attribute considered
                privileged.
            unprivileged_class (int): the class in protected attribute
                considered unprivileged.

        Returns:
            X_0 (np.ndarary): 1D array, the first unprotected feature.
            X_1 (np.ndarary): 1D array, the second unprotected feature.
            X_s (np.ndarary): 1D array, binary, the protected attribute.
            y (np.ndarary): 1D array, binary, the labels.

        """
        nv1, X1, y1 = self._gen_gaussian(
            self._mu_1, self._sigma_1, favorable_label
        )
        nv2, X2, y2 = self._gen_gaussian(
            self._mu_2, self._sigma_2, unfavorable_label
        )

        X = np.vstack((X1, X2))
        y = np.hstack((y1, y2))

        # shuffle the data
        perm = list(range(0, self.n_samples * 2))
        np.random.shuffle(perm)
        X = X[perm]
        y = y[perm]

        rotation_mult = np.array(
            [
                [np.cos(self._disc_factor), -np.sin(self._disc_factor)],
                [np.sin(self._disc_factor), np.cos(self._disc_factor)],
            ]
        )

        X_aux = np.dot(X, rotation_mult)

        # Generate the sensitive feature here
        x_sensitive = []
        for i in range(0, len(X)):
            x = X_aux[i]

            # probability for each cluster that the point belongs to it
            p1 = nv1.pdf(x)
            p2 = nv2.pdf(x)

            # normalize the probabilities
            s = p1 + p2
            p1 = p1 / s

            r = np.random.uniform()

            if r < p1:  # the first cluster is the positive class
                x_sensitive.append(privileged_class)
            else:
                x_sensitive.append(unprivileged_class)

        x_sensitive = np.array(x_sensitive)

        return X[:, 0], X[:, 1], x_sensitive, y

    # noinspection Duplicates
[docs]    def plot(self, num_to_draw=200):
        """Plot subsample of data with unprotected features on x and y axis."""
        x_draw = self.features[:num_to_draw, :2]  # ignore the protected column
        y_draw = self.labels[:num_to_draw, 0]
        x_sensitive_draw = self.protected_attributes[:num_to_draw, 0]

        idx = x_sensitive_draw == self.unprivileged_protected_attributes[0]
        X_unprivileged = x_draw[idx]
        idx = x_sensitive_draw == self.privileged_protected_attributes[0]
        X_privileged = x_draw[idx]

        idx = x_sensitive_draw == self.unprivileged_protected_attributes[0]
        y_unprivileged = y_draw[idx]
        idx = x_sensitive_draw == self.privileged_protected_attributes[0]
        y_privileged = y_draw[idx]

        # pylint: disable=duplicate-code
        plt.scatter(
            X_unprivileged[y_unprivileged == self.favorable_label][:, 0],
            X_unprivileged[y_unprivileged == self.favorable_label][:, 1],
            color="green",
            marker="x",
            label=self._UNPRIVILEGED_GROUP_POSITIVE_LABEL,
        )

        plt.scatter(
            X_unprivileged[y_unprivileged == self.unfavorable_label][:, 0],
            X_unprivileged[y_unprivileged == self.unfavorable_label][:, 1],
            color="red",
            marker="x",
            label=self._UNPRIVILEGED_GROUP_NEGATIVE_LABEL,
        )

        plt.scatter(
            X_privileged[y_privileged == self.favorable_label][:, 0],
            X_privileged[y_privileged == self.favorable_label][:, 1],
            color="green",
            facecolors="none",
            label=self._PRIVILEGED_GROUP_POSITIVE_LABEL,
        )

        plt.scatter(
            X_privileged[y_privileged == self.unfavorable_label][:, 0],
            X_privileged[y_privileged == self.unfavorable_label][:, 1],
            color="red",
            facecolors="none",
            label=self._PRIVILEGED_GROUP_NEGATIVE_LABEL,
        )

        plt.legend()
        plt.xlabel(self.feature_names[0])
        plt.ylabel(self.feature_names[1])

        plt.xlim((np.min(x_draw[:, 0]) - 2, np.max(x_draw[:, 0]) + 2))
        plt.ylim((np.min(x_draw[:, 1]) - 2, np.max(x_draw[:, 1]) + 2))

        plt.show()
Source code for fairensics.data.synthetic_dataset

fairensics

Navigation

Related Topics