Demo 2: Classification problem
Demo: Fairness on classification problems¶
# !pip install -e fairsense
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from deel.fairsense.data_management.factory import from_numpy, from_pandas
from deel.fairsense.data_management.processing import one_hot_encode
from deel.fairsense.indices.confidence_intervals import with_confidence_intervals
from deel.fairsense.indices.cvm import cvm_indices
from deel.fairsense.indices.standard_metrics import disparate_impact
from deel.fairsense.indices.sobol import sobol_indices
from deel.fairsense.utils.dataclasses import IndicesInput
from deel.fairsense.utils.fairness_objective import y_true, squared_error, y_pred, classification_error
from deel.fairsense.visualization.plots import cat_plot
from deel.fairsense.visualization.text import format_with_intervals
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Load the adult census dataset and prepare it for the fairness analysis.
data = pd.read_csv("data/adult.csv")
# Binarize the target: True when the person earns more than 50K.
data["income"] = data["income"] == ">50K"
# Remove columns that are not used in this demo.
data = data.drop(columns=["native-country", "fnlwgt"])
# data = data[data["native-country"] != "Holand-Netherlands"]
data.head()
Index computation: Disparate impact¶
First, we start by computing some indices on the training data to see whether the dataset is biased.
The first step consists of building the IndicesInput object that stores the data. Setting the target to y_true
means that we analyse the data; it can instead be set to y_pred
if we want to analyse predictions, or to squared_error
if we want to analyse the error.
We can then apply preprocessing such as one-hot encoding.
# Build the IndicesInput container with "income" as the studied target
# (objective y_true), then one-hot encode every categorical column
# (pandas object dtype, i.e. dtype kind "O").
indices_inputs = from_pandas(data, "income", target=y_true)
categorical_cols = [col for col in data.columns if data.dtypes[col] == "O"]
indices_inputs = one_hot_encode(indices_inputs, categorical_cols)
We then declare the index computation functions. The results are stored in an IndicesOutput
object. Raw values can be accessed with .values
; note that 0 refers to total independence and 1 refers to total dependence.
# Compute the disparate-impact index on the data; the result object exposes
# its raw values through `.values` (0 = total independence, 1 = total
# dependence, per the surrounding notes).
indices_outputs = disparate_impact(indices_inputs)
indices_outputs.values
# Bar plot of the computed indices, one facet per index.
cat_plot(indices_outputs, plot_per="index", kind="bar")
plt.show()
It is also possible to decorate any index function with with_confidence_intervals
to use bootstrapping to compute confidence intervals. We can also use the + operator to compute multiple indices simultaneously. Results with confidence intervals can be visualized either textually with format_with_intervals
or graphically with cat_plot.
# Decorate disparate_impact so it is evaluated on 30 bootstrap splits,
# producing confidence intervals around each index value.
di_with_ci = with_confidence_intervals(n_splits=30)(disparate_impact)
indices_outputs = di_with_ci(indices_inputs)
# Textual view of the intervals at the given quantile.
format_with_intervals(indices_outputs, quantile=0.05)
# Graphical view of the same results.
cat_plot(indices_outputs, plot_per="index", kind="bar")
plt.show()
2. Train a model and analyse its sensitivity¶
First, we split the data and then train a basic model on it.
# Shuffle the rows, then split them 80% / 20% into train and test sets.
# NOTE(review): this shuffles `data` only -- `indices_inputs` (built above
# from the unshuffled frame) is sliced positionally further down, so its
# train/test rows do not match data_train/data_test. Confirm this is intended.
data = data.sample(frac=1.)  # shuffle data
split = int(len(data) * 0.8)  # hoisted: the split index was computed twice
data_train = data.iloc[:split]
data_test = data.iloc[split:]
Similarly, we build the IndicesInput
object.
# Rebuild train/test IndicesInput objects by slicing the preprocessed inputs
# positionally with the same 80/20 ratio used for `data`.
# NOTE(review): `indices_inputs` was created before `data` was shuffled, so
# these slices hold the *unshuffled* rows -- TODO confirm this is intended.
n_train = int(len(data) * 0.8)  # hoisted: was computed four times below
indices_inputs_train = IndicesInput(
    x=indices_inputs.x.iloc[:n_train],
    y_true=indices_inputs.y_true.iloc[:n_train],
    variable_groups=indices_inputs.variable_groups,
)
indices_inputs_test = IndicesInput(
    x=indices_inputs.x.iloc[n_train:],
    y_true=indices_inputs.y_true.iloc[n_train:],
    variable_groups=indices_inputs.variable_groups,
)
Then we train a basic model: a DecisionTree. Note that this analysis can be applied to any callable that handles numpy arrays as inputs.
# Fit a simple decision tree on the training inputs; any numpy-compatible
# callable could be analysed in the same way.
model = DecisionTreeClassifier()
model.fit(indices_inputs_train.x, indices_inputs_train.y_true)
# Report accuracy on both splits to check for over-fitting.
predictions_train = model.predict(indices_inputs_train.x)
predictions_test = model.predict(indices_inputs_test.x)
train_acc = accuracy_score(indices_inputs_train.y_true, predictions_train)
val_acc = accuracy_score(indices_inputs_test.y_true, predictions_test)
print(f"train acc: {train_acc}, val acc {val_acc}")
We set the model and the objective.
# Plug the trained model into every IndicesInput and switch the objective to
# y_pred so the indices analyse the model's *predictions*.
indices_inputs_train.model = model.predict
indices_inputs_train.objective = y_pred
indices_inputs_test.model = model.predict
indices_inputs_test.objective = y_pred
indices_inputs.model = model.predict
indices_inputs.objective = y_pred
# Wrap each index with bootstrap confidence intervals (30 splits each).
cvm_with_ci = with_confidence_intervals(n_splits=30)(cvm_indices)
di_with_ci = with_confidence_intervals(n_splits=30)(disparate_impact)
sobol_with_ci = with_confidence_intervals(n_splits=30)(sobol_indices)
# Fix: these indices are computed on the *train* inputs, so the result is now
# named accordingly (it was previously, misleadingly, indices_outputs_test).
indices_outputs_train = cvm_with_ci(indices_inputs_train) + di_with_ci(indices_inputs_train)  # + sobol_with_ci(indices_inputs_train)
format_with_intervals(indices_outputs_train, quantile=0.1)
cat_plot(indices_outputs_train, plot_per="index", kind="box", col_wrap=3)
plt.show()
# Switch the objective to the classification error to analyse where the model
# makes mistakes, this time computing the indices on the *test* inputs.
# NOTE(review): earlier cells assign the public `.objective` attribute, while
# these lines write the private `._objective` -- confirm both are equivalent.
indices_inputs_train._objective = classification_error
indices_inputs_test._objective = classification_error
# Fewer bootstrap splits (15) than above to keep the computation shorter.
cvm_with_ci = with_confidence_intervals(n_splits=15)(cvm_indices)
sobol_with_ci = with_confidence_intervals(n_splits=15)(sobol_indices)
indices_outputs_test_error = cvm_with_ci(indices_inputs_test)# + sobol_with_ci(indices_inputs_test)
format_with_intervals(indices_outputs_test_error, quantile=0.1)
# One facet per variable this time, to compare indices per feature.
cat_plot(indices_outputs_test_error, plot_per="variable", kind="box", col_wrap=4)
plt.show()