Skip to content

processing module

one_hot_encode(indices_input, categorical_variables=None)

Performs one-hot encoding on the specified categorical variables. Variable groups are updated accordingly. Newly created variables are named: original_feature_name=value

Parameters:

Name Type Description Default
indices_input IndicesInput

IndiceInput object containing the data.

required
categorical_variables List[str]

name of the variable that should be encoded.

None

Returns:

Type Description
IndicesInput

the updated IndicesInput.

Source code in deel\fairsense\data_management\processing.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def one_hot_encode(
    indices_input: IndicesInput, categorical_variables: List[str] = None
) -> IndicesInput:
    """
    Performs one-hot encoding on the specified categorical variables. Variable groups
    are updated accordingly. Newly created variables are named:
    `original_feature_name=value`

    Args:
        indices_input: IndiceInput object containing the data.
        categorical_variables: name of the variable that should be encoded.

    Returns:
        the updated IndicesInput.

    """
    x = indices_input.x
    orig_var_groups = indices_input.variable_groups
    out_x = pd.get_dummies(
        x,
        prefix=categorical_variables,
        prefix_sep="=",
        dummy_na=False,
        drop_first=True,
        columns=categorical_variables,
    )
    out_var_groups = []
    # read the original groups
    for group in orig_var_groups:
        # iterate through the variable names in the group
        new_group = []
        for c in group:
            # in the group seek for the new variables names
            if c in out_x.columns:
                # variable has not been one hot encoded yet
                new_group += [c]
            else:
                # variable has been one hot encoded, find the new variable names and
                # add it to the group
                new_group += list(
                    filter(lambda cname: cname.startswith(c + "="), out_x.columns)
                )
        # group is finished add the new group to groups
        out_var_groups.append(new_group)
    return IndicesInput(
        x=out_x,
        y_true=indices_input.y_true,
        model=indices_input.model,
        variable_groups=out_var_groups,
        objective=indices_input.objective,
    )