etiq.datasets package

Submodules

etiq.datasets.abstract_dataset module

Defines the abstract dataset class.

class etiq.datasets.abstract_dataset.AbstractDataset

Bases: Generic[T, S]

Abstract dataset class

abstract property all_feature_names: Tuple[str, ...]

Return the names of the features stored in the dataset

Returns:

A tuple containing the names of all features in the dataset

Return type:

Tuple[str, …]

abstract property all_features: T
abstract all_identical(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') bool
abstract cache_predictions(predictions: T, prediction_name: str, condition: str) None
abstract property categorical_features: Tuple[str, ...]

Return the names of the categorical features stored in the dataset

Returns:

A tuple containing the names of all categorical features in the dataset

Return type:

Tuple[str, …]

abstract check_column_ordering_violations(gt_feature: str, lt_feature: str, ids: Tuple[str, ...] | None = None) Iterable
abstract check_duplicate_row_violations(duplicate_features_subset: Tuple[str, ...] | None = None, ids: Tuple[str, ...] | None = None) Iterable
abstract check_missing_ids(id_feature: str, primary_ids: List[str] | None = None) Iterable
abstract property checksum: Dict[str, str]
abstract compare_feature(other_dataset: AbstractDataset[T, S], afeature: str) bool
abstract property continuous_features: Tuple[str, ...]

Returns the names of the continuous features stored in the dataset

Returns:

A tuple containing the names of all continuous features in the dataset

Return type:

Tuple[str, …]

abstract property data_backend: DataBackend
abstract property date_feature_names: Tuple[str, ...]

Return the names of the features that store dates

Returns:

A tuple containing the names of the date features

Return type:

Tuple[str, …]

abstract property date_features: T
abstract property date_features_testing: T
abstract property date_features_training: T
abstract property date_features_validation: T
abstract feature(name: str, mask: str = '') S

Returns all data in the dataset for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The data for the specified feature

Return type:

S
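
A minimal usage sketch (not part of the generated reference): the feature accessors on a concrete, pandas-backed dataset produced by SimpleDatasetBuilder, documented under Module contents below. The column names are illustrative.

import pandas as pd
from etiq import SimpleDatasetBuilder

df = pd.DataFrame({"age": [25, 31, 47, 52, 38], "income": [0, 1, 1, 0, 1]})
adataset = SimpleDatasetBuilder.dataset(features=df, label="income")

ages = adataset.feature("age")                   # all rows of the "age" feature
train_ages = adataset.feature_training("age")    # training split only
valid_ages = adataset.feature_validation("age")  # validation split only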

abstract feature_combined_std(other_dataset: AbstractDataset[T, S], name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') float
abstract feature_max(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Any
abstract feature_min(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Any
abstract property feature_names: Tuple[str, ...]

Return the names of the features stored in the dataset, excluding the id and date features

Returns:

A tuple containing the names of all features in the dataset, excluding the id and date features

Return type:

Tuple[str, …]

abstract feature_testing(name: str, mask: str = '') S

Returns testing data for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The testing data for the specified feature

Return type:

S

abstract feature_training(name: str, mask: str = '') S

Returns training data for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The training data for the specified feature

Return type:

S

abstract feature_validation(name: str, mask: str = '') S

Returns validation data for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The validation data for the specified feature

Return type:

S

abstract property features: T
abstract property features_testing: T
abstract property features_training: T
abstract property features_validation: T
abstract filter(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) AbstractDataset[T, S]
abstract filter_count(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) int
abstract generate_data_profile(group_by: str | None = None) DataProfile
abstract get_feature_categories(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Tuple[Any, ...]
abstract get_feature_profile(afeature: str, feature_vals: List[Any], atype: FeatureType) FeatureDataProfile
abstract get_metric_calculation_params(mask: str = '') Tuple
get_predictions_cache() PredictionCache[T, S] | None
abstract get_restricted_features() Tuple[str, ...]
abstract get_segment_data(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') T | S
abstract property has_predictions: bool
abstract histogram(feature_name: str, bins: int = 15)
abstract property history: List[Tuple[str, str, dict]]
abstract property id: str
abstract property id_feature_names: Tuple[str, ...]

Return the names of the features that store ids

Returns:

A tuple containing the names of the id features

Return type:

Tuple[str, …]

abstract property id_features: T
abstract property id_features_testing: T
abstract property id_features_training: T
abstract property id_features_validation: T
abstract is_categorical_feature(afeature: str) bool
abstract is_continuous_feature(afeature: str) bool
abstract is_date_feature(afeature: str) bool
abstract is_id_feature(afeature: str) bool
abstract is_numerical(afeature: str) bool
abstract is_target_categorical() bool
abstract is_target_continuous() bool
abstract mean(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Number
abstract median(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Number
abstract property name: str
abstract property number_of_date_features: int
abstract property number_of_features: int
abstract property number_of_id_features: int
abstract property number_of_samples: int
abstract property number_of_testing_samples: int
abstract property number_of_training_samples: int
abstract property number_of_validation_samples: int
abstract property overall_checksum: str
abstract property prediction: S
abstract property prediction_categorical: bool
abstract property prediction_name: str | None
abstract property prediction_testing: S
abstract property prediction_training: S
abstract property prediction_validation: S
abstract quantiles(name: str, quantile_vals: List[float], dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') List[Number]
abstract property target: S
abstract property target_categorical: bool
abstract property target_name: str

Return the name of the target

Returns:

The name of the target

Return type:

str

abstract property target_testing: S
abstract property target_training: S
abstract property target_validation: S
class etiq.datasets.abstract_dataset.DatasetSegment(value)

Bases: Enum

An enumeration of dataset segments: the full dataset or its training, validation, or testing split.

ALL = 0
TRAINING = 1
VALIDATION = 2
TESTING = 3
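
Illustrative only, continuing the sketch under AbstractDataset.feature above: statistics such as mean() accept a DatasetSegment to restrict the calculation to one split (ALL is the default).

from etiq.datasets import DatasetSegment

mean_all = adataset.mean("age")  # computed over the whole dataset
mean_train = adataset.mean("age", dataset_segment=DatasetSegment.TRAINING)
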
class etiq.datasets.abstract_dataset.PredictionCache

Bases: Generic[T, S]

abstract get_data(condition: str) T
abstract get_predicted(condition: str = '') S | str
abstract get_protected(condition: str = '') S | str | None
abstract get_target(condition: str = '') S | str

etiq.datasets.backend module

class etiq.datasets.backend.DataBackend(value)

Bases: Enum

An enumeration of the supported data backends.

PANDAS = 0
SPARK = 1

etiq.datasets.bias_dataset module

class etiq.datasets.bias_dataset.BiasDataset

Bases: SimpleDataset[T, S]

Abstract bias dataset class

abstract get_protected_metric_calculation_params(mask: str = '') Any
abstract property protected: S
abstract property protected_name: str
abstract property protected_testing: S
abstract property protected_training: S
abstract property protected_validation: S
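
Illustrative only: the protected-attribute accessors, assuming a dataset such as the one built in the BiasDatasetBuilder.dataset example under Module contents below.

print(adataset.protected_name)          # e.g. 'gender'
all_protected = adataset.protected      # protected data for every row
train_protected = adataset.protected_training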

etiq.datasets.simple_dataset module

class etiq.datasets.simple_dataset.SimpleDataset

Bases: AbstractDataset[T, S]

Abstract simple dataset class

etiq.datasets.utils module

class etiq.datasets.utils.Cache(maxlen=32, items=None)

Bases: MutableMapping

property maxlen
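
A brief usage sketch, assuming Cache behaves as a size-bounded mutable mapping (the eviction policy is not documented here):

from etiq.datasets.utils import Cache

cache = Cache(maxlen=2)
cache["a"] = 1
cache["b"] = 2
cache["c"] = 3  # with maxlen=2, an earlier entry is presumably evicted
assert cache.maxlen == 2
assert len(cache) <= 2
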
class etiq.datasets.utils.PdEncoder(*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)

Bases: JSONEncoder

default(obj)

Implement this method in a subclass such that it returns a serializable object for obj, or calls the base implementation (to raise a TypeError).

For example, to support arbitrary iterators, you could implement default like this:

def default(self, o):
    try:
        iterable = iter(o)
    except TypeError:
        pass
    else:
        return list(iterable)
    # Let the base class default method raise the TypeError
    return JSONEncoder.default(self, o)
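
PdEncoder is used like any JSONEncoder subclass, via the cls argument of the standard json functions; exactly which pandas objects its default() handles is an implementation detail not documented here.

import json
from etiq.datasets.utils import PdEncoder

payload = json.dumps({"count": 10}, cls=PdEncoder)
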
etiq.datasets.utils.calculate_conditional_probs(base_dataset: AbstractDataset, comparison_dataset: AbstractDataset, number_of_bins: int = 10) Dict[str, Tuple[ndarray, ndarray]]
etiq.datasets.utils.calculate_conditionals_from_counter(target_counter: Dict[Any, int], feature_counter: Dict[Any, int], target_values: List[Any], feature_values: List[Any]) ndarray
etiq.datasets.utils.detect_mixed_types(adataframe_or_series: DataFrame | Series) List[str]
etiq.datasets.utils.hash_columns(row: Series) int
etiq.datasets.utils.hash_pandas_df(df: DataFrame) int
etiq.datasets.utils.str_index(index, use_int_bins: bool) str

Convert an index value to a string.

If the index is a Pandas Interval, its left-hand side is used as the boundary.
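
Illustrative only; the exact formatting of the returned string is an assumption, but per the description an Interval contributes its left-hand boundary.

import pandas as pd
from etiq.datasets.utils import str_index

label = str_index(pd.Interval(0.0, 10.0), use_int_bins=True)  # e.g. "0"
plain = str_index("bucket_a", use_int_bins=False)             # e.g. "bucket_a"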

Module contents

class etiq.datasets.AbstractDataset

Bases: Generic[T, S]

Abstract dataset class

abstract property all_feature_names: Tuple[str, ...]

Return the names of the features stored in the dataset

Returns:

A tuple containing the names of all features in the dataset

Return type:

Tuple[str, …]

abstract property all_features: T
abstract all_identical(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') bool
abstract cache_predictions(predictions: T, prediction_name: str, condition: str) None
abstract property categorical_features: Tuple[str, ...]

Return the names of the categorical features stored in the dataset

Returns:

A tuple containing the names of all categorical features in the dataset

Return type:

Tuple[str, …]

abstract check_column_ordering_violations(gt_feature: str, lt_feature: str, ids: Tuple[str, ...] | None = None) Iterable
abstract check_duplicate_row_violations(duplicate_features_subset: Tuple[str, ...] | None = None, ids: Tuple[str, ...] | None = None) Iterable
abstract check_missing_ids(id_feature: str, primary_ids: List[str] | None = None) Iterable
abstract property checksum: Dict[str, str]
abstract compare_feature(other_dataset: AbstractDataset[T, S], afeature: str) bool
abstract property continuous_features: Tuple[str, ...]

Returns the names of the continuous features stored in the dataset

Returns:

A tuple containing the names of all continuous features in the dataset

Return type:

Tuple[str, …]

abstract property data_backend: DataBackend
abstract property date_feature_names: Tuple[str, ...]

Return the names of the features that store dates

Returns:

A tuple containing the names of the date features

Return type:

Tuple[str, …]

abstract property date_features: T
abstract property date_features_testing: T
abstract property date_features_training: T
abstract property date_features_validation: T
abstract feature(name: str, mask: str = '') S

Returns all data in the dataset for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The data for the specified feature

Return type:

S

abstract feature_combined_std(other_dataset: AbstractDataset[T, S], name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') float
abstract feature_max(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Any
abstract feature_min(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Any
abstract property feature_names: Tuple[str, ...]

Return the names of the features stored in the dataset, excluding the id and date features

Returns:

A tuple containing the names of all features in the dataset, excluding the id and date features

Return type:

Tuple[str, …]

abstract feature_testing(name: str, mask: str = '') S

Returns testing data for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The testing data for the specified feature

Return type:

S

abstract feature_training(name: str, mask: str = '') S

Returns training data for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The training data for the specified feature

Return type:

S

abstract feature_validation(name: str, mask: str = '') S

Returns validation data for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The validation data for the specified feature

Return type:

S

abstract property features: T
abstract property features_testing: T
abstract property features_training: T
abstract property features_validation: T
abstract filter(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) AbstractDataset[T, S]
abstract filter_count(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) int
abstract generate_data_profile(group_by: str | None = None) DataProfile
abstract get_feature_categories(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Tuple[Any, ...]
abstract get_feature_profile(afeature: str, feature_vals: List[Any], atype: FeatureType) FeatureDataProfile
abstract get_metric_calculation_params(mask: str = '') Tuple
get_predictions_cache() PredictionCache[T, S] | None
abstract get_restricted_features() Tuple[str, ...]
abstract get_segment_data(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') T | S
abstract property has_predictions: bool
abstract histogram(feature_name: str, bins: int = 15)
abstract property history: List[Tuple[str, str, dict]]
abstract property id: str
abstract property id_feature_names: Tuple[str, ...]

Return the names of the features that store ids

Returns:

A tuple containing the names of the id features

Return type:

Tuple[str, …]

abstract property id_features: T
abstract property id_features_testing: T
abstract property id_features_training: T
abstract property id_features_validation: T
abstract is_categorical_feature(afeature: str) bool
abstract is_continuous_feature(afeature: str) bool
abstract is_date_feature(afeature: str) bool
abstract is_id_feature(afeature: str) bool
abstract is_numerical(afeature: str) bool
abstract is_target_categorical() bool
abstract is_target_continuous() bool
abstract mean(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Number
abstract median(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Number
abstract property name: str
abstract property number_of_date_features: int
abstract property number_of_features: int
abstract property number_of_id_features: int
abstract property number_of_samples: int
abstract property number_of_testing_samples: int
abstract property number_of_training_samples: int
abstract property number_of_validation_samples: int
abstract property overall_checksum: str
abstract property prediction: S
abstract property prediction_categorical: bool
abstract property prediction_name: str | None
abstract property prediction_testing: S
abstract property prediction_training: S
abstract property prediction_validation: S
abstract quantiles(name: str, quantile_vals: List[float], dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') List[Number]
abstract property target: S
abstract property target_categorical: bool
abstract property target_name: str

Return the name of the target

Returns:

The name of the target

Return type:

str

abstract property target_testing: S
abstract property target_training: S
abstract property target_validation: S
class etiq.datasets.BiasDataset

Bases: SimpleDataset[T, S]

Abstract bias dataset class

abstract get_protected_metric_calculation_params(mask: str = '') Any
abstract property protected: S
abstract property protected_name: str
abstract property protected_testing: S
abstract property protected_training: S
abstract property protected_validation: S
class etiq.datasets.BiasDatasetBuilder

Bases: SimpleDatasetBuilder

classmethod bias_params(bias_params: BiasParams | None = None, protected: str | None = None, privileged: Any | None = None, unprivileged: Any | None = None, positive_outcome_label: Any | None = None, negative_outcome_label: Any | None = None)

Returns a BiasParams object

Parameters:
  • bias_params – A bias params object to clone. This defaults to None if not provided.

  • protected – Protected feature name, for example ‘gender’. Defaults to None.

  • privileged – Privileged label within the protected feature, for example ‘male’. This defaults to None if not provided.

  • unprivileged – Unprivileged label within the protected feature, for example ‘female’. This defaults to None if not provided.

  • positive_outcome_label – The label of a “positive” outcome within a target feature. Defaults to None.

  • negative_outcome_label – The label of a “negative” outcome within a target feature. Defaults to None.

Returns:

A BiasParams object
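
Illustrative only: building a BiasParams object for a ‘gender’ protected feature, mirroring the dataset example below.

from etiq import BiasDatasetBuilder

bp = BiasDatasetBuilder.bias_params(protected="gender",
                                    privileged="M",
                                    unprivileged="F",
                                    positive_outcome_label=1,
                                    negative_outcome_label=0)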

classmethod dataset(features: DataFrame, target: DataFrame | None = None, label: str | None = None, prediction: str | None = None, cat_col: List[str] | None = None, cont_col: List[str] | None = None, train_valid_test_splits: Tuple[float, float, float] = (0.8, 0.2, 0.0), id_col: List[str] | None = None, date_col: List[str] | None = None, bias_params: BiasParams | None = None, convert_date_cols: bool = False, datetime_format: str = '', remove_protected_from_features: bool = True, random_seed: int = 2, name: str | None = None, register_creation: bool = True) BiasDataset

Creates a BiasDataset object given pandas dataframe(s).

Use this builder like:

from etiq import BiasDatasetBuilder
from etiq.biasparams import BiasParams
import pandas as pd
a = [
        ["2022-10-10", 'M', 2, 3, 4, 5, 6, 1],
        ["2022-10-11", 'F', 8, 9, 10, 11, 12, 0],
        ["", 'F', 2, 3, 4, 5, 6, 1],
        ["2022-10-13", 'M', 8, 9, 10, 11, 12, 0],
        ["2022-10-14", 'F', 2, 3, 4, 5, 6, 1],
        ["2022-10-15", 'F', 8, 9, 10, 11, 12, 0],
        ["2022-10-16", 'M', 2, 3, 4, 5, 6, 1],
        ["2022-10-17", 'F', 8, 9, 10, 11, 12, 0],
        ["2022-10-18", 'M', 14, 15, 16, 17, 18, 1],
        ["2022-10-19", 'M', 15, 16, 17, 18, 19, 1],
    ]
df = pd.DataFrame(a,
                columns=["start_date", "gender", "age2", "age3", "age4",
                        "age5", "age6", "income"])
adataset = BiasDatasetBuilder.dataset(
                features=df,
                label="income",
                cat_col=["age2", "age3", "income"],
                cont_col=["age4", "age5", "age6"],
                date_col=["start_date"],
                bias_params = BiasParams(protected='gender',
                                         privileged='M',
                                         unprivileged='F',
                                         positive_outcome_label= 1,
                                         negative_outcome_label= 0),
                remove_protected_from_features = True,
                convert_date_cols=True,
                name="test_dataset")
Parameters:
  • features – Pandas dataframe containing the dataset features as columns.

  • target – Pandas dataframe containing the target feature as a column. This defaults to None in which case the target feature is assumed to be either the last column in the features dataset or the column name specified in the label argument.

  • label – The name of the column containing the target. This defaults to None in which case the target is assumed to either be the last column of the features dataframe or the first column of the target dataframe if this is not None.

  • prediction – The name of the column containing the prediction data. This defaults to None in which case the assumption is that the dataset contains no prediction data.

  • cat_col – List of categorical features. This defaults to None in which case categorical features are determined automatically.

  • cont_col – List of continuous features. This defaults to None in which case continuous features are determined automatically.

  • id_col – List of id features. This defaults to None in which case it is assumed the dataset contains no id features.

  • date_col – List of datetime features. This defaults to None in which case it is assumed the dataset contains no datetime features.

  • bias_params – This contains demographic data (the protected feature) needed to create the bias dataset. This defaults to None in which case a fake random protected feature is created.

  • train_valid_test_splits – This parameter specifies the proportions to use when splitting the data into training, validation and test subsets. This defaults to (0.8, 0.2, 0.0).

  • random_seed – Random number seed (for reproducibility) used when splitting the data into random training, validation and test subsets. This defaults to 2.

  • remove_protected_from_features – This is set to True in order to remove the protected feature from the normal features i.e. the protected feature is then not considered a feature used by the model. Otherwise the protected feature is treated as a normal feature.

  • convert_date_cols – This is set to True in order to convert any date features into datetime objects. This defaults to False.

  • datetime_format – The specific datetime format (assumes a common datetime is used for all datetime features). This defaults to an empty string in which case the datetime format is guessed.

  • name – The name to use for the dataset. This defaults to None in which case a random name is assigned.

  • register_creation – This is set to True to enable the dataset to be registered to the database (note that only a hash and/or fingerprint of the data is stored). This defaults to True.

Returns:

A BiasDataset object.

classmethod datasets(training_features: DataFrame | None = None, training_target: DataFrame | None = None, validation_features: DataFrame | None = None, validation_target: DataFrame | None = None, testing_features: DataFrame | None = None, testing_target: DataFrame | None = None, label: str | None = None, prediction: str | None = None, cat_col: List[str] | None = None, cont_col: List[str] | None = None, bias_params: BiasParams | None = None, remove_protected_from_features: bool = True, id_col: List[str] | None = None, date_col: List[str] | None = None, convert_date_cols: bool = False, datetime_format: str = '', name: str | None = None, register_creation: bool = True) BiasDataset

Creates a BiasDataset object given pandas dataframe(s).

Use this builder like:

from etiq import BiasDatasetBuilder
from etiq.biasparams import BiasParams
import pandas as pd
training = [
        ["2022-10-10", 'M', 2, 3, 4, 5, 6, 1],
        ["2022-10-11", 'F', 8, 9, 10, 11, 12, 0],
        ["", 'F', 2, 3, 4, 5, 6, 1],
        ["2022-10-13", 'M', 8, 9, 10, 11, 12, 0],
        ["2022-10-14", 'F', 2, 3, 4, 5, 6, 1]
        ]
validation = [
        ["2022-10-15", 'F', 8, 9, 10, 11, 12, 0],
        ["2022-10-16", 'M', 2, 3, 4, 5, 6, 1],
        ["2022-10-17", 'F', 8, 9, 10, 11, 12, 0],
        ["2022-10-18", 'M', 14, 15, 16, 17, 18, 1],
        ["2022-10-19", 'M', 15, 16, 17, 18, 19, 1]
        ]
df1 = pd.DataFrame(training,
                columns=["start_date", "gender", "age2", "age3", "age4",
                        "age5", "age6", "income"])
df2 = pd.DataFrame(validation,
                columns=["start_date", "gender", "age2", "age3", "age4",
                        "age5", "age6", "income"])
adataset = BiasDatasetBuilder.datasets(
                training_features=df1,
                validation_features=df2,
                label="income",
                cat_col=["age2", "age3", "income"],
                cont_col=["age4", "age5", "age6"],
                date_col=["start_date"],
                bias_params = BiasParams(protected='gender',
                                         privileged='M',
                                         unprivileged='F',
                                         positive_outcome_label= 1,
                                         negative_outcome_label= 0),
                remove_protected_from_features = True,
                convert_date_cols=True,
                name="test_dataset")
Parameters:
  • training_features – Pandas dataframe containing the training dataset features. This defaults to None in which case we assume there is no training data.

  • training_target – Pandas dataframe containing the target training data as a column. This defaults to None in which case the target feature is assumed to be either the last column in the training features dataset or the column name specified in the label argument.

  • validation_features – Pandas dataframe containing the validation dataset features. This defaults to None in which case we assume there is no validation data.

  • validation_target – Pandas dataframe containing the target validation data as a column. This defaults to None in which case the target feature is assumed to be either the last column in the validation features dataset or the column name specified in the label argument.

  • testing_features – Pandas dataframe containing the testing dataset features. This defaults to None in which case we assume there is no testing data.

  • testing_target – Pandas dataframe containing the target testing data as a column. This defaults to None in which case the target feature is assumed to be either the last column in the testing features dataset or the column name specified in the label argument.

  • label – The name of the column containing the target. This defaults to None in which case the target is assumed to either be the last column of the features dataframe or the first column of the target dataframe if this is not None.

  • prediction – The name of the column containing the prediction data. This defaults to None in which case the assumption is that the dataset contains no prediction data.

  • cat_col – List of categorical features. This defaults to None in which case categorical features are determined automatically.

  • cont_col – List of continuous features. This defaults to None in which case continuous features are determined automatically.

  • id_col – List of id features. This defaults to None in which case it is assumed the dataset contains no id features.

  • date_col – List of datetime features. This defaults to None in which case it is assumed the dataset contains no datetime features.

  • bias_params – This contains demographic data (the protected feature) needed to create the bias dataset. This defaults to None in which case a fake random protected feature is created.

  • remove_protected_from_features – This is set to True in order to remove the protected feature from the normal features i.e. the protected feature is then not considered a feature used by the model. Otherwise the protected feature is treated as a normal feature.

  • convert_date_cols – This is set to True in order to convert any date features into datetime objects. This defaults to False.

  • datetime_format – The specific datetime format (assumes a common datetime is used for all datetime features). This defaults to an empty string in which case the datetime format is guessed.

  • name – The name to use for the dataset. This defaults to None in which case a random name is assigned.

  • register_creation – This is set to True to enable the dataset to be registered to the database (note that only a hash and/or fingerprint of the data is stored). This defaults to True.

Returns:

A BiasDataset object.

class etiq.datasets.DataBackend(value)

Bases: Enum

An enumeration of the supported data backends.

PANDAS = 0
SPARK = 1
class etiq.datasets.DatasetSegment(value)

Bases: Enum

An enumeration of dataset segments: the full dataset or its training, validation, or testing split.

ALL = 0
TRAINING = 1
VALIDATION = 2
TESTING = 3
class etiq.datasets.PredictionCache

Bases: Generic[T, S]

abstract get_data(condition: str) T
abstract get_predicted(condition: str = '') S | str
abstract get_protected(condition: str = '') S | str | None
abstract get_target(condition: str = '') S | str
class etiq.datasets.SimpleDataset

Bases: AbstractDataset[T, S]

Abstract simple dataset class

class etiq.datasets.SimpleDatasetBuilder

Bases: object

A builder for the SimpleDataset class

classmethod dataset(features: DataFrame, target: DataFrame | None = None, label: str | None = None, prediction: str | None = None, cat_col: List[str] | None = None, cont_col: List[str] | None = None, id_col: List[str] | None = None, date_col: List[str] | None = None, train_valid_test_splits: Tuple[float, float, float] = (0.8, 0.2, 0.0), random_seed: int = 2, convert_date_cols: bool = False, datetime_format: str = '', name: str | None = None, register_creation: bool = True) SimpleDataset

Creates a SimpleDataset object given pandas dataframe(s).

Use this builder like:

from etiq import SimpleDatasetBuilder
import pandas as pd
a = [
        ["2022-10-10", 2, 3, 4, 5, 6, 1],
        ["2022-10-11", 8, 9, 10, 11, 12, 0],
        ["", 2, 3, 4, 5, 6, 1],
        ["2022-10-13", 8, 9, 10, 11, 12, 0],
        ["2022-10-14", 2, 3, 4, 5, 6, 1],
        ["2022-10-15", 8, 9, 10, 11, 12, 0],
        ["2022-10-16", 2, 3, 4, 5, 6, 1],
        ["2022-10-17", 8, 9, 10, 11, 12, 0],
        ["2022-10-18", 14, 15, 16, 17, 18, 1],
        ["2022-10-19", 15, 16, 17, 18, 19, 1],
    ]
df = pd.DataFrame(a,
                columns=["start_date", "age2", "age3", "age4",
                        "age5", "age6", "income"])
adataset = SimpleDatasetBuilder.dataset(
                features=df,
                label="income",
                cat_col=["age2", "age3", "income"],
                cont_col=["age4", "age5", "age6"],
                date_col=["start_date"],
                convert_date_cols=True,
                name="test_dataset")
Parameters:
  • features – Pandas dataframe containing the dataset features as columns.

  • target – Pandas dataframe containing the target feature as a column. This defaults to None in which case the target feature is assumed to be either the last column in the features dataset or the column name specified in the label argument.

  • label – The name of the column containing the target. This defaults to None in which case the target is assumed to either be the last column of the features dataframe or the first column of the target dataframe if this is not None.

  • prediction – The name of the column containing the prediction data. This defaults to None in which case the assumption is that the dataset contains no prediction data.

  • cat_col – List of categorical features. This defaults to None in which case categorical features are determined automatically.

  • cont_col – List of continuous features. This defaults to None in which case continuous features are determined automatically.

  • id_col – List of id features. This defaults to None in which case it is assumed the dataset contains no id features.

  • date_col – List of datetime features. This defaults to None in which case it is assumed the dataset contains no datetime features.

  • train_valid_test_splits – This parameter specifies the proportions to use when splitting the data into training, validation and test subsets. This defaults to (0.8, 0.2, 0.0).

  • random_seed – Random number seed used when splitting the data into random training, validation and test subsets. This defaults to 2.

  • convert_date_cols – This is set to True in order to convert any date features into datetime objects. This defaults to False.

  • datetime_format – The specific datetime format (assumes a common datetime is used for all datetime features). This defaults to an empty string in which case the datetime format is guessed.

  • name – The name to use for the dataset. This defaults to None in which case a random name is assigned.

  • register_creation – This is set to True to enable the dataset to be registered to the database (note that only a hash and/or fingerprint of the data is stored). This defaults to True.

Returns:

A SimpleDataset object.
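
Continuing the example above, the documented dataset properties can be used to check the result; the sample values assume the default (0.8, 0.2, 0.0) split of the ten example rows.

print(adataset.name)                        # "test_dataset"
print(adataset.number_of_samples)           # 10
print(adataset.number_of_training_samples)  # 8, given the 0.8 training split
print(adataset.feature_names)               # excludes id and date features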

classmethod datasets(training_features: DataFrame | None = None, training_target: DataFrame | None = None, validation_features: DataFrame | None = None, validation_target: DataFrame | None = None, testing_features: DataFrame | None = None, testing_target: DataFrame | None = None, label: str | None = None, prediction: str | None = None, cat_col: List[str] | None = None, cont_col: List[str] | None = None, id_col: List[str] | None = None, date_col: List[str] | None = None, convert_date_cols: bool = False, datetime_format: str = '', name: str | None = None, register_creation: bool = True) SimpleDataset

Creates a SimpleDataset object given pandas dataframe(s).

Use this builder like:

from etiq import SimpleDatasetBuilder
import pandas as pd
training = [
                ["2022-10-10", 2, 3, 4, 5, 6, 1],
                ["2022-10-11", 8, 9, 10, 11, 12, 0],
                ["", 2, 3, 4, 5, 6, 1],
                ["2022-10-13", 8, 9, 10, 11, 12, 0]
                ["2022-10-14", 2, 3, 4, 5, 6, 1]
            ]
validation = [
                ["2022-10-15", 8, 9, 10, 11, 12, 0],
                ["2022-10-16", 2, 3, 4, 5, 6, 1],
                ["2022-10-17", 8, 9, 10, 11, 12, 0],
                ["2022-10-18", 14, 15, 16, 17, 18, 1],
                ["2022-10-19", 15, 16, 17, 18, 19, 1]
            ]
df1 = pd.DataFrame(training,
                columns=["start_date", "age2", "age3", "age4",
                        "age5", "age6", "income"])
df2 = pd.DataFrame(validation,
                columns=["start_date", "age2", "age3", "age4",
                        "age5", "age6", "income"])
adataset = SimpleDatasetBuilder.datasets(
                training_features=df1,
                validation_features=df2,
                label="income",
                cat_col=["age2", "age3", "income"],
                cont_col=["age4", "age5", "age6"],
                date_col=["start_date"],
                convert_date_cols=True,
                name="test_dataset")
Parameters:
  • training_features – Pandas dataframe containing the training dataset features. This defaults to None in which case we assume there is no training data.

  • training_target – Pandas dataframe containing the target training data as a column. This defaults to None in which case the target feature is assumed to be either the last column in the training features dataset or the column name specified in the label argument.

  • validation_features – Pandas dataframe containing the validation dataset features. This defaults to None in which case we assume there is no validation data.

  • validation_target – Pandas dataframe containing the target validation data as a column. This defaults to None in which case the target feature is assumed to be either the last column in the validation features dataset or the column name specified in the label argument.

  • testing_features – Pandas dataframe containing the testing dataset features. This defaults to None in which case we assume there is no testing data.

  • testing_target – Pandas dataframe containing the target testing data as a column. This defaults to None in which case the target feature is assumed to be either the last column in the testing features dataset or the column name specified in the label argument.

  • label – The name of the column containing the target. This defaults to None in which case the target is assumed to either be the last column of the features dataframe or the first column of the target dataframe if this is not None.

  • prediction – The name of the column containing the prediction data. This defaults to None in which case the assumption is that the dataset contains no prediction data.

  • cat_col – List of categorical features. This defaults to None in which case categorical features are determined automatically.

  • cont_col – List of continuous features. This defaults to None in which case continuous features are determined automatically.

  • id_col – List of id features. This defaults to None in which case it is assumed the dataset contains no id features.

  • date_col – List of datetime features. This defaults to None in which case it is assumed the dataset contains no datetime features.

  • convert_date_cols – This is set to True in order to convert any date features into datetime objects. This defaults to False.

  • datetime_format – The specific datetime format (assumes a common datetime is used for all datetime features). This defaults to an empty string in which case the datetime format is guessed.

  • name – The name to use for the dataset. This defaults to None in which case a random name is assigned.

  • register_creation – This is set to True to enable the dataset to be registered to the database (note that only a hash and/or fingerprint of the data is stored). This defaults to True.

Returns:

A SimpleDataset object.

etiq.datasets.calculate_conditional_probs(base_dataset: AbstractDataset, comparison_dataset: AbstractDataset, number_of_bins: int = 10) Dict[str, Tuple[ndarray, ndarray]]