etiq.datasets package
Subpackages
- etiq.datasets.builders package
- etiq.datasets.pandas package
- Submodules
- etiq.datasets.pandas.base_pandas_dataset module
BasePandasDatasetMixin
BasePandasDatasetMixin.all_feature_names
BasePandasDatasetMixin.all_features
BasePandasDatasetMixin.all_identical()
BasePandasDatasetMixin.categorical_features
BasePandasDatasetMixin.check_column_ordering_violations()
BasePandasDatasetMixin.check_duplicate_row_violations()
BasePandasDatasetMixin.check_missing_ids()
BasePandasDatasetMixin.checksum
BasePandasDatasetMixin.continuous_features
BasePandasDatasetMixin.data_backend
BasePandasDatasetMixin.date_feature_names
BasePandasDatasetMixin.date_features
BasePandasDatasetMixin.date_features_testing
BasePandasDatasetMixin.date_features_training
BasePandasDatasetMixin.date_features_validation
BasePandasDatasetMixin.feature()
BasePandasDatasetMixin.feature_combined_std()
BasePandasDatasetMixin.feature_max()
BasePandasDatasetMixin.feature_min()
BasePandasDatasetMixin.feature_names
BasePandasDatasetMixin.feature_testing()
BasePandasDatasetMixin.feature_training()
BasePandasDatasetMixin.feature_validation()
BasePandasDatasetMixin.features
BasePandasDatasetMixin.features_testing
BasePandasDatasetMixin.features_training
BasePandasDatasetMixin.features_validation
BasePandasDatasetMixin.filter_count()
BasePandasDatasetMixin.generate_data_profile()
BasePandasDatasetMixin.get_dataset_checksum()
BasePandasDatasetMixin.get_dataset_history()
BasePandasDatasetMixin.get_eval()
BasePandasDatasetMixin.get_feature_categories()
BasePandasDatasetMixin.get_feature_profile()
BasePandasDatasetMixin.get_metric_calculation_params()
BasePandasDatasetMixin.get_restricted_features()
BasePandasDatasetMixin.get_segment_data()
BasePandasDatasetMixin.has_predictions
BasePandasDatasetMixin.histogram()
BasePandasDatasetMixin.history
BasePandasDatasetMixin.id
BasePandasDatasetMixin.id_feature_names
BasePandasDatasetMixin.id_features
BasePandasDatasetMixin.id_features_testing
BasePandasDatasetMixin.id_features_training
BasePandasDatasetMixin.id_features_validation
BasePandasDatasetMixin.is_categorical_feature()
BasePandasDatasetMixin.is_continuous_feature()
BasePandasDatasetMixin.is_date_feature()
BasePandasDatasetMixin.is_id_feature()
BasePandasDatasetMixin.is_numerical()
BasePandasDatasetMixin.is_target_categorical()
BasePandasDatasetMixin.is_target_continuous()
BasePandasDatasetMixin.mean()
BasePandasDatasetMixin.median()
BasePandasDatasetMixin.name
BasePandasDatasetMixin.number_of_date_features
BasePandasDatasetMixin.number_of_features
BasePandasDatasetMixin.number_of_id_features
BasePandasDatasetMixin.number_of_samples
BasePandasDatasetMixin.number_of_testing_samples
BasePandasDatasetMixin.number_of_training_samples
BasePandasDatasetMixin.number_of_validation_samples
BasePandasDatasetMixin.overall_checksum
BasePandasDatasetMixin.prediction
BasePandasDatasetMixin.prediction_categorical
BasePandasDatasetMixin.prediction_name
BasePandasDatasetMixin.prediction_testing
BasePandasDatasetMixin.prediction_training
BasePandasDatasetMixin.prediction_validation
BasePandasDatasetMixin.quantiles()
BasePandasDatasetMixin.target
BasePandasDatasetMixin.target_categorical
BasePandasDatasetMixin.target_name
BasePandasDatasetMixin.target_testing
BasePandasDatasetMixin.target_training
BasePandasDatasetMixin.target_validation
BasePandasDatasetMixin.to_dataframe()
query_eval()
- etiq.datasets.pandas.bias_pandas_dataset module
BiasPandasDataset
BiasPandasDataset.cache_predictions()
BiasPandasDataset.check_column_ordering_violations()
BiasPandasDataset.compare_feature()
BiasPandasDataset.filter()
BiasPandasDataset.generate_data_profile()
BiasPandasDataset.get_dataset_checksum()
BiasPandasDataset.get_dataset_history()
BiasPandasDataset.get_predictions_cache()
BiasPandasDataset.get_protected_metric_calculation_params()
BiasPandasDataset.get_segment_data()
BiasPandasDataset.protected
BiasPandasDataset.protected_name
BiasPandasDataset.protected_testing
BiasPandasDataset.protected_training
BiasPandasDataset.protected_validation
BiasPandasDataset.to_dataframe()
BiasPandasPredictionCache
- etiq.datasets.pandas.simple_pandas_dataset module
- Module contents
BiasPandasDataset
BiasPandasDataset.cache_predictions()
BiasPandasDataset.check_column_ordering_violations()
BiasPandasDataset.compare_feature()
BiasPandasDataset.filter()
BiasPandasDataset.generate_data_profile()
BiasPandasDataset.get_dataset_checksum()
BiasPandasDataset.get_dataset_history()
BiasPandasDataset.get_predictions_cache()
BiasPandasDataset.get_protected_metric_calculation_params()
BiasPandasDataset.get_segment_data()
BiasPandasDataset.protected
BiasPandasDataset.protected_name
BiasPandasDataset.protected_testing
BiasPandasDataset.protected_training
BiasPandasDataset.protected_validation
BiasPandasDataset.to_dataframe()
SimplePandasDataset
Submodules
etiq.datasets.abstract_dataset module
Defines the abstract dataset class.
- class etiq.datasets.abstract_dataset.AbstractDataset
Bases: Generic[T, S]
Abstract dataset class
- abstract property all_feature_names: Tuple[str, ...]
Return the names of the features stored in the dataset
- Returns:
A tuple containing the names of all features in the dataset
- Return type:
Tuple[str, …]
- abstract property all_features: T
- abstract all_identical(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') bool
- abstract cache_predictions(predictions: T, prediction_name: str, condition: str) None
- abstract property categorical_features: Tuple[str, ...]
Return the names of the categorical features stored in the dataset
- Returns:
A tuple containing the names of all categorical features in the dataset
- Return type:
Tuple[str, …]
- abstract check_column_ordering_violations(gt_feature: str, lt_feature: str, ids: Tuple[str, ...] | None = None) Iterable
- abstract check_duplicate_row_violations(duplicate_features_subset: Tuple[str, ...] | None = None, ids: Tuple[str, ...] | None = None) Iterable
- abstract check_missing_ids(id_feature: str, primary_ids: List[str] | None = None) Iterable
- abstract property checksum: Dict[str, str]
- abstract compare_feature(other_dataset: AbstractDataset[T, S], afeature: str) bool
- abstract property continuous_features: Tuple[str, ...]
Returns the names of the continuous features stored in the dataset
- Returns:
A tuple containing the names of all continuous features in the dataset
- Return type:
Tuple[str, …]
- abstract property data_backend: DataBackend
- abstract property date_feature_names: Tuple[str, ...]
Return the names of the features that store dates
- Returns:
A tuple containing the names of the date features
- Return type:
Tuple[str, …]
- abstract property date_features: T
- abstract property date_features_testing: T
- abstract property date_features_training: T
- abstract property date_features_validation: T
- abstract feature(name: str, mask: str = '') S
Returns all data in the dataset for the specified feature.
- Parameters:
name (str) – The name of the feature.
- Returns:
The data for the specified feature
- Return type:
S
- abstract feature_combined_std(other_dataset: AbstractDataset[T, S], name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') float
- abstract feature_max(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Any
- abstract feature_min(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Any
- abstract property feature_names: Tuple[str, ...]
Return the names of the features stored in the dataset excluding the id and date features
- Returns:
A tuple containing the names of all features in the dataset excluding the id and date features
- Return type:
Tuple[str, …]
- abstract feature_testing(name: str, mask: str = '') S
Returns testing data for the specified feature.
- Parameters:
name (str) – The name of the feature.
- Returns:
The testing data for the specified feature
- Return type:
S
- abstract feature_training(name: str, mask: str = '') S
Returns training data for the specified feature.
- Parameters:
name (str) – The name of the feature.
- Returns:
The training data for the specified feature
- Return type:
S
- abstract feature_validation(name: str, mask: str = '') S
Returns validation data for the specified feature.
- Parameters:
name (str) – The name of the feature.
- Returns:
The validation data for the specified feature
- Return type:
S
- abstract property features: T
- abstract property features_testing: T
- abstract property features_training: T
- abstract property features_validation: T
- abstract filter(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) AbstractDataset[T, S]
- abstract filter_count(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) int
- abstract generate_data_profile(group_by: str | None = None) DataProfile
- abstract get_feature_categories(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Tuple[Any, ...]
- abstract get_feature_profile(afeature: str, feature_vals: List[Any], atype: FeatureType) FeatureDataProfile
- abstract get_metric_calculation_params(mask: str = '') Tuple
- get_predictions_cache() PredictionCache[T, S] | None
- abstract get_restricted_features() Tuple[str, ...]
- abstract get_segment_data(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') T | S
- abstract property has_predictions: bool
- abstract histogram(feature_name: str, bins: int = 15)
- abstract property history: List[Tuple[str, str, dict]]
- abstract property id: str
- abstract property id_feature_names: Tuple[str, ...]
Return the names of the features that store ids
- Returns:
A tuple containing the names of the id features
- Return type:
Tuple[str, …]
- abstract property id_features: T
- abstract property id_features_testing: T
- abstract property id_features_training: T
- abstract property id_features_validation: T
- abstract is_categorical_feature(afeature: str) bool
- abstract is_continuous_feature(afeature: str) bool
- abstract is_date_feature(afeature: str) bool
- abstract is_id_feature(afeature: str) bool
- abstract is_numerical(afeature: str) bool
- abstract is_target_categorical() bool
- abstract is_target_continuous() bool
- abstract mean(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Number
- abstract median(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Number
- abstract property name: str
- abstract property number_of_date_features: int
- abstract property number_of_features: int
- abstract property number_of_id_features: int
- abstract property number_of_samples: int
- abstract property number_of_testing_samples: int
- abstract property number_of_training_samples: int
- abstract property number_of_validation_samples: int
- abstract property overall_checksum: str
- abstract property prediction: S
- abstract property prediction_categorical: bool
- abstract property prediction_name: str | None
- abstract property prediction_testing: S
- abstract property prediction_training: S
- abstract property prediction_validation: S
- abstract quantiles(name: str, quantile_vals: List[float], dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') List[Number]
- abstract property target: S
- abstract property target_categorical: bool
- abstract property target_name: str
Return the name of the target
- Returns:
The name of the target
- Return type:
str
- abstract property target_testing: S
- abstract property target_training: S
- abstract property target_validation: S
etiq.datasets.backend module
etiq.datasets.bias_dataset module
- class etiq.datasets.bias_dataset.BiasDataset
Bases: SimpleDataset[T, S]
Abstract bias dataset class
- abstract get_protected_metric_calculation_params(mask: str = '') Any
- abstract property protected: S
- abstract property protected_name: str
- abstract property protected_testing: S
- abstract property protected_training: S
- abstract property protected_validation: S
etiq.datasets.simple_dataset module
- class etiq.datasets.simple_dataset.SimpleDataset
Bases: AbstractDataset[T, S]
Abstract simple dataset class
etiq.datasets.utils module
- class etiq.datasets.utils.PdEncoder(*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)
Bases: JSONEncoder
- default(obj)
Implement this method in a subclass such that it returns a serializable object for o, or calls the base implementation (to raise a TypeError).
For example, to support arbitrary iterators, you could implement default like this:
def default(self, o):
    try:
        iterable = iter(o)
    except TypeError:
        pass
    else:
        return list(iterable)
    # Let the base class default method raise the TypeError
    return JSONEncoder.default(self, o)
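The default() pattern shown above can be exercised with the standard json module alone. The sketch below follows the docstring's iterator example (using a set rather than a pandas object, so it needs no third-party dependencies; the class name is illustrative, not part of etiq):

```python
import json

class IterableEncoder(json.JSONEncoder):
    """Minimal JSONEncoder subclass following the default() pattern above."""
    def default(self, o):
        try:
            iterable = iter(o)
        except TypeError:
            pass
        else:
            return list(iterable)
        # Let the base class default method raise the TypeError
        return json.JSONEncoder.default(self, o)

# A set is not JSON-serializable by default, so default() is invoked for it
encoded = json.dumps({"values": {1, 2, 3}}, cls=IterableEncoder, sort_keys=True)
```

PdEncoder applies the same idea to pandas objects that the stock encoder cannot serialize.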
- etiq.datasets.utils.calculate_conditional_probs(base_dataset: AbstractDataset, comparison_dataset: AbstractDataset, number_of_bins: int = 10) Dict[str, Tuple[ndarray, ndarray]]
- etiq.datasets.utils.calculate_conditionals_from_counter(target_counter: Dict[Any, int], feature_counter: Dict[Any, int], target_values: List[Any], feature_values: List[Any]) ndarray
- etiq.datasets.utils.detect_mixed_types(adataframe_or_series: DataFrame | Series) List[str]
- etiq.datasets.utils.hash_columns(row: Series) int
- etiq.datasets.utils.hash_pandas_df(df: DataFrame) int
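hash_columns fingerprints a single row and hash_pandas_df a whole dataframe. A dependency-free sketch of the idea (the real functions operate on pandas Series/DataFrame objects; the order-independent combination below is one plausible design choice, not etiq's confirmed one):

```python
from typing import List, Tuple

def hash_row(row: Tuple) -> int:
    # Hash one row as an ordered tuple of its values
    return hash(row)

def hash_table(rows: List[Tuple]) -> int:
    # Sum the row hashes so that re-ordering rows yields the same
    # fingerprint, then mask to a fixed 64-bit width
    return sum(hash_row(r) for r in rows) & 0xFFFFFFFFFFFFFFFF

t1 = [(1, "a"), (2, "b")]
t2 = [(2, "b"), (1, "a")]  # same rows, different order
```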
- etiq.datasets.utils.str_index(index, use_int_bins: bool) str
Convert an index value to a string.
If the index is a pandas Interval, use its left-hand side as the boundary.
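A sketch of the interval handling str_index describes, using a stand-in for pandas.Interval that exposes only a .left attribute (the meaning of use_int_bins as "cast the boundary to int" is an assumption here):

```python
from collections import namedtuple

# Stand-in for pandas.Interval: only the .left attribute is used below
Interval = namedtuple("Interval", ["left", "right"])

def str_index(index, use_int_bins: bool) -> str:
    # For an interval, represent the bin by its left-hand boundary
    if hasattr(index, "left"):
        index = index.left
    if use_int_bins:
        index = int(index)  # assumed interpretation of use_int_bins
    return str(index)

label = str_index(Interval(2.5, 5.0), use_int_bins=True)
```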
Module contents
- class etiq.datasets.AbstractDataset
Bases: Generic[T, S]
Abstract dataset class
- abstract property all_feature_names: Tuple[str, ...]
Return the names of the features stored in the dataset
- Returns:
A tuple containing the names of all features in the dataset
- Return type:
Tuple[str, …]
- abstract property all_features: T
- abstract all_identical(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') bool
- abstract cache_predictions(predictions: T, prediction_name: str, condition: str) None
- abstract property categorical_features: Tuple[str, ...]
Return the names of the categorical features stored in the dataset
- Returns:
A tuple containing the names of all categorical features in the dataset
- Return type:
Tuple[str, …]
- abstract check_column_ordering_violations(gt_feature: str, lt_feature: str, ids: Tuple[str, ...] | None = None) Iterable
- abstract check_duplicate_row_violations(duplicate_features_subset: Tuple[str, ...] | None = None, ids: Tuple[str, ...] | None = None) Iterable
- abstract check_missing_ids(id_feature: str, primary_ids: List[str] | None = None) Iterable
- abstract property checksum: Dict[str, str]
- abstract compare_feature(other_dataset: AbstractDataset[T, S], afeature: str) bool
- abstract property continuous_features: Tuple[str, ...]
Returns the names of the continuous features stored in the dataset
- Returns:
A tuple containing the names of all continuous features in the dataset
- Return type:
Tuple[str, …]
- abstract property data_backend: DataBackend
- abstract property date_feature_names: Tuple[str, ...]
Return the names of the features that store dates
- Returns:
A tuple containing the names of the date features
- Return type:
Tuple[str, …]
- abstract property date_features: T
- abstract property date_features_testing: T
- abstract property date_features_training: T
- abstract property date_features_validation: T
- abstract feature(name: str, mask: str = '') S
Returns all data in the dataset for the specified feature.
- Parameters:
name (str) – The name of the feature.
- Returns:
The data for the specified feature
- Return type:
S
- abstract feature_combined_std(other_dataset: AbstractDataset[T, S], name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') float
- abstract feature_max(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Any
- abstract feature_min(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Any
- abstract property feature_names: Tuple[str, ...]
Return the names of the features stored in the dataset excluding the id and date features
- Returns:
A tuple containing the names of all features in the dataset excluding the id and date features
- Return type:
Tuple[str, …]
- abstract feature_testing(name: str, mask: str = '') S
Returns testing data for the specified feature.
- Parameters:
name (str) – The name of the feature.
- Returns:
The testing data for the specified feature
- Return type:
S
- abstract feature_training(name: str, mask: str = '') S
Returns training data for the specified feature.
- Parameters:
name (str) – The name of the feature.
- Returns:
The training data for the specified feature
- Return type:
S
- abstract feature_validation(name: str, mask: str = '') S
Returns validation data for the specified feature.
- Parameters:
name (str) – The name of the feature.
- Returns:
The validation data for the specified feature
- Return type:
S
- abstract property features: T
- abstract property features_testing: T
- abstract property features_training: T
- abstract property features_validation: T
- abstract filter(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) AbstractDataset[T, S]
- abstract filter_count(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) int
- abstract generate_data_profile(group_by: str | None = None) DataProfile
- abstract get_feature_categories(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Tuple[Any, ...]
- abstract get_feature_profile(afeature: str, feature_vals: List[Any], atype: FeatureType) FeatureDataProfile
- abstract get_metric_calculation_params(mask: str = '') Tuple
- get_predictions_cache() PredictionCache[T, S] | None
- abstract get_restricted_features() Tuple[str, ...]
- abstract get_segment_data(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') T | S
- abstract property has_predictions: bool
- abstract histogram(feature_name: str, bins: int = 15)
- abstract property history: List[Tuple[str, str, dict]]
- abstract property id: str
- abstract property id_feature_names: Tuple[str, ...]
Return the names of the features that store ids
- Returns:
A tuple containing the names of the id features
- Return type:
Tuple[str, …]
- abstract property id_features: T
- abstract property id_features_testing: T
- abstract property id_features_training: T
- abstract property id_features_validation: T
- abstract is_categorical_feature(afeature: str) bool
- abstract is_continuous_feature(afeature: str) bool
- abstract is_date_feature(afeature: str) bool
- abstract is_id_feature(afeature: str) bool
- abstract is_numerical(afeature: str) bool
- abstract is_target_categorical() bool
- abstract is_target_continuous() bool
- abstract mean(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Number
- abstract median(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Number
- abstract property name: str
- abstract property number_of_date_features: int
- abstract property number_of_features: int
- abstract property number_of_id_features: int
- abstract property number_of_samples: int
- abstract property number_of_testing_samples: int
- abstract property number_of_training_samples: int
- abstract property number_of_validation_samples: int
- abstract property overall_checksum: str
- abstract property prediction: S
- abstract property prediction_categorical: bool
- abstract property prediction_name: str | None
- abstract property prediction_testing: S
- abstract property prediction_training: S
- abstract property prediction_validation: S
- abstract quantiles(name: str, quantile_vals: List[float], dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') List[Number]
- abstract property target: S
- abstract property target_categorical: bool
- abstract property target_name: str
Return the name of the target
- Returns:
The name of the target
- Return type:
str
- abstract property target_testing: S
- abstract property target_training: S
- abstract property target_validation: S
- class etiq.datasets.BiasDataset
Bases: SimpleDataset[T, S]
Abstract bias dataset class
- abstract get_protected_metric_calculation_params(mask: str = '') Any
- abstract property protected: S
- abstract property protected_name: str
- abstract property protected_testing: S
- abstract property protected_training: S
- abstract property protected_validation: S
- class etiq.datasets.BiasDatasetBuilder
Bases: SimpleDatasetBuilder
- classmethod bias_params(bias_params: BiasParams | None = None, protected: str | None = None, privileged: Any | None = None, unprivileged: Any | None = None, positive_outcome_label: Any | None = None, negative_outcome_label: Any | None = None)
Returns a BiasParams object
- Parameters:
bias_params – A bias params object to clone. This defaults to None if not provided.
protected – Protected feature name, for example ‘gender’. Defaults to None.
privileged – Privileged label within the protected feature, for example ‘male’. This defaults to None if not provided.
unprivileged – Unprivileged label within the protected feature, for example ‘female’. This defaults to None if not provided.
positive_outcome_label – The label of a “positive” outcome within a target feature. Defaults to None.
negative_outcome_label – The label of a “negative” outcome within a target feature. Defaults to None.
- Returns:
A BiasParams object
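The clone-then-override behaviour described for bias_params() can be sketched with a plain dataclass. The field names come from the parameters listed above; the dataclass and helper are illustrative stand-ins, not the real BiasParams implementation:

```python
from dataclasses import dataclass, replace
from typing import Any, Optional

@dataclass
class BiasParamsSketch:
    # Fields mirror the documented BiasParams arguments
    protected: Optional[str] = None
    privileged: Any = None
    unprivileged: Any = None
    positive_outcome_label: Any = None
    negative_outcome_label: Any = None

def bias_params(base: Optional[BiasParamsSketch] = None, **overrides) -> BiasParamsSketch:
    # Clone the given object with any overrides applied;
    # otherwise build a fresh one from the overrides alone
    if base is not None:
        return replace(base, **overrides)
    return BiasParamsSketch(**overrides)

base = BiasParamsSketch(protected="gender", privileged="M", unprivileged="F")
clone = bias_params(base, positive_outcome_label=1)
```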
- classmethod dataset(features: DataFrame, target: DataFrame | None = None, label: str | None = None, prediction: str | None = None, cat_col: List[str] | None = None, cont_col: List[str] | None = None, train_valid_test_splits: Tuple[float, float, float] = (0.8, 0.2, 0.0), id_col: List[str] | None = None, date_col: List[str] | None = None, bias_params: BiasParams | None = None, convert_date_cols: bool = False, datetime_format: str = '', remove_protected_from_features: bool = True, random_seed: int = 2, name: str = None, register_creation: bool = True) BiasDataset
Creates a BiasDataset object given pandas dataframe(s).
Use this dataset builder like:
from etiq import BiasDatasetBuilder
from etiq.biasparams import BiasParams
import pandas as pd

a = [
    ["2022-10-10", 'M', 2, 3, 4, 5, 6, 1],
    ["2022-10-11", 'F', 8, 9, 10, 11, 12, 0],
    ["", 'F', 2, 3, 4, 5, 6, 1],
    ["2022-10-13", 'M', 8, 9, 10, 11, 12, 0],
    ["2022-10-14", 'F', 2, 3, 4, 5, 6, 1],
    ["2022-10-15", 'F', 8, 9, 10, 11, 12, 0],
    ["2022-10-16", 'M', 2, 3, 4, 5, 6, 1],
    ["2022-10-17", 'F', 8, 9, 10, 11, 12, 0],
    ["2022-10-18", 'M', 14, 15, 16, 17, 18, 1],
    ["2022-10-19", 'M', 15, 16, 17, 18, 19, 1],
]
df = pd.DataFrame(a, columns=["start_date", "gender", "age2", "age3",
                              "age4", "age5", "age6", "income"])
adataset = BiasDatasetBuilder.dataset(
    features=df,
    label="income",
    cat_col=["age2", "age3", "income"],
    cont_col=["age4", "age5", "age6"],
    date_col=["start_date"],
    bias_params=BiasParams(protected='gender', privileged='M', unprivileged='F',
                           positive_outcome_label=1, negative_outcome_label=0),
    remove_protected_from_features=True,
    convert_date_cols=True,
    name="test_dataset")
- Parameters:
features – Pandas dataframe containing the dataset features as columns.
target – Pandas dataframe containing the target feature as a column. This defaults to None in which case the target feature is assumed to be either the last column in features dataset or the column name specified in the label argument.
label – The name of the column containing the target. This defaults to None in which case the target is assumed to either be the last column of the features dataframe or the first column of the target dataframe if this is not None.
prediction – The name of the column containing the prediction data. This defaults to None in which case the assumption is that the dataset contains no prediction data.
cat_col – List of categorical features. This defaults to None in which case categorical features are determined automatically.
cont_col – List of continuous features. This defaults to None in which case continuous features are determined automatically.
id_col – List of id features. This defaults to None in which case it is assumed the dataset contains no id features.
date_col – List of datetime features. This defaults to None in which case it is assumed the dataset contains no datetime features.
bias_params – This contains demographic data (the protected feature) needed to create the bias dataset. This defaults to None in which case a fake random protected feature is created.
train_valid_test_splits – This parameter specifies the proportions to use when splitting the data into training, validation and test subsets. This defaults to (0.8, 0.2, 0.0).
random_seed – Random number seed (for reproducibility) used when splitting the data into random training, validation and test subsets. This defaults to 2.
remove_protected_from_features – Set this to True to remove the protected feature from the normal features, i.e. the protected feature is then not considered a feature used by the model. Otherwise the protected feature is treated as a normal feature. This defaults to True.
convert_date_cols – Set this to True to convert any date features into datetime objects. This defaults to False.
datetime_format – The specific datetime format (assumes a common datetime format is used for all datetime features). This defaults to an empty string, in which case the datetime format is guessed.
name – The name to use for the dataset. This defaults to None, in which case a random name is assigned.
register_creation – Set this to True to register the dataset to the database (note that only a hash and/or fingerprint of the data is stored). This defaults to True.
- Returns:
A BiasDataset object.
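The split controlled by train_valid_test_splits and random_seed can be sketched without pandas. The builder splits a dataframe; the function below applies the same shuffle-then-cut idea to any sequence of rows (the rounding strategy is an assumption, not etiq's documented behaviour):

```python
import random
from typing import List, Sequence, Tuple

def split_rows(rows: Sequence,
               splits: Tuple[float, float, float] = (0.8, 0.2, 0.0),
               random_seed: int = 2) -> Tuple[List, List, List]:
    # Shuffle reproducibly, then cut into train/validation/test by proportion
    shuffled = list(rows)
    random.Random(random_seed).shuffle(shuffled)
    n = len(shuffled)
    n_train = round(n * splits[0])
    n_valid = round(n * splits[1])
    return (shuffled[:n_train],
            shuffled[n_train:n_train + n_valid],
            shuffled[n_train + n_valid:])

train, valid, test = split_rows(range(10))
```

With the default (0.8, 0.2, 0.0) proportions, ten rows yield eight training rows, two validation rows, and an empty test set.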
- classmethod datasets(training_features: DataFrame | None = None, training_target: DataFrame | None = None, validation_features: DataFrame | None = None, validation_target: DataFrame | None = None, testing_features: DataFrame | None = None, testing_target: DataFrame | None = None, label: str | None = None, prediction: str | None = None, cat_col: List[str] | None = None, cont_col: List[str] | None = None, bias_params: BiasParams | None = None, remove_protected_from_features: bool = True, id_col: List[str] | None = None, date_col: List[str] | None = None, convert_date_cols: bool = False, datetime_format: str = '', name: str | None = None, register_creation: bool = True) BiasDataset
Creates a BiasDataset object given pandas dataframe(s).
Use this builder like:
from etiq import BiasDatasetBuilder
from etiq.biasparams import BiasParams
import pandas as pd

training = [
    ["2022-10-10", 'M', 2, 3, 4, 5, 6, 1],
    ["2022-10-11", 'F', 8, 9, 10, 11, 12, 0],
    ["", 'F', 2, 3, 4, 5, 6, 1],
    ["2022-10-13", 'M', 8, 9, 10, 11, 12, 0],
    ["2022-10-14", 'F', 2, 3, 4, 5, 6, 1],
]
validation = [
    ["2022-10-15", 'F', 8, 9, 10, 11, 12, 0],
    ["2022-10-16", 'M', 2, 3, 4, 5, 6, 1],
    ["2022-10-17", 'F', 8, 9, 10, 11, 12, 0],
    ["2022-10-18", 'M', 14, 15, 16, 17, 18, 1],
    ["2022-10-19", 'M', 15, 16, 17, 18, 19, 1],
]
df1 = pd.DataFrame(training, columns=["start_date", "gender", "age2", "age3",
                                      "age4", "age5", "age6", "income"])
df2 = pd.DataFrame(validation, columns=["start_date", "gender", "age2", "age3",
                                        "age4", "age5", "age6", "income"])
adataset = BiasDatasetBuilder.datasets(
    training_features=df1,
    validation_features=df2,
    label="income",
    cat_col=["age2", "age3", "income"],
    cont_col=["age4", "age5", "age6"],
    date_col=["start_date"],
    bias_params=BiasParams(protected='gender', privileged='M', unprivileged='F',
                           positive_outcome_label=1, negative_outcome_label=0),
    remove_protected_from_features=True,
    convert_date_cols=True,
    name="test_dataset")
- Parameters:
training_features – Pandas dataframe containing the training dataset features. This defaults to None in which case we assume there is no training data.
training_target – Pandas dataframe containing the target training data as a column. This defaults to None in which case the target feature is assumed to be either the last column in features dataset or the column name specified in the label argument.
validation_features – Pandas dataframe containing the validation dataset features. This defaults to None in which case we assume there is no validation data.
validation_target – Pandas dataframe containing the target validation data as a column. This defaults to None in which case the target feature is assumed to be either the last column in validation features dataset or the column name specified in the label argument.
testing_features – Pandas dataframe containing the testing dataset features. This defaults to None in which case we assume there is no testing data.
testing_target – Pandas dataframe containing the target testing data as a column. This defaults to None in which case the target feature is assumed to be either the last column in testing features dataset or the column name specified in the label argument.
label – The name of the column containing the target. This defaults to None in which case the target is assumed to either be the last column of the features dataframe or the first column of the target dataframe if this is not None.
prediction – The name of the column containing the prediction data. This defaults to None in which case the assumption is that the dataset contains no prediction data.
cat_col – List of categorical features. This defaults to None in which case categorical features are determined automatically.
cont_col – List of continuous features. This defaults to None in which case continuous features are determined automatically.
id_col – List of id features. This defaults to None in which case it is assumed the dataset contains no id features.
date_col – List of datetime features. This defaults to None in which case it is assumed the dataset contains no datetime features.
bias_params – This contains demographic data (the protected feature) needed to create the bias dataset. This defaults to None in which case a fake random protected feature is created.
remove_protected_from_features – Set this to True to remove the protected feature from the normal features, i.e. the protected feature is then not considered a feature used by the model. Otherwise the protected feature is treated as a normal feature. This defaults to True.
convert_date_cols – Set this to True to convert any date features into datetime objects. This defaults to False.
datetime_format – The specific datetime format (assumes a common datetime format is used for all datetime features). This defaults to an empty string, in which case the datetime format is guessed.
name – The name to use for the dataset. This defaults to None, in which case a random name is assigned.
register_creation – Set this to True to register the dataset to the database (note that only a hash and/or fingerprint of the data is stored). This defaults to True.
- Returns:
A BiasDataset object.
- class etiq.datasets.DatasetSegment(value)
Bases: Enum
An enumeration.
- ALL = 0
- TESTING = 3
- TRAINING = 1
- VALIDATION = 2
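The enumeration above maps directly onto a standard Enum; a sketch with the same member values, showing lookup by value:

```python
from enum import Enum

class DatasetSegment(Enum):
    # Member values as documented above
    ALL = 0
    TRAINING = 1
    VALIDATION = 2
    TESTING = 3

# Methods such as mean() and filter() take one of these to select a subset
segment = DatasetSegment(3)
```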
- class etiq.datasets.PredictionCache
Bases: Generic[T, S]
- abstract get_data(condition: str) T
- abstract get_predicted(condition: str = '') S | str
- abstract get_protected(condition: str = '') S | str | None
- abstract get_target(condition: str = '') S | str
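A minimal in-memory sketch of the PredictionCache idea, keyed by the filter-condition string. The real class is abstract and generic over the backend's table and series types; the dict-backed store and the exact payload shape here are assumptions:

```python
from typing import Any, Dict

class InMemoryPredictionCache:
    """Illustrative cache: stores predictions per filter condition."""
    def __init__(self) -> None:
        self._store: Dict[str, Dict[str, Any]] = {}

    def cache_predictions(self, predictions: Any, prediction_name: str,
                          condition: str) -> None:
        # One cache entry per condition string
        self._store[condition] = {"data": predictions, "name": prediction_name}

    def get_data(self, condition: str) -> Any:
        return self._store[condition]["data"]

    def get_predicted(self, condition: str = "") -> str:
        # Return the cached prediction column name for the condition
        return self._store[condition]["name"]

cache = InMemoryPredictionCache()
cache.cache_predictions([0, 1, 1], "pred_income", condition="gender == 'F'")
```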
- class etiq.datasets.SimpleDataset
Bases: AbstractDataset[T, S]
Abstract simple dataset class
- class etiq.datasets.SimpleDatasetBuilder
Bases: object
A builder for the SimpleDataset class
- classmethod dataset(features: DataFrame, target: DataFrame | None = None, label: str | None = None, prediction: str | None = None, cat_col: List[str] | None = None, cont_col: List[str] | None = None, id_col: List[str] | None = None, date_col: List[str] | None = None, train_valid_test_splits: Tuple[float, float, float] = (0.8, 0.2, 0.0), random_seed: int = 2, convert_date_cols: bool = False, datetime_format: str = '', name: str | None = None, register_creation: bool = True) SimpleDataset
Creates a SimpleDataset object given pandas dataframe(s).
Use this builder like:
from etiq import SimpleDatasetBuilder
import pandas as pd

a = [
    ["2022-10-10", 2, 3, 4, 5, 6, 1],
    ["2022-10-11", 8, 9, 10, 11, 12, 0],
    ["", 2, 3, 4, 5, 6, 1],
    ["2022-10-13", 8, 9, 10, 11, 12, 0],
    ["2022-10-14", 2, 3, 4, 5, 6, 1],
    ["2022-10-15", 8, 9, 10, 11, 12, 0],
    ["2022-10-16", 2, 3, 4, 5, 6, 1],
    ["2022-10-17", 8, 9, 10, 11, 12, 0],
    ["2022-10-18", 14, 15, 16, 17, 18, 1],
    ["2022-10-19", 15, 16, 17, 18, 19, 1],
]
df = pd.DataFrame(a, columns=["start_date", "age2", "age3", "age4", "age5", "age6", "income"])
adataset = SimpleDatasetBuilder.dataset(
    features=df,
    label="income",
    cat_col=["age2", "age3", "income"],
    cont_col=["age4", "age5", "age6"],
    date_col=["start_date"],
    convert_date_cols=True,
    name="test_dataset",
)
- Parameters:
features – Pandas dataframe containing the dataset features as columns.
target – Pandas dataframe containing the target feature as a column. This defaults to None, in which case the target feature is assumed to be either the last column of the features dataframe or the column named by the label argument.
label – The name of the column containing the target. This defaults to None in which case the target is assumed to either be the last column of the features dataframe or the first column of the target dataframe if this is not None.
prediction – The name of the column containing the prediction data. This defaults to None in which case the assumption is that the dataset contains no prediction data.
cat_col – List of categorical features. This defaults to None in which case categorical features are determined automatically.
cont_col – List of continuous features. This defaults to None in which case continuous features are determined automatically.
id_col – List of id features. This defaults to None in which case it is assumed the dataset contains no id features.
date_col – List of datetime features. This defaults to None in which case it is assumed the dataset contains no datetime features.
train_valid_test_splits – This parameter specifies the proportions to use when splitting the data into training, validation and test subsets. This defaults to (0.8, 0.2, 0.0).
random_seed – Random number seed used when splitting the data into random training, validation and test subsets. This defaults to 2.
convert_date_cols – Set this to True to convert any date features into datetime objects. This defaults to False.
datetime_format – The specific datetime format (a common datetime format is assumed for all datetime features). This defaults to an empty string, in which case the datetime format is guessed.
name – The name to use for the dataset. This defaults to None, in which case a random name is assigned.
register_creation – Set this to True to register the dataset to the database (note that only a hash and/or fingerprint of the data is stored). This defaults to True.
- Returns:
A SimpleDataset object.
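The train_valid_test_splits proportions and random_seed amount to a seeded random partition of the rows. A minimal standard-library sketch of that idea (the split_indices helper is illustrative, not etiq's actual splitting code):

```python
import random

def split_indices(n_rows, splits=(0.8, 0.2, 0.0), seed=2):
    """Shuffle row indices with a fixed seed, then cut them into
    training/validation/testing slices by the given proportions."""
    rng = random.Random(seed)
    indices = list(range(n_rows))
    rng.shuffle(indices)
    n_train = round(n_rows * splits[0])
    n_valid = round(n_rows * splits[1])
    return (indices[:n_train],
            indices[n_train:n_train + n_valid],
            indices[n_train + n_valid:])

train, valid, test = split_indices(10)
print(len(train), len(valid), len(test))  # 8 2 0
```

Because the seed is fixed, repeated calls with the same arguments reproduce the same partition.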
- classmethod datasets(training_features: DataFrame | None = None, training_target: DataFrame | None = None, validation_features: DataFrame | None = None, validation_target: DataFrame | None = None, testing_features: DataFrame | None = None, testing_target: DataFrame | None = None, label: str | None = None, prediction: str | None = None, cat_col: List[str] | None = None, cont_col: List[str] | None = None, id_col: List[str] | None = None, date_col: List[str] | None = None, convert_date_cols=False, datetime_format='', name: str | None = None, register_creation: bool = True) SimpleDataset
Creates a SimpleDataset object given pandas dataframe(s).
Use this builder like:
from etiq import SimpleDatasetBuilder
import pandas as pd

training = [
    ["2022-10-10", 2, 3, 4, 5, 6, 1],
    ["2022-10-11", 8, 9, 10, 11, 12, 0],
    ["", 2, 3, 4, 5, 6, 1],
    ["2022-10-13", 8, 9, 10, 11, 12, 0],
    ["2022-10-14", 2, 3, 4, 5, 6, 1],
]
validation = [
    ["2022-10-15", 8, 9, 10, 11, 12, 0],
    ["2022-10-16", 2, 3, 4, 5, 6, 1],
    ["2022-10-17", 8, 9, 10, 11, 12, 0],
    ["2022-10-18", 14, 15, 16, 17, 18, 1],
    ["2022-10-19", 15, 16, 17, 18, 19, 1],
]
df1 = pd.DataFrame(training, columns=["start_date", "age2", "age3", "age4", "age5", "age6", "income"])
df2 = pd.DataFrame(validation, columns=["start_date", "age2", "age3", "age4", "age5", "age6", "income"])
adataset = SimpleDatasetBuilder.datasets(
    training_features=df1,
    validation_features=df2,
    label="income",
    cat_col=["age2", "age3", "income"],
    cont_col=["age4", "age5", "age6"],
    date_col=["start_date"],
    convert_date_cols=True,
    name="test_dataset",
)
- Parameters:
training_features – Pandas dataframe containing the training dataset features. This defaults to None in which case we assume there is no training data.
training_target – Pandas dataframe containing the training target data as a column. This defaults to None, in which case the target feature is assumed to be either the last column of the training features dataframe or the column named by the label argument.
validation_features – Pandas dataframe containing the validation dataset features. This defaults to None in which case we assume there is no validation data.
validation_target – Pandas dataframe containing the validation target data as a column. This defaults to None, in which case the target feature is assumed to be either the last column of the validation features dataframe or the column named by the label argument.
testing_features – Pandas dataframe containing the testing dataset features. This defaults to None in which case we assume there is no testing data.
testing_target – Pandas dataframe containing the testing target data as a column. This defaults to None, in which case the target feature is assumed to be either the last column of the testing features dataframe or the column named by the label argument.
label – The name of the column containing the target. This defaults to None in which case the target is assumed to either be the last column of the features dataframe or the first column of the target dataframe if this is not None.
prediction – The name of the column containing the prediction data. This defaults to None in which case the assumption is that the dataset contains no prediction data.
cat_col – List of categorical features. This defaults to None in which case categorical features are determined automatically.
cont_col – List of continuous features. This defaults to None in which case continuous features are determined automatically.
id_col – List of id features. This defaults to None in which case it is assumed the dataset contains no id features.
date_col – List of datetime features. This defaults to None in which case it is assumed the dataset contains no datetime features.
convert_date_cols – Set this to True to convert any date features into datetime objects. This defaults to False.
datetime_format – The specific datetime format (a common datetime format is assumed for all datetime features). This defaults to an empty string, in which case the datetime format is guessed.
name – The name to use for the dataset. This defaults to None, in which case a random name is assigned.
register_creation – Set this to True to register the dataset to the database (note that only a hash and/or fingerprint of the data is stored). This defaults to True.
- Returns:
A SimpleDataset object.
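The convert_date_cols/datetime_format pair amounts to parsing date strings either with one explicit shared format or by guessing. A small standard-library illustration of the explicit-format case (the parse_dates helper is illustrative; the format-guessing etiq performs when datetime_format is empty is not reproduced here):

```python
from datetime import datetime

def parse_dates(values, datetime_format="%Y-%m-%d"):
    """Parse date strings with one shared format; empty strings become None
    (mirroring the missing start_date value in the example data above)."""
    return [datetime.strptime(v, datetime_format) if v else None for v in values]

parsed = parse_dates(["2022-10-10", "", "2022-10-13"])
print(parsed[0].year, parsed[1])  # 2022 None
```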
- etiq.datasets.calculate_conditional_probs(base_dataset: AbstractDataset, comparison_dataset: AbstractDataset, number_of_bins: int = 10) Dict[str, Tuple[ndarray, ndarray]]
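The return type above suggests a per-feature pair of binned probability arrays for the base and comparison datasets. The sketch below is only a guess at the general idea of comparing binned distributions (equal-width bins over the combined range), in pure Python rather than etiq's actual implementation, and operates on raw value lists rather than AbstractDataset objects:

```python
def binned_probs(base, comparison, number_of_bins=10):
    """Equal-width binning over the combined range of both samples,
    then per-bin probability mass for each sample."""
    lo = min(min(base), min(comparison))
    hi = max(max(base), max(comparison))
    width = (hi - lo) / number_of_bins or 1.0  # guard against a zero-width range
    def probs(sample):
        counts = [0] * number_of_bins
        for x in sample:
            # Clamp the top edge into the last bin.
            idx = min(int((x - lo) / width), number_of_bins - 1)
            counts[idx] += 1
        return [c / len(sample) for c in counts]
    return probs(base), probs(comparison)

p_base, p_comp = binned_probs([1, 2, 3, 4], [1, 1, 4, 4], number_of_bins=2)
print(p_base, p_comp)  # [0.5, 0.5] [0.5, 0.5]
```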