etiq.datasets.pandas package

Submodules

etiq.datasets.pandas.base_pandas_dataset module

class etiq.datasets.pandas.base_pandas_dataset.BasePandasDatasetMixin(x_train: DataFrame | None = None, y_train: Series | None = None, prediction_train: Series | None = None, x_valid: DataFrame | None = None, y_valid: Series | None = None, prediction_valid: Series | None = None, x_test: DataFrame | None = None, y_test: Series | None = None, prediction_test: Series | None = None, feature_names: Sequence[str] | None = None, categorical_features: Sequence[str] | None = None, continuous_features: Sequence[str] | None = None, date_features: Sequence[str] | None = None, id_features: Sequence[str] | None = None, history: List[Tuple[str, str, dict]] | None = None, target_name: str | None = None, prediction_name: str | None = None, name: str | None = None, target_categorical: bool = True, prediction_categorical: bool = True)

Bases: object

Mixin implementing the dataset functionality shared by SimplePandasDataset and BiasPandasDataset using pandas and numpy
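
The mixin is not meant to be instantiated on its own; the concrete classes documented further down this page (SimplePandasDataset and BiasPandasDataset) inherit from it. The sketch below builds a SimplePandasDataset directly from its constructor to show the metadata properties the mixin exposes. The column names are purely illustrative, and constructing the dataset from raw frames like this (rather than through a builder) is an assumption based on the signature above.

    import pandas as pd

    from etiq.datasets.pandas import SimplePandasDataset

    # Toy training split with illustrative column names (not part of the library).
    x_train = pd.DataFrame({
        "customer_id": [1, 2, 3],
        "signup_date": pd.to_datetime(["2023-01-05", "2023-02-11", "2023-03-20"]),
        "region": ["north", "south", "north"],
        "age": [34, 51, 27],
        "income": [48000.0, 72000.0, 39000.0],
    })
    y_train = pd.Series([1, 0, 1], name="approved")

    dataset = SimplePandasDataset(
        x_train=x_train,
        y_train=y_train,
        categorical_features=["region"],
        continuous_features=["age", "income"],
        date_features=["signup_date"],
        id_features=["customer_id"],
        target_name="approved",
        target_categorical=True,
        name="toy-dataset",
    )

    print(dataset.all_feature_names)      # every feature stored in the dataset
    print(dataset.feature_names)          # excludes the id and date features
    print(dataset.categorical_features)   # names declared categorical above
    print(dataset.number_of_training_samples)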

property all_feature_names: Tuple[str, ...]

Return the names of the features stored in the dataset

Returns:

A tuple containing the names of all features in the dataset

Return type:

Tuple[str, …]

property all_features: DataFrame
all_identical(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') bool
property categorical_features: Tuple[str, ...]

Return the names of the categorical features stored in the dataset

Returns:

A tuple containing the names of all categorical features in the dataset

Return type:

Tuple[str, …]

check_column_ordering_violations(gt_feature: str, lt_feature: str, ids: Tuple[str, ...] | None = None) Iterable
check_duplicate_row_violations(duplicate_features_subset: Tuple[str, ...] | None = None, ids: Tuple[str, ...] | None = None) Iterable
check_missing_ids(id_feature: str, primary_ids: List[str] | None = None) Iterable
property checksum: Dict[str, str]
property continuous_features: Tuple[str, ...]

Return the names of the continuous features stored in the dataset

Returns:

A tuple containing the names of all continuous features in the dataset

Return type:

Tuple[str, …]

property data_backend: DataBackend
property date_feature_names: Tuple[str, ...]

Return the names of the features that store dates

Returns:

A tuple containing the names of the date features

Return type:

Tuple[str, …]

property date_features: DataFrame
property date_features_testing: DataFrame
property date_features_training: DataFrame
property date_features_validation: DataFrame
feature(name: str, mask: str = '') Series

Returns all data in the dataset for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The data for the specified feature

Return type:

pd.Series

feature_combined_std(other_dataset: AbstractDataset[DataFrame, Series], name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') float
feature_max(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Any
feature_min(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Any
property feature_names: Tuple[str, ...]

Return the names of the features stored in the dataset excluding the id and date features

Returns:

A tuple containing the names of all features in the dataset excluding the id and date features

Return type:

Tuple[str, …]

feature_testing(name: str, mask: str = '') Series

Returns testing data for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The testing data for the specified feature

Return type:

pd.Series

feature_training(name: str, mask: str = '') Series

Returns training data for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The training data for the specified feature

Return type:

pd.Series

feature_validation(name: str, mask: str = '') Series

Returns validation data for the specified feature.

Parameters:

name (str) – The name of the feature.

Returns:

The validation data for the specified feature

Return type:

pd.Series
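
Continuing the hypothetical dataset from the first sketch above, the per-split accessors look like this; the mask parameter defaults to an empty string and is left untouched here:

    # Column-wise access over all samples, or restricted to one split.
    ages = dataset.feature("age")                 # pd.Series for every sample
    ages_train = dataset.feature_training("age")  # training split only

    # feature_validation("age") and feature_testing("age") follow the same
    # pattern when x_valid / x_test were supplied to the constructor.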

property features: DataFrame
property features_testing: DataFrame
property features_training: DataFrame
property features_validation: DataFrame
filter_count(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) int
generate_data_profile(group_by: str | None = None) DataProfile

Create a data profile for the dataset

Parameters:

group_by (str) – Optional field to group the profile by.
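
A short sketch of profiling, again on the hypothetical dataset from the first example; grouping by the illustrative region column is only meant to show the shape of the call:

    profile = dataset.generate_data_profile()
    # Optionally group the profile by a single field.
    regional_profile = dataset.generate_data_profile(group_by="region")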

get_dataset_checksum()
get_dataset_history()
get_eval(segment: DatasetSegment, condition: str)
get_feature_categories(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Tuple[Any, ...]
get_feature_profile(afeature: str, feature_vals: List[Any], atype: FeatureType, condition: str | None = None)
get_metric_calculation_params(mask: str = '') Tuple
get_restricted_features() Tuple[str, ...]
get_segment_data(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '')
property has_predictions: bool
histogram(feature_name: str, bins: int = 15)
property history: List[Tuple[str, str, dict]]
property id: str
property id_feature_names: Tuple[str, ...]

Return the names of the features that store ids

Returns:

A tuple containing the names of the id features

Return type:

Tuple[str, …]

property id_features: DataFrame
property id_features_testing: DataFrame
property id_features_training: DataFrame
property id_features_validation: DataFrame
is_categorical_feature(afeature: str) bool
is_continuous_feature(afeature: str) bool
is_date_feature(afeature: str) bool
is_id_feature(afeature: str) bool
is_numerical(afeature: str) bool
is_target_categorical() bool
is_target_continuous() bool
mean(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Number
median(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') Number
property name: str
property number_of_date_features: int
property number_of_features: int
property number_of_id_features: int
property number_of_samples: int
property number_of_testing_samples: int
property number_of_training_samples: int
property number_of_validation_samples: int
property overall_checksum: str
property prediction: Series
property prediction_categorical: bool
property prediction_name: str | None
property prediction_testing: Series
property prediction_training: Series
property prediction_validation: Series
quantiles(name: str, quantile_vals: List[float], dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '') List[Number]
property target: Series
property target_categorical: bool
property target_name: str
property target_testing: Series
property target_training: Series
property target_validation: Series
to_dataframe() DataFrame

Return this dataset as a pandas DataFrame instance
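
The numeric helpers (feature_min, feature_max, mean, median, quantiles, histogram) and to_dataframe can be sketched as follows, still using the hypothetical dataset from the first example and keeping the default DatasetSegment.ALL:

    # Summary statistics for a continuous feature across all segments.
    avg_income = dataset.mean("income")
    med_income = dataset.median("income")
    income_range = (dataset.feature_min("income"), dataset.feature_max("income"))
    income_quartiles = dataset.quantiles("income", [0.25, 0.5, 0.75])

    # Histogram with the default 15 bins, then a round trip back to pandas.
    income_hist = dataset.histogram("income")
    df = dataset.to_dataframe()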

etiq.datasets.pandas.base_pandas_dataset.query_eval(df: DataFrame, aquery: str) ndarray | None
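
query_eval is a module-level helper; judging from its signature it evaluates a query string against a DataFrame and returns an ndarray (or None). A minimal sketch, assuming the query string follows pandas DataFrame.query syntax:

    import pandas as pd

    from etiq.datasets.pandas.base_pandas_dataset import query_eval

    df = pd.DataFrame({"age": [34, 51, 27], "income": [48000.0, 72000.0, 39000.0]})
    result = query_eval(df, "age > 30")  # ndarray describing the matching rows, or None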

etiq.datasets.pandas.bias_pandas_dataset module

class etiq.datasets.pandas.bias_pandas_dataset.BiasPandasDataset(x_train: DataFrame | None = None, y_train: Series | None = None, protected_train: Series | None = None, prediction_train: Series | None = None, x_valid: DataFrame | None = None, y_valid: Series | None = None, protected_valid: Series | None = None, prediction_valid: Series | None = None, x_test: DataFrame | None = None, y_test: Series | None = None, protected_test: Series | None = None, prediction_test: Series | None = None, col: Dict[str, List] | None = None, distances_train: ndarray | None = None, indices_train: ndarray | None = None, distances_valid: ndarray | None = None, indices_valid: ndarray | None = None, history: List | None = None, target_name: str | None = None, target_categorical: bool = True, protected_name: str | None = None, prediction_name: str | None = None, prediction_categorical: bool = True, id_features: List[str] | None = None, date_features: List[str] | None = None, name: str | None = None, is_algorithmic: bool = False)

Bases: BasePandasDatasetMixin, BiasDataset[DataFrame, Series]

cache_predictions(predictions: DataFrame, prediction_name: str, condition: str = '') None
check_column_ordering_violations(gt_feature: str, lt_feature: str, ids: Tuple[str, ...] | None = None) Iterator
compare_feature(other_dataset: AbstractDataset, afeature: str)
filter(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) BiasPandasDataset
generate_data_profile(group_by: str | None = None) DataProfile

Create a data profile for the dataset

Parameters:

group_by (str) – Optional field to group the profile by.

get_dataset_checksum()
get_dataset_history()
get_predictions_cache() BiasPandasPredictionCache | None
get_protected_metric_calculation_params(mask: str = '') Any
get_segment_data(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '')
property protected: Series
property protected_name: str
property protected_testing: Series
property protected_training: Series
property protected_validation: Series
to_dataframe()

Return this dataset as a pandas DataFrame instance
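
A construction sketch for BiasPandasDataset, mirroring the earlier SimplePandasDataset example. The protected attribute (an illustrative gender column) and the minimal set of constructor arguments used here are assumptions based on the signature above, not a prescribed recipe:

    import pandas as pd

    from etiq.datasets.pandas import BiasPandasDataset

    x_train = pd.DataFrame({
        "age": [34, 51, 27, 45],
        "income": [48000.0, 72000.0, 39000.0, 61000.0],
    })
    y_train = pd.Series([1, 0, 1, 0], name="approved")
    protected_train = pd.Series(["f", "m", "f", "m"], name="gender")

    bias_dataset = BiasPandasDataset(
        x_train=x_train,
        y_train=y_train,
        protected_train=protected_train,
        target_name="approved",
        protected_name="gender",
        name="toy-bias-dataset",
    )

    # Protected-attribute accessors layered on top of the base mixin.
    protected_all = bias_dataset.protected
    protected_tr = bias_dataset.protected_training

    # Segment the dataset; the condition syntax is assumed to be a pandas-style query.
    over_30 = bias_dataset.filter("age > 30")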

class etiq.datasets.pandas.bias_pandas_dataset.BiasPandasPredictionCache(df: DataFrame, target_label: str, protected_label: str, predicted_label: str)

Bases: PredictionCache[DataFrame, Series]

get_cache_query(condition)
get_data(condition: str = '') DataFrame
get_predicted(condition: str = '') Series | str
get_protected(condition: str = '') Series | str | None
get_target(condition: str = '') Series | str
set_cache(df: DataFrame, target_label: str, protected_label: str, predicted_label: str)
set_eval_cache(eval_cache: Dict[str, Any])
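
A sketch of the prediction cache round trip, continuing the hypothetical bias_dataset above. The shape of the predictions frame and the meaning of the condition argument are assumptions, since neither is spelled out by the signatures:

    # Cache a frame of model predictions under a named prediction column.
    predictions = pd.DataFrame({"predicted_approved": [1, 0, 1, 1]})
    bias_dataset.cache_predictions(predictions, prediction_name="predicted_approved")

    cache = bias_dataset.get_predictions_cache()
    if cache is not None:
        cached_df = cache.get_data()        # the cached DataFrame
        predicted = cache.get_predicted()   # predicted values (Series or column name)
        protected = cache.get_protected()   # protected attribute, if available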

etiq.datasets.pandas.simple_pandas_dataset module

class etiq.datasets.pandas.simple_pandas_dataset.SimplePandasDataset(x_train: DataFrame | None = None, y_train: Series | None = None, prediction_train: Series | None = None, x_valid: DataFrame | None = None, y_valid: Series | None = None, prediction_valid: Series | None = None, x_test: DataFrame | None = None, y_test: Series | None = None, prediction_test: Series | None = None, feature_names: Sequence[str] | None = None, categorical_features: Sequence[str] | None = None, continuous_features: Sequence[str] | None = None, date_features: Sequence[str] | None = None, id_features: Sequence[str] | None = None, history: List | None = None, target_name: str | None = None, prediction_name: str | None = None, name: str | None = None, target_categorical: bool = True, prediction_categorical: bool = True, is_algorithmic: bool = False)

Bases: BasePandasDatasetMixin, SimpleDataset[DataFrame, Series]

Implementation of the Simple dataset class using pandas and numpy

cache_predictions(predictions: DataFrame, prediction_name: str, condition: str) None
compare_feature(other_dataset: AbstractDataset[DataFrame, Series], afeature: str)
filter(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) SimplePandasDataset
get_predictions_cache() SimplePandasPredictionCache | None
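
The simple variant adds filtering and cross-dataset feature comparison on top of the mixin. A brief sketch, reusing the dataset from the first example and assuming pandas-query-style condition strings:

    # filter returns a new SimplePandasDataset restricted to the matching rows.
    adults = dataset.filter("age >= 18")

    # Compare the distribution of a single feature against another dataset.
    comparison = dataset.compare_feature(adults, "income")
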
class etiq.datasets.pandas.simple_pandas_dataset.SimplePandasPredictionCache(df: DataFrame, target_label: str, predicted_label: str)

Bases: PredictionCache[DataFrame, Series]

get_cache_query(condition)
get_data(condition: str = '') DataFrame
get_predicted(condition: str = '') Series | str
get_protected(condition: str = '') Series | str | None
get_target(condition: str = '') Series | str
set_cache(df: DataFrame, target_label: str, predicted_label: str)
set_eval_cache(eval_cache: Dict[str, Any])

Module contents

class etiq.datasets.pandas.BiasPandasDataset(x_train: DataFrame | None = None, y_train: Series | None = None, protected_train: Series | None = None, prediction_train: Series | None = None, x_valid: DataFrame | None = None, y_valid: Series | None = None, protected_valid: Series | None = None, prediction_valid: Series | None = None, x_test: DataFrame | None = None, y_test: Series | None = None, protected_test: Series | None = None, prediction_test: Series | None = None, col: Dict[str, List] | None = None, distances_train: ndarray | None = None, indices_train: ndarray | None = None, distances_valid: ndarray | None = None, indices_valid: ndarray | None = None, history: List | None = None, target_name: str | None = None, target_categorical: bool = True, protected_name: str | None = None, prediction_name: str | None = None, prediction_categorical: bool = True, id_features: List[str] | None = None, date_features: List[str] | None = None, name: str | None = None, is_algorithmic: bool = False)

Bases: BasePandasDatasetMixin, BiasDataset[DataFrame, Series]

cache_predictions(predictions: DataFrame, prediction_name: str, condition: str = '') None
check_column_ordering_violations(gt_feature: str, lt_feature: str, ids: Tuple[str, ...] | None = None) Iterator
compare_feature(other_dataset: AbstractDataset, afeature: str)
filter(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) BiasPandasDataset
generate_data_profile(group_by: str | None = None) DataProfile

Create a data profile for the dataset

Parameters:

group_by (str) – Optional field to group the profile by.

get_dataset_checksum()
get_dataset_history()
get_predictions_cache() BiasPandasPredictionCache | None
get_protected_metric_calculation_params(mask: str = '') Any
get_segment_data(name: str, dataset_segment: DatasetSegment = DatasetSegment.ALL, mask: str = '')
property protected: Series
property protected_name: str
property protected_testing: Series
property protected_training: Series
property protected_validation: Series
to_dataframe()

Return this dataset as a pandas DataFrame instance

class etiq.datasets.pandas.SimplePandasDataset(x_train: DataFrame | None = None, y_train: Series | None = None, prediction_train: Series | None = None, x_valid: DataFrame | None = None, y_valid: Series | None = None, prediction_valid: Series | None = None, x_test: DataFrame | None = None, y_test: Series | None = None, prediction_test: Series | None = None, feature_names: Sequence[str] | None = None, categorical_features: Sequence[str] | None = None, continuous_features: Sequence[str] | None = None, date_features: Sequence[str] | None = None, id_features: Sequence[str] | None = None, history: List | None = None, target_name: str | None = None, prediction_name: str | None = None, name: str | None = None, target_categorical: bool = True, prediction_categorical: bool = True, is_algorithmic: bool = False)

Bases: BasePandasDatasetMixin, SimpleDataset[DataFrame, Series]

Implementation of the Simple dataset class using pandas and numpy

cache_predictions(predictions: DataFrame, prediction_name: str, condition: str) None
compare_feature(other_dataset: AbstractDataset[DataFrame, Series], afeature: str)
filter(condition: str, dataset_segment: DatasetSegment = DatasetSegment.ALL) SimplePandasDataset
get_predictions_cache() SimplePandasPredictionCache | None