bikes.core.schemas

Define and validate dataframe schemas.

View Source

  1"""Define and validate dataframe schemas."""
  2
  3# %% IMPORTS
  4
  5import typing as T
  6
  7import pandas as pd
  8import pandera as pa
  9import pandera.typing as papd
 10import pandera.typing.common as padt
 11
 12# %% TYPES
 13
 14# Type variable for a schema class, bound to pa.DataFrameModel (used to type Schema.check)
 15TSchema = T.TypeVar("TSchema", bound="pa.DataFrameModel")
 16
 17# %% SCHEMAS
 18
 19
 20class Schema(pa.DataFrameModel):
 21    """Base class for a dataframe schema.
 22
 23    Use a schema to type your dataframe objects,
 24    e.g., to communicate and validate their fields.
 25    """
 26
 27    class Config:
 28        """Default configurations for all schemas.
 29
 30        Parameters:
 31            coerce (bool): convert columns to the declared data types when possible.
 32            strict (bool): reject dataframes with columns not declared in the schema.
 33        """
 34
 35        coerce: bool = True
 36        strict: bool = True
 37
 38    @classmethod
 39    def check(cls: T.Type[TSchema], data: pd.DataFrame) -> papd.DataFrame[TSchema]:
 40        """Validate the dataframe against this schema.
 41
 42        Args:
 43            data (pd.DataFrame): dataframe to check.
 44
 45        Returns:
 46            papd.DataFrame[TSchema]: result of `cls.validate(data)`, typed with this schema.
 47        """
 48        return T.cast(papd.DataFrame[TSchema], cls.validate(data))  # cast only narrows the static type
 49
 50
 51class InputsSchema(Schema):
 52    """Schema for the project inputs: an `instant` index and 15 typed columns."""
 53
 54    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)  # declared as the dataframe index, not a column
 55    dteday: papd.Series[padt.DateTime] = pa.Field()
 56    season: papd.Series[padt.UInt8] = pa.Field(isin=[1, 2, 3, 4])
 57    yr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=1)  # NOTE(review): only 0 or 1 accepted; presumably a year offset - confirm
 58    mnth: papd.Series[padt.UInt8] = pa.Field(ge=1, le=12)
 59    hr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=23)
 60    holiday: papd.Series[padt.Bool] = pa.Field()
 61    weekday: papd.Series[padt.UInt8] = pa.Field(ge=0, le=6)
 62    workingday: papd.Series[padt.Bool] = pa.Field()
 63    weathersit: papd.Series[padt.UInt8] = pa.Field(ge=1, le=4)
 64    temp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)  # NOTE(review): float16 keeps ~3 significant decimal digits - confirm precision is intended
 65    atemp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
 66    hum: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
 67    windspeed: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
 68    casual: papd.Series[padt.UInt32] = pa.Field(ge=0)  # NOTE(review): presumably cnt = casual + registered; if so these two columns leak the target - confirm
 69    registered: papd.Series[padt.UInt32] = pa.Field(ge=0)
 70
 71
 72Inputs = papd.DataFrame[InputsSchema]  # type alias: dataframe typed by InputsSchema
 73
 74
 75class TargetsSchema(Schema):
 76    """Schema for the project targets: an `instant` index and a `cnt` column."""
 77
 78    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)  # same index name and dtype as InputsSchema
 79    cnt: papd.Series[padt.UInt32] = pa.Field(ge=0)
 80
 81
 82Targets = papd.DataFrame[TargetsSchema]  # type alias: dataframe typed by TargetsSchema
 83
 84
 85class OutputsSchema(Schema):
 86    """Schema for the project outputs: an `instant` index and a `prediction` column."""
 87
 88    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)  # same index name and dtype as InputsSchema
 89    prediction: papd.Series[padt.UInt32] = pa.Field(ge=0)  # same dtype and bound as TargetsSchema.cnt
 90
 91
 92Outputs = papd.DataFrame[OutputsSchema]  # type alias: dataframe typed by OutputsSchema
 93
 94
 95class SHAPValuesSchema(Schema):
 96    """Schema for the project SHAP values."""
 97
 98    class Config:
 99        """Default configurations for this schema.
100
101        Parameters:
102            dtype (str): data type applied to the whole dataframe.
103            strict (bool): reject undeclared columns; disabled as no column is declared here.
104        """
105
106        dtype: str = "float32"
107        strict: bool = False  # overrides the Schema.Config default (strict = True)
108
109
110SHAPValues = papd.DataFrame[SHAPValuesSchema]  # type alias: dataframe typed by SHAPValuesSchema
111
112
113class FeatureImportancesSchema(Schema):
114    """Schema for the project feature importances: one `feature` name and its `importance` per row."""
115
116    feature: papd.Series[padt.String] = pa.Field()
117    importance: papd.Series[padt.Float32] = pa.Field()  # no bounds declared: any float32 value is accepted
118
119
120FeatureImportances = papd.DataFrame[FeatureImportancesSchema]  # type alias: dataframe typed by FeatureImportancesSchema

class Schema(typing.Generic[~TDataFrame, ~TSchema], pandera.api.base.model.BaseModel): View Source

21class Schema(pa.DataFrameModel):
22    """Base class for a dataframe schema.
23
24    Use a schema to type your dataframe objects,
25    e.g., to communicate and validate their fields.
26    """
27
28    class Config:
29        """Default configurations for all schemas.
30
31        Parameters:
32            coerce (bool): convert columns to the declared data types when possible.
33            strict (bool): reject dataframes with columns not declared in the schema.
34        """
35
36        coerce: bool = True
37        strict: bool = True
38
39    @classmethod
40    def check(cls: T.Type[TSchema], data: pd.DataFrame) -> papd.DataFrame[TSchema]:
41        """Validate the dataframe against this schema.
42
43        Args:
44            data (pd.DataFrame): dataframe to check.
45
46        Returns:
47            papd.DataFrame[TSchema]: result of `cls.validate(data)`, typed with this schema.
48        """
49        return T.cast(papd.DataFrame[TSchema], cls.validate(data))  # cast only narrows the static type

Base class for a dataframe schema.

Use a schema to type your dataframe objects, e.g., to communicate and validate their fields.

@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)

Schema(*args, **kwargs) View Source

135    @docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
136    def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
137        """%(validate_doc)s"""  # placeholder; presumably filled by the decorator with BaseSchema.validate's docstring
138        return cast(  # calling the class returns the result of cls.validate, not a model instance
139            DataFrameBase[TDataFrameModel], cls.validate(*args, **kwargs)
140        )

Validate a DataFrame based on the schema specification.

Parameters

check_obj (pd.DataFrame): the dataframe to be validated.
head: validate the first n rows. Rows overlapping with tail or sample are de-duplicated.
tail: validate the last n rows. Rows overlapping with head or sample are de-duplicated.
sample: validate a random sample of n rows. Rows overlapping with head or tail are de-duplicated.
random_state: random seed for the sample argument.
lazy: if True, lazily evaluates dataframe against all validation checks and raises a SchemaErrors. Otherwise, raise SchemaError as soon as one occurs.
inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.

Returns

validated DataFrame

Raises

SchemaError: when DataFrame violates built-in or custom checks.

@classmethod

def check( cls: Type[~TSchema], data: pandas.core.frame.DataFrame) -> pandera.typing.pandas.DataFrame[~TSchema]: View Source

39    @classmethod
40    def check(cls: T.Type[TSchema], data: pd.DataFrame) -> papd.DataFrame[TSchema]:
41        """Validate the dataframe against this schema.
42
43        Args:
44            data (pd.DataFrame): dataframe to check.
45
46        Returns:
47            papd.DataFrame[TSchema]: result of `cls.validate(data)`, typed with this schema.
48        """
49        return T.cast(papd.DataFrame[TSchema], cls.validate(data))  # cast only narrows the static type

Check the dataframe with this schema.

Arguments:

data (pd.DataFrame): dataframe to check.

Returns:

papd.DataFrame[TSchema]: validated dataframe.

class InputsSchema(typing.Generic[~TDataFrame, ~TSchema], pandera.api.base.model.BaseModel): View Source

52class InputsSchema(Schema):
53    """Schema for the project inputs: an `instant` index and 15 typed columns."""
54
55    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)  # declared as the dataframe index, not a column
56    dteday: papd.Series[padt.DateTime] = pa.Field()
57    season: papd.Series[padt.UInt8] = pa.Field(isin=[1, 2, 3, 4])
58    yr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=1)  # NOTE(review): only 0 or 1 accepted; presumably a year offset - confirm
59    mnth: papd.Series[padt.UInt8] = pa.Field(ge=1, le=12)
60    hr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=23)
61    holiday: papd.Series[padt.Bool] = pa.Field()
62    weekday: papd.Series[padt.UInt8] = pa.Field(ge=0, le=6)
63    workingday: papd.Series[padt.Bool] = pa.Field()
64    weathersit: papd.Series[padt.UInt8] = pa.Field(ge=1, le=4)
65    temp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)  # NOTE(review): float16 keeps ~3 significant decimal digits - confirm precision is intended
66    atemp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
67    hum: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
68    windspeed: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
69    casual: papd.Series[padt.UInt32] = pa.Field(ge=0)  # NOTE(review): presumably cnt = casual + registered; if so these two columns leak the target - confirm
70    registered: papd.Series[padt.UInt32] = pa.Field(ge=0)

Schema for the project inputs.

@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)

InputsSchema(*args, **kwargs) View Source

135    @docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
136    def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
137        """%(validate_doc)s"""  # placeholder; presumably filled by the decorator with BaseSchema.validate's docstring
138        return cast(  # calling the class returns the result of cls.validate, not a model instance
139            DataFrameBase[TDataFrameModel], cls.validate(*args, **kwargs)
140        )

Validate a DataFrame based on the schema specification.

Parameters

check_obj (pd.DataFrame): the dataframe to be validated.
head: validate the first n rows. Rows overlapping with tail or sample are de-duplicated.
tail: validate the last n rows. Rows overlapping with head or sample are de-duplicated.
sample: validate a random sample of n rows. Rows overlapping with head or tail are de-duplicated.
random_state: random seed for the sample argument.
lazy: if True, lazily evaluates dataframe against all validation checks and raises a SchemaErrors. Otherwise, raise SchemaError as soon as one occurs.
inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.

Returns

validated DataFrame

Raises

SchemaError: when DataFrame violates built-in or custom checks.

instant: pandera.typing.pandas.Index[pandera.dtypes.UInt32]

Captures extra information about a field.

new in 0.5.0

dteday: pandera.typing.pandas.Series[pandera.dtypes.Timestamp]

Captures extra information about a field.

new in 0.5.0

season: pandera.typing.pandas.Series[pandera.dtypes.UInt8]

Captures extra information about a field.

new in 0.5.0

yr: pandera.typing.pandas.Series[pandera.dtypes.UInt8]

Captures extra information about a field.

new in 0.5.0

mnth: pandera.typing.pandas.Series[pandera.dtypes.UInt8]

Captures extra information about a field.

new in 0.5.0

hr: pandera.typing.pandas.Series[pandera.dtypes.UInt8]

Captures extra information about a field.

new in 0.5.0

holiday: pandera.typing.pandas.Series[pandera.dtypes.Bool]

Captures extra information about a field.

new in 0.5.0

weekday: pandera.typing.pandas.Series[pandera.dtypes.UInt8]

Captures extra information about a field.

new in 0.5.0

workingday: pandera.typing.pandas.Series[pandera.dtypes.Bool]

Captures extra information about a field.

new in 0.5.0

weathersit: pandera.typing.pandas.Series[pandera.dtypes.UInt8]

Captures extra information about a field.

new in 0.5.0

temp: pandera.typing.pandas.Series[pandera.dtypes.Float16]

Captures extra information about a field.

new in 0.5.0

atemp: pandera.typing.pandas.Series[pandera.dtypes.Float16]

Captures extra information about a field.

new in 0.5.0

hum: pandera.typing.pandas.Series[pandera.dtypes.Float16]

Captures extra information about a field.

new in 0.5.0

windspeed: pandera.typing.pandas.Series[pandera.dtypes.Float16]

Captures extra information about a field.

new in 0.5.0

casual: pandera.typing.pandas.Series[pandera.dtypes.UInt32]

Captures extra information about a field.

new in 0.5.0

registered: pandera.typing.pandas.Series[pandera.dtypes.UInt32]

Captures extra information about a field.

new in 0.5.0

Inherited Members

Schema: check

Inputs = pandera.typing.pandas.DataFrame[InputsSchema]

class TargetsSchema(typing.Generic[~TDataFrame, ~TSchema], pandera.api.base.model.BaseModel): View Source

76class TargetsSchema(Schema):
77    """Schema for the project targets: an `instant` index and a `cnt` column."""
78
79    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)  # same index name and dtype as InputsSchema
80    cnt: papd.Series[padt.UInt32] = pa.Field(ge=0)

Schema for the project target.

@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)

TargetsSchema(*args, **kwargs) View Source

135    @docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
136    def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
137        """%(validate_doc)s"""  # placeholder; presumably filled by the decorator with BaseSchema.validate's docstring
138        return cast(  # calling the class returns the result of cls.validate, not a model instance
139            DataFrameBase[TDataFrameModel], cls.validate(*args, **kwargs)
140        )

Validate a DataFrame based on the schema specification.

Parameters

check_obj (pd.DataFrame): the dataframe to be validated.
head: validate the first n rows. Rows overlapping with tail or sample are de-duplicated.
tail: validate the last n rows. Rows overlapping with head or sample are de-duplicated.
sample: validate a random sample of n rows. Rows overlapping with head or tail are de-duplicated.
random_state: random seed for the sample argument.
lazy: if True, lazily evaluates dataframe against all validation checks and raises a SchemaErrors. Otherwise, raise SchemaError as soon as one occurs.
inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.

Returns

validated DataFrame

Raises

SchemaError: when DataFrame violates built-in or custom checks.

instant: pandera.typing.pandas.Index[pandera.dtypes.UInt32]

Captures extra information about a field.

new in 0.5.0

cnt: pandera.typing.pandas.Series[pandera.dtypes.UInt32]

Captures extra information about a field.

new in 0.5.0

Inherited Members

Schema: check

Targets = pandera.typing.pandas.DataFrame[TargetsSchema]

class OutputsSchema(typing.Generic[~TDataFrame, ~TSchema], pandera.api.base.model.BaseModel): View Source

86class OutputsSchema(Schema):
87    """Schema for the project outputs: an `instant` index and a `prediction` column."""
88
89    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)  # same index name and dtype as InputsSchema
90    prediction: papd.Series[padt.UInt32] = pa.Field(ge=0)  # same dtype and bound as TargetsSchema.cnt

Schema for the project output.

@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)

OutputsSchema(*args, **kwargs) View Source

135    @docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
136    def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
137        """%(validate_doc)s"""  # placeholder; presumably filled by the decorator with BaseSchema.validate's docstring
138        return cast(  # calling the class returns the result of cls.validate, not a model instance
139            DataFrameBase[TDataFrameModel], cls.validate(*args, **kwargs)
140        )

Validate a DataFrame based on the schema specification.

Parameters

check_obj (pd.DataFrame): the dataframe to be validated.
head: validate the first n rows. Rows overlapping with tail or sample are de-duplicated.
tail: validate the last n rows. Rows overlapping with head or sample are de-duplicated.
sample: validate a random sample of n rows. Rows overlapping with head or tail are de-duplicated.
random_state: random seed for the sample argument.
lazy: if True, lazily evaluates dataframe against all validation checks and raises a SchemaErrors. Otherwise, raise SchemaError as soon as one occurs.
inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.

Returns

validated DataFrame

Raises

SchemaError: when DataFrame violates built-in or custom checks.

instant: pandera.typing.pandas.Index[pandera.dtypes.UInt32]

Captures extra information about a field.

new in 0.5.0

prediction: pandera.typing.pandas.Series[pandera.dtypes.UInt32]

Captures extra information about a field.

new in 0.5.0

Inherited Members

Schema: check

Outputs = pandera.typing.pandas.DataFrame[OutputsSchema]

class SHAPValuesSchema(typing.Generic[~TDataFrame, ~TSchema], pandera.api.base.model.BaseModel): View Source

 96class SHAPValuesSchema(Schema):
 97    """Schema for the project SHAP values."""
 98
 99    class Config:
100        """Default configurations for this schema.
101
102        Parameters:
103            dtype (str): data type applied to the whole dataframe.
104            strict (bool): reject undeclared columns; disabled as no column is declared here.
105        """
106
107        dtype: str = "float32"
108        strict: bool = False  # overrides the Schema.Config default (strict = True)

Schema for the project shap values.

@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)

SHAPValuesSchema(*args, **kwargs) View Source

135    @docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
136    def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
137        """%(validate_doc)s"""  # placeholder; presumably filled by the decorator with BaseSchema.validate's docstring
138        return cast(  # calling the class returns the result of cls.validate, not a model instance
139            DataFrameBase[TDataFrameModel], cls.validate(*args, **kwargs)
140        )

Validate a DataFrame based on the schema specification.

Parameters

check_obj (pd.DataFrame): the dataframe to be validated.
head: validate the first n rows. Rows overlapping with tail or sample are de-duplicated.
tail: validate the last n rows. Rows overlapping with head or sample are de-duplicated.
sample: validate a random sample of n rows. Rows overlapping with head or tail are de-duplicated.
random_state: random seed for the sample argument.
lazy: if True, lazily evaluates dataframe against all validation checks and raises a SchemaErrors. Otherwise, raise SchemaError as soon as one occurs.
inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.

Returns

validated DataFrame

Raises

SchemaError: when DataFrame violates built-in or custom checks.

Inherited Members

Schema: check

SHAPValues = pandera.typing.pandas.DataFrame[SHAPValuesSchema]

class FeatureImportancesSchema(typing.Generic[~TDataFrame, ~TSchema], pandera.api.base.model.BaseModel): View Source

114class FeatureImportancesSchema(Schema):
115    """Schema for the project feature importances: one `feature` name and its `importance` per row."""
116
117    feature: papd.Series[padt.String] = pa.Field()
118    importance: papd.Series[padt.Float32] = pa.Field()  # no bounds declared: any float32 value is accepted

Schema for the project feature importances.

@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)

FeatureImportancesSchema(*args, **kwargs) View Source

135    @docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
136    def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
137        """%(validate_doc)s"""  # placeholder; presumably filled by the decorator with BaseSchema.validate's docstring
138        return cast(  # calling the class returns the result of cls.validate, not a model instance
139            DataFrameBase[TDataFrameModel], cls.validate(*args, **kwargs)
140        )

Validate a DataFrame based on the schema specification.

Parameters

check_obj (pd.DataFrame): the dataframe to be validated.
head: validate the first n rows. Rows overlapping with tail or sample are de-duplicated.
tail: validate the last n rows. Rows overlapping with head or sample are de-duplicated.
sample: validate a random sample of n rows. Rows overlapping with head or tail are de-duplicated.
random_state: random seed for the sample argument.
lazy: if True, lazily evaluates dataframe against all validation checks and raises a SchemaErrors. Otherwise, raise SchemaError as soon as one occurs.
inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.

Returns

validated DataFrame

Raises

SchemaError: when DataFrame violates built-in or custom checks.

feature: pandera.typing.pandas.Series[pandera.dtypes.String]

Captures extra information about a field.

new in 0.5.0

importance: pandera.typing.pandas.Series[pandera.dtypes.Float32]

Captures extra information about a field.

new in 0.5.0

Inherited Members

Schema: check

FeatureImportances = pandera.typing.pandas.DataFrame[FeatureImportancesSchema]