bikes.core.schemas
Define and validate dataframe schemas.
"""Define and validate dataframe schemas."""

# %% IMPORTS

import typing as T

import pandas as pd
import pandera as pa
import pandera.typing as papd
import pandera.typing.common as padt

# %% TYPES

# Generic type for a dataframe container
TSchema = T.TypeVar("TSchema", bound="pa.DataFrameModel")

# %% SCHEMAS


class Schema(pa.DataFrameModel):
    """Base class for a dataframe schema.

    Use a schema to type your dataframe object,
    e.g., to communicate and validate its fields.
    """

    class Config:
        """Default configurations for all schemas.

        Parameters:
            coerce (bool): convert data type if possible.
            strict (bool): only accept the columns declared in the schema.
        """

        coerce: bool = True
        strict: bool = True

    @classmethod
    def check(cls: T.Type[TSchema], data: pd.DataFrame) -> papd.DataFrame[TSchema]:
        """Check the dataframe with this schema.

        Args:
            data (pd.DataFrame): dataframe to check.

        Returns:
            papd.DataFrame[TSchema]: validated dataframe.
        """
        validated = cls.validate(data)
        # T.cast has no runtime effect; it only narrows the static type.
        return T.cast(papd.DataFrame[TSchema], validated)


class InputsSchema(Schema):
    """Schema for the project inputs.

    The index is ``instant``; every other field is a column.
    Numeric fields carry explicit value bounds (see each ``pa.Field``).
    """

    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)
    dteday: papd.Series[padt.DateTime] = pa.Field()
    season: papd.Series[padt.UInt8] = pa.Field(isin=[1, 2, 3, 4])
    yr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=1)
    mnth: papd.Series[padt.UInt8] = pa.Field(ge=1, le=12)
    hr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=23)
    holiday: papd.Series[padt.Bool] = pa.Field()
    weekday: papd.Series[padt.UInt8] = pa.Field(ge=0, le=6)
    workingday: papd.Series[padt.Bool] = pa.Field()
    weathersit: papd.Series[padt.UInt8] = pa.Field(ge=1, le=4)
    temp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    atemp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    hum: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    windspeed: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    casual: papd.Series[padt.UInt32] = pa.Field(ge=0)
    registered: papd.Series[padt.UInt32] = pa.Field(ge=0)


Inputs = papd.DataFrame[InputsSchema]


class TargetsSchema(Schema):
    """Schema for the project target.

    The index is ``instant``; ``cnt`` is the single target column.
    """

    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)
    cnt: papd.Series[padt.UInt32] = pa.Field(ge=0)


Targets = papd.DataFrame[TargetsSchema]


class OutputsSchema(Schema):
    """Schema for the project output.

    The index is ``instant``; ``prediction`` is the single output column.
    """

    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)
    prediction: papd.Series[padt.UInt32] = pa.Field(ge=0)


Outputs = papd.DataFrame[OutputsSchema]


class SHAPValuesSchema(Schema):
    """Schema for the project shap values.

    No column is declared here: the schema only sets a dataframe-wide dtype.
    """

    class Config:
        """Default configurations for this schema.

        Parameters:
            dtype (str): dataframe default data type.
            strict (bool): disabled, so columns not declared here are accepted.
        """

        dtype: str = "float32"
        strict: bool = False


SHAPValues = papd.DataFrame[SHAPValuesSchema]


class FeatureImportancesSchema(Schema):
    """Schema for the project feature importances.

    Two columns: ``feature`` (string) and ``importance`` (float32).
    """

    feature: papd.Series[padt.String] = pa.Field()
    importance: papd.Series[padt.Float32] = pa.Field()


FeatureImportances = papd.DataFrame[FeatureImportancesSchema]
class Schema(pa.DataFrameModel):
    """Base class for a dataframe schema.

    Use a schema to type your dataframe object,
    e.g., to communicate and validate its fields.
    """

    class Config:
        """Default configurations for all schemas.

        Parameters:
            coerce (bool): convert data type if possible.
            strict (bool): only accept the columns declared in the schema.
        """

        coerce: bool = True
        strict: bool = True

    @classmethod
    def check(cls: T.Type[TSchema], data: pd.DataFrame) -> papd.DataFrame[TSchema]:
        """Check the dataframe with this schema.

        Args:
            data (pd.DataFrame): dataframe to check.

        Returns:
            papd.DataFrame[TSchema]: validated dataframe.
        """
        validated = cls.validate(data)
        # T.cast has no runtime effect; it only narrows the static type.
        return T.cast(papd.DataFrame[TSchema], validated)
Base class for a dataframe schema.
Use a schema to type your dataframe object, e.g., to communicate and validate its fields.
@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
    """%(validate_doc)s"""
    # The docstring is a template: the decorator receives the docstring of
    # BaseSchema.validate under the ``validate_doc`` key.
    # All positional and keyword arguments are forwarded to ``cls.validate``.
    validated = cls.validate(*args, **kwargs)
    return cast(DataFrameBase[TDataFrameModel], validated)
Validate a DataFrame based on the schema specification.
Parameters
- pd.DataFrame check_obj: the dataframe to be validated.
- head: validate the first n rows. Rows overlapping with `tail` or `sample` are de-duplicated.
- tail: validate the last n rows. Rows overlapping with `head` or `sample` are de-duplicated.
- sample: validate a random sample of n rows. Rows overlapping with `head` or `tail` are de-duplicated.
- random_state: random seed for the `sample` argument.
- lazy: if True, lazily evaluates dataframe against all validation checks and raises a `SchemaErrors`. Otherwise, raise `SchemaError` as soon as one occurs.
- inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.
Returns
- validated `DataFrame`
Raises
- SchemaError: when `DataFrame` violates built-in or custom checks.
@classmethod
def check(cls: T.Type[TSchema], data: pd.DataFrame) -> papd.DataFrame[TSchema]:
    """Validate a dataframe against this schema.

    Args:
        data (pd.DataFrame): dataframe to validate.

    Returns:
        papd.DataFrame[TSchema]: the value returned by ``cls.validate(data)``.
    """
    validated = cls.validate(data)
    # T.cast has no runtime effect; it only narrows the static type.
    return T.cast(papd.DataFrame[TSchema], validated)
Check the dataframe with this schema.
Arguments:
- data (pd.DataFrame): dataframe to check.
Returns:
papd.DataFrame[TSchema]: validated dataframe.
class InputsSchema(Schema):
    """Schema for the project inputs.

    The index is ``instant``; every other field is a column.
    Numeric fields carry explicit value bounds (see each ``pa.Field``).
    """

    # index: unsigned 32-bit integer
    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)
    # columns, in declaration order
    dteday: papd.Series[padt.DateTime] = pa.Field()
    season: papd.Series[padt.UInt8] = pa.Field(isin=[1, 2, 3, 4])
    yr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=1)
    mnth: papd.Series[padt.UInt8] = pa.Field(ge=1, le=12)
    hr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=23)
    holiday: papd.Series[padt.Bool] = pa.Field()
    weekday: papd.Series[padt.UInt8] = pa.Field(ge=0, le=6)
    workingday: papd.Series[padt.Bool] = pa.Field()
    weathersit: papd.Series[padt.UInt8] = pa.Field(ge=1, le=4)
    # half-precision floats bounded to the closed interval [0, 1]
    temp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    atemp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    hum: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    windspeed: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    # unsigned 32-bit integers
    casual: papd.Series[padt.UInt32] = pa.Field(ge=0)
    registered: papd.Series[padt.UInt32] = pa.Field(ge=0)
Schema for the project inputs.
@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
    """%(validate_doc)s"""
    # The docstring is a template: the decorator receives the docstring of
    # BaseSchema.validate under the ``validate_doc`` key.
    # All positional and keyword arguments are forwarded to ``cls.validate``.
    validated = cls.validate(*args, **kwargs)
    return cast(DataFrameBase[TDataFrameModel], validated)
Validate a DataFrame based on the schema specification.
Parameters
- pd.DataFrame check_obj: the dataframe to be validated.
- head: validate the first n rows. Rows overlapping with `tail` or `sample` are de-duplicated.
- tail: validate the last n rows. Rows overlapping with `head` or `sample` are de-duplicated.
- sample: validate a random sample of n rows. Rows overlapping with `head` or `tail` are de-duplicated.
- random_state: random seed for the `sample` argument.
- lazy: if True, lazily evaluates dataframe against all validation checks and raises a `SchemaErrors`. Otherwise, raise `SchemaError` as soon as one occurs.
- inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.
Returns
- validated `DataFrame`
Raises
- SchemaError: when `DataFrame` violates built-in or custom checks.
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
Captures extra information about a field.
new in 0.5.0
class TargetsSchema(Schema):
    """Schema for the project target.

    The index is ``instant``; ``cnt`` is the single target column.
    """

    # index: unsigned 32-bit integer
    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)
    # column: unsigned 32-bit integer
    cnt: papd.Series[padt.UInt32] = pa.Field(ge=0)
Schema for the project target.
@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
    """%(validate_doc)s"""
    # The docstring is a template: the decorator receives the docstring of
    # BaseSchema.validate under the ``validate_doc`` key.
    # All positional and keyword arguments are forwarded to ``cls.validate``.
    validated = cls.validate(*args, **kwargs)
    return cast(DataFrameBase[TDataFrameModel], validated)
Validate a DataFrame based on the schema specification.
Parameters
- pd.DataFrame check_obj: the dataframe to be validated.
- head: validate the first n rows. Rows overlapping with `tail` or `sample` are de-duplicated.
- tail: validate the last n rows. Rows overlapping with `head` or `sample` are de-duplicated.
- sample: validate a random sample of n rows. Rows overlapping with `head` or `tail` are de-duplicated.
- random_state: random seed for the `sample` argument.
- lazy: if True, lazily evaluates dataframe against all validation checks and raises a `SchemaErrors`. Otherwise, raise `SchemaError` as soon as one occurs.
- inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.
Returns
- validated `DataFrame`
Raises
- SchemaError: when `DataFrame` violates built-in or custom checks.
Captures extra information about a field.
new in 0.5.0
class OutputsSchema(Schema):
    """Schema for the project output.

    The index is ``instant``; ``prediction`` is the single output column.
    """

    # index: unsigned 32-bit integer
    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)
    # column: unsigned 32-bit integer
    prediction: papd.Series[padt.UInt32] = pa.Field(ge=0)
Schema for the project output.
@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
    """%(validate_doc)s"""
    # The docstring is a template: the decorator receives the docstring of
    # BaseSchema.validate under the ``validate_doc`` key.
    # All positional and keyword arguments are forwarded to ``cls.validate``.
    validated = cls.validate(*args, **kwargs)
    return cast(DataFrameBase[TDataFrameModel], validated)
Validate a DataFrame based on the schema specification.
Parameters
- pd.DataFrame check_obj: the dataframe to be validated.
- head: validate the first n rows. Rows overlapping with `tail` or `sample` are de-duplicated.
- tail: validate the last n rows. Rows overlapping with `head` or `sample` are de-duplicated.
- sample: validate a random sample of n rows. Rows overlapping with `head` or `tail` are de-duplicated.
- random_state: random seed for the `sample` argument.
- lazy: if True, lazily evaluates dataframe against all validation checks and raises a `SchemaErrors`. Otherwise, raise `SchemaError` as soon as one occurs.
- inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.
Returns
- validated `DataFrame`
Raises
- SchemaError: when `DataFrame` violates built-in or custom checks.
Captures extra information about a field.
new in 0.5.0
class SHAPValuesSchema(Schema):
    """Schema for the project shap values.

    No column is declared here: the schema only sets a dataframe-wide dtype.
    """

    class Config:
        """Default configurations for this schema.

        Parameters:
            dtype (str): dataframe default data type.
            strict (bool): disabled, so columns not declared here are accepted.
        """

        dtype: str = "float32"
        strict: bool = False
Schema for the project shap values.
@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
    """%(validate_doc)s"""
    # The docstring is a template: the decorator receives the docstring of
    # BaseSchema.validate under the ``validate_doc`` key.
    # All positional and keyword arguments are forwarded to ``cls.validate``.
    validated = cls.validate(*args, **kwargs)
    return cast(DataFrameBase[TDataFrameModel], validated)
Validate a DataFrame based on the schema specification.
Parameters
- pd.DataFrame check_obj: the dataframe to be validated.
- head: validate the first n rows. Rows overlapping with `tail` or `sample` are de-duplicated.
- tail: validate the last n rows. Rows overlapping with `head` or `sample` are de-duplicated.
- sample: validate a random sample of n rows. Rows overlapping with `head` or `tail` are de-duplicated.
- random_state: random seed for the `sample` argument.
- lazy: if True, lazily evaluates dataframe against all validation checks and raises a `SchemaErrors`. Otherwise, raise `SchemaError` as soon as one occurs.
- inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.
Returns
- validated `DataFrame`
Raises
- SchemaError: when `DataFrame` violates built-in or custom checks.
class FeatureImportancesSchema(Schema):
    """Schema for the project feature importances.

    Two columns: ``feature`` (string) and ``importance`` (float32).
    """

    # column: string
    feature: papd.Series[padt.String] = pa.Field()
    # column: 32-bit float
    importance: papd.Series[padt.Float32] = pa.Field()
Schema for the project feature importances.
@docstring_substitution(validate_doc=BaseSchema.validate.__doc__)
def __new__(cls, *args, **kwargs) -> DataFrameBase[TDataFrameModel]:  # type: ignore [misc]
    """%(validate_doc)s"""
    # The docstring is a template: the decorator receives the docstring of
    # BaseSchema.validate under the ``validate_doc`` key.
    # All positional and keyword arguments are forwarded to ``cls.validate``.
    validated = cls.validate(*args, **kwargs)
    return cast(DataFrameBase[TDataFrameModel], validated)
Validate a DataFrame based on the schema specification.
Parameters
- pd.DataFrame check_obj: the dataframe to be validated.
- head: validate the first n rows. Rows overlapping with `tail` or `sample` are de-duplicated.
- tail: validate the last n rows. Rows overlapping with `head` or `sample` are de-duplicated.
- sample: validate a random sample of n rows. Rows overlapping with `head` or `tail` are de-duplicated.
- random_state: random seed for the `sample` argument.
- lazy: if True, lazily evaluates dataframe against all validation checks and raises a `SchemaErrors`. Otherwise, raise `SchemaError` as soon as one occurs.
- inplace: if True, applies coercion to the object of validation, otherwise creates a copy of the data.
Returns
- validated `DataFrame`
Raises
- SchemaError: when `DataFrame` violates built-in or custom checks.
Captures extra information about a field.
new in 0.5.0