bikes.core.models

Define trainable machine learning models.

  1"""Define trainable machine learning models."""
  2
  3# %% IMPORTS
  4
  5import abc
  6import typing as T
  7
  8import pydantic as pdt
  9import shap
 10from sklearn import compose, ensemble, pipeline, preprocessing
 11
 12from bikes.core import schemas
 13
 14# %% TYPES
 15
 16# Model params
 17ParamKey = str
 18ParamValue = T.Any
 19Params = dict[ParamKey, ParamValue]
 20
 21# %% MODELS
 22
 23
class Model(abc.ABC, pdt.BaseModel, strict=True, frozen=False, extra="forbid"):
    """Base class for a project model.

    Use a model to adapt AI/ML frameworks,
    e.g., to easily swap one model for another.
    """

    KIND: str

    def get_params(self, deep: bool = True) -> Params:
        """Get the model params.

        Args:
            deep (bool, optional): unused; kept for compatibility
                with the scikit-learn estimator API.

        Returns:
            Params: internal model parameters.
        """
        params: Params = {}
        for key, value in self.model_dump().items():
            # skip private attributes and class-level constants like KIND
            if not key.startswith("_") and not key.isupper():
                params[key] = value
        return params

    def set_params(self, **params: ParamValue) -> T.Self:
        """Set the model params in place.

        Args:
            **params (ParamValue): new values for the model params.

        Returns:
            T.Self: instance of the model.
        """
        for key, value in params.items():
            setattr(self, key, value)
        return self

    @abc.abstractmethod
    def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> T.Self:
        """Fit the model on the given inputs and targets.

        Args:
            inputs (schemas.Inputs): model training inputs.
            targets (schemas.Targets): model training targets.

        Returns:
            T.Self: instance of the model.
        """

    @abc.abstractmethod
    def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
        """Generate outputs with the model for the given inputs.

        Args:
            inputs (schemas.Inputs): model prediction inputs.

        Returns:
            schemas.Outputs: model prediction outputs.
        """

    def explain_model(self) -> schemas.FeatureImportances:
        """Explain the internal model structure.

        Raises:
            NotImplementedError: if the method is not implemented by the subclass.

        Returns:
            schemas.FeatureImportances: feature importances.
        """
        raise NotImplementedError()

    def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues:
        """Explain model outputs on input samples.

        Args:
            inputs (schemas.Inputs): model prediction inputs.

        Raises:
            NotImplementedError: if the method is not implemented by the subclass.

        Returns:
            schemas.SHAPValues: SHAP values.
        """
        raise NotImplementedError()

    def get_internal_model(self) -> T.Any:
        """Return the internal model in the object.

        Raises:
            NotImplementedError: if the method is not implemented by the subclass.

        Returns:
            T.Any: any internal model (either empty or fitted).
        """
        raise NotImplementedError()
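
The get_params/set_params pair mirrors the scikit-learn estimator interface, so a Model can be driven by sklearn-style tooling such as hyperparameter sweeps. A minimal sketch of the round trip, using the BaselineSklearnModel defined below (the values shown are its defaults):

    >>> model = BaselineSklearnModel()
    >>> model.get_params()  # KIND is dropped: it is uppercase, not a param
    {'max_depth': 20, 'n_estimators': 200, 'random_state': 42}
    >>> model.set_params(max_depth=5).get_params()["max_depth"]  # returns self
    5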

class BaselineSklearnModel(Model):
    """Simple baseline model based on scikit-learn.

    Parameters:
        max_depth (int): maximum depth of the random forest.
        n_estimators (int): number of estimators in the random forest.
        random_state (int, optional): random state of the machine learning pipeline.
    """

    KIND: T.Literal["BaselineSklearnModel"] = "BaselineSklearnModel"

    # params
    max_depth: int = 20
    n_estimators: int = 200
    random_state: int | None = 42
    # private
    _pipeline: pipeline.Pipeline | None = None
    _numericals: list[str] = [
        "yr",
        "mnth",
        "hr",
        "holiday",
        "weekday",
        "workingday",
        "temp",
        "atemp",
        "hum",
        "windspeed",
        "casual",
        # "registered", # too correlated with target
    ]
    _categoricals: list[str] = [
        "season",
        "weathersit",
    ]

    @T.override
    def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> "BaselineSklearnModel":
        # subcomponents
        categoricals_transformer = preprocessing.OneHotEncoder(
            sparse_output=False, handle_unknown="ignore"
        )
        # components
        transformer = compose.ColumnTransformer(
            [
                ("categoricals", categoricals_transformer, self._categoricals),
                ("numericals", "passthrough", self._numericals),
            ],
            remainder="drop",
        )
        regressor = ensemble.RandomForestRegressor(
            max_depth=self.max_depth, n_estimators=self.n_estimators, random_state=self.random_state
        )
        # pipeline
        self._pipeline = pipeline.Pipeline(
            steps=[
                ("transformer", transformer),
                ("regressor", regressor),
            ]
        )
        self._pipeline.fit(X=inputs, y=targets[schemas.TargetsSchema.cnt])
        return self

    @T.override
    def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
        model = self.get_internal_model()
        prediction = model.predict(inputs)
        outputs = schemas.Outputs(
            {schemas.OutputsSchema.prediction: prediction}, index=inputs.index
        )
        return outputs

    @T.override
    def explain_model(self) -> schemas.FeatureImportances:
        model = self.get_internal_model()
        regressor = model.named_steps["regressor"]
        transformer = model.named_steps["transformer"]
        column_names = transformer.get_feature_names_out()
        feature_importances = schemas.FeatureImportances(
            data={
                "feature": column_names,
                "importance": regressor.feature_importances_,
            }
        )
        return feature_importances

    @T.override
    def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues:
        model = self.get_internal_model()
        regressor = model.named_steps["regressor"]
        transformer = model.named_steps["transformer"]
        # apply the same preprocessing as training before computing SHAP values
        transformed = transformer.transform(X=inputs)
        explainer = shap.TreeExplainer(model=regressor)
        shap_values = schemas.SHAPValues(
            data=explainer.shap_values(X=transformed),
            columns=transformer.get_feature_names_out(),
        )
        return shap_values

    @T.override
    def get_internal_model(self) -> pipeline.Pipeline:
        model = self._pipeline
        if model is None:
            raise ValueError("Model is not fitted yet!")
        return model


ModelKind = BaselineSklearnModel
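
End to end, the intended flow is fit, then predict, then optionally explain. A minimal usage sketch: the CSV path is hypothetical, pandera validation against schemas.Inputs/Targets is skipped for brevity, and the "cnt" and "prediction" column names are assumed to match TargetsSchema.cnt and OutputsSchema.prediction.

import pandas as pd

from bikes.core.models import BaselineSklearnModel

data = pd.read_csv("data/hour.csv")  # hypothetical path with the columns listed above

model = BaselineSklearnModel(n_estimators=50)
model.fit(inputs=data, targets=data[["cnt"]])  # extra columns are dropped by the transformer

outputs = model.predict(inputs=data)  # DataFrame with a "prediction" column
importances = model.explain_model()  # global feature importances
shap_values = model.explain_samples(inputs=data.head(100))  # local SHAP values

Since ModelKind currently aliases the only concrete model, the KIND literal looks unused; it is the tag that would let ModelKind grow into a pydantic discriminated union (discriminator="KIND") once more model classes are added, so configs can select a model by name.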