bikes.core.models

Define trainable machine learning models.

  1"""Define trainable machine learning models."""
  2
  3# %% IMPORTS
  4
  5import abc
  6import typing as T
  7
  8import pydantic as pdt
  9import shap
 10from sklearn import compose, ensemble, pipeline, preprocessing
 11
 12from bikes.core import schemas
 13
 14# %% TYPES
 15
 16# Model params
 17ParamKey = str
 18ParamValue = T.Any
 19Params = dict[ParamKey, ParamValue]
 20
 21# %% MODELS
 22
 23
 24class Model(abc.ABC, pdt.BaseModel, strict=True, frozen=False, extra="forbid"):
 25    """Base class for a project model.
 26
 27    Use a model to adapt AI/ML frameworks.
 28    e.g., to swap easily one model with another.
 29    """
 30
 31    KIND: str
 32
 33    def get_params(self, deep: bool = True) -> Params:
 34        """Get the model params.
 35
 36        Args:
 37            deep (bool, optional): ignored.
 38
 39        Returns:
 40            Params: internal model parameters.
 41        """
 42        params: Params = {}
 43        for key, value in self.model_dump().items():
 44            if not key.startswith("_") and not key.isupper():
 45                params[key] = value
 46        return params
 47
 48    def set_params(self, **params: ParamValue) -> T.Self:
 49        """Set the model params in place.
 50
 51        Returns:
 52            T.Self: instance of the model.
 53        """
 54        for key, value in params.items():
 55            setattr(self, key, value)
 56        return self
 57
 58    @abc.abstractmethod
 59    def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> T.Self:
 60        """Fit the model on the given inputs and targets.
 61
 62        Args:
 63            inputs (schemas.Inputs): model training inputs.
 64            targets (schemas.Targets): model training targets.
 65
 66        Returns:
 67            T.Self: instance of the model.
 68        """
 69
 70    @abc.abstractmethod
 71    def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
 72        """Generate outputs with the model for the given inputs.
 73
 74        Args:
 75            inputs (schemas.Inputs): model prediction inputs.
 76
 77        Returns:
 78            schemas.Outputs: model prediction outputs.
 79        """
 80
 81    def explain_model(self) -> schemas.FeatureImportances:
 82        """Explain the internal model structure.
 83
 84        Returns:
 85            schemas.FeatureImportances: feature importances.
 86        """
 87        raise NotImplementedError()
 88
 89    def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues:
 90        """Explain model outputs on input samples.
 91
 92        Returns:
 93            schemas.SHAPValues: SHAP values.
 94        """
 95        raise NotImplementedError()
 96
 97    def get_internal_model(self) -> T.Any:
 98        """Return the internal model in the object.
 99
100        Raises:
101            NotImplementedError: method not implemented.
102
103        Returns:
104            T.Any: any internal model (either empty or fitted).
105        """
106        raise NotImplementedError()
107
108
109class BaselineSklearnModel(Model):
110    """Simple baseline model based on scikit-learn.
111
112    Parameters:
113        max_depth (int): maximum depth of the random forest.
114        n_estimators (int): number of estimators in the random forest.
115        random_state (int, optional): random state of the machine learning pipeline.
116    """
117
118    KIND: T.Literal["BaselineSklearnModel"] = "BaselineSklearnModel"
119
120    # params
121    max_depth: int = 20
122    n_estimators: int = 200
123    random_state: int | None = 42
124    # private
125    _pipeline: pipeline.Pipeline | None = None
126    _numericals: list[str] = [
127        "yr",
128        "mnth",
129        "hr",
130        "holiday",
131        "weekday",
132        "workingday",
133        "temp",
134        "atemp",
135        "hum",
136        "windspeed",
137        "casual",
138        "registered",  # too correlated with target
139    ]
140    _categoricals: list[str] = [
141        "season",
142        "weathersit",
143    ]
144
145    @T.override
146    def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> "BaselineSklearnModel":
147        # subcomponents
148        categoricals_transformer = preprocessing.OneHotEncoder(
149            sparse_output=False, handle_unknown="ignore"
150        )
151        # components
152        transformer = compose.ColumnTransformer(
153            [
154                ("categoricals", categoricals_transformer, self._categoricals),
155                ("numericals", "passthrough", self._numericals),
156            ],
157            remainder="drop",
158        )
159        regressor = ensemble.RandomForestRegressor(
160            max_depth=self.max_depth,
161            n_estimators=self.n_estimators,
162            random_state=self.random_state,
163        )
164        # pipeline
165        self._pipeline = pipeline.Pipeline(
166            steps=[
167                ("transformer", transformer),
168                ("regressor", regressor),
169            ]
170        )
171        self._pipeline.fit(X=inputs, y=targets[schemas.TargetsSchema.cnt])
172        return self
173
174    @T.override
175    def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
176        model = self.get_internal_model()
177        prediction = model.predict(inputs)
178        outputs = schemas.Outputs(
179            {schemas.OutputsSchema.prediction: prediction}, index=inputs.index
180        )
181        return outputs
182
183    @T.override
184    def explain_model(self) -> schemas.FeatureImportances:
185        model = self.get_internal_model()
186        regressor = model.named_steps["regressor"]
187        transformer = model.named_steps["transformer"]
188        feature = transformer.get_feature_names_out()
189        feature_importances = schemas.FeatureImportances(
190            data={
191                "feature": feature,
192                "importance": regressor.feature_importances_,
193            }
194        )
195        return feature_importances
196
197    @T.override
198    def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues:
199        model = self.get_internal_model()
200        regressor = model.named_steps["regressor"]
201        transformer = model.named_steps["transformer"]
202        transformed = transformer.transform(X=inputs)
203        explainer = shap.TreeExplainer(model=regressor)
204        shap_values = schemas.SHAPValues(
205            data=explainer.shap_values(X=transformed),
206            columns=transformer.get_feature_names_out(),
207        )
208        return shap_values
209
210    @T.override
211    def get_internal_model(self) -> pipeline.Pipeline:
212        model = self._pipeline
213        if model is None:
214            raise ValueError("Model is not fitted yet!")
215        return model
216
217
218ModelKind = BaselineSklearnModel
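
Usage sketches. The Model base class mirrors scikit-learn's get_params/set_params convention: get_params dumps every public, lowercase pydantic field, and set_params assigns values back in place, so instances plug into tooling that expects that interface. A minimal sketch of the round trip; the printed values follow directly from the field defaults above:

from bikes.core.models import BaselineSklearnModel

model = BaselineSklearnModel()
print(model.get_params())
# {'max_depth': 20, 'n_estimators': 200, 'random_state': 42}
model.set_params(max_depth=10, n_estimators=50)
assert model.get_params()["max_depth"] == 10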
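BaselineSklearnModel.fit assembles the one-hot/passthrough ColumnTransformer and the RandomForestRegressor into a single scikit-learn pipeline, then trains it on the cnt target column (via TargetsSchema.cnt). A minimal end-to-end sketch on synthetic data: the column names come from _numericals, _categoricals, and TargetsSchema.cnt above, and plain pandas frames stand in for the validated pandera types (the real schemas may enforce dtypes and index names that this toy frame skips):

import numpy as np
import pandas as pd

from bikes.core.models import BaselineSklearnModel

rng = np.random.default_rng(42)
n = 64
inputs = pd.DataFrame(
    {
        # numerical features: passed through unchanged
        "yr": rng.integers(0, 2, n),
        "mnth": rng.integers(1, 13, n),
        "hr": rng.integers(0, 24, n),
        "holiday": rng.integers(0, 2, n),
        "weekday": rng.integers(0, 7, n),
        "workingday": rng.integers(0, 2, n),
        "temp": rng.random(n),
        "atemp": rng.random(n),
        "hum": rng.random(n),
        "windspeed": rng.random(n),
        "casual": rng.integers(0, 100, n),
        "registered": rng.integers(0, 500, n),
        # categorical features: one-hot encoded
        "season": rng.integers(1, 5, n),
        "weathersit": rng.integers(1, 4, n),
    }
)
targets = pd.DataFrame({"cnt": inputs["casual"] + inputs["registered"]})

model = BaselineSklearnModel(n_estimators=10, max_depth=5)
model.fit(inputs=inputs, targets=targets)
outputs = model.predict(inputs=inputs)  # one "prediction" column, same index as inputs
print(outputs.head())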
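Once fitted, the explainability helpers unwrap the pipeline: explain_model reads the random forest's impurity-based feature importances, and explain_samples runs shap.TreeExplainer over the transformed inputs. Continuing the sketch above (same caveat about the pandera schemas):

importances = model.explain_model()
print(importances.sort_values("importance", ascending=False).head())

shap_values = model.explain_samples(inputs=inputs)
print(shap_values.shape)  # one row per sample, one column per transformed feature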