bikes.core.models

Define trainable machine learning models.

  1"""Define trainable machine learning models."""
  2
  3# %% IMPORTS
  4
  5import abc
  6import typing as T
  7
  8import pydantic as pdt
  9import shap
 10from sklearn import compose, ensemble, pipeline, preprocessing
 11
 12from bikes.core import schemas
 13
 14# %% TYPES
 15
 16# Model params
 17ParamKey = str
 18ParamValue = T.Any
 19Params = dict[ParamKey, ParamValue]
 20
 21# %% MODELS
 22
 23
class Model(abc.ABC, pdt.BaseModel, strict=True, frozen=False, extra="forbid"):
    """Base class for a project model.

    Use a model to adapt AI/ML frameworks,
    e.g., to easily swap one model for another.
    """

    KIND: str

    def get_params(self, deep: bool = True) -> Params:
        """Get the model params.

        Args:
            deep (bool, optional): unused; kept for compatibility
                with the scikit-learn estimator API.

        Returns:
            Params: internal model parameters.
        """
        params: Params = {}
        for key, value in self.model_dump().items():
            # skip private attributes and class-level constants like KIND
            if not key.startswith("_") and not key.isupper():
                params[key] = value
        return params

    def set_params(self, **params: ParamValue) -> T.Self:
        """Set the model params in place.

        Args:
            **params (ParamValue): new values for the model params.

        Returns:
            T.Self: instance of the model.
        """
        for key, value in params.items():
            setattr(self, key, value)
        return self

    @abc.abstractmethod
    def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> T.Self:
        """Fit the model on the given inputs and targets.

        Args:
            inputs (schemas.Inputs): model training inputs.
            targets (schemas.Targets): model training targets.

        Returns:
            T.Self: instance of the model.
        """

    @abc.abstractmethod
    def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
        """Generate outputs with the model for the given inputs.

        Args:
            inputs (schemas.Inputs): model prediction inputs.

        Returns:
            schemas.Outputs: model prediction outputs.
        """

    def explain_model(self) -> schemas.FeatureImportances:
        """Explain the internal model structure.

        Raises:
            NotImplementedError: if the method is not implemented by the subclass.

        Returns:
            schemas.FeatureImportances: feature importances.
        """
        raise NotImplementedError()

    def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues:
        """Explain model outputs on input samples.

        Args:
            inputs (schemas.Inputs): model prediction inputs.

        Raises:
            NotImplementedError: if the method is not implemented by the subclass.

        Returns:
            schemas.SHAPValues: SHAP values.
        """
        raise NotImplementedError()

    def get_internal_model(self) -> T.Any:
        """Return the internal model in the object.

        Raises:
            NotImplementedError: if the method is not implemented by the subclass.

        Returns:
            T.Any: any internal model (either empty or fitted).
        """
        raise NotImplementedError()
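
The get_params/set_params pair mirrors the scikit-learn estimator interface, so a Model can be driven by sklearn-style tooling such as hyperparameter sweeps. A minimal sketch of the round trip, using the BaselineSklearnModel defined below (the values shown are its defaults):

    >>> model = BaselineSklearnModel()
    >>> model.get_params()  # KIND is dropped: it is uppercase, not a param
    {'max_depth': 20, 'n_estimators': 200, 'random_state': 42}
    >>> model.set_params(max_depth=5).get_params()["max_depth"]  # returns self
    5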

class BaselineSklearnModel(Model):
    """Simple baseline model based on scikit-learn.

    Parameters:
        max_depth (int): maximum depth of the random forest.
        n_estimators (int): number of estimators in the random forest.
        random_state (int, optional): random state of the machine learning pipeline.
    """

    KIND: T.Literal["BaselineSklearnModel"] = "BaselineSklearnModel"

    # params
    max_depth: int = 20
    n_estimators: int = 200
    random_state: int | None = 42
    # private
    _pipeline: pipeline.Pipeline | None = None
    _numericals: list[str] = [
        "yr",
        "mnth",
        "hr",
        "holiday",
        "weekday",
        "workingday",
        "temp",
        "atemp",
        "hum",
        "windspeed",
        "casual",
        # "registered", # too correlated with target
    ]
    _categoricals: list[str] = [
        "season",
        "weathersit",
    ]

    @T.override
    def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> "BaselineSklearnModel":
        # subcomponents
        categoricals_transformer = preprocessing.OneHotEncoder(
            sparse_output=False, handle_unknown="ignore"
        )
        # components
        transformer = compose.ColumnTransformer(
            [
                ("categoricals", categoricals_transformer, self._categoricals),
                ("numericals", "passthrough", self._numericals),
            ],
            remainder="drop",
        )
        regressor = ensemble.RandomForestRegressor(
            max_depth=self.max_depth, n_estimators=self.n_estimators, random_state=self.random_state
        )
        # pipeline
        self._pipeline = pipeline.Pipeline(
            steps=[
                ("transformer", transformer),
                ("regressor", regressor),
            ]
        )
        self._pipeline.fit(X=inputs, y=targets[schemas.TargetsSchema.cnt])
        return self

    @T.override
    def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
        model = self.get_internal_model()
        prediction = model.predict(inputs)
        outputs = schemas.Outputs(
            {schemas.OutputsSchema.prediction: prediction}, index=inputs.index
        )
        return outputs

    @T.override
    def explain_model(self) -> schemas.FeatureImportances:
        model = self.get_internal_model()
        regressor = model.named_steps["regressor"]
        transformer = model.named_steps["transformer"]
        column_names = transformer.get_feature_names_out()
        feature_importances = schemas.FeatureImportances(
            data={
                "feature": column_names,
                "importance": regressor.feature_importances_,
            }
        )
        return feature_importances

    @T.override
    def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues:
        model = self.get_internal_model()
        regressor = model.named_steps["regressor"]
        transformer = model.named_steps["transformer"]
        # apply the same preprocessing as training before computing SHAP values
        transformed = transformer.transform(X=inputs)
        explainer = shap.TreeExplainer(model=regressor)
        shap_values = schemas.SHAPValues(
            data=explainer.shap_values(X=transformed),
            columns=transformer.get_feature_names_out(),
        )
        return shap_values

    @T.override
    def get_internal_model(self) -> pipeline.Pipeline:
        model = self._pipeline
        if model is None:
            raise ValueError("Model is not fitted yet!")
        return model


ModelKind = BaselineSklearnModel
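
End to end, the intended flow is fit, then predict, then optionally explain. A minimal usage sketch: the CSV path is hypothetical, pandera validation against schemas.Inputs/Targets is skipped for brevity, and the "cnt" and "prediction" column names are assumed to match TargetsSchema.cnt and OutputsSchema.prediction.

import pandas as pd

from bikes.core.models import BaselineSklearnModel

data = pd.read_csv("data/hour.csv")  # hypothetical path with the columns listed above

model = BaselineSklearnModel(n_estimators=50)
model.fit(inputs=data, targets=data[["cnt"]])  # extra columns are dropped by the transformer

outputs = model.predict(inputs=data)  # DataFrame with a "prediction" column
importances = model.explain_model()  # global feature importances
shap_values = model.explain_samples(inputs=data.head(100))  # local SHAP values

Since ModelKind currently aliases the only concrete model, the KIND literal looks unused; it is the tag that would let ModelKind grow into a pydantic discriminated union (discriminator="KIND") once more model classes are added, so configs can select a model by name.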