bikes.core.models
Define trainable machine learning models.
1"""Define trainable machine learning models.""" 2 3# %% IMPORTS 4 5import abc 6import typing as T 7 8import pandas as pd 9import pydantic as pdt 10import shap 11from sklearn import compose, ensemble, pipeline, preprocessing 12 13from bikes.core import schemas 14 15# %% TYPES 16 17# Model params 18ParamKey = str 19ParamValue = T.Any 20Params = dict[ParamKey, ParamValue] 21 22# %% MODELS 23 24 25class Model(abc.ABC, pdt.BaseModel, strict=True, frozen=False, extra="forbid"): 26 """Base class for a project model. 27 28 Use a model to adapt AI/ML frameworks. 29 e.g., to swap easily one model with another. 30 """ 31 32 KIND: str 33 34 def get_params(self, deep: bool = True) -> Params: 35 """Get the model params. 36 37 Args: 38 deep (bool, optional): ignored. 39 40 Returns: 41 Params: internal model parameters. 42 """ 43 params: Params = {} 44 for key, value in self.model_dump().items(): 45 if not key.startswith("_") and not key.isupper(): 46 params[key] = value 47 return params 48 49 def set_params(self, **params: ParamValue) -> T.Self: 50 """Set the model params in place. 51 52 Returns: 53 T.Self: instance of the model. 54 """ 55 for key, value in params.items(): 56 setattr(self, key, value) 57 return self 58 59 @abc.abstractmethod 60 def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> T.Self: 61 """Fit the model on the given inputs and targets. 62 63 Args: 64 inputs (schemas.Inputs): model training inputs. 65 targets (schemas.Targets): model training targets. 66 67 Returns: 68 T.Self: instance of the model. 69 """ 70 71 @abc.abstractmethod 72 def predict(self, inputs: schemas.Inputs) -> schemas.Outputs: 73 """Generate outputs with the model for the given inputs. 74 75 Args: 76 inputs (schemas.Inputs): model prediction inputs. 77 78 Returns: 79 schemas.Outputs: model prediction outputs. 80 """ 81 82 def explain_model(self) -> schemas.FeatureImportances: 83 """Explain the internal model structure. 84 85 Returns: 86 schemas.FeatureImportances: feature importances. 87 """ 88 raise NotImplementedError() 89 90 def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues: 91 """Explain model outputs on input samples. 92 93 Returns: 94 schemas.SHAPValues: SHAP values. 95 """ 96 raise NotImplementedError() 97 98 def get_internal_model(self) -> T.Any: 99 """Return the internal model in the object. 100 101 Raises: 102 NotImplementedError: method not implemented. 103 104 Returns: 105 T.Any: any internal model (either empty or fitted). 106 """ 107 raise NotImplementedError() 108 109 110class BaselineSklearnModel(Model): 111 """Simple baseline model based on scikit-learn. 112 113 Parameters: 114 max_depth (int): maximum depth of the random forest. 115 n_estimators (int): number of estimators in the random forest. 116 random_state (int, optional): random state of the machine learning pipeline. 
117 """ 118 119 KIND: T.Literal["BaselineSklearnModel"] = "BaselineSklearnModel" 120 121 # params 122 max_depth: int = 20 123 n_estimators: int = 200 124 random_state: int | None = 42 125 # private 126 _pipeline: pipeline.Pipeline | None = None 127 _numericals: list[str] = [ 128 "yr", 129 "mnth", 130 "hr", 131 "holiday", 132 "weekday", 133 "workingday", 134 "temp", 135 "atemp", 136 "hum", 137 "windspeed", 138 "casual", 139 "registered", # too correlated with target 140 ] 141 _categoricals: list[str] = [ 142 "season", 143 "weathersit", 144 ] 145 146 @T.override 147 def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> "BaselineSklearnModel": 148 # subcomponents 149 categoricals_transformer = preprocessing.OneHotEncoder( 150 sparse_output=False, handle_unknown="ignore" 151 ) 152 # components 153 transformer = compose.ColumnTransformer( 154 [ 155 ("categoricals", categoricals_transformer, self._categoricals), 156 ("numericals", "passthrough", self._numericals), 157 ], 158 remainder="drop", 159 ) 160 regressor = ensemble.RandomForestRegressor( 161 max_depth=self.max_depth, 162 n_estimators=self.n_estimators, 163 random_state=self.random_state, 164 ) 165 # pipeline 166 self._pipeline = pipeline.Pipeline( 167 steps=[ 168 ("transformer", transformer), 169 ("regressor", regressor), 170 ] 171 ) 172 self._pipeline.fit(X=inputs, y=targets[schemas.TargetsSchema.cnt]) 173 return self 174 175 @T.override 176 def predict(self, inputs: schemas.Inputs) -> schemas.Outputs: 177 model = self.get_internal_model() 178 prediction = model.predict(inputs) 179 outputs_ = pd.DataFrame( 180 data={schemas.OutputsSchema.prediction: prediction}, index=inputs.index 181 ) 182 outputs = schemas.OutputsSchema.check(data=outputs_) 183 return outputs 184 185 @T.override 186 def explain_model(self) -> schemas.FeatureImportances: 187 model = self.get_internal_model() 188 regressor = model.named_steps["regressor"] 189 transformer = model.named_steps["transformer"] 190 feature = transformer.get_feature_names_out() 191 feature_importances_ = pd.DataFrame( 192 data={ 193 "feature": feature, 194 "importance": regressor.feature_importances_, 195 } 196 ) 197 feature_importances = schemas.FeatureImportancesSchema.check(data=feature_importances_) 198 return feature_importances 199 200 @T.override 201 def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues: 202 model = self.get_internal_model() 203 regressor = model.named_steps["regressor"] 204 transformer = model.named_steps["transformer"] 205 transformed = transformer.transform(X=inputs) 206 explainer = shap.TreeExplainer(model=regressor) 207 shap_values_ = pd.DataFrame( 208 data=explainer.shap_values(X=transformed), 209 columns=transformer.get_feature_names_out(), 210 ) 211 shap_values = schemas.SHAPValuesSchema.check(data=shap_values_) 212 return shap_values 213 214 @T.override 215 def get_internal_model(self) -> pipeline.Pipeline: 216 model = self._pipeline 217 if model is None: 218 raise ValueError("Model is not fitted yet!") 219 return model 220 221 222ModelKind = BaselineSklearnModel
Model(abc.ABC, pdt.BaseModel, strict=True, frozen=False, extra="forbid")
Base class for a project model.

Use a model to adapt AI/ML frameworks, e.g., to swap one model for another easily.
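To give the idea concrete shape, here is a minimal sketch of a custom subclass satisfying the abstract `fit`/`predict` contract. The `HypotheticalMeanModel` name and its logic are illustrative, not part of the package:

```python
import typing as T

import pandas as pd

from bikes.core import schemas
from bikes.core.models import Model


class HypotheticalMeanModel(Model):
    """Toy model that always predicts the mean of the training target."""

    KIND: T.Literal["HypotheticalMeanModel"] = "HypotheticalMeanModel"

    # private state, filled by fit()
    _mean: float = 0.0

    def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> "HypotheticalMeanModel":
        self._mean = float(targets[schemas.TargetsSchema.cnt].mean())
        return self

    def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
        # broadcast the constant prediction over the input index
        outputs_ = pd.DataFrame(
            data={schemas.OutputsSchema.prediction: self._mean}, index=inputs.index
        )
        return schemas.OutputsSchema.check(data=outputs_)
```

Because every model exposes the same interface, downstream code (training jobs, registries, inference services) can swap `BaselineSklearnModel` for such a subclass without changing anything else.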
get_params(deep: bool = True) -> Params
Get the model params.
Arguments:
- deep (bool, optional): ignored; present to mirror the scikit-learn estimator signature.
Returns:
Params: internal model parameters.
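For instance, with the `BaselineSklearnModel` defined below, only public, lowercase fields are reported (a small sketch; the values are the class defaults):

```python
from bikes.core.models import BaselineSklearnModel

model = BaselineSklearnModel()
print(model.get_params())
# {'max_depth': 20, 'n_estimators': 200, 'random_state': 42}
```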
set_params(**params: ParamValue) -> T.Self
Set the model params in place.

Arguments:
- **params (ParamValue): names and values of the parameters to set.

Returns:
T.Self: instance of the model.
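And the matching setter, sketched on the same hypothetical instance; unknown parameter names raise an error since the model forbids extra fields:

```python
from bikes.core.models import BaselineSklearnModel

model = BaselineSklearnModel().set_params(max_depth=10, n_estimators=50)
assert model.get_params()["max_depth"] == 10
```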
fit(inputs: schemas.Inputs, targets: schemas.Targets) -> T.Self (abstractmethod)
Fit the model on the given inputs and targets.
Arguments:
- inputs (schemas.Inputs): model training inputs.
- targets (schemas.Targets): model training targets.
Returns:
T.Self: instance of the model.
predict(inputs: schemas.Inputs) -> schemas.Outputs (abstractmethod)
Generate outputs with the model for the given inputs.
Arguments:
- inputs (schemas.Inputs): model prediction inputs.
Returns:
schemas.Outputs: model prediction outputs.
explain_model() -> schemas.FeatureImportances
Explain the internal model structure.

Raises:
- NotImplementedError: the base implementation does not provide model explanation.

Returns:
schemas.FeatureImportances: feature importances.
explain_samples(inputs: schemas.Inputs) -> schemas.SHAPValues
Explain model outputs on input samples.

Arguments:
- inputs (schemas.Inputs): input samples to explain.

Raises:
- NotImplementedError: the base implementation does not provide sample explanation.

Returns:
schemas.SHAPValues: SHAP values.
get_internal_model() -> T.Any
Return the internal model held by the object.
Raises:
- NotImplementedError: method not implemented.
Returns:
T.Any: any internal model (either empty or fitted).
BaselineSklearnModel(Model)
Simple baseline model based on scikit-learn.
Arguments:
- max_depth (int): maximum depth of the random forest.
- n_estimators (int): number of estimators in the random forest.
- random_state (int, optional): random state of the machine learning pipeline.
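A minimal end-to-end sketch. The frames below are hand-made stand-ins that follow the column layout expected by the project schemas (real data would come from the project's dataset readers); the values are purely illustrative:

```python
import pandas as pd

from bikes.core.models import BaselineSklearnModel

inputs = pd.DataFrame(
    {
        "yr": [0, 1], "mnth": [1, 6], "hr": [0, 12], "holiday": [0, 0],
        "weekday": [6, 3], "workingday": [0, 1], "temp": [0.24, 0.60],
        "atemp": [0.29, 0.61], "hum": [0.81, 0.50], "windspeed": [0.0, 0.19],
        "casual": [3, 40], "registered": [13, 150],
        "season": [1, 2], "weathersit": [1, 1],
    }
)
targets = pd.DataFrame({"cnt": [16, 190]})

model = BaselineSklearnModel(max_depth=5, n_estimators=10)
model.fit(inputs=inputs, targets=targets)
outputs = model.predict(inputs=inputs)  # DataFrame with a "prediction" column
```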
fit(inputs: schemas.Inputs, targets: schemas.Targets) -> BaselineSklearnModel
Fit the model on the given inputs and targets.
Arguments:
- inputs (schemas.Inputs): model training inputs.
- targets (schemas.Targets): model training targets.
Returns:
T.Self: instance of the model.
predict(inputs: schemas.Inputs) -> schemas.Outputs
Generate outputs with the model for the given inputs.
Arguments:
- inputs (schemas.Inputs): model prediction inputs.
Returns:
schemas.Outputs: model prediction outputs.
explain_model() -> schemas.FeatureImportances
Explain the internal model structure.
Returns:
schemas.FeatureImportances: feature importances.
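Continuing the sketch above, the global importances can then be ranked; the `feature` and `importance` column names come from the project's feature-importances schema:

```python
importances = model.explain_model()
print(importances.sort_values("importance", ascending=False).head())
```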
explain_samples(inputs: schemas.Inputs) -> schemas.SHAPValues
Explain model outputs on input samples.
Returns:
schemas.SHAPValues: SHAP values.
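Per-sample attributions follow the same pattern: each row matches one input sample and each column one transformed feature (a sketch, reusing the fitted model above):

```python
shap_values = model.explain_samples(inputs=inputs)
print(shap_values.shape)  # (n_samples, n_transformed_features)
```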
get_internal_model() -> pipeline.Pipeline
Return the internal scikit-learn pipeline.

Raises:
- ValueError: if the model has not been fitted yet.

Returns:
pipeline.Pipeline: the fitted scikit-learn pipeline.
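A short sketch of reaching into the fitted pipeline, e.g. to inspect the trained regressor or the transformed feature names (again assuming the model was fitted as above):

```python
internal = model.get_internal_model()
print(internal.named_steps["regressor"].n_estimators)
print(internal.named_steps["transformer"].get_feature_names_out())
```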