bikes.utils.splitters
Split dataframes into subsets (e.g., train/valid/test).
"""Split dataframes into subsets (e.g., train/valid/test)."""

# %% IMPORTS

import abc
import typing as T

import numpy as np
import numpy.typing as npt
import pydantic as pdt
from sklearn import model_selection

from bikes.core import schemas

# %% TYPES

# integer positions of the rows belonging to one subset
Index = npt.NDArray[np.int64]
# (train positions, test positions) for a single split
TrainTestIndex = tuple[Index, Index]
# stream of train/test pairs, one per split
TrainTestSplits = T.Iterator[TrainTestIndex]

# %% SPLITTERS


class Splitter(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for a splitter.

    Use splitters to split data into sets,
    e.g., split between train/test subsets.

    # https://scikit-learn.org/stable/glossary.html#term-CV-splitter
    """

    # name of the concrete splitter (narrowed to a Literal by subclasses)
    KIND: str

    @abc.abstractmethod
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        """Split a dataframe into subsets.

        Args:
            inputs (schemas.Inputs): model inputs.
            targets (schemas.Targets): model targets.
            groups (Index | None, optional): group labels.

        Returns:
            TrainTestSplits: iterator over the dataframe train/test splits.
        """

    @abc.abstractmethod
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        """Get the number of splits generated.

        Args:
            inputs (schemas.Inputs): model inputs.
            targets (schemas.Targets): model targets.
            groups (Index | None, optional): group labels.

        Returns:
            int: number of splits generated.
        """


class TrainTestSplitter(Splitter):
    """Split a dataframe into a train and test set.

    Parameters:
        shuffle (bool): shuffle the dataset. Default is False.
        test_size (int | float): number/ratio for the test set.
        random_state (int): random state for the splitter object.
    """

    KIND: T.Literal["TrainTestSplitter"] = "TrainTestSplitter"

    shuffle: bool = False  # required (time sensitive)
    test_size: int | float = 24 * 30 * 2  # 2 months
    random_state: int = 42

    @T.override
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        """Yield the single train/test pair of integer positions.

        Only the length of ``inputs`` is used; ``targets`` and ``groups``
        are accepted for interface compatibility and are not read.
        """
        positions = np.arange(len(inputs))  # integer positions, not labels
        train_index, test_index = model_selection.train_test_split(
            positions,
            test_size=self.test_size,
            shuffle=self.shuffle,
            random_state=self.random_state,
        )
        yield train_index, test_index

    @T.override
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        """Return 1: this splitter always yields exactly one pair."""
        return 1


class TimeSeriesSplitter(Splitter):
    """Split a dataframe into fixed time series subsets.

    Parameters:
        gap (int): gap between splits.
        n_splits (int): number of splits to generate.
        test_size (int | float): number or ratio for the test dataset.
            A float must lie in (0, 1) and is converted to a number of
            samples relative to the length of the inputs.
    """

    KIND: T.Literal["TimeSeriesSplitter"] = "TimeSeriesSplitter"

    gap: int = 0
    n_splits: int = 4
    test_size: int | float = 24 * 30 * 2  # 2 months

    @T.override
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        """Yield the train/test pairs produced by a time series split.

        Only ``inputs`` is used; ``targets`` and ``groups`` are accepted
        for interface compatibility and are not read.

        Raises:
            ValueError: if ``test_size`` is a float outside of (0, 1).
                Raised on iteration, since this method is a generator.
        """
        test_size = self.test_size
        if isinstance(test_size, float):
            # TimeSeriesSplit expects an integer number of samples:
            # turn the ratio into a count (at least one sample per fold).
            if not 0.0 < test_size < 1.0:
                raise ValueError(
                    f"test_size as a float must be a ratio in (0, 1), got: {test_size}"
                )
            test_size = max(1, int(len(inputs) * test_size))
        splitter = model_selection.TimeSeriesSplit(
            n_splits=self.n_splits, test_size=test_size, gap=self.gap
        )
        yield from splitter.split(inputs)

    @T.override
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        """Return the configured number of splits."""
        return self.n_splits


SplitterKind = TrainTestSplitter | TimeSeriesSplitter
Index =
numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]]
TrainTestIndex =
tuple[numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]], numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]]]
TrainTestSplits =
typing.Iterator[tuple[numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]], numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]]]]
class
Splitter(abc.ABC, pydantic.main.BaseModel):
class Splitter(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Abstract base for every dataframe splitter.

    A splitter produces train/test index pairs for a dataset
    and reports how many of those pairs it generates.

    # https://scikit-learn.org/stable/glossary.html#term-CV-splitter
    """

    # name of the concrete splitter
    KIND: str

    @abc.abstractmethod
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        """Generate the train/test index pairs for a dataset.

        Args:
            inputs (schemas.Inputs): model inputs.
            targets (schemas.Targets): model targets.
            groups (Index | None, optional): group labels.

        Returns:
            TrainTestSplits: iterator of (train index, test index) pairs.
        """

    @abc.abstractmethod
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        """Report how many splits are generated.

        Args:
            inputs (schemas.Inputs): model inputs.
            targets (schemas.Targets): model targets.
            groups (Index | None, optional): group labels.

        Returns:
            int: number of splits generated.
        """
Base class for a splitter.
Use splitters to split data into sets, e.g., split between train/test subsets.
https://scikit-learn.org/stable/glossary.html#term-CV-splitter
@abc.abstractmethod
def
split( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> Iterator[tuple[numpy.ndarray[Any, numpy.dtype[numpy.int64]], numpy.ndarray[Any, numpy.dtype[numpy.int64]]]]:
@abc.abstractmethod
def split(
    self,
    inputs: schemas.Inputs,
    targets: schemas.Targets,
    groups: Index | None = None,
) -> TrainTestSplits:
    """Generate the train/test index pairs for a dataset.

    Args:
        inputs (schemas.Inputs): model inputs.
        targets (schemas.Targets): model targets.
        groups (Index | None, optional): group labels.

    Returns:
        TrainTestSplits: iterator of (train index, test index) pairs.
    """
Split a dataframe into subsets.
Arguments:
- inputs (schemas.Inputs): model inputs.
- targets (schemas.Targets): model targets.
- groups (Index | None, optional): group labels.
Returns:
TrainTestSplits: iterator over the dataframe train/test splits.
@abc.abstractmethod
def
get_n_splits( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> int:
@abc.abstractmethod
def get_n_splits(
    self,
    inputs: schemas.Inputs,
    targets: schemas.Targets,
    groups: Index | None = None,
) -> int:
    """Report how many splits are generated.

    Args:
        inputs (schemas.Inputs): model inputs.
        targets (schemas.Targets): model targets.
        groups (Index | None, optional): group labels.

    Returns:
        int: number of splits generated.
    """
Get the number of splits generated.
Arguments:
- inputs (schemas.Inputs): model inputs.
- targets (schemas.Targets): model targets.
- groups (Index | None, optional): group labels.
Returns:
int: number of splits generated.
class TrainTestSplitter(Splitter):
    """Produce a single train/test split of a dataframe.

    Parameters:
        shuffle (bool): shuffle the dataset. Default is False.
        test_size (int | float): number/ratio for the test set.
        random_state (int): random state for the splitter object.
    """

    KIND: T.Literal["TrainTestSplitter"] = "TrainTestSplitter"

    shuffle: bool = False  # required (time sensitive)
    test_size: int | float = 24 * 30 * 2  # 2 months
    random_state: int = 42

    @T.override
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        """Yield the single train/test pair of integer positions.

        Only the length of ``inputs`` is used; ``targets`` and ``groups``
        are accepted for interface compatibility and are not read.
        """
        positions = np.arange(len(inputs))  # integer positions, not labels
        parts = model_selection.train_test_split(
            positions,
            test_size=self.test_size,
            shuffle=self.shuffle,
            random_state=self.random_state,
        )
        train_index, test_index = parts
        yield train_index, test_index

    @T.override
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        """Return 1: this splitter always yields exactly one pair."""
        return 1
Split a dataframe into a train and test set.
Arguments:
- shuffle (bool): shuffle the dataset. Default is False.
- test_size (int | float): number/ratio for the test set.
- random_state (int): random state for the splitter object.
@T.override
def
split( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> Iterator[tuple[numpy.ndarray[Any, numpy.dtype[numpy.int64]], numpy.ndarray[Any, numpy.dtype[numpy.int64]]]]:
@T.override
def split(
    self,
    inputs: schemas.Inputs,
    targets: schemas.Targets,
    groups: Index | None = None,
) -> TrainTestSplits:
    """Yield the single train/test pair of integer positions.

    Only the length of ``inputs`` is used; ``targets`` and ``groups``
    are accepted for interface compatibility and are not read.

    Args:
        inputs (schemas.Inputs): model inputs.
        targets (schemas.Targets): model targets.
        groups (Index | None, optional): group labels.

    Returns:
        TrainTestSplits: iterator yielding one (train, test) index pair.
    """
    positions = np.arange(len(inputs))  # integer positions, not labels
    parts = model_selection.train_test_split(
        positions,
        test_size=self.test_size,
        shuffle=self.shuffle,
        random_state=self.random_state,
    )
    train_index, test_index = parts
    yield train_index, test_index
Split a dataframe into subsets.
Arguments:
- inputs (schemas.Inputs): model inputs.
- targets (schemas.Targets): model targets.
- groups (Index | None, optional): group labels.
Returns:
TrainTestSplits: iterator over the dataframe train/test splits.
@T.override
def
get_n_splits( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> int:
@T.override
def get_n_splits(
    self,
    inputs: schemas.Inputs,
    targets: schemas.Targets,
    groups: Index | None = None,
) -> int:
    """Report the number of splits, which is always one here.

    Args:
        inputs (schemas.Inputs): model inputs (not read).
        targets (schemas.Targets): model targets (not read).
        groups (Index | None, optional): group labels (not read).

    Returns:
        int: the constant 1.
    """
    return 1
Get the number of splits generated.
Arguments:
- inputs (schemas.Inputs): model inputs.
- targets (schemas.Targets): model targets.
- groups (Index | None, optional): group labels.
Returns:
int: number of splits generated.
class TimeSeriesSplitter(Splitter):
    """Split a dataframe into fixed time series subsets.

    Parameters:
        gap (int): gap between splits.
        n_splits (int): number of splits to generate.
        test_size (int | float): number or ratio for the test dataset.
            A float must lie in (0, 1) and is converted to a number of
            samples relative to the length of the inputs.
    """

    KIND: T.Literal["TimeSeriesSplitter"] = "TimeSeriesSplitter"

    gap: int = 0
    n_splits: int = 4
    test_size: int | float = 24 * 30 * 2  # 2 months

    @T.override
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        """Yield the train/test pairs produced by a time series split.

        Only ``inputs`` is used; ``targets`` and ``groups`` are accepted
        for interface compatibility and are not read.

        Raises:
            ValueError: if ``test_size`` is a float outside of (0, 1).
                Raised on iteration, since this method is a generator.
        """
        test_size = self.test_size
        if isinstance(test_size, float):
            # TimeSeriesSplit expects an integer number of samples:
            # turn the ratio into a count (at least one sample per fold).
            if not 0.0 < test_size < 1.0:
                raise ValueError(
                    f"test_size as a float must be a ratio in (0, 1), got: {test_size}"
                )
            test_size = max(1, int(len(inputs) * test_size))
        splitter = model_selection.TimeSeriesSplit(
            n_splits=self.n_splits, test_size=test_size, gap=self.gap
        )
        yield from splitter.split(inputs)

    @T.override
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        """Return the configured number of splits."""
        return self.n_splits
Split a dataframe into fixed time series subsets.
Arguments:
- gap (int): gap between splits.
- n_splits (int): number of splits to generate.
- test_size (int | float): number or ratio for the test dataset.
@T.override
def
split( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> Iterator[tuple[numpy.ndarray[Any, numpy.dtype[numpy.int64]], numpy.ndarray[Any, numpy.dtype[numpy.int64]]]]:
@T.override
def split(
    self,
    inputs: schemas.Inputs,
    targets: schemas.Targets,
    groups: Index | None = None,
) -> TrainTestSplits:
    """Yield the train/test pairs produced by a time series split.

    Only ``inputs`` is used; ``targets`` and ``groups`` are accepted
    for interface compatibility and are not read.

    Args:
        inputs (schemas.Inputs): model inputs.
        targets (schemas.Targets): model targets.
        groups (Index | None, optional): group labels.

    Returns:
        TrainTestSplits: iterator over the dataframe train/test splits.

    Raises:
        ValueError: if ``test_size`` is a float outside of (0, 1).
            Raised on iteration, since this method is a generator.
    """
    test_size = self.test_size
    if isinstance(test_size, float):
        # TimeSeriesSplit expects an integer number of samples:
        # turn the ratio into a count (at least one sample per fold).
        if not 0.0 < test_size < 1.0:
            raise ValueError(
                f"test_size as a float must be a ratio in (0, 1), got: {test_size}"
            )
        test_size = max(1, int(len(inputs) * test_size))
    splitter = model_selection.TimeSeriesSplit(
        n_splits=self.n_splits, test_size=test_size, gap=self.gap
    )
    yield from splitter.split(inputs)
Split a dataframe into subsets.
Arguments:
- inputs (schemas.Inputs): model inputs.
- targets (schemas.Targets): model targets.
- groups (Index | None, optional): group labels.
Returns:
TrainTestSplits: iterator over the dataframe train/test splits.
@T.override
def
get_n_splits( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> int:
@T.override
def get_n_splits(
    self,
    inputs: schemas.Inputs,
    targets: schemas.Targets,
    groups: Index | None = None,
) -> int:
    """Report the configured number of splits.

    Args:
        inputs (schemas.Inputs): model inputs (not read).
        targets (schemas.Targets): model targets (not read).
        groups (Index | None, optional): group labels (not read).

    Returns:
        int: the value of the ``n_splits`` field.
    """
    return self.n_splits
Get the number of splits generated.
Arguments:
- inputs (schemas.Inputs): model inputs.
- targets (schemas.Targets): model targets.
- groups (Index | None, optional): group labels.
Returns:
int: number of splits generated.
SplitterKind =
TrainTestSplitter | TimeSeriesSplitter