bikes.utils.splitters

Split dataframes into subsets (e.g., train/valid/test).

  1"""Split dataframes into subsets (e.g., train/valid/test)."""
  2
  3# %% IMPORTS
  4
  5import abc
  6import typing as T
  7
  8import numpy as np
  9import numpy.typing as npt
 10import pydantic as pdt
 11from sklearn import model_selection
 12
 13from bikes.core import schemas
 14
# %% TYPES

# Integer row positions into a dataframe (what sklearn splitters yield).
Index = npt.NDArray[np.int64]
# One (train_index, test_index) pair of positional indexes.
TrainTestIndex = tuple[Index, Index]
# Lazy stream of train/test index pairs produced by a splitter.
TrainTestSplits = T.Iterator[TrainTestIndex]

# %% SPLITTERS
 23
class Splitter(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for a splitter.

    Use splitters to split data into sets,
    e.g., split between train and test subsets.

    Follows the scikit-learn CV-splitter protocol:
    # https://scikit-learn.org/stable/glossary.html#term-CV-splitter
    """

    # Literal tag set by subclasses; lets pydantic discriminate concrete kinds.
    KIND: str

    @abc.abstractmethod
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        """Split a dataframe into subsets.

        Args:
            inputs (schemas.Inputs): model inputs.
            targets (schemas.Targets): model targets.
            groups (Index | None, optional): group labels.

        Returns:
            TrainTestSplits: iterator over the dataframe train/test splits.
        """

    @abc.abstractmethod
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        """Get the number of splits generated.

        Args:
            inputs (schemas.Inputs): model inputs.
            targets (schemas.Targets): model targets.
            groups (Index | None, optional): group labels.

        Returns:
            int: number of splits generated.
        """
 70
 71
 72class TrainTestSplitter(Splitter):
 73    """Split a dataframe into a train and test set.
 74
 75    Parameters:
 76        shuffle (bool): shuffle the dataset. Default is False.
 77        test_size (int | float): number/ratio for the test set.
 78        random_state (int): random state for the splitter object.
 79    """
 80
 81    KIND: T.Literal["TrainTestSplitter"] = "TrainTestSplitter"
 82
 83    shuffle: bool = False  # required (time sensitive)
 84    test_size: int | float = 24 * 30 * 2  # 2 months
 85    random_state: int = 42
 86
 87    @T.override
 88    def split(
 89        self,
 90        inputs: schemas.Inputs,
 91        targets: schemas.Targets,
 92        groups: Index | None = None,
 93    ) -> TrainTestSplits:
 94        index = np.arange(len(inputs))  # return integer position
 95        train_index, test_index = model_selection.train_test_split(
 96            index,
 97            shuffle=self.shuffle,
 98            test_size=self.test_size,
 99            random_state=self.random_state,
100        )
101        yield train_index, test_index
102
103    @T.override
104    def get_n_splits(
105        self,
106        inputs: schemas.Inputs,
107        targets: schemas.Targets,
108        groups: Index | None = None,
109    ) -> int:
110        return 1
111
112
class TimeSeriesSplitter(Splitter):
    """Generate chronological train/test folds over a dataframe.

    Parameters:
        gap (int): number of rows skipped between each train and test fold.
        n_splits (int): how many folds to generate.
        test_size (int | float): absolute count or ratio of rows per test fold.
    """

    KIND: T.Literal["TimeSeriesSplitter"] = "TimeSeriesSplitter"

    gap: int = 0
    n_splits: int = 4
    test_size: int | float = 24 * 30 * 2  # 2 months

    @T.override
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        # Delegate the chronological fold generation to scikit-learn.
        cv = model_selection.TimeSeriesSplit(
            gap=self.gap,
            n_splits=self.n_splits,
            test_size=self.test_size,
        )
        yield from cv.split(inputs)

    @T.override
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        # The fold count is fixed by configuration, independent of the data.
        return self.n_splits
148
149
# Union of the concrete splitter types; the KIND literal field serves as the
# tag to tell them apart (presumably used as a pydantic discriminated union
# by callers — TODO confirm against the config loading code).
SplitterKind = TrainTestSplitter | TimeSeriesSplitter
Index = numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]]
TrainTestIndex = tuple[numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]], numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]]]
TrainTestSplits = typing.Iterator[tuple[numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]], numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]]]]
class Splitter(abc.ABC, pydantic.main.BaseModel):
25class Splitter(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
26    """Base class for a splitter.
27
28    Use splitters to split data in sets.
29    e.g., split between a train/test subsets.
30
31    # https://scikit-learn.org/stable/glossary.html#term-CV-splitter
32    """
33
34    KIND: str
35
36    @abc.abstractmethod
37    def split(
38        self,
39        inputs: schemas.Inputs,
40        targets: schemas.Targets,
41        groups: Index | None = None,
42    ) -> TrainTestSplits:
43        """Split a dataframe into subsets.
44
45        Args:
46            inputs (schemas.Inputs): model inputs.
47            targets (schemas.Targets): model targets.
48            groups (Index | None, optional): group labels.
49
50        Returns:
51            TrainTestSplits: iterator over the dataframe train/test splits.
52        """
53
54    @abc.abstractmethod
55    def get_n_splits(
56        self,
57        inputs: schemas.Inputs,
58        targets: schemas.Targets,
59        groups: Index | None = None,
60    ) -> int:
61        """Get the number of splits generated.
62
63        Args:
64            inputs (schemas.Inputs): models inputs.
65            targets (schemas.Targets): model targets.
66            groups (Index | None, optional): group labels.
67
68        Returns:
69            int: number of splits generated.
70        """

Base class for a splitter.

Use splitters to split data into sets, e.g., split between train and test subsets.

https://scikit-learn.org/stable/glossary.html#term-CV-splitter

KIND: str
@abc.abstractmethod
def split( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> Iterator[tuple[numpy.ndarray[Any, numpy.dtype[numpy.int64]], numpy.ndarray[Any, numpy.dtype[numpy.int64]]]]:
36    @abc.abstractmethod
37    def split(
38        self,
39        inputs: schemas.Inputs,
40        targets: schemas.Targets,
41        groups: Index | None = None,
42    ) -> TrainTestSplits:
43        """Split a dataframe into subsets.
44
45        Args:
46            inputs (schemas.Inputs): model inputs.
47            targets (schemas.Targets): model targets.
48            groups (Index | None, optional): group labels.
49
50        Returns:
51            TrainTestSplits: iterator over the dataframe train/test splits.
52        """

Split a dataframe into subsets.

Arguments:
  • inputs (schemas.Inputs): model inputs.
  • targets (schemas.Targets): model targets.
  • groups (Index | None, optional): group labels.
Returns:

TrainTestSplits: iterator over the dataframe train/test splits.

@abc.abstractmethod
def get_n_splits( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> int:
54    @abc.abstractmethod
55    def get_n_splits(
56        self,
57        inputs: schemas.Inputs,
58        targets: schemas.Targets,
59        groups: Index | None = None,
60    ) -> int:
61        """Get the number of splits generated.
62
63        Args:
64            inputs (schemas.Inputs): models inputs.
65            targets (schemas.Targets): model targets.
66            groups (Index | None, optional): group labels.
67
68        Returns:
69            int: number of splits generated.
70        """

Get the number of splits generated.

Arguments:
  • inputs (schemas.Inputs): model inputs.
  • targets (schemas.Targets): model targets.
  • groups (Index | None, optional): group labels.
Returns:

int: number of splits generated.

model_config: ClassVar[pydantic.config.ConfigDict] = {'strict': True, 'frozen': True, 'extra': 'forbid'}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class TrainTestSplitter(Splitter):
 73class TrainTestSplitter(Splitter):
 74    """Split a dataframe into a train and test set.
 75
 76    Parameters:
 77        shuffle (bool): shuffle the dataset. Default is False.
 78        test_size (int | float): number/ratio for the test set.
 79        random_state (int): random state for the splitter object.
 80    """
 81
 82    KIND: T.Literal["TrainTestSplitter"] = "TrainTestSplitter"
 83
 84    shuffle: bool = False  # required (time sensitive)
 85    test_size: int | float = 24 * 30 * 2  # 2 months
 86    random_state: int = 42
 87
 88    @T.override
 89    def split(
 90        self,
 91        inputs: schemas.Inputs,
 92        targets: schemas.Targets,
 93        groups: Index | None = None,
 94    ) -> TrainTestSplits:
 95        index = np.arange(len(inputs))  # return integer position
 96        train_index, test_index = model_selection.train_test_split(
 97            index,
 98            shuffle=self.shuffle,
 99            test_size=self.test_size,
100            random_state=self.random_state,
101        )
102        yield train_index, test_index
103
104    @T.override
105    def get_n_splits(
106        self,
107        inputs: schemas.Inputs,
108        targets: schemas.Targets,
109        groups: Index | None = None,
110    ) -> int:
111        return 1

Split a dataframe into a train and test set.

Arguments:
  • shuffle (bool): shuffle the dataset. Default is False.
  • test_size (int | float): number/ratio for the test set.
  • random_state (int): random state for the splitter object.
KIND: Literal['TrainTestSplitter']
shuffle: bool
test_size: int | float
random_state: int
@T.override
def split( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> Iterator[tuple[numpy.ndarray[Any, numpy.dtype[numpy.int64]], numpy.ndarray[Any, numpy.dtype[numpy.int64]]]]:
 88    @T.override
 89    def split(
 90        self,
 91        inputs: schemas.Inputs,
 92        targets: schemas.Targets,
 93        groups: Index | None = None,
 94    ) -> TrainTestSplits:
 95        index = np.arange(len(inputs))  # return integer position
 96        train_index, test_index = model_selection.train_test_split(
 97            index,
 98            shuffle=self.shuffle,
 99            test_size=self.test_size,
100            random_state=self.random_state,
101        )
102        yield train_index, test_index

Split a dataframe into subsets.

Arguments:
  • inputs (schemas.Inputs): model inputs.
  • targets (schemas.Targets): model targets.
  • groups (Index | None, optional): group labels.
Returns:

TrainTestSplits: iterator over the dataframe train/test splits.

@T.override
def get_n_splits( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> int:
104    @T.override
105    def get_n_splits(
106        self,
107        inputs: schemas.Inputs,
108        targets: schemas.Targets,
109        groups: Index | None = None,
110    ) -> int:
111        return 1

Get the number of splits generated.

Arguments:
  • inputs (schemas.Inputs): model inputs.
  • targets (schemas.Targets): model targets.
  • groups (Index | None, optional): group labels.
Returns:

int: number of splits generated.

model_config: ClassVar[pydantic.config.ConfigDict] = {'strict': True, 'frozen': True, 'extra': 'forbid'}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class TimeSeriesSplitter(Splitter):
114class TimeSeriesSplitter(Splitter):
115    """Split a dataframe into fixed time series subsets.
116
117    Parameters:
118        gap (int): gap between splits.
119        n_splits (int): number of split to generate.
120        test_size (int | float): number or ratio for the test dataset.
121    """
122
123    KIND: T.Literal["TimeSeriesSplitter"] = "TimeSeriesSplitter"
124
125    gap: int = 0
126    n_splits: int = 4
127    test_size: int | float = 24 * 30 * 2  # 2 months
128
129    @T.override
130    def split(
131        self,
132        inputs: schemas.Inputs,
133        targets: schemas.Targets,
134        groups: Index | None = None,
135    ) -> TrainTestSplits:
136        splitter = model_selection.TimeSeriesSplit(
137            n_splits=self.n_splits, test_size=self.test_size, gap=self.gap
138        )
139        yield from splitter.split(inputs)
140
141    @T.override
142    def get_n_splits(
143        self,
144        inputs: schemas.Inputs,
145        targets: schemas.Targets,
146        groups: Index | None = None,
147    ) -> int:
148        return self.n_splits

Split a dataframe into fixed time series subsets.

Arguments:
  • gap (int): gap between splits.
  • n_splits (int): number of splits to generate.
  • test_size (int | float): number or ratio for the test dataset.
KIND: Literal['TimeSeriesSplitter']
gap: int
n_splits: int
test_size: int | float
@T.override
def split( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> Iterator[tuple[numpy.ndarray[Any, numpy.dtype[numpy.int64]], numpy.ndarray[Any, numpy.dtype[numpy.int64]]]]:
129    @T.override
130    def split(
131        self,
132        inputs: schemas.Inputs,
133        targets: schemas.Targets,
134        groups: Index | None = None,
135    ) -> TrainTestSplits:
136        splitter = model_selection.TimeSeriesSplit(
137            n_splits=self.n_splits, test_size=self.test_size, gap=self.gap
138        )
139        yield from splitter.split(inputs)

Split a dataframe into subsets.

Arguments:
  • inputs (schemas.Inputs): model inputs.
  • targets (schemas.Targets): model targets.
  • groups (Index | None, optional): group labels.
Returns:

TrainTestSplits: iterator over the dataframe train/test splits.

@T.override
def get_n_splits( self, inputs: pandera.typing.pandas.DataFrame[bikes.core.schemas.InputsSchema], targets: pandera.typing.pandas.DataFrame[bikes.core.schemas.TargetsSchema], groups: numpy.ndarray[typing.Any, numpy.dtype[numpy.int64]] | None = None) -> int:
141    @T.override
142    def get_n_splits(
143        self,
144        inputs: schemas.Inputs,
145        targets: schemas.Targets,
146        groups: Index | None = None,
147    ) -> int:
148        return self.n_splits

Get the number of splits generated.

Arguments:
  • inputs (schemas.Inputs): model inputs.
  • targets (schemas.Targets): model targets.
  • groups (Index | None, optional): group labels.
Returns:

int: number of splits generated.

model_config: ClassVar[pydantic.config.ConfigDict] = {'strict': True, 'frozen': True, 'extra': 'forbid'}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].