Source code for probflow.data.array_data_generator

import numpy as np
import pandas as pd

from .data_generator import DataGenerator


class ArrayDataGenerator(DataGenerator):
    """Generate array-structured data to feed through a model.

    Holds references to in-memory arrays and serves them in minibatches of
    ``batch_size`` rows via :meth:`get_batch`, optionally visiting the rows
    in a freshly shuffled order each epoch.

    If neither ``x`` nor ``y`` is given the generator is "empty" (e.g. for
    sampling from a generative model): ``n_samples`` and ``batch_size`` are
    both 1 and :meth:`get_batch` returns ``(None, None)``.

    Parameters
    ----------
    x : |None| or |ndarray| or |DataFrame| or |Series|
        Independent variable values (or, if fitting a generative model,
        the dependent variable values).  Should be of shape (Nsamples,...).
        Default = ``None``
    y : |None| or |ndarray| or |DataFrame| or |Series|
        Dependent variable values (or, if fitting a generative model,
        ``None``).  Should be of shape (Nsamples,...).
        Default = ``None``
    batch_size : int
        Number of samples to use per minibatch.  Use ``None`` to use a
        single batch for all the data.  Values larger than the number of
        samples are reduced to the number of samples.
        Default = ``None``
    shuffle : bool
        Whether to shuffle the data each epoch.
        Default = ``False``
    test : bool
        Whether to treat data as testing data (allow no dependent
        variable).  When ``False`` and only ``x`` is given, ``x`` is
        stored as the dependent variable ``y`` and ``x`` becomes ``None``.
        Default = ``False``
    num_workers : |None| or int
        Number of worker threads; passed unchanged to the
        ``DataGenerator`` base class constructor.
        Default = ``None``

    Raises
    ------
    TypeError
        If ``x`` or ``y`` is not an ndarray, DataFrame, or Series, if
        ``batch_size`` is not an int, or if ``shuffle`` or ``test`` is not
        a bool.
    ValueError
        If ``batch_size`` is less than 1, or if ``x`` and ``y`` have a
        different number of samples.
    """

    def __init__(
        self,
        x=None,
        y=None,
        batch_size=None,
        shuffle=False,
        test=False,
        num_workers=None,
    ):

        # Set number of worker threads
        super().__init__(num_workers=num_workers)

        # Check types
        data_types = (np.ndarray, pd.DataFrame, pd.Series)
        if x is not None and not isinstance(x, data_types):
            raise TypeError("x must be an ndarray, a DataFrame, or a Series")
        if y is not None and not isinstance(y, data_types):
            raise TypeError("y must be an ndarray, a DataFrame, or a Series")
        if batch_size is not None:
            if not isinstance(batch_size, int):
                raise TypeError("batch_size must be an int")
            if batch_size < 1:
                raise ValueError("batch_size must be >0")
        if not isinstance(shuffle, bool):
            raise TypeError("shuffle must be True or False")
        if not isinstance(test, bool):
            raise TypeError("test must be True or False")

        # No data? (eg when sampling from a generative model)
        if x is None and y is None:
            self._empty = True
            self._n_samples = 1
            self._batch_size = 1
            # Still define the public attributes so that on_epoch_end()
            # and any reader of .x / .y / .shuffle work on an empty
            # generator instead of raising AttributeError.
            self.x = None
            self.y = None
            self.shuffle = shuffle
            return
        self._empty = False

        # Check sizes are consistent
        if x is not None and y is not None:
            if x.shape[0] != y.shape[0]:
                raise ValueError("x and y must contain same number of samples")

        # Generative model? Then the only array given is the target
        if not test and y is None:
            y = x
            x = None

        # Number of samples
        if x is None:
            self._n_samples = y.shape[0]
        else:
            self._n_samples = x.shape[0]

        # Batch size (a single batch if unspecified or larger than the data)
        if batch_size is None or self._n_samples < batch_size:
            self._batch_size = self._n_samples
        else:
            self._batch_size = batch_size

        # Store references to data
        self.x = x
        self.y = y

        # Shuffle data
        self.shuffle = shuffle
        self.on_epoch_end()

    @property
    def n_samples(self):
        """Number of samples in the dataset"""
        return self._n_samples

    @property
    def batch_size(self):
        """Number of samples to generate each minibatch"""
        return self._batch_size

    @staticmethod
    def _select_rows(data, ix):
        """Return the rows ``ix`` of ``data``, or ``None`` if it is ``None``."""
        if data is None:
            return None
        if isinstance(data, pd.DataFrame):
            return data.iloc[ix, :]
        if isinstance(data, pd.Series):
            return data.iloc[ix]
        return data[ix, ...]

    def get_batch(self, index):
        """Generate one batch of data

        Parameters
        ----------
        index : int
            Index of the batch.  The batch covers positions
            ``index * batch_size`` up to ``(index + 1) * batch_size``.

        Returns
        -------
        tuple
            ``(x, y)`` for the batch.  Either element is ``None`` when the
            corresponding data is not stored; both are ``None`` for an
            empty generator.
        """

        # Return none if no data
        if self._empty:
            return None, None

        # Get shuffled indexes: positions are mapped through the current
        # permutation when shuffling is enabled
        ix = slice(index * self.batch_size, (index + 1) * self.batch_size)
        if self.shuffle:
            ix = self.ids[ix]

        # Return both x and y
        return self._select_rows(self.x, ix), self._select_rows(self.y, ix)

    def on_epoch_end(self):
        """Shuffle data each epoch

        Draws a new random permutation of the sample indexes into
        ``self.ids`` when ``shuffle`` is enabled; otherwise does nothing.
        """
        if self.shuffle:
            self.ids = np.random.permutation(self.n_samples)