Source code for omegaml.mixins.store.projected

  1import pandas as pd
  2import re
  3
  4

[docs]
  5class ProjectedMixin(object):
  6    """
  7    A OmegaStore mixin to process column specifications in dataset name
  8    """
  9    colspec_pattern = re.compile(r"(?P<name>.*)\[(?P<colspec>.*)\].*$")
 10
 11    def metadata(self, name, *args, **kwargs):
 12        if isinstance(name, str):
 13            name, colspec = self._extract_column_specs(name)
 14        return super(ProjectedMixin, self).metadata(name, *args, **kwargs)
 15

[docs]
 16    def get(self, name, *args, **kwargs):
 17        """
 18        Return a projected dataset given a name of form name[colspec]
 19
 20        colspec can be any of
 21
 22        * a comma separated list of columns, e.g. ``foo[a,b]``
 23        * an open-ended slice, e.g. ``foo[a:]`` => all columns following a, inclusive
 24        * an closed slice, e.g. ``foo[a:b]`` => all columns between a,b, inclusive
 25        * a close-ended slice, e.g. ``foo[:b]`` => all columns up to b, inclusive
 26        * an empty slice, e.g. ``foo[:]`` => all columns
 27        * a list of columns to exclude, e.g. ``foo[^b]`` => all columns except b
 28
 29        :param name: (str) the name of the dataset, optionally including a
 30           column specification
 31        :return: the dataset with projected columns
 32        """
 33        # split base name from specs, get metadata
 34        name, colspec = self._extract_column_specs(name)
 35        if colspec is None:
 36            # no column spec in name, avoid projection
 37            data = super(ProjectedMixin, self).get(name, *args, **kwargs)
 38        else:
 39            # column specs in name, get projected data
 40            data = self._get_data_from_projection(name, colspec, *args, **kwargs)
 41        return data

 42
 43    def _extract_column_specs(self, name):
 44        colspec_pattern = self.colspec_pattern
 45        match = colspec_pattern.match(name)
 46        colspec = None
 47        if match is not None:
 48            name, colspec = match.groups()
 49        return name, colspec
 50
 51    def _get_data_from_projection(self, name, colspec, *args, **kwargs):
 52        # see if we can get columns from metadata
 53        # if so we can specify the columns before getting the data
 54        meta = self.metadata(name)
 55        if 'columns' in meta.kind_meta:
 56            colmap = meta.kind_meta['columns']
 57            if isinstance(colmap, dict):
 58                all_columns = list(colmap.keys())[1]
 59            else:
 60                # colmap is list of tuples (colname, storedname)
 61                all_columns = list(zip(*colmap))[1]
 62            columns = columnset(colspec, all_columns)
 63            kwargs['columns'] = columns
 64            data = super(ProjectedMixin, self).get(name, *args, **kwargs)
 65        else:
 66            # we don't have columns in metadata, get the data first
 67            # only subset on dataframes
 68            data = super(ProjectedMixin, self).get(name, *args, **kwargs)
 69            if isinstance(data, pd.DataFrame):
 70                all_columns = data.columns
 71                name, columns = columnset(colspec, all_columns)
 72                data = data[columns]
 73        return data

 74
 75
 76def columnset(colspec, all_columns):
 77    """
 78    find the specified columns in data[colspec] in list of all columns
 79
 80    colspec can be any of
 81
 82    * a comma separated list of columns, e.g. foo[a,b]
 83    * an open-ended slice, e.g. foo[a:] => all columns following a, inclusive
 84    * an closed slice, e.g. foo[a:b] => all columns between a,b, inclusive
 85    * a close-ended slice, e.g. foo[:b] => all columns up to b, inclusive
 86    * an empty slice, e.g. foo[:] => all columns
 87    * a list of columns to exclude, e.g. foo[^b] => all columns except b
 88    """
 89    if colspec is not None:
 90        if ':' in colspec:
 91            from_col, to_col = colspec.split(':')
 92            from_i = (all_columns.index(from_col)
 93                      if from_col else 0)
 94            to_i = (all_columns.index(to_col)
 95                    if to_col else len(all_columns)) + 1
 96            columns = all_columns[from_i:to_i]
 97        elif colspec.startswith('^'):
 98            columns = [col for col in all_columns
 99                       if col not in colspec[1:].split(',')]
100        else:
101            columns = colspec.split(',')
102    else:
103        columns = all_columns
104    return columns