Source code for omegaml.mixins.store.projected

  1import pandas as pd
  2import re
  3
  4
class ProjectedMixin(object):
    """
    An OmegaStore mixin to process column specifications in dataset names

    A dataset name of the form ``name[colspec]`` is split into the base
    ``name`` and the ``colspec``. ``metadata()`` ignores the colspec,
    ``get()`` uses it to return only the selected columns.
    """
    # both groups are greedy: for nested brackets the base name extends up
    # to the last '[' and the colspec up to the last ']'
    colspec_pattern = re.compile(r"(?P<name>.*)\[(?P<colspec>.*)\].*$")

    def metadata(self, name, *args, **kwargs):
        """
        Return the metadata for a dataset, ignoring any column specification

        :param name: the name of the dataset, optionally including a
           column specification; non-string names are passed on unchanged
        :return: the result of the next ``metadata()`` in the MRO, called
           with the base name
        """
        name, _ = self._extract_column_specs(name)
        return super(ProjectedMixin, self).metadata(name, *args, **kwargs)

    def get(self, name, *args, **kwargs):
        """
        Return a projected dataset given a name of form name[colspec]

        colspec can be any of

        * a comma separated list of columns, e.g. ``foo[a,b]``
        * an open-ended slice, e.g. ``foo[a:]`` => all columns following a, inclusive
        * a closed slice, e.g. ``foo[a:b]`` => all columns between a,b, inclusive
        * a close-ended slice, e.g. ``foo[:b]`` => all columns up to b, inclusive
        * an empty slice, e.g. ``foo[:]`` => all columns
        * a list of columns to exclude, e.g. ``foo[^b]`` => all columns except b

        :param name: (str) the name of the dataset, optionally including a
           column specification
        :return: the dataset with projected columns
        """
        # split base name from specs
        name, colspec = self._extract_column_specs(name)
        if colspec is None:
            # no column spec in name, avoid projection
            return super(ProjectedMixin, self).get(name, *args, **kwargs)
        # column specs in name, get projected data
        return self._get_data_from_projection(name, colspec, *args, **kwargs)

    def _extract_column_specs(self, name):
        """
        Split a dataset name into its base name and column specification

        :param name: the dataset name, e.g. ``foo[a,b]``
        :return: tuple ``(name, colspec)``; colspec is None and name is
           returned unchanged if name is not a string or does not contain
           a ``[colspec]`` part
        """
        # only strings can carry a colspec; pass anything else through so
        # that get() and metadata() treat non-string names the same way
        if not isinstance(name, str):
            return name, None
        match = self.colspec_pattern.match(name)
        if match is None:
            return name, None
        return match.group('name'), match.group('colspec')

    def _get_data_from_projection(self, name, colspec, *args, **kwargs):
        """
        Get the dataset and restrict it to the columns selected by colspec

        If the metadata's ``kind_meta`` lists the dataset's columns, the
        selected columns are passed as the ``columns`` kwarg to the next
        ``get()`` in the MRO. Otherwise the data is retrieved first and
        subset afterwards, which is only done for pandas DataFrames.

        :param name: the base name of the dataset (without colspec)
        :param colspec: the column specification, see ``get()``
        :return: the dataset with projected columns
        """
        # see if we can get columns from metadata
        # if so we can specify the columns before getting the data
        meta = self.metadata(name)
        if 'columns' in meta.kind_meta:
            colmap = meta.kind_meta['columns']
            if isinstance(colmap, dict):
                # all keys are column names, not just one of them
                all_columns = list(colmap)
            else:
                # colmap is list of tuples (colname, storedname),
                # the second element of each tuple is used
                all_columns = [entry[1] for entry in colmap]
            kwargs['columns'] = columnset(colspec, all_columns)
            return super(ProjectedMixin, self).get(name, *args, **kwargs)
        # we don't have columns in metadata, get the data first
        # only subset on dataframes
        data = super(ProjectedMixin, self).get(name, *args, **kwargs)
        if isinstance(data, pd.DataFrame):
            # a pandas Index has no .index() method, hand over a plain list
            columns = columnset(colspec, list(data.columns))
            # select with a list so that the result stays a DataFrame
            data = data[list(columns)]
        return data
def columnset(colspec, all_columns):
    """
    find the specified columns in data[colspec] in list of all columns

    colspec can be any of

    * a comma separated list of columns, e.g. foo[a,b]
    * an open-ended slice, e.g. foo[a:] => all columns following a, inclusive
    * a closed slice, e.g. foo[a:b] => all columns between a,b, inclusive
    * a close-ended slice, e.g. foo[:b] => all columns up to b, inclusive
    * an empty slice, e.g. foo[:] => all columns
    * a list of columns to exclude, e.g. foo[^b] => all columns except b

    :param colspec: (str) the column specification without the enclosing
       brackets, or None to select all columns
    :param all_columns: sliceable sequence of all column names, in order
    :return: the selected columns; a slice of all_columns for slice specs,
       all_columns itself if colspec is None, a new list otherwise
    :raises ValueError: if a column named in a slice is not in all_columns,
       or if a slice spec contains more than one ``:``
    """
    if colspec is None:
        return all_columns
    if ':' in colspec:
        from_col, to_col = colspec.split(':')
        # look up positions on a list copy: not every sequence type
        # (e.g. a pandas Index) provides an .index() method
        names = list(all_columns)
        from_i = names.index(from_col) if from_col else 0
        # +1 because the end column is inclusive
        to_i = names.index(to_col) + 1 if to_col else len(names)
        return all_columns[from_i:to_i]
    if colspec.startswith('^'):
        # split once, not once per column
        excluded = colspec[1:].split(',')
        return [col for col in all_columns if col not in excluded]
    return colspec.split(',')