1import pandas as pd
2import re
3
4
[docs]
class ProjectedMixin(object):
    """
    An OmegaStore mixin to process column specifications in dataset names

    A dataset name of the form ``name[colspec]`` is split into the base
    name and the column specification. The base name is passed on to the
    next ``metadata()`` / ``get()`` in the MRO, the column specification
    is resolved into a list of columns by ``columnset()``.
    """
    # greedy match: the base name extends up to the last "[" that still has
    # a "]" after it; anything following the closing bracket is ignored
    colspec_pattern = re.compile(r"(?P<name>.*)\[(?P<colspec>.*)\].*$")

    def metadata(self, name, *args, **kwargs):
        """
        Return the metadata of a dataset, ignoring any column specification

        :param name: the name of the dataset, optionally of the form
            ``name[colspec]``; names that are not strings are passed on
            unchanged
        :return: the result of the next ``metadata()`` in the MRO
        """
        if isinstance(name, str):
            # the column spec is not part of the stored name, drop it
            name, _ = self._extract_column_specs(name)
        return super(ProjectedMixin, self).metadata(name, *args, **kwargs)

    def get(self, name, *args, **kwargs):
        """
        Return a projected dataset given a name of form name[colspec]

        colspec can be any of

        * a comma separated list of columns, e.g. ``foo[a,b]``
        * an open-ended slice, e.g. ``foo[a:]`` => all columns following a, inclusive
        * a closed slice, e.g. ``foo[a:b]`` => all columns between a,b, inclusive
        * a close-ended slice, e.g. ``foo[:b]`` => all columns up to b, inclusive
        * an empty slice, e.g. ``foo[:]`` => all columns
        * a list of columns to exclude, e.g. ``foo[^b]`` => all columns except b

        :param name: (str) the name of the dataset, optionally including a
            column specification
        :return: the dataset with projected columns
        """
        # split base name from specs
        name, colspec = self._extract_column_specs(name)
        if colspec is None:
            # no column spec in name, avoid projection
            return super(ProjectedMixin, self).get(name, *args, **kwargs)
        # column specs in name, get projected data
        return self._get_data_from_projection(name, colspec, *args, **kwargs)

    def _extract_column_specs(self, name):
        """
        Split ``name[colspec]`` into the base name and the column spec

        :param name: (str) the dataset name
        :return: tuple ``(name, colspec)``; colspec is None and name is
            returned unchanged if name has no ``[...]`` part
        """
        match = self.colspec_pattern.match(name)
        if match is None:
            return name, None
        return match.group('name'), match.group('colspec')

    def _get_data_from_projection(self, name, colspec, *args, **kwargs):
        """
        Get the dataset restricted to the columns selected by colspec

        If the dataset's metadata lists its columns, the selection is
        resolved up front and passed as the ``columns`` keyword (replacing
        any ``columns`` given by the caller) to the next ``get()`` in the
        MRO. Otherwise the data is retrieved first and subset afterwards,
        which is only done for pandas DataFrames; any other data is
        returned as is.

        :param name: (str) the base name of the dataset, without colspec
        :param colspec: (str) the column specification
        :return: the projected data
        """
        base = super(ProjectedMixin, self)
        # see if we can get columns from metadata
        # if so we can specify the columns before getting the data
        meta = self.metadata(name)
        if 'columns' in meta.kind_meta:
            colmap = meta.kind_meta['columns']
            if isinstance(colmap, dict):
                # all keys, in order, are the available columns
                all_columns = list(colmap)
            else:
                # colmap is a list of tuples (colname, storedname), the
                # second element of each tuple is used
                all_columns = [entry[1] for entry in colmap]
            kwargs['columns'] = columnset(colspec, all_columns)
            return base.get(name, *args, **kwargs)
        # we don't have columns in metadata, get the data first
        # only subset on dataframes
        data = base.get(name, *args, **kwargs)
        if isinstance(data, pd.DataFrame):
            # pass a plain list, a pandas Index has no .index() method
            columns = columnset(colspec, list(data.columns))
            data = data[columns]
        return data
74
75
def columnset(colspec, all_columns):
    """
    find the specified columns in data[colspec] in list of all columns

    colspec can be any of

    * a comma separated list of columns, e.g. foo[a,b]
    * an open-ended slice, e.g. foo[a:] => all columns following a, inclusive
    * a closed slice, e.g. foo[a:b] => all columns between a,b, inclusive
    * a close-ended slice, e.g. foo[:b] => all columns up to b, inclusive
    * an empty slice, e.g. foo[:] => all columns
    * a list of columns to exclude, e.g. foo[^b] => all columns except b

    :param colspec: (str) the column specification without the enclosing
        brackets, or None to select all columns
    :param all_columns: ordered iterable of all available columns
    :return: list of the selected columns; if colspec is None,
        all_columns is returned unchanged
    :raises ValueError: if a slice refers to a column that is not in
        all_columns, or if colspec contains more than one ``:``
    """
    if colspec is None:
        return all_columns
    if ':' in colspec:
        # work on a plain list so that any ordered iterable is accepted,
        # including ones without an .index() method (e.g. a pandas Index)
        all_columns = list(all_columns)
        from_col, to_col = colspec.split(':')
        start = all_columns.index(from_col) if from_col else 0
        # +1 because the end column of a slice is inclusive
        stop = all_columns.index(to_col) + 1 if to_col else len(all_columns)
        return all_columns[start:stop]
    if colspec.startswith('^'):
        # split the exclusions once, not once per column
        excluded = colspec[1:].split(',')
        return [col for col in all_columns if col not in excluded]
    # explicit list of columns, returned as given without checking
    # against all_columns
    return colspec.split(',')