Source code for omegaml.notebook.omegacontentsmgr

import json
import mimetypes
import os
import re
from base64 import encodebytes, decodebytes
from datetime import datetime
from io import BytesIO
from urllib.parse import unquote

import nbformat
from jupyter_server.services.contents.manager import ContentsManager
from tornado import web
from traitlets import default

from omegaml.notebook.checkpoints import NoOpCheckpoints
 15
 16

[docs]
 17class OmegaStoreContentsManager(ContentsManager):
 18    """
 19    Jupyter notebook storage manager for omegaml
 20
 21    Adopted from notebook/services/contents/filemanager.py
 22
 23    This requires a properly configured omegaml instance.
 24    see https://jupyter-server.readthedocs.io/en/latest/developers/contents.html
 25    """
 26
 27    def __init__(self, **kwargs):
 28        # pass omega= for testing purpose
 29        self._omega = kwargs.pop('omega', None)
 30        super(OmegaStoreContentsManager, self).__init__(**kwargs)
 31
 32    def _checkpoints_class_default(self):
 33        return NoOpCheckpoints
 34
 35    @property
 36    def omega(self):
 37        """
 38        return the omega instance used by the contents manager
 39        """
 40        if self._omega is None:
 41            import omegaml as om
 42            self._omega = om
 43        self._omega.jobs._include_dir_placeholder = True
 44        return self._omega
 45
 46    @property
 47    def store(self):
 48        """
 49        return the OmageStore for jobs (notebooks)
 50        """
 51        return self.omega.jobs.store
 52
 53    @property
 54    def _dir_placeholder(self):
 55        return self.omega.jobs._dir_placeholder
 56
 57    def get(self, path, content=True, type=None, format=None):
 58        """
 59        get an entry in the store
 60
 61        this is called by the contents engine to get the contents of the jobs
 62        store.
 63        """
 64        path = unquote(path).strip('/')
 65        if type == 'notebook' or (type is None and path.endswith('.ipynb')):
 66            model = self._notebook_model(path, content=content)
 67        elif type == 'file':
 68            model = self._file_model(path, content=content)
 69        elif type in (None, 'directory'):
 70            # jupyterlab passes None to get directory
 71            # we never return content with a directory listing to save time
 72            # the frontend will request the specific contents
 73            model = self._dir_model(path, content=False)
 74        else:
 75            raise web.HTTPError(404, u'Type {} at {} is not supported'.format(type, path))
 76        return model
 77
 78    def save(self, model, path):
 79        """
 80        save an entry in the store
 81
 82        this is called by the contents engine to store a notebook
 83        """
 84        om = self.omega
 85        path = unquote(path).strip('/')
 86        type = model.get('type')
 87        name = model.get('name')
 88        self.run_pre_save_hooks(model=model, path=path)
 89        if type is None:
 90            raise web.HTTPError(400, u'No file type provided')
 91        try:
 92            if type == 'notebook' or (type == 'file' and path.endswith('.ipynb')):
 93                content = model.get('content')
 94                # parse input
 95                if model.get('format') == 'text' and isinstance(content, bytes):
 96                    content = content.decode('utf8')
 97                elif model.get('format') == 'base64':
 98                    if isinstance(content, str):
 99                        content = content.encode('ascii')
100                    content = decodebytes(content).decode('utf8')
101                while isinstance(content, str):
102                    content = json.loads(content)
103                    model['content'] = content
104                    model['format'] = None
105                if content is None or not isinstance(content, dict):
106                    raise web.HTTPError(400, u'No file content provided or wrong format')
107                # create notebook
108                nb = nbformat.from_dict(content)
109                self.check_and_sign(nb, path)
110                self.omega.jobs.put(nb, path)
111                self.validate_notebook_model(model)
112                validation_message = model.get('message', None)
113                model = self.get(path, content=False, type=type)
114                if validation_message:
115                    model['message'] = validation_message
116            elif type == 'directory':
117                ph_name = '{path}/{self._dir_placeholder}'.format(**locals()).strip('/')
118                self.omega.jobs.create("#placeholder", ph_name)
119                model = self.get(path, content=False, type=type)
120                model['content'] = None
121                model['format'] = None
122                validation_message = None
123            elif type == 'file':
124                content = model.get('content')
125                fmt = model.get('format')
126                self._save_file(path, content, fmt)
127                model = self.get(path, content=False, type=type)
128            else:
129                raise web.HTTPError(
130                    400, "Unhandled contents type: %s" % model['type'])
131        except web.HTTPError:
132            raise
133        except Exception as e:
134            self.log.error(
135                u'Error while saving file: %s %s', path, e, exc_info=True)
136            raise web.HTTPError(
137                500, u'Unexpected error while saving file: %s %s' % (path, e))
138        return model
139
140    def delete_file(self, path):
141        """
142        delete an entry
143
144        this is called by the contents engine to delete an entry
145        """
146        path = unquote(path).strip('/')
147        try:
148            self.omega.jobs.drop(path)
149        except Exception as e:
150            self.omega.jobs.drop(path + '/' + self._dir_placeholder)
151
152    def rename_file(self, old_path, new_path):
153        """
154        rename a file
155
156        this is called by the contents engine to rename an entry
157        """
158        old_path = unquote(old_path).strip('/')
159        new_path = unquote(new_path).strip('/')
160        # check file or directory
161        if self.file_exists(new_path):
162            raise web.HTTPError(409, u'Notebook already exists: %s' % new_path)
163        elif self.dir_exists(new_path):
164            raise web.HTTPError(409, u'Directory already exists: %s' % new_path)
165        # do the renaming
166        if self.dir_exists(old_path):
167            old_dirname = old_path + '/' + self._dir_placeholder
168            new_dirname = new_path + '/' + self._dir_placeholder
169            meta = self.omega.jobs.metadata(old_dirname)
170            meta.name = new_dirname
171        elif self.file_exists(old_path):
172            meta = self.omega.jobs.metadata(old_path)
173            meta.name = new_path
174        # rename on metadata. Note the gridfile instance stays the same
175        meta.save()
176
177    def exists(self, path):
178        """
179        Does a file or dir exist at the given collection in gridFS?
180        We do not have dir so dir_exists returns true.
181
182        :param path: (str) The relative path to the file's directory
183          (with '/' as separator)
184        :returns exists: (boo) The relative path to the file's directory (with '/' as separator)
185        """
186        path = unquote(path).strip('/')
187        return self.file_exists(path) or self.dir_exists(path)
188
189    def dir_exists(self, path=''):
190        """check if directory exists
191
192        Args:
193            path: name of directory
194
195        Returns:
196            True if directory exists
197        """
198        path = unquote(path).strip('/')
199        if path == '':
200            return True
201        pattern = r'^{path}.*/({placeholder}|.+)'.format(path=path, placeholder=self._dir_placeholder)
202        return len(self.omega.jobs.list(regexp=pattern)) > 0
203
204    def file_exists(self, path=""):
205        """check if file exists
206
207        Args:
208            path: name of file
209
210        Returns:
211            True if file exists
212        """
213        path = unquote(path).strip('/')
214        if not path:
215            return False
216        # always check for an actual file, not some sub path
217        pattern = r'^{}$'.format(path)
218        does_exist = len(self.omega.jobs.list(regexp=pattern)) > 0
219        does_exist |= len(self.omega.datasets.list(regexp=pattern)) > 0
220        return does_exist
221
222    def is_hidden(self, path):
223        """check if path or file is hidden
224
225        Args:
226            path: name of file or path
227
228        Returns:
229            False, currently always returns false
230        """
231        return False
232
233    def _read_notebook(self, path, as_version=None):
234        path = unquote(path).strip('/')
235        return self.omega.jobs.get(path)
236
237    def _notebook_model(self, path, content=True, meta=None):
238        """
239        Build a notebook model
240        if content is requested, the notebook content will be populated
241        as a JSON structure (not double-serialized)
242        """
243        path = unquote(path).strip('/')
244        model = self._base_model(path)
245        model['type'] = 'notebook'
246        # always add accurate created and modified
247        meta = meta or self.omega.jobs.metadata(path)
248        if meta is not None:
249            model['created'] = meta.created
250            model['last_modified'] = meta.modified
251        if content:
252            nb = self._read_notebook(path, as_version=4)
253            if nb is None:
254                raise web.HTTPError(400, "Cannot read non-file {}".format(path))
255            self.mark_trusted_cells(nb, path)
256            model['content'] = nb
257            model['format'] = 'json'
258            self.validate_notebook_model(model)
259        return model
260
261    def _base_model(self, path, kind=None):
262        """Build the common base of a contents model"""
263        # http://jupyter-notebook.readthedocs.io/en/stable/extending/contents.html
264        path = unquote(path).strip('/')
265        last_modified = datetime.utcnow()
266        created = last_modified
267        # Create the base model.
268        model = {}
269        model['name'] = os.path.basename(path)
270        model['path'] = path
271        model['last_modified'] = last_modified
272        model['created'] = created
273        model['content'] = None
274        model['format'] = None
275        model['mimetype'] = None
276        model['writable'] = True
277        if kind:
278            model['type'] = kind
279            model['content'] = [] if kind == 'directory' else None
280        return model
281
282    def _dir_model(self, path, content=True):
283        """
284        Build a model to return all of the files in gridfs
285        if content is requested, will include a listing of the directory
286        """
287        # this looks like a seemingly simple task, it's carefully crafted
288        path = unquote(path).strip('/')
289        model = self._base_model(path, kind='directory')
290        model['format'] = 'json'
291        contents = model['content']
292        # get existing entries from a pattern that matches either
293        #    top-level files: ([\w -]+\.[\w -]*)
294        #    directories (files in): ([\w ]+/([\w ]+\.[\w]*))
295        # does not work:
296        #    pattern = r'([\w ]+/_placeholder\.[\w]*)|([\w ]+\.[\w]*)$'
297        #    it is too restrictive as entries can be generated without a placeholder
298        # so we get all, which may include sub/sub/file
299        # and we need to include them because we need to find sub/sub directories
300        # note \w is any word character (letter, digit, underscore)
301        #      \s is any white space
302        #      \d is any digit
303        #      :_  match literally
304        #      [^\/]  matches any character except /
305        # pattern = r'([\w\s\-.\d:()+]+\/)?([\w\s\-.\d:()+]+\.[\w]*)$'
306        pattern = r'([^\/]+\/)?([^\/]+\.[^\/]*)$'
307        # if we're looking in an existing directory, prepend that
308        if path:
309            pattern = r'{path}/{pattern}'.format(path=path, pattern=pattern)
310        pattern = r'^{}'.format(pattern)
311        entries = self.omega.jobs.list(regexp=pattern, raw=True, hidden=True, include_temp=True)
312        if path and not entries:
313            raise web.HTTPError(400, "Directory not found {}".format(path))
314        # by default assume the current path is listed already
315        directories = [path]
316        for meta in entries:
317            # get path of entry, e.g. sub/foo.ipynb => sub
318            entry_path = os.path.dirname(meta.name)
319            # if not part of listed directories yet, include
320            if entry_path not in directories:
321                entry = self._base_model(entry_path, kind='directory')
322                contents.append(entry)
323                directories.append(entry_path)
324            # ignore placeholder files
325            if meta.name.endswith(self._dir_placeholder):
326                continue
327            # only include files that are in the path we're listing
328            if entry_path != path:
329                continue
330            # include the actual file
331            try:
332                entry = self._notebook_model(meta.name, content=content, meta=meta)
333            except Exception as e:
334                msg = ('_dir_model error, cannot get {}, '
335                       'removing from list, exception {}'.format(meta.name, str(e)))
336                self.log.warning(msg)
337            else:
338                contents.append(entry)
339        return model
340
341    def _file_model(self, path, content=True, format=None):
342        model = self._base_model(path)
343        model['type'] = 'file'
344
345        model['mimetype'] = mimetypes.guess_type(path)[0]
346
347        if content:
348            content, format = self._read_file(path, format)
349            if model['mimetype'] is None:
350                default_mime = {
351                    'text': 'text/plain',
352                    'base64': 'application/octet-stream'
353                }[format]
354                model['mimetype'] = default_mime
355
356            model.update(
357                content=content,
358                format=format,
359            )
360
361        return model
362
363    def _read_file(self, os_path, format):
364        """Read a non-notebook file.
365
366        os_path: The path to be read.
367        format:
368          If 'text', the contents will be decoded as UTF-8.
369          If 'base64', the raw bytes contents will be encoded as base64.
370          If not specified, try to decode as UTF-8, and fall back to base64
371        """
372        if os_path.endswith('.ipynb'):
373            meta = self.omega.jobs.metadata(os_path)
374        else:
375            meta = self.omega.datasets.metadata(os_path)
376        if meta is None or meta.gridfile is None:
377            raise web.HTTPError(400, "Cannot read non-file %s" % os_path)
378
379        if meta.gridfile:
380            bcontent = meta.gridfile.read()
381            meta.gridfile.close()
382
383        if format is None or format == 'text':
384            # Try to interpret as unicode if format is unknown or if unicode
385            # was explicitly requested.
386            try:
387                return bcontent.decode('utf8'), 'text'
388            except UnicodeError:
389                if format == 'text':
390                    raise web.HTTPError(
391                        400,
392                        "%s is not UTF-8 encoded" % os_path,
393                        reason='bad format',
394                    )
395        return encodebytes(bcontent).decode('ascii'), 'base64'
396
397    def _save_file(self, os_path, content, format):
398        """Save content of a generic file."""
399        if format not in {'text', 'base64'}:
400            raise web.HTTPError(
401                400,
402                "Must specify format of file contents as 'text' or 'base64'",
403            )
404        try:
405            if format == 'text':
406                bcontent = content.encode('utf8')
407            else:
408                b64_bytes = content.encode('ascii')
409                bcontent = decodebytes(b64_bytes)
410        except Exception as e:
411            raise web.HTTPError(
412                400, u'Encoding error saving %s: %s' % (os_path, e)
413            )
414
415        self.omega.datasets.put(BytesIO(bcontent), os_path)
416
417    @default('files_handler_params')
418    def _files_handler_params_default(self):
419        # avoid exception
420        #   TypeError: StaticFileHandler.initialize() missing 1 required positional argument: 'path'
421        #   issue: https://github.com/jupyter-server/jupyter_server/issues/1313
422        # What this does
423        # - ensure that ContentsManager.files_handler_class=FilesHandler gets has a path attribute on initialize()
424        # - this only happens when dealing with local files, e.g. when downloading a notebook
425        # - source of the issue is that ContentsManager.files_handler_params is not set
426        return {'path': self.root_dir}