Merge pull request #24 from DenisaCG/getContents

Add content retrieval logic
QuantStack · Nov 25, 2024 · 760d10e · 760d10e
2 parents 11585dc + 50f3a12
commit 760d10e
Show file tree

Hide file tree

Showing 7 changed files with 327 additions and 39 deletions.
diff --git a/jupyter_drives/handlers.py b/jupyter_drives/handlers.py
@@ -69,17 +69,17 @@ def initialize(self, logger: logging.Logger, manager: JupyterDrivesManager):
         return super().initialize(logger, manager)
 
     @tornado.web.authenticated
-    async def get(self, path: str = "", drive: str = ""):
+    async def get(self, drive: str = "", path: str = ""):
+        """Fetch the contents at ``path`` in ``drive`` from the manager and send them as the response."""
+        # NOTE(review): the (drive, path) order presumably mirrors the order of the
+        # route's capture groups; confirm against the URL pattern.
         result = await self._manager.get_contents(drive, path)
         self.finish(result)
 
     @tornado.web.authenticated
-    async def post(self, path: str = "", drive: str = ""):
+    async def post(self, drive: str = "", path: str = ""):
+        """Ask the manager to create a new file at ``path`` in ``drive`` and send its result as the response."""
+        # NOTE(review): the (drive, path) order presumably mirrors the order of the
+        # route's capture groups; confirm against the URL pattern.
         result = await self._manager.new_file(drive, path)
         self.finish(result)
 
     @tornado.web.authenticated
-    async def patch(self, path: str = "", drive: str = ""):
+    async def patch(self, drive: str = "", path: str = ""):
+        """Rename the file at ``path`` in ``drive`` and send the manager's result as the response.
+
+        The JSON request body, if any, is forwarded to the manager as keyword
+        arguments.
+        """
-        body = self.get_json_body()
+        # get_json_body() returns None when the request carries no body; fall back
+        # to an empty mapping so the ** expansion below cannot fail on None.
+        body = self.get_json_body() or {}
         result = await self._manager.rename_file(drive, path, **body)
         self.finish(result)

diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py
@@ -3,14 +3,17 @@
 import logging
 from typing import Dict, List, Optional, Tuple, Union, Any
 
+import os
 import tornado
 import httpx
 import traitlets
+import base64
 from jupyter_server.utils import url_path_join
 
 import obstore as obs
 from libcloud.storage.types import Provider
 from libcloud.storage.providers import get_driver
+import pyarrow
 
 from .log import get_logger
 from .base import DrivesConfig
@@ -86,7 +89,7 @@ async def list_drives(self):
                         "name": result.name,
                         "region": self._config.region_name if self._config.region_name is not None else "eu-north-1",
                         "creation_date": result.extra["creation_date"],
-                        "mounted": "true" if result.name not in self._content_managers else "false",
+                        "mounted": False if result.name not in self._content_managers else True,
                         "provider": self._config.provider
                     }
                 )
@@ -153,14 +156,86 @@ async def unmount_drive(self, drive_name: str):
 
         return
 
-    async def get_contents(self, drive_name, path, **kwargs):
+    async def get_contents(self, drive_name, path):
         """Get contents of a file or directory.
 
+        The path is first listed as a prefix. If the listing yields nothing and
+        the path is not the root, the path is fetched as a single object.
+
         Args:
             drive_name: name of drive to get the contents of
-            path: path to file or directory
+            path: path to file or directory (empty string for root listing)
+
+        Returns:
+            A dict of the form {"data": ...}. For a directory, "data" is a list
+            of entries with "path", "last_modified" and "size". For a file it is
+            a single dict that also carries "content". It is an empty list for
+            an empty root listing or for an empty object whose name has no
+            registered file extension.
+
+        Raises:
+            tornado.web.HTTPError: 400 (Bad Request) wrapping any error raised
+                while retrieving the contents.
         """
-        print('Get contents function called.')
+        # extensions whose content is returned base64-encoded (image formats and PDF)
+        base64_exts = ('.pdf', '.svg', '.tif', '.tiff', '.jpg', '.jpeg', '.gif', '.png', '.bmp', '.webp')
+        # extensions used to tell an empty file apart from an empty directory
+        # TO DO: find better way to check
+        file_exts = ('.R', '.bmp', '.csv', '.gif', '.html', '.ipynb', '.jl', '.jpeg', '.jpg', '.json', '.jsonl', '.md', '.ndjson', '.pdf', '.png', '.py', '.svg', '.tif', '.tiff', '.tsv', '.txt', '.webp', '.yaml', '.yml')
+
+        if path == '/':
+            path = ''
+        try:
+            store = self._content_managers[drive_name]
+            data = []
+            is_dir = False
+            is_empty = True  # assume an empty directory until some content shows up
+
+            # using Arrow lists as they are recommended for large results
+            # the listing is an async iterable of RecordBatch
+            listing = obs.list(store, path, chunk_size=100, return_arrow=True)
+            async for batch in listing:
+                # if content exists we are dealing with a directory
+                if not is_dir and batch:
+                    is_dir = True
+                    is_empty = False
+
+                for entry in pyarrow.record_batch(batch).to_pylist():
+                    data.append({
+                        "path": entry["path"],
+                        "last_modified": entry["last_modified"].isoformat(),
+                        "size": entry["size"],
+                    })
+
+            # nothing was listed under a non-root path: fetch it as a single object
+            if not is_dir and path != '':
+                # collect the chunks and join once instead of repeated bytes concatenation
+                chunks = []
+                obj = await obs.get_async(store, path)
+                async for buf in obj.stream(min_chunk_size=5 * 1024 * 1024):  # 5MB sized chunks
+                    # if content exists we are dealing with a non-empty file
+                    if is_empty and buf:
+                        is_empty = False
+                    chunks.append(buf)
+                content = b"".join(chunks)
+
+                # retrieve metadata of object
+                metadata = await obs.head_async(store, path)
+
+                # media files are base64-encoded so they are viewable in JupyterLab;
+                # everything else is decoded as UTF-8 text
+                if os.path.splitext(path)[1] in base64_exts:
+                    processed_content = base64.b64encode(content).decode("utf-8")
+                else:
+                    processed_content = content.decode("utf-8")
+
+                data = {
+                    "path": path,
+                    "content": processed_content,
+                    "last_modified": metadata["last_modified"].isoformat(),
+                    "size": metadata["size"]
+                }
+
+            # nothing with content was found: unless the name ends in a registered
+            # extension (an empty file), report it as an empty directory
+            if is_empty and os.path.splitext(os.path.basename(path))[1] not in file_exts:
+                data = []
+
+            response = {
+                "data": data
+            }
+        except Exception as e:
+            raise tornado.web.HTTPError(
+                status_code=httpx.codes.BAD_REQUEST,
+                reason=f"The following error occured when retrieving the contents: {e}",
+            ) from e
+
+        return response
 
     async def new_file(self, drive_name, path, **kwargs):
         """Create a new file or directory at the given path.

diff --git a/pyproject.toml b/pyproject.toml
@@ -23,6 +23,7 @@ classifiers = [
 ]
 dependencies = [
     "obstore>=0.2.0,<0.3",
+    "pyarrow>=18.0.0,<19.0.0",
     "jupyter_server>=2.14.2,<3",
     "s3contents>=0.11.1,<0.12.0",
     "apache-libcloud>=3.8.0, <4",

diff --git a/src/contents.ts b/src/contents.ts
@@ -1,11 +1,8 @@
-// Copyright (c) Jupyter Development Team.
-// Distributed under the terms of the Modified BSD License.
-
+import { JupyterFrontEnd } from '@jupyterlab/application';
 import { Signal, ISignal } from '@lumino/signaling';
 import { Contents, ServerConnection } from '@jupyterlab/services';
-import { PathExt } from '@jupyterlab/coreutils';
-import { IDriveInfo } from './token';
-import { mountDrive } from './requests';
+import { IDriveInfo, IRegisteredFileTypes } from './token';
+import { getContents, mountDrive } from './requests';
 
 let data: Contents.IModel = {
   name: '',
@@ -120,6 +117,20 @@ export class Drive implements Contents.IDrive {
     return this._serverSettings;
   }
 
+  /**
+   * The registered file types currently stored on the drive, keyed by file
+   * extension (the empty-string key holds the directory entry).
+   */
+  get registeredFileTypes(): IRegisteredFileTypes {
+    return this._registeredFileTypes;
+  }
+
+  /**
+   * Replace the stored registered file types with the given mapping.
+   */
+  set registeredFileTypes(fileTypes: IRegisteredFileTypes) {
+    this._registeredFileTypes = fileTypes;
+  }
+
   /**
    * A signal emitted when a file operation takes place.
    */
@@ -185,40 +196,41 @@ export class Drive implements Contents.IDrive {
   ): Promise<Contents.IModel> {
     let relativePath = '';
     if (localPath !== '') {
-      if (localPath.includes(this.name)) {
-        relativePath = localPath.split(this.name + '/')[1];
-      } else {
-        relativePath = localPath;
-      }
-
       // extract current drive name
-      const currentDrive = this.drivesList.filter(x => x.name === localPath)[0];
+      const currentDrive = this._drivesList.filter(
+        x =>
+          x.name ===
+          (localPath.indexOf('/') !== -1
+            ? localPath.substring(0, localPath.indexOf('/'))
+            : localPath)
+      )[0];
+
       // when accessed the first time, mount drive
-      if (!currentDrive.mounted) {
+      if (currentDrive.mounted === false) {
         try {
           await mountDrive(localPath, {
             provider: currentDrive.provider,
             region: currentDrive.region
           });
-          currentDrive.mounted = true;
+          this._drivesList.filter(x => x.name === localPath)[0].mounted = true;
         } catch (e) {
           console.log(e);
         }
       }
 
-      data = {
-        name: PathExt.basename(localPath),
-        path: PathExt.basename(localPath),
-        last_modified: '',
-        created: '',
-        content: [],
-        format: 'json',
-        mimetype: '',
-        size: undefined,
-        writable: true,
-        type: 'directory'
-      };
+      // eliminate drive name from path
+      relativePath =
+        localPath.indexOf('/') !== -1
+          ? localPath.substring(localPath.indexOf('/') + 1)
+          : '';
+
+      data = await getContents(currentDrive.name, {
+        path: relativePath,
+        registeredFileTypes: this._registeredFileTypes
+      });
     } else {
+      // retrieving list of contents from root
+      // in our case: list available drives
       const drivesList: Contents.IModel[] = [];
       for (const drive of this._drivesList) {
         drivesList.push({
@@ -248,7 +260,6 @@ export class Drive implements Contents.IDrive {
         type: 'directory'
       };
     }
-    console.log('GET: ', relativePath);
 
     Contents.validateContentsModel(data);
     return data;
@@ -558,7 +569,11 @@ export class Drive implements Contents.IDrive {
    *   checkpoint is created.
    */
   createCheckpoint(path: string): Promise<Contents.ICheckpointModel> {
-    return Promise.reject('Repository is read only');
+    // Nothing is stored here: resolve with a placeholder model whose id and
+    // timestamp are blank instead of rejecting the request.
+    return Promise.resolve<Contents.ICheckpointModel>({
+      id: '',
+      last_modified: ''
+    });
   }
 
   /**
@@ -599,6 +614,40 @@ export class Drive implements Contents.IDrive {
     return Promise.reject('Read only');
   }
 
+  /**
+   * Collect every file type known to the document registry and record, per
+   * file extension (e.g.: .txt, .pdf, .jpeg), its file type name, its mimetypes
+   * (e.g.: text/plain, application/pdf) and its file format (e.g.: base64, text).
+   * The first file type registered for an extension wins.
+   *
+   * @param app - the front-end application whose document registry is read
+   */
+  getRegisteredFileTypes(app: JupyterFrontEnd) {
+    // called while the drive file browser plugin is being set up
+    for (const fileType of app.docRegistry.fileTypes()) {
+      const extensions = fileType.extensions;
+
+      // a file type without any extension is recorded as the directory entry
+      if (extensions.length === 0) {
+        this._registeredFileTypes[''] = {
+          fileType: 'directory',
+          fileFormat: 'json',
+          fileMimeTypes: ['text/directory']
+        };
+      }
+
+      // record name, mimetypes and format for each extension not seen before
+      for (const extension of extensions) {
+        if (this._registeredFileTypes[extension]) {
+          continue;
+        }
+        this._registeredFileTypes[extension] = {
+          fileType: fileType.name,
+          fileMimeTypes: [...fileType.mimeTypes],
+          fileFormat: fileType.fileFormat ?? ''
+        };
+      }
+    }
+  }
+
   /**
    * Get a REST url for a file given a path.
    */
@@ -619,6 +668,7 @@ export class Drive implements Contents.IDrive {
   private _fileChanged = new Signal<this, Contents.IChangedArgs>(this);
   private _isDisposed: boolean = false;
   private _disposed = new Signal<this, void>(this);
+  private _registeredFileTypes: IRegisteredFileTypes = {};
 }
 
 export namespace Drive {

diff --git a/src/index.ts b/src/index.ts
@@ -172,8 +172,8 @@ const drivesListProvider: JupyterFrontEndPlugin<IDriveInfo[]> = {
           mounted: drive.mounted
         });
       }
-    } catch {
-      console.log('Failed loading available drives list.');
+    } catch (error) {
+      console.log('Failed loading available drives list, with error: ', error);
     }
     return drives;
   }
@@ -224,6 +224,9 @@ const driveFileBrowser: JupyterFrontEndPlugin<void> = {
 
     app.serviceManager.contents.addDrive(drive);
 
+    // get registered file types
+    drive.getRegisteredFileTypes(app);
+
     // Manually restore and load the drive file browser.
     const driveBrowser = fileBrowserFactory.createFileBrowser('drivebrowser', {
       auto: false,