From 20722796256b16084cd44a81ccf8e0484d6afdb3 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Tue, 19 Nov 2024 18:05:30 +0100 Subject: [PATCH 01/19] add getContents endpoint --- jupyter_drives/handlers.py | 6 ++-- jupyter_drives/manager.py | 57 ++++++++++++++++++++++++++++++++++++-- src/contents.ts | 5 +++- src/requests.ts | 11 ++++++++ 4 files changed, 72 insertions(+), 7 deletions(-) diff --git a/jupyter_drives/handlers.py b/jupyter_drives/handlers.py index 5fb7d2b..b01847f 100644 --- a/jupyter_drives/handlers.py +++ b/jupyter_drives/handlers.py @@ -69,17 +69,17 @@ def initialize(self, logger: logging.Logger, manager: JupyterDrivesManager): return super().initialize(logger, manager) @tornado.web.authenticated - async def get(self, path: str = "", drive: str = ""): + async def get(self, drive: str = "", path: str = ""): result = await self._manager.get_contents(drive, path) self.finish(result) @tornado.web.authenticated - async def post(self, path: str = "", drive: str = ""): + async def post(self, drive: str = "", path: str = ""): result = await self._manager.new_file(drive, path) self.finish(result) @tornado.web.authenticated - async def patch(self, path: str = "", drive: str = ""): + async def patch(self, drive: str = "", path: str = ""): body = self.get_json_body() result = await self._manager.rename_file(drive, path, **body) self.finish(result) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 3e39d94..211e654 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -3,6 +3,7 @@ import logging from typing import Dict, List, Optional, Tuple, Union, Any +import os import tornado import httpx import traitlets @@ -11,6 +12,7 @@ import obstore as obs from libcloud.storage.types import Provider from libcloud.storage.providers import get_driver +import pyarrow from .log import get_logger from .base import DrivesConfig @@ -153,14 +155,63 @@ async def unmount_drive(self, drive_name: str): return - async def get_contents(self, drive_name, path, **kwargs): + async def get_contents(self, drive_name, path): """Get contents of a file or directory. Args: drive_name: name of drive to get the contents of - path: path to file or directory + path: path to file or directory (empty string for root listing) """ - print('Get contents function called.') + print('!!!!!!!!!!!!!!!!!!!', drive_name, 'path: ', path) + if path == '/': + path = '' + drive_name = 'jupyter-drives-test-bucket-1' + try : + currentObject = os.path.basename(path) if os.path.basename(path) is not None else '' + print('currentObject: ', currentObject) + # check if we are listing contents of a directory + if currentObject.find('.') == -1: + print('in if') + print('store: ', self._content_managers) + data = [] + # using Arrow lists as they are recommended for large results + # sream will be an async iterable of RecordBatch + stream = obs.list(self._content_managers[drive_name], path, chunk_size=100, return_arrow=True) + async for batch in stream: + contents_list = pyarrow.record_batch(batch).to_pylist() + for object in contents_list: + data.append({ + "path": object["path"], + "last_modified": object["last_modified"].isoformat(), + "size": object["size"], + }) + else: + content = b"" + # retrieve contents of object + obj = await obs.get_async(self._content_managers[drive_name], path) + stream = obj.stream(min_chunk_size=5 * 1024 * 1024) # 5MB sized chunks + async for buf in stream: + content += buf + + # retrieve metadata of object + metadata = await obs.head_async(self._content_managers[drive_name], path) + data = { + "path": path, + "content": content, + "last_modified": metadata["last_modified"].isoformat(), + "size": metadata["size"] + } + print(data) + response = { + "data": data + } + except Exception as e: + raise tornado.web.HTTPError( + status_code= httpx.codes.BAD_REQUEST, + reason=f"The following error occured when retrieving the contents: {e}", + ) + + return response async def new_file(self, drive_name, path, **kwargs): """Create a new file or directory at the given path. diff --git a/src/contents.ts b/src/contents.ts index 470d824..562d871 100644 --- a/src/contents.ts +++ b/src/contents.ts @@ -5,7 +5,7 @@ import { Signal, ISignal } from '@lumino/signaling'; import { Contents, ServerConnection } from '@jupyterlab/services'; import { PathExt } from '@jupyterlab/coreutils'; import { IDriveInfo } from './token'; -import { mountDrive } from './requests'; +import { getContents, mountDrive } from './requests'; let data: Contents.IModel = { name: '', @@ -206,6 +206,9 @@ export class Drive implements Contents.IDrive { } } + const resp = await getContents(currentDrive.name, { path: '' }); + console.log('resp: ', resp); + data = { name: PathExt.basename(localPath), path: PathExt.basename(localPath), diff --git a/src/requests.ts b/src/requests.ts index d6ac6e3..59a11b0 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -1,4 +1,5 @@ import { ReadonlyJSONObject } from '@lumino/coreutils'; + import { requestAPI } from './handler'; /** @@ -24,3 +25,13 @@ export async function mountDrive( }; return await requestAPI('drives', 'POST', body); } + +export async function getContents( + driveName: string, + options: { path: string } +) { + return await requestAPI( + 'drives/' + driveName + '/' + options.path, + 'GET' + ); +} From 7fb06f7c8b500aede9cf1526ed7c1009beaafd29 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Tue, 19 Nov 2024 18:06:32 +0100 Subject: [PATCH 02/19] fix mounted drive check --- jupyter_drives/manager.py | 2 +- src/contents.ts | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 211e654..ab9a907 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -88,7 +88,7 @@ async def list_drives(self): "name": result.name, "region": self._config.region_name if self._config.region_name is not None else "eu-north-1", "creation_date": result.extra["creation_date"], - "mounted": "true" if result.name not in self._content_managers else "false", + "mounted": False if result.name not in self._content_managers else True, "provider": self._config.provider } ) diff --git a/src/contents.ts b/src/contents.ts index 562d871..0968121 100644 --- a/src/contents.ts +++ b/src/contents.ts @@ -192,15 +192,17 @@ export class Drive implements Contents.IDrive { } // extract current drive name - const currentDrive = this.drivesList.filter(x => x.name === localPath)[0]; + const currentDrive = this._drivesList.filter( + x => x.name === localPath + )[0]; // when accessed the first time, mount drive - if (!currentDrive.mounted) { + if (currentDrive.mounted === false) { try { await mountDrive(localPath, { provider: currentDrive.provider, region: currentDrive.region }); - currentDrive.mounted = true; + this._drivesList.filter(x => x.name === localPath)[0].mounted = true; } catch (e) { console.log(e); } From 084f063521bbf82995077a3cab74e983b5fbd826 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Tue, 19 Nov 2024 18:25:28 +0100 Subject: [PATCH 03/19] error handling for listing drives --- src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/index.ts b/src/index.ts index 14dcc3c..a5efed4 100644 --- a/src/index.ts +++ b/src/index.ts @@ -172,8 +172,8 @@ const drivesListProvider: JupyterFrontEndPlugin = { mounted: drive.mounted }); } - } catch { - console.log('Failed loading available drives list.'); + } catch (error) { + console.log('Failed loading available drives list, with error: ', error); } return drives; } From 6a078b470bf779f242d58b6ad00c3bb57ffa48b0 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Tue, 19 Nov 2024 18:35:36 +0100 Subject: [PATCH 04/19] add pyarrow package --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index a487a72..06c08e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ classifiers = [ ] dependencies = [ "obstore>=0.2.0,<0.3", + "pyarrow>=18.0.0,<19.0.0", "jupyter_server>=2.14.2,<3", "s3contents>=0.11.1,<0.12.0", "apache-libcloud>=3.8.0, <4", From 09d0f600106cfcf54c9b3236fc0dc5797fe6eb5c Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Tue, 19 Nov 2024 23:08:19 +0100 Subject: [PATCH 05/19] iterate on contents listing --- jupyter_drives/manager.py | 6 ----- src/contents.ts | 17 +----------- src/requests.ts | 57 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 23 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index ab9a907..fe8325b 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -162,17 +162,12 @@ async def get_contents(self, drive_name, path): drive_name: name of drive to get the contents of path: path to file or directory (empty string for root listing) """ - print('!!!!!!!!!!!!!!!!!!!', drive_name, 'path: ', path) if path == '/': path = '' - drive_name = 'jupyter-drives-test-bucket-1' try : currentObject = os.path.basename(path) if os.path.basename(path) is not None else '' - print('currentObject: ', currentObject) # check if we are listing contents of a directory if currentObject.find('.') == -1: - print('in if') - print('store: ', self._content_managers) data = [] # using Arrow lists as they are recommended for large results # sream will be an async iterable of RecordBatch @@ -201,7 +196,6 @@ async def get_contents(self, drive_name, path): "last_modified": metadata["last_modified"].isoformat(), "size": metadata["size"] } - print(data) response = { "data": data } diff --git a/src/contents.ts b/src/contents.ts index 0968121..c952851 100644 --- a/src/contents.ts +++ b/src/contents.ts @@ -3,7 +3,6 @@ import { Signal, ISignal } from '@lumino/signaling'; import { Contents, ServerConnection } from '@jupyterlab/services'; -import { PathExt } from '@jupyterlab/coreutils'; import { IDriveInfo } from './token'; import { getContents, mountDrive } from './requests'; @@ -208,21 +207,7 @@ export class Drive implements Contents.IDrive { } } - const resp = await getContents(currentDrive.name, { path: '' }); - console.log('resp: ', resp); - - data = { - name: PathExt.basename(localPath), - path: PathExt.basename(localPath), - last_modified: '', - created: '', - content: [], - format: 'json', - mimetype: '', - size: undefined, - writable: true, - type: 'directory' - }; + data = await getContents(currentDrive.name, { path: '' }); } else { const drivesList: Contents.IModel[] = []; for (const drive of this._drivesList) { diff --git a/src/requests.ts b/src/requests.ts index 59a11b0..77a2267 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -1,7 +1,26 @@ import { ReadonlyJSONObject } from '@lumino/coreutils'; +import { Contents } from '@jupyterlab/services'; +import { PathExt } from '@jupyterlab/coreutils'; import { requestAPI } from './handler'; +let data: Contents.IModel = { + name: '', + path: '', + last_modified: '', + created: '', + content: null, + format: null, + mimetype: '', + size: 0, + writable: true, + type: '' +}; + +interface IContentsList { + [fileName: string]: Contents.IModel; +} + /** * Fetch the list of available drives. * @returns list of drives @@ -30,8 +49,44 @@ export async function getContents( driveName: string, options: { path: string } ) { - return await requestAPI( + const response = await requestAPI( 'drives/' + driveName + '/' + options.path, 'GET' ); + + if (response.data) { + const fileList: IContentsList = {}; + + response.data.forEach((row: any) => { + const fileName = PathExt.basename(row.path); + + fileList[fileName] = fileList[fileName] ?? { + name: fileName, + path: driveName + '/' + row.path, + last_modified: row.last_modified, + created: '', + content: !fileName.split('.')[1] ? [] : null, + format: null, //fileFormat as Contents.FileFormat, + mimetype: 'null', //fileMimeType, + size: row.size, + writable: true, + type: 'directory' //fileType + }; + }); + + data = { + name: options.path ? PathExt.basename(options.path) : '', + path: options.path ? options.path + '/' : '', + last_modified: '', + created: '', + content: Object.values(fileList), + format: 'json', + mimetype: '', + size: undefined, + writable: true, + type: 'directory' + }; + } + + return data; } From ae54abf36d68c2c2a70c714bbb1fd880bdd04321 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 20 Nov 2024 12:01:03 +0100 Subject: [PATCH 06/19] add logic for registered file types retrieval and usage --- src/contents.ts | 73 +++++++++++++++++++++++++++++++++++++++++-------- src/index.ts | 3 ++ src/requests.ts | 14 +++++++--- src/token.ts | 38 +++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 15 deletions(-) diff --git a/src/contents.ts b/src/contents.ts index c952851..872dd42 100644 --- a/src/contents.ts +++ b/src/contents.ts @@ -1,9 +1,7 @@ -// Copyright (c) Jupyter Development Team. -// Distributed under the terms of the Modified BSD License. - +import { JupyterFrontEnd } from '@jupyterlab/application'; import { Signal, ISignal } from '@lumino/signaling'; import { Contents, ServerConnection } from '@jupyterlab/services'; -import { IDriveInfo } from './token'; +import { IDriveInfo, IRegisteredFileTypes } from './token'; import { getContents, mountDrive } from './requests'; let data: Contents.IModel = { @@ -119,6 +117,20 @@ export class Drive implements Contents.IDrive { return this._serverSettings; } + /** + * The registered file types + */ + get registeredFileTypes(): IRegisteredFileTypes { + return this._registeredFileTypes; + } + + /** + * The registered file types + */ + set registeredFileTypes(fileTypes: IRegisteredFileTypes) { + this._registeredFileTypes = fileTypes; + } + /** * A signal emitted when a file operation takes place. */ @@ -182,13 +194,14 @@ export class Drive implements Contents.IDrive { localPath: string, options?: Contents.IFetchOptions ): Promise { - let relativePath = ''; + const relativePath = ''; + console.log('GET localpath: ', localPath); if (localPath !== '') { - if (localPath.includes(this.name)) { - relativePath = localPath.split(this.name + '/')[1]; - } else { - relativePath = localPath; - } + // if (localPath.includes(this.name)) { + // relativePath = localPath.split(this.name + '/')[1]; + // } else { + // relativePath = localPath; + // } // extract current drive name const currentDrive = this._drivesList.filter( @@ -207,7 +220,10 @@ export class Drive implements Contents.IDrive { } } - data = await getContents(currentDrive.name, { path: '' }); + data = await getContents(currentDrive.name, { + path: '', + registeredFileTypes: this._registeredFileTypes + }); } else { const drivesList: Contents.IModel[] = []; for (const drive of this._drivesList) { @@ -589,6 +605,40 @@ export class Drive implements Contents.IDrive { return Promise.reject('Read only'); } + /** + * Get all registered file types and store them accordingly with their file + * extension (e.g.: .txt, .pdf, .jpeg), file mimetype (e.g.: text/plain, application/pdf) + * and file format (e.g.: base64, text). + * + * @param app + */ + getRegisteredFileTypes(app: JupyterFrontEnd) { + // get called when instating the toolbar + const registeredFileTypes = app.docRegistry.fileTypes(); + + for (const fileType of registeredFileTypes) { + // check if we are dealing with a directory + if (fileType.extensions.length === 0) { + this._registeredFileTypes[''] = { + fileType: 'directory', + fileFormat: 'json', + fileMimeTypes: ['text/directory'] + }; + } + + // store the mimetype and fileformat for each file extension + fileType.extensions.forEach(extension => { + if (!this._registeredFileTypes[extension]) { + this._registeredFileTypes[extension] = { + fileType: fileType.name, + fileMimeTypes: [...fileType.mimeTypes], + fileFormat: fileType.fileFormat ? fileType.fileFormat : '' + }; + } + }); + } + } + /** * Get a REST url for a file given a path. */ @@ -609,6 +659,7 @@ export class Drive implements Contents.IDrive { private _fileChanged = new Signal(this); private _isDisposed: boolean = false; private _disposed = new Signal(this); + private _registeredFileTypes: IRegisteredFileTypes = {}; } export namespace Drive { diff --git a/src/index.ts b/src/index.ts index a5efed4..684053e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -224,6 +224,9 @@ const driveFileBrowser: JupyterFrontEndPlugin = { app.serviceManager.contents.addDrive(drive); + // get registered file types + drive.getRegisteredFileTypes(app); + // Manually restore and load the drive file browser. const driveBrowser = fileBrowserFactory.createFileBrowser('drivebrowser', { auto: false, diff --git a/src/requests.ts b/src/requests.ts index 77a2267..80a84c0 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -3,6 +3,7 @@ import { Contents } from '@jupyterlab/services'; import { PathExt } from '@jupyterlab/coreutils'; import { requestAPI } from './handler'; +import { getFileType, IRegisteredFileTypes } from './token'; let data: Contents.IModel = { name: '', @@ -47,7 +48,7 @@ export async function mountDrive( export async function getContents( driveName: string, - options: { path: string } + options: { path: string; registeredFileTypes: IRegisteredFileTypes } ) { const response = await requestAPI( 'drives/' + driveName + '/' + options.path, @@ -60,17 +61,22 @@ export async function getContents( response.data.forEach((row: any) => { const fileName = PathExt.basename(row.path); + const [fileType, fileMimeType, fileFormat] = getFileType( + PathExt.extname(PathExt.basename(fileName)), + options.registeredFileTypes + ); + fileList[fileName] = fileList[fileName] ?? { name: fileName, path: driveName + '/' + row.path, last_modified: row.last_modified, created: '', content: !fileName.split('.')[1] ? [] : null, - format: null, //fileFormat as Contents.FileFormat, - mimetype: 'null', //fileMimeType, + format: fileFormat as Contents.FileFormat, + mimetype: fileMimeType, size: row.size, writable: true, - type: 'directory' //fileType + type: fileType }; }); diff --git a/src/token.ts b/src/token.ts index e54c5d6..860e672 100644 --- a/src/token.ts +++ b/src/token.ts @@ -32,3 +32,41 @@ export interface IDriveInfo { */ mounted: boolean; } + +/** + * An interface that stores the registered file type, mimetype and format for each file extension. + */ +export interface IRegisteredFileTypes { + [fileExtension: string]: { + fileType: string; + fileMimeTypes: string[]; + fileFormat: string; + }; +} + +/** + * Helping function to define file type, mimetype and format based on file extension. + * @param extension file extension (e.g.: txt, ipynb, csv) + * @returns + */ +export function getFileType( + extension: string, + registeredFileTypes: IRegisteredFileTypes +) { + let fileType: string = 'text'; + let fileMimetype: string = 'text/plain'; + let fileFormat: string = 'text'; + + if (registeredFileTypes[extension]) { + fileType = registeredFileTypes[extension].fileType; + fileMimetype = registeredFileTypes[extension].fileMimeTypes[0]; + fileFormat = registeredFileTypes[extension].fileFormat; + } + + // the file format for notebooks appears as json, but should be text + if (extension === '.ipynb') { + fileFormat = 'text'; + } + + return [fileType, fileMimetype, fileFormat]; +} From 28c2eddd858f2d66e56801d68c9ff8c7602411b5 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 20 Nov 2024 12:03:30 +0100 Subject: [PATCH 07/19] update docstrings --- src/requests.ts | 22 ++++++++++++++++------ src/token.ts | 8 ++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/requests.ts b/src/requests.ts index 80a84c0..d9fa5b0 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -3,8 +3,11 @@ import { Contents } from '@jupyterlab/services'; import { PathExt } from '@jupyterlab/coreutils'; import { requestAPI } from './handler'; -import { getFileType, IRegisteredFileTypes } from './token'; +import { getFileType, IRegisteredFileTypes, IContentsList } from './token'; +/** + * The data contents model. + */ let data: Contents.IModel = { name: '', path: '', @@ -18,13 +21,9 @@ let data: Contents.IModel = { type: '' }; -interface IContentsList { - [fileName: string]: Contents.IModel; -} - /** * Fetch the list of available drives. - * @returns list of drives + * @returns The list of available drives. */ export async function getDrivesList() { return await requestAPI('drives', 'GET'); @@ -33,6 +32,8 @@ export async function getDrivesList() { /** * Mount a drive by establishing a connection with it. * @param driveName + * @param options.provider The provider of the drive to be mounted. + * @param options.region The region of the drive to be mounted. */ export async function mountDrive( driveName: string, @@ -46,6 +47,15 @@ export async function mountDrive( return await requestAPI('drives', 'POST', body); } +/** + * Get contents of a directory or retrieve contents of a specific file. + * + * @param driveName + * @param options.path The path of object to be retrived + * @param options.path The list containing all registered file types. + * + * @returns A promise which resolves with the contents model. + */ export async function getContents( driveName: string, options: { path: string; registeredFileTypes: IRegisteredFileTypes } diff --git a/src/token.ts b/src/token.ts index 860e672..9f0137a 100644 --- a/src/token.ts +++ b/src/token.ts @@ -1,4 +1,5 @@ import { Token } from '@lumino/coreutils'; +import { Contents } from '@jupyterlab/services'; /** * A token for the plugin that provides the list of drives. @@ -33,6 +34,13 @@ export interface IDriveInfo { mounted: boolean; } +/** + * An interface for storing the contents of a directory. + */ +export interface IContentsList { + [fileName: string]: Contents.IModel; +} + /** * An interface that stores the registered file type, mimetype and format for each file extension. */ From 03887818e87a0f89f60f551085bdd66c5caf766a Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 20 Nov 2024 14:13:14 +0100 Subject: [PATCH 08/19] iterate on path configuration for contents listing --- src/contents.ts | 25 ++++++++++++++++--------- src/requests.ts | 40 +++++++++++++++++++++++----------------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/src/contents.ts b/src/contents.ts index 872dd42..61633c8 100644 --- a/src/contents.ts +++ b/src/contents.ts @@ -194,19 +194,18 @@ export class Drive implements Contents.IDrive { localPath: string, options?: Contents.IFetchOptions ): Promise { - const relativePath = ''; + let relativePath = ''; console.log('GET localpath: ', localPath); if (localPath !== '') { - // if (localPath.includes(this.name)) { - // relativePath = localPath.split(this.name + '/')[1]; - // } else { - // relativePath = localPath; - // } - // extract current drive name const currentDrive = this._drivesList.filter( - x => x.name === localPath + x => + x.name === + (localPath.indexOf('/') !== -1 + ? localPath.substring(0, localPath.indexOf('/')) + : localPath) )[0]; + // when accessed the first time, mount drive if (currentDrive.mounted === false) { try { @@ -220,11 +219,19 @@ export class Drive implements Contents.IDrive { } } + // eliminate drive name from path + relativePath = + localPath.indexOf('/') !== -1 + ? localPath.substring(localPath.indexOf('/') + 1) + : ''; + data = await getContents(currentDrive.name, { - path: '', + path: relativePath, registeredFileTypes: this._registeredFileTypes }); } else { + // retriving list of contents from root + // in our case: list available drives const drivesList: Contents.IModel[] = []; for (const drive of this._drivesList) { drivesList.push({ diff --git a/src/requests.ts b/src/requests.ts index d9fa5b0..4956804 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -69,25 +69,31 @@ export async function getContents( const fileList: IContentsList = {}; response.data.forEach((row: any) => { - const fileName = PathExt.basename(row.path); + // check if we are dealing with files inside a subfolder + if (row.path !== options.path && row.path !== options.path + '/') { + // extract object name from path + const fileName = row.path + .replace(options.path ? options.path + '/' : '', '') + .split('/')[0]; - const [fileType, fileMimeType, fileFormat] = getFileType( - PathExt.extname(PathExt.basename(fileName)), - options.registeredFileTypes - ); + const [fileType, fileMimeType, fileFormat] = getFileType( + PathExt.extname(PathExt.basename(fileName)), + options.registeredFileTypes + ); - fileList[fileName] = fileList[fileName] ?? { - name: fileName, - path: driveName + '/' + row.path, - last_modified: row.last_modified, - created: '', - content: !fileName.split('.')[1] ? [] : null, - format: fileFormat as Contents.FileFormat, - mimetype: fileMimeType, - size: row.size, - writable: true, - type: fileType - }; + fileList[fileName] = fileList[fileName] ?? { + name: fileName, + path: driveName + '/' + row.path, + last_modified: row.last_modified, + created: '', + content: !fileName.split('.')[1] ? [] : null, + format: fileFormat as Contents.FileFormat, + mimetype: fileMimeType, + size: row.size, + writable: true, + type: fileType + }; + } }); data = { From f69d1f4a239547fab6ac9835957935a7c7af98d7 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 20 Nov 2024 16:57:09 +0100 Subject: [PATCH 09/19] iterate on file contents retrival --- jupyter_drives/manager.py | 3 +- src/requests.ts | 99 ++++++++++++++++++++++++--------------- 2 files changed, 63 insertions(+), 39 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index fe8325b..ad7a146 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -166,6 +166,7 @@ async def get_contents(self, drive_name, path): path = '' try : currentObject = os.path.basename(path) if os.path.basename(path) is not None else '' + # check if we are listing contents of a directory if currentObject.find('.') == -1: data = [] @@ -192,7 +193,7 @@ async def get_contents(self, drive_name, path): metadata = await obs.head_async(self._content_managers[drive_name], path) data = { "path": path, - "content": content, + "content": content.decode("utf-8"), "last_modified": metadata["last_modified"].isoformat(), "size": metadata["size"] } diff --git a/src/requests.ts b/src/requests.ts index 4956804..36c6c46 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -66,48 +66,71 @@ export async function getContents( ); if (response.data) { - const fileList: IContentsList = {}; + // listing the contents of a directory + if (options.path.indexOf('.') === -1) { + const fileList: IContentsList = {}; - response.data.forEach((row: any) => { - // check if we are dealing with files inside a subfolder - if (row.path !== options.path && row.path !== options.path + '/') { - // extract object name from path - const fileName = row.path - .replace(options.path ? options.path + '/' : '', '') - .split('/')[0]; + response.data.forEach((row: any) => { + // check if we are dealing with files inside a subfolder + if (row.path !== options.path && row.path !== options.path + '/') { + // extract object name from path + const fileName = row.path + .replace(options.path ? options.path + '/' : '', '') + .split('/')[0]; - const [fileType, fileMimeType, fileFormat] = getFileType( - PathExt.extname(PathExt.basename(fileName)), - options.registeredFileTypes - ); + const [fileType, fileMimeType, fileFormat] = getFileType( + PathExt.extname(PathExt.basename(fileName)), + options.registeredFileTypes + ); - fileList[fileName] = fileList[fileName] ?? { - name: fileName, - path: driveName + '/' + row.path, - last_modified: row.last_modified, - created: '', - content: !fileName.split('.')[1] ? [] : null, - format: fileFormat as Contents.FileFormat, - mimetype: fileMimeType, - size: row.size, - writable: true, - type: fileType - }; - } - }); + fileList[fileName] = fileList[fileName] ?? { + name: fileName, + path: driveName + '/' + row.path, + last_modified: row.last_modified, + created: '', + content: !fileName.split('.')[1] ? [] : null, + format: fileFormat as Contents.FileFormat, + mimetype: fileMimeType, + size: row.size, + writable: true, + type: fileType + }; + } + }); - data = { - name: options.path ? PathExt.basename(options.path) : '', - path: options.path ? options.path + '/' : '', - last_modified: '', - created: '', - content: Object.values(fileList), - format: 'json', - mimetype: '', - size: undefined, - writable: true, - type: 'directory' - }; + data = { + name: options.path ? PathExt.basename(options.path) : '', + path: options.path ? options.path + '/' : '', + last_modified: '', + created: '', + content: Object.values(fileList), + format: 'json', + mimetype: '', + size: undefined, + writable: true, + type: 'directory' + }; + } + // getting the contents of a file + else { + const [fileType, fileMimeType, fileFormat] = getFileType( + PathExt.extname(PathExt.basename(options.path)), + options.registeredFileTypes + ); + + data = { + name: PathExt.basename(options.path), + path: driveName + '/' + response.data.path, + last_modified: response.data.last_modified, + created: '', + content: response.data.content, + format: fileFormat as Contents.FileFormat, + mimetype: fileMimeType, + size: response.data.size, + writable: true, + type: fileType + }; + } } return data; From 123bfda6d23e6c031d9e5017de7730f61302d25a Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 20 Nov 2024 16:58:25 +0100 Subject: [PATCH 10/19] add create checkpoint logic --- src/contents.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/contents.ts b/src/contents.ts index 61633c8..fad7bde 100644 --- a/src/contents.ts +++ b/src/contents.ts @@ -571,7 +571,11 @@ export class Drive implements Contents.IDrive { * checkpoint is created. */ createCheckpoint(path: string): Promise { - return Promise.reject('Repository is read only'); + const emptyCheckpoint: Contents.ICheckpointModel = { + id: '', + last_modified: '' + }; + return Promise.resolve(emptyCheckpoint); } /** From b7a1635cf7e711e8c1116ff945f0fa004b95ac2f Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 20 Nov 2024 18:28:31 +0100 Subject: [PATCH 11/19] add logic to decode special media types --- jupyter_drives/manager.py | 14 ++++++++++++-- src/contents.ts | 2 -- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index ad7a146..cdb38d3 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -7,6 +7,7 @@ import tornado import httpx import traitlets +import base64 from jupyter_server.utils import url_path_join import obstore as obs @@ -155,7 +156,7 @@ async def unmount_drive(self, drive_name: str): return - async def get_contents(self, drive_name, path): + async def get_contents(self, drive_name, path, special_type=False): """Get contents of a file or directory. Args: @@ -191,9 +192,18 @@ async def get_contents(self, drive_name, path): # retrieve metadata of object metadata = await obs.head_async(self._content_managers[drive_name], path) + + # for certain media type files, extracted contents need to be read as a byte array and decoded to base64 to be viewable in JupyterLab + # the following extesnions correspond to a base64 file format or are of type PDF + ext = os.path.splitext(path)[1] + if ext == '.pdf' or ext == '.svg' or ext == '.tif' or ext == '.tiff' or ext == '.jpg' or ext == '.jpeg' or ext == '.gif' or ext == '.png' or ext == '.bmp' or ext == '.webp': + processed_content = base64.b64encode(content).decode("utf-8") + else: + processed_content = content.decode("utf-8") + data = { "path": path, - "content": content.decode("utf-8"), + "content": processed_content, "last_modified": metadata["last_modified"].isoformat(), "size": metadata["size"] } diff --git a/src/contents.ts b/src/contents.ts index fad7bde..dc71cba 100644 --- a/src/contents.ts +++ b/src/contents.ts @@ -195,7 +195,6 @@ export class Drive implements Contents.IDrive { options?: Contents.IFetchOptions ): Promise { let relativePath = ''; - console.log('GET localpath: ', localPath); if (localPath !== '') { // extract current drive name const currentDrive = this._drivesList.filter( @@ -261,7 +260,6 @@ export class Drive implements Contents.IDrive { type: 'directory' }; } - console.log('GET: ', relativePath); Contents.validateContentsModel(data); return data; From c507df0c673eff997d6b9ab425c81780c7e2ef45 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Wed, 20 Nov 2024 18:34:41 +0100 Subject: [PATCH 12/19] fix typo --- jupyter_drives/manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index cdb38d3..5d60750 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -193,8 +193,8 @@ async def get_contents(self, drive_name, path, special_type=False): # retrieve metadata of object metadata = await obs.head_async(self._content_managers[drive_name], path) - # for certain media type files, extracted contents need to be read as a byte array and decoded to base64 to be viewable in JupyterLab - # the following extesnions correspond to a base64 file format or are of type PDF + # for certain media type files, extracted content needs to be read as a byte array and decoded to base64 to be viewable in JupyterLab + # the following extensions correspond to a base64 file format or are of type PDF ext = os.path.splitext(path)[1] if ext == '.pdf' or ext == '.svg' or ext == '.tif' or ext == '.tiff' or ext == '.jpg' or ext == '.jpeg' or ext == '.gif' or ext == '.png' or ext == '.bmp' or ext == '.webp': processed_content = base64.b64encode(content).decode("utf-8") From 391803404632acab55d986b8b8d0a70b4e71afae Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Thu, 21 Nov 2024 11:01:33 +0100 Subject: [PATCH 13/19] remove unused arg --- jupyter_drives/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 5d60750..45d5884 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -156,7 +156,7 @@ async def unmount_drive(self, drive_name: str): return - async def get_contents(self, drive_name, path, special_type=False): + async def get_contents(self, drive_name, path): """Get contents of a file or directory. Args: From f745535d7fc924657539151533cb1ba948683625 Mon Sep 17 00:00:00 2001 From: Denisa Checiu <91504950+DenisaCG@users.noreply.github.com> Date: Fri, 22 Nov 2024 13:06:34 +0100 Subject: [PATCH 14/19] update type check Co-authored-by: Afshin Taylor Darian --- src/contents.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/contents.ts b/src/contents.ts index dc71cba..3c52da0 100644 --- a/src/contents.ts +++ b/src/contents.ts @@ -641,7 +641,7 @@ export class Drive implements Contents.IDrive { this._registeredFileTypes[extension] = { fileType: fileType.name, fileMimeTypes: [...fileType.mimeTypes], - fileFormat: fileType.fileFormat ? fileType.fileFormat : '' + fileFormat: fileType.fileFormat ?? '' }; } }); From a93b5fb7f1cd2a19f216f2dd6b890b75906c7ad7 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Fri, 22 Nov 2024 13:19:18 +0100 Subject: [PATCH 15/19] update directory check --- src/requests.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/requests.ts b/src/requests.ts index 36c6c46..da34dab 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -64,10 +64,11 @@ export async function getContents( 'drives/' + driveName + '/' + options.path, 'GET' ); + const isDir: boolean = PathExt.extname(options.path) === ''; if (response.data) { // listing the contents of a directory - if (options.path.indexOf('.') === -1) { + if (isDir) { const fileList: IContentsList = {}; response.data.forEach((row: any) => { From d41464227b0876e74c7b9b1ed6e2971eddf4f146 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Fri, 22 Nov 2024 17:33:20 +0100 Subject: [PATCH 16/19] improve check for dealing with directories --- jupyter_drives/manager.py | 50 +++++++++++++++++++++++++-------------- src/requests.ts | 3 ++- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 45d5884..2b5cd69 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -166,30 +166,38 @@ async def get_contents(self, drive_name, path): if path == '/': path = '' try : - currentObject = os.path.basename(path) if os.path.basename(path) is not None else '' - - # check if we are listing contents of a directory - if currentObject.find('.') == -1: - data = [] - # using Arrow lists as they are recommended for large results - # sream will be an async iterable of RecordBatch - stream = obs.list(self._content_managers[drive_name], path, chunk_size=100, return_arrow=True) - async for batch in stream: - contents_list = pyarrow.record_batch(batch).to_pylist() - for object in contents_list: - data.append({ - "path": object["path"], - "last_modified": object["last_modified"].isoformat(), - "size": object["size"], - }) - else: + data = [] + isDir = False + emptyDir = True # assume we are dealing with an empty directory + + # using Arrow lists as they are recommended for large results + # stream will be an async iterable of RecordBatch + stream = obs.list(self._content_managers[drive_name], path, chunk_size=100, return_arrow=True) + async for batch in stream: + # check once if we are dealing with a directory + if isDir is False and batch: + isDir = True + emptyDir = False + + contents_list = pyarrow.record_batch(batch).to_pylist() + for object in contents_list: + data.append({ + "path": object["path"], + "last_modified": object["last_modified"].isoformat(), + "size": object["size"], + }) + + # check if we are dealing with an empty drive + if isDir is False and path != '': content = b"" # retrieve contents of object obj = await obs.get_async(self._content_managers[drive_name], path) stream = obj.stream(min_chunk_size=5 * 1024 * 1024) # 5MB sized chunks async for buf in stream: + if emptyDir is True and buf: + emptyDir = False content += buf - + # retrieve metadata of object metadata = await obs.head_async(self._content_managers[drive_name], path) @@ -207,6 +215,12 @@ async def get_contents(self, drive_name, path): "last_modified": metadata["last_modified"].isoformat(), "size": metadata["size"] } + + # dealing with the case of an empty directory + # TO DO: find better way to check + if emptyDir is True and os.path.basename(path).find('.') == -1: + data = [] + response = { "data": data } diff --git a/src/requests.ts b/src/requests.ts index da34dab..2edb2c7 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -64,7 +64,8 @@ export async function getContents( 'drives/' + driveName + '/' + options.path, 'GET' ); - const isDir: boolean = PathExt.extname(options.path) === ''; + // checking if we are dealing with a directory or a file + const isDir: boolean = response.data.length !== undefined; if (response.data) { // listing the contents of a directory From 0790c732efde939b2c54702552f0b5811322ef74 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Mon, 25 Nov 2024 10:42:29 +0100 Subject: [PATCH 17/19] add extra check for listing empty directory --- jupyter_drives/manager.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 2b5cd69..4db483d 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -218,8 +218,12 @@ async def get_contents(self, drive_name, path): # dealing with the case of an empty directory # TO DO: find better way to check - if emptyDir is True and os.path.basename(path).find('.') == -1: - data = [] + if emptyDir is True: + ext_list = ['.R', '.bmp', '.csv', '.gif', '.html', '.ipynb', '.jl', '.jpeg', '.jpg', '.json', '.jsonl', '.md', '.ndjson', '.pdf', '.png', '.py', '.svg', '.tif', '.tiff', '.tsv', '.txt', '.webp', '.yaml', '.yml'] + object_name = os.path.basename(path) + # if object doesn't contain . or doesn't end in one of the registered extensions + if object_name.find('.') == -1 or ext_list.count(os.path.splitext(object_name)[1]) == 0: + data = [] response = { "data": data From 38b9d5796ff286456f809d84bfeb1405e97d4483 Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Mon, 25 Nov 2024 10:43:26 +0100 Subject: [PATCH 18/19] iterate --- jupyter_drives/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 4db483d..13968b5 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -216,7 +216,7 @@ async def get_contents(self, drive_name, path): "size": metadata["size"] } - # dealing with the case of an empty directory + # dealing with the case of an empty directory, making sure it is not an empty file # TO DO: find better way to check if emptyDir is True: ext_list = ['.R', '.bmp', '.csv', '.gif', '.html', '.ipynb', '.jl', '.jpeg', '.jpg', '.json', '.jsonl', '.md', '.ndjson', '.pdf', '.png', '.py', '.svg', '.tif', '.tiff', '.tsv', '.txt', '.webp', '.yaml', '.yml'] From 50f3a122ac0c151b7672e84b58880fe352c7d74e Mon Sep 17 00:00:00 2001 From: DenisaCG Date: Mon, 25 Nov 2024 10:55:23 +0100 Subject: [PATCH 19/19] update comments --- jupyter_drives/manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jupyter_drives/manager.py b/jupyter_drives/manager.py index 13968b5..df0f946 100644 --- a/jupyter_drives/manager.py +++ b/jupyter_drives/manager.py @@ -174,7 +174,7 @@ async def get_contents(self, drive_name, path): # stream will be an async iterable of RecordBatch stream = obs.list(self._content_managers[drive_name], path, chunk_size=100, return_arrow=True) async for batch in stream: - # check once if we are dealing with a directory + # if content exists we are dealing with a directory if isDir is False and batch: isDir = True emptyDir = False @@ -194,6 +194,7 @@ async def get_contents(self, drive_name, path): obj = await obs.get_async(self._content_managers[drive_name], path) stream = obj.stream(min_chunk_size=5 * 1024 * 1024) # 5MB sized chunks async for buf in stream: + # if content exists we are dealing with a file if emptyDir is True and buf: emptyDir = False content += buf