Skip to content

Commit

Permalink
Merge pull request #24 from DenisaCG/getContents
Browse files Browse the repository at this point in the history
Add content retrieval logic
  • Loading branch information
DenisaCG authored Nov 25, 2024
2 parents 11585dc + 50f3a12 commit 760d10e
Show file tree
Hide file tree
Showing 7 changed files with 327 additions and 39 deletions.
6 changes: 3 additions & 3 deletions jupyter_drives/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,17 @@ def initialize(self, logger: logging.Logger, manager: JupyterDrivesManager):
return super().initialize(logger, manager)

@tornado.web.authenticated
async def get(self, path: str = "", drive: str = ""):
async def get(self, drive: str = "", path: str = ""):
    """Respond with the contents stored at ``path`` on ``drive``.

    The lookup is delegated to the manager's ``get_contents``; whatever it
    returns is written out as the response body.

    Args:
        drive: name of the drive to read from
        path: path of the file or directory inside the drive
    """
    contents = await self._manager.get_contents(drive, path)
    self.finish(contents)

@tornado.web.authenticated
async def post(self, path: str = "", drive: str = ""):
async def post(self, drive: str = "", path: str = ""):
    """Create a new object at ``path`` on ``drive``.

    The creation is delegated to the manager's ``new_file``; whatever it
    returns is written out as the response body.

    Args:
        drive: name of the drive to create the object in
        path: path of the new object inside the drive
    """
    created = await self._manager.new_file(drive, path)
    self.finish(created)

@tornado.web.authenticated
async def patch(self, path: str = "", drive: str = ""):
async def patch(self, drive: str = "", path: str = ""):
    """Rename the object at ``path`` on ``drive``.

    The parsed JSON request body is forwarded as keyword arguments to the
    manager's ``rename_file``; whatever it returns is written out as the
    response body.

    Args:
        drive: name of the drive holding the object
        path: current path of the object inside the drive
    """
    payload = self.get_json_body()
    self.finish(await self._manager.rename_file(drive, path, **payload))
Expand Down
83 changes: 79 additions & 4 deletions jupyter_drives/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
import logging
from typing import Dict, List, Optional, Tuple, Union, Any

import os
import tornado
import httpx
import traitlets
import base64
from jupyter_server.utils import url_path_join

import obstore as obs
from libcloud.storage.types import Provider
from libcloud.storage.providers import get_driver
import pyarrow

from .log import get_logger
from .base import DrivesConfig
Expand Down Expand Up @@ -86,7 +89,7 @@ async def list_drives(self):
"name": result.name,
"region": self._config.region_name if self._config.region_name is not None else "eu-north-1",
"creation_date": result.extra["creation_date"],
"mounted": "true" if result.name not in self._content_managers else "false",
"mounted": False if result.name not in self._content_managers else True,
"provider": self._config.provider
}
)
Expand Down Expand Up @@ -153,14 +156,86 @@ async def unmount_drive(self, drive_name: str):

return

async def get_contents(self, drive_name, path, **kwargs):
async def get_contents(self, drive_name, path):
    """Get contents of a file or directory.

    The path is first listed as a directory prefix. If the listing yields
    nothing and the path is not the drive root, the path is fetched as a
    single object instead.

    Args:
        drive_name: name of drive to get the contents of
        path: path to file or directory (empty string for root listing)

    Returns:
        A dict of the form ``{"data": ...}`` where ``data`` is:

        * for a directory, a list of dicts with the keys ``path``,
          ``last_modified`` (ISO 8601 string) and ``size``;
        * for a file, a single dict with the keys ``path``, ``content``,
          ``last_modified`` (ISO 8601 string) and ``size``;
        * an empty list when nothing was listed and no file content could be
          attributed to the path (empty drive or empty directory).

    Raises:
        tornado.web.HTTPError: with status 400 when anything goes wrong while
            retrieving the contents (including an unknown ``drive_name``).
    """
    # extensions whose content is returned base64-encoded so that it is
    # viewable in JupyterLab (base64 file formats and PDF)
    base64_extensions = (
        '.pdf', '.svg', '.tif', '.tiff', '.jpg', '.jpeg', '.gif', '.png',
        '.bmp', '.webp',
    )
    # extensions accepted as "this is a file" when an object has no content
    known_extensions = (
        '.R', '.bmp', '.csv', '.gif', '.html', '.ipynb', '.jl', '.jpeg',
        '.jpg', '.json', '.jsonl', '.md', '.ndjson', '.pdf', '.png', '.py',
        '.svg', '.tif', '.tiff', '.tsv', '.txt', '.webp', '.yaml', '.yml',
    )

    if path == '/':
        path = ''
    try:
        store = self._content_managers[drive_name]
        data = []
        is_dir = False
        is_empty = True  # assume we are dealing with an empty directory

        # using Arrow lists as they are recommended for large results
        # stream will be an async iterable of RecordBatch
        listing = obs.list(store, path, chunk_size=100, return_arrow=True)
        async for batch in listing:
            # if content exists we are dealing with a directory
            if not is_dir and batch:
                is_dir = True
                is_empty = False

            data.extend(
                {
                    "path": entry["path"],
                    "last_modified": entry["last_modified"].isoformat(),
                    "size": entry["size"],
                }
                for entry in pyarrow.record_batch(batch).to_pylist()
            )

        # nothing was listed under a non-root path: try to read it as a file
        if not is_dir and path != '':
            # retrieve contents of object, collecting the chunks and joining
            # them once instead of repeatedly re-allocating a bytes object
            chunks = []
            obj = await obs.get_async(store, path)
            async for buf in obj.stream(min_chunk_size=5 * 1024 * 1024):  # 5MB sized chunks
                # if content exists we are dealing with a file
                if is_empty and buf:
                    is_empty = False
                chunks.append(buf)
            content = b"".join(chunks)

            # retrieve metadata of object
            metadata = await obs.head_async(store, path)

            # certain media type files need to be sent base64-encoded,
            # everything else is decoded as UTF-8 text
            if os.path.splitext(path)[1] in base64_extensions:
                processed_content = base64.b64encode(content).decode("utf-8")
            else:
                processed_content = content.decode("utf-8")

            data = {
                "path": path,
                "content": processed_content,
                "last_modified": metadata["last_modified"].isoformat(),
                "size": metadata["size"],
            }

        # Neither a listing nor any bytes were found: this is either an empty
        # directory or an empty file. It is only kept as a file when its name
        # carries one of the known extensions.
        # TO DO: find better way to check
        if is_empty:
            object_name = os.path.basename(path)
            # if object doesn't contain . or doesn't end in one of the registered extensions
            if '.' not in object_name or os.path.splitext(object_name)[1] not in known_extensions:
                data = []

        response = {
            "data": data
        }
    except Exception as e:
        # keep the original exception chained for server-side debugging
        raise tornado.web.HTTPError(
            status_code=httpx.codes.BAD_REQUEST,
            reason=f"The following error occured when retrieving the contents: {e}",
        ) from e

    return response

async def new_file(self, drive_name, path, **kwargs):
"""Create a new file or directory at the given path.
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ classifiers = [
]
dependencies = [
"obstore>=0.2.0,<0.3",
"pyarrow>=18.0.0,<19.0.0",
"jupyter_server>=2.14.2,<3",
"s3contents>=0.11.1,<0.12.0",
"apache-libcloud>=3.8.0, <4",
Expand Down
108 changes: 79 additions & 29 deletions src/contents.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
// Copyright (c) Jupyter Development Team.
// Distributed under the terms of the Modified BSD License.

import { JupyterFrontEnd } from '@jupyterlab/application';
import { Signal, ISignal } from '@lumino/signaling';
import { Contents, ServerConnection } from '@jupyterlab/services';
import { PathExt } from '@jupyterlab/coreutils';
import { IDriveInfo } from './token';
import { mountDrive } from './requests';
import { IDriveInfo, IRegisteredFileTypes } from './token';
import { getContents, mountDrive } from './requests';

let data: Contents.IModel = {
name: '',
Expand Down Expand Up @@ -120,6 +117,20 @@ export class Drive implements Contents.IDrive {
return this._serverSettings;
}

/**
* The registered file types currently stored on this drive
* (the value of the private `_registeredFileTypes` field).
*/
get registeredFileTypes(): IRegisteredFileTypes {
return this._registeredFileTypes;
}

/**
* Replace the stored registered file types with `fileTypes`.
*/
set registeredFileTypes(fileTypes: IRegisteredFileTypes) {
this._registeredFileTypes = fileTypes;
}

/**
* A signal emitted when a file operation takes place.
*/
Expand Down Expand Up @@ -185,40 +196,41 @@ export class Drive implements Contents.IDrive {
): Promise<Contents.IModel> {
let relativePath = '';
if (localPath !== '') {
if (localPath.includes(this.name)) {
relativePath = localPath.split(this.name + '/')[1];
} else {
relativePath = localPath;
}

// extract current drive name
const currentDrive = this.drivesList.filter(x => x.name === localPath)[0];
const currentDrive = this._drivesList.filter(
x =>
x.name ===
(localPath.indexOf('/') !== -1
? localPath.substring(0, localPath.indexOf('/'))
: localPath)
)[0];

// when accessed the first time, mount drive
if (!currentDrive.mounted) {
if (currentDrive.mounted === false) {
try {
await mountDrive(localPath, {
provider: currentDrive.provider,
region: currentDrive.region
});
currentDrive.mounted = true;
this._drivesList.filter(x => x.name === localPath)[0].mounted = true;
} catch (e) {
console.log(e);
}
}

data = {
name: PathExt.basename(localPath),
path: PathExt.basename(localPath),
last_modified: '',
created: '',
content: [],
format: 'json',
mimetype: '',
size: undefined,
writable: true,
type: 'directory'
};
// eliminate drive name from path
relativePath =
localPath.indexOf('/') !== -1
? localPath.substring(localPath.indexOf('/') + 1)
: '';

data = await getContents(currentDrive.name, {
path: relativePath,
registeredFileTypes: this._registeredFileTypes
});
} else {
// retrieving list of contents from root
// in our case: list available drives
const drivesList: Contents.IModel[] = [];
for (const drive of this._drivesList) {
drivesList.push({
Expand Down Expand Up @@ -248,7 +260,6 @@ export class Drive implements Contents.IDrive {
type: 'directory'
};
}
console.log('GET: ', relativePath);

Contents.validateContentsModel(data);
return data;
Expand Down Expand Up @@ -558,7 +569,11 @@ export class Drive implements Contents.IDrive {
* checkpoint is created.
*/
createCheckpoint(path: string): Promise<Contents.ICheckpointModel> {
return Promise.reject('Repository is read only');
const emptyCheckpoint: Contents.ICheckpointModel = {
id: '',
last_modified: ''
};
return Promise.resolve(emptyCheckpoint);
}

/**
Expand Down Expand Up @@ -599,6 +614,40 @@ export class Drive implements Contents.IDrive {
return Promise.reject('Read only');
}

/**
 * Read every file type known to the application's document registry and
 * index it by file extension (e.g.: .txt, .pdf, .jpeg), keeping for each
 * extension the file type name, its mimetypes (e.g.: text/plain,
 * application/pdf) and its file format (e.g.: base64, text).
 *
 * A file type that declares no extensions is stored under the empty-string
 * key as a directory entry. When several file types share an extension, the
 * first one encountered is kept.
 *
 * @param app - application whose `docRegistry` file types are read
 */
getRegisteredFileTypes(app: JupyterFrontEnd) {
  for (const {
    name,
    extensions,
    mimeTypes,
    fileFormat
  } of app.docRegistry.fileTypes()) {
    // a file type without extensions is treated as a directory
    if (extensions.length === 0) {
      this._registeredFileTypes[''] = {
        fileType: 'directory',
        fileFormat: 'json',
        fileMimeTypes: ['text/directory']
      };
    }

    // record mimetypes and file format per extension, never overwriting
    // an extension that has already been registered
    for (const extension of extensions) {
      if (this._registeredFileTypes[extension]) {
        continue;
      }
      this._registeredFileTypes[extension] = {
        fileType: name,
        fileMimeTypes: [...mimeTypes],
        fileFormat: fileFormat ?? ''
      };
    }
  }
}

/**
* Get a REST url for a file given a path.
*/
Expand All @@ -619,6 +668,7 @@ export class Drive implements Contents.IDrive {
private _fileChanged = new Signal<this, Contents.IChangedArgs>(this);
private _isDisposed: boolean = false;
private _disposed = new Signal<this, void>(this);
private _registeredFileTypes: IRegisteredFileTypes = {};
}

export namespace Drive {
Expand Down
7 changes: 5 additions & 2 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,8 @@ const drivesListProvider: JupyterFrontEndPlugin<IDriveInfo[]> = {
mounted: drive.mounted
});
}
} catch {
console.log('Failed loading available drives list.');
} catch (error) {
console.log('Failed loading available drives list, with error: ', error);
}
return drives;
}
Expand Down Expand Up @@ -224,6 +224,9 @@ const driveFileBrowser: JupyterFrontEndPlugin<void> = {

app.serviceManager.contents.addDrive(drive);

// get registered file types
drive.getRegisteredFileTypes(app);

// Manually restore and load the drive file browser.
const driveBrowser = fileBrowserFactory.createFileBrowser('drivebrowser', {
auto: false,
Expand Down
Loading

0 comments on commit 760d10e

Please sign in to comment.