[api] Improve normal file upload design for public APIs #3878

Open · wants to merge 6 commits into master
104 changes: 34 additions & 70 deletions apps/filebrowser/src/filebrowser/api.py
@@ -21,6 +21,7 @@
import operator
import mimetypes
import posixpath
from io import BytesIO as string_io

from django.core.paginator import EmptyPage, Paginator
from django.http import HttpResponse, HttpResponseNotModified, HttpResponseRedirect, StreamingHttpResponse
@@ -46,6 +47,7 @@
FILE_DOWNLOAD_CACHE_CONTROL,
MAX_FILE_SIZE_UPLOAD_LIMIT,
REDIRECT_DOWNLOAD,
RESTRICT_FILE_EXTENSIONS,
SHOW_DOWNLOAD_BUTTON,
)
from filebrowser.lib import xxd
@@ -394,81 +396,42 @@ def upload_complete(request):

@api_error_handler
def upload_file(request):
"""
A wrapper around the actual upload view function to clean up the temporary file afterwards if it fails.

Returns JSON.
"""
pass
# response = {}

# try:
# response = _upload_file(request)
# except Exception as e:
# LOG.exception('Upload operation failed.')

# file = request.FILES.get('file')
# if file and hasattr(file, 'remove'): # TODO: Call from proxyFS -- Check feasibility of this old comment
# file.remove()

# return HttpResponse(str(e).split('\n', 1)[0], status=500) # TODO: Check error message and status code

# return JsonResponse(response)
# Read request body first to prevent RawPostDataException later on which occurs when trying to access body after it has already been read
body_data_bytes = string_io(request.body)


def _upload_file(request):
"""
Handles file uploaded by HDFSfileUploadHandler.

The uploaded file is stored in HDFS at its destination with a .tmp suffix.
We just need to rename it to the destination path.
"""
uploaded_file = request.FILES['file']
dest_path = request.GET.get('dest')
response = {}

if MAX_FILE_SIZE_UPLOAD_LIMIT.get() >= 0 and uploaded_file.size > MAX_FILE_SIZE_UPLOAD_LIMIT.get():
return HttpResponse(f'File exceeds maximum allowed size of {MAX_FILE_SIZE_UPLOAD_LIMIT.get()} bytes.', status=500)
dest_path = request.POST.get('destination_path')

# Use form for now to triger the upload handler process by Django.
# Might be a better solution now to try directly using handler in request.fs.upload() for all FS.
# form = UploadAPIFileForm(request.POST, request.FILES)
# Check if the file type is restricted
_, file_type = os.path.splitext(uploaded_file.name)
if RESTRICT_FILE_EXTENSIONS.get() and file_type.lower() in [ext.lower() for ext in RESTRICT_FILE_EXTENSIONS.get()]:
return HttpResponse(f'File type "{file_type}" is not allowed. Please choose a file with a different extetypension.', status=400)
Collaborator comment: nit: spelling "extetypension"

Collaborator comment: Two questions:
  • Is there room for a longer error description, e.g. including which formats are allowed?
  • Are we exposing the allowed types via some setting or API, so that we can prevent those files from being uploaded in the UI in the first place?


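A minimal sketch of what exposing the restriction could look like, purely to illustrate the second question above; the view name, route and response shape are assumptions and not part of this change, only RESTRICT_FILE_EXTENSIONS comes from this PR:

# Hypothetical read-only endpoint so the UI could pre-validate before uploading.
from django.http import JsonResponse

from filebrowser.conf import RESTRICT_FILE_EXTENSIONS


def get_upload_restrictions(request):  # hypothetical name and route
  restricted = [ext.strip().lower() for ext in (RESTRICT_FILE_EXTENSIONS.get() or [])]
  return JsonResponse({'restricted_file_extensions': restricted})
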
if request.META.get('upload_failed'):
raise Exception(request.META.get('upload_failed')) # TODO: Check error message and status code
# Check if the file size exceeds the maximum allowed size
max_size = MAX_FILE_SIZE_UPLOAD_LIMIT.get()
Collaborator comment: Is this the same for all file systems?

if max_size >= 0 and uploaded_file.size >= max_size:
return HttpResponse(f'File exceeds maximum allowed size of {max_size} bytes. Please upload a smaller file.', status=413)

# if not form.is_valid():
# raise Exception(f"Error in upload form: {form.errors}")
# Check if the destination path is a directory and the file name contains a path separator
# This prevents directory traversal attacks
if request.fs.isdir(dest_path) and posixpath.sep in uploaded_file.name:
return HttpResponse(f'Invalid filename. Path separators are not allowed.', status=400)

# Check if the file already exists at the destination path
filepath = request.fs.join(dest_path, uploaded_file.name)

if request.fs.isdir(dest_path) and posixpath.sep in uploaded_file.name:
raise Exception(f'Upload failed: {posixpath.sep} is not allowed in the filename {uploaded_file.name}.') # TODO: status code
if request.fs.exists(filepath):
Collaborator comment: I think ideally there should be an option to overwrite an existing file here if the user has already been informed and made that choice, e.g. in the case of an updated version of the file at the same file path.

return HttpResponse(f'The file path {filepath} already exists.', status=409)
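A rough sketch of the overwrite option suggested above; the overwrite form field and the remove() call on the proxy filesystem are assumptions for illustration, not part of this change:

# Hypothetical opt-in overwrite: only replace the existing file when the client
# explicitly asks for it, otherwise keep the 409 behaviour introduced in this PR.
overwrite = request.POST.get('overwrite', 'false').lower() == 'true'  # hypothetical form field

if request.fs.exists(filepath):
  if overwrite:
    request.fs.remove(filepath, skip_trash=True)  # assumes the proxy FS exposes remove()
  else:
    return HttpResponse(f'The file path {filepath} already exists.', status=409)
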

try:
request.fs.upload(file=uploaded_file, path=dest_path, username=request.user.username)
except IOError as ex:
already_exists = False
try:
already_exists = request.fs.exists(dest_path)
except Exception:
pass

if already_exists:
messsage = f'Upload failed: Destination {filepath} already exists.'
else:
messsage = f'Upload error: Copy to {filepath} failed: {str(ex)}'
raise Exception(messsage) # TODO: Check error messages above and status code
request.fs.upload_v1(request.META, input_data=body_data_bytes, destination=dest_path, username=request.user.username)
except Exception as ex:
return HttpResponse(f'Upload to {filepath} failed: {str(ex)}', status=500)

# TODO: Check response fields below
response.update(
{
'path': filepath,
'result': _massage_stats(request, stat_absolute_path(filepath, request.fs.stats(filepath))),
}
)
response = {
'uploaded_file_stats': _massage_stats(request, stat_absolute_path(filepath, request.fs.stats(filepath))),
}

return response
return JsonResponse(response)


@api_error_handler
@@ -766,7 +729,6 @@ def bulk_op(request, op):

error_dict = {}
for p in path_list:

tmp_dict = bulk_dict
if op in (copy, move):
tmp_dict['source_path'] = p
@@ -795,10 +757,12 @@ def _massage_stats(request, stats):
stats_dict = stats.to_json_dict()
normalized_path = request.fs.normpath(stats_dict.get('path'))

stats_dict.update({
'path': normalized_path,
'type': filetype(stats.mode),
'rwx': rwx(stats.mode, stats.aclBit),
})
stats_dict.update(
{
'path': normalized_path,
'type': filetype(stats.mode),
'rwx': rwx(stats.mode, stats.aclBit),
}
)

return stats_dict
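For reference, the reworked view now returns the massaged stats of the newly written file under a single uploaded_file_stats key. A sketch of the shape, where everything beyond the three fields set by _massage_stats() is an assumption:

# Illustrative only -- the concrete keys come from the filesystem's stats.to_json_dict()
# plus the 'path', 'type' and 'rwx' fields added by _massage_stats().
example_response = {
  'uploaded_file_stats': {
    'path': 's3a://bucket/dir/report.csv',  # normalised path
    'type': 'file',                         # filetype(stats.mode)
    'rwx': '-rw-r--r--',                    # rwx(stats.mode, stats.aclBit)
    # ...remaining fields depend on the backing filesystem's stat serialisation
  }
}
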
61 changes: 35 additions & 26 deletions apps/filebrowser/src/filebrowser/conf.py
@@ -18,32 +18,32 @@
from django.utils.translation import gettext_lazy as _

from desktop.conf import ENABLE_DOWNLOAD, is_oozie_enabled
from desktop.lib.conf import Config, coerce_bool
from desktop.lib.conf import Config, coerce_bool, coerce_csv

MAX_SNAPPY_DECOMPRESSION_SIZE = Config(
key="max_snappy_decompression_size",
help=_("Max snappy decompression size in bytes."),
private=True,
default=1024 * 1024 * 25,
type=int)
key="max_snappy_decompression_size", help=_("Max snappy decompression size in bytes."), private=True, default=1024 * 1024 * 25, type=int
)

ARCHIVE_UPLOAD_TEMPDIR = Config(
key="archive_upload_tempdir",
help=_("Location on local filesystem where the uploaded archives are temporary stored."),
default="/tmp/hue_uploads",
type=str)
type=str,
)

FILE_UPLOAD_CHUNK_SIZE = Config(
key="file_upload_chunk_size",
default=5242880,
type=int,
help=_('Configure chunk size of the chunked file uploader. Default chunk size is set to 5MB.'))
help=_('Configure chunk size of the chunked file uploader. Default chunk size is set to 5MB.'),
)

CONCURRENT_MAX_CONNECTIONS = Config(
key="concurrent_max_connections",
default=5,
type=int,
help=_('Configure the maximum number of concurrent connections(chunks) for file uploads using the chunked file uploader.'))
help=_('Configure the maximum number of concurrent connections(chunks) for file uploads using the chunked file uploader.'),
)


def get_desktop_enable_download():
@@ -55,50 +55,59 @@ def get_desktop_enable_download():
key="show_download_button",
help=_("whether to show the download button in hdfs file browser."),
type=coerce_bool,
dynamic_default=get_desktop_enable_download)
dynamic_default=get_desktop_enable_download,
)

SHOW_UPLOAD_BUTTON = Config(
key="show_upload_button",
help=_("whether to show the upload button in hdfs file browser."),
type=coerce_bool,
default=True)

key="show_upload_button", help=_("whether to show the upload button in hdfs file browser."), type=coerce_bool, default=True
)

ENABLE_EXTRACT_UPLOADED_ARCHIVE = Config(
key="enable_extract_uploaded_archive",
help=_("Flag to enable the extraction of a uploaded archive in HDFS."),
type=coerce_bool,
dynamic_default=is_oozie_enabled
dynamic_default=is_oozie_enabled,
)

REDIRECT_DOWNLOAD = Config(
key="redirect_download",
help=_("Redirect client to WebHdfs or S3 for file download. Note: Turning this on will "
"override notebook/redirect_whitelist for user selected file downloads on WebHdfs & S3."),
help=_(
"Redirect client to WebHdfs or S3 for file download. Note: Turning this on will "
"override notebook/redirect_whitelist for user selected file downloads on WebHdfs & S3."
),
type=coerce_bool,
default=False)
default=False,
)

# DEPRECATED in favor of DEFAULT_HOME_PATH per FS config level.
REMOTE_STORAGE_HOME = Config(
key="remote_storage_home",
type=str,
default=None,
help="Optionally set this if you want a different home directory path. e.g. s3a://gethue.")
help="Optionally set this if you want a different home directory path. e.g. s3a://gethue.",
)

MAX_FILE_SIZE_UPLOAD_LIMIT = Config(
key="max_file_size_upload_limit",
default=-1,
type=int,
help=_('A limit on a file size (bytes) that can be uploaded to a filesystem. '
'A value of -1 means there will be no limit.'))
help=_('A limit on a file size (bytes) that can be uploaded to a filesystem. ' 'A value of -1 means there will be no limit.'),
)


def max_file_size_upload_limit():
return MAX_FILE_SIZE_UPLOAD_LIMIT.get()


FILE_DOWNLOAD_CACHE_CONTROL = Config(
key="file_download_cache_control",
type=str,
default=None,
help="Optionally set this to control the caching strategy for files download")
key="file_download_cache_control", type=str, default=None, help="Optionally set this to control the caching strategy for files download"
)

RESTRICT_FILE_EXTENSIONS = Config(
key='restrict_file_extensions',
default='',
type=coerce_csv,
help=_(
'Specify file extensions that are not allowed, separated by commas. For example: .exe, .zip, .rar, .tar, .gz'
),
)
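A small sketch of how the new conf value is expected to be consumed once coerce_csv has split the ini string; whether coerce_csv strips leading whitespace is not visible in this diff, so the strip() below is a hedge:

import os

from filebrowser.conf import RESTRICT_FILE_EXTENSIONS

# With "restrict_file_extensions=.exe, .zip" in hue.ini, coerce_csv is expected to yield
# a list such as ['.exe', ' .zip'].
restricted = [ext.strip().lower() for ext in (RESTRICT_FILE_EXTENSIONS.get() or [])]

_, ext = os.path.splitext('archive.ZIP')
if ext.lower() in restricted:
  print(f'File type "{ext}" is not allowed.')
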
3 changes: 3 additions & 0 deletions desktop/conf.dist/hue.ini
@@ -1736,6 +1736,9 @@ submit_to=True
# A value of -1 means there will be no limit.
## max_file_size_upload_limit=-1

# Specify file extensions that are not allowed, separated by commas.
## restrict_file_extensions=.exe, .zip, .rar, .tar, .gz

###########################################################################
# Settings to configure Pig
###########################################################################
3 changes: 3 additions & 0 deletions desktop/conf/pseudo-distributed.ini.tmpl
@@ -1719,6 +1719,9 @@
# A value of -1 means there will be no limit.
## max_file_size_upload_limit=-1

# Specify file extensions that are not allowed, separated by commas.
## restrict_file_extensions=.exe, .zip, .rar, .tar, .gz


###########################################################################
# Settings to configure Pig
2 changes: 1 addition & 1 deletion desktop/core/src/desktop/api_public.py
@@ -254,7 +254,7 @@ def storage_save_file(request):
@api_view(["POST"])
def storage_upload_file(request):
django_request = get_django_request(request)
return filebrowser_views.upload_file(django_request) # TODO: Fix new api method and switch here
return filebrowser_api.upload_file(django_request)


@api_view(["POST"])
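A hedged usage sketch of the public endpoint after this change, using the requests library; the URL path and bearer-token auth are assumptions and should be checked against the public API routing, only the file part and destination_path field are taken from the diff:

import requests

# Hypothetical URL and token -- verify against Hue's public API URL configuration.
url = 'https://hue.example.com/api/v1/storage/upload/file'
headers = {'Authorization': 'Bearer <access_token>'}

with open('report.csv', 'rb') as f:
  response = requests.post(
    url,
    headers=headers,
    files={'file': f},                              # read by request.FILES['file']
    data={'destination_path': 's3a://bucket/dir'},  # read by request.POST.get('destination_path')
  )

print(response.status_code, response.json().get('uploaded_file_stats'))
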
11 changes: 11 additions & 0 deletions desktop/core/src/desktop/lib/fs/gc/gs.py
@@ -24,6 +24,7 @@
from boto.gs.connection import Location
from boto.gs.key import Key
from boto.s3.prefix import Prefix
from django.http.multipartparser import MultiPartParser
from django.utils.translation import gettext as _

from aws.s3.s3fs import S3FileSystem
@@ -477,3 +478,13 @@ def _check_key_parent_path(self, src, dst):
return True
else:
return False

@translate_gs_error
@auth_error_handler
def upload_v1(self, META, input_data, destination, username):
from desktop.lib.fs.gc.upload import GSNewFileUploadHandler # Circular dependency

gs_upload_handler = GSNewFileUploadHandler(destination, username)

parser = MultiPartParser(META, input_data, [gs_upload_handler])
return parser.parse()
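For context on how MultiPartParser drives the handler passed in above, a minimal skeleton of Django's upload-handler interface is sketched below; the class name and any behaviour beyond the standard FileUploadHandler hooks are assumptions, since GSNewFileUploadHandler itself is not included in this diff:

from django.core.files.uploadhandler import FileUploadHandler


class ExampleObjectStoreUploadHandler(FileUploadHandler):  # hypothetical, for illustration only
  def __init__(self, destination, username):
    super().__init__()
    self.destination = destination
    self.username = username

  def new_file(self, *args, **kwargs):
    # Called once per file part; a real handler would initiate the remote (e.g. GS) upload here.
    super().new_file(*args, **kwargs)

  def receive_data_chunk(self, raw_data, start):
    # Called for every chunk read from the request body; returning None stops the chunk from
    # being passed to any further handlers.
    return None

  def file_complete(self, file_size):
    # Called when the part is fully read; a real handler would finalise the remote object and
    # may return an UploadedFile-like instance to populate request.FILES.
    return None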