diff --git a/tests/providers/dataverse/fixtures.py b/tests/providers/dataverse/fixtures.py index 94eef05bb..a3bb06181 100644 --- a/tests/providers/dataverse/fixtures.py +++ b/tests/providers/dataverse/fixtures.py @@ -31,6 +31,7 @@ def settings(): 'name': 'A look at wizards', } + @pytest.fixture def native_file_metadata(): with open(os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json'), 'r') as fp: @@ -65,12 +66,20 @@ def dataset_metadata_object(): 'Dataset Test Version' ) + @pytest.fixture def file_metadata_object(): with open(os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json'), 'r') as fp: return DataverseFileMetadata(json.load(fp)['native_file_metadata']['datafile'], 'latest') +@pytest.fixture +def csv_file_metadata_object(): + with open(os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json'), 'r') as fp: + return DataverseFileMetadata(json.load(fp)['csv_native_file_metadata']['datafile'], + 'latest') + + @pytest.fixture def revision_metadata_object(): return DataverseRevision('Test Dataset Verision') diff --git a/tests/providers/dataverse/fixtures/root_provider.json b/tests/providers/dataverse/fixtures/root_provider.json index 3fd461287..850753ee5 100644 --- a/tests/providers/dataverse/fixtures/root_provider.json +++ b/tests/providers/dataverse/fixtures/root_provider.json @@ -258,6 +258,22 @@ "label":"thefile.txt", "version":1 }, + "csv_native_file_metadata":{ + "datafile":{ + "contentType":"text/tab-separated-values", + "description":"", + "filename":"%2Fusr%2Flocal%2Fglassfish4%2Fglassfish%2Fdomains%2Fdomain1%2Ffiles%2F10.5072%2FFK2%2F232XYH%2F14c7a73d734-8383551cc713", + "id":20, + "md5":"6b50249f91258397fc5cb7d5a4127e15", + "name":"thefile.tab", + "originalFormatLabel":"Comma Separated Values", + "originalFileFormat": "text/csv" + }, + "datasetVersionId":5, + "description":"", + "label":"thefile.tab", + "version":1 + }, "checksum_mismatch_dataset_metadata":{ "data":{ "createTime":"2015-04-02T13:21:59Z", diff 
--git a/tests/providers/dataverse/test_metadata.py b/tests/providers/dataverse/test_metadata.py index ccb139087..44ff6a384 100644 --- a/tests/providers/dataverse/test_metadata.py +++ b/tests/providers/dataverse/test_metadata.py @@ -2,9 +2,11 @@ from tests.providers.dataverse.fixtures import ( dataset_metadata_object, revision_metadata_object, + csv_file_metadata_object, file_metadata_object ) + class TestDatasetMetadata: def test_dataset_metadata(self, dataset_metadata_object): @@ -45,6 +47,7 @@ def test_file_metadata(self, file_metadata_object): assert not file_metadata_object.created_utc assert file_metadata_object.content_type == 'text/plain; charset=US-ASCII' assert file_metadata_object.etag == 'latest::20' + assert file_metadata_object.original_names == ['thefile.txt'] assert file_metadata_object.extra == { 'fileId': '20', 'datasetVersion': 'latest', @@ -53,3 +56,29 @@ def test_file_metadata(self, file_metadata_object): 'md5': '6b50249f91258397fc5cb7d5a4127e15', }, } + + def test_csv_file_metadata(self, csv_file_metadata_object): + assert csv_file_metadata_object.is_file + assert not csv_file_metadata_object.is_folder + assert csv_file_metadata_object.provider == 'dataverse' + assert csv_file_metadata_object.kind == 'file' + assert csv_file_metadata_object.file_id == '20' + assert csv_file_metadata_object.name == 'thefile.tab' + assert csv_file_metadata_object.path == '/20' + assert csv_file_metadata_object.materialized_path == '/thefile.tab' + assert not csv_file_metadata_object.size + assert not csv_file_metadata_object.modified + assert not csv_file_metadata_object.created_utc + assert csv_file_metadata_object.content_type == 'text/tab-separated-values' + assert csv_file_metadata_object.etag == 'latest::20' + names = csv_file_metadata_object.original_names + assert 'thefile.csv' in names + assert 'thefile.CSV' in names + assert csv_file_metadata_object.extra == { + 'fileId': '20', + 'datasetVersion': 'latest', + 'hasPublishedVersion': False, + 'hashes': { + 
'md5': '6b50249f91258397fc5cb7d5a4127e15', + }, + } diff --git a/tests/providers/dataverse/test_provider.py b/tests/providers/dataverse/test_provider.py index 141abe45c..01e4c6164 100644 --- a/tests/providers/dataverse/test_provider.py +++ b/tests/providers/dataverse/test_provider.py @@ -11,6 +11,7 @@ from waterbutler.core.path import WaterButlerPath from waterbutler.providers.dataverse import settings as dvs from waterbutler.providers.dataverse import DataverseProvider +from waterbutler.providers.dataverse.exceptions import DataverseIngestionLockError from waterbutler.providers.dataverse.metadata import DataverseFileMetadata, DataverseRevision from tests.providers.dataverse.fixtures import ( @@ -235,6 +236,32 @@ async def test_upload_create(self, provider, file_stream, native_file_metadata, assert aiohttpretty.has_call(method='GET', uri=latest_url) assert aiohttpretty.has_call(method='GET', uri=latest_published_url) + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_upload_ingestion_exception(self, provider, file_stream, native_file_metadata, + empty_native_dataset_metadata, native_dataset_metadata): + path = WaterButlerPath('/thefile.txt') + url = provider.build_url(dvs.EDIT_MEDIA_BASE_URL, 'study', provider.doi) + aiohttpretty.register_uri('POST', url, status=400, body=b'something dataset lock: Ingest') + + with pytest.raises(DataverseIngestionLockError): + await provider.upload(file_stream, path) + + assert aiohttpretty.has_call(method='POST', uri=url) + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_upload_random_exception(self, provider, file_stream, native_file_metadata, + empty_native_dataset_metadata, native_dataset_metadata): + path = WaterButlerPath('/thefile.txt') + url = provider.build_url(dvs.EDIT_MEDIA_BASE_URL, 'study', provider.doi) + aiohttpretty.register_uri('POST', url, status=400, body=b'something something error') + + with pytest.raises(exceptions.UploadError): + await provider.upload(file_stream, path) + 
+ assert aiohttpretty.has_call(method='POST', uri=url) + @pytest.mark.asyncio @pytest.mark.aiohttpretty async def test_upload_updates(self, provider, diff --git a/tests/providers/dataverse/test_utils.py b/tests/providers/dataverse/test_utils.py new file mode 100644 index 000000000..0566d5e14 --- /dev/null +++ b/tests/providers/dataverse/test_utils.py @@ -0,0 +1,51 @@ +import pytest + +from waterbutler.providers.dataverse import utils as dv_utils + + +@pytest.fixture +def format_dict(): + return { + 'xlsx': { + 'originalFileFormat': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'originalFormatLabel': 'MS Excel (XLSX)', + 'contentType': 'text/tab-separated-values', + }, + 'RData': { + 'originalFileFormat': 'application/x-rlang-transport', + 'originalFormatLabel': 'R Data', + 'contentType': 'text/tab-separated-values' + }, + 'sav': { + 'originalFileFormat': 'application/x-spss-sav', + 'originalFormatLabel': 'SPSS SAV', + 'contentType': 'text/tab-separated-values' + }, + 'dta': { + 'originalFileFormat': 'application/x-stata', + 'originalFormatLabel': 'Stata Binary', + 'contentType': 'text/tab-separated-values' + }, + 'por': { + 'originalFileFormat': 'application/x-spss-por', + 'originalFormatLabel': 'SPSS Portable', + 'contentType': 'text/tab-separated-values' + }, + 'csv': { + 'originalFileFormat': 'text/csv', + 'originalFormatLabel': 'Comma Separated Values', + 'contentType': 'text/tab-separated-values' + } + } + + +class TestUtils: + + def test_original_ext_from_raw_metadata(self, format_dict): + for key in format_dict: + assert key in dv_utils.original_ext_from_raw_metadata(format_dict[key]) + + def test_original_ext_from_raw_metadata_none_case(self, format_dict): + for key, ext in format_dict.items(): + ext['originalFormatLabel'] = 'blarg' + assert dv_utils.original_ext_from_raw_metadata(ext) is None diff --git a/waterbutler/providers/dataverse/exceptions.py b/waterbutler/providers/dataverse/exceptions.py new file mode 100644 index 
000000000..b41364b7f --- /dev/null +++ b/waterbutler/providers/dataverse/exceptions.py @@ -0,0 +1,15 @@ +from http import HTTPStatus + +from waterbutler.core.exceptions import UploadError + + +class DataverseIngestionLockError(UploadError): + def __init__(self, message, code=HTTPStatus.BAD_REQUEST): + """``dummy`` argument is because children of ``WaterButlerError`` must be instantiable with + a single integer argument. See :class:`waterbutler.core.exceptions.WaterButlerError` + for details. + """ + super().__init__( + 'Some uploads to Dataverse will lock uploading for a time. Please wait' + ' a few seconds and try again.', + code=code) diff --git a/waterbutler/providers/dataverse/metadata.py b/waterbutler/providers/dataverse/metadata.py index 125325dbf..beafe3f6b 100644 --- a/waterbutler/providers/dataverse/metadata.py +++ b/waterbutler/providers/dataverse/metadata.py @@ -1,4 +1,5 @@ from waterbutler.core import metadata +from waterbutler.providers.dataverse import utils as dv_utils class BaseDataverseMetadata(metadata.BaseMetadata): @@ -26,6 +27,23 @@ def file_id(self): def name(self): return self.raw.get('name', None) or self.raw.get('filename', None) + @property + def original_names(self): + """ Dataverse 'ingests' some files types. This changes their extension. + This property will look through the metadata to try to determine possible + original names of the file. 
+ """ + + extensions = dv_utils.original_ext_from_raw_metadata(self.raw) + if extensions is None: + return [self.name] + else: + names = [] + for ext in extensions: + name = self.name[:self.name.rfind('.')] + names.append(name + '.{}'.format(ext)) + return names + @property def path(self): return self.build_path(self.file_id) diff --git a/waterbutler/providers/dataverse/provider.py b/waterbutler/providers/dataverse/provider.py index eddaed0b9..30d6165fa 100644 --- a/waterbutler/providers/dataverse/provider.py +++ b/waterbutler/providers/dataverse/provider.py @@ -11,6 +11,7 @@ from waterbutler.providers.dataverse import settings from waterbutler.providers.dataverse.metadata import DataverseRevision from waterbutler.providers.dataverse.metadata import DataverseDatasetMetadata +from waterbutler.providers.dataverse.exceptions import DataverseIngestionLockError class DataverseProvider(provider.BaseProvider): @@ -170,15 +171,26 @@ async def upload(self, stream, path, **kwargs): headers=dv_headers, auth=(self.token, ), data=file_stream, - expects=(201, ), + expects=(201, 400,), throws=exceptions.UploadError ) + + if resp.status == 400: + data = await resp.read() + data = data.decode('utf-8') + + if 'dataset lock: Ingest' in data: + raise DataverseIngestionLockError({'response': data}) + else: + raise (await exceptions.exception_from_response(resp, + error=exceptions.UploadError)) await resp.release() # Find appropriate version of file metadata = await self._get_data('latest') files = metadata if isinstance(metadata, list) else [] - file_metadata = next(file for file in files if file.name == path.name) + file_metadata = next(file for file in files if (file.name == path.name or + path.name in file.original_names)) if stream.writers['md5'].hexdigest != file_metadata.extra['hashes']['md5']: raise exceptions.UploadChecksumMismatchError() diff --git a/waterbutler/providers/dataverse/utils.py b/waterbutler/providers/dataverse/utils.py new file mode 100644 index 
# waterbutler/providers/dataverse/utils.py
#
# Lookup table of known "original" file formats.  Each entry is matched
# against three fields of a file's raw metadata:
#
#   original_format -> raw['originalFileFormat']
#   original_label  -> raw['originalFormatLabel']
#   content_type    -> raw['contentType']
#
# ``all_extensions`` lists every spelling of the extension that a file of that
# format may originally have had.  The table is shared module state: treat it
# as read-only.
ORIGINAL_FORMATS = {
    'RData': {
        'original_format': 'application/x-rlang-transport',
        'original_label': 'R Data',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['rdata', 'Rdata', 'RData'],
    },
    'sav': {
        'original_format': 'application/x-spss-sav',
        'original_label': 'SPSS SAV',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['sav'],
    },
    'dta': {
        'original_format': 'application/x-stata',
        'original_label': 'Stata Binary',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['dta'],
    },
    'por': {
        'original_format': 'application/x-spss-por',
        'original_label': 'SPSS Portable',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['por'],
    },
    'csv': {
        'original_format': 'text/csv',
        'original_label': 'Comma Separated Values',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['csv', 'CSV'],
    },
    'xlsx': {
        'original_format': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'original_label': 'MS Excel (XLSX)',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['xlsx'],
    },
}


def original_ext_from_raw_metadata(data):
    """Use the raw metadata to figure out possible original extensions.

    :param dict data: raw file metadata; the keys ``originalFormatLabel``,
        ``originalFileFormat`` and ``contentType`` are consulted.
    :rtype: list or None
    :returns: a new list with every known extension spelling for the matching
        entry of ``ORIGINAL_FORMATS``, or ``None`` when any of the three keys
        is missing/empty or when no entry matches all three values.
    """
    label = data.get('originalFormatLabel')
    file_format = data.get('originalFileFormat')
    content_type = data.get('contentType')

    # All three fields are required to identify a format unambiguously.
    if not (label and file_format and content_type):
        return None

    for known in ORIGINAL_FORMATS.values():
        if (label == known['original_label'] and
                file_format == known['original_format'] and
                content_type == known['content_type']):
            # Hand back a copy: returning the table's own list would let a
            # caller's in-place edit leak into every subsequent lookup.
            return list(known['all_extensions'])

    return None