
Commit

Merge pull request #32 from radbrt/azure
Support for Azure Storage accounts
menzenski authored Jan 26, 2023
2 parents 34f135b + c285b37 commit 4fb97f2
Showing 3 changed files with 30 additions and 3 deletions.
3 changes: 2 additions & 1 deletion setup.py
@@ -18,7 +18,8 @@
         'protobuf>=4.21.12',
         'openpyxl',
         'xlrd',
-        'paramiko'
+        'paramiko',
+        'azure-storage-blob>=12.14.0'
     ],
     entry_points="""
     [console_scripts]
11 changes: 11 additions & 0 deletions tap_spreadsheets_anywhere/file_utils.py
@@ -13,6 +13,7 @@
 import tap_spreadsheets_anywhere.format_handler
 import tap_spreadsheets_anywhere.conversion as conversion
 import smart_open.ssh as ssh_transport
+from azure.storage.blob import BlobServiceClient
 import smart_open.ftp as ftp_transport

 LOGGER = logging.getLogger(__name__)
@@ -137,6 +138,8 @@ def get_matching_objects(table_spec, modified_since=None):
         target_objects = list_files_in_gs_bucket(bucket,table_spec.get('search_prefix'))
     elif protocol in ["http", "https"]:
         target_objects = convert_URL_to_file_list(table_spec)
+    elif protocol in ["azure"]:
+        target_objects = list_files_in_azure_bucket(bucket,table_spec.get('search_prefix'))
     else:
         raise ValueError("Protocol {} not yet supported. Pull Requests are welcome!")

@@ -277,6 +280,14 @@ def list_files_in_gs_bucket(bucket, search_prefix=None):

     return target_objects

+def list_files_in_azure_bucket(container_name, search_prefix=None):
+    sas_key = os.environ['AZURE_STORAGE_CONNECTION_STRING']
+    blob_service_client = BlobServiceClient.from_connection_string(sas_key)
+    container_client = blob_service_client.get_container_client(container_name)
+    blob_iterator = container_client.list_blobs(name_starts_with=search_prefix)
+    return [{'Key': blob.name, 'LastModified': blob.last_modified} for blob in blob_iterator if blob.size > 0]
+
+

 def list_files_in_s3_bucket(bucket, search_prefix=None):
     s3_client = boto3.client('s3')
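For reference, a minimal sketch of how the new listing helper might be exercised on its own, assuming AZURE_STORAGE_CONNECTION_STRING is already exported for the target storage account; the container name and prefix below are placeholders for illustration only:

import tap_spreadsheets_anywhere.file_utils as file_utils

# Assumes os.environ['AZURE_STORAGE_CONNECTION_STRING'] holds a valid
# connection string. 'my-container' and 'exports/' are illustrative names.
objects = file_utils.list_files_in_azure_bucket('my-container', search_prefix='exports/')
for obj in objects:
    # Each entry mirrors the S3/GCS listings: a 'Key' and a 'LastModified' timestamp.
    print(obj['Key'], obj['LastModified'])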
19 changes: 17 additions & 2 deletions tap_spreadsheets_anywhere/format_handler.py
Expand Up @@ -5,7 +5,8 @@
import tap_spreadsheets_anywhere.excel_handler
import tap_spreadsheets_anywhere.json_handler
import tap_spreadsheets_anywhere.jsonl_handler

from azure.storage.blob import BlobServiceClient
import os

class InvalidFormatError(Exception):
def __init__(self, fname, message="The file was not in the expected format"):
@@ -18,7 +19,21 @@ def __str__(self):


 def get_streamreader(uri, universal_newlines=True,newline='',open_mode='r'):
-    streamreader = smart_open.open(uri, open_mode, newline=newline, errors='surrogateescape')
+    kwarg_dispatch = {
+        "azure": lambda: {
+            "transport_params": {
+                "client": BlobServiceClient.from_connection_string(
+                    os.environ['AZURE_STORAGE_CONNECTION_STRING'],
+                )
+            }
+        },
+    }
+
+    SCHEME_SEP = "://"
+    kwargs = kwarg_dispatch.get(uri.split(SCHEME_SEP, 1)[0], lambda: {})()
+
+    streamreader = smart_open.open(uri, open_mode, newline=newline, errors='surrogateescape', **kwargs)
+
     if not universal_newlines and isinstance(streamreader, StreamReader):
         return monkey_patch_streamreader(streamreader)
     return streamreader
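The dispatch above only supplies smart_open transport_params when the URI scheme is azure://; every other scheme falls back to an empty kwargs dict and is opened exactly as before. A hedged sketch of the resulting call path, with a placeholder container and blob path:

import tap_spreadsheets_anywhere.format_handler as format_handler

# 'my-container' and the blob path are placeholders for illustration only;
# AZURE_STORAGE_CONNECTION_STRING must be set for the azure:// branch to work.
uri = 'azure://my-container/exports/orders.csv'

# For azure:// URIs, get_streamreader builds a BlobServiceClient from the
# connection string and hands it to smart_open via transport_params.
reader = format_handler.get_streamreader(uri)
for line in reader:
    print(line.rstrip())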
