Skip to content

Commit

Permalink
Improving handling of overwriting when uploading a dataset.
Browse files Browse the repository at this point in the history
  • Loading branch information
Lucaweihs committed Mar 7, 2024
1 parent ffb9437 commit 682f4d3
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 4 deletions.
42 changes: 38 additions & 4 deletions objathor/dataset/upload_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,37 @@
from argparse import ArgumentParser


def rclone_check_file_exists(bucket_path: str, file_name: str) -> bool:
    """Return whether rclone lists `file_name` directly under `bucket_path`.

    Runs `rclone lsf <bucket_path> --include=<pattern>` and reports whether
    any line of its output, stripped of surrounding whitespace, equals
    `file_name`.

    Args:
        bucket_path: Path handed to `rclone lsf` unchanged.
        file_name: Bare file name to look for; must not contain "/".

    Returns:
        True if a stripped output line equals `file_name`, otherwise False.
        The rclone exit status is not checked, so a listing that fails
        without printing the name to stdout also yields False.

    Raises:
        AssertionError: If `file_name` contains "/".
    """
    assert "/" not in file_name, "file_name should not contain '/'"

    # rclone interprets --include values as glob patterns, so a literal name
    # containing glob metacharacters would match the wrong files (or fail to
    # parse). Backslash-escape exactly those characters and nothing else.
    glob_metacharacters = "\\*?[]{}"
    literal_pattern = "".join(
        "\\" + char if char in glob_metacharacters else char for char in file_name
    )

    rclone_command = [
        "rclone",
        "lsf",
        bucket_path,
        f"--include={literal_pattern}",
    ]
    result = subprocess.run(rclone_command, capture_output=True)

    listed_names = result.stdout.decode("utf-8").split("\n")
    return any(name.strip() == file_name for name in listed_names)


def rlcone_upload(
path_to_upload: str, bucket_path: str, less_safe_more_fast: bool = False
path_to_upload: str,
bucket_path: str,
less_safe_more_fast: bool = False,
overwrite: bool = False,
):
if rclone_check_file_exists(
bucket_path=bucket_path, file_name=os.path.basename(path_to_upload)
):
if overwrite:
print(f"{path_to_upload} already exists in {bucket_path}. Overwriting...")
else:
print(
f"{path_to_upload} already exists in {bucket_path}. Skipping upload..."
)
return

rclone_command = [
"rclone",
"copy",
Expand All @@ -26,6 +54,10 @@ def rlcone_upload(
"--s3-disable-checksum",
]
)

if not overwrite:
rclone_command.append("--ignore-existing")

rclone_command.extend([path_to_upload, bucket_path])

subprocess.run(rclone_command, check=True)
Expand Down Expand Up @@ -78,8 +110,10 @@ def upload_dataset(dataset_dir: str, bucket_path: str):
help="Bucket to upload the data to.",
)
args = parser.parse_args()
assert (
os.path.dirname(args.base_dir) == args.bucket_path.split("/")[-1]
), f"The base_dir should be the parent directory of the bucket_path. base_dir={args.base_dir}, bucket_path={args.bucket_path}"
args.base_dir = os.path.abspath(args.base_dir)
assert os.path.basename(args.base_dir) == args.bucket_path.split("/")[-1], (
f"The base_dir should have the same name as the bucket path dir."
f" base dir name={os.path.basename(args.base_dir)}, bucket path dir name={args.bucket_path.split('/')[-1]}"
)

upload_dataset(dataset_dir=args.base_dir, bucket_path=args.bucket_path)
11 changes: 11 additions & 0 deletions objathor/utils/download_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@
from tqdm import tqdm


def does_file_at_url_exist(url: str) -> bool:
    """Return whether a GET of `url` yields a 200 response that is not HTML.

    Args:
        url: The URL to probe.

    Returns:
        False if the status code is not 200 or the Content-Type header starts
        with "text/html" (compared case-insensitively); True otherwise,
        including when the response carries no Content-Type header.
    """
    # stream=True means the body is not downloaded up front; the `with` block
    # closes the response so the underlying connection is released.
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            return False

        # The header may be absent; treat that as "not HTML" instead of
        # failing on None.startswith.
        content_type = response.headers.get("content-type") or ""

    return not content_type.lower().startswith("text/html")


def download_with_progress_bar(
url: str, save_path: str, desc: str = "", overwrite: bool = False
):
Expand Down

0 comments on commit 682f4d3

Please sign in to comment.