Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Links #120

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open

Links #120

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ dist
MANIFEST
bagit.egg-info
.idea
.eggs
*.log
95 changes: 66 additions & 29 deletions bagit.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,13 @@ def find_locale_dir():
UNICODE_BYTE_ORDER_MARK = "\uFEFF"


def make_bag(
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
def make_bag(bag_dir,
bag_info=None,
processes=1,
checksums=None,
checksum=None,
encoding="utf-8",
follow_links=False,
):
"""
Convert a given directory into a bag. You can pass in arbitrary
Expand Down Expand Up @@ -234,7 +239,11 @@ def make_bag(
os.chmod("data", os.stat(cwd).st_mode)

total_bytes, total_files = make_manifests(
"data", processes, algorithms=checksums, encoding=encoding
"data",
processes,
algorithms=checksums,
encoding=encoding,
follow_links=follow_links
)

LOGGER.info(_("Creating bagit.txt"))
Expand Down Expand Up @@ -266,7 +275,7 @@ def make_bag(
finally:
os.chdir(old_dir)

return Bag(bag_dir)
return Bag(bag_dir, follow_links=follow_links)


class Bag(object):
Expand All @@ -275,7 +284,7 @@ class Bag(object):
valid_files = ["bagit.txt", "fetch.txt"]
valid_directories = ["data"]

def __init__(self, path=None):
def __init__(self, path=None, follow_links=False):
super(Bag, self).__init__()
self.tags = {}
self.info = {}
Expand All @@ -297,6 +306,7 @@ def __init__(self, path=None):

self.algorithms = []
self.tag_file_name = None
self.follow_links = follow_links
self.path = abspath(path)
if path:
# if path ends in a path separator, strip it off
Expand Down Expand Up @@ -428,7 +438,8 @@ def payload_files(self):
"""Returns a list of filenames which are present on the local filesystem"""
payload_dir = os.path.join(self.path, "data")

for dirpath, _, filenames in os.walk(payload_dir):
for dirpath, _, filenames in os.walk(payload_dir,
followlinks=self.follow_links):
for f in filenames:
# Jump through some hoops here to make the payload files are
# returned with the directory structure relative to the base
Expand Down Expand Up @@ -507,7 +518,11 @@ def save(self, processes=1, manifests=False):
# Generate new manifest files
if manifests:
total_bytes, total_files = make_manifests(
"data", processes, algorithms=self.algorithms, encoding=self.encoding
"data",
processes,
algorithms=self.algorithms,
encoding=self.encoding,
follow_links=self.follow_links
)

# Update Payload-Oxum
Expand Down Expand Up @@ -921,21 +936,29 @@ def _validate_bagittxt(self):
def _path_is_dangerous(self, path):
"""
Return true if path looks dangerous, i.e. potentially operates
outside the bagging directory structure, e.g. ~/.bashrc, ../../../secrets.json,
\\?\c:\, D:\sys32\cmd.exe
"""
if os.path.isabs(path):
return True
if os.path.expanduser(path) != path:
return True
if os.path.expandvars(path) != path:
return True
real_path = os.path.realpath(os.path.join(self.path, path))
real_path = os.path.normpath(real_path)
bag_path = os.path.realpath(self.path)
bag_path = os.path.normpath(bag_path)
common = os.path.commonprefix((bag_path, real_path))
return not (common == bag_path)

# check for unsafe character sequences like
# ~/.bashrc, ../../../secrets.json, \\?\c:\, D:\sys32\cmd.exe
norm_path = os.path.normpath(os.path.join(self.path, path))
norm_bag_path = os.path.normpath(self.path)
if os.path.commonprefix((norm_path, norm_bag_path)) != norm_bag_path:
return True

# check for symbolic or hard links
real_path = os.path.realpath(norm_path)
real_bag_path = os.path.realpath(norm_bag_path)
if os.path.commonprefix((real_path, real_bag_path)) != real_bag_path \
and not self.follow_links:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So --follow-links allows a symlink pointing to /etc/passwd?

return True

return False


class BagError(Exception):
Expand Down Expand Up @@ -1232,7 +1255,8 @@ def _make_tag_file(bag_info_path, bag_info):
f.write("%s: %s\n" % (h, txt))


def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8"):
def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS,
encoding="utf-8", follow_links=False):
LOGGER.info(
_("Using %(process_count)d processes to generate manifests: %(algorithms)s"),
{"process_count": processes, "algorithms": ", ".join(algorithms)},
Expand All @@ -1242,11 +1266,13 @@ def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="

if processes > 1:
pool = multiprocessing.Pool(processes=processes)
checksums = pool.map(manifest_line_generator, _walk(data_dir))
checksums = pool.map(manifest_line_generator, _walk(data_dir,
follow_links=follow_links))
pool.close()
pool.join()
else:
checksums = [manifest_line_generator(i) for i in _walk(data_dir)]
checksums = [manifest_line_generator(i) for i in _walk(data_dir,
follow_links=follow_links)]

# At this point we have a list of tuples which start with the algorithm name:
manifest_data = {}
Expand Down Expand Up @@ -1309,12 +1335,12 @@ def _make_tagmanifest_file(alg, bag_dir, encoding="utf-8"):
tagmanifest.write("%s %s\n" % (digest, filename))


def _find_tag_files(bag_dir):
def _find_tag_files(bag_dir, follow_links=False):
for dir in os.listdir(bag_dir):
if dir != "data":
if os.path.isfile(dir) and not dir.startswith("tagmanifest-"):
yield dir
for dir_name, _, filenames in os.walk(dir):
for dir_name, _, filenames in os.walk(dir, followlinks=follow_links):
for filename in filenames:
if filename.startswith("tagmanifest-"):
continue
Expand All @@ -1323,8 +1349,8 @@ def _find_tag_files(bag_dir):
yield os.path.relpath(p, bag_dir)


def _walk(data_dir):
for dirpath, dirnames, filenames in os.walk(data_dir):
def _walk(data_dir, follow_links=False):
for dirpath, dirnames, filenames in os.walk(data_dir, followlinks=follow_links):
# if we don't sort here the order of entries is non-deterministic
# which makes it hard to test the fixity of tagmanifest-md5.txt
filenames.sort()
Expand All @@ -1338,7 +1364,7 @@ def _walk(data_dir):
yield path


def _can_bag(test_dir):
def _can_bag(test_dir, follow_links=False):
"""Scan the provided directory for files which cannot be bagged due to insufficient permissions"""
unbaggable = []

Expand All @@ -1350,7 +1376,8 @@ def _can_bag(test_dir):
if not os.access(test_dir, os.W_OK):
unbaggable.append(test_dir)

for dirpath, dirnames, filenames in os.walk(test_dir):
for dirpath, dirnames, filenames in os.walk(test_dir,
followlinks=follow_links):
for directory in dirnames:
full_path = os.path.join(dirpath, directory)
if not os.access(full_path, os.W_OK):
Expand All @@ -1359,7 +1386,7 @@ def _can_bag(test_dir):
return unbaggable


def _can_read(test_dir):
def _can_read(test_dir, follow_links=False):
"""
returns ((unreadable_dirs), (unreadable_files))
"""
Expand All @@ -1369,7 +1396,8 @@ def _can_read(test_dir):
if not os.access(test_dir, os.R_OK):
unreadable_dirs.append(test_dir)
else:
for dirpath, dirnames, filenames in os.walk(test_dir):
for dirpath, dirnames, filenames in os.walk(test_dir,
followlinks=follow_links):
for dn in dirnames:
full_path = os.path.join(dirpath, dn)
if not os.access(full_path, os.R_OK):
Expand Down Expand Up @@ -1499,6 +1527,15 @@ def _make_parser():
" without performing checksum validation to detect corruption."
),
)
parser.add_argument(
"--follow-links",
action="store_true",
help=_(
"Allow bag payload directory to contain symbolic or hard links"
" on operating systems that support them."
),
)


checksum_args = parser.add_argument_group(
_("Checksum Algorithms"),
Expand All @@ -1508,7 +1545,6 @@ def _make_parser():
)
% ", ".join(DEFAULT_CHECKSUMS),
)

for i in CHECKSUM_ALGOS:
alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper())
checksum_args.add_argument(
Expand Down Expand Up @@ -1571,12 +1607,12 @@ def main():
# validate the bag
if args.validate:
try:
bag = Bag(bag_dir)
bag = Bag(bag_dir, follow_links=args.follow_links)
# validate throws a BagError or BagValidationError
bag.validate(
processes=args.processes,
fast=args.fast,
completeness_only=args.completeness_only,
completeness_only=args.completeness_only
)
if args.fast:
LOGGER.info(_("%s valid according to Payload-Oxum"), bag_dir)
Expand All @@ -1596,6 +1632,7 @@ def main():
bag_info=args.bag_info,
processes=args.processes,
checksums=args.checksums,
follow_links=args.follow_links
)
except Exception as exc:
LOGGER.error(
Expand Down
31 changes: 31 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,37 @@ def test_fetch_malformed_url(self):

self.assertEqual(expected_msg, str(cm.exception))

def test_bag_symlink_is_dangerous(self):
src = j(os.path.dirname(__file__), "README.rst")
dst = j(self.tmpdir, "README.rst")
os.symlink(src, dst)
self.assertRaisesRegexp(
bagit.BagError,
'Path "data/README.rst" in manifest ".*?" is unsafe',
bagit.make_bag,
self.tmpdir
)

def test_bag_symlink_file(self):
src = j(os.path.dirname(__file__), "README.rst")
dst = j(self.tmpdir, "README.rst")
os.symlink(src, dst)
bag = bagit.make_bag(self.tmpdir, follow_links=True)
self.assertTrue(bag.validate())

def test_symlink_directory_ignored(self):
src = j(os.path.dirname(__file__), 'test-data', 'si')
dst = j(self.tmpdir, "si-again")
os.symlink(src, dst)
bag = bagit.make_bag(self.tmpdir)
self.assertEqual(len(bag.entries), 15)

def test_symlink_directory_followed(self):
src = j(os.path.dirname(__file__), 'test-data', 'si')
dst = j(self.tmpdir, "si-again")
os.symlink(src, dst)
bag = bagit.make_bag(self.tmpdir, follow_links=True)
self.assertEqual(len(bag.entries), 17)

class TestUtils(unittest.TestCase):
def setUp(self):
Expand Down