From 265adfd18f3d5d1c478e63037e37f250604d881d Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant
Date: Fri, 18 Oct 2024 10:21:38 +0900
Subject: [PATCH 1/5] repozo: factorize the code doing the actual recover
 (write), in preparation for the implementation of incremental recover

---
 src/ZODB/scripts/repozo.py | 71 ++++++++++++++++++++------------------
 1 file changed, 37 insertions(+), 34 deletions(-)

diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
index 8dce5dc80..32d54f561 100755
--- a/src/ZODB/scripts/repozo.py
+++ b/src/ZODB/scripts/repozo.py
@@ -396,6 +396,42 @@ def func(data):
     return bytesread, sum.hexdigest()
 
 
+def recover_repofiles(options, repofiles, outfp):
+    if options.withverify:
+        datfile = os.path.splitext(repofiles[0])[0] + '.dat'
+        with open(datfile) as fp:
+            truth_dict = {}
+            for line in fp:
+                fn, startpos, endpos, sum = line.split()
+                startpos = int(startpos)
+                endpos = int(endpos)
+                filename = os.path.join(options.repository,
+                                        os.path.basename(fn))
+                truth_dict[filename] = {
+                    'size': endpos - startpos,
+                    'sum': sum,
+                }
+        totalsz = 0
+        for repofile in repofiles:
+            reposz, reposum = concat([repofile], outfp)
+            expected_truth = truth_dict[repofile]
+            if reposz != expected_truth['size']:
+                raise VerificationFail(
+                    "%s is %d bytes, should be %d bytes" % (
+                        repofile, reposz, expected_truth['size']))
+            if reposum != expected_truth['sum']:
+                raise VerificationFail(
+                    "{} has checksum {} instead of {}".format(
+                        repofile, reposum, expected_truth['sum']))
+            totalsz += reposz
+            log("Recovered chunk %s : %s bytes, md5: %s",
+                repofile, reposz, reposum)
+        log("Recovered a total of %s bytes", totalsz)
+    else:
+        reposz, reposum = concat(repofiles, outfp)
+        log('Recovered %s bytes, md5: %s', reposz, reposum)
+
+
 def gen_filedate(options):
     return getattr(options, 'test_now', time.gmtime()[:6])
 
@@ -698,40 +734,7 @@ def do_recover(options):
         files_to_close += (outfp,)
 
     try:
-        if options.withverify:
-            datfile = os.path.splitext(repofiles[0])[0] + '.dat'
-            with open(datfile) as fp:
-                truth_dict = {}
-                for line in fp:
-                    fn, startpos, endpos, sum = line.split()
-                    startpos = int(startpos)
-                    endpos = int(endpos)
-                    filename = os.path.join(options.repository,
-                                            os.path.basename(fn))
-                    truth_dict[filename] = {
-                        'size': endpos - startpos,
-                        'sum': sum,
-                    }
-            totalsz = 0
-            for repofile in repofiles:
-                reposz, reposum = concat([repofile], outfp)
-                expected_truth = truth_dict[repofile]
-                if reposz != expected_truth['size']:
-                    raise VerificationFail(
-                        "%s is %d bytes, should be %d bytes" % (
-                            repofile, reposz, expected_truth['size']))
-                if reposum != expected_truth['sum']:
-                    raise VerificationFail(
-                        "{} has checksum {} instead of {}".format(
-                            repofile, reposum, expected_truth['sum']))
-                totalsz += reposz
-                log("Recovered chunk %s : %s bytes, md5: %s",
-                    repofile, reposz, reposum)
-            log("Recovered a total of %s bytes", totalsz)
-        else:
-            reposz, reposum = concat(repofiles, outfp)
-            log('Recovered %s bytes, md5: %s', reposz, reposum)
-
+        recover_repofiles(options, repofiles, outfp)
         if options.output is not None:
             last_base = os.path.splitext(repofiles[-1])[0]
             source_index = '%s.index' % last_base

From f5eacd0a6116a977f65c1efe5a5eb5945d458fad Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant
Date: Thu, 17 Oct 2024 17:20:03 +0900
Subject: [PATCH 2/5] repozo: support incremental recover

This allows recovering a ZODB FileStorage by appending only the missing
chunks to the previously recovered file, instead of always recovering
from scratch.

Based on the work of @vpelletier (incpozo).
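The decision logic is roughly the following (an illustration only, with a
made-up helper name; it assumes the existing repozo .dat format of one
"filename startpos endpos md5" line per backup file, and the real change
below additionally md5-checks the last common chunk and falls back to a
full recover on any mismatch):

    def plan_incremental_recover(datfile, output_size):
        # Return (seek_position, dat_entries_still_to_apply), or None when
        # only a full recover can work.
        chunks = []
        with open(datfile) as fp:
            for line in fp:
                fn, start, end, md5sum = line.split()
                chunks.append((fn, int(start), int(end), md5sum))
        # chunks already entirely contained in the recovered output file
        contained = [c for c in chunks if c[2] <= output_size]
        if not contained:
            return None  # output smaller than the full backup
        if output_size > chunks[-1][2]:
            return None  # output larger than the whole backup set
        # .dat entries are contiguous: the next chunk starts exactly where
        # the last fully recovered one ends
        return contained[-1][2], chunks[len(contained):]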
---
 CHANGES.rst                           |  11 +-
 src/ZODB/scripts/repozo.py            | 121 +++++++++++---
 src/ZODB/scripts/tests/test_repozo.py | 227 +++++++++++++++++++++++++-
 3 files changed, 331 insertions(+), 28 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 4bdbfea0c..87122f15d 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -5,9 +5,16 @@
 6.0.1 (unreleased)
 ==================
 
-  - repozo: fix restoring on stdout.
+- repozo: Change restoration to be incremental by default, unless ``--full``
+  is provided.
+  Repozo now appends only the missing incremental deltafs to a previously
+  restored FileStorage, provided the file size and the checksum of the last
+  restored increment match; otherwise it falls back to a full recover.
+  For details see `#403 `_.
 
-  - repozo: prevent an incorrect "option ignored" warning when running backup or verify.
+- repozo: fix restoring on stdout.
+
+- repozo: prevent an incorrect "option ignored" warning when running backup or verify.
 
 
 6.0 (2024-03-20)
diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
index 32d54f561..283cae2a3 100755
--- a/src/ZODB/scripts/repozo.py
+++ b/src/ZODB/scripts/repozo.py
@@ -73,6 +73,12 @@
         Note: for the stdout case, the index file will **not** be restored
         automatically.
 
+    -F / --full
+        Force a full recover. By default, an incremental recover is made
+        if possible, by only copying the latest backup delta to the recovered
+        ZODB file. A full recover will always be done if a pack has occurred
+        since the last incremental backup.
+
     -w
     --with-verify
         Verify on the fly the backup files on recovering. This option runs
@@ -185,7 +191,7 @@ class Options:
     mode = None         # BACKUP, RECOVER or VERIFY
     file = None         # name of input Data.fs file
     repository = None   # name of directory holding backups
-    full = False        # True forces full backup
+    full = False        # True forces full backup or full recovery
     date = None         # -D argument, if any
     output = None       # where to write recovered data; None = stdout
     quick = False       # -Q flag state
@@ -396,9 +402,8 @@ def func(data):
     return bytesread, sum.hexdigest()
 
 
-def recover_repofiles(options, repofiles, outfp):
+def recover_repofiles(options, repofiles, datfile, outfp):
     if options.withverify:
-        datfile = os.path.splitext(repofiles[0])[0] + '.dat'
         with open(datfile) as fp:
             truth_dict = {}
             for line in fp:
@@ -709,15 +714,7 @@ def do_backup(options):
         do_full_backup(options)
 
 
-def do_recover(options):
-    # Find the first full backup at or before the specified date
-    repofiles = find_files(options)
-    if not repofiles:
-        if options.date:
-            raise NoFiles(f'No files in repository before {options.date}')
-        else:
-            raise NoFiles('No files in repository')
-
+def do_full_recover(options, repofiles):
     files_to_close = ()
     if options.output is None:
         log('Recovering file to stdout')
@@ -734,17 +731,8 @@ def do_recover(options):
         files_to_close += (outfp,)
 
     try:
-        recover_repofiles(options, repofiles, outfp)
-        if options.output is not None:
-            last_base = os.path.splitext(repofiles[-1])[0]
-            source_index = '%s.index' % last_base
-            target_index = '%s.index' % options.output
-            if os.path.exists(source_index):
-                log('Restoring index file %s to %s',
-                    source_index, target_index)
-                shutil.copyfile(source_index, target_index)
-            else:
-                log('No index file to restore: %s', source_index)
+        datfile = os.path.splitext(repofiles[0])[0] + '.dat'
+        recover_repofiles(options, repofiles, datfile, outfp)
     finally:
         for f in files_to_close:
             f.close()
@@ -758,6 +746,93 @@ def do_recover(options):
         raise
 
 
+def do_incremental_recover(options, repofiles):
+    datfile = 
os.path.splitext(repofiles[0])[0] + '.dat' + log('Recovering (incrementally) file to %s', options.output) + with open(options.output, 'r+b') as outfp: + outfp.seek(0, 2) + initial_length = outfp.tell() + with open(datfile) as fp: + previous_chunk = None + for line in fp: + fn, startpos, endpos, _ = chunk = line.split() + startpos = int(startpos) + endpos = int(endpos) + if endpos > initial_length: + break + previous_chunk = chunk + + if previous_chunk is None: + log('Target file smaller than full backup, ' + 'falling back to a full recover.') + return do_full_recover(options, repofiles) + if endpos < initial_length: + log('Target file is larger than latest backup, ' + 'falling back to a full recover.') + return do_full_recover(options, repofiles) + check_startpos = int(previous_chunk[1]) + check_endpos = int(previous_chunk[2]) + with open(options.output, 'r+b') as outfp: + outfp.seek(check_startpos) + check_sum = checksum(outfp, check_endpos - check_startpos) + if endpos == initial_length and chunk[3] == check_sum: + log('Target file is same size as latest backup, ' + 'doing nothing.') + return + elif previous_chunk[3] != check_sum: + if endpos == initial_length: + log('Target file is not consistent with latest backup, ' + 'falling back to a full recover.') + return do_full_recover(options, repofiles) + else: + log('Last whole common chunk checksum did not match with backup, ' + 'falling back to a full recover.') + return do_full_recover(options, repofiles) + + filename = os.path.join(options.repository, + os.path.basename(fn)) + first_file_to_restore = repofiles.index(filename) + assert first_file_to_restore > 0, ( + first_file_to_restore, options.repository, fn, filename, repofiles) + + temporary_output_file = options.output + '.part' + os.rename(options.output, temporary_output_file) + with open(temporary_output_file, 'r+b') as outfp: + outfp.seek(startpos) + recover_repofiles(options, + repofiles[first_file_to_restore:], + datfile, + outfp) + os.rename(temporary_output_file, options.output) + + +def do_recover(options): + # Find the first full backup at or before the specified date + repofiles = find_files(options) + if not repofiles: + if options.date: + raise NoFiles(f'No files in repository before {options.date}') + else: + raise NoFiles('No files in repository') + + if (options.full or options.output is None + or not os.path.exists(options.output)): + do_full_recover(options, repofiles) + else: + do_incremental_recover(options, repofiles) + + if options.output is not None: + last_base = os.path.splitext(repofiles[-1])[0] + source_index = '%s.index' % last_base + target_index = '%s.index' % options.output + if os.path.exists(source_index): + log('Restoring index file %s to %s', + source_index, target_index) + shutil.copyfile(source_index, target_index) + else: + log('No index file to restore: %s', source_index) + + def do_verify(options): # Verify the sizes and checksums of all files mentioned in the .dat file repofiles = find_files(options) diff --git a/src/ZODB/scripts/tests/test_repozo.py b/src/ZODB/scripts/tests/test_repozo.py index 13d24bef5..eccb4f1c1 100644 --- a/src/ZODB/scripts/tests/test_repozo.py +++ b/src/ZODB/scripts/tests/test_repozo.py @@ -864,8 +864,7 @@ def test_w_changes(self): self.assertEqual(index.maxKey(), db.maxkey) -class Test_do_recover(OptionsTestBase, unittest.TestCase): - +class Mixin_do_recover: def _callFUT(self, options): from ZODB.scripts.repozo import do_recover return do_recover(options) @@ -902,6 +901,17 @@ def 
test_no_files_before_explicit_date(self): files.append(self._makeFile(h, m, s, e)) self.assertRaises(NoFiles, self._callFUT, options) + +class Test_do_full_recover( + Mixin_do_recover, + OptionsTestBase, + unittest.TestCase +): + def _makeOptions(self, **kw): + options = super()._makeOptions(**kw) + options.full = True + return options + def test_w_full_backup_latest_no_index(self): import tempfile dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') @@ -1010,6 +1020,216 @@ def test_w_incr_backup_with_verify_size_inconsistent(self): self.assertTrue(os.path.exists(output + '.part')) +class Test_do_incremental_recover( + Mixin_do_recover, + OptionsTestBase, + unittest.TestCase +): + def setUp(self): + from ZODB.scripts import repozo + self._old_verbosity = repozo.VERBOSE + self._old_stderr = sys.stderr + repozo.VERBOSE = True + sys.stderr = StringIO() + + def tearDown(self): + from ZODB.scripts import repozo + sys.stderr = self._old_stderr + repozo.VERBOSE = self._old_verbosity + + def _makeOptions(self, **kw): + options = super()._makeOptions(**kw) + options.full = False + return options + + def _createRecoveredDataFS(self, output, options): + self._makeFile(2, 3, 4, '.fs', 'AAA') + self._makeFile(4, 5, 6, '.deltafs', 'BBB') + self._makeFile( + 2, 3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n') # noqa: E501 line too long + self._callFUT(options) + self.assertEqual(_read_file(output), b'AAABBB') + self.assertFalse(os.path.exists(output + '.part')) + return output + + def test_do_nothing(self): + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + withverify=False) + self._createRecoveredDataFS(output, options) + self._callFUT(options) + self.assertIn( + "doing nothing", sys.stderr.getvalue()) + + def test_w_incr_recover_from_incr_backup(self): + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + withverify=False) + self._createRecoveredDataFS(output, options) + # Create 2 more .deltafs, to prove the code knows where to pick up + self._makeFile(6, 7, 8, '.deltafs', 'CCC') + self._makeFile(8, 9, 10, '.deltafs', 'DDD') + self._makeFile( + 2, 3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long + '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n' # noqa: E501 line too long + '/backup/2010-05-14-08-09-10.deltafs 9 12 45054f47ac3305a2a33e9bcceadff712\n') # noqa: E501 line too long + os.unlink( + os.path.join(self._repository_directory, + '2010-05-14-04-05-06.deltafs')) + self._callFUT(options) + self.assertNotIn('falling back to a full recover.', + sys.stderr.getvalue()) + self.assertEqual(_read_file(output), b'AAABBBCCCDDD') + self.assertFalse(os.path.exists(output + '.part')) + + def test_w_incr_backup_with_verify_sum_inconsistent(self): + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + withverify=True) + 
self._createRecoveredDataFS(output, options) + self._makeFile(6, 7, 8, '.deltafs', 'CCC') + self._makeFile(8, 9, 10, '.deltafs', 'DDD') + self._makeFile( + 2, 3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long + '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166af\n' # noqa: E501 line too long + '/backup/2010-05-14-08-09-10.deltafs 9 12 45054f47ac3305a2a33e9bcceadff712\n') # noqa: E501 line too long + from ZODB.scripts.repozo import VerificationFail + self.assertRaises(VerificationFail, self._callFUT, options) + self.assertNotIn('falling back to a full recover.', + sys.stderr.getvalue()) + self.assertTrue(os.path.exists(output + '.part')) + + def test_w_incr_backup_with_verify_size_inconsistent_too_small(self): + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + withverify=True) + self._createRecoveredDataFS(output, options) + self._makeFile(6, 7, 8, '.deltafs', 'CCC') + self._makeFile(8, 9, 10, '.deltafs', 'DDD') + self._makeFile( + 2, 3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long + '/backup/2010-05-14-06-07-08.deltafs 6 8 defb99e69a9f1f6e06f15006b1f166ae\n' # noqa: E501 line too long + '/backup/2010-05-14-08-09-10.deltafs 9 12 45054f47ac3305a2a33e9bcceadff712\n') # noqa: E501 line too long + from ZODB.scripts.repozo import VerificationFail + self.assertRaises(VerificationFail, self._callFUT, options) + self.assertNotIn('falling back to a full recover.', + sys.stderr.getvalue()) + self.assertTrue(os.path.exists(output + '.part')) + + def test_w_incr_backup_with_verify_size_inconsistent_too_big(self): + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + withverify=True) + self._createRecoveredDataFS(output, options) + self._makeFile(6, 7, 8, '.deltafs', 'CCC') + self._makeFile(8, 9, 10, '.deltafs', 'DDD') + self._makeFile( + 2, 3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long + '/backup/2010-05-14-06-07-08.deltafs 6 10 defb99e69a9f1f6e06f15006b1f166ae\n' # noqa: E501 line too long + '/backup/2010-05-14-08-09-10.deltafs 9 12 45054f47ac3305a2a33e9bcceadff712\n') # noqa: E501 line too long + from ZODB.scripts.repozo import VerificationFail + self.assertRaises(VerificationFail, self._callFUT, options) + self.assertNotIn('falling back to a full recover.', + sys.stderr.getvalue()) + self.assertTrue(os.path.exists(output + '.part')) + + def test_w_incr_backup_switch_auto_to_full_recover_if_output_larger_than_dat(self): # noqa: E501 line too long + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + withverify=False) + self._createRecoveredDataFS(output, options) + self._makeFile(6, 7, 8, '.deltafs', 'CCC') + self._makeFile( + 2, 
3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long + '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n') # noqa: E501 line too long + # The ZODB is longer than announced in the .dat file + with open(output, 'r+b') as f: + f.write(b'AAABBBCCCDDD') + self._callFUT(options) + self.assertEqual(_read_file(output), b'AAABBBCCC') + self.assertFalse(os.path.exists(output + '.part')) + self.assertIn( + "falling back to a full recover", sys.stderr.getvalue()) + + def test_w_incr_backup_switch_auto_to_full_recover_if_chunk_is_wrong(self): # noqa: E501 line too long + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + withverify=False) + self._createRecoveredDataFS(output, options) + self._makeFile(6, 7, 8, '.deltafs', 'CCC') + self._makeFile( + 2, 3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a4\n' # noqa: E501 line too long + '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n') # noqa: E501 line too long + self._callFUT(options) + self.assertEqual(_read_file(output), b'AAABBBCCC') + self.assertFalse(os.path.exists(output + '.part')) + self.assertIn( + "Last whole common chunk checksum did not match with backup, falling back to a full recover.", # noqa: E501 line too long + sys.stderr.getvalue()) + + def test_w_incr_backup_switch_auto_to_full_recover_after_pack(self): + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + withverify=False) + self._makeFile(2, 3, 4, '.fs', 'AAA') + self._makeFile(4, 5, 6, '.deltafs', 'BBB') + self._makeFile( + 2, 3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n') # noqa: E501 line too long + self._callFUT(options) + self.assertEqual(_read_file(output), b'AAABBB') + + self._makeFile(6, 7, 8, '.fs', 'CCDD') + self._makeFile( + 6, 7, 8, '.dat', + '/backup/2010-05-14-06-07-08.fs 0 4 dc0ee37408176d839c13f291a4d588de\n') # noqa: E501 line too long + self._callFUT(options) + self.assertEqual(_read_file(output), b'CCDD') + self.assertFalse(os.path.exists(output + '.part')) + self.assertIn( + 'Target file is larger than latest backup, falling back to a full recover.', # noqa: E501 line too long + sys.stderr.getvalue()) + + class Test_do_verify(OptionsTestBase, unittest.TestCase): def _callFUT(self, options): @@ -1296,7 +1516,8 @@ def test_suite(): loadTestsFromTestCase(Test_do_full_backup), loadTestsFromTestCase(Test_do_incremental_backup), # unittest.makeSuite(Test_do_backup), #TODO - loadTestsFromTestCase(Test_do_recover), + loadTestsFromTestCase(Test_do_full_recover), + loadTestsFromTestCase(Test_do_incremental_recover), loadTestsFromTestCase(Test_do_verify), # N.B.: this test take forever to run (~40sec on a fast laptop), # *and* it is non-deterministic. 
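
A note on the .dat fixtures used in these tests: each line is
"<backup file> <startpos> <endpos> <md5 of that chunk's payload>", so a
fixture line can be regenerated with a few lines of Python (sketch for
reference only; dat_line is not a helper of the test module):

    import hashlib

    def dat_line(backup_path, start, payload):
        # one .dat entry: "<file> <startpos> <endpos> <md5-of-payload>"
        end = start + len(payload)
        return '%s %d %d %s\n' % (
            backup_path, start, end, hashlib.md5(payload).hexdigest())

    # e.g. the 'AAA' full backup followed by the 'BBB' increment:
    dat_line('/backup/2010-05-14-02-03-04.fs', 0, b'AAA')
    dat_line('/backup/2010-05-14-04-05-06.deltafs', 3, b'BBB')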
From c21a6df92544302d971594c893452224c3e71279 Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant
Date: Thu, 14 Nov 2024 16:17:42 +0900
Subject: [PATCH 3/5] repozo: integrate a quick option for the incremental
 recover

The default behavior becomes to check the integrity of the existing
recovered Data.fs before incrementally restoring onto it. The quick
option verifies only the latest chunk previously recovered onto the
Data.fs before restoring incrementally. This saves many reads.
---
 src/ZODB/scripts/repozo.py            | 59 +++++++++++-------
 src/ZODB/scripts/tests/test_repozo.py | 87 +++++++++++++++++++++++++--
 2 files changed, 120 insertions(+), 26 deletions(-)

diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
index 283cae2a3..8240bebc7 100755
--- a/src/ZODB/scripts/repozo.py
+++ b/src/ZODB/scripts/repozo.py
@@ -79,6 +79,13 @@
         ZODB file. A full recover will always be done if a pack has occurred
         since the last incremental backup.
 
+    -Q / --quick
+        Verify via md5 checksum only the last increment recovered onto the
+        output file. This reduces disk i/o at the (theoretical) cost of
+        missing an inconsistency. This is a probabilistic way of determining
+        whether a full recover is necessary. This argument is ignored when
+        -F / --full is used.
+
     -w
     --with-verify
         Verify on the fly the backup files on recovering. This option runs
@@ -267,6 +274,9 @@ class Options:
         if options.killold:
             log('--kill-old-on-full option is ignored in recover mode')
             options.killold = False
+        if options.full and options.quick:
+            log('--quick option is ignored if --full option is used')
+            options.quick = None
     else:
         assert options.mode == VERIFY
         if options.date is not None:
@@ -752,42 +762,49 @@ def do_incremental_recover(options, repofiles):
     datfile = os.path.splitext(repofiles[0])[0] + '.dat'
     log('Recovering (incrementally) file to %s', options.output)
     with open(options.output, 'r+b') as outfp:
         outfp.seek(0, 2)
         initial_length = outfp.tell()
-    with open(datfile) as fp:
-        previous_chunk = None
+
+    error = ''
+    previous_chunk = None
+    with open(datfile) as fp, open(options.output, 'r+b') as outfp:
         for line in fp:
-            fn, startpos, endpos, _ = chunk = line.split()
+            fn, startpos, endpos, check_sum = chunk = line.split()
             startpos = int(startpos)
             endpos = int(endpos)
             if endpos > initial_length:
                 break
+            if not options.quick:
+                if check_sum != checksum(outfp, endpos - startpos):
+                    error = ('Target file is not consistent with backup %s, '
+                             'falling back to a full recover.') % fn
+                    break
             previous_chunk = chunk
-
-    if previous_chunk is None:
+    if error:
+        log(error)
+        return do_full_recover(options, repofiles)
+    elif previous_chunk is None:
         log('Target file smaller than full backup, '
             'falling back to a full recover.')
         return do_full_recover(options, repofiles)
-    if endpos < initial_length:
+    elif endpos < initial_length:
         log('Target file is larger than latest backup, '
             'falling back to a full recover.')
         return do_full_recover(options, repofiles)
-    check_startpos = int(previous_chunk[1])
-    check_endpos = int(previous_chunk[2])
-    with open(options.output, 'r+b') as outfp:
-        outfp.seek(check_startpos)
-        check_sum = checksum(outfp, check_endpos - check_startpos)
-    if endpos == initial_length and chunk[3] == check_sum:
+    if options.quick:
+        check_startpos = int(previous_chunk[1])
+        check_endpos = int(previous_chunk[2])
+        with open(options.output, 'r+b') as outfp:
+            outfp.seek(check_startpos)
+            if previous_chunk[3] != checksum(
+                    outfp, check_endpos - check_startpos):
+                error = ('Target file is not consistent with backup %s, '
+                         'falling back to a full recover.'
% previous_chunk[0]) + if error: + log(error) + return do_full_recover(options, repofiles) + if endpos == initial_length: log('Target file is same size as latest backup, ' 'doing nothing.') return - elif previous_chunk[3] != check_sum: - if endpos == initial_length: - log('Target file is not consistent with latest backup, ' - 'falling back to a full recover.') - return do_full_recover(options, repofiles) - else: - log('Last whole common chunk checksum did not match with backup, ' - 'falling back to a full recover.') - return do_full_recover(options, repofiles) filename = os.path.join(options.repository, os.path.basename(fn)) diff --git a/src/ZODB/scripts/tests/test_repozo.py b/src/ZODB/scripts/tests/test_repozo.py index eccb4f1c1..9d5696e72 100644 --- a/src/ZODB/scripts/tests/test_repozo.py +++ b/src/ZODB/scripts/tests/test_repozo.py @@ -220,13 +220,16 @@ def test_recover_ignored_args(self): from ZODB.scripts import repozo options = repozo.parseargs(['-R', '-r', '/tmp/nosuchdir', '-v', '-f', '/tmp/ignored.fs', - '-k']) + '-k', '--full', '--quick']) self.assertEqual(options.file, None) self.assertIn('--file option is ignored in recover mode', sys.stderr.getvalue()) self.assertEqual(options.killold, False) self.assertIn('--kill-old-on-full option is ignored in recover mode', sys.stderr.getvalue()) + self.assertEqual(options.quick, None) + self.assertIn('--quick option is ignored if --full option is used', + sys.stderr.getvalue()) def test_verify_ignored_args(self): from ZODB.scripts import repozo @@ -1040,6 +1043,7 @@ def tearDown(self): def _makeOptions(self, **kw): options = super()._makeOptions(**kw) options.full = False + options.quick = kw.get('quick', False) return options def _createRecoveredDataFS(self, output, options): @@ -1092,6 +1096,33 @@ def test_w_incr_recover_from_incr_backup(self): self.assertEqual(_read_file(output), b'AAABBBCCCDDD') self.assertFalse(os.path.exists(output + '.part')) + def test_w_quick_incr_recover_from_incr_backup(self): + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + quick=True, + withverify=False) + self._createRecoveredDataFS(output, options) + # Create 2 more .deltafs, to prove the code knows where to pick up + self._makeFile(6, 7, 8, '.deltafs', 'CCC') + self._makeFile(8, 9, 10, '.deltafs', 'DDD') + self._makeFile( + 2, 3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long + '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n' # noqa: E501 line too long + '/backup/2010-05-14-08-09-10.deltafs 9 12 45054f47ac3305a2a33e9bcceadff712\n') # noqa: E501 line too long + os.unlink( + os.path.join(self._repository_directory, + '2010-05-14-04-05-06.deltafs')) + self._callFUT(options) + self.assertNotIn('falling back to a full recover.', + sys.stderr.getvalue()) + self.assertEqual(_read_file(output), b'AAABBBCCCDDD') + self.assertFalse(os.path.exists(output + '.part')) + def test_w_incr_backup_with_verify_sum_inconsistent(self): import tempfile dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') @@ -1192,14 +1223,14 @@ def test_w_incr_backup_switch_auto_to_full_recover_if_chunk_is_wrong(self): # n self._makeFile(6, 7, 8, '.deltafs', 'CCC') self._makeFile( 2, 3, 4, '.dat', - '/backup/2010-05-14-02-03-04.fs 0 3 
e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long - '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a4\n' # noqa: E501 line too long + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b8\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n') # noqa: E501 line too long self._callFUT(options) self.assertEqual(_read_file(output), b'AAABBBCCC') self.assertFalse(os.path.exists(output + '.part')) self.assertIn( - "Last whole common chunk checksum did not match with backup, falling back to a full recover.", # noqa: E501 line too long + "Target file is not consistent with backup /backup/2010-05-14-02-03-04.fs, falling back to a full recover.", # noqa: E501 line too long sys.stderr.getvalue()) def test_w_incr_backup_switch_auto_to_full_recover_after_pack(self): @@ -1226,9 +1257,55 @@ def test_w_incr_backup_switch_auto_to_full_recover_after_pack(self): self.assertEqual(_read_file(output), b'CCDD') self.assertFalse(os.path.exists(output + '.part')) self.assertIn( - 'Target file is larger than latest backup, falling back to a full recover.', # noqa: E501 line too long + "Target file is not consistent with backup /backup/2010-05-14-06-07-08.fs, falling back to a full recover.", # noqa: E501 line too long sys.stderr.getvalue()) + def test_w_quick_incr_backup_switch_auto_to_full_recover_if_last_chunk_is_wrong(self): # noqa: E501 line too long + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + quick=True, + withverify=False) + self._createRecoveredDataFS(output, options) + self._makeFile(6, 7, 8, '.deltafs', 'CCC') + self._makeFile( + 2, 3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a4\n' # noqa: E501 line too long + '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n') # noqa: E501 line too long + self._callFUT(options) + self.assertEqual(_read_file(output), b'AAABBBCCC') + self.assertFalse(os.path.exists(output + '.part')) + self.assertIn( + "Target file is not consistent with backup /backup/2010-05-14-04-05-06.deltafs, falling back to a full recover.", # noqa: E501 line too long + sys.stderr.getvalue()) + + def test_w_quick_incr_backup_dont_see_old_inconsistencies(self): + import tempfile + dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-') + output = os.path.join(dd, 'Data.fs') + options = self._makeOptions(date='2010-05-15-13-30-57', + output=output, + quick=True, + withverify=False) + self._createRecoveredDataFS(output, options) + self._makeFile(6, 7, 8, '.deltafs', 'CCC') + self._makeFile( + 2, 3, 4, '.dat', + '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long + '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long + '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n') # noqa: E501 line too long + # The ZODB is longer than announced in the .dat file + with open(output, 'r+b') as f: + f.write(b'ZZZBBBCCC') + self._callFUT(options) + self.assertEqual(_read_file(output), b'ZZZBBBCCC') + self.assertFalse(os.path.exists(output + '.part')) + 
self.assertNotIn(
+            "falling back to a full recover", sys.stderr.getvalue())
+

 class Test_do_verify(OptionsTestBase, unittest.TestCase):

From 93a1e0653873067db46cc675c548750310189576 Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant
Date: Fri, 15 Nov 2024 16:25:09 +0900
Subject: [PATCH 4/5] repozo: be more explicit about the FileStorage size
 retrieval methodology

---
 src/ZODB/scripts/repozo.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
index 8240bebc7..aaf5620a6 100755
--- a/src/ZODB/scripts/repozo.py
+++ b/src/ZODB/scripts/repozo.py
@@ -760,7 +760,10 @@ def do_incremental_recover(options, repofiles):
     datfile = os.path.splitext(repofiles[0])[0] + '.dat'
     log('Recovering (incrementally) file to %s', options.output)
     with open(options.output, 'r+b') as outfp:
-        outfp.seek(0, 2)
+        # Note that we do not open the FileStorage to use getSize here,
+        # we really want the actual file size, even if there is invalid
+        # transaction data at the end.
+        outfp.seek(0, os.SEEK_END)
         initial_length = outfp.tell()
 

From 3b4abd939171fc5af92de3016f66ce20f55e5a2f Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant
Date: Fri, 15 Nov 2024 16:53:04 +0900
Subject: [PATCH 5/5] repozo: log files that will be used by the incremental
 recovery

As find_files logs all the files needed to recover the state from
scratch, add a log showing which files will actually be used in the
case of an incremental recovery.
---
 src/ZODB/scripts/repozo.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
index aaf5620a6..015418c12 100755
--- a/src/ZODB/scripts/repozo.py
+++ b/src/ZODB/scripts/repozo.py
@@ -814,6 +814,9 @@ def do_incremental_recover(options, repofiles):
     first_file_to_restore = repofiles.index(filename)
     assert first_file_to_restore > 0, (
         first_file_to_restore, options.repository, fn, filename, repofiles)
+    log('remaining files needed to recover incrementally onto target file:')
+    for f in repofiles[first_file_to_restore:]:
+        log('\t%s', f)
 
     temporary_output_file = options.output + '.part'
     os.rename(options.output, temporary_output_file)
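
For reference, the quick check introduced in PATCH 3 boils down to hashing
only the byte range of the last chunk recorded in the .dat file that is
already present in the recovered file, instead of re-reading the whole
output (a minimal sketch with a made-up helper name; it reads the range in
one call, whereas repozo's own checksum() helper reads in fixed-size
blocks):

    import hashlib

    def last_chunk_matches(output_path, chunk):
        # chunk is one parsed .dat entry: (filename, startpos, endpos, md5)
        fn, startpos, endpos, expected_md5 = chunk
        with open(output_path, 'rb') as fp:
            fp.seek(startpos)
            data = fp.read(endpos - startpos)
        return hashlib.md5(data).hexdigest() == expected_md5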