diff --git a/src/borg/archive.py b/src/borg/archive.py
index 9807bfc605..e1cb7f4530 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -720,76 +720,38 @@ def extract_helper(self, item, path, hlm, *, dry_run=False):
         pass
 
     def compare_and_extract_chunks(self, item, fs_path):
-        print(f"Initial fs_path: {fs_path}")
-        print(f"self.cwd: {self.cwd}")
-        if fs_path.startswith(self.cwd):
-            fs_path = fs_path[len(self.cwd) :].lstrip(os.sep)
-            print(f"Relative fs_path: {fs_path}")
-
-        # Construct the final path
-        fs_path = os.path.normpath(os.path.join(self.cwd, fs_path))
-        print(f"Final fs_path: {fs_path}")
-        print(f"File exists at final path: {os.path.isfile(fs_path)}")
-
-        os.makedirs(os.path.dirname(fs_path), exist_ok=True)
+        """Patch differing chunks of fs_path in place; return True if patched, False to request a full extraction."""
         try:
-            if os.path.isfile(fs_path):
-                with open(fs_path, "rb+") as fs_file:
-                    chunk_offset = 0
-                    for chunk_entry in item.chunks:
-                        chunkid_A = chunk_entry.id
-                        size = chunk_entry.size
-                        print(f"Processing chunk at offset {chunk_offset}")
+            st = os.stat(fs_path, follow_symlinks=False)
+            if not stat.S_ISREG(st.st_mode):
+                return False  # only regular files can be patched in place
 
-                        fs_file.seek(chunk_offset)
-                        data_F = fs_file.read(size)
-                        print(f"Read {len(data_F)} bytes at offset {chunk_offset}")
-                        print(f"File content: {data_F[:20]}...")  # Show first 20 bytes
-
-                        if len(data_F) == size:
-                            chunkid_F = self.key.id_hash(data_F)
-                            print("Comparing hashes:")  # Debug
-                            print(f"Archive hash: {chunkid_A.hex()}")  # Debug
-                            print(f"File hash: {chunkid_F.hex()}")  # Debug
-                            print(f"Hashes match? {chunkid_A == chunkid_F}")
-                            if chunkid_A != chunkid_F:
-                                print("Hashes don't match, fetching new chunk")  # Debug
-                                fs_file.seek(chunk_offset)  # Go back to the start of the chunk
-                                chunk_data = b"".join(self.pipeline.fetch_many([chunkid_A], ro_type=ROBJ_FILE_STREAM))
-                                print(f"Fetched content: {chunk_data[:20]}...")
-                                fs_file.write(chunk_data)
-                                fs_file.flush()
-                                print("Wrote and flushed new chunk data")
-                        else:
-                            print(f"Chunk size mismatch at offset {chunk_offset}")
-                            fs_file.seek(chunk_offset)
-                            chunk_data = b"".join(self.pipeline.fetch_many([chunkid_A], ro_type=ROBJ_FILE_STREAM))
-                            fs_file.write(chunk_data)
+            with open(fs_path, "rb+") as fs_file:
+                chunk_offset = 0
+                for chunk_entry in item.chunks:
+                    chunkid_A = chunk_entry.id
+                    size = chunk_entry.size
 
-                        chunk_offset += size
+                    fs_file.seek(chunk_offset)
+                    data_F = fs_file.read(size)
 
-                    fs_file.truncate(item.size)
-                    print(f"\nFinal file size: {os.path.getsize(fs_path)}")
-                    with open(fs_path, "rb") as f:
-                        print(f"Final content: {f.read()[:20]}...")
-            else:
-                with open(fs_path, "wb") as fs_file:
-                    for chunk_entry in item.chunks:
-                        chunk_data = b"".join(self.pipeline.fetch_many([chunk_entry.id], ro_type=ROBJ_FILE_STREAM))
+                    needs_update = True  # assume a refetch; cleared below if the on-disk chunk matches
+                    if len(data_F) == size:
+                        chunkid_F = self.key.id_hash(data_F)
+                        needs_update = chunkid_A != chunkid_F
+
+                    if needs_update:
+                        chunk_data = b"".join(self.pipeline.fetch_many([chunkid_A], ro_type=ROBJ_FILE_STREAM))
+                        fs_file.seek(chunk_offset)
                         fs_file.write(chunk_data)
 
-                    fs_file.truncate(item.size)
-                    with open(fs_path, "rb") as fs_file:
-                        preview = fs_file.read(50)
-                        print(f"Final file size: {os.path.getsize(fs_path)}, Expected: {item.size}")
-                        print(f"Content preview (text): {preview.decode('utf-8', errors='replace')}")
+                    chunk_offset += size
 
-        except OSError as e:
-            print(f"IO error processing {fs_path}: {e}")
-            raise
-        except Exception as e:
-            print(f"Error processing {fs_path}: {str(e)}")
-            raise
+                fs_file.truncate(item.size)  # drop any data beyond the archived size
+                return True
+
+        except OSError:
+            return False  # fall back to a regular full extraction
 
     def extract_item(
         self,
@@ -802,7 +764,6 @@ def extract_item(
         hlm=None,
         pi=None,
         continue_extraction=False,
-        check_existing=False,
     ):
         """
         Extract archive item.
@@ -815,7 +776,6 @@ def extract_item(
         :param hlm: maps hlid to link_target for extracting subtrees with hardlinks correctly
         :param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes)
         :param continue_extraction: continue a previously interrupted extraction of same archive
-        :param check_existing: check against existing file/block device and only retrieve changed data
         """
 
         def same_item(item, st):
@@ -836,16 +796,6 @@ def same_item(item, st):
             # if a previous extraction was interrupted between setting the mtime and setting non-default flags.
             return True
 
-        if check_existing:
-            dest = os.path.normpath(self.cwd)
-            fs_path = os.path.join(dest, item.path)
-
-            if not os.path.normpath(fs_path).startswith(dest):
-                raise Exception(f"Path {fs_path} is outside of extraction directory {dest}")
-
-            self.compare_and_extract_chunks(item, fs_path)
-            return
-
         has_damaged_chunks = "chunks_healthy" in item
         if dry_run or stdout:
             with self.extract_helper(item, "", hlm, dry_run=dry_run or stdout) as hardlink_set:
@@ -905,6 +855,9 @@ def make_parent(path):
             with self.extract_helper(item, path, hlm) as hardlink_set:
                 if hardlink_set:
                     return
+                if self.compare_and_extract_chunks(item, path):
+                    return
+
                 with backup_io("open"):
                     fd = open(path, "wb")
                 with fd:
diff --git a/src/borg/archiver/extract_cmd.py b/src/borg/archiver/extract_cmd.py
index db71220c82..c7fc4b08c7 100644
--- a/src/borg/archiver/extract_cmd.py
+++ b/src/borg/archiver/extract_cmd.py
@@ -44,7 +44,6 @@ def do_extract(self, args, repository, manifest, archive):
         sparse = args.sparse
         strip_components = args.strip_components
         continue_extraction = args.continue_extraction
-        check_existing = args.check_existing
 
         dirs = []
         hlm = HardLinkManager(id_type=bytes, info_type=str)  # hlid -> path
@@ -97,7 +96,6 @@ def do_extract(self, args, repository, manifest, archive):
                         hlm=hlm,
                         pi=pi,
                         continue_extraction=continue_extraction,
-                        check_existing=check_existing,
                     )
                 except BackupError as e:
                     self.print_warning_instance(BackupWarning(remove_surrogates(orig_path), e))
@@ -194,12 +192,6 @@ def build_parser_extract(self, subparsers, common_parser, mid_common_parser):
            action="store_true",
            help="continue a previously interrupted extraction of same archive",
        )
-        subparser.add_argument(
-            "--check-existing",
-            dest="check_existing",
-            action="store_true",
-            help="check against existing file/block device and only retrieve changed data",
-        )
        subparser.add_argument("name", metavar="NAME", type=archivename_validator, help="specify the archive name")
        subparser.add_argument(
            "paths", metavar="PATH", nargs="*", type=PathSpec, help="paths to extract; patterns are supported"
diff --git a/src/borg/testsuite/archive_test.py b/src/borg/testsuite/archive_test.py
index 53b75be7af..3304141495 100644
--- a/src/borg/testsuite/archive_test.py
+++ b/src/borg/testsuite/archive_test.py
@@ -409,132 +409,152 @@ def test_reject_non_sanitized_item():
         Item(path=path, user="root", group="root")
 
 
-def test_compare_and_extract_chunks(tmpdir, monkeypatch):
-    """Test chunk comparison and selective extraction with fixed-size chunks"""
-    # Setup mock repository and key
-    repository = Mock()
-    key = PlaintextKey(repository)
-    manifest = Manifest(key, repository)
-
-    cache = MockCache()
-
-    # Create a test file with known content divided into 512-byte chunks
-    chunk_size = 512
-    test_data = 
b"block" * 128 # 640 bytes - will create 2 chunks - original_file = tmpdir.join("test.txt") - original_file.write_binary(test_data) - - # Create mock item with chunks - chunks = [] - for i in range(0, len(test_data), chunk_size): - chunk_data = test_data[i : i + chunk_size] - chunk_id = key.id_hash(chunk_data) - chunks.append(Mock(id=chunk_id, size=len(chunk_data))) - cache.objects[chunk_id] = chunk_data - - item = Mock(chunks=chunks, size=len(test_data)) - - # Test case 1: File doesn't exist (full extraction) - extractor = Archive(manifest=manifest, name="test", create=True) - extractor.pipeline = cache - extractor.key = key - extractor.cwd = str(tmpdir) - - target_path = str(tmpdir.join("extracted.txt")) - extractor.compare_and_extract_chunks(item, target_path) - - with open(target_path, "rb") as f: - assert f.read() == test_data - - # Test case 2: File exists with partially matching chunks - modified_data = test_data[:256] + b"modified" + test_data[264:] - with open(target_path, "wb") as f: - f.write(modified_data) - - extractor.compare_and_extract_chunks(item, target_path) - - with open(target_path, "rb") as f: - extracted = f.read() - assert extracted == test_data - assert extracted != modified_data - - # Test case 3: File exists with all matching chunks - extractor.compare_and_extract_chunks(item, target_path) - with open(target_path, "rb") as f: - assert f.read() == test_data +@pytest.fixture +def setup_extractor(tmpdir): + """Setup common test infrastructure""" + class MockCache: + def __init__(self): + self.objects = {} -def test_compare_and_extract_chunks_size_mismatch(tmpdir): - """Test chunk comparison when file size doesn't match chunk size""" repository = Mock() key = PlaintextKey(repository) manifest = Manifest(key, repository) - cache = MockCache() - # Create a smaller file than expected - test_data = b"block" * 64 # 320 bytes - expected_data = b"block" * 128 # 640 bytes - - original_file = tmpdir.join("test.txt") - original_file.write_binary(test_data) - - # Create mock item with chunks expecting larger size - chunks = [] - for i in range(0, len(expected_data), 512): - chunk_data = expected_data[i : i + 512] - chunk_id = key.id_hash(chunk_data) - chunks.append(Mock(id=chunk_id, size=len(chunk_data))) - cache.objects[chunk_id] = chunk_data - - item = Mock(chunks=chunks, size=len(expected_data)) - - # Test extraction extractor = Archive(manifest=manifest, name="test", create=True) extractor.pipeline = cache extractor.key = key extractor.cwd = str(tmpdir) - target_path = str(original_file) - extractor.compare_and_extract_chunks(item, target_path) + # Track fetched chunks across tests + fetched_chunks = [] - with open(target_path, "rb") as f: - assert f.read() == expected_data + def create_mock_chunks(test_data, chunk_size=512): + """Helper function to create mock chunks from test data""" + chunks = [] + for i in range(0, len(test_data), chunk_size): + chunk_data = test_data[i : i + chunk_size] + chunk_id = key.id_hash(chunk_data) + chunks.append(Mock(id=chunk_id, size=len(chunk_data))) + cache.objects[chunk_id] = chunk_data + item = Mock(chunks=chunks, size=len(test_data)) + target_path = str(tmpdir.join("test.txt")) + return item, target_path -def test_compare_and_extract_chunks_partial_chunk(tmpdir): - """Test chunk comparison with a final partial chunk""" - repository = Mock() - key = PlaintextKey(repository) - manifest = Manifest(key, repository) + def mock_fetch_many(chunk_ids, ro_type): + """Helper function to track and mock chunk fetching""" + 
fetched_chunks.extend(chunk_ids) + return [cache.objects[chunk_id] for chunk_id in chunk_ids] - cache = MockCache() + def clear_fetched_chunks(): + """Helper function to clear tracked chunks between tests""" + fetched_chunks.clear() - # Create data that doesn't align with chunk boundaries - chunk_size = 512 - test_data = b"block" * 130 # 650 bytes - will create 2 chunks, second one partial + def get_fetched_chunks(): + """Helper function to get the list of fetched chunks""" + return fetched_chunks - original_file = tmpdir.join("test.txt") - original_file.write_binary(test_data) + cache.fetch_many = mock_fetch_many - # Create mock item with chunks - chunks = [] - for i in range(0, len(test_data), chunk_size): - chunk_data = test_data[i : i + chunk_size] - chunk_id = key.id_hash(chunk_data) - chunks.append(Mock(id=chunk_id, size=len(chunk_data))) - cache.objects[chunk_id] = chunk_data + return extractor, key, cache, tmpdir, create_mock_chunks, get_fetched_chunks, clear_fetched_chunks - item = Mock(chunks=chunks, size=len(test_data)) - # Test extraction - extractor = Archive(manifest=manifest, name="test", create=True) - extractor.pipeline = cache - extractor.key = key - extractor.cwd = str(tmpdir) +@pytest.mark.parametrize( + "name, test_data, initial_data, expected_fetched_chunks, expected_success", + [ + ( + "no_changes", + b"A" * 512, # One complete chunk, no changes needed + b"A" * 512, # Identical content + 0, # No chunks should be fetched + True, + ), + ( + "single_chunk_change", + b"A" * 512 + b"B" * 512, # Two chunks + b"A" * 512 + b"X" * 512, # Second chunk different + 1, # Only second chunk should be fetched + True, + ), + ( + "cross_boundary_change", + b"A" * 512 + b"B" * 512, # Two chunks + b"A" * 500 + b"X" * 24, # Change crosses chunk boundary + 2, # Both chunks need update + True, + ), + ( + "exact_multiple_chunks", + b"A" * 512 + b"B" * 512 + b"C" * 512, # Three complete chunks + b"A" * 512 + b"X" * 512 + b"C" * 512, # Middle chunk different + 1, # Only middle chunk fetched + True, + ), + ( + "first_chunk_change", + b"A" * 512 + b"B" * 512, # Two chunks + b"X" * 512 + b"B" * 512, # First chunk different + 1, # Only first chunk should be fetched + True, + ), + ( + "all_chunks_different", + b"A" * 512 + b"B" * 512, # Two chunks + b"X" * 512 + b"Y" * 512, # Both chunks different + 2, # Both chunks should be fetched + True, + ), + ( + "partial_last_chunk", + b"A" * 512 + b"B" * 100, # One full chunk + partial + b"A" * 512 + b"X" * 100, # Partial chunk different + 1, # Only second chunk should be fetched + True, + ), + ], +) +def test_compare_and_extract_chunks( + setup_extractor, name, test_data, initial_data, expected_fetched_chunks, expected_success +): + """Test chunk comparison and extraction""" + extractor, key, cache, tmpdir, create_mock_chunks, get_fetched_chunks, clear_fetched_chunks = setup_extractor + clear_fetched_chunks() - target_path = str(tmpdir.join("extracted.txt")) - extractor.compare_and_extract_chunks(item, target_path) + item, target_path = create_mock_chunks(test_data, chunk_size=512) - with open(target_path, "rb") as f: - assert f.read() == test_data + original_chunk_ids = [chunk.id for chunk in item.chunks] + + # Write initial file state + with open(target_path, "wb") as f: + f.write(initial_data) + + result = extractor.compare_and_extract_chunks(item, target_path) + assert result == expected_success + + if expected_success: + # Verify only the expected chunks were fetched + fetched_chunks = get_fetched_chunks() + assert ( + len(fetched_chunks) == 
expected_fetched_chunks + ), f"Expected {expected_fetched_chunks} chunks to be fetched, got {len(fetched_chunks)}" + + # For single chunk changes, verify it's the correct chunk + if expected_fetched_chunks == 1: + # Find which chunk should have changed by comparing initial_data with test_data + for i, (orig_chunk, mod_chunk) in enumerate( + zip( + [test_data[i : i + 512] for i in range(0, len(test_data), 512)], + [initial_data[i : i + 512] for i in range(0, len(initial_data), 512)], + ) + ): + if orig_chunk != mod_chunk: + assert ( + fetched_chunks[0] == original_chunk_ids[i] + ), f"Wrong chunk fetched. Expected chunk at position {i}" + break + + # Verify final content + with open(target_path, "rb") as f: + assert f.read() == test_data
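

The patching logic in compare_and_extract_chunks follows one simple pattern: read the byte range that should hold each archived chunk, hash it with the archive's chunk id hash, and rewrite only the ranges that do not match. As a minimal standalone sketch of that pattern (not borg code: fetch_chunk() and chunk_hash() are hypothetical callables standing in for repository access and self.key.id_hash()), it might look like this:

    def patch_in_place(path, chunks, file_size, fetch_chunk, chunk_hash):
        """chunks is a list of (chunk_id, size) pairs in file order."""
        with open(path, "rb+") as f:
            offset = 0
            for chunk_id, size in chunks:
                f.seek(offset)
                data = f.read(size)
                # short read or hash mismatch -> rewrite just this range
                if len(data) != size or chunk_hash(data) != chunk_id:
                    f.seek(offset)
                    f.write(fetch_chunk(chunk_id))
                offset += size
            f.truncate(file_size)  # drop trailing bytes if the old file was longer

Unchanged ranges are never written, so extracting over a mostly identical file only transfers the chunks that actually differ.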