Skip to content

Commit

Permalink
db: unconditionally interleave ignorable boundaries
Browse files Browse the repository at this point in the history
The levelIter has a concept of ignorable boundary keys. When a levelIter
exhausts a file's point keys, it's possible that the same file still contains
range deletions relevant to other levels of the LSM. The levelIter will, under
some conditions, interleave the largest boundary key of the sstable into
iteration as an 'ignorable boundary key,' so that the file's range deletions
remain accessible until all other levels progress beyond the file's boundary.

When block-property filters are in use, a file's point key iterator may become
exhausted early, before the file's range deletions are irrelevant, even if the
file's largest key is not a range deletion. To work around this subtlety, the
sstable iterator previously would surface knowledge of whether any point keys
may have been skipped through a MaybeFilteredKeys method. The levelIter used
this method to determine when to interleave an ignorable largest boundary in
case there may be additional relevant range deletions.

This commit removes the conditioning on the output of MaybeFilteredKeys,
instead unconditionally interleaving a synthetic boundary at a file's largest
point key if the levelIter user is using the file's range deletion iterator.
This simplifies the logic and removes a fragile reliance on the sstable
iterator's accounting of when it may have filtered keys.

Future work (cockroachdb#2863) will remove the need to interleave synthetic boundaries at
all, instead interleaving the range deletion bounds themselves among point
keys.

Informs cockroachdb#2863.
Closes cockroachdb#3334.
  • Loading branch information
jbowens committed Apr 1, 2024
1 parent eae0efc commit 1c35b03
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 66 deletions.
78 changes: 41 additions & 37 deletions level_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ func (m *simpleMergingIter) step() bool {
item := &m.heap.items[0]
l := &m.levels[item.index]
// Sentinels are not relevant for this point checking.
if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) {
if !l.isIgnorableBoundaryKey && !item.key.IsExclusiveSentinel() &&
item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) {
m.numPoints++
keyChanged := m.heap.cmp(item.key.UserKey, m.lastKey.UserKey) != 0
if !keyChanged {
Expand Down Expand Up @@ -223,46 +224,49 @@ func (m *simpleMergingIter) step() bool {
m.lastIterMsg = l.iter.String()

// Step to the next point.
if l.iterKey, l.iterValue = l.iter.Next(); l.iterKey != nil {
// Check point keys in an sstable are ordered. Although not required, we check
// for memtables as well. A subtle check here is that successive sstables of
// L1 and higher levels are ordered. This happens when levelIter moves to the
// next sstable in the level, in which case item.key is previous sstable's
// last point key.
if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 {
m.err = errors.Errorf("out of order keys %s >= %s in %s",
item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter)
return false
l.iterKey, l.iterValue = l.iter.Next()
if !l.isIgnorableBoundaryKey {
if l.iterKey != nil {
// Check point keys in an sstable are ordered. Although not required, we check
// for memtables as well. A subtle check here is that successive sstables of
// L1 and higher levels are ordered. This happens when levelIter moves to the
// next sstable in the level, in which case item.key is previous sstable's
// last point key.
if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 {
m.err = errors.Errorf("out of order keys %s >= %s in %s",
item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter)
return false
}
item.key.Trailer = l.iterKey.Trailer
item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...)
item.value = l.iterValue
if m.heap.len() > 1 {
m.heap.fix(0)
}
} else {
m.err = l.iter.Close()
l.iter = nil
m.heap.pop()
}
item.key.Trailer = l.iterKey.Trailer
item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...)
item.value = l.iterValue
if m.heap.len() > 1 {
m.heap.fix(0)
if m.err != nil {
return false
}
} else {
m.err = l.iter.Close()
l.iter = nil
m.heap.pop()
}
if m.err != nil {
return false
}
if m.heap.len() == 0 {
// Last record was a MERGE record.
if m.valueMerger != nil {
var closer io.Closer
_, closer, m.err = m.valueMerger.Finish(true /* includesBase */)
if m.err == nil && closer != nil {
m.err = closer.Close()
}
if m.err != nil {
m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s",
item.key.Pretty(m.formatKey), m.lastIterMsg)
if m.heap.len() == 0 {
// Last record was a MERGE record.
if m.valueMerger != nil {
var closer io.Closer
_, closer, m.err = m.valueMerger.Finish(true /* includesBase */)
if m.err == nil && closer != nil {
m.err = closer.Close()
}
if m.err != nil {
m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s",
item.key.Pretty(m.formatKey), m.lastIterMsg)
}
m.valueMerger = nil
}
m.valueMerger = nil
return false
}
return false
}
m.positionRangeDels()
return true
Expand Down
44 changes: 18 additions & 26 deletions level_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -1018,37 +1018,29 @@ func (l *levelIter) skipEmptyFileForward() (*InternalKey, base.LazyValue) {
// bounds.
return nil, base.LazyValue{}
}
// If the boundary is a range deletion tombstone, return that key.
if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete {
l.largestBoundary = &l.iterFile.LargestPointKey
if l.boundaryContext != nil {
l.boundaryContext.isIgnorableBoundaryKey = true
}
return l.largestBoundary, base.LazyValue{}
}
// If the last point iterator positioning op might've skipped keys,
// it's possible the file's range deletions are still relevant to
// other levels. Return the largest boundary as a special ignorable
// marker to avoid advancing to the next file.
//
// The sstable iterator cannot guarantee that keys were skipped. A
// SeekGE that lands on a index separator k only knows that the
// block at the index entry contains keys ≤ k. We can't know whether
// there were actually keys between the seek key and the index
// separator key. If the block is then excluded due to block
// property filters, the iterator does not know whether keys were
// actually skipped by the block's exclusion.
// If the boundary is a range deletion tombstone, or the caller is
// accessing range dels through l.rangeDelIterPtr, pause at an
// ignorable boundary key to avoid advancing to the next file until
// other levels are caught up.
//
// Since MaybeFilteredKeys cannot guarantee that keys were skipped,
// it's possible l.iterFile.Largest was already returned. Returning
// l.iterFile.Largest again is a violation of the strict
// Note that even if the largest boundary is not a range deletion,
// there may still be range deletions beyond the last point key
// returned. When block-property filters are in use, the sstable
// iterator may have transparently skipped a tail of the point keys
// in the file. If the last point key returned /was/ the largest
// key, then we'll return a key with the same user key and trailer
// twice. Returning it again is a violation of the strict
// monotonicity normally provided. The mergingIter's heap can
// tolerate this repeat key and in this case will keep the level at
// the top of the heap and immediately skip the entry, advancing to
// the next file.
if *l.rangeDelIterPtr != nil && l.filteredIter != nil &&
l.filteredIter.MaybeFilteredKeys() {
l.largestBoundary = &l.iterFile.Largest
//
// TODO(jackson): We should be able to condition this only on
// *l.rangeDelIterPtr != nil, but the getIter retains tombstones
// returned by the rangeDelIter after it's nil'd the ptr.
if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete ||
*l.rangeDelIterPtr != nil {
l.largestBoundary = &l.iterFile.LargestPointKey
if l.boundaryContext != nil {
l.boundaryContext.isIgnorableBoundaryKey = true
}
Expand Down
14 changes: 12 additions & 2 deletions merging_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -952,6 +952,17 @@ func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) {
if m.levels[item.index].isSyntheticIterBoundsKey {
break
}
// Skip ignorable boundary keys. These are not real keys and exist to
// keep sstables open until we've surpassed their end boundaries so that
// their range deletions are visible.
if m.levels[item.index].isIgnorableBoundaryKey {
m.err = m.prevEntry(item)
if m.err != nil {
return nil, base.LazyValue{}
}
continue
}

m.addItemStats(item)
if isDeleted, err := m.isPrevEntryDeleted(item); err != nil {
m.err = err
Expand All @@ -960,8 +971,7 @@ func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) {
m.stats.PointsCoveredByRangeTombstones++
continue
}
if item.iterKey.Visible(m.snapshot, m.batchSnapshot) &&
(!m.levels[item.index].isIgnorableBoundaryKey) {
if item.iterKey.Visible(m.snapshot, m.batchSnapshot) {
return item.iterKey, item.iterValue
}
m.err = m.prevEntry(item)
Expand Down
2 changes: 2 additions & 0 deletions testdata/level_iter_boundaries
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,10 @@ c.SET.2:c
iter
first
next
next
----
c#2,SET:c
c#2,SET:
.

iter
Expand Down
6 changes: 6 additions & 0 deletions testdata/level_iter_seek
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ next
stats
next
stats
next
stats
reset-stats
stats
----
Expand All @@ -117,6 +119,8 @@ c#7,SET:c
{BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
f#5,SET:f
{BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
f#5,SET:
{BlockBytes:56 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
g#4,SET:g
{BlockBytes:112 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
h#3,SET:h
Expand Down Expand Up @@ -380,8 +384,10 @@ j.SET.3:j
iter
seek-ge f
next
next
----
f/<invalid>#5,SET:f
f#5,SET:
i#4,SET:i

# The below count should be 2, as we skip over the rangekey-only file.
Expand Down
2 changes: 1 addition & 1 deletion testdata/merging_iter
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ c#27,SET:27
e#10,SET:10
g#20,SET:20
.
{BlockBytes:116 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:5 ValueBytes:8 PointCount:5 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
{BlockBytes:116 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:6 ValueBytes:8 PointCount:6 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}
{BlockBytes:0 BlockBytesInCache:0 BlockReadDuration:0s KeyBytes:0 ValueBytes:0 PointCount:0 PointsCoveredByRangeTombstones:0 SeparatedPointValue:{Count:0 ValueBytes:0 ValueBytesFetched:0}}

# seekGE() should not allow the rangedel to act on points in the lower sstable that are after it.
Expand Down

0 comments on commit 1c35b03

Please sign in to comment.