diff --git a/metamorphic/ops.go b/metamorphic/ops.go index ae9f9643aa..2be5f6b714 100644 --- a/metamorphic/ops.go +++ b/metamorphic/ops.go @@ -652,10 +652,11 @@ func buildForIngest( equal := t.opts.Comparer.Equal tableFormat := db.FormatMajorVersion().MaxTableFormat() - w := sstable.NewWriter( - objstorageprovider.NewFileWritable(f), - t.opts.MakeWriterOptions(0, tableFormat), - ) + wOpts := t.opts.MakeWriterOptions(0, tableFormat) + if t.testOpts.disableValueBlocksForIngestSSTables { + wOpts.DisableValueBlocks = true + } + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), wOpts) var lastUserKey []byte for key, value := iter.First(); key != nil; key, value = iter.Next() { diff --git a/metamorphic/options.go b/metamorphic/options.go index d6f08f4ca4..9f2fa46673 100644 --- a/metamorphic/options.go +++ b/metamorphic/options.go @@ -95,6 +95,9 @@ func parseOptions( opts.enableValueBlocks = true opts.Opts.Experimental.EnableValueBlocks = func() bool { return true } return true + case "TestOptions.disable_value_blocks_for_ingest_sstables": + opts.disableValueBlocksForIngestSSTables = true + return true case "TestOptions.async_apply_to_db": opts.asyncApplyToDB = true return true @@ -188,6 +191,9 @@ func optionsToString(opts *TestOptions) string { if opts.enableValueBlocks { fmt.Fprintf(&buf, " enable_value_blocks=%t\n", opts.enableValueBlocks) } + if opts.disableValueBlocksForIngestSSTables { + fmt.Fprintf(&buf, " disable_value_blocks_for_ingest_sstables=%t\n", opts.disableValueBlocksForIngestSSTables) + } if opts.asyncApplyToDB { fmt.Fprint(&buf, " async_apply_to_db=true\n") } @@ -287,6 +293,8 @@ type TestOptions struct { disableBlockPropertyCollector bool // Enable the use of value blocks. enableValueBlocks bool + // Disables value blocks in the sstables written for ingest. + disableValueBlocksForIngestSSTables bool // Use DB.ApplyNoSyncWait for applies that want to sync the WAL. asyncApplyToDB bool // Enable the use of shared storage. @@ -618,6 +626,7 @@ func RandomOptions( if testOpts.enableValueBlocks { testOpts.Opts.Experimental.EnableValueBlocks = func() bool { return true } } + testOpts.disableValueBlocksForIngestSSTables = rng.Intn(2) == 0 testOpts.asyncApplyToDB = rng.Intn(2) != 0 // 20% of time, enable shared storage. if rng.Intn(5) == 0 { diff --git a/sstable/options.go b/sstable/options.go index a198926a76..8d88a32deb 100644 --- a/sstable/options.go +++ b/sstable/options.go @@ -219,6 +219,17 @@ type WriterOptions struct { // RequiredInPlaceValueBound mirrors // Options.Experimental.RequiredInPlaceValueBound. RequiredInPlaceValueBound UserKeyPrefixBound + + // DisableValueBlocks is only used for TableFormat >= TableFormatPebblev3, + // and if set to true, does not write any values to value blocks. This is + // only intended for cases where the in-memory buffering of all value blocks + // while writing a sstable is too expensive and likely to cause an OOM. It + // is never set to true by a Pebble DB, and can be set to true when some + // external code is directly generating huge sstables using Pebble's + // sstable.Writer (for example, CockroachDB backups can sometimes write + // 750MB sstables -- see + // https://github.com/cockroachdb/cockroach/issues/117113). + DisableValueBlocks bool } func (o WriterOptions) ensureDefaults() WriterOptions { diff --git a/sstable/testdata/writer_value_blocks b/sstable/testdata/writer_value_blocks index e98a4b5527..9461626665 100644 --- a/sstable/testdata/writer_value_blocks +++ b/sstable/testdata/writer_value_blocks @@ -34,6 +34,32 @@ scan-cloned-lazy-values 3(in-place: len 4): bat3 4(lazy: len 5, attr: 5): vbat2 +# Same data as previous, with disable-value-blocks set to true +build disable-value-blocks=true +a@2.SET.1:a2 +b@5.SET.7:b5 +b@4.DEL.3: +b@3.SET.2:bat3 +b@2.SET.1:vbat2 +---- +value-blocks: num-values 0, num-blocks: 0, size: 0 + +scan-raw +---- +a@2#1,1:in-place a2, same-pre false +b@5#7,1:in-place b5, same-pre false +b@4#3,0: +b@3#2,1:in-place bat3, same-pre false +b@2#1,1:in-place vbat2, same-pre true + +scan +---- +a@2#1,1:a2 +b@5#7,1:b5 +b@4#3,0: +b@3#2,1:bat3 +b@2#1,1:vbat2 + # Size of value index is 3 bytes plus 5 + 5 = 10 bytes of trailer of the value # block and value index block. So size 33 - 13 = 20 is the total size of the # values in the value block. diff --git a/sstable/writer.go b/sstable/writer.go index 46dbf350ba..0b2a193d9f 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -207,7 +207,9 @@ type Writer struct { // For value blocks. shortAttributeExtractor base.ShortAttributeExtractor requiredInPlaceValueBound UserKeyPrefixBound - valueBlockWriter *valueBlockWriter + // When w.tableFormat >= TableFormatPebblev3, valueBlockWriter is nil iff + // WriterOptions.DisableValueBlocks was true. + valueBlockWriter *valueBlockWriter } type pointKeyInfo struct { @@ -803,6 +805,7 @@ func (w *Writer) getLastPointUserKey() []byte { return w.dataBlockBuf.dataBlock.getCurUserKey() } +// REQUIRES: w.tableFormat >= TableFormatPebblev3 func (w *Writer) makeAddPointDecisionV3( key InternalKey, valueLen int, ) (setHasSamePrefix bool, writeToValueBlock bool, isObsolete bool, err error) { @@ -912,14 +915,13 @@ func (w *Writer) makeAddPointDecisionV3( // NB: it is possible that cmpUser == 0, i.e., these two SETs have identical // user keys (because of an open snapshot). This should be the rare case. setHasSamePrefix = cmpPrefix == 0 - considerWriteToValueBlock = setHasSamePrefix // Use of 0 here is somewhat arbitrary. Given the minimum 3 byte encoding of // valueHandle, this should be > 3. But tiny values are common in test and // unlikely in production, so we use 0 here for better test coverage. const tinyValueThreshold = 0 - if considerWriteToValueBlock && valueLen <= tinyValueThreshold { - considerWriteToValueBlock = false - } + // NB: setting WriterOptions.DisableValueBlocks does not disable the + // setHasSamePrefix optimization. + considerWriteToValueBlock = setHasSamePrefix && valueLen > tinyValueThreshold && w.valueBlockWriter != nil return setHasSamePrefix, considerWriteToValueBlock, isObsolete, nil } @@ -931,7 +933,7 @@ func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) err var setHasSameKeyPrefix, writeToValueBlock, addPrefixToValueStoredWithKey bool var isObsolete bool maxSharedKeyLen := len(key.UserKey) - if w.valueBlockWriter != nil { + if w.tableFormat >= TableFormatPebblev3 { // maxSharedKeyLen is limited to the prefix of the preceding key. If the // preceding key was in a different block, then the blockWriter will // ignore this maxSharedKeyLen. @@ -2218,10 +2220,12 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write if w.tableFormat >= TableFormatPebblev3 { w.shortAttributeExtractor = o.ShortAttributeExtractor w.requiredInPlaceValueBound = o.RequiredInPlaceValueBound - w.valueBlockWriter = newValueBlockWriter( - w.blockSize, w.blockSizeThreshold, w.compression, w.checksumType, func(compressedSize int) { - w.coordination.sizeEstimate.dataBlockCompressed(compressedSize, 0) - }) + if !o.DisableValueBlocks { + w.valueBlockWriter = newValueBlockWriter( + w.blockSize, w.blockSizeThreshold, w.compression, w.checksumType, func(compressedSize int) { + w.coordination.sizeEstimate.dataBlockCompressed(compressedSize, 0) + }) + } } w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType) diff --git a/sstable/writer_test.go b/sstable/writer_test.go index 64c1b231c9..60adf6d6a4 100644 --- a/sstable/writer_test.go +++ b/sstable/writer_test.go @@ -282,6 +282,10 @@ func TestWriterWithValueBlocks(t *testing.T) { inPlaceValueBound.Lower = []byte(l) inPlaceValueBound.Upper = []byte(u) } + var disableValueBlocks bool + if td.HasArg("disable-value-blocks") { + td.ScanArgs(t, "disable-value-blocks", &disableValueBlocks) + } meta, r, err = runBuildCmd(td, &WriterOptions{ BlockSize: blockSize, Comparer: testkeys.Comparer, @@ -289,6 +293,7 @@ func TestWriterWithValueBlocks(t *testing.T) { Parallelism: parallelism, RequiredInPlaceValueBound: inPlaceValueBound, ShortAttributeExtractor: attributeExtractor, + DisableValueBlocks: disableValueBlocks, }, 0) if err != nil { return err.Error()