Speed up type literal lexing and make it more strict. (#4430)
This rejects type literals with more digits than we can lex without
APInt's help, using a custom diagnostic. This is a fairly arbitrary
implementation limit; I'm wide open to even stricter rules here.
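For context, the limit comes from how many decimal digits are guaranteed to round-trip through a 64-bit integer. A minimal standalone illustration (not the toolchain's actual code):

```cpp
#include <cassert>
#include <cstdint>
#include <limits>
#include <string_view>

// digits10 for int64_t is 18: every 18-digit decimal number fits in an
// int64_t, while some 19-digit numbers (and all 20-digit ones, such as
// 10000000000000000000) do not.
constexpr int kDigitLimit = std::numeric_limits<int64_t>::digits10;

// A bit-width suffix longer than the limit is rejected before any parsing.
auto WithinBitWidthDigitLimit(std::string_view suffix) -> bool {
  return static_cast<int>(suffix.size()) <= kDigitLimit;
}
```

This is why the new diagnostic below reports "the limit of 18" for the 20-digit widths in the test files.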

Despite no special casing and a very simplistic approach, avoiding
APInt completely eliminates the lexing overhead for `i32` in the
generated compilation benchmark, where that specific type literal is very
common. We see a roughly 10% improvement in lexing there:
```
BM_CompileAPIFileDenseDecls<Phase::Lex>/256        39.0µs ± 4%  34.8µs ± 2%  -10.86%  (p=0.000 n=19+20)
BM_CompileAPIFileDenseDecls<Phase::Lex>/1024        180µs ± 1%   158µs ± 2%  -12.22%  (p=0.000 n=18+20)
BM_CompileAPIFileDenseDecls<Phase::Lex>/4096        731µs ± 2%   641µs ± 1%  -12.31%  (p=0.000 n=18+20)
BM_CompileAPIFileDenseDecls<Phase::Lex>/16384      3.20ms ± 2%  2.86ms ± 2%  -10.47%  (p=0.000 n=18+19)
BM_CompileAPIFileDenseDecls<Phase::Lex>/65536      13.8ms ± 1%  12.4ms ± 2%   -9.78%  (p=0.000 n=18+19)
BM_CompileAPIFileDenseDecls<Phase::Lex>/262144     64.0ms ± 2%  58.4ms ± 2%   -8.70%  (p=0.000 n=19+18)
```

This starts to address a TODO in the diagnostics for these literals by
giving a reasonably good diagnostic for a very large type literal. In
practice, however, it regresses the diagnostics because error tokens
currently produce noisy extraneous diagnostics from parse and check. I'm
leaving the TODO in place, and I have a follow-up PR that starts improving
the extraneous diagnostics.
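The APInt-free parse described above amounts to a plain accumulate loop over the digits. A hedged sketch of the idea (the function name and shape here are illustrative, not the lexer's actual API):

```cpp
#include <cassert>
#include <cstdint>
#include <limits>
#include <optional>
#include <string_view>

// Parse a type literal's bit-width suffix ("32" in `i32`) without APInt.
// Mirrors the commit's approach: reject leading zeros, cap the digit count
// at digits10 of int64_t so overflow is impossible, then accumulate.
auto ParseBitWidth(std::string_view suffix) -> std::optional<int64_t> {
  if (suffix.empty() || suffix.front() < '1' || suffix.front() > '9') {
    return std::nullopt;  // Empty, leading zero, or non-digit start.
  }
  if (suffix.size() >
      static_cast<size_t>(std::numeric_limits<int64_t>::digits10)) {
    return std::nullopt;  // Too many digits; the real lexer diagnoses this.
  }
  int64_t value = 0;
  for (char c : suffix) {
    if (c < '0' || c > '9') {
      return std::nullopt;  // Not a type literal after all.
    }
    value = value * 10 + (c - '0');
  }
  return value;
}
```

In the real lexer the short-suffix case (1-3 digits) dominates, so this naive loop beats constructing an APInt even without any cleverness.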
chandlerc authored Oct 24, 2024
1 parent b67d031 commit 577fda1
Showing 6 changed files with 108 additions and 135 deletions.
126 changes: 20 additions & 106 deletions toolchain/check/testdata/basics/type_literals.carbon
@@ -39,10 +39,18 @@ var test_i15: i15;
// CHECK:STDERR:
var test_i1000000000: i1000000000;
// TODO: This diagnostic is not very good.
// CHECK:STDERR: fail_iN_bad_width.carbon:[[@LINE+4]]:33: error: integer literal with value 10000000000000000000 does not fit in i32 [IntLiteralTooLargeForI32]
// CHECK:STDERR: fail_iN_bad_width.carbon:[[@LINE+12]]:33: error: expected expression [ExpectedExpr]
// CHECK:STDERR: var test_i10000000000000000000: i10000000000000000000;
// CHECK:STDERR: ^~~~~~~~~~~~~~~~~~~~~
// CHECK:STDERR:
// CHECK:STDERR: fail_iN_bad_width.carbon:[[@LINE+8]]:33: error: semantics TODO: `HandleInvalidParse` [SemanticsTodo]
// CHECK:STDERR: var test_i10000000000000000000: i10000000000000000000;
// CHECK:STDERR: ^~~~~~~~~~~~~~~~~~~~~
// CHECK:STDERR:
// CHECK:STDERR: fail_iN_bad_width.carbon:[[@LINE+4]]:34: error: found a type literal with a bit width using 20 digits, which is greater than the limit of 18 [TooManyTypeBitWidthDigits]
// CHECK:STDERR: var test_i10000000000000000000: i10000000000000000000;
// CHECK:STDERR: ^
// CHECK:STDERR:
var test_i10000000000000000000: i10000000000000000000;

// --- uN.carbon
@@ -76,10 +84,18 @@ var test_u15: u15;
// CHECK:STDERR:
var test_u1000000000: u1000000000;
// TODO: This diagnostic is not very good.
// CHECK:STDERR: fail_uN_bad_width.carbon:[[@LINE+4]]:33: error: integer literal with value 10000000000000000000 does not fit in i32 [IntLiteralTooLargeForI32]
// CHECK:STDERR: fail_uN_bad_width.carbon:[[@LINE+12]]:33: error: expected expression [ExpectedExpr]
// CHECK:STDERR: var test_u10000000000000000000: u10000000000000000000;
// CHECK:STDERR: ^~~~~~~~~~~~~~~~~~~~~
// CHECK:STDERR:
// CHECK:STDERR: fail_uN_bad_width.carbon:[[@LINE+8]]:33: error: semantics TODO: `HandleInvalidParse` [SemanticsTodo]
// CHECK:STDERR: var test_u10000000000000000000: u10000000000000000000;
// CHECK:STDERR: ^~~~~~~~~~~~~~~~~~~~~
// CHECK:STDERR:
// CHECK:STDERR: fail_uN_bad_width.carbon:[[@LINE+4]]:34: error: found a type literal with a bit width using 20 digits, which is greater than the limit of 18 [TooManyTypeBitWidthDigits]
// CHECK:STDERR: var test_u10000000000000000000: u10000000000000000000;
// CHECK:STDERR: ^
// CHECK:STDERR:
var test_u10000000000000000000: u10000000000000000000;

// --- fail_fN_bad_width.carbon
@@ -183,58 +199,7 @@ var test_f128: f128;
// CHECK:STDOUT: %.6: i32 = int_literal 1000000000 [template]
// CHECK:STDOUT: }
// CHECK:STDOUT:
// CHECK:STDOUT: imports {
// CHECK:STDOUT: %Core: <namespace> = namespace file.%Core.import, [template] {
// CHECK:STDOUT: .Int = %import_ref
// CHECK:STDOUT: import Core//prelude
// CHECK:STDOUT: import Core//prelude/operators
// CHECK:STDOUT: import Core//prelude/types
// CHECK:STDOUT: import Core//prelude/operators/arithmetic
// CHECK:STDOUT: import Core//prelude/operators/as
// CHECK:STDOUT: import Core//prelude/operators/bitwise
// CHECK:STDOUT: import Core//prelude/operators/comparison
// CHECK:STDOUT: import Core//prelude/types/bool
// CHECK:STDOUT: }
// CHECK:STDOUT: %import_ref: %Int.type = import_ref Core//prelude/types, inst+29, loaded [template = constants.%Int]
// CHECK:STDOUT: }
// CHECK:STDOUT:
// CHECK:STDOUT: file {
// CHECK:STDOUT: package: <namespace> = namespace [template] {
// CHECK:STDOUT: .Core = imports.%Core
// CHECK:STDOUT: .test_i0 = %test_i0
// CHECK:STDOUT: .test_i1 = %test_i1
// CHECK:STDOUT: .test_i15 = %test_i15
// CHECK:STDOUT: .test_i1000000000 = %test_i1000000000
// CHECK:STDOUT: .test_i10000000000000000000 = %test_i10000000000000000000
// CHECK:STDOUT: }
// CHECK:STDOUT: %Core.import = import Core
// CHECK:STDOUT: %i0.ref: <error> = name_ref i0, <error> [template = <error>]
// CHECK:STDOUT: %test_i0.var: ref <error> = var test_i0
// CHECK:STDOUT: %test_i0: ref <error> = bind_name test_i0, %test_i0.var
// CHECK:STDOUT: %.loc12_14.1: i32 = int_literal 1 [template = constants.%.1]
// CHECK:STDOUT: %int.make_type_signed.loc12: init type = call constants.%Int(%.loc12_14.1) [template = constants.%.3]
// CHECK:STDOUT: %.loc12_14.2: type = value_of_initializer %int.make_type_signed.loc12 [template = constants.%.3]
// CHECK:STDOUT: %.loc12_14.3: type = converted %int.make_type_signed.loc12, %.loc12_14.2 [template = constants.%.3]
// CHECK:STDOUT: %test_i1.var: ref %.3 = var test_i1
// CHECK:STDOUT: %test_i1: ref %.3 = bind_name test_i1, %test_i1.var
// CHECK:STDOUT: %.loc17_15.1: i32 = int_literal 15 [template = constants.%.4]
// CHECK:STDOUT: %int.make_type_signed.loc17: init type = call constants.%Int(%.loc17_15.1) [template = constants.%.5]
// CHECK:STDOUT: %.loc17_15.2: type = value_of_initializer %int.make_type_signed.loc17 [template = constants.%.5]
// CHECK:STDOUT: %.loc17_15.3: type = converted %int.make_type_signed.loc17, %.loc17_15.2 [template = constants.%.5]
// CHECK:STDOUT: %test_i15.var: ref %.5 = var test_i15
// CHECK:STDOUT: %test_i15: ref %.5 = bind_name test_i15, %test_i15.var
// CHECK:STDOUT: %.loc22_23.1: i32 = int_literal 1000000000 [template = constants.%.6]
// CHECK:STDOUT: %int.make_type_signed.loc22: init type = call constants.%Int(%.loc22_23.1) [template = <error>]
// CHECK:STDOUT: %.loc22_23.2: type = value_of_initializer %int.make_type_signed.loc22 [template = <error>]
// CHECK:STDOUT: %.loc22_23.3: type = converted %int.make_type_signed.loc22, %.loc22_23.2 [template = <error>]
// CHECK:STDOUT: %test_i1000000000.var: ref <error> = var test_i1000000000
// CHECK:STDOUT: %test_i1000000000: ref <error> = bind_name test_i1000000000, %test_i1000000000.var
// CHECK:STDOUT: %int.make_type_signed.loc28: init type = call constants.%Int(<invalid>) [template = <error>]
// CHECK:STDOUT: %.loc28_33.1: type = value_of_initializer %int.make_type_signed.loc28 [template = <error>]
// CHECK:STDOUT: %.loc28_33.2: type = converted %int.make_type_signed.loc28, %.loc28_33.1 [template = <error>]
// CHECK:STDOUT: %test_i10000000000000000000.var: ref <error> = var test_i10000000000000000000
// CHECK:STDOUT: %test_i10000000000000000000: ref <error> = bind_name test_i10000000000000000000, %test_i10000000000000000000.var
// CHECK:STDOUT: }
// CHECK:STDOUT: file {}
// CHECK:STDOUT:
// CHECK:STDOUT: fn @Int(%size.param_patt: i32) -> type = "int.make_type_signed";
// CHECK:STDOUT:
@@ -310,58 +275,7 @@ var test_f128: f128;
// CHECK:STDOUT: %.6: i32 = int_literal 1000000000 [template]
// CHECK:STDOUT: }
// CHECK:STDOUT:
// CHECK:STDOUT: imports {
// CHECK:STDOUT: %Core: <namespace> = namespace file.%Core.import, [template] {
// CHECK:STDOUT: .UInt = %import_ref
// CHECK:STDOUT: import Core//prelude
// CHECK:STDOUT: import Core//prelude/operators
// CHECK:STDOUT: import Core//prelude/types
// CHECK:STDOUT: import Core//prelude/operators/arithmetic
// CHECK:STDOUT: import Core//prelude/operators/as
// CHECK:STDOUT: import Core//prelude/operators/bitwise
// CHECK:STDOUT: import Core//prelude/operators/comparison
// CHECK:STDOUT: import Core//prelude/types/bool
// CHECK:STDOUT: }
// CHECK:STDOUT: %import_ref: %UInt.type = import_ref Core//prelude/types, inst+43, loaded [template = constants.%UInt]
// CHECK:STDOUT: }
// CHECK:STDOUT:
// CHECK:STDOUT: file {
// CHECK:STDOUT: package: <namespace> = namespace [template] {
// CHECK:STDOUT: .Core = imports.%Core
// CHECK:STDOUT: .test_u0 = %test_u0
// CHECK:STDOUT: .test_u1 = %test_u1
// CHECK:STDOUT: .test_u15 = %test_u15
// CHECK:STDOUT: .test_u1000000000 = %test_u1000000000
// CHECK:STDOUT: .test_u10000000000000000000 = %test_u10000000000000000000
// CHECK:STDOUT: }
// CHECK:STDOUT: %Core.import = import Core
// CHECK:STDOUT: %u0.ref: <error> = name_ref u0, <error> [template = <error>]
// CHECK:STDOUT: %test_u0.var: ref <error> = var test_u0
// CHECK:STDOUT: %test_u0: ref <error> = bind_name test_u0, %test_u0.var
// CHECK:STDOUT: %.loc12_14.1: i32 = int_literal 1 [template = constants.%.1]
// CHECK:STDOUT: %int.make_type_unsigned.loc12: init type = call constants.%UInt(%.loc12_14.1) [template = constants.%.3]
// CHECK:STDOUT: %.loc12_14.2: type = value_of_initializer %int.make_type_unsigned.loc12 [template = constants.%.3]
// CHECK:STDOUT: %.loc12_14.3: type = converted %int.make_type_unsigned.loc12, %.loc12_14.2 [template = constants.%.3]
// CHECK:STDOUT: %test_u1.var: ref %.3 = var test_u1
// CHECK:STDOUT: %test_u1: ref %.3 = bind_name test_u1, %test_u1.var
// CHECK:STDOUT: %.loc17_15.1: i32 = int_literal 15 [template = constants.%.4]
// CHECK:STDOUT: %int.make_type_unsigned.loc17: init type = call constants.%UInt(%.loc17_15.1) [template = constants.%.5]
// CHECK:STDOUT: %.loc17_15.2: type = value_of_initializer %int.make_type_unsigned.loc17 [template = constants.%.5]
// CHECK:STDOUT: %.loc17_15.3: type = converted %int.make_type_unsigned.loc17, %.loc17_15.2 [template = constants.%.5]
// CHECK:STDOUT: %test_u15.var: ref %.5 = var test_u15
// CHECK:STDOUT: %test_u15: ref %.5 = bind_name test_u15, %test_u15.var
// CHECK:STDOUT: %.loc22_23.1: i32 = int_literal 1000000000 [template = constants.%.6]
// CHECK:STDOUT: %int.make_type_unsigned.loc22: init type = call constants.%UInt(%.loc22_23.1) [template = <error>]
// CHECK:STDOUT: %.loc22_23.2: type = value_of_initializer %int.make_type_unsigned.loc22 [template = <error>]
// CHECK:STDOUT: %.loc22_23.3: type = converted %int.make_type_unsigned.loc22, %.loc22_23.2 [template = <error>]
// CHECK:STDOUT: %test_u1000000000.var: ref <error> = var test_u1000000000
// CHECK:STDOUT: %test_u1000000000: ref <error> = bind_name test_u1000000000, %test_u1000000000.var
// CHECK:STDOUT: %int.make_type_unsigned.loc28: init type = call constants.%UInt(<invalid>) [template = <error>]
// CHECK:STDOUT: %.loc28_33.1: type = value_of_initializer %int.make_type_unsigned.loc28 [template = <error>]
// CHECK:STDOUT: %.loc28_33.2: type = converted %int.make_type_unsigned.loc28, %.loc28_33.1 [template = <error>]
// CHECK:STDOUT: %test_u10000000000000000000.var: ref <error> = var test_u10000000000000000000
// CHECK:STDOUT: %test_u10000000000000000000: ref <error> = bind_name test_u10000000000000000000, %test_u10000000000000000000.var
// CHECK:STDOUT: }
// CHECK:STDOUT: file {}
// CHECK:STDOUT:
// CHECK:STDOUT: fn @UInt(%size.param_patt: i32) -> type = "int.make_type_unsigned";
// CHECK:STDOUT:
1 change: 1 addition & 0 deletions toolchain/diagnostics/diagnostic_kind.def
@@ -42,6 +42,7 @@ CARBON_DIAGNOSTIC_KIND(MultiLineStringWithDoubleQuotes)
CARBON_DIAGNOSTIC_KIND(NoWhitespaceAfterCommentIntroducer)
CARBON_DIAGNOSTIC_KIND(TooManyDigits)
CARBON_DIAGNOSTIC_KIND(TooManyTokens)
CARBON_DIAGNOSTIC_KIND(TooManyTypeBitWidthDigits)
CARBON_DIAGNOSTIC_KIND(TrailingComment)
CARBON_DIAGNOSTIC_KIND(UnicodeEscapeMissingBracedDigits)
CARBON_DIAGNOSTIC_KIND(UnicodeEscapeSurrogate)
46 changes: 37 additions & 9 deletions toolchain/lex/lex.cpp
@@ -1166,10 +1166,6 @@ auto Lexer::LexWordAsTypeLiteralToken(llvm::StringRef word, int32_t byte_offset)
// Too short to form one of these tokens.
return LexResult::NoMatch();
}
if (word[1] < '1' || word[1] > '9') {
// Doesn't start with a valid initial digit.
return LexResult::NoMatch();
}

TokenKind kind;
switch (word.front()) {
@@ -1186,17 +1182,49 @@ auto Lexer::LexWordAsTypeLiteralToken(llvm::StringRef word, int32_t byte_offset)
return LexResult::NoMatch();
};

// The first character of the bit width must be a non-zero digit, which also
// rules out leading zeros.
if ('1' > word[1] || word[1] > '9') {
return LexResult::NoMatch();
}

llvm::StringRef suffix = word.substr(1);
if (!CanLexInt(emitter_, suffix)) {

// Type bit widths can't usefully be huge integers, so restrict them to
// values with few enough digits to round-trip through a normal integer
// variable, which also makes them especially easy to parse.
int64_t suffix_value;
constexpr ssize_t DigitLimit =
std::numeric_limits<decltype(suffix_value)>::digits10;
if (suffix.size() > DigitLimit) {
// See if this is not actually a type literal.
if (!llvm::all_of(suffix, IsDecimalDigit)) {
return LexResult::NoMatch();
}

// Otherwise, diagnose and produce an error token.
CARBON_DIAGNOSTIC(TooManyTypeBitWidthDigits, Error,
"found a type literal with a bit width using {0} digits, "
"which is greater than the limit of {1}",
size_t, size_t);
emitter_.Emit(word.begin() + 1, TooManyTypeBitWidthDigits, suffix.size(),
DigitLimit);
return LexTokenWithPayload(TokenKind::Error, word.size(), byte_offset);
}
llvm::APInt suffix_value;
if (suffix.getAsInteger(10, suffix_value)) {
return LexResult::NoMatch();

// It's tempting to do something more clever because we know the length ahead
// of time, but we expect these to be short (1-3 digits) and profiling doesn't
// show the loop as hot in the short cases.
suffix_value = suffix[0] - '0';
for (char c : suffix.drop_front()) {
if (!IsDecimalDigit(c)) {
return LexResult::NoMatch();
}
suffix_value = suffix_value * 10 + (c - '0');
}

return LexTokenWithPayload(
kind, buffer_.value_stores_->ints().Add(std::move(suffix_value)).index,
kind,
buffer_.value_stores_->ints().Add(llvm::APInt(64, suffix_value)).index,
byte_offset);
}

62 changes: 45 additions & 17 deletions toolchain/lex/tokenized_buffer_test.cpp
Expand Up @@ -7,6 +7,7 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <cmath>
#include <forward_list>
#include <iterator>

@@ -1023,27 +1024,54 @@ TEST_F(LexerTest, TypeLiterals) {
}

TEST_F(LexerTest, TypeLiteralTooManyDigits) {
// We increase the number of digits until the first count that is too large.
Testing::MockDiagnosticConsumer consumer;
EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
DiagnosticKind::TooManyTypeBitWidthDigits,
DiagnosticLevel::Error, 1, 2, _)));
std::string code = "i";
// A 128-bit APInt should be plenty large, but if needed in the future it can
// be widened without issue.
llvm::APInt bits = llvm::APInt::getZero(128);
for ([[maybe_unused]] int _ : llvm::seq(1, 30)) {
code.append("9");
bits = bits * 10 + 9;
auto [buffer, value_stores] =
compile_helper_.GetTokenizedBufferWithSharedValueStore(code, &consumer);
if (buffer.has_errors()) {
ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
{.kind = TokenKind::FileStart},
{.kind = TokenKind::Error, .text = code},
{.kind = TokenKind::FileEnd},
}));
break;
}
ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
{.kind = TokenKind::FileStart},
{.kind = TokenKind::IntTypeLiteral, .text = code},
{.kind = TokenKind::FileEnd},
}));
auto token = buffer.tokens().begin()[1];
EXPECT_TRUE(llvm::APInt::isSameValue(
value_stores.ints().Get(buffer.GetTypeLiteralSize(token)), bits));
}

// Make sure we can also gracefully reject a very large number of digits
// without crashing or hanging, and that the diagnostic shows the correct
// count.
constexpr int Count = 10000;
EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
DiagnosticKind::TooManyTypeBitWidthDigits,
DiagnosticLevel::Error, 1, 2,
HasSubstr(llvm::formatv(" {0} ", Count)))));
code = "i";
code.append(Count, '9');

Testing::MockDiagnosticConsumer consumer;
EXPECT_CALL(consumer,
HandleDiagnostic(IsSingleDiagnostic(
DiagnosticKind::TooManyDigits, DiagnosticLevel::Error, 1, 2,
HasSubstr(llvm::formatv(" {0} ", Count)))));
auto& buffer = compile_helper_.GetTokenizedBuffer(code, &consumer);
EXPECT_TRUE(buffer.has_errors());
ASSERT_THAT(buffer,
HasTokens(llvm::ArrayRef<ExpectedToken>{
{.kind = TokenKind::FileStart, .line = 1, .column = 1},
{.kind = TokenKind::Error,
.line = 1,
.column = 1,
.indent_column = 1,
.text = code},
{.kind = TokenKind::FileEnd, .line = 1, .column = Count + 2},
}));
ASSERT_TRUE(buffer.has_errors());
ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
{.kind = TokenKind::FileStart},
{.kind = TokenKind::Error, .text = code},
{.kind = TokenKind::FileEnd},
}));
}

TEST_F(LexerTest, DiagnosticTrailingComment) {
5 changes: 3 additions & 2 deletions toolchain/testing/compile_helper.cpp
@@ -17,9 +17,10 @@ auto CompileHelper::GetTokenizedBuffer(llvm::StringRef text,
return token_storage_.front();
}

auto CompileHelper::GetTokenizedBufferWithSharedValueStore(llvm::StringRef text)
auto CompileHelper::GetTokenizedBufferWithSharedValueStore(
llvm::StringRef text, DiagnosticConsumer* consumer)
-> std::pair<Lex::TokenizedBuffer&, SharedValueStores&> {
auto& tokens = GetTokenizedBuffer(text);
auto& tokens = GetTokenizedBuffer(text, consumer);
return {tokens, value_store_storage_.front()};
}

3 changes: 2 additions & 1 deletion toolchain/testing/compile_helper.h
@@ -25,7 +25,8 @@ class CompileHelper {
-> Lex::TokenizedBuffer&;

// Returns the result of lex along with shared values.
auto GetTokenizedBufferWithSharedValueStore(llvm::StringRef text)
auto GetTokenizedBufferWithSharedValueStore(
llvm::StringRef text, DiagnosticConsumer* consumer = nullptr)
-> std::pair<Lex::TokenizedBuffer&, SharedValueStores&>;

// Returns the result of parse.
