pulumi · guineveresaenger · Aug 13, 2024 · Aug 13, 2024 · Aug 15, 2024 · Aug 16, 2024
diff --git a/pkg/tfgen/docs.go b/pkg/tfgen/docs.go
@@ -356,8 +356,8 @@ var (
 	// [1]: https://docs.aws.amazon.com/lambda/latest/dg/welcome.html
 	linkFooterRegexp = regexp.MustCompile(`(?m)^(\[\d+\]):\s(.*)`)
 
-	argumentBulletRegexp = regexp.MustCompile(
-		"^\\s*[*+-]\\s*`([a-z0-9_]*)`\\s*(\\([a-zA-Z]*\\)\\s*)?\\s*[:–-]?\\s*(\\([^\\)]*\\)[-\\s]*)?(.*)",
+	descriptionRegexp = regexp.MustCompile(
+		"^\\s*`([a-z0-9_]*)`\\s*(\\([a-zA-Z]*\\)\\s*)?\\s*[:–-]?\\s*(\\([^)]*\\)[-\\s]*)?((.|\n)*)",
 	)
 
 	bulletPointRegexStr       = "^\\s*[*+-]"          // matches any bullet point-like character
@@ -369,7 +369,6 @@ var (
 	)
 
 	attributionFormatString = "This Pulumi package is based on the [`%[1]s` Terraform Provider](https://%[3]s/%[2]s/terraform-provider-%[1]s)."
-	listMarkerRegex         = regexp.MustCompile("[-*+]")
 )
 
 func trimFrontMatter(text []byte) []byte {
@@ -385,7 +384,6 @@ func trimFrontMatter(text []byte) []byte {
 	}
 	return body[idx+3:]
 }
-
 func splitByMarkdownHeaders(text string, level int) [][]string {
 	// splitByMarkdownHeaders parses text, then walks the resulting AST to find
 	// appropriate header nodes. It uses the location of these header nodes to split
@@ -397,7 +395,6 @@ func splitByMarkdownHeaders(text string, level int) [][]string {
 	contract.Assertf(offset >= 0, "The offset generated by chopping of the front-matter cannot be negative")
 
 	gm := goldmark.New(goldmark.WithExtensions(parse.TFRegistryExtension))
-
 	headers := []int{}
 	parse.WalkNode(gm.Parser().Parse(gmtext.NewReader(bytes)), func(heading *gmast.Heading) {
 		if heading.Level != level {
@@ -797,91 +794,6 @@ func (p *tfMarkdownParser) parseSchemaWithNestedSections(subsection []string) {
 	parseTopLevelSchemaIntoDocs(&p.ret, topLevelSchema, p.sink.warn)
 }
 
-type markdownLineInfo struct {
-	name, desc string
-	isFound    bool
-}
-
-type bulletListEntry struct {
-	name  string
-	index int
-}
-
-// trackBulletListIndentation looks at the index of the bullet list marker ( `*`, `-` or `+`) in a docs line and
-// compares it to a collection that tracks the level of list nesting by comparing to the previous list entry's nested
-// level (if any).
-// Note that this function only looks at the placement of the bullet list marker, and assumes same-level list markers
-// to be in the same location in each line. This is not necessarily the case for Markdown, which considers a range of
-// locations within 1-4 whitespace characters, as well as considers the start index of the text following the bullet
-// point. If and when this becomes an issue during docs parsing, we may consider adding some of those rules here.
-// Read more about nested lists in GitHub-flavored Markdown:
-// https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#nested-lists
-//
-//nolint:lll
-func trackBulletListIndentation(line, name string, tracker []bulletListEntry) []bulletListEntry {
-
-	listMarkerLocation := listMarkerRegex.FindStringIndex(line)
-	contract.Assertf(len(listMarkerLocation) == 2,
-		fmt.Sprintf("Expected to find bullet list marker in line %s", line))
-	listMarkerIndex := listMarkerLocation[0]
-
-	// If our tracker is empty, we are at list nested level 0.
-	if len(tracker) == 0 {
-		newEntry := bulletListEntry{
-			name:  name,
-			index: listMarkerIndex,
-		}
-		return append(tracker, newEntry)
-	}
-	// Always compare to last entry in tracker
-	lastListEntry := tracker[len(tracker)-1]
-
-	// if current line's listMarkerIndex is greater than the tracker's last entry's listMarkerIndex,
-	// make a new tracker entry and push it on there with all the info.
-	if listMarkerIndex > lastListEntry.index {
-		name = lastListEntry.name + "." + name
-		newEntry := bulletListEntry{
-			name:  name,
-			index: listMarkerIndex,
-		}
-		return append(tracker, newEntry)
-	}
-	// if current line's listMarkerIndex is the same as the last entry's, we're at the same level.
-	if listMarkerIndex == lastListEntry.index {
-		// Replace the last entry in our tracker
-		replaceEntry := bulletListEntry{
-			index: listMarkerIndex,
-		}
-		if len(tracker) == 1 {
-			replaceEntry.name = name
-		} else {
-			// use the penultimate entry name to build current name
-			replaceName := tracker[(len(tracker)-2)].name + "." + name
-			replaceEntry.name = replaceName
-		}
-		return append(tracker[:len(tracker)-1], replaceEntry)
-	}
-
-	// The current line's listMarkerIndex is smaller that the previous entry's.
-	// Pop off the latest entry, and retry to see if the next previous entry is a match.
-	return trackBulletListIndentation(line, name, tracker[:len(tracker)-1])
-}
-
-// parseArgFromMarkdownLine takes a line of Markdown and attempts to parse it for a Terraform argument and its
-// description. It returns a struct containing the name and description of the arg, and whether an arg was found.
-func parseArgFromMarkdownLine(line string) markdownLineInfo {
-	matches := argumentBulletRegexp.FindStringSubmatch(line)
-	var parsed markdownLineInfo
-	if len(matches) > 4 {
-		parsed.name = matches[1]
-		parsed.desc = matches[4]
-		parsed.isFound = true
-	}
-	return parsed
-}
-
-var genericNestedRegexp = regexp.MustCompile("supports? the following:")
-
 var nestedObjectRegexps = []*regexp.Regexp{
 	// For example:
 	// s3_bucket.html.markdown: "The `website` object supports the following:"
@@ -1022,106 +934,138 @@ func getNestedBlockNames(line string) []string {
 }
 
 func parseArgReferenceSection(subsection []string, ret *entityDocs) {
-	// Variable to remember the last argument we found.
-	var lastMatch string
-	// Collection to hold all arguments that headline a nested description.
-	var nesteds []docsPath
-
-	addNewHeading := func(name, desc, line string) {
-		// found a property bullet, extract the name and description
-		if len(nesteds) > 0 {
-			for _, nested := range nesteds {
-				// We found this line within a nested field. We should record it as such.
-				if ret.Arguments[nested] == nil {
-					totalArgumentsFromDocs++
-				}
-				ret.Arguments[nested.join(name)] = &argumentDocs{desc}
-			}
 
-		} else {
-			if genericNestedRegexp.MatchString(line) {
-				return
+	// Treat our subsection as a markdown node. This will later just be a node.
+	docBytes := []byte(strings.Join(subsection, "\n"))
+
+	// Parse the document using Goldmark parser
+	gm := goldmark.New(goldmark.WithExtensions(parse.TFRegistryExtension))
+	astNode := gm.Parser().Parse(gmtext.NewReader(docBytes))
+
+	var paths []string
+	var writeList bool // tracking whether we need to write a list verbatim
+	err := gmast.Walk(astNode, func(node gmast.Node, enter bool) (gmast.WalkStatus, error) {
+		// When we find a list item, we check if it is an argument entry.
+		if node.Kind().String() == "ListItem" {
+			if enter {
+				// For any list item, we want to check if it opens with a code span.
+				// It will be list item --> Text --> Code Span, so the grandchild of the list item.
+				codeSpanNode := node.FirstChild().FirstChild()
+				if codeSpanNode.Kind().String() == "CodeSpan" {
+					codeSpanItem := codeSpanNode.Text(docBytes)
+
+					// The list item's first child is a text block.
+					// For most of our entries, this is all we need.
+					desc := writeLines(node.FirstChild().Lines(), docBytes)
+
+					// To see if we have a TF name, use a regex match.
+					// The submatch looks for patterns such as
+					//
+					// `follow_gae_application` - (Optional) A GAE application whose zone to remain
+					descs := descriptionRegexp.FindStringSubmatch(desc)
+					if len(descs) <= 4 {
+						writeList = true
+					}
+
+					// add to docspaths if writeList is false
+					if !writeList {
+						paths = addPaths(paths, codeSpanItem)
+					}
+					// Read results into the return argument docs. When we're reading subfields for multiple fields,
+					// the description is still the same as discovered from the node's lines.
+					for _, path := range paths {
+						if !writeList {
+							ret.Arguments[docsPath(path)] = &argumentDocs{descs[4]}
+						} else {
+							// We need to write the entire list item into the description.
+							// We'll just append each list item as it is visited.
+							currentDesc := ret.Arguments[docsPath(path)].description
+							newDesc := currentDesc + "\n* " + desc
+							ret.Arguments[docsPath(path)] = &argumentDocs{newDesc}
+						}
+					}
+				}
+			} else {
+				if !writeList {
+					paths = cutPaths(paths)
+				}
 			}
-			ret.Arguments[docsPath(name)] = &argumentDocs{description: desc}
-			totalArgumentsFromDocs++
-		}
-	}
-	// This function adds the current line as a description to the last matched resource,
-	//in cases where there's no resource match found on this line.
-	//It represents a multi-line description for a field.
-	extendExistingHeading := func(line string) {
-		if len(nesteds) > 0 {
-			for _, nested := range nesteds {
-				line = "\n" + strings.TrimSpace(line)
-				ret.Arguments[nested.join(lastMatch)].description += line
+		}
+		if node.Kind().String() == "Section" {
+			writeList = false
+			// A Section's first child is its heading.
+			// In this part of the upstream document, a heading generally means a subresource name.
+			if enter {
+				// The text next to an arg reference's section header is assumed to be a resource field.
+				headerItem := node.FirstChild().Text(docBytes)
+				// add to docs paths
+				paths = addPaths(paths, headerItem)
+			} else {
+				paths = cutPaths(paths)
 			}
-		} else {
-			if genericNestedRegexp.MatchString(line) {
-				lastMatch = ""
-				nesteds = []docsPath{}
-				return
+		}
+		// Additionally, there are top-level paragraphs that can contain information about nested docs,
+		// such as "The `foo_bar` object supports the following:".
+		if node.Kind().String() == "Paragraph" && node.Parent().Kind().String() == "Document" {
+			writeList = false
+			if enter {
+				// All of the fields mentioned in paragraphs can be treated as top-level, i.e.
+				// they're of the format "(The) `foo` [field|resource] supports the following:", or they already
+				// include the nested path as in "(The) `foo.bar` [field|resource] supports the following:".
+				// This means that whenever we detect a top-level Paragraph node, we reset the paths slice to empty.
+				paths = []string{}
+				paragraph := writeLines(node.Lines(), docBytes)
+				// Check if our paragraph matches any of the nested object signifiers. See `nestedObjectRegexps`.
+				nestedBlockNames := getNestedBlockNames(paragraph)
+				if len(nestedBlockNames) > 0 {
+					// write to docspath
+					paths = nestedBlockNames
+				}
+			} else {
+				// Because descriptions nested under paragraphs are not children, but rather siblings,
+				// we do not manipulate the docspath level at this point. Continue walking.
+				return gmast.WalkContinue, nil
 			}
-			line = "\n" + strings.TrimSpace(line)
-			ret.Arguments[docsPath(lastMatch)].description += line
 		}
-	}
-
-	// hadSpace tells us if the previous line was blank.
-	var hadSpace bool
+		return gmast.WalkContinue, nil
+	})
+	contract.AssertNoErrorf(err, "Cannot fail to parse argument reference")
+}
 
-	// bulletListTracker is a stack-like collection that tracks the level of nesting for a bulleted list with
-	// nested lists. The name of the topmost entry represents the nested docs path for the current line.
-	var bulletListTracker []bulletListEntry
+func writeLines(lines *gmtext.Segments, docBytes []byte) string {
+	var desc bytes.Buffer
+	for i := 0; i < lines.Len(); i++ {
+		line := lines.At(i)
+		desc.Write(line.Value(docBytes))
+	}
+	return desc.String()
+}
 
-	for _, line := range subsection {
-		parsedArg := parseArgFromMarkdownLine(line)
-		matchFound := parsedArg.isFound
-		if matchFound { // We have found a new property bullet point.
-			desc := parsedArg.desc
-			bulletListTracker = trackBulletListIndentation(line, parsedArg.name, bulletListTracker)
-			name := bulletListTracker[len(bulletListTracker)-1].name
-			lastMatch = name
-			addNewHeading(name, desc, line)
-
-		} else if strings.TrimSpace(line) == "---" {
-			// --- is a markdown section break. This probably indicates the
-			// section is over, but we take it to mean that the current
-			// heading is over.
-			lastMatch = ""
-			bulletListTracker = nil
-		} else if nestedBlockCurrentLine := getNestedBlockNames(line); hadSpace && len(nestedBlockCurrentLine) > 0 {
-			// This tells us if there's a resource that is about to have subfields (nesteds)
-			// in subsequent lines.
-			//empty nesteds
-			nesteds = []docsPath{}
-			for _, item := range nestedBlockCurrentLine {
-				nesteds = append(nesteds, docsPath(item))
-			}
-			lastMatch = ""
-			bulletListTracker = nil
-		} else if !isBlank(line) && lastMatch != "" {
-			// This appends the current line to the previous match's description.
-			extendExistingHeading(line)
-
-		} else if nestedBlockCurrentLine := getNestedBlockNames(line); len(nestedBlockCurrentLine) > 0 {
-			// This tells us if there's a resource that is about to have subfields (nesteds)
-			// in subsequent lines.
-			//empty nesteds
-			nesteds = []docsPath{}
-			for _, item := range nestedBlockCurrentLine {
-				nesteds = append(nesteds, docsPath(item))
-			}
-			lastMatch = ""
-			bulletListTracker = nil
-		} else if lastMatch != "" {
-			extendExistingHeading(line)
+func cutPaths(paths []string) []string {
+	var newpaths []string
+	for _, p := range paths {
+		pathIndex := strings.LastIndex(
+			p, ".")
+		if pathIndex > 0 {
+			p = p[:pathIndex]
+			newpaths = append(newpaths, p)
 		}
-		hadSpace = isBlank(line)
 	}
+	return newpaths
+}
 
-	for _, v := range ret.Arguments {
-		v.description = strings.TrimRightFunc(v.description, unicode.IsSpace)
+func addPaths(paths []string, pathSection []byte) []string {
+	if len(paths) == 0 {
+		paths = append(paths, string(pathSection))
+	} else {
+		var newPaths []string
+		for _, p := range paths {
+			p = p + "." + string(pathSection)
+			newPaths = append(newPaths, p)
+		}
+		paths = newPaths
 	}
+	return paths
 }
 
 func parseAttributesReferenceSection(subsection []string, ret *entityDocs) {