Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace string manipulation with AST logic in parseArgReferenceSection #2318

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
302 changes: 123 additions & 179 deletions pkg/tfgen/docs.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,8 +356,8 @@ var (
// [1]: https://docs.aws.amazon.com/lambda/latest/dg/welcome.html
linkFooterRegexp = regexp.MustCompile(`(?m)^(\[\d+\]):\s(.*)`)

argumentBulletRegexp = regexp.MustCompile(
"^\\s*[*+-]\\s*`([a-z0-9_]*)`\\s*(\\([a-zA-Z]*\\)\\s*)?\\s*[:–-]?\\s*(\\([^\\)]*\\)[-\\s]*)?(.*)",
descriptionRegexp = regexp.MustCompile(
"^\\s*`([a-z0-9_]*)`\\s*(\\([a-zA-Z]*\\)\\s*)?\\s*[:–-]?\\s*(\\([^)]*\\)[-\\s]*)?((.|\n)*)",
)

bulletPointRegexStr = "^\\s*[*+-]" // matches any bullet point-like character
Expand All @@ -369,7 +369,6 @@ var (
)

attributionFormatString = "This Pulumi package is based on the [`%[1]s` Terraform Provider](https://%[3]s/%[2]s/terraform-provider-%[1]s)."
listMarkerRegex = regexp.MustCompile("[-*+]")
)

func trimFrontMatter(text []byte) []byte {
Expand All @@ -385,7 +384,6 @@ func trimFrontMatter(text []byte) []byte {
}
return body[idx+3:]
}

func splitByMarkdownHeaders(text string, level int) [][]string {
// splitByMarkdownHeaders parses text, then walks the resulting AST to find
// appropriate header nodes. It uses the location of these header nodes to split
Expand All @@ -397,7 +395,6 @@ func splitByMarkdownHeaders(text string, level int) [][]string {
contract.Assertf(offset >= 0, "The offset generated by chopping of the front-matter cannot be negative")

gm := goldmark.New(goldmark.WithExtensions(parse.TFRegistryExtension))

headers := []int{}
parse.WalkNode(gm.Parser().Parse(gmtext.NewReader(bytes)), func(heading *gmast.Heading) {
if heading.Level != level {
Expand Down Expand Up @@ -797,91 +794,6 @@ func (p *tfMarkdownParser) parseSchemaWithNestedSections(subsection []string) {
parseTopLevelSchemaIntoDocs(&p.ret, topLevelSchema, p.sink.warn)
}

// markdownLineInfo is the outcome of attempting to read one Markdown line as
// an argument bullet.
type markdownLineInfo struct {
	// name is the argument name captured from the line.
	name string
	// desc is the description text captured from the line.
	desc string
	// isFound reports whether the line matched an argument bullet at all.
	isFound bool
}

// bulletListEntry is one element of the stack that trackBulletListIndentation
// maintains while following nested bullet lists.
type bulletListEntry struct {
// name is the entry's dot-joined name, prefixed with the names of the entries it is nested under.
name string
// index is the offset within the line at which the bullet list marker was found.
index int
}

// trackBulletListIndentation updates tracker, a stack describing the current nesting of a bulleted list, with the
// list item found on line, and returns the updated stack. The topmost entry of the returned stack carries the
// dot-joined name of the item on line, i.e. name prefixed by the names of the items it is nested under.
//
// The nesting level is derived from the index of the first bullet list marker (`*`, `-` or `+`) in line:
//   - a marker further right than the topmost entry's opens a new, deeper level;
//   - a marker at the same index replaces the topmost entry (a sibling item);
//   - a marker further left pops entries until one of the two cases above applies, or the stack is empty.
//
// The function asserts that line contains a bullet list marker. The backing array of tracker may be reused and
// overwritten, so callers should continue with the returned slice only.
//
// Note that this function only looks at the placement of the bullet list marker, and assumes same-level list markers
// to be in the same location in each line. This is not necessarily the case for Markdown, which considers a range of
// locations within 1-4 whitespace characters, as well as considers the start index of the text following the bullet
// point. If and when this becomes an issue during docs parsing, we may consider adding some of those rules here.
// Read more about nested lists in GitHub-flavored Markdown:
// https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#nested-lists
//
//nolint:lll
func trackBulletListIndentation(line, name string, tracker []bulletListEntry) []bulletListEntry {
	listMarkerLocation := listMarkerRegex.FindStringIndex(line)
	// Pass line as a format argument rather than pre-formatting it: a pre-formatted message would be re-interpreted
	// as a format string (mangling any '%' in the docs line) and would be built even when the assertion holds.
	contract.Assertf(len(listMarkerLocation) == 2,
		"Expected to find bullet list marker in line %s", line)
	listMarkerIndex := listMarkerLocation[0]

	// The current line's marker is to the left of the topmost entry's: that nested list is over.
	// Pop entries until we reach one at the same or a shallower level.
	for len(tracker) > 0 && listMarkerIndex < tracker[len(tracker)-1].index {
		tracker = tracker[:len(tracker)-1]
	}

	// If our tracker is empty, we are at list nested level 0.
	if len(tracker) == 0 {
		return append(tracker, bulletListEntry{name: name, index: listMarkerIndex})
	}

	// Always compare to the last entry in the tracker.
	lastListEntry := tracker[len(tracker)-1]

	// The marker is further right than the last entry's: this item is nested under the last entry, so push a new
	// entry whose name is qualified by the last entry's name.
	if listMarkerIndex > lastListEntry.index {
		return append(tracker, bulletListEntry{
			name:  lastListEntry.name + "." + name,
			index: listMarkerIndex,
		})
	}

	// The marker is at the same index as the last entry's: we are at the same level, so replace the last entry.
	// At the top level the name stands alone; otherwise the penultimate entry provides the parent name.
	replaceEntry := bulletListEntry{name: name, index: listMarkerIndex}
	if len(tracker) > 1 {
		replaceEntry.name = tracker[len(tracker)-2].name + "." + name
	}
	return append(tracker[:len(tracker)-1], replaceEntry)
}

// parseArgFromMarkdownLine matches a single line of Markdown against argumentBulletRegexp. On a match it returns
// the first capture group as the argument name and the fourth as its description, with isFound set to true.
// When the line does not match, the zero markdownLineInfo is returned.
func parseArgFromMarkdownLine(line string) markdownLineInfo {
	groups := argumentBulletRegexp.FindStringSubmatch(line)
	// The whole match plus four capture groups are needed to read the name and description.
	if len(groups) <= 4 {
		return markdownLineInfo{}
	}
	return markdownLineInfo{
		name:    groups[1],
		desc:    groups[4],
		isFound: true,
	}
}

// genericNestedRegexp matches the phrase "support the following:" or
// "supports the following:" anywhere in a string.
var genericNestedRegexp = regexp.MustCompile("supports? the following:")

var nestedObjectRegexps = []*regexp.Regexp{
// For example:
// s3_bucket.html.markdown: "The `website` object supports the following:"
Expand Down Expand Up @@ -1022,106 +934,138 @@ func getNestedBlockNames(line string) []string {
}

func parseArgReferenceSection(subsection []string, ret *entityDocs) {
// Variable to remember the last argument we found.
var lastMatch string
// Collection to hold all arguments that headline a nested description.
var nesteds []docsPath

addNewHeading := func(name, desc, line string) {
// found a property bullet, extract the name and description
if len(nesteds) > 0 {
for _, nested := range nesteds {
// We found this line within a nested field. We should record it as such.
if ret.Arguments[nested] == nil {
totalArgumentsFromDocs++
}
ret.Arguments[nested.join(name)] = &argumentDocs{desc}
}

} else {
if genericNestedRegexp.MatchString(line) {
return
// Treat our subsection as a markdown node. This will later just be a node.
docBytes := []byte(strings.Join(subsection, "\n"))

// Parse the document using Goldmark parser
gm := goldmark.New(goldmark.WithExtensions(parse.TFRegistryExtension))
astNode := gm.Parser().Parse(gmtext.NewReader(docBytes))

var paths []string
var writeList bool // tracking whether we need to write a list verbatim
err := gmast.Walk(astNode, func(node gmast.Node, enter bool) (gmast.WalkStatus, error) {
// When we find a list item, we check if it is an argument entry.
if node.Kind().String() == "ListItem" {
if enter {
// For any list item, we want to check if it opens with a code span.
// It will be list item --> Text --> Code Span, so the grandchild of the list item.
codeSpanNode := node.FirstChild().FirstChild()
if codeSpanNode.Kind().String() == "CodeSpan" {
codeSpanItem := codeSpanNode.Text(docBytes)

// The list item's first child is a text block.
// For most of our entries, this is all we need.
desc := writeLines(node.FirstChild().Lines(), docBytes)

// To see if we have a TF name, use a regex match.
// The submatch looks for patterns such as
//
// `follow_gae_application` - (Optional) A GAE application whose zone to remain"
descs := descriptionRegexp.FindStringSubmatch(desc)
if len(descs) <= 4 {
writeList = true
}

// add to docspaths if writeList is false
if !writeList {
paths = addPaths(paths, codeSpanItem)
}
// Read results into the return argument docs. When we're reading subfields for multiple fields,
// the description is still the same as discovered from the node's lines.
for _, path := range paths {
if !writeList {
ret.Arguments[docsPath(path)] = &argumentDocs{descs[4]}
} else {
// We need to write the entire list item into the description.
// We'll just append each list item as it is visited.
currentDesc := ret.Arguments[docsPath(path)].description
newDesc := currentDesc + "\n* " + desc
ret.Arguments[docsPath(path)] = &argumentDocs{newDesc}
}
}
}
} else {
if !writeList {
paths = cutPaths(paths)
}
}
ret.Arguments[docsPath(name)] = &argumentDocs{description: desc}
totalArgumentsFromDocs++
}
}
// This function adds the current line as a description to the last matched resource,
//in cases where there's no resource match found on this line.
//It represents a multi-line description for a field.
extendExistingHeading := func(line string) {
if len(nesteds) > 0 {
for _, nested := range nesteds {
line = "\n" + strings.TrimSpace(line)
ret.Arguments[nested.join(lastMatch)].description += line
}
if node.Kind().String() == "Section" {
writeList = false
// A Section's first child is its heading.
// In this part of the upstream document, a heading generally means a subresource name.
if enter {
// The text next to an arg reference's section header is assumed to be a resource field.
headerItem := node.FirstChild().Text(docBytes)
// add to docs paths
paths = addPaths(paths, headerItem)
} else {
paths = cutPaths(paths)
}
} else {
if genericNestedRegexp.MatchString(line) {
lastMatch = ""
nesteds = []docsPath{}
return
}
// Additionally, there are top-level paragraphs that can contain information about nested docs,
// such as "The `foo_bar` object supports the following:".
if node.Kind().String() == "Paragraph" && node.Parent().Kind().String() == "Document" {
writeList = false
if enter {
// All of the fields mentioned in paragraphs can be treated as top-level, i.e.
// they're of the format "(The) `foo` [field|resource] supports the following:", or they already
// include the nested path as in "(The) `foo.bar` [field|resource] supports the following:".
// This means that at any detection of a top-level Paragraph node, we re-set the docsPath slice to empty.
paths = []string{}
paragraph := writeLines(node.Lines(), docBytes)
// Check if our paragraph matches any of the nested object signifiers. See `nestedObjectRegexps`.
nestedBlockNames := getNestedBlockNames(paragraph)
if len(nestedBlockNames) > 0 {
// write to docspath
paths = nestedBlockNames
}
} else {
// Because descriptions nested under paragraphs are not children, but rather siblings,
// we do not manipulate the docspath level at this point. Continue walking.
return gmast.WalkContinue, nil
}
line = "\n" + strings.TrimSpace(line)
ret.Arguments[docsPath(lastMatch)].description += line
}
}

// hadSpace tells us if the previous line was blank.
var hadSpace bool
return gmast.WalkContinue, nil
})
contract.AssertNoErrorf(err, "Cannot fail to parse argument reference")
}

// bulletListTracker is a stack-like collection that tracks the level of nesting for a bulleted list with
// nested lists. The name of the topmost entry represents the nested docs path for the current line.
var bulletListTracker []bulletListEntry
// writeLines concatenates, in order, the bytes that each segment in lines
// selects from docBytes and returns the result as a single string.
func writeLines(lines *gmtext.Segments, docBytes []byte) string {
	var out bytes.Buffer
	for idx, count := 0, lines.Len(); idx < count; idx++ {
		segment := lines.At(idx)
		out.Write(segment.Value(docBytes))
	}
	return out.String()
}

for _, line := range subsection {
parsedArg := parseArgFromMarkdownLine(line)
matchFound := parsedArg.isFound
if matchFound { // We have found a new property bullet point.
desc := parsedArg.desc
bulletListTracker = trackBulletListIndentation(line, parsedArg.name, bulletListTracker)
name := bulletListTracker[len(bulletListTracker)-1].name
lastMatch = name
addNewHeading(name, desc, line)

} else if strings.TrimSpace(line) == "---" {
// --- is a markdown section break. This probably indicates the
// section is over, but we take it to mean that the current
// heading is over.
lastMatch = ""
bulletListTracker = nil
} else if nestedBlockCurrentLine := getNestedBlockNames(line); hadSpace && len(nestedBlockCurrentLine) > 0 {
// This tells us if there's a resource that is about to have subfields (nesteds)
// in subsequent lines.
//empty nesteds
nesteds = []docsPath{}
for _, item := range nestedBlockCurrentLine {
nesteds = append(nesteds, docsPath(item))
}
lastMatch = ""
bulletListTracker = nil
} else if !isBlank(line) && lastMatch != "" {
// This appends the current line to the previous match's description.
extendExistingHeading(line)

} else if nestedBlockCurrentLine := getNestedBlockNames(line); len(nestedBlockCurrentLine) > 0 {
// This tells us if there's a resource that is about to have subfields (nesteds)
// in subsequent lines.
//empty nesteds
nesteds = []docsPath{}
for _, item := range nestedBlockCurrentLine {
nesteds = append(nesteds, docsPath(item))
}
lastMatch = ""
bulletListTracker = nil
} else if lastMatch != "" {
extendExistingHeading(line)
// cutPaths returns a new slice in which every path has had its final
// "."-separated segment removed. A path is left out of the result when it
// contains no "." or when its last "." is its first character, so the result
// may be shorter than paths (and is nil when no path qualifies).
func cutPaths(paths []string) []string {
var newpaths []string
for _, p := range paths {
// Locate the last separator; everything from it onwards is cut off.
pathIndex := strings.LastIndex(
p, ".")
if pathIndex > 0 {
p = p[:pathIndex]
newpaths = append(newpaths, p)
}
// NOTE(review): hadSpace, isBlank and line are not declared anywhere in this
// function; this statement looks like a leftover from the removed side of the
// diff rather than part of cutPaths — confirm before relying on it.
hadSpace = isBlank(line)
}
return newpaths
}

for _, v := range ret.Arguments {
v.description = strings.TrimRightFunc(v.description, unicode.IsSpace)
// addPaths extends every path in paths with pathSection as a new trailing
// segment, joined by ".". When paths is empty, pathSection on its own becomes
// the sole path.
func addPaths(paths []string, pathSection []byte) []string {
	segment := string(pathSection)
	if len(paths) == 0 {
		return append(paths, segment)
	}
	extended := make([]string, 0, len(paths))
	for _, prefix := range paths {
		extended = append(extended, prefix+"."+segment)
	}
	return extended
}

func parseAttributesReferenceSection(subsection []string, ret *entityDocs) {
Expand Down
Loading