Skip to content

Commit

Permalink
Support parent extraction from header blocks
Browse files Browse the repository at this point in the history
Signed-off-by: Alper Rifat Ulucinar <ulucinar@users.noreply.github.com>
  • Loading branch information
ulucinar committed Sep 26, 2022
1 parent 4bae257 commit ddc7f3d
Show file tree
Hide file tree
Showing 5 changed files with 229 additions and 160 deletions.
84 changes: 64 additions & 20 deletions pkg/registry/meta.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"io/ioutil"
"path/filepath"
"regexp"
"sort"
"strings"

"github.com/antchfx/htmlquery"
Expand All @@ -34,7 +35,8 @@ const (
)

var (
regexConfigurationBlock = regexp.MustCompile(`block.*support`)
// NOTE(review): everything after `block` in this pattern is optional (both
// `.*` and `(support)?` can match the empty string), so with the unanchored
// MatchString calls used below it matches any input that contains "block".
regexConfigurationBlock = regexp.MustCompile(`block.*(support)?`)
// regexHeaderNode is matched against html.Node.Data to recognize header
// elements such as h1, h2, ...
// NOTE(review): the pattern is unanchored, so any Data value containing an
// "h" followed by a digit matches as well — confirm whether `^h\d$` is
// what is intended.
regexHeaderNode = regexp.MustCompile(`h\d`)
)

// NewProviderMetadata initializes a new ProviderMetadata for
Expand Down Expand Up @@ -236,7 +238,6 @@ func (r *Resource) scrapePrelude(doc *html.Node, preludeXPath string) error {
}

func (r *Resource) scrapeFieldDocs(doc *html.Node, fieldXPath string) {
conflictedFields := make(map[string]bool)
processed := make(map[*html.Node]struct{})
codeNodes := htmlquery.Find(doc, fieldXPath)
for _, n := range codeNodes {
Expand All @@ -249,31 +250,61 @@ func (r *Resource) scrapeFieldDocs(doc *html.Node, fieldXPath string) {
r.ArgumentDocs = make(map[string]string)
}
if r.ArgumentDocs[attrName] != "" && r.ArgumentDocs[attrName] != strings.TrimSpace(docStr) {
conflictedFields[attrName] = true
continue
}
r.ArgumentDocs[attrName] = strings.TrimSpace(docStr)
}

// Remove descriptions for repeating fields in the registry.
for cf := range conflictedFields {
delete(r.ArgumentDocs, cf)
}
}

func getRootPath(n *html.Node) string { // nolint: gocyclo
var ulNode, pNode, codeNode *html.Node
func (r *Resource) getRootPath(n *html.Node) string {
var ulNode, pNode *html.Node
for ulNode = n.Parent; ulNode != nil && ulNode.Data != "ul"; ulNode = ulNode.Parent {
}
if ulNode == nil {
return ""
}
for pNode = ulNode.PrevSibling; pNode != nil && (pNode.Data != "p" || !checkBlockParagraph(pNode)); pNode = pNode.PrevSibling {
// intentionally left empty
for pNode = ulNode.PrevSibling; pNode != nil && (pNode.Data != "p" || !regexConfigurationBlock.MatchString(strings.ToLower(extractText(pNode)))); pNode = pNode.PrevSibling {
if regexHeaderNode.MatchString(pNode.Data) {
return r.extractRootFromHeader(pNode)
}
}
if pNode == nil {
return ""
}
return r.extractRootFromParagraph(pNode)
}

// extractRootFromHeader resolves the text of the given header node to a key
// that already exists in r.ArgumentDocs. Candidates are tried in this order
// and the first hit is returned:
//
//  1. the header text itself;
//  2. the lexicographically first key whose last dot-separated segment
//     equals the header text;
//  3. the header text with every space replaced by a dot;
//  4. only if the lowercased header text matches regexConfigurationBlock:
//     the first space-separated word of the header text that is a key.
//
// An empty string is returned when none of the candidates is a known key.
func (r *Resource) extractRootFromHeader(pNode *html.Node) string {
	title := extractText(pNode)
	if _, found := r.ArgumentDocs[title]; found {
		return title
	}

	// Map iteration order is random; walk the keys in sorted order so that
	// the same key wins on every run when several share a last segment.
	candidates := make([]string, 0, len(r.ArgumentDocs))
	for name := range r.ArgumentDocs {
		candidates = append(candidates, name)
	}
	sort.Strings(candidates)
	for _, name := range candidates {
		// LastIndex yields -1 for keys without a dot, which makes the
		// slice below cover the whole key.
		if leaf := name[strings.LastIndex(name, ".")+1:]; leaf == title {
			return name
		}
	}

	dotted := strings.ReplaceAll(title, " ", ".")
	if _, found := r.ArgumentDocs[dotted]; found {
		return dotted
	}

	if !regexConfigurationBlock.MatchString(strings.ToLower(title)) {
		return ""
	}
	for _, word := range strings.Split(title, " ") {
		if _, found := r.ArgumentDocs[word]; found {
			return word
		}
	}
	return ""
}

func (r *Resource) extractRootFromParagraph(pNode *html.Node) string {
var codeNode *html.Node
for codeNode = pNode.FirstChild; codeNode != nil && codeNode.Data != "code"; codeNode = codeNode.NextSibling {
// intentionally left empty
}
Expand All @@ -284,7 +315,7 @@ func getRootPath(n *html.Node) string { // nolint: gocyclo
if prevLiNode == nil {
return codeNode.FirstChild.Data
}
root := getRootPath(prevLiNode)
root := r.getRootPath(prevLiNode)
if len(root) == 0 {
return codeNode.FirstChild.Data
}
Expand All @@ -308,14 +339,27 @@ func getPrevLiWithCodeText(codeText string, pNode *html.Node) *html.Node {
return nil
}

func checkBlockParagraph(p *html.Node) bool {
// traverse children of the paragraph node
for c := p.FirstChild; c != nil; c = c.NextSibling {
if regexConfigurationBlock.MatchString(c.Data) {
return true
// extractText returns the text contained in n. For a text node this is the
// node's Data. For an element node it is the concatenation, in sibling
// order, of the text of all of its descendants. Every other node type
// yields the empty string. No trimming or whitespace normalization is
// applied.
func extractText(n *html.Node) string {
switch n.Type { // nolint:exhaustive
case html.TextNode:
return n.Data
case html.ElementNode:
sb := strings.Builder{}
for c := n.FirstChild; c != nil; c = c.NextSibling {
s := ""
// Text children contribute their Data directly; any other child is
// handled by recursing into it.
if c.Type != html.TextNode {
s = extractText(c)
} else {
s = c.Data
}
if len(s) != 0 {
sb.WriteString(s)
}
}
return sb.String()
default:
// All remaining node types contribute no text.
return ""
}
return false
}

func (r *Resource) scrapeDocString(n *html.Node, attrName *string, processed map[*html.Node]struct{}) string {
Expand All @@ -331,7 +375,7 @@ func (r *Resource) scrapeDocString(n *html.Node, attrName *string, processed map
sb := strings.Builder{}
if *attrName == "" {
*attrName = n.Data
if root := getRootPath(n); len(root) != 0 {
if root := r.getRootPath(n); len(root) != 0 {
*attrName = fmt.Sprintf("%s.%s", root, *attrName)
}
} else {
Expand Down
2 changes: 2 additions & 0 deletions pkg/registry/testdata/aws/pm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,10 @@ resources:
bucket: '- (Required, Forces new resource) The name of the bucket.'
expected_bucket_owner: '- (Optional, Forces new resource) The account ID of the expected bucket owner.'
grantee.email_address: '- (Optional) Email address of the grantee. See Regions and Endpoints for supported AWS regions where this argument can be specified.'
grantee.id: '- (Optional) The canonical user ID of the grantee.'
grantee.type: '- (Required) Type of grantee. Valid values: CanonicalUser, AmazonCustomerByEmail, Group.'
grantee.uri: '- (Optional) URI of the grantee group.'
id: '- The bucket, expected_bucket_owner (if configured), and acl (if configured) separated by commas (,).'
owner.display_name: '- (Optional) The display name of the owner.'
owner.id: '- (Required) The ID of the owner.'
importStatements: []
Loading

0 comments on commit ddc7f3d

Please sign in to comment.