pzaino · pzaino · Apr 20, 2024 · Apr 20, 2024
diff --git a/doc/ruleset-reference.md b/doc/ruleset-reference.md
@@ -0,0 +1,210 @@
+# The CROWler Ruleset Reference
+
+*The CROWler ruleset schema defines the structure of a ruleset file, which
+contains rules for scraping, action execution, detection, and crawling.*
+
+## Items
+
+- **Items** *(object)*
+  - **`format_version`** *(string)*: Version of the ruleset format, to ensure
+   compatibility.
+  - **`author`** *(string)*: The author or owner of the ruleset.
+  - **`created_at`** *(string)*: Creation date of the ruleset.
+  - **`description`** *(string)*: A brief description of what the ruleset does.
+  - **`ruleset_name`** *(string)*: A unique name identifying the ruleset.
+  - **`rule_groups`** *(array)*
+    - **Items** *(object)*
+      - **`group_name`** *(string)*: A unique name identifying the group of
+      rules.
+      - **`valid_from`** *(string)*: The start date from which the rule group
+      becomes active.
+      - **`valid_to`** *(string)*: The end date until which the rule group
+      remains active.
+      - **`is_enabled`** *(boolean)*: Flag to enable or disable the rule group.
+      - **`scraping_rules`** *(array)*
+        - **Items** *(object)*
+          - **`rule_name`** *(string)*: A unique name identifying the scraping
+           rule.
+          - **`pre_conditions`** *(array)*: Conditions that must be met for the
+           scraping to be executed.
+            - **Items** *(object)*
+              - **`path`** *(string)*: The specific path or pattern to match
+              for scraping.
+              - **`url`** *(string)*: Optional. The specific URL to which this
+               rule applies. If omitted, the rule is considered applicable to
+                any URL matching the path.
+          - **`elements`** *(array)*: Defines multiple ways to find and
+          interact with elements, allowing for CSS, XPath, and other
+          Selenium-supported strategies.
+            - **Items** *(object)*
+              - **`key`** *(string)*
+              - **`selectors`** *(array)*
+                - **Items** *(object)*
+                  - **`selector_type`** *(string)*: Must be one of: `['css',
+                   'xpath', 'id', 'class_name', 'name', 'tag_name',
+                   'link_text', 'partial_link_text', 'regex']`.
+                  - **`selector`** *(string)*
+                  - **`attribute`** *(string)*: Optional. The attribute of the
+                   element to extract, e.g., 'innerText'. Mainly relevant for
+                   scraping actions.
+          - **`js_files`** *(boolean)*: Indicates whether JavaScript files are
+           relevant for the scraping.
+          - **`objects`** *(array)*: Identifies specific technologies; requires
+           corresponding detection rules.
+            - **Items**: A unique name identifying the detection rule.
+          - **`json_field_mappings`** *(object)*: Maps scraped elements to JSON
+           fields using PostgreSQL JSON path expressions. Can contain
+           additional properties.
+            - **Additional Properties** *(string)*
+          - **`wait_conditions`** *(array)*: Conditions to wait for before
+          performing scraping, ensuring page readiness.
+            - **Items** *(object)*
+              - **`condition_type`** *(string)*: Must be one of:
+              `['element_presence', 'element_visible', 'custom_js', 'delay']`.
+              - **`value`** *(string)*: A generic value to use with the
+              condition, e.g., a delay in seconds, applicable for delay
+              condition type.
+              - **`selector`** *(string)*: The CSS selector for the element,
+               applicable for element_presence and element_visible conditions.
+              - **`custom_js`** *(string)*: Custom JavaScript condition to
+               evaluate, applicable for custom_js condition type.
+          - **`post_processing`** *(array)*: Post-processing steps for the
+           scraped data to transform, validate, or clean it.
+            - **Items** *(object)*
+              - **`step_type`** *(string)*: Must be one of: `['replace',
+               'remove', 'transform', 'validate', 'clean']`.
+              - **`details`** *(object)*: Detailed configuration for the
+              post-processing step, structure depends on the step_type.
+      - **`action_rules`** *(array)*
+        - **Items** *(object)*
+          - **`rule_name`** *(string)*: A unique name identifying the action
+           rule.
+          - **`action_type`** *(string)*: The type of action to perform,
+           including advanced interactions. Must be one of: `['click',
+            'input_text', 'clear', 'drag_and_drop', 'mouse_hover',
+             'right_click', 'double_click', 'click_and_hold', 'release',
+              'key_down', 'key_up', 'navigate_to_url', 'forward', 'back',
+               'refresh', 'switch_to_window', 'switch_to_frame',
+                'close_window', 'accept_alert', 'dismiss_alert',
+                 'get_alert_text', 'send_keys_to_alert', 'scroll_to_element',
+                  'scroll_by_amount', 'take_screenshot',
+                   'execute_javascript']`.
+          - **`selectors`** *(array)*
+            - **Items** *(object)*
+              - **`selector_type`** *(string)*: Must be one of: `['css',
+               'xpath', 'id', 'class_name', 'name', 'tag_name', 'link_text',
+                'partial_link_text']`.
+              - **`selector`** *(string)*: The actual selector or pattern used
+               to find the element based on the selector_type.
+              - **`attribute`** *(object)*: Optional. The attribute of the
+               element to match.
+                - **`name`** *(string)*: The name of the attribute to match for
+                 the selector match to be valid.
+                - **`value`** *(string)*: The value of the attribute to
+                 match for the selector to be valid.
+              - **`value`** *(string)*: The value within the selector that we
+               need to match for the action (this is NOT the value to input!).
+          - **`value`** *(string)*: The value to use with the action, e.g.,
+           text to input, applicable for input_text.
+          - **`url`** *(string)*: Optional. The specific URL to which this
+           action applies or the URL to navigate to, applicable for the
+            `navigate_to_url` action.
+          - **`wait_conditions`** *(array)*: Conditions to wait for before
+           performing the action, ensuring page readiness.
+            - **Items** *(object)*
+              - **`condition_type`** *(string)*: Must be one of:
+              `['element_presence', 'element_visible', 'custom_js', 'delay']`.
+              - **`value`** *(string)*: A generic value to use with the
+               condition, e.g., a delay in seconds, applicable for delay
+                condition type.
+              - **`selector`** *(string)*: The CSS selector for the element,
+               applicable for element_presence and element_visible conditions.
+              - **`custom_js`** *(string)*: Custom JavaScript condition to
+               evaluate, applicable for custom_js condition type.
+          - **`conditions`** *(object)*: Conditions that must be met for the
+           action to be executed. Can contain additional properties.
+          - **`error_handling`** *(object)*: Error handling strategies for the
+           action.
+            - **`retry_count`** *(integer)*: The number of times to retry the
+             action on failure.
+            - **`retry_delay`** *(integer)*: The delay between retries in
+             seconds.
+      - **`detection_rules`** *(array)*
+        - **Items** *(object)*
+          - **`rule_name`** *(string)*: A unique name identifying the
+           detection rule.
+          - **`object_name`** *(string)*: The name of the object or technology
+           to identify.
+          - **`object_version`** *(string)*: Optional. The version of the
+           object or technology to identify.
+          - **`http_header_fields`** *(array)*: Matching patterns for HTTP
+           header fields to identify technology.
+            - **Items** *(object)*
+              - **`key`** *(string)*: The name of the HTTP header field.
+              - **`value`** *(array)*: The expected value of the HTTP header
+               field.
+                - **Items** *(string)*
+              - **`confidence`** *(number)*: Optional. The confidence level for
+               the match, ranging from 0 to 10.
+          - **`page_content_patterns`** *(array)*: Patterns within the page
+           content that match specific technologies.
+            - **Items** *(object)*: Phrases or character sequences within page
+             content indicative of specific technology.
+              - **`key`** *(string)*: The name of the tag to find in the page
+               content.
+              - **`attribute`** *(string)*: Optional. The attribute of the tag
+               to match, e.g., 'src' for img tag etc. (use 'text' for text
+                content).
+              - **`value`** *(array)*: The pattern to match within the page
+               content's tag.
+                - **Items** *(string)*
+              - **`confidence`** *(number)*: Optional. The confidence level
+               for the detection, decimal number ranging from 0 to 10 (or
+                whatever is set in the detection_configuration).
+          - **`url_micro_signatures`** *(array)*: URL patterns indicative of
+           specific technologies.
+            - **Items** *(object)*: Micro-signatures in URLs that indicate a
+             specific technology, like '/wp-admin' for WordPress.
+              - **`value`** *(string)*: The micro-signature to match in the
+               URL.
+              - **`confidence`** *(number)*: Optional. The confidence level for
+               the match, decimal number ranging from 0 to 10 (or whatever is
+                set in the detection_configuration).
+      - **`crawling_rules`** *(array)*
+        - **Items** *(object)*
+          - **`rule_name`** *(string)*: A unique name identifying the crawling
+           rule.
+          - **`request_type`** *(string)*: The type of request to perform for
+           fuzzing. Must be one of: `['GET', 'POST']`.
+          - **`target_elements`** *(array)*: Specifies the elements to target
+           for fuzzing, including forms.
+            - **Items** *(object)*
+              - **`selector_type`** *(string)*: Must be one of: `['css',
+               'xpath', 'form']`.
+              - **`selector`** *(string)*: The actual selector or form name
+               used to find and interact with the target elements for fuzzing.
+          - **`fuzzing_parameters`** *(array)*: Defines the parameters to fuzz
+           and the strategy for generating fuzz values.
+            - **Items** *(object)*
+              - **`parameter_name`** *(string)*: Name of the parameter to fuzz.
+              - **`fuzzing_type`** *(string)*: The fuzzing strategy to use for
+               the parameter. Must be one of: `['fixed_list',
+                'pattern_based']`.
+              - **`values`** *(array)*: List of values to use for fuzzing,
+               applicable if 'fuzzing_type' is 'fixed_list'.
+                - **Items** *(string)*
+              - **`pattern`** *(string)*: A pattern to generate fuzzing values,
+               applicable if 'fuzzing_type' is 'pattern_based'.
+      - **`environment_settings`** *(object)*: Custom settings for the
+       WebDriver environment.
+        - **`headless_mode`** *(boolean)*: Specifies if the WebDriver should
+         operate in headless mode.
+        - **`custom_browser_options`** *(object)*: Custom options for browser
+         instances, such as proxies or window size.
+      - **`logging_configuration`** *(object)*: Configuration for logging and
+       monitoring rule execution.
+        - **`log_level`** *(string)*: Specifies the logging level for actions
+         and scraping activities. Must be one of: `['DEBUG', 'INFO',
+         'WARNING', 'ERROR', 'CRITICAL']`.
+        - **`log_file`** *(string)*: Optional. The file path to store logs if
+         file logging is desired.
diff --git a/doc/ruleset_architecture.md b/doc/ruleset_architecture.md
@@ -96,3 +96,8 @@ The CROWler supports the following rule types:
 
 Conditions are the criteria that must be met for the rule to be executed.
 Each rule type may present different types of conditions.
+
+## Ruleset Reference
+
+See the [ruleset reference](ruleset-reference.md) for a detailed description of
+the ruleset schema.
diff --git a/pkg/httpinfo/httpinfo.go b/pkg/httpinfo/httpinfo.go
@@ -287,19 +287,28 @@ func detectTechnologiesByKeyword(responseBody string, signatures *map[string][]r
 
 func detectTechBySignature(responseBody string, doc *goquery.Document, signature ruleset.PageContentSignature, sig string, detectedTech *map[string]float32) {
 	if signature.Key == "*" {
-		if strings.Contains(responseBody, signature.Signature) {
-			(*detectedTech)[sig] += signature.Confidence
-		}
+		detectTechBySignatureValue(responseBody, signature.Signature, sig, detectedTech, signature.Confidence)
 	} else {
 		doc.Find(signature.Key).Each(func(index int, htmlItem *goquery.Selection) {
-			text := htmlItem.Text()
-			if strings.Contains(text, signature.Signature) {
-				(*detectedTech)[sig] += signature.Confidence
+			var text string
+			if (signature.Attribute != "") && (signature.Attribute != "text") {
+				text = htmlItem.AttrOr(strings.ToLower(strings.TrimSpace(signature.Attribute)), "")
+			} else {
+				text = htmlItem.Text()
 			}
+			detectTechBySignatureValue(text, signature.Signature, sig, detectedTech, signature.Confidence)
 		})
 	}
 }
 
+func detectTechBySignatureValue(text string, signatures []string, sig string, detectedTech *map[string]float32, confidence float32) { // scores sig in detectedTech for each entry of signatures found in text
+	for _, sigValue := range signatures {
+		if strings.Contains(text, sigValue) { // case-sensitive substring match; NOTE(review): an empty sigValue always matches
+			(*detectedTech)[sig] += confidence // added once per matching value, so multiple hits accumulate
+		}
+	}
+}
+
 func detectTechByTag(header *http.Header, tagName string, cmsNames *map[string]map[string]ruleset.HTTPHeaderField, detectedTech *map[string]float32) {
 	hh := (*header)[tagName] // get the header value (header tag name is case sensitive)
 	tagName = strings.ToLower(tagName)

diff --git a/pkg/ruleset/common.go b/pkg/ruleset/common.go
@@ -368,3 +368,18 @@ func PreparePathForSearch(path string) (string, error) {
 	}
 	return strings.ToLower(strings.TrimSpace(path)), nil
 }
+
+// PrepareSlice returns a copy of slice; flag bit 01 trims each element's surrounding whitespace and flag bit 02 lowercases it.
+func PrepareSlice(slice []string, flags int) []string {
+	var prepared []string // stays nil (not an empty slice) when the input has no elements
+	for _, s := range slice {
+		if flags&01 == 01 { // bit 0 set: trim leading/trailing whitespace
+			s = strings.TrimSpace(s)
+		}
+		if flags&02 == 02 { // bit 1 set: lowercase (applied after the optional trim)
+			s = strings.ToLower(s)
+		}
+		prepared = append(prepared, s)
+	}
+	return prepared
+}
diff --git a/pkg/ruleset/detectionrule.go b/pkg/ruleset/detectionrule.go
@@ -48,7 +48,7 @@ func (d *DetectionRule) GetAllPageContentPatterns() []PageContentSignature {
 	for _, pattern := range d.PageContentPatterns {
 		trimmedPatterns = append(trimmedPatterns, PageContentSignature{
 			Key:        strings.TrimSpace(pattern.Key),
-			Signature:  strings.TrimSpace(pattern.Signature),
+			Signature:  PrepareSlice(pattern.Signature, 1), // flag = 1 only trim spaces
 			Confidence: pattern.Confidence,
 		},
 		)

diff --git a/pkg/ruleset/types.go b/pkg/ruleset/types.go
@@ -139,7 +139,7 @@ type DetectionRule struct {
 // HTTPHeaderField represents a pattern for matching HTTP header fields
 type HTTPHeaderField struct {
 	Key        string   `yaml:"key"`
-	Value      []string `yaml:"value"`
+	Value      []string `yaml:"value,omitempty"`
 	Confidence float32  `yaml:"confidence"`
 }
 
@@ -151,9 +151,10 @@ type URLMicroSignature struct {
 
 // PageContent micro-signatures are patterns that can be found in the page content
 type PageContentSignature struct {
-	Key        string  `yaml:"key"`
-	Signature  string  `yaml:"value"`
-	Confidence float32 `yaml:"confidence"`
+	Key        string   `yaml:"key"`
+	Attribute  string   `yaml:"attribute,omitempty"`
+	Signature  []string `yaml:"value,omitempty"`
+	Confidence float32  `yaml:"confidence"`
 }
 
 // MetaTag represents a pattern for matching HTML meta tags

diff --git a/schemas/ruleset-schema.json b/schemas/ruleset-schema.json
@@ -1,7 +1,7 @@
 {
     "$schema": "http://json-schema.org/draft-07/schema#",
     "type": "object",
-    "description": "Schema for a file containing multiple rulesets, each with its own configuration for web scraping and automation tasks.",
+    "description": "The CROWler ruleset schema defines the structure of a ruleset file, which contains rules for scraping, action execution, detection, and crawling.",
     "items": {
         "type": "object",
         "properties": {
@@ -407,8 +407,15 @@
                                                     "type": "string",
                                                     "description": "The name of the tag to find in the page content."
                                                 },
-                                                "value": {
+                                                "attribute": {
                                                     "type": "string",
+                                                    "description": "Optional. The attribute of the tag to match, e.g., 'src' for img tag etc. (use 'text' for text content)."
+                                                },
+                                                "value": {
+                                                    "type": "array",
+                                                    "items": {
+                                                        "type": "string"
+                                                    },
                                                     "description": "The pattern to match within the page content's tag."
                                                 },
                                                 "confidence": {

diff --git a/schemas/ruleset-schema.yaml b/schemas/ruleset-schema.yaml
@@ -3,7 +3,7 @@
 ---
 $schema: http://json-schema.org/draft-07/schema#
 type: object
-description: Schema for a file containing multiple rulesets, each with its own configuration for web scraping and automation tasks.
+description: The CROWler ruleset schema defines the structure of a ruleset file, which contains rules for scraping, action execution, detection, and crawling.
 items:
   type: object
   properties:
@@ -310,8 +310,13 @@ items:
                       key:
                         type: string
                         description: The name of the tag to find in the page content.
-                      value:
+                      attribute:
                         type: string
+                        description: Optional. The attribute of the tag to match, e.g., 'src' for img tag etc. (use 'text' for text content).
+                      value:
+                        type: array
+                        items:
+                          type: string
                         description: The pattern to match within the page content's tag.
                       confidence:
                         type: number