Merge pull request #137 from pzaino/develop

More ruleset support code ported from the Rust version
pzaino · Mar 9, 2024 · b6413ec · b6413ec
2 parents 6848788 + 725da9e
commit b6413ec
Show file tree

Hide file tree

Showing 7 changed files with 704 additions and 376 deletions.
diff --git a/pkg/config/types.go b/pkg/config/types.go
@@ -15,6 +15,8 @@
 // Package config contains the configuration file parsing logic.
 package config
 
+import "time"
+
 // FileStorageAPI is a generic File Storage API configuration
 type FileStorageAPI struct {
 	Host    string `yaml:"host"`    // Hostname of the API server
@@ -186,3 +188,31 @@ type Config struct {
 	OS         string `yaml:"os"`          // Operating system name
 	DebugLevel int    `yaml:"debug_level"` // Debug level for logging
 }
+
+//// ----------- Source Config ------------ ////
+
+// SourceConfig is the top-level, JSON-tagged description of a source: format
+// version, author, creation time, free-text description, source name, the
+// crawling configuration and the list of execution plan items.
+type SourceConfig struct {
+	FormatVersion  string              `json:"format_version"`
+	Author         string              `json:"author"`
+	CreatedAt      time.Time           `json:"created_at"`
+	Description    string              `json:"description"`
+	SourceName     string              `json:"source_name"`
+	CrawlingConfig CrawlingConfig      `json:"crawling_config"`
+	ExecutionPlan  []ExecutionPlanItem `json:"execution_plan"`
+}
+
+// CrawlingConfig is the crawling section of a SourceConfig. It currently
+// carries a single field, the site (serialized as "site").
+type CrawlingConfig struct {
+	Site string `json:"site"`
+}
+
+// ExecutionPlanItem is one entry of a SourceConfig execution plan: a label,
+// the conditions attached to the entry, and the names of the rule groups and
+// rules it references. RuleGroups, Rules and AdditionalConditions are omitted
+// from the JSON output when empty.
+type ExecutionPlanItem struct {
+	Label                string                 `json:"label"`
+	Conditions           Condition              `json:"conditions"`
+	RuleGroups           []string               `json:"rule_groups,omitempty"`
+	Rules                []string               `json:"rules,omitempty"`
+	AdditionalConditions map[string]interface{} `json:"additional_conditions,omitempty"`
+}
+
+// Condition holds the conditions of an ExecutionPlanItem; at present that is
+// only a list of URL patterns (serialized as "url_patterns").
+// NOTE(review): how the patterns are matched (literal, glob, regexp) is not
+// visible here -- confirm against the code that consumes them.
+type Condition struct {
+	UrlPatterns []string `json:"url_patterns"`
+}
diff --git a/pkg/crawler/action_rules.go b/pkg/crawler/action_rules.go
@@ -0,0 +1,270 @@
+// Copyright 2023 Paolo Fabio Zaino
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package crawler implements the crawling logic of the application.
+// It's responsible for crawling a website and extracting information from it.
+package crawler
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	cmn "github.com/pzaino/thecrowler/pkg/common"
+	cfg "github.com/pzaino/thecrowler/pkg/config"
+	rules "github.com/pzaino/thecrowler/pkg/ruleset"
+	"github.com/tebeka/selenium"
+)
+
+// processActionRules decides which CROWler Action rules to run against the
+// page driven by wd. When the source has no configuration the rules are looked
+// up by url; when the configuration is the literal default marker the default
+// action rules run; any other configuration is only logged.
+func processActionRules(wd *selenium.WebDriver, ctx *processContext, url string) {
+	cmn.DebugMsg(cmn.DbgLvlDebug2, "Starting to search and process CROWler Action rules...")
+
+	// No source configuration: fall back to the URL based lookup.
+	if ctx.source.Config == nil {
+		cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler URL based Action rules...")
+		processURLRules(wd, ctx, url)
+		return
+	}
+
+	cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler configured Action rules...")
+	rawConfig := string(*ctx.source.Config)
+	if strings.TrimSpace(rawConfig) == "{\"config\":\"default\"}" {
+		runDefaultActionRules(wd, ctx)
+		return
+	}
+
+	// A non-default configuration is reported but no rule is executed for it here.
+	cmn.DebugMsg(cmn.DbgLvlDebug, "Configuration: %v", rawConfig)
+}
+
+// processURLRules asks the context's rule engine for the rules matching url
+// and executes their action rules. The ruleset lookup is tried first; the rule
+// group lookup is attempted only when the ruleset lookup returns an error.
+// Lookup errors are not reported.
+func processURLRules(wd *selenium.WebDriver, ctx *processContext, url string) {
+	ruleset, rsErr := ctx.re.GetRulesetByURL(url)
+	if rsErr == nil {
+		// A successful lookup ends here, whether or not a ruleset was returned.
+		if ruleset == nil {
+			return
+		}
+		cmn.DebugMsg(cmn.DbgLvlDebug, "Executing ruleset: %s", ruleset.Name)
+		executeActionRules(ruleset.GetActionRules(), wd)
+		return
+	}
+
+	group, rgErr := ctx.re.GetRuleGroupByURL(url)
+	if rgErr != nil || group == nil {
+		return
+	}
+	cmn.DebugMsg(cmn.DbgLvlDebug, "Executing rule group: %s", group.GroupName)
+	executeActionRules(group.GetActionRules(), wd)
+}
+
+// executeActionRules runs the given action rules in order. A rule that fails
+// is logged at error level and the remaining rules still run.
+func executeActionRules(rules []rules.ActionRule, wd *selenium.WebDriver) {
+	for i := range rules {
+		if err := executeActionRule(&rules[i], wd); err != nil {
+			cmn.DebugMsg(cmn.DbgLvlError, "Error executing action rule: %v", err)
+		}
+	}
+}
+
+// executeActionRule executes a single ActionRule
+// executeActionRule executes a single ActionRule. Its wait conditions run
+// first, in order, and the first failing one aborts the rule with that error.
+// The action is then dispatched on the trimmed, lower-cased ActionType; an
+// unknown type yields an error.
+func executeActionRule(r *rules.ActionRule, wd *selenium.WebDriver) error {
+	for _, cond := range r.WaitConditions {
+		if err := executeWaitCondition(&cond, wd); err != nil {
+			return err
+		}
+	}
+
+	actionType := strings.ToLower(strings.TrimSpace(r.ActionType))
+	switch actionType {
+	case "click":
+		return executeActionClick(r, wd)
+	case "scroll":
+		return executeActionScroll(r, wd)
+	case "input_text":
+		return executeActionInput(r, wd)
+	case "execute_javascript":
+		return executeActionJS(r, wd)
+	default:
+		return fmt.Errorf("action type not supported: %s", r.ActionType)
+	}
+}
+
+// executeWaitCondition is responsible for executing a "wait" condition
+// executeWaitCondition handles one wait condition, selected by the trimmed,
+// lower-cased ConditionType. "custom_js" runs the condition's CustomJS script
+// through the WebDriver and returns the script error, if any. "element" and
+// "delay" are accepted but perform no work in this function. Any other type
+// is reported as unsupported.
+func executeWaitCondition(r *rules.WaitCondition, wd *selenium.WebDriver) error {
+	conditionType := strings.ToLower(strings.TrimSpace(r.ConditionType))
+	switch conditionType {
+	case "element", "delay":
+		// Recognized, but nothing is executed for these types here.
+		return nil
+	case "custom_js":
+		_, err := (*wd).ExecuteScript(r.CustomJS, nil)
+		return err
+	}
+	return fmt.Errorf("wait condition not supported: %s", r.ConditionType)
+}
+
+// executeActionClick is responsible for executing a "click" action
+// executeActionClick executes a "click" action: it resolves the rule's
+// selectors to an element and clicks it. A failed lookup is only logged (at
+// Debug3 level) and is not returned as an error; when no element is available
+// the function returns nil.
+func executeActionClick(r *rules.ActionRule, wd *selenium.WebDriver) error {
+	elem, _, findErr := findElementBySelectorType(wd, r.Selectors)
+	if findErr != nil {
+		cmn.DebugMsg(cmn.DbgLvlDebug3, "No element '%v' found.", findErr)
+	}
+
+	// Nothing to click: a missing element is deliberately not an error.
+	if elem == nil {
+		return nil
+	}
+	return elem.Click()
+}
+
+// executeActionScroll is responsible for executing a "scroll" action
+// executeActionScroll executes a "scroll" action by running
+// window.scrollTo(0, <target>) in the page. The target is the rule's Value,
+// inserted verbatim into the script; when Value is empty the target is
+// document.body.scrollHeight.
+func executeActionScroll(r *rules.ActionRule, wd *selenium.WebDriver) error {
+	target := r.Value
+	if target == "" {
+		target = "document.body.scrollHeight"
+	}
+
+	_, err := (*wd).ExecuteScript(fmt.Sprintf("window.scrollTo(0, %s)", target), nil)
+	return err
+}
+
+// executeActionJS is responsible for executing a "execute_javascript" action
+// executeActionJS executes an "execute_javascript" action: the rule's Value is
+// passed to the WebDriver as the script to run, with no arguments. The script
+// result is discarded; only the error is returned.
+func executeActionJS(r *rules.ActionRule, wd *selenium.WebDriver) error {
+	driver := *wd
+	_, err := driver.ExecuteScript(r.Value, nil)
+	return err
+}
+
+// executeActionInput is responsible for executing an "input" action
+// executeActionInput executes an "input_text" action: it resolves the rule's
+// selectors to an element and sends keys to it. A failed lookup is only logged
+// (at Debug3 level); when no element is available the function returns nil.
+func executeActionInput(r *rules.ActionRule, wd *selenium.WebDriver) error {
+	elem, matched, findErr := findElementBySelectorType(wd, r.Selectors)
+	if findErr != nil {
+		cmn.DebugMsg(cmn.DbgLvlDebug3, "No element '%v' found.", findErr)
+	}
+
+	// Nothing to type into: a missing element is deliberately not an error.
+	if elem == nil {
+		return nil
+	}
+
+	// NOTE(review): the text typed is the matched selector's Attribute, while
+	// the scroll and JavaScript actions take their payload from r.Value --
+	// confirm this is the intended source of the input text.
+	return elem.SendKeys(matched.Attribute)
+}
+
+// findElementBySelectorType is responsible for finding an element in the WebDriver
+// using the appropriate selector type. It returns the first element found and an error.
+// findElementBySelectorType is responsible for finding an element in the WebDriver
+// using the appropriate selector type. The selectors are tried in order and the
+// search stops at the first one that yields an element.
+//
+// The selector type is compared after trimming surrounding whitespace and
+// lower-casing, matching how action and wait-condition types are handled.
+// Supported types: css, xpath, id, name, linktext, partiallinktext, tagname, class.
+//
+// Returns the element (nil when none was found), the last selector that was
+// tried, and the error produced for that selector. An unsupported selector type
+// produces an error for that selector instead of being silently skipped.
+// With an empty selectors list the result is (nil, zero Selector, nil).
+func findElementBySelectorType(wd *selenium.WebDriver, selectors []rules.Selector) (selenium.WebElement, rules.Selector, error) {
+	var (
+		wdf      selenium.WebElement
+		err      error
+		selector rules.Selector
+	)
+	for _, selector = range selectors {
+		// Reset the per-selector outcome so that a result left over from a
+		// previous selector is never reported together with this one.
+		wdf, err = nil, nil
+
+		var by string
+		switch strings.ToLower(strings.TrimSpace(selector.SelectorType)) {
+		case "css":
+			by = selenium.ByCSSSelector
+		case "xpath":
+			by = selenium.ByXPATH
+		case "id":
+			by = selenium.ByID
+		case "name":
+			by = selenium.ByName
+		case "linktext":
+			by = selenium.ByLinkText
+		case "partiallinktext":
+			by = selenium.ByPartialLinkText
+		case "tagname":
+			by = selenium.ByTagName
+		case "class":
+			by = selenium.ByClassName
+		default:
+			err = fmt.Errorf("selector type not supported: %s", selector.SelectorType)
+			continue
+		}
+
+		wdf, err = (*wd).FindElement(by, selector.Selector)
+		if err == nil && wdf != nil {
+			break
+		}
+	}
+
+	return wdf, selector, err
+}
+
+// DefaultActionConfig returns the built-in source configuration for url. The
+// configuration is stamped with the current time, uses url both as the crawled
+// site and as the only URL pattern, and contains a single execution plan item
+// that references the "ClickAcceptCookiesButton" rule.
+func DefaultActionConfig(url string) cfg.SourceConfig {
+	defaultPlan := cfg.ExecutionPlanItem{
+		Label:      "Default Execution Plan",
+		Conditions: cfg.Condition{UrlPatterns: []string{url}},
+		Rules:      []string{"ClickAcceptCookiesButton"},
+	}
+
+	return cfg.SourceConfig{
+		FormatVersion:  "1.0",
+		Author:         "Your Name",
+		CreatedAt:      time.Now(),
+		Description:    "Default configuration",
+		SourceName:     "Example Source",
+		CrawlingConfig: cfg.CrawlingConfig{Site: url},
+		ExecutionPlan:  []cfg.ExecutionPlanItem{defaultPlan},
+	}
+}
+
+// runDefaultActionRules executes the action rules named by the default
+// configuration for the page currently loaded in wd. Rules are looked up by
+// name in the context's rule engine; lookup and execution failures are logged
+// at error level and do not stop the remaining rules.
+func runDefaultActionRules(wd *selenium.WebDriver, ctx *processContext) {
+	cmn.DebugMsg(cmn.DbgLvlDebug, "Executing default action rules...")
+
+	// The default configuration is built for the current page URL; if the URL
+	// cannot be read, an empty URL is used instead.
+	currentURL, err := (*wd).CurrentURL()
+	if err != nil {
+		cmn.DebugMsg(cmn.DbgLvlError, "Error getting the current URL: %v", err)
+		currentURL = ""
+	}
+
+	for _, step := range DefaultActionConfig(currentURL).ExecutionPlan {
+		for _, name := range step.Rules {
+			if name == "" {
+				continue
+			}
+
+			rule, err := ctx.re.GetActionRuleByName(name)
+			if err != nil {
+				cmn.DebugMsg(cmn.DbgLvlError, "Error getting action rule: %v", err)
+				continue
+			}
+
+			if err := executeActionRule(rule, wd); err != nil {
+				cmn.DebugMsg(cmn.DbgLvlError, "Error executing action rule: %v", err)
+			}
+		}
+	}
+}