diff --git a/pkg/config/types.go b/pkg/config/types.go index 180ea459..16ab5411 100644 --- a/pkg/config/types.go +++ b/pkg/config/types.go @@ -15,6 +15,8 @@ // Package config contains the configuration file parsing logic. package config +import "time" + // FileStorageAPI is a generic File Storage API configuration type FileStorageAPI struct { Host string `yaml:"host"` // Hostname of the API server @@ -186,3 +188,31 @@ type Config struct { OS string `yaml:"os"` // Operating system name DebugLevel int `yaml:"debug_level"` // Debug level for logging } + +//// ----------- Source Config ------------ //// + +type SourceConfig struct { + FormatVersion string `json:"format_version"` + Author string `json:"author"` + CreatedAt time.Time `json:"created_at"` + Description string `json:"description"` + SourceName string `json:"source_name"` + CrawlingConfig CrawlingConfig `json:"crawling_config"` + ExecutionPlan []ExecutionPlanItem `json:"execution_plan"` +} + +type CrawlingConfig struct { + Site string `json:"site"` +} + +type ExecutionPlanItem struct { + Label string `json:"label"` + Conditions Condition `json:"conditions"` + RuleGroups []string `json:"rule_groups,omitempty"` + Rules []string `json:"rules,omitempty"` + AdditionalConditions map[string]interface{} `json:"additional_conditions,omitempty"` +} + +type Condition struct { + UrlPatterns []string `json:"url_patterns"` +} diff --git a/pkg/crawler/action_rules.go b/pkg/crawler/action_rules.go new file mode 100644 index 00000000..01f31d21 --- /dev/null +++ b/pkg/crawler/action_rules.go @@ -0,0 +1,270 @@ +// Copyright 2023 Paolo Fabio Zaino +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package crawler implements the crawling logic of the application. +// It's responsible for crawling a website and extracting information from it. 
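+// This file (action_rules.go) groups the helpers that execute CROWler Action rules
+// (click, scroll, input_text and execute_javascript) against the Selenium WebDriver
+// session; sources whose Config is the literal JSON {"config":"default"} fall back to
+// the default action rules built by DefaultActionConfig.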
+package crawler + +import ( + "fmt" + "strings" + "time" + + cmn "github.com/pzaino/thecrowler/pkg/common" + cfg "github.com/pzaino/thecrowler/pkg/config" + rules "github.com/pzaino/thecrowler/pkg/ruleset" + "github.com/tebeka/selenium" +) + +func processActionRules(wd *selenium.WebDriver, ctx *processContext, url string) { + cmn.DebugMsg(cmn.DbgLvlDebug2, "Starting to search and process CROWler Action rules...") + // Run Action Rules if any + if ctx.source.Config != nil { + // Execute the CROWler rules + cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler configured Action rules...") + // Execute the rules + if strings.TrimSpace(string((*ctx.source.Config))) == "{\"config\":\"default\"}" { + runDefaultActionRules(wd, ctx) + } else { + configStr := string((*ctx.source.Config)) + cmn.DebugMsg(cmn.DbgLvlDebug, "Configuration: %v", configStr) + } + } else { + // Check for rules based on the URL + cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler URL based Action rules...") + // If the URL matches a rule, execute it + processURLRules(wd, ctx, url) + } +} + +func processURLRules(wd *selenium.WebDriver, ctx *processContext, url string) { + rs, err := ctx.re.GetRulesetByURL(url) + if err == nil { + if rs != nil { + cmn.DebugMsg(cmn.DbgLvlDebug, "Executing ruleset: %s", rs.Name) + // Execute all the rules in the ruleset + executeActionRules(rs.GetActionRules(), wd) + } + } else { + rg, err := ctx.re.GetRuleGroupByURL(url) + if err == nil { + if rg != nil { + cmn.DebugMsg(cmn.DbgLvlDebug, "Executing rule group: %s", rg.GroupName) + // Execute all the rules in the rule group + executeActionRules(rg.GetActionRules(), wd) + } + } + } +} + +func executeActionRules(rules []rules.ActionRule, wd *selenium.WebDriver) { + for _, r := range rules { + // Execute the rule + err := executeActionRule(&r, wd) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, "Error executing action rule: %v", err) + } + } +} + +// executeActionRule executes a single ActionRule +func executeActionRule(r *rules.ActionRule, wd *selenium.WebDriver) error { + // Execute Wait condition first + if len(r.WaitConditions) != 0 { + for _, wc := range r.WaitConditions { + // Execute the wait condition + err := executeWaitCondition(&wc, wd) + if err != nil { + return err + } + } + } + // Execute the action based on the ActionType + switch strings.ToLower(strings.TrimSpace(r.ActionType)) { + case "click": + return executeActionClick(r, wd) + case "scroll": + return executeActionScroll(r, wd) + case "input_text": + return executeActionInput(r, wd) + case "execute_javascript": + return executeActionJS(r, wd) + } + + return fmt.Errorf("action type not supported: %s", r.ActionType) +} + +// executeWaitCondition is responsible for executing a "wait" condition +func executeWaitCondition(r *rules.WaitCondition, wd *selenium.WebDriver) error { + // Execute the wait condition + switch strings.ToLower(strings.TrimSpace(r.ConditionType)) { + case "element": + return nil + case "delay": + return nil + case "custom_js": + _, err := (*wd).ExecuteScript(r.CustomJS, nil) + return err + default: + return fmt.Errorf("wait condition not supported: %s", r.ConditionType) + } +} + +// executeActionClick is responsible for executing a "click" action +func executeActionClick(r *rules.ActionRule, wd *selenium.WebDriver) error { + // Find the element + wdf, _, err := findElementBySelectorType(wd, r.Selectors) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlDebug3, "No element '%v' found.", err) + err = nil + } + + // If the element is found, click it + if wdf != nil { + err := 
wdf.Click() + return err + } + return err +} + +// executeActionScroll is responsible for executing a "scroll" action +func executeActionScroll(r *rules.ActionRule, wd *selenium.WebDriver) error { + // Get Selectors list + value := r.Value + + // Get the attribute to scroll to + var attribute string + if value == "" { + attribute = "document.body.scrollHeight" + } else { + attribute = value + } + + // Use Sprintf to dynamically create the script string with the attribute value + script := fmt.Sprintf("window.scrollTo(0, %s)", attribute) + + // Scroll the page + _, err := (*wd).ExecuteScript(script, nil) + return err +} + +// executeActionJS is responsible for executing a "execute_javascript" action +func executeActionJS(r *rules.ActionRule, wd *selenium.WebDriver) error { + // Execute the JavaScript + _, err := (*wd).ExecuteScript(r.Value, nil) + return err +} + +// executeActionInput is responsible for executing an "input" action +func executeActionInput(r *rules.ActionRule, wd *selenium.WebDriver) error { + // Find the element + wdf, selector, err := findElementBySelectorType(wd, r.Selectors) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlDebug3, "No element '%v' found.", err) + err = nil + } + + // If the element is found, input the text + if wdf != nil { + err = wdf.SendKeys(selector.Attribute) + } + return err +} + +// findElementBySelectorType is responsible for finding an element in the WebDriver +// using the appropriate selector type. It returns the first element found and an error. +func findElementBySelectorType(wd *selenium.WebDriver, selectors []rules.Selector) (selenium.WebElement, rules.Selector, error) { + var wdf selenium.WebElement = nil + var err error + var selector rules.Selector + for _, selector = range selectors { + switch selector.SelectorType { + case "css": + wdf, err = (*wd).FindElement(selenium.ByCSSSelector, selector.Selector) + case "xpath": + wdf, err = (*wd).FindElement(selenium.ByXPATH, selector.Selector) + case "id": + wdf, err = (*wd).FindElement(selenium.ByID, selector.Selector) + case "name": + wdf, err = (*wd).FindElement(selenium.ByName, selector.Selector) + case "linktext": + wdf, err = (*wd).FindElement(selenium.ByLinkText, selector.Selector) + case "partiallinktext": + wdf, err = (*wd).FindElement(selenium.ByPartialLinkText, selector.Selector) + case "tagname": + wdf, err = (*wd).FindElement(selenium.ByTagName, selector.Selector) + case "class": + wdf, err = (*wd).FindElement(selenium.ByClassName, selector.Selector) + } + if err == nil && wdf != nil { + break + } + } + + return wdf, selector, err +} + +func DefaultActionConfig(url string) cfg.SourceConfig { + return cfg.SourceConfig{ + FormatVersion: "1.0", + Author: "Your Name", + CreatedAt: time.Now(), + Description: "Default configuration", + SourceName: "Example Source", + CrawlingConfig: cfg.CrawlingConfig{ + Site: url, + }, + ExecutionPlan: []cfg.ExecutionPlanItem{ + { + Label: "Default Execution Plan", + Conditions: cfg.Condition{ + UrlPatterns: []string{url}, + }, + Rules: []string{"ClickAcceptCookiesButton"}, + }, + }, + } +} + +func runDefaultActionRules(wd *selenium.WebDriver, ctx *processContext) { + // Execute the default scraping rules + cmn.DebugMsg(cmn.DbgLvlDebug, "Executing default action rules...") + + // Get the default scraping rules + url, err := (*wd).CurrentURL() + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, "Error getting the current URL: %v", err) + url = "" + } + rs := DefaultActionConfig(url) + // Execute all the rules in the ruleset + for _, r := range rs.ExecutionPlan 
{ + // Get the rule + for _, ruleName := range r.Rules { + if ruleName == "" { + continue + } + rule, err := ctx.re.GetActionRuleByName(ruleName) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, "Error getting action rule: %v", err) + } else { + // Execute the rule + err := executeActionRule(rule, wd) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, "Error executing action rule: %v", err) + } + } + } + } +} diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go index db6df048..c82c3391 100644 --- a/pkg/crawler/crawler.go +++ b/pkg/crawler/crawler.go @@ -855,180 +855,6 @@ func getURLContent(url string, wd selenium.WebDriver, level int, ctx *processCon return wd, err0 } -func processActionRules(wd *selenium.WebDriver, ctx *processContext, url string) { - // Run Action Rules if any - if ctx.source.Config != nil { - // Execute the CROWler rules - cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler rules...") - // Execute the rules - } else { - // Check for rules based on the URL - // If the URL matches a rule, execute it - rs, err := ctx.re.GetRulesetByURL(url) - if err == nil { - if rs != nil { - // Execute all the rules in the ruleset - for _, r := range rs.GetActionRules() { - // Execute the rule - err := executeActionRule(&r, wd) - if err != nil { - cmn.DebugMsg(cmn.DbgLvlError, "Error executing action rule: %v", err) - } - } - } - } - rg, err := ctx.re.GetRuleGroupByURL(url) - if err == nil { - if rg != nil { - // Execute all the rules in the rule group - for _, r := range rg.GetActionRules() { - // Execute the rule - err := executeActionRule(&r, wd) - if err != nil { - cmn.DebugMsg(cmn.DbgLvlError, "Error executing action rule: %v", err) - } - } - } - } - } -} - -// executeActionRule executes a single ActionRule -func executeActionRule(r *rules.ActionRule, wd *selenium.WebDriver) error { - // Execute Wait condition first - if len(r.WaitConditions) != 0 { - // Execute the wait condition - err := executeWaitCondition(r, wd) - if err != nil { - return err - } - } - // Execute the action based on the ActionType - switch strings.ToLower(strings.TrimSpace(r.ActionType)) { - case "click": - return executeActionClick(r, wd) - case "scroll": - return executeActionScroll(r, wd) - case "input_text": - return executeActionInput(r, wd) - case "execute_javascript": - return executeActionJS(r, wd) - } - - return fmt.Errorf("action type not supported: %s", r.ActionType) -} - -// executeWaitCondition is responsible for executing a "wait" condition -func executeWaitCondition(r *rules.ActionRule, wd *selenium.WebDriver) error { - for _, wc := range r.WaitConditions { - // Execute the wait condition - switch strings.ToLower(strings.TrimSpace(wc.ConditionType)) { - case "element": - return nil - case "delay": - return nil - case "custom_js": - _, err := (*wd).ExecuteScript(wc.CustomJS, nil) - return err - default: - return fmt.Errorf("wait condition not supported: %s", wc.ConditionType) - } - } - return nil -} - -// executeActionClick is responsible for executing a "click" action -func executeActionClick(r *rules.ActionRule, wd *selenium.WebDriver) error { - // Find the element - wdf, _, err := findElementBySelectorType(wd, r.Selectors) - if err != nil { - return err - } - - // If the element is found, click it - if wdf != nil { - err := wdf.Click() - return err - } - return err -} - -// executeActionScroll is responsible for executing a "scroll" action -func executeActionScroll(r *rules.ActionRule, wd *selenium.WebDriver) error { - // Get Selectors list - value := r.Value - - // Get the attribute to 
scroll to - var attribute string - if value == "" { - attribute = "document.body.scrollHeight" - } else { - attribute = value - } - - // Use Sprintf to dynamically create the script string with the attribute value - script := fmt.Sprintf("window.scrollTo(0, %s)", attribute) - - // Scroll the page - _, err := (*wd).ExecuteScript(script, nil) - return err -} - -// executeActionJS is responsible for executing a "execute_javascript" action -func executeActionJS(r *rules.ActionRule, wd *selenium.WebDriver) error { - // Execute the JavaScript - _, err := (*wd).ExecuteScript(r.Value, nil) - return err -} - -// executeActionInput is responsible for executing an "input" action -func executeActionInput(r *rules.ActionRule, wd *selenium.WebDriver) error { - // Find the element - wdf, selector, err := findElementBySelectorType(wd, r.Selectors) - if err != nil { - return err - } - - // If the element is found, input the text - if wdf != nil { - err = wdf.SendKeys(selector.Attribute) - } - return err -} - -// findElementBySelectorType is responsible for finding an element in the WebDriver -// using the appropriate selector type. It returns the first element found and an error. -func findElementBySelectorType(wd *selenium.WebDriver, selectors []rules.Selector) (selenium.WebElement, rules.Selector, error) { - var wdf selenium.WebElement = nil - var err error - var selector rules.Selector - for _, selector = range selectors { - switch selector.SelectorType { - case "css": - wdf, err = (*wd).FindElement(selenium.ByCSSSelector, selector.Selector) - case "xpath": - wdf, err = (*wd).FindElement(selenium.ByXPATH, selector.Selector) - case "id": - wdf, err = (*wd).FindElement(selenium.ByID, selector.Selector) - case "name": - wdf, err = (*wd).FindElement(selenium.ByName, selector.Selector) - case "linktext": - wdf, err = (*wd).FindElement(selenium.ByLinkText, selector.Selector) - case "partiallinktext": - wdf, err = (*wd).FindElement(selenium.ByPartialLinkText, selector.Selector) - case "tagname": - wdf, err = (*wd).FindElement(selenium.ByTagName, selector.Selector) - case "class": - wdf, err = (*wd).FindElement(selenium.ByClassName, selector.Selector) - } - if err == nil && wdf != nil { - break - } - } - - return wdf, selector, err -} - // extractPageInfo is responsible for extracting information from a collected page. // In the future we may want to expand this function to extract more information // from the page, such as images, videos, etc. and do a better job at screen scraping. 
@@ -1041,15 +867,11 @@ func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo {
 	}
 
 	// Run scraping rules if any
-	if ctx.source.Config != nil {
-		// Execute the CROWler rules
-		cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler rules...")
-		/*
-			PageInfo.ScrapedData, err = rules.ExecuteScrapingRules(ctx.source.Config, ctx.wd)
-			if err != nil {
-				cmn.DebugMsg(cmn.DbgLvlError, "Error executing CROWler rules: %v", err)
-			}
-		*/
+	var scrapedData string
+	var url string
+	url, err = webPage.CurrentURL()
+	if err == nil {
+		scrapedData = processScrapingRules(&webPage, ctx, url)
 	}
 
 	title, _ := webPage.Title()
@@ -1072,6 +894,7 @@ func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo {
 		MetaTags:     metaTags,
 		DetectedLang: detectLang(webPage),
 		DetectedType: inferDocumentType(currentURL),
+		ScrapedData:  scrapedData,
 	}
 }
 
diff --git a/pkg/crawler/scraping_rules.go b/pkg/crawler/scraping_rules.go
new file mode 100644
index 00000000..3d61a28b
--- /dev/null
+++ b/pkg/crawler/scraping_rules.go
@@ -0,0 +1,201 @@
+// Copyright 2023 Paolo Fabio Zaino
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package crawler implements the crawling logic of the application.
+// It's responsible for crawling a website and extracting information from it.
+package crawler
+
+/////////
+// This file is used as a wrapper to the scraper package, to avoid circular dependencies.
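+// It looks up the scraping rules that apply to the current page (either from the
+// source configuration or by matching the URL against the loaded rulesets), applies
+// them through scraper.ApplyRule, and returns the extracted data as JSON documents.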
+///////// + +import ( + "encoding/json" + "fmt" + "strings" + "time" + + cmn "github.com/pzaino/thecrowler/pkg/common" + cfg "github.com/pzaino/thecrowler/pkg/config" + rules "github.com/pzaino/thecrowler/pkg/ruleset" + scraper "github.com/pzaino/thecrowler/pkg/scraper" + + "github.com/tebeka/selenium" +) + +const ( + errExecutingScraping = "error executing scraping rule: %v" +) + +// processScrapingRules processes the scraping rules +func processScrapingRules(wd *selenium.WebDriver, ctx *processContext, url string) string { + cmn.DebugMsg(cmn.DbgLvlDebug2, "Starting to search and process CROWler Scraping rules...") + + scrapedDataDoc := "" + + // Run Scraping Rules if any + if ctx.source.Config != nil { + // Execute the CROWler rules + cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler configured Scraping rules...") + // Execute the rules + if strings.TrimSpace(string((*ctx.source.Config))) == "{\"config\":\"default\"}" { + runDefaultScrapingRules(wd, ctx) + } else { + configStr := string((*ctx.source.Config)) + cmn.DebugMsg(cmn.DbgLvlDebug, "Configuration: %v", configStr) + } + } else { + // Check for rules based on the URL + cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler URL based Scraping rules...") + // If the URL matches a rule, execute it + scrapedDataDoc += executeScrapingRulesByURL(wd, ctx, url) + } + + return scrapedDataDoc +} + +func executeScrapingRulesByURL(wd *selenium.WebDriver, ctx *processContext, url string) string { + scrapedDataDoc := "" + + rs, err := ctx.re.GetRulesetByURL(url) + if err == nil && rs != nil { + // Execute all the rules in the ruleset + scrapedDataDoc += executeScrapingRulesInRuleset(rs, wd) + } else { + rg, err := ctx.re.GetRuleGroupByURL(url) + if err == nil && rg != nil { + // Execute all the rules in the rule group + scrapedDataDoc += executeScrapingRulesInRuleGroup(rg, wd) + } + } + + return scrapedDataDoc +} + +func executeScrapingRulesInRuleset(rs *rules.Ruleset, wd *selenium.WebDriver) string { + scrapedDataDoc := "" + for _, r := range rs.GetScrapingRules() { + // Execute the rule + scrapedData, err := executeScrapingRule(&r, wd) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, errExecutingScraping, err) + } + scrapedDataDoc += scrapedData + } + return scrapedDataDoc +} + +func executeScrapingRulesInRuleGroup(rg *rules.RuleGroup, wd *selenium.WebDriver) string { + scrapedDataDoc := "" + for _, r := range rg.GetScrapingRules() { + // Execute the rule + scrapedData, err := executeScrapingRule(&r, wd) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, errExecutingScraping, err) + } + scrapedDataDoc += scrapedData + } + return scrapedDataDoc +} + +// executeScrapingRule executes a single ScrapingRule +func executeScrapingRule(r *rules.ScrapingRule, wd *selenium.WebDriver) (string, error) { + // Execute Wait condition first + if len(r.WaitConditions) != 0 { + for _, wc := range r.WaitConditions { + err := executeWaitCondition(&wc, wd) + if err != nil { + return "", fmt.Errorf("error executing wait condition: %v", err) + } + } + } + + // Execute the scraping rule + extractedData := scraper.ApplyRule(r, wd) + + // Transform the extracted data into a JSON document + jsonData, err := json.Marshal(extractedData) + if err != nil { + return "", fmt.Errorf("error marshalling JSON: %v", err) + } + + // Convert bytes to string to get a JSON document + jsonDocument := string(jsonData) + + return jsonDocument, nil +} + +func DefaultCrawlingConfig(url string) cfg.SourceConfig { + return cfg.SourceConfig{ + FormatVersion: "1.0", + Author: "Your Name", + 
CreatedAt: time.Now(), + Description: "Default configuration", + SourceName: "Example Source", + CrawlingConfig: cfg.CrawlingConfig{ + Site: url, + }, + ExecutionPlan: []cfg.ExecutionPlanItem{ + { + Label: "Default Execution Plan", + Conditions: cfg.Condition{ + UrlPatterns: []string{url}, + }, + Rules: []string{""}, + }, + }, + } +} + +func runDefaultScrapingRules(wd *selenium.WebDriver, ctx *processContext) { + // Execute the default scraping rules + cmn.DebugMsg(cmn.DbgLvlDebug, "Executing default scraping rules...") + + // Get the default scraping rules + url, err := (*wd).CurrentURL() + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, "Error getting the current URL: %v", err) + url = "" + } + rs := DefaultCrawlingConfig(url) + // Execute all the rules in the ruleset + var scrapedDataDoc string + for _, r := range rs.ExecutionPlan { + scrapedDataDoc += executeRulesInExecutionPlan(r, wd, ctx) + } +} + +func executeRulesInExecutionPlan(epi cfg.ExecutionPlanItem, wd *selenium.WebDriver, ctx *processContext) string { + var scrapedDataDoc string + // Get the rule + for _, ruleName := range epi.Rules { + if ruleName == "" { + continue + } + rule, err := ctx.re.GetScrapingRuleByName(ruleName) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, "Error getting scraping rule: %v", err) + } else { + // Execute the rule + scrapedData, err := executeScrapingRule(rule, wd) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, errExecutingScraping, err) + } else { + scrapedDataDoc += scrapedData + cmn.DebugMsg(cmn.DbgLvlDebug3, "Scraped data: %v", scrapedDataDoc) + } + } + } + return scrapedDataDoc +} diff --git a/pkg/ruleset/ruleset.go b/pkg/ruleset/ruleset.go index 10fa66f3..182a39b5 100644 --- a/pkg/ruleset/ruleset.go +++ b/pkg/ruleset/ruleset.go @@ -30,6 +30,17 @@ import ( "gopkg.in/yaml.v2" ) +const ( + errRulesetNotFound = "ruleset not found" + errRuleGroupNotFound = "rule group not found" + errEmptyPath = "empty path provided" + errEmptyURL = "empty URL provided" + errParsingURL = "error parsing URL: %s" + errEmptyName = "empty name provided" + errActionNotFound = "action rule not found" + errScrapingNotFound = "scraping rule not found" +) + // UnmarshalYAML parses date strings from the YAML file. func (ct *CustomTime) UnmarshalYAML(unmarshal func(interface{}) error) error { var dateStr string @@ -178,7 +189,7 @@ func (re *RuleEngine) GetRuleGroup(name string) (RuleGroup, error) { } } } - return RuleGroup{}, fmt.Errorf("rule group not found") + return RuleGroup{}, fmt.Errorf(errRuleGroupNotFound) } // GetRuleset returns a ruleset with the specified name. @@ -188,7 +199,7 @@ func (re *RuleEngine) GetRuleset(name string) (Ruleset, error) { return rs, nil } } - return Ruleset{}, fmt.Errorf("ruleset not found") + return Ruleset{}, fmt.Errorf(errRulesetNotFound) } // LoadRulesFromConfig loads the rules from the configuration file and returns a pointer to the created RuleEngine. @@ -206,7 +217,7 @@ func (re *RuleEngine) LoadRulesFromConfig(config *cfg.Config) error { // loadRulesFromConfig loads the rules from the configuration file and returns a pointer to the created RuleEngine. 
func loadRulesFromConfig(config cfg.Ruleset) (*[]Ruleset, error) { if config.Path == nil { - return nil, fmt.Errorf("empty path provided") + return nil, fmt.Errorf(errEmptyPath) } if config.Host == "" { // Rules are stored locally @@ -271,30 +282,30 @@ func (re *RuleEngine) CountRules() int { func (re *RuleEngine) GetRulesetByURL(urlStr string) (*Ruleset, error) { // Validate URL if urlStr == "" { - return nil, fmt.Errorf("empty URL provided") + return nil, fmt.Errorf(errEmptyURL) } _, err := url.Parse(urlStr) if err != nil { - return nil, fmt.Errorf("error parsing URL: %s", err) + return nil, fmt.Errorf(errParsingURL, err) } parsedURL := strings.ToLower(strings.TrimSpace(urlStr)) for _, rs := range re.Rulesets { - if strings.ToLower(strings.TrimSpace(rs.Name)) == parsedURL { + if strings.ToLower(strings.TrimSpace(rs.Name)) == parsedURL || strings.ToLower(strings.TrimSpace(rs.Name)) == "*" { return &rs, nil } } - return nil, fmt.Errorf("ruleset not found") + return nil, fmt.Errorf(errRulesetNotFound) } // GetRulesGroupByURL returns the rules group for the specified URL. func (re *RuleEngine) GetRuleGroupByURL(urlStr string) (*RuleGroup, error) { // Validate URL if urlStr == "" { - return nil, fmt.Errorf("empty URL provided") + return nil, fmt.Errorf(errEmptyURL) } _, err := url.Parse(urlStr) if err != nil { - return nil, fmt.Errorf("error parsing URL: %s", err) + return nil, fmt.Errorf(errParsingURL, err) } parsedURL := strings.ToLower(strings.TrimSpace(urlStr)) for _, rs := range re.Rulesets { @@ -306,14 +317,14 @@ func (re *RuleEngine) GetRuleGroupByURL(urlStr string) (*RuleGroup, error) { } } } - return nil, fmt.Errorf("rule group not found") + return nil, fmt.Errorf(errRuleGroupNotFound) } // GetRulesGroupByName returns the rules group for the specified name. func (re *RuleEngine) GetRuleGroupByName(name string) (*RuleGroup, error) { // Validate name if name == "" { - return nil, fmt.Errorf("empty name provided") + return nil, fmt.Errorf(errEmptyName) } parsedName := strings.ToLower(strings.TrimSpace(name)) for _, rs := range re.Rulesets { @@ -325,14 +336,14 @@ func (re *RuleEngine) GetRuleGroupByName(name string) (*RuleGroup, error) { } } } - return nil, fmt.Errorf("rule group not found") + return nil, fmt.Errorf(errRuleGroupNotFound) } // GetRulesetByName returns the ruleset for the specified name. func (re *RuleEngine) GetRulesetByName(name string) (*Ruleset, error) { // Validate name if name == "" { - return nil, fmt.Errorf("empty name provided") + return nil, fmt.Errorf(errEmptyName) } parsedName := strings.ToLower(strings.TrimSpace(name)) for _, rs := range re.Rulesets { @@ -340,7 +351,88 @@ func (re *RuleEngine) GetRulesetByName(name string) (*Ruleset, error) { return &rs, nil } } - return nil, fmt.Errorf("ruleset not found") + return nil, fmt.Errorf(errRulesetNotFound) +} + +// GetActionRuleByName returns the action rule with the specified name. +func (re *RuleEngine) GetActionRuleByName(name string) (*ActionRule, error) { + // Validate name + if name == "" { + return nil, fmt.Errorf(errEmptyName) + } + parsedName := strings.ToLower(strings.TrimSpace(name)) + for _, rs := range re.Rulesets { + for _, rg := range rs.RuleGroups { + for _, r := range rg.ActionRules { + //cmn.DebugMsg(cmn.DbgLvlDebug2, "Checking rule: '%s' == '%s'", r.RuleName, parsedName) + if strings.ToLower(strings.TrimSpace(r.RuleName)) == parsedName { + return &r, nil + } + } + } + } + return nil, fmt.Errorf(errActionNotFound) +} + +// GetActionRuleByURL returns the action rule for the specified URL. 
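+// The match is case-insensitive and is performed against the rule's URL field after
+// trimming surrounding whitespace.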
+func (re *RuleEngine) GetActionRuleByURL(urlStr string) (*ActionRule, error) { + // Validate URL + if urlStr == "" { + return nil, fmt.Errorf(errEmptyURL) + } + _, err := url.Parse(urlStr) + if err != nil { + return nil, fmt.Errorf(errParsingURL, err) + } + parsedURL := strings.ToLower(strings.TrimSpace(urlStr)) + for _, rs := range re.Rulesets { + for _, rg := range rs.RuleGroups { + for _, r := range rg.ActionRules { + if strings.ToLower(strings.TrimSpace(r.URL)) == parsedURL { + return &r, nil + } + } + } + } + return nil, fmt.Errorf(errActionNotFound) +} + +// GetScrapingRuleByName returns the scraping rule with the specified name. +func (re *RuleEngine) GetScrapingRuleByName(name string) (*ScrapingRule, error) { + // Validate name + if name == "" { + return nil, fmt.Errorf(errEmptyName) + } + parsedName := strings.ToLower(strings.TrimSpace(name)) + for _, rs := range re.Rulesets { + for _, rg := range rs.RuleGroups { + for _, r := range rg.ScrapingRules { + if strings.ToLower(strings.TrimSpace(r.RuleName)) == parsedName { + return &r, nil + } + } + } + } + return nil, fmt.Errorf(errScrapingNotFound) +} + +// GetScrapingRuleByPath returns the scraping rule for the specified path. +func (re *RuleEngine) GetScrapingRuleByPath(path string) (*ScrapingRule, error) { + // Validate path + if path == "" { + return nil, fmt.Errorf(errEmptyPath) + } + parsedPath := strings.ToLower(strings.TrimSpace(path)) + for _, rs := range re.Rulesets { + for _, rg := range rs.RuleGroups { + for _, r := range rg.ScrapingRules { + if strings.ToLower(strings.TrimSpace(r.Path)) == parsedPath { + return &r, nil + } + } + } + } + return nil, fmt.Errorf(errScrapingNotFound) } ///// ------------------------ RULESET ---------------------------------- ///// @@ -358,7 +450,7 @@ func (rs *Ruleset) GetActionRules() []ActionRule { func (rs *Ruleset) GetActionRuleByName(name string) (ActionRule, error) { // Validate name if name == "" { - return ActionRule{}, fmt.Errorf("empty name provided") + return ActionRule{}, fmt.Errorf(errEmptyName) } // prepare name @@ -370,18 +462,18 @@ func (rs *Ruleset) GetActionRuleByName(name string) (ActionRule, error) { } } } - return ActionRule{}, fmt.Errorf("action rule not found") + return ActionRule{}, fmt.Errorf(errActionNotFound) } // GetActionRuleByURL returns the action rule for the specified URL. func (rs *Ruleset) GetActionRuleByURL(urlStr string) (ActionRule, error) { // Validate URL if urlStr == "" { - return ActionRule{}, fmt.Errorf("empty URL provided") + return ActionRule{}, fmt.Errorf(errEmptyURL) } _, err := url.Parse(urlStr) if err != nil { - return ActionRule{}, fmt.Errorf("error parsing URL: %s", err) + return ActionRule{}, fmt.Errorf(errParsingURL, err) } parsedURL := strings.ToLower(strings.TrimSpace(urlStr)) for _, rg := range rs.RuleGroups { @@ -391,7 +483,7 @@ func (rs *Ruleset) GetActionRuleByURL(urlStr string) (ActionRule, error) { } } } - return ActionRule{}, fmt.Errorf("action rule not found") + return ActionRule{}, fmt.Errorf(errActionNotFound) } // GetRuleGroups returns all the rule groups in a Ruleset. 
@@ -403,7 +495,7 @@ func (rs *Ruleset) GetRuleGroups() []RuleGroup { func (rs *Ruleset) GetRuleGroupByName(name string) (RuleGroup, error) { // Validate name if name == "" { - return RuleGroup{}, fmt.Errorf("empty name provided") + return RuleGroup{}, fmt.Errorf(errEmptyName) } // prepare name @@ -416,18 +508,18 @@ func (rs *Ruleset) GetRuleGroupByName(name string) (RuleGroup, error) { return rg, nil } } - return RuleGroup{}, fmt.Errorf("rule group not found") + return RuleGroup{}, fmt.Errorf(errRuleGroupNotFound) } // GetRuleGroupByURL returns the rule group for the specified URL. func (rs *Ruleset) GetRuleGroupByURL(urlStr string) (RuleGroup, error) { // Validate URL if urlStr == "" { - return RuleGroup{}, fmt.Errorf("empty URL provided") + return RuleGroup{}, fmt.Errorf(errEmptyURL) } _, err := url.Parse(urlStr) if err != nil { - return RuleGroup{}, fmt.Errorf("error parsing URL: %s", err) + return RuleGroup{}, fmt.Errorf(errParsingURL, err) } parsedURL := strings.ToLower(strings.TrimSpace(urlStr)) for _, rg := range rs.RuleGroups { @@ -438,7 +530,7 @@ func (rs *Ruleset) GetRuleGroupByURL(urlStr string) (RuleGroup, error) { return rg, nil } } - return RuleGroup{}, fmt.Errorf("rule group not found") + return RuleGroup{}, fmt.Errorf(errRuleGroupNotFound) } // GetScrapingRules returns all the scraping rules in a Ruleset. @@ -454,7 +546,7 @@ func (rs *Ruleset) GetScrapingRules() []ScrapingRule { func (rs *Ruleset) GetScrapingRuleByName(name string) (ScrapingRule, error) { // Validate name if name == "" { - return ScrapingRule{}, fmt.Errorf("empty name provided") + return ScrapingRule{}, fmt.Errorf(errEmptyName) } // prepare name @@ -466,14 +558,14 @@ func (rs *Ruleset) GetScrapingRuleByName(name string) (ScrapingRule, error) { } } } - return ScrapingRule{}, fmt.Errorf("scraping rule not found") + return ScrapingRule{}, fmt.Errorf(errScrapingNotFound) } // GetScrapingRuleByPath returns the scraping rule for the specified path. func (rs *Ruleset) GetScrapingRuleByPath(path string) (ScrapingRule, error) { // Validate path if path == "" { - return ScrapingRule{}, fmt.Errorf("empty path provided") + return ScrapingRule{}, fmt.Errorf(errEmptyPath) } // prepare path @@ -485,18 +577,18 @@ func (rs *Ruleset) GetScrapingRuleByPath(path string) (ScrapingRule, error) { } } } - return ScrapingRule{}, fmt.Errorf("scraping rule not found") + return ScrapingRule{}, fmt.Errorf(errScrapingNotFound) } // GetScrapingRuleByURL returns the scraping rule for the specified URL. 
func (rs *Ruleset) GetScrapingRuleByURL(urlStr string) (ScrapingRule, error) { // Validate URL if urlStr == "" { - return ScrapingRule{}, fmt.Errorf("empty URL provided") + return ScrapingRule{}, fmt.Errorf(errEmptyURL) } _, err := url.Parse(urlStr) if err != nil { - return ScrapingRule{}, fmt.Errorf("error parsing URL: %s", err) + return ScrapingRule{}, fmt.Errorf(errParsingURL, err) } parsedURL := strings.ToLower(strings.TrimSpace(urlStr)) for _, rg := range rs.RuleGroups { @@ -506,7 +598,7 @@ func (rs *Ruleset) GetScrapingRuleByURL(urlStr string) (ScrapingRule, error) { } } } - return ScrapingRule{}, fmt.Errorf("scraping rule not found") + return ScrapingRule{}, fmt.Errorf(errScrapingNotFound) } ///// ---------------------- RuleGroup -------------------------------- ///// @@ -577,7 +669,7 @@ func (rg *RuleGroup) GetScrapingRules() []ScrapingRule { func (rg *RuleGroup) GetActionRuleByName(name string) (ActionRule, error) { // Validate name if name == "" { - return ActionRule{}, fmt.Errorf("empty name provided") + return ActionRule{}, fmt.Errorf(errEmptyName) } // prepare name @@ -587,18 +679,18 @@ func (rg *RuleGroup) GetActionRuleByName(name string) (ActionRule, error) { return r, nil } } - return ActionRule{}, fmt.Errorf("action rule not found") + return ActionRule{}, fmt.Errorf(errActionNotFound) } // GetActionRuleByURL returns the action rule for the specified URL. func (rg *RuleGroup) GetActionRuleByURL(urlStr string) (ActionRule, error) { // Validate URL if urlStr == "" { - return ActionRule{}, fmt.Errorf("empty URL provided") + return ActionRule{}, fmt.Errorf(errEmptyURL) } _, err := url.Parse(urlStr) if err != nil { - return ActionRule{}, fmt.Errorf("error parsing URL: %s", err) + return ActionRule{}, fmt.Errorf(errParsingURL, err) } parsedURL := strings.ToLower(strings.TrimSpace(urlStr)) for _, r := range rg.ActionRules { @@ -606,14 +698,14 @@ func (rg *RuleGroup) GetActionRuleByURL(urlStr string) (ActionRule, error) { return r, nil } } - return ActionRule{}, fmt.Errorf("action rule not found") + return ActionRule{}, fmt.Errorf(errActionNotFound) } // GetScrapingRuleByName returns the scraping rule with the specified name. func (rg *RuleGroup) GetScrapingRuleByName(name string) (ScrapingRule, error) { // Validate name if name == "" { - return ScrapingRule{}, fmt.Errorf("empty name provided") + return ScrapingRule{}, fmt.Errorf(errEmptyName) } // prepare name @@ -623,14 +715,14 @@ func (rg *RuleGroup) GetScrapingRuleByName(name string) (ScrapingRule, error) { return r, nil } } - return ScrapingRule{}, fmt.Errorf("scraping rule not found") + return ScrapingRule{}, fmt.Errorf(errScrapingNotFound) } // GetScrapingRuleByPath returns the scraping rule for the specified path. func (rg *RuleGroup) GetScrapingRuleByPath(path string) (ScrapingRule, error) { // Validate path if path == "" { - return ScrapingRule{}, fmt.Errorf("empty path provided") + return ScrapingRule{}, fmt.Errorf(errEmptyPath) } // prepare path @@ -640,18 +732,18 @@ func (rg *RuleGroup) GetScrapingRuleByPath(path string) (ScrapingRule, error) { return r, nil } } - return ScrapingRule{}, fmt.Errorf("scraping rule not found") + return ScrapingRule{}, fmt.Errorf(errScrapingNotFound) } // GetScrapingRuleByURL returns the scraping rule for the specified URL. 
func (rg *RuleGroup) GetScrapingRuleByURL(urlStr string) (ScrapingRule, error) { // Validate URL if urlStr == "" { - return ScrapingRule{}, fmt.Errorf("empty URL provided") + return ScrapingRule{}, fmt.Errorf(errEmptyURL) } _, err := url.Parse(urlStr) if err != nil { - return ScrapingRule{}, fmt.Errorf("error parsing URL: %s", err) + return ScrapingRule{}, fmt.Errorf(errParsingURL, err) } parsedURL := strings.ToLower(strings.TrimSpace(urlStr)) for _, r := range rg.ScrapingRules { @@ -659,7 +751,7 @@ func (rg *RuleGroup) GetScrapingRuleByURL(urlStr string) (ScrapingRule, error) { return r, nil } } - return ScrapingRule{}, fmt.Errorf("scraping rule not found") + return ScrapingRule{}, fmt.Errorf(errScrapingNotFound) } ///// --------------------- ActionRule ------------------------------- ///// @@ -953,13 +1045,13 @@ func (re *RuleEngine) IsGroupValid(group RuleGroup) bool { // It returns a pointer to the SiteRules for the provided URL or an error if no rules are found. func (re *RuleEngine) FindRulesForSite(inputURL string) (*Ruleset, error) { if inputURL == "" { - return nil, fmt.Errorf("empty URL provided") + return nil, fmt.Errorf(errEmptyURL) } // Parse the input URL to extract the domain parsedURL, err := url.Parse(inputURL) if err != nil { - return nil, fmt.Errorf("error parsing URL: %s", err) + return nil, fmt.Errorf(errParsingURL, err) } inputDomain := strings.ToLower(strings.TrimSpace(parsedURL.Hostname())) diff --git a/pkg/scraper/scraper.go b/pkg/scraper/scraper.go index 5edea568..7477e742 100644 --- a/pkg/scraper/scraper.go +++ b/pkg/scraper/scraper.go @@ -17,87 +17,38 @@ package scraper import ( - "fmt" - "net/url" "regexp" "strings" + cmn "github.com/pzaino/thecrowler/pkg/common" rs "github.com/pzaino/thecrowler/pkg/ruleset" + "github.com/tebeka/selenium" "github.com/PuerkitoBio/goquery" "github.com/antchfx/htmlquery" "golang.org/x/net/html" ) -// ApplyRules applies the rules to the provided URL and HTML content. -// It returns a map containing the extracted data or an error if any occurred during the extraction. -func (re *ScraperRuleEngine) ApplyRules(url string, htmlContent string) (map[string]interface{}, error) { - siteRules, err := re.FindRulesForSite(url) - if err != nil { - return nil, err - } - - for _, group := range siteRules.RuleGroups { - if re.IsGroupValid(group) { - extractedData, err := re.extractData(group, url, htmlContent) - if err != nil { - return nil, err - } - return extractedData, nil - } - } - - return nil, fmt.Errorf("no valid rule groups found for URL: %s", url) -} - -// extractJSFiles extracts the JavaScript files from the provided document. -// It returns a slice of strings containing the JavaScript files. -func (re *ScraperRuleEngine) extractJSFiles(doc *goquery.Document) []string { - var jsFiles []string - doc.Find("script[src]").Each(func(_ int, s *goquery.Selection) { - if src, exists := s.Attr("src"); exists { - jsFiles = append(jsFiles, src) - } - }) - return jsFiles -} +// ApplyRule applies the provided scraping rule to the provided web page. +func ApplyRule(rule *rs.ScrapingRule, webPage *selenium.WebDriver) map[string]interface{} { + // Initialize a map to hold the extracted data + extractedData := make(map[string]interface{}) -// extractData extracts the data from the provided HTML content using the provided RuleGroup. -// It returns a map containing the extracted data or an error if any occurred during the extraction. 
-func (re *ScraperRuleEngine) extractData(group rs.RuleGroup, pageURL string, htmlContent string) (map[string]interface{}, error) { - // Parse the HTML content + // Prepare content for goquery: + htmlContent, _ := (*webPage).PageSource() doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) if err != nil { - return nil, fmt.Errorf("error parsing HTML: %s", err) + cmn.DebugMsg(cmn.DbgLvlError, "Error loading HTML content: %v", err) + return extractedData } + // Parse the HTML content node, err := htmlquery.Parse(strings.NewReader(htmlContent)) if err != nil { // handle error - return nil, err - } - - // Parse the page URL to extract its path - parsedURL, err := url.Parse(pageURL) - if err != nil { - return nil, fmt.Errorf("error parsing page URL: %s", err) + cmn.DebugMsg(cmn.DbgLvlError, "Error parsing HTML content: %v", err) + return extractedData } - path := parsedURL.Path - // Initialize a map to hold the extracted data - extractedData := make(map[string]interface{}) - - // Iterate over the rules in the group - for _, rule := range group.ScrapingRules { - // Apply rule only if the path matches - if strings.HasSuffix(path, rule.Path) || strings.HasSuffix(path, rule.Path+"/") { - extractedData = re.applyRule(rule, doc, node, htmlContent, extractedData) - } - } - - return extractedData, nil -} - -func (re *ScraperRuleEngine) applyRule(rule rs.ScrapingRule, doc *goquery.Document, node *html.Node, htmlContent string, extractedData map[string]interface{}) map[string]interface{} { // Iterate over the elements to be extracted for _, elementSet := range rule.Elements { key := elementSet.Key @@ -109,11 +60,21 @@ func (re *ScraperRuleEngine) applyRule(rule rs.ScrapingRule, doc *goquery.Docume var extracted string switch selectorType { case "css": - extracted = re.extractByCSS(doc, selector) + extracted = extractByCSS(doc, selector) case "xpath": - extracted = re.extractByXPath(node, selector) + extracted = extractByXPath(node, selector) + case "id": + extracted = extractByCSS(doc, "#"+selector) + case "class", "class_name": + extracted = extractByCSS(doc, "."+selector) + case "name": + extracted = extractByCSS(doc, "[name="+selector+"]") + case "tag": + extracted = extractByCSS(doc, selector) + case "link_text", "partial_link_text": + extracted = extractByCSS(doc, "a:contains('"+selector+"')") case "regex": - extracted = re.extractByRegex(htmlContent, selector) + extracted = extractByRegex(htmlContent, selector) default: extracted = "" } @@ -126,18 +87,30 @@ func (re *ScraperRuleEngine) applyRule(rule rs.ScrapingRule, doc *goquery.Docume // Optional: Extract JavaScript files if required if rule.JsFiles { - jsFiles := re.extractJSFiles(doc) + jsFiles := extractJSFiles(doc) extractedData["js_files"] = jsFiles } return extractedData } -func (re *ScraperRuleEngine) extractByCSS(doc *goquery.Document, selector string) string { +// extractJSFiles extracts the JavaScript files from the provided document. +func extractJSFiles(doc *goquery.Document) []string { + var jsFiles []string + doc.Find("script[src]").Each(func(_ int, s *goquery.Selection) { + if src, exists := s.Attr("src"); exists { + jsFiles = append(jsFiles, src) + } + }) + return jsFiles +} + +// extractByCSS extracts the content from the provided document using the provided CSS selector. 
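+// It returns the combined text of every element matched by the selector, or an empty
+// string when nothing matches.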
+func extractByCSS(doc *goquery.Document, selector string) string {
 	return doc.Find(selector).Text()
 }
 
-func (re *ScraperRuleEngine) extractByXPath(node *html.Node, selector string) string {
+func extractByXPath(node *html.Node, selector string) string {
 	extractedNode := htmlquery.FindOne(node, selector)
 	if extractedNode != nil {
 		return htmlquery.InnerText(extractedNode)
@@ -145,7 +118,7 @@ func (re *ScraperRuleEngine) extractByXPath(node *html.Node, selector string) st
 	return ""
 }
 
-func (re *ScraperRuleEngine) extractByRegex(htmlContent string, selector string) string {
+func extractByRegex(htmlContent string, selector string) string {
 	regex, err := regexp.Compile(selector)
 	if err != nil {
 		// handle regex compilation error
@@ -157,3 +130,21 @@ func (re *ScraperRuleEngine) extractByRegex(htmlContent string, selector string)
 	}
 	return ""
 }
+
+// ApplyRulesGroup extracts the data from the provided web page using the provided rule group.
+func ApplyRulesGroup(ruleGroup *rs.RuleGroup, url string, webPage *selenium.WebDriver) (map[string]interface{}, error) {
+	// Initialize a map to hold the extracted data
+	extractedData := make(map[string]interface{})
+
+	// Iterate over the rules in the rule group
+	for _, rule := range ruleGroup.ScrapingRules {
+		// Apply the rule to the web page
+		data := ApplyRule(&rule, webPage)
+		// Add the extracted data to the map
+		for k, v := range data {
+			extractedData[k] = v
+		}
+	}
+
+	return extractedData, nil
+}
diff --git a/pkg/scraper/scraper_test.go b/pkg/scraper/scraper_test.go
index def003aa..2d5a381d 100644
--- a/pkg/scraper/scraper_test.go
+++ b/pkg/scraper/scraper_test.go
@@ -146,82 +146,3 @@ func TestFindRulesForSite(t *testing.T) {
 		})
 	}
 }
-
-func TestExtractData(t *testing.T) {
-	// Example mock HTML content
-	mockHTML := `
-
-