Merge pull request #15 from pzaino/develop
added more tests
pzaino authored Jan 22, 2024
2 parents 19a220a + 467b0b9 commit 125feca
Showing 4 changed files with 135 additions and 22 deletions.
20 changes: 9 additions & 11 deletions pkg/crawler/crawler.go
@@ -39,16 +39,6 @@ var (
	config cfg.Config
)

-// This struct represents the information that we want to extract from a page
-// and store in the database.
-type PageInfo struct {
-	Title           string
-	Summary         string
-	BodyText        string
-	ContainsAppInfo bool
-	MetaTags        map[string]string // Add a field for meta tags
-}

var indexPageMutex sync.Mutex // Mutex to ensure that only one goroutine is indexing a page at a time

// This function is responsible for crawling a website; it's the main entry point
@@ -349,8 +339,16 @@ func isExternalLink(sourceURL, linkURL string) bool {
fmt.Println("Source hostname:", sourceParsed.Hostname())
fmt.Println("Link hostname:", linkParsed.Hostname())
}

+	// Take the substring that corresponds to the first- and second-level
+	// domain (e.g., google.com), regardless of the number of subdomains
+	srcFqdnArr := strings.Split(sourceParsed.Hostname(), ".")
+	srcDomainName := strings.Join(srcFqdnArr[len(srcFqdnArr)-2:], ".")
+	linkFqdnArr := strings.Split(linkParsed.Hostname(), ".")
+	linkDomainName := strings.Join(linkFqdnArr[len(linkFqdnArr)-2:], ".")

	// Compare hostnames
-	return sourceParsed.Hostname() != linkParsed.Hostname()
+	return srcDomainName != linkDomainName
}

// This is the worker function that is responsible for crawling a page
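In effect, the rewritten check treats a link as internal whenever its hostname shares its last two labels with the source URL, so sibling subdomains of the same site no longer count as external. A minimal standalone sketch of that comparison (secondLevelDomain is a hypothetical helper written for illustration, not part of the commit; note the assumption baked into the diff that hostnames have at least two labels, so a bare host such as "localhost" would make the slice expression panic):

	package main

	import (
		"fmt"
		"net/url"
		"strings"
	)

	// secondLevelDomain keeps only the last two labels of a hostname,
	// e.g. "data.example.com" -> "example.com".
	func secondLevelDomain(hostname string) string {
		labels := strings.Split(hostname, ".")
		return strings.Join(labels[len(labels)-2:], ".")
	}

	func main() {
		src, _ := url.Parse("https://data.example.com")
		link, _ := url.Parse("https://www.example.com")
		// Same second-level domain, so the link is considered internal.
		fmt.Println(secondLevelDomain(src.Hostname()) != secondLevelDomain(link.Hostname())) // false
	}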
99 changes: 99 additions & 0 deletions pkg/crawler/crawler_test.go
@@ -0,0 +1,99 @@
// Copyright 2023 Paolo Fabio Zaino
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package crawler

import (
	"reflect"
	"testing"
)

func TestExtractLinks(t *testing.T) {
	type args struct {
		htmlContent string
	}
	tests := []struct {
		name string
		args args
		want []string
	}{
		{"test1", args{"<html><head><title>Test</title></head><body><a href=\"https://www.google.com\">Google</a></body></html>"}, []string{"https://www.google.com"}},
		{"test2", args{"<html><head><title>Test</title></head><body><a href=\"https://www.google.com\">Google</a><a href=\"https://www.google.com\">Google</a></body></html>"}, []string{"https://www.google.com", "https://www.google.com"}},
		{"test3", args{"<html><head><title>Test</title></head><body><a href=\"https://www.google.com\">Google</a><a href=\"https://www.google.com\">Google</a><a href=\"https://www.google.com\">Google</a></body></html>"}, []string{"https://www.google.com", "https://www.google.com", "https://www.google.com"}},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := extractLinks(tt.args.htmlContent); !reflect.DeepEqual(got, tt.want) {
				t.Errorf("extractLinks() = %v, want %v", got, tt.want)
			}
		})
	}
}

func Test_isExternalLink(t *testing.T) {
	type args struct {
		sourceURL string
		linkURL   string
	}
	tests := []struct {
		name string
		args args
		want bool
	}{
		// Add test cases
		{"test1", args{"https://www.google.com", "https://www.google.com"}, false},
		{"test2", args{"https://www.google.com", "https://www.google.com/test"}, false},
		{"test3", args{"https://www.google.com", "https://www.google.com/test/test"}, false},
		{"test4", args{"https://www.example.com", "https://www.google.com/test/test/test"}, true},
		{"test5", args{"https://data.example.com", "https://www.example.com"}, false},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := isExternalLink(tt.args.sourceURL, tt.args.linkURL); got != tt.want {
				t.Errorf("isExternalLink() = %v, want %v", got, tt.want)
			}
		})
	}
}

func TestCombineURLs(t *testing.T) {
	type args struct {
		baseURL     string
		relativeURL string
	}
	tests := []struct {
		name    string
		args    args
		want    string
		wantErr bool
	}{
		// Add test cases.
		{"test1", args{"https://www.google.com", "https://www.google.com"}, "https://www.google.com", false},
		{"test2", args{"https://www.google.com", "https://www.google.com/test"}, "https://www.google.com/test", false},
		{"test3", args{"https://www.google.com", "https://www.google.com/test/test"}, "https://www.google.com/test/test", false},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := combineURLs(tt.args.baseURL, tt.args.relativeURL)
			if (err != nil) != tt.wantErr {
				t.Errorf("combineURLs() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if got != tt.want {
				t.Errorf("combineURLs() = %v, want %v", got, tt.want)
			}
		})
	}
}
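The test5 case pins down the new behaviour: two different subdomains of example.com are no longer flagged as external. Two further cases the table could cover, sketched here as a separate test in the same package (illustrative additions, not part of the commit):

	// Hypothetical extra cases for the same comparison, not part of the commit.
	func Test_isExternalLink_domains(t *testing.T) {
		cases := []struct {
			name      string
			sourceURL string
			linkURL   string
			want      bool
		}{
			{"deep subdomains", "https://a.b.c.example.com", "https://x.example.com", false},
			{"different TLD", "https://www.example.com", "https://www.example.org", true},
		}
		for _, tt := range cases {
			if got := isExternalLink(tt.sourceURL, tt.linkURL); got != tt.want {
				t.Errorf("isExternalLink(%q, %q) = %v, want %v", tt.sourceURL, tt.linkURL, got, tt.want)
			}
		}
	}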
27 changes: 16 additions & 11 deletions pkg/crawler/keywords_test.go
@@ -19,6 +19,11 @@ import (
	"testing"
)

+const (
+	test_data   string = "test1, test2, test3"
+	test_result string = "test1 test2 test3"
+)
+
func TestIsKeyword(t *testing.T) {
	type args struct {
		keyword string
@@ -52,7 +57,7 @@ func TestExtractFromMetaTag(t *testing.T) {
		tagName string
	}
	keywords := make(map[string]string)
-	keywords["keywords"] = "test1, test2, test3"
+	keywords["keywords"] = test_data
	tests := []struct {
		name string
		args args
@@ -81,9 +86,9 @@ func TestExtractContentKeywords(t *testing.T) {
		args args
		want []string
	}{
-		{"test1", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
-		{"test2", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
-		{"test3", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
+		{"test1", args{test_data}, []string{"test1", "test2", "test3"}},
+		{"test2", args{test_data}, []string{"test1", "test2", "test3"}},
+		{"test3", args{test_data}, []string{"test1", "test2", "test3"}},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
@@ -121,7 +126,7 @@ func TestExtractKeywords(t *testing.T) {
		pageInfo PageInfo
	}
	keywords := make(map[string]string)
-	keywords["keywords"] = "test1, test2, test3"
+	keywords["keywords"] = test_data
	pageInfo := PageInfo{
		MetaTags: keywords,
	}
@@ -154,9 +159,9 @@ func TestNormalizeText(t *testing.T) {
		want string
	}{
		// Add test cases:
-		{"test1", args{"test1, test2, test3"}, "test1 test2 test3"},
-		{"test2", args{"TEST1, TEST2, TEST3"}, "test1 test2 test3"},
-		{"test3", args{"TeSt1, tEsT2, test3"}, "test1 test2 test3"},
+		{"test1", args{test_data}, test_result},
+		{"test2", args{test_data}, test_result},
+		{"test3", args{test_data}, test_result},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
@@ -177,9 +182,9 @@ func TestRemovePunctuation(t *testing.T) {
		want string
	}{
		// Add test cases:
-		{"test1", args{"test1, test2, test3"}, "test1 test2 test3"},
-		{"test2", args{"test1. test2. test3."}, "test1 test2 test3"},
-		{"test3", args{"test1; test2; test3;"}, "test1 test2 test3"},
+		{"test1", args{test_data}, test_result},
+		{"test2", args{"test1. test2. test3."}, test_result},
+		{"test3", args{"test1; test2; test3;"}, test_result},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
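The old expected values (e.g. "TEST1, TEST2, TEST3" normalising to "test1 test2 test3") show that normalizeText both lowercases its input and strips punctuation. A self-contained sketch of that behaviour, assuming those semantics from the test tables alone rather than from the actual pkg/crawler implementation:

	package main

	import (
		"fmt"
		"strings"
		"unicode"
	)

	// normalizeLike mimics what the test tables imply: lowercase the text,
	// drop punctuation, and collapse whitespace to single spaces.
	func normalizeLike(s string) string {
		s = strings.ToLower(s)
		s = strings.Map(func(r rune) rune {
			if unicode.IsPunct(r) {
				return -1 // returning -1 drops the rune
			}
			return r
		}, s)
		return strings.Join(strings.Fields(s), " ")
	}

	func main() {
		fmt.Println(normalizeLike("TeSt1, tEsT2, test3")) // test1 test2 test3
	}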
11 changes: 11 additions & 0 deletions pkg/crawler/types.go
@@ -0,0 +1,11 @@
package crawler

// This struct represents the information that we want to extract from a page
// and store in the database.
type PageInfo struct {
	Title           string
	Summary         string
	BodyText        string
	ContainsAppInfo bool
	MetaTags        map[string]string // Add a field for meta tags
}
