From 467b0b9a9972ac363ce4397e6d2c84108fe260df Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Mon, 22 Jan 2024 13:03:15 +0000
Subject: [PATCH] Add more tests and refine external link detection
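
Besides adding unit tests for extractLinks, isExternalLink, and
combineURLs, this change moves PageInfo into its own types.go file,
makes isExternalLink compare only the second-level domain (so that
subdomains of the same site are treated as internal links), and
factors the shared keyword test data into named constants.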
---
pkg/crawler/crawler.go | 25 ++++++----
pkg/crawler/crawler_test.go | 99 ++++++++++++++++++++++++++++++++++++
pkg/crawler/keywords_test.go | 27 ++++++----
pkg/crawler/types.go | 11 ++++
4 files changed, 140 insertions(+), 22 deletions(-)
create mode 100644 pkg/crawler/crawler_test.go
create mode 100644 pkg/crawler/types.go
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index f73e83ee..b50eac24 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -39,16 +39,6 @@ var (
config cfg.Config
)
-// This struct represents the information that we want to extract from a page
-// and store in the database.
-type PageInfo struct {
- Title string
- Summary string
- BodyText string
- ContainsAppInfo bool
- MetaTags map[string]string // Add a field for meta tags
-}
-
var indexPageMutex sync.Mutex // Mutex to ensure that only one goroutine is indexing a page at a time
// This function is responsible for crawling a website, it's the main entry point
@@ -349,8 +339,21 @@ func isExternalLink(sourceURL, linkURL string) bool {
fmt.Println("Source hostname:", sourceParsed.Hostname())
fmt.Println("Link hostname:", linkParsed.Hostname())
}
+
+ // Take the substring corresponding to the first- and second-level domain (e.g., google.com),
+ // regardless of the number of subdomains
+ srcFqdnArr := strings.Split(sourceParsed.Hostname(), ".")
+ linkFqdnArr := strings.Split(linkParsed.Hostname(), ".")
+ // Fall back to comparing the full hostnames for single-label hosts (e.g., "localhost"),
+ // where slicing the last two labels would go out of range
+ if len(srcFqdnArr) < 2 || len(linkFqdnArr) < 2 {
+ return sourceParsed.Hostname() != linkParsed.Hostname()
+ }
+ srcDomainName := strings.Join(srcFqdnArr[len(srcFqdnArr)-2:], ".")
+ linkDomainName := strings.Join(linkFqdnArr[len(linkFqdnArr)-2:], ".")
+
// Compare hostnames
- return sourceParsed.Hostname() != linkParsed.Hostname()
+ return srcDomainName != linkDomainName
}
// This is the worker function that is responsible for crawling a page
diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go
new file mode 100644
index 00000000..4bb17e7a
--- /dev/null
+++ b/pkg/crawler/crawler_test.go
@@ -0,0 +1,99 @@
+// Copyright 2023 Paolo Fabio Zaino
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package crawler
+
+import (
+ "reflect"
+ "testing"
+)
+
+func TestExtractLinks(t *testing.T) {
+ type args struct {
+ htmlContent string
+ }
+ tests := []struct {
+ name string
+ args args
+ want []string
+ }{
+ {"test1", args{"TestGoogle"}, []string{"https://www.google.com"}},
+ {"test2", args{"TestGoogleGoogle"}, []string{"https://www.google.com", "https://www.google.com"}},
+ {"test3", args{"TestGoogleGoogleGoogle"}, []string{"https://www.google.com", "https://www.google.com", "https://www.google.com"}},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ if got := extractLinks(tt.args.htmlContent); !reflect.DeepEqual(got, tt.want) {
+ t.Errorf("extractLinks() = %v, want %v", got, tt.want)
+ }
+ })
+ }
+}
+
+func Test_isExternalLink(t *testing.T) {
+ type args struct {
+ sourceURL string
+ linkURL string
+ }
+ tests := []struct {
+ name string
+ args args
+ want bool
+ }{
+ // Add test cases
+ {"test1", args{"https://www.google.com", "https://www.google.com"}, false},
+ {"test2", args{"https://www.google.com", "https://www.google.com/test"}, false},
+ {"test3", args{"https://www.google.com", "https://www.google.com/test/test"}, false},
+ {"test4", args{"https://www.example.com", "https://www.google.com/test/test/test"}, true},
+ {"test5", args{"https://data.example.com", "https://www.example.com"}, false},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ if got := isExternalLink(tt.args.sourceURL, tt.args.linkURL); got != tt.want {
+ t.Errorf("isExternalLink() = %v, want %v", got, tt.want)
+ }
+ })
+ }
+}
+
+func TestCombineURLs(t *testing.T) {
+ type args struct {
+ baseURL string
+ relativeURL string
+ }
+ tests := []struct {
+ name string
+ args args
+ want string
+ wantErr bool
+ }{
+ // Add test cases.
+ {"test1", args{"https://www.google.com", "https://www.google.com"}, "https://www.google.com", false},
+ {"test2", args{"https://www.google.com", "https://www.google.com/test"}, "https://www.google.com/test", false},
+ {"test3", args{"https://www.google.com", "https://www.google.com/test/test"}, "https://www.google.com/test/test", false},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got, err := combineURLs(tt.args.baseURL, tt.args.relativeURL)
+ if (err != nil) != tt.wantErr {
+ t.Errorf("combineURLs() error = %v, wantErr %v", err, tt.wantErr)
+ return
+ }
+ if got != tt.want {
+ t.Errorf("combineURLs() = %v, want %v", got, tt.want)
+ }
+ })
+ }
+}
diff --git a/pkg/crawler/keywords_test.go b/pkg/crawler/keywords_test.go
index dc87ec4b..e24f8fe2 100644
--- a/pkg/crawler/keywords_test.go
+++ b/pkg/crawler/keywords_test.go
@@ -19,6 +19,11 @@ import (
"testing"
)
+const (
+ testData string = "test1, test2, test3"
+ testResult string = "test1 test2 test3"
+)
+
func TestIsKeyword(t *testing.T) {
type args struct {
keyword string
@@ -52,7 +57,7 @@ func TestExtractFromMetaTag(t *testing.T) {
tagName string
}
keywords := make(map[string]string)
- keywords["keywords"] = "test1, test2, test3"
+ keywords["keywords"] = test_data
tests := []struct {
name string
args args
@@ -81,9 +86,9 @@ func TestExtractContentKeywords(t *testing.T) {
args args
want []string
}{
- {"test1", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
- {"test2", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
- {"test3", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
+ {"test1", args{test_data}, []string{"test1", "test2", "test3"}},
+ {"test2", args{test_data}, []string{"test1", "test2", "test3"}},
+ {"test3", args{test_data}, []string{"test1", "test2", "test3"}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
@@ -121,7 +126,7 @@ func TestExtractKeywords(t *testing.T) {
pageInfo PageInfo
}
keywords := make(map[string]string)
- keywords["keywords"] = "test1, test2, test3"
+ keywords["keywords"] = test_data
pageInfo := PageInfo{
MetaTags: keywords,
}
@@ -154,9 +159,9 @@ func TestNormalizeText(t *testing.T) {
want string
}{
// Add test cases:
- {"test1", args{"test1, test2, test3"}, "test1 test2 test3"},
- {"test2", args{"TEST1, TEST2, TEST3"}, "test1 test2 test3"},
- {"test3", args{"TeSt1, tEsT2, test3"}, "test1 test2 test3"},
+ {"test1", args{test_data}, test_result},
+ {"test2", args{test_data}, test_result},
+ {"test3", args{test_data}, test_result},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
@@ -177,9 +182,9 @@ func TestRemovePunctuation(t *testing.T) {
want string
}{
// Add test cases:
- {"test1", args{"test1, test2, test3"}, "test1 test2 test3"},
- {"test2", args{"test1. test2. test3."}, "test1 test2 test3"},
- {"test3", args{"test1; test2; test3;"}, "test1 test2 test3"},
+ {"test1", args{test_data}, test_result},
+ {"test2", args{"test1. test2. test3."}, test_result},
+ {"test3", args{"test1; test2; test3;"}, test_result},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
diff --git a/pkg/crawler/types.go b/pkg/crawler/types.go
new file mode 100644
index 00000000..3511fa01
--- /dev/null
+++ b/pkg/crawler/types.go
@@ -0,0 +1,11 @@
+package crawler
+
+// PageInfo represents the information we want to extract from a page
+// and store in the database.
+type PageInfo struct {
+ Title string
+ Summary string
+ BodyText string
+ ContainsAppInfo bool
+ MetaTags map[string]string // Meta tags extracted from the page
+}