From 467b0b9a9972ac363ce4397e6d2c84108fe260df Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Mon, 22 Jan 2024 13:03:15 +0000
Subject: [PATCH] added more tests

---
 pkg/crawler/crawler.go       | 20 ++++----
 pkg/crawler/crawler_test.go  | 99 ++++++++++++++++++++++++++++++++++++
 pkg/crawler/keywords_test.go | 27 ++++++----
 pkg/crawler/types.go         | 11 ++++
 4 files changed, 135 insertions(+), 22 deletions(-)
 create mode 100644 pkg/crawler/crawler_test.go
 create mode 100644 pkg/crawler/types.go

diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index f73e83ee..b50eac24 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -39,16 +39,6 @@ var (
 	config cfg.Config
 )
 
-// This struct represents the information that we want to extract from a page
-// and store in the database.
-type PageInfo struct {
-	Title           string
-	Summary         string
-	BodyText        string
-	ContainsAppInfo bool
-	MetaTags        map[string]string // Add a field for meta tags
-}
-
 var indexPageMutex sync.Mutex // Mutex to ensure that only one goroutine is indexing a page at a time
 
 // This function is responsible for crawling a website, it's the main entry point
@@ -349,8 +339,16 @@ func isExternalLink(sourceURL, linkURL string) bool {
 		fmt.Println("Source hostname:", sourceParsed.Hostname())
 		fmt.Println("Link hostname:", linkParsed.Hostname())
 	}
+
+	// Take the substring that corresponds to the 1st and 2nd level domain (e.g., google.com),
+	// regardless of the number of subdomains
+	srcFqdnArr := strings.Split(sourceParsed.Hostname(), ".")
+	srcDomainName := strings.Join(srcFqdnArr[len(srcFqdnArr)-2:], ".")
+	linkFqdnArr := strings.Split(linkParsed.Hostname(), ".")
+	linkDomainName := strings.Join(linkFqdnArr[len(linkFqdnArr)-2:], ".")
+
 	// Compare hostnames
-	return sourceParsed.Hostname() != linkParsed.Hostname()
+	return srcDomainName != linkDomainName
 }
 
 // This is the worker function that is responsible for crawling a page
diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go
new file mode 100644
index 00000000..4bb17e7a
--- /dev/null
+++ b/pkg/crawler/crawler_test.go
@@ -0,0 +1,99 @@
+// Copyright 2023 Paolo Fabio Zaino
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package crawler
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestExtractLinks(t *testing.T) {
+	type args struct {
+		htmlContent string
+	}
+	tests := []struct {
+		name string
+		args args
+		want []string
+	}{
+		{"test1", args{`<html><head><title>Test</title></head><body><a href="https://www.google.com">Google</a></body></html>`}, []string{"https://www.google.com"}},
+		{"test2", args{`<html><head><title>Test</title></head><body><a href="https://www.google.com">Google</a><a href="https://www.google.com">Google</a></body></html>`}, []string{"https://www.google.com", "https://www.google.com"}},
+		{"test3", args{`<html><head><title>Test</title></head><body><a href="https://www.google.com">Google</a><a href="https://www.google.com">Google</a><a href="https://www.google.com">Google</a></body></html>`}, []string{"https://www.google.com", "https://www.google.com", "https://www.google.com"}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := extractLinks(tt.args.htmlContent); !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("extractLinks() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func Test_isExternalLink(t *testing.T) {
+	type args struct {
+		sourceURL string
+		linkURL   string
+	}
+	tests := []struct {
+		name string
+		args args
+		want bool
+	}{
+		// Add test cases
+		{"test1", args{"https://www.google.com", "https://www.google.com"}, false},
+		{"test2", args{"https://www.google.com", "https://www.google.com/test"}, false},
+		{"test3", args{"https://www.google.com", "https://www.google.com/test/test"}, false},
+		{"test4", args{"https://www.example.com", "https://www.google.com/test/test/test"}, true},
+		{"test5", args{"https://data.example.com", "https://www.example.com"}, false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := isExternalLink(tt.args.sourceURL, tt.args.linkURL); got != tt.want {
+				t.Errorf("isExternalLink() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestCombineURLs(t *testing.T) {
+	type args struct {
+		baseURL     string
+		relativeURL string
+	}
+	tests := []struct {
+		name    string
+		args    args
+		want    string
+		wantErr bool
+	}{
+		// Add test cases.
+		{"test1", args{"https://www.google.com", "https://www.google.com"}, "https://www.google.com", false},
+		{"test2", args{"https://www.google.com", "https://www.google.com/test"}, "https://www.google.com/test", false},
+		{"test3", args{"https://www.google.com", "https://www.google.com/test/test"}, "https://www.google.com/test/test", false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := combineURLs(tt.args.baseURL, tt.args.relativeURL)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("combineURLs() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if got != tt.want {
+				t.Errorf("combineURLs() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
diff --git a/pkg/crawler/keywords_test.go b/pkg/crawler/keywords_test.go
index dc87ec4b..e24f8fe2 100644
--- a/pkg/crawler/keywords_test.go
+++ b/pkg/crawler/keywords_test.go
@@ -19,6 +19,11 @@ import (
 	"testing"
 )
 
+const (
+	test_data   string = "test1, test2, test3"
+	test_result string = "test1 test2 test3"
+)
+
 func TestIsKeyword(t *testing.T) {
 	type args struct {
 		keyword string
@@ -52,7 +57,7 @@ func TestExtractFromMetaTag(t *testing.T) {
 		tagName string
 	}
 	keywords := make(map[string]string)
-	keywords["keywords"] = "test1, test2, test3"
+	keywords["keywords"] = test_data
 	tests := []struct {
 		name string
 		args args
@@ -81,9 +86,9 @@ func TestExtractContentKeywords(t *testing.T) {
 		args args
 		want []string
 	}{
-		{"test1", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
-		{"test2", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
-		{"test3", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
+		{"test1", args{test_data}, []string{"test1", "test2", "test3"}},
+		{"test2", args{test_data}, []string{"test1", "test2", "test3"}},
+		{"test3", args{test_data}, []string{"test1", "test2", "test3"}},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -121,7 +126,7 @@ func TestExtractKeywords(t *testing.T) {
 		pageInfo PageInfo
 	}
 	keywords := make(map[string]string)
-	keywords["keywords"] = "test1, test2, test3"
+	keywords["keywords"] = test_data
 	pageInfo := PageInfo{
 		MetaTags: keywords,
 	}
@@ -154,9 +159,9 @@ func TestNormalizeText(t *testing.T) {
 		want string
 	}{
 		// Add test cases:
-		{"test1", args{"test1, test2, test3"}, "test1 test2 test3"},
-		{"test2", args{"TEST1, TEST2, TEST3"}, "test1 test2 test3"},
-		{"test3", args{"TeSt1, tEsT2, test3"}, "test1 test2 test3"},
+		{"test1", args{test_data}, test_result},
+		{"test2", args{test_data}, test_result},
+		{"test3", args{test_data}, test_result},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -177,9 +182,9 @@ func TestRemovePunctuation(t *testing.T) {
 		want string
 	}{
 		// Add test cases:
-		{"test1", args{"test1, test2, test3"}, "test1 test2 test3"},
-		{"test2", args{"test1. test2. test3."}, "test1 test2 test3"},
-		{"test3", args{"test1; test2; test3;"}, "test1 test2 test3"},
+		{"test1", args{test_data}, test_result},
+		{"test2", args{"test1. test2. test3."}, test_result},
+		{"test3", args{"test1; test2; test3;"}, test_result},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
diff --git a/pkg/crawler/types.go b/pkg/crawler/types.go
new file mode 100644
index 00000000..3511fa01
--- /dev/null
+++ b/pkg/crawler/types.go
@@ -0,0 +1,11 @@
+package crawler
+
+// This struct represents the information that we want to extract from a page
+// and store in the database.
+type PageInfo struct {
+	Title           string
+	Summary         string
+	BodyText        string
+	ContainsAppInfo bool
+	MetaTags        map[string]string // Add a field for meta tags
+}
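
A quick illustration of the domain comparison this patch introduces in isExternalLink: only the last two labels of each hostname are compared, so links between subdomains of the same site (test5 above) are treated as internal. The sketch below re-implements that idea as a standalone program for clarity; the helper name secondLevelDomain is made up for this note and is not part of the patch, and the len(labels) < 2 guard is an extra precaution for single-label hosts (e.g. "localhost") that the patched code itself does not include.

    package main

    import (
        "fmt"
        "net/url"
        "strings"
    )

    // secondLevelDomain keeps only the last two labels of a URL's hostname,
    // e.g. "data.example.com" -> "example.com" (the same idea as the patched
    // comparison in isExternalLink). Hypothetical helper, for illustration only.
    func secondLevelDomain(rawURL string) string {
        parsed, err := url.Parse(rawURL)
        if err != nil {
            return ""
        }
        labels := strings.Split(parsed.Hostname(), ".")
        if len(labels) < 2 {
            // Guard for single-label hosts; not present in the patch.
            return parsed.Hostname()
        }
        return strings.Join(labels[len(labels)-2:], ".")
    }

    func main() {
        // Subdomains of the same site are no longer considered external:
        fmt.Println(secondLevelDomain("https://data.example.com") !=
            secondLevelDomain("https://www.example.com")) // false

        // A different second-level domain still is:
        fmt.Println(secondLevelDomain("https://www.example.com") !=
            secondLevelDomain("https://www.google.com")) // true
    }

This mirrors the expectations encoded in Test_isExternalLink above: test4 (example.com vs. google.com) expects true, test5 (data.example.com vs. www.example.com) expects false.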