Merge pull request #15 from pzaino/develop
added more tests
pzaino authored Jan 22, 2024
2 parents 19a220a + 467b0b9 commit 125feca
Showing 4 changed files with 135 additions and 22 deletions.
20 changes: 9 additions & 11 deletions pkg/crawler/crawler.go
@@ -39,16 +39,6 @@ var (
	config cfg.Config
)

-// This struct represents the information that we want to extract from a page
-// and store in the database.
-type PageInfo struct {
-	Title           string
-	Summary         string
-	BodyText        string
-	ContainsAppInfo bool
-	MetaTags        map[string]string // Add a field for meta tags
-}

var indexPageMutex sync.Mutex // Mutex to ensure that only one goroutine is indexing a page at a time

// This function is responsible for crawling a website; it's the main entry point
@@ -349,8 +339,16 @@ func isExternalLink(sourceURL, linkURL string) bool {
fmt.Println("Source hostname:", sourceParsed.Hostname())
fmt.Println("Link hostname:", linkParsed.Hostname())
}

+	// Take the substring that corresponds to the first- and second-level
+	// domain (e.g., google.com), regardless of the number of subdomains
+	srcFqdnArr := strings.Split(sourceParsed.Hostname(), ".")
+	srcDomainName := strings.Join(srcFqdnArr[len(srcFqdnArr)-2:], ".")
+	linkFqdnArr := strings.Split(linkParsed.Hostname(), ".")
+	linkDomainName := strings.Join(linkFqdnArr[len(linkFqdnArr)-2:], ".")

	// Compare hostnames
-	return sourceParsed.Hostname() != linkParsed.Hostname()
+	return srcDomainName != linkDomainName
}

// This is the worker function that is responsible for crawling a page
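In effect, the rewritten check treats a link as internal whenever its hostname shares its last two labels with the source URL, so sibling subdomains of the same site no longer count as external. A minimal standalone sketch of that comparison (secondLevelDomain is a hypothetical helper written for illustration, not part of the commit; note the assumption baked into the diff that hostnames have at least two labels, so a bare host such as "localhost" would make the slice expression panic):

	package main

	import (
		"fmt"
		"net/url"
		"strings"
	)

	// secondLevelDomain keeps only the last two labels of a hostname,
	// e.g. "data.example.com" -> "example.com".
	func secondLevelDomain(hostname string) string {
		labels := strings.Split(hostname, ".")
		return strings.Join(labels[len(labels)-2:], ".")
	}

	func main() {
		src, _ := url.Parse("https://data.example.com")
		link, _ := url.Parse("https://www.example.com")
		// Same second-level domain, so the link is considered internal.
		fmt.Println(secondLevelDomain(src.Hostname()) != secondLevelDomain(link.Hostname())) // false
	}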
99 changes: 99 additions & 0 deletions pkg/crawler/crawler_test.go
@@ -0,0 +1,99 @@
// Copyright 2023 Paolo Fabio Zaino
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package crawler

import (
	"reflect"
	"testing"
)

func TestExtractLinks(t *testing.T) {
	type args struct {
		htmlContent string
	}
	tests := []struct {
		name string
		args args
		want []string
	}{
		{"test1", args{"<html><head><title>Test</title></head><body><a href=\"https://www.google.com\">Google</a></body></html>"}, []string{"https://www.google.com"}},
		{"test2", args{"<html><head><title>Test</title></head><body><a href=\"https://www.google.com\">Google</a><a href=\"https://www.google.com\">Google</a></body></html>"}, []string{"https://www.google.com", "https://www.google.com"}},
		{"test3", args{"<html><head><title>Test</title></head><body><a href=\"https://www.google.com\">Google</a><a href=\"https://www.google.com\">Google</a><a href=\"https://www.google.com\">Google</a></body></html>"}, []string{"https://www.google.com", "https://www.google.com", "https://www.google.com"}},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := extractLinks(tt.args.htmlContent); !reflect.DeepEqual(got, tt.want) {
				t.Errorf("extractLinks() = %v, want %v", got, tt.want)
			}
		})
	}
}

func Test_isExternalLink(t *testing.T) {
	type args struct {
		sourceURL string
		linkURL   string
	}
	tests := []struct {
		name string
		args args
		want bool
	}{
		// Add test cases
		{"test1", args{"https://www.google.com", "https://www.google.com"}, false},
		{"test2", args{"https://www.google.com", "https://www.google.com/test"}, false},
		{"test3", args{"https://www.google.com", "https://www.google.com/test/test"}, false},
		{"test4", args{"https://www.example.com", "https://www.google.com/test/test/test"}, true},
		{"test5", args{"https://data.example.com", "https://www.example.com"}, false},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := isExternalLink(tt.args.sourceURL, tt.args.linkURL); got != tt.want {
				t.Errorf("isExternalLink() = %v, want %v", got, tt.want)
			}
		})
	}
}

func TestCombineURLs(t *testing.T) {
	type args struct {
		baseURL     string
		relativeURL string
	}
	tests := []struct {
		name    string
		args    args
		want    string
		wantErr bool
	}{
		// Add test cases.
		{"test1", args{"https://www.google.com", "https://www.google.com"}, "https://www.google.com", false},
		{"test2", args{"https://www.google.com", "https://www.google.com/test"}, "https://www.google.com/test", false},
		{"test3", args{"https://www.google.com", "https://www.google.com/test/test"}, "https://www.google.com/test/test", false},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := combineURLs(tt.args.baseURL, tt.args.relativeURL)
			if (err != nil) != tt.wantErr {
				t.Errorf("combineURLs() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if got != tt.want {
				t.Errorf("combineURLs() = %v, want %v", got, tt.want)
			}
		})
	}
}
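The test5 case pins down the new behaviour: two different subdomains of example.com are no longer flagged as external. Two further cases the table could cover, sketched here as a separate test in the same package (illustrative additions, not part of the commit):

	// Hypothetical extra cases for the same comparison, not part of the commit.
	func Test_isExternalLink_domains(t *testing.T) {
		cases := []struct {
			name      string
			sourceURL string
			linkURL   string
			want      bool
		}{
			{"deep subdomains", "https://a.b.c.example.com", "https://x.example.com", false},
			{"different TLD", "https://www.example.com", "https://www.example.org", true},
		}
		for _, tt := range cases {
			if got := isExternalLink(tt.sourceURL, tt.linkURL); got != tt.want {
				t.Errorf("isExternalLink(%q, %q) = %v, want %v", tt.sourceURL, tt.linkURL, got, tt.want)
			}
		}
	}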
27 changes: 16 additions & 11 deletions pkg/crawler/keywords_test.go
@@ -19,6 +19,11 @@ import (
	"testing"
)

+const (
+	test_data   string = "test1, test2, test3"
+	test_result string = "test1 test2 test3"
+)
+
func TestIsKeyword(t *testing.T) {
	type args struct {
		keyword string
@@ -52,7 +57,7 @@ func TestExtractFromMetaTag(t *testing.T) {
		tagName string
	}
	keywords := make(map[string]string)
-	keywords["keywords"] = "test1, test2, test3"
+	keywords["keywords"] = test_data
	tests := []struct {
		name string
		args args
@@ -81,9 +86,9 @@ func TestExtractContentKeywords(t *testing.T) {
		args args
		want []string
	}{
-		{"test1", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
-		{"test2", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
-		{"test3", args{"test1, test2, test3"}, []string{"test1", "test2", "test3"}},
+		{"test1", args{test_data}, []string{"test1", "test2", "test3"}},
+		{"test2", args{test_data}, []string{"test1", "test2", "test3"}},
+		{"test3", args{test_data}, []string{"test1", "test2", "test3"}},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
@@ -121,7 +126,7 @@ func TestExtractKeywords(t *testing.T) {
		pageInfo PageInfo
	}
	keywords := make(map[string]string)
-	keywords["keywords"] = "test1, test2, test3"
+	keywords["keywords"] = test_data
	pageInfo := PageInfo{
		MetaTags: keywords,
	}
@@ -154,9 +159,9 @@ func TestNormalizeText(t *testing.T) {
		want string
	}{
		// Add test cases:
-		{"test1", args{"test1, test2, test3"}, "test1 test2 test3"},
-		{"test2", args{"TEST1, TEST2, TEST3"}, "test1 test2 test3"},
-		{"test3", args{"TeSt1, tEsT2, test3"}, "test1 test2 test3"},
+		{"test1", args{test_data}, test_result},
+		{"test2", args{test_data}, test_result},
+		{"test3", args{test_data}, test_result},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
@@ -177,9 +182,9 @@ func TestRemovePunctuation(t *testing.T) {
		want string
	}{
		// Add test cases:
-		{"test1", args{"test1, test2, test3"}, "test1 test2 test3"},
-		{"test2", args{"test1. test2. test3."}, "test1 test2 test3"},
-		{"test3", args{"test1; test2; test3;"}, "test1 test2 test3"},
+		{"test1", args{test_data}, test_result},
+		{"test2", args{"test1. test2. test3."}, test_result},
+		{"test3", args{"test1; test2; test3;"}, test_result},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
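The old expected values (e.g. "TEST1, TEST2, TEST3" normalising to "test1 test2 test3") show that normalizeText both lowercases its input and strips punctuation. A self-contained sketch of that behaviour, assuming those semantics from the test tables alone rather than from the actual pkg/crawler implementation:

	package main

	import (
		"fmt"
		"strings"
		"unicode"
	)

	// normalizeLike mimics what the test tables imply: lowercase the text,
	// drop punctuation, and collapse whitespace to single spaces.
	func normalizeLike(s string) string {
		s = strings.ToLower(s)
		s = strings.Map(func(r rune) rune {
			if unicode.IsPunct(r) {
				return -1 // returning -1 drops the rune
			}
			return r
		}, s)
		return strings.Join(strings.Fields(s), " ")
	}

	func main() {
		fmt.Println(normalizeLike("TeSt1, tEsT2, test3")) // test1 test2 test3
	}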
11 changes: 11 additions & 0 deletions pkg/crawler/types.go
@@ -0,0 +1,11 @@
package crawler

// This struct represents the information that we want to extract from a page
// and store in the database.
type PageInfo struct {
	Title           string
	Summary         string
	BodyText        string
	ContainsAppInfo bool
	MetaTags        map[string]string // Add a field for meta tags
}
