# robotstxt

A robots.txt parser and generator written in Go, based on the samclarke robotstxt package.

It currently supports:

- User-agent:
- Allow:
- Disallow:
- Sitemap:
- Crawl-delay:
- Host:
- URL encoded & UTF-8 paths
- Paths with wildcards (*) and EOL matching ($), as sketched below
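
A minimal sketch of how wildcard and end-of-line rules behave, using the `Parse` and `IsAllowed` calls shown in the Usage section below (the rules and URLs here are illustrative, not taken from the package):

```go
package main

import (
    "fmt"
    "log"

    "github.com/rimiti/robotstxt"
)

func main() {
    // Illustrative rules: /private/* uses a wildcard, /*.pdf$ anchors to the end of the path.
    contents := `
        User-agent: *
        Disallow: /private/*
        Disallow: /*.pdf$
    `

    robots, err := robotstxt.Parse(contents, "http://www.example.com/robots.txt")
    if err != nil {
        log.Fatalln(err)
    }

    // Matched by the /private/* wildcard rule.
    allowed, _ := robots.IsAllowed("Bot/1.0", "http://www.example.com/private/page.html")
    fmt.Println("/private/page.html allowed:", allowed) // expected: false

    // Matched by the /*.pdf$ end-of-line rule.
    allowed, _ = robots.IsAllowed("Bot/1.0", "http://www.example.com/report.pdf")
    fmt.Println("/report.pdf allowed:", allowed) // expected: false
}
```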

## Installation

Go get:

```sh
go get github.com/rimiti/robotstxt
```

## Usage

```go
package main

import (
    "fmt"
    "log"
    "strings"

    "github.com/rimiti/robotstxt"
)

func main() {
    url := "http://www.example.com/robots.txt"
    contents := `
        User-agent: *
        Disallow: /dir/
        Disallow: /test.html
        Allow: /dir/test.html
        Allow: /test.html
        Crawl-delay: 1
        Sitemap: http://example.com/sitemap.xml
        Host: example.com
    `

    robots, err := robotstxt.Parse(contents, url)
    if err != nil {
        log.Fatalln(err)
    }

    allowed, _ := robots.IsAllowed("Bot/1.0", "http://www.example.com/test.html")
    if !allowed {
        fmt.Println("Not allowed to crawl: /test.html")
    }

    allowed, _ = robots.IsAllowed("Bot/1.0", "http://www.example.com/dir/test.html")
    if allowed {
        fmt.Println("Allowed to crawl: /dir/test.html")
    }

    // Output: Crawl delay: 1
    fmt.Println("Crawl delay:", robots.CrawlDelay("Bot/1.0"))

    // Output: Sitemaps: http://example.com/sitemap.xml
    fmt.Println("Sitemaps:", strings.Join(robots.Sitemaps(), ","))

    // Output: Preferred host: example.com
    fmt.Println("Preferred host:", robots.Host())
}
```
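
The example above hard-codes the robots.txt contents. In practice you would usually download the file first; a minimal sketch of fetching and parsing over HTTP, assuming the same `Parse` and `IsAllowed` API shown above:

```go
package main

import (
    "fmt"
    "io"
    "log"
    "net/http"

    "github.com/rimiti/robotstxt"
)

func main() {
    url := "http://www.example.com/robots.txt"

    // Download robots.txt; real code should also check the HTTP status code.
    resp, err := http.Get(url)
    if err != nil {
        log.Fatalln(err)
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        log.Fatalln(err)
    }

    robots, err := robotstxt.Parse(string(body), url)
    if err != nil {
        log.Fatalln(err)
    }

    allowed, _ := robots.IsAllowed("Bot/1.0", "http://www.example.com/some/page.html")
    fmt.Println("Allowed:", allowed)
}
```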