From da74f6b993cf16c3fe376efb89fff8944945046c Mon Sep 17 00:00:00 2001
From: harlan
Date: Sun, 29 Sep 2024 16:08:46 +1000
Subject: [PATCH] fix: improved robots.txt exclude matching

---
 packages/core/src/discovery/robotsTxt.ts  | 113 +++++++++++++---------
 packages/core/src/puppeteer/worker.ts     |  10 ++
 packages/core/src/types.ts                |   5 +
 packages/core/src/util/robotsTxtParser.ts |  25 ++++-
 4 files changed, 107 insertions(+), 46 deletions(-)

diff --git a/packages/core/src/discovery/robotsTxt.ts b/packages/core/src/discovery/robotsTxt.ts
index 104a1c6e..cbed31b8 100644
--- a/packages/core/src/discovery/robotsTxt.ts
+++ b/packages/core/src/discovery/robotsTxt.ts
@@ -10,20 +10,6 @@ export interface RobotsTxtParsed {
   groups: RobotsGroupResolved[]
 }
 
-function isValidRegex(s: string | RegExp) {
-  if (typeof s === 'string') {
-    // make sure it's valid regex
-    try {
-      // eslint-disable-next-line no-new
-      new RegExp(s)
-      return true
-    }
-    catch (e) {
-      return false
-    }
-  }
-  return true
-}
 /**
  * Fetches the robots.txt file.
  * @param site
@@ -46,40 +32,77 @@ export async function fetchRobotsTxt(site: string): Promise<string | false> {
   return robotsTxt.response.data as string
 }
 
-export function mergeRobotsTxtConfig(config: ResolvedUserConfig, { groups, sitemaps }: RobotsTxtParsed): void {
-  const normalisedGroups = groups
-    .filter(group => group.userAgent.includes('*'))
-    .map((group) => {
-      for (const k of ['disallow', 'allow']) {
-        // @ts-expect-error untyped
-        group[k] = (group[k] as string[])
-          // skip any disallows that are root level
-          .filter(path => path !== '/' && path)
-          .map((path) => {
-            // convert robots.txt paths to regex paths
-            if (path.includes('*'))
-              path = path.replace(/\*/g, '.*')
-            else
-              path = `${path}.*`
-            return path
-          })
+interface RobotsTxtRule { pattern: string, allow: boolean }
+
+function matches(pattern: string, path: string): boolean {
+  const pathLength = path.length
+  const patternLength = pattern.length
+  const matchingLengths: number[] = Array.from({ length: pathLength + 1 }).fill(0)
+  let numMatchingLengths = 1
+
+  let p = 0
+  while (p < patternLength) {
+    if (pattern[p] === '$' && p + 1 === patternLength) {
+      return matchingLengths[numMatchingLengths - 1] === pathLength
+    }
+
+    if (pattern[p] === '*') {
+      numMatchingLengths = pathLength - matchingLengths[0] + 1
+      for (let i = 1; i < numMatchingLengths; i++) {
+        matchingLengths[i] = matchingLengths[i - 1] + 1
+      }
+    }
+    else {
+      let numMatches = 0
+      for (let i = 0; i < numMatchingLengths; i++) {
+        const matchLength = matchingLengths[i]
+        if (matchLength < pathLength && path[matchLength] === pattern[p]) {
+          matchingLengths[numMatches++] = matchLength + 1
+        }
+      }
+      if (numMatches === 0) {
+        return false
       }
-      return group
-    })
+      numMatchingLengths = numMatches
+    }
+    p++
+  }
+
+  return true
+}
+
+export function matchPathToRule(path: string, _rules: RobotsTxtRule[]): RobotsTxtRule | null {
+  let matchedRule: RobotsTxtRule | null = null
+
+  const rules = _rules.filter(Boolean) // filter out empty lines such as Disallow:
+  const rulesLength = rules.length
+  let i = 0
+  while (i < rulesLength) {
+    const rule = rules[i]
+    if (!matches(rule.pattern, path)) {
+      i++
+      continue
+    }
 
-  // for diallow we add it to the exclude list
-  config.scanner.exclude = [...new Set([
-    ...(config.scanner.exclude || []),
-    ...normalisedGroups.flatMap(group => group.disallow),
-  ])].filter(isValidRegex)
-  config.scanner.include = config.scanner.include || []
-  const robotsAllows = normalisedGroups.flatMap(group => group.allow).filter(a => a.length)
-  if (!config.scanner.include.length && robotsAllows.length) {
-    config.scanner.include = [...new Set([
-      '/*',
-      ...normalisedGroups.flatMap(group => group.allow),
-    ])].filter(isValidRegex)
+    if (!matchedRule || rule.pattern.length > matchedRule.pattern.length) {
+      matchedRule = rule
+    }
+    else if (
+      rule.pattern.length === matchedRule.pattern.length
+      && rule.allow
+      && !matchedRule.allow
+    ) {
+      matchedRule = rule
+    }
+    i++
   }
+
+  return matchedRule
+}
+
+export function mergeRobotsTxtConfig(config: ResolvedUserConfig, { groups, sitemaps }: RobotsTxtParsed): void {
+  config.scanner._robotsTxtRules = groups.filter((group) => {
+    return group.userAgent.includes('*') || group.userAgent.includes(String(config.lighthouseOptions?.emulatedUserAgent))
+  }).map(group => group._rules)
   if (config.scanner.sitemap !== false && sitemaps.length) {
     // allow overriding the robots.txt sitemaps with your own
     if (!Array.isArray(config.scanner.sitemap) || !config.scanner.sitemap.length)

diff --git a/packages/core/src/puppeteer/worker.ts b/packages/core/src/puppeteer/worker.ts
index b03d2be9..cf204714 100644
--- a/packages/core/src/puppeteer/worker.ts
+++ b/packages/core/src/puppeteer/worker.ts
@@ -12,6 +12,7 @@ import fs from 'node:fs'
 import { join } from 'node:path'
 import chalk from 'chalk'
 import { get, sortBy, uniqBy } from 'lodash-es'
+import { matchPathToRule } from '../discovery'
 import { useLogger } from '../logger'
 import { useUnlighthouse } from '../unlighthouse'
 import { createTaskReportFromRoute, formatBytes, ReportArtifacts } from '../util'
@@ -94,6 +95,15 @@ export async function createUnlighthouseWorker(tasks: Record
[…]

diff --git a/packages/core/src/util/robotsTxtParser.ts b/packages/core/src/util/robotsTxtParser.ts
[…]
+  const allow = asArray(group.allow).filter(rule => Boolean(rule))
+  return {
+    ...group,
+    userAgent: group.userAgent ? asArray(group.userAgent) : ['*'],
+    disallow,
+    allow,
+    _indexable: !disallow.includes((rule: string) => rule === '/'),
+    _rules: [
+      ...disallow.filter(Boolean).map(r => ({ pattern: r, allow: false })),
+      ...allow.map(r => ({ pattern: r, allow: true })),
+    ],
+  }
+}
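
Notes on the change (outside the patch):

The new `matches` function replaces the old regex conversion with a real robots.txt wildcard matcher: `matchingLengths` tracks every prefix length of `path` that the pattern scanned so far could have consumed, `*` expands that set to every longer prefix, a literal character narrows it, and a trailing `$` anchors the match to the full path. `matchPathToRule` then applies standard robots.txt precedence: the longest matching pattern wins, and on a length tie an Allow rule beats a Disallow. A usage sketch of the exported function (the rule values here are illustrative, not from the commit, and the import path depends on where your code consumes it):

import { matchPathToRule } from './robotsTxt' // in-repo path; adjust for your consumer

const rules = [
  { pattern: '/admin', allow: false },
  { pattern: '/admin/login$', allow: true },
  { pattern: '/*.pdf$', allow: false },
]

matchPathToRule('/admin/settings', rules)  // → { pattern: '/admin', allow: false }
matchPathToRule('/admin/login', rules)     // → { pattern: '/admin/login$', allow: true } (longest pattern wins)
matchPathToRule('/docs/report.pdf', rules) // → { pattern: '/*.pdf$', allow: false }
matchPathToRule('/blog', rules)            // → null (no rule matched; route is scanned)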
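The body of the worker.ts hunk (and the whole types.ts diff) is truncated in the source above. From what survives, the import of `matchPathToRule` plus the `config.scanner._robotsTxtRules` populated in `mergeRobotsTxtConfig` implies a guard roughly like the sketch below when a route is queued. This is a hypothetical reconstruction, not the committed code: `shouldQueueRoute` and its signature are invented for illustration, and `scanner.robotsTxt` is assumed to be the existing toggle for robots.txt handling.

import { matchPathToRule } from '../discovery'

interface RobotsTxtRule { pattern: string, allow: boolean }

// Hypothetical guard; the committed hunk may differ in naming and placement.
function shouldQueueRoute(path: string, scanner: { robotsTxt?: boolean, _robotsTxtRules?: RobotsTxtRule[][] }): boolean {
  // nothing to enforce if robots.txt handling is off or no rules were parsed
  if (!scanner.robotsTxt || !scanner._robotsTxtRules?.length)
    return true
  // _robotsTxtRules holds one rule array per matched user-agent group
  const rule = matchPathToRule(path, scanner._robotsTxtRules.flat())
  // only a winning Disallow rule blocks the route
  return !rule || rule.allow
}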
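On the parser side, each robots.txt group now pre-computes `_rules`, so the scanner never re-interprets patterns at match time: every Disallow entry becomes `{ pattern, allow: false }` and every Allow entry `{ pattern, allow: true }`. For example, following the shape of the visible hunk (input and output values are illustrative), the normalisation helper turns this group:

// User-agent: *
// Disallow: /admin
// Allow: /admin/login
// into:
const group = {
  userAgent: ['*'],
  disallow: ['/admin'],
  allow: ['/admin/login'],
  _indexable: true,
  _rules: [
    { pattern: '/admin', allow: false },
    { pattern: '/admin/login', allow: true },
  ],
}

One caveat worth flagging: `_indexable: !disallow.includes((rule: string) => rule === '/')` passes a predicate to `Array.prototype.includes`, which compares values rather than invoking functions, so the expression is always true; `!disallow.includes('/')` (or `.some(rule => rule === '/')`) looks like the intent.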