From 1f2b4bb65455e8b2a99e111b39ff795699e88be6 Mon Sep 17 00:00:00 2001 From: Stanislav Popov Date: Tue, 6 Apr 2021 22:32:29 +0500 Subject: [PATCH] fix: make working in docker --- .env | 1 + Dockerfile | 3 +- README.md | 168 ++++++++++++++++++++++++--------------------- docker-compose.yml | 2 + src/scrap-site.js | 3 +- 5 files changed, 98 insertions(+), 79 deletions(-) diff --git a/.env b/.env index ef21fea..c68e1e7 100644 --- a/.env +++ b/.env @@ -7,3 +7,4 @@ SERVER_URL=http://localhost:5301 FRONTEND_URL=http://localhost:5302 SCAN_DEFAULT_MAX_REQUESTS=0 YAKE_SERVER_URL=http://yake:5000/yake/ +PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 059ae10..3e6fdff 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ -FROM buildkite/puppeteer:latest +FROM buildkite/puppeteer:8.0.0 # ARG PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=${PUPPETEER_SKIP_CHROMIUM_DOWNLOAD:-"false"} +# ARG PUPPETEER_EXECUTABLE_PATH=/node_modules/puppeteer/.local-chromium/linux-782078/chrome-linux/chrome # RUN apk update && \ # apk upgrade && \ diff --git a/README.md b/README.md index a1f7fe0..a14a906 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,94 @@ Demo: ## Using without install Open https://viasite.github.io/site-audit-seo-viewer/. 
+## Features: +- Crawls the entire site, collects links to pages and documents +- Does not follow links outside the scanned domain (configurable) +- Analyse each page with Lighthouse (see below) +- Analyse main page text with Mozilla Readability and Yake +- Search pages with SSL mixed content +- Scan list of urls, `--url-list` +- Set default report fields and filters +- Scan presets +- Documents with the extensions `doc`, `docx`, `xls`, `xlsx`, `ppt`, `pptx`, `pdf`, `rar`, `zip` are added to the list with a depth == 0 + +## Technical details: +- Does not load images, css, js (configurable) +- Each site is saved to a file with a domain name in `~/site-audit-seo/` +- Some URLs are ignored ([`preRequest` in `src/scrap-site.js`](src/scrap-site.js#L98)) + +### XLSX features +- The first row and the first column are fixed +- Column width and auto cell height are configured for easy viewing +- URL, title, description and some other fields are limited in width +- Title is right-aligned to reveal the common part +- Validation of some columns (status, request time, description length) +- Export xlsx to Google Drive and print URL + +### Web viewer features: +- Fixed table header and url column +- Add/remove columns +- Column presets +- Field groups by categories +- Filters presets (ex. 
`h1_count != 1`) +- Color validation +- Verbose page details (`+` button) +- Direct URL to same report with selected fields, filters, sort +- Stats for all scanned pages, validation summary +- Persistent URL to report when using `--upload` +- Switch between last uploaded reports +- Rescan current report + + +### Fields list (18.08.2020): +- url +- mixed_content_url +- canonical +- is_canonical +- previousUrl +- depth +- status +- request_time +- title +- h1 +- page_date +- description +- keywords +- og_title +- og_image +- schema_types +- h1_count +- h2_count +- h3_count +- h4_count +- canonical_count +- google_amp +- images +- images_without_alt +- images_alt_empty +- images_outer +- links +- links_inner +- links_outer +- text_ratio_percent +- dom_size +- html_size +- lighthouse_scores_performance +- lighthouse_scores_pwa +- lighthouse_scores_accessibility +- lighthouse_scores_best-practices +- lighthouse_scores_seo +- lighthouse_first-contentful-paint +- lighthouse_speed-index +- lighthouse_largest-contentful-paint +- lighthouse_interactive +- lighthouse_total-blocking-time +- lighthouse_cumulative-layout-shift +- and 150 more lighthouse tests! + + +## Install + ## Install with docker-compose ``` bash git clone /~https://github.com/viasite/site-audit-seo @@ -29,6 +117,7 @@ Service will available on http://localhost:5302 ##### Default ports: - Backend: `5301` - Frontend: `5302` +- Yake: `5303` You can change it in `.env` file or in `docker-compose.yml`. @@ -52,7 +141,7 @@ sudo chown -R $USER:$USER "$(npm prefix -g)/lib/node_modules/site-audit-seo/node Error details [Invalid file descriptor to ICU data received](/~https://github.com/puppeteer/puppeteer/issues/2519). 
-## Usage: +## Command line usage: ``` $ site-audit-seo --help Usage: site-audit-seo -u https://example.com --upload @@ -89,82 +178,6 @@ Options: -h, --help display help for command ``` -## Features: -- Crawls the entire site, collects links to pages and documents -- Validation summary after scan -- Documents with the extensions `doc`,` docx`, `xls`,` xlsx`, `ppt`,` pptx`, `pdf`,` rar`, `zip` are added to the list with a depth == 0 -- Search pages with SSL mixed content -- Each site is saved to a file with a domain name in `~/site-audit-seo/` -- Does not follow links outside the scanned domain (configurable) -- Does not load images, css, js (configurable) -- Some URLs are ignored ([`preRequest` in `src/scrap-site.js`](src/scrap-site.js#L98)) -- Analyse each page with Lighthouse (see below) -- Scan list of urls, `--url-list` - -### XLSX features -- The first row and the first column are fixed -- Column width and auto cell height are configured for easy viewing -- URL, title, description and some other fields are limited in width -- Title is right-aligned to reveal the common part -- Validation of some columns (status, request time, description length) -- Export xlsx to Google Drive and print URL - -### Web viewer features: -- Fixed table header and url column -- Add/remove columns -- Column presets -- Field groups by categories -- Filters presets (ex. 
`h1_count != 1`) -- Color validation -- Persistent URL to report when `--upload` using -- Switch between last uploaded reports - - -### Fields list (18.08.2020): -- url -- mixed_content_url -- canonical -- is_canonical -- previousUrl -- depth -- status -- request_time -- title -- h1 -- page_date -- description -- keywords -- og_title -- og_image -- schema_types -- h1_count -- h2_count -- h3_count -- h4_count -- canonical_count -- google_amp -- images -- images_without_alt -- images_alt_empty -- images_outer -- links -- links_inner -- links_outer -- text_ratio_percent -- dom_size -- html_size -- lighthouse_scores_performance -- lighthouse_scores_pwa -- lighthouse_scores_accessibility -- lighthouse_scores_best-practices -- lighthouse_scores_seo -- lighthouse_first-contentful-paint -- lighthouse_speed-index -- lighthouse_largest-contentful-paint -- lighthouse_interactive -- lighthouse_total-blocking-time -- lighthouse_cumulative-layout-shift -- and 150 more lighthouse tests! ## Custom fields @@ -362,6 +375,7 @@ site-audit-seo -u https://example.com --lighthouse - [Offline w3c validation](https://www.npmjs.com/package/html-validator) - [Words count](/~https://github.com/IonicaBizau/count-words) - [Sentences count](/~https://github.com/NaturalNode/natural) +- Do not load image with non-standard URL, like [this](https://lh3.googleusercontent.com/pw/ACtC-3dd9Ng2Jdq713vsFqqTrNT6j_nyH3mFsRAzPbIAzWvDoRkiKSW2MIQOxrtpPVab4e9BElcL_Rlr8eGT68R7ZBnLCHpnHHJNRcd8JadddrxpVVClu1iOnkxPUQXOx-7OoNDmeEtH0xyg7NkEI8VF0oJRXQ=w1423-h1068-no?authuser=0) - External follow links - Broken images - Breadcrumbs - /~https://github.com/glitchdigital/structured-data-testing-tool diff --git a/docker-compose.yml b/docker-compose.yml index a09c328..bc9b461 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,6 +16,7 @@ services: ports: - ${SERVER_PORT}:${SERVER_PORT} volumes: + # - .:/app - ./data:/app/data - ./data/reports:/app/data/reports - ./data/db-docker.json:/app/data/db.json @@ -44,6 
+45,7 @@ services: yake: image: liaad/yake-server container_name: ${NAMESPACE}-yake + restart: always ports: - 5303:5000 diff --git a/src/scrap-site.js b/src/scrap-site.js index 1cfff3d..949000d 100644 --- a/src/scrap-site.js +++ b/src/scrap-site.js @@ -155,7 +155,7 @@ module.exports = async (baseUrl, options = {}) => { skipRequestedRedirect: true, // all redirects marks as visited depthPriority: false, // without it find not all pages retryCount: 1, - args: ['--no-sandbox', '--disable-dev-shm-usage'], // puppeteer freezes without it + args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'], // puppeteer freezes without it exporter, // url ignore rules @@ -501,6 +501,7 @@ module.exports = async (baseUrl, options = {}) => { }; const crawlerOptions = {...defaultOptions, ...options}; + crawlerOptions.args = defaultOptions.args; // override args for chromium // start const start = Date.now();