From f5bb7eafc9a33e7ffe334699569486ead929cb18 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sun, 26 Apr 2020 17:11:18 +0100 Subject: [PATCH] test: link-check: trim and manage exclusions (#1189) * link-check: abstract away finder * link-check: add exclude-links-check.sh * link-check: purge unused exclusions * test: add link-check-exclude * link-check: exclude *.test.js also ensure link-check-git-all follows exclusions * link-check: exclude more missing * link-check: final exclude purge --- .circleci/config.yml | 2 ++ package.json | 3 ++- scripts/exclude-links-check.sh | 19 ++++++++++++++++++ scripts/exclude-links.txt | 35 ---------------------------------- scripts/link-check-git-all.sh | 1 + scripts/link-check-git-diff.sh | 2 +- scripts/link-check.sh | 18 ++++------------- scripts/utils.sh | 14 ++++++++++++++ 8 files changed, 43 insertions(+), 51 deletions(-) create mode 100755 scripts/exclude-links-check.sh create mode 100644 scripts/utils.sh diff --git a/.circleci/config.yml b/.circleci/config.yml index 8d3a50f56f..866f3af235 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -59,6 +59,7 @@ jobs: - run: yarn lint-ts - run: yarn format-check - run: yarn link-check-diff + - run: yarn link-check-exclude test_full: <<: *defaults @@ -69,6 +70,7 @@ jobs: - run: yarn lint-ts - run: yarn format-check - run: yarn link-check + - run: yarn link-check-exclude workflows: version: 2 diff --git a/package.json b/package.json index ae274ebb85..4574f1bdf5 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,8 @@ "lint-ts": "tsc --noEmit --skipLibCheck && eslint --ext .json,.js,.ts,.tsx .", "lint-css": "stylelint \"src/**/*.css\"", "link-check": "./scripts/link-check-git-all.sh", - "link-check-diff": "./scripts/link-check-git-diff.sh" + "link-check-diff": "./scripts/link-check-git-diff.sh", + "link-check-exclude": "./scripts/exclude-links-check.sh" }, "repository": { "type": "git", diff --git a/scripts/exclude-links-check.sh 
b/scripts/exclude-links-check.sh
new file mode 100755
index 0000000000..28b8b90823
--- /dev/null
+++ b/scripts/exclude-links-check.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Checks that `exclude-links.txt` contains only used links.
+#
+# Environment:
+#   CHECK_LINKS_RELATIVE_URL  prefix for relative links (default: https://dvc.org)
+#   CHECK_LINKS_EXCLUDE_LIST  exclusions: a file path, or the newline-separated
+#                             list itself (default: exclude-links.txt beside this script)
+set -euo pipefail
+
+source "$(dirname "$0")"/utils.sh
+base_url="${CHECK_LINKS_RELATIVE_URL:-https://dvc.org}"
+exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname "$0")/exclude-links.txt}"
+[ -f "$exclude" ] && exclude="$(cat "$exclude")"
+
+# urlfinder takes the base URL as its first argument; omitting it would make the
+# first tracked file be consumed as the base URL and never scanned.
+# The unquoted $(git ls-files ...) is deliberately word-split into one argument
+# per file, so tracked paths containing whitespace are not supported.
+# comm -13 keeps only lines unique to the exclusion list, i.e. unused exclusions.
+missing="$(
+  urlfinder "$base_url" $(git ls-files '*.css' '*.js' '*.jsx' '*.md' '*.tsx' '*.ts' '*.json' ':!redirects-list.json' ':!*.test.js') \
+    | sed 's/#.*//g' | sort -u \
+    | comm -13 - <(echo "$exclude" | sort -u)
+)"
+
+if [[ -n "$missing" ]]; then
+  echo "ERROR:Exclusions not found in codebase:" >&2
+  echo "$missing" | sed 's/^/ /' >&2
+  exit 1
+fi
diff --git a/scripts/exclude-links.txt b/scripts/exclude-links.txt index 4943c165bf..ed8797ab0f 100644 --- a/scripts/exclude-links.txt +++ b/scripts/exclude-links.txt @@ -1,38 +1,16 @@ http://127.0.0.1:10000/devstoreaccount1; -http://localhost:3000/ http://localhost:8000/ http://millionsongdataset.com/pages/getting-dataset/ -http://ogp.me/ns https://$ http://s3-external-1.amazonaws.com/bucket/path https://accounts.google.com/o/oauth2/auth https://api.cloudflare.com/client/v4/zones/$ https://api.github.com/repos/$ -https://blog.$ -https://blog.dataversioncontrol.com/some-random https://circleci.com/gh/iterative/dvc.org -https://code.dvc.org/foo/bar -https://data.dvc.org/foo/bar -https://dataversioncontrol.com/some-random https://discuss.$ -https://discuss.dataversioncontrol.com/some-random -https://discuss.dvc.org/some-random https://drive.google.com/drive/folders/0AIac4JZqHhKmUk9PDA -https://dvc.org$ -https://dvc.org/blog/$1 -https://dvc.org/blog$1 -https://dvc.org/blog/some-random -https://dvc.org/doc/command-reference$1 -https://dvc.org/doc/command-reference/foo -https://dvc.org/foo -https://dvc.org/foo/bar?baz -https://dvc.org/img/.gif -https://dvc.org/some-random
-https://dvc.org/uploads/images/2020-02-10/image.png https://example.com/data.txt https://example.com/file.csv -https://example.com/foo -https://example.com/path/to/data https://example.com/path/to/data.csv https://example.com/path/to/dir https://example.com/path/to/file @@ -42,27 +20,14 @@ https://github.com/example/registry https://github.com/iterative/dvc.org/blob/master/content$ https://github.com/iterative/dvc/releases/download/$ https://github.com/user/proj -https://man.dvc.org/foo https://marketplace.visualstudio.com/items?itemName=stkb.rewrap https://myendpoint.com https://object-storage.example.com https://remote.dvc.org/dataset-registry -https://remote.dvc.org/dataset-registry/a3/04af... https://remote.dvc.org/dataset-registry/a3/04afb96060aad90176268345e10355 -https://remote.dvc.org/foo/bar https://remote.dvc.org/get-started https://s3.eu.cloud-object-storage.appdomain.cloud -https://s3-us-east-2.amazonaws.com/dvc-public/$1/$2 -https://s3-us-east-2.amazonaws.com/dvc-public/code/foo/bar -https://s3-us-east-2.amazonaws.com/dvc-public/data/foo/bar -https://s3-us-east-2.amazonaws.com/dvc-public/remote/foo/bar -https://s3-us-east-2.amazonaws.com/dvc-s3-repo/ -https://s3-us-east-2.amazonaws.com/dvc-s3-repo/$1 -https://s3-us-east-2.amazonaws.com/dvc-s3-repo/deb/foo -https://s3-us-east-2.amazonaws.com/dvc-s3-repo/rpm/foo https://sweedom.us10.list-manage.com/subscribe/post?u=a08bf93caae4063c4e6a351f6&id=24c0ecc49a -https://www.dataversioncontrol.com/some-random -https://www.dvc.org/foo https://www.kaggle.com/rtatman/kerneld4769833fe https://www.meetup.com/San-Francisco-Machine-Learning-Meetup/events/264846847/ https://www.reddit.com/r/MachineLearning/comments/bx0apm/d_how_do_you_manage_your_machine_learning/ diff --git a/scripts/link-check-git-all.sh b/scripts/link-check-git-all.sh index 3505ac51d7..d5b3abc0f7 100755 --- a/scripts/link-check-git-all.sh +++ b/scripts/link-check-git-all.sh @@ -8,6 +8,7 @@ pushd "$repo" (find .github/ content/docs/ src/ \ -name 
'*.css' -o -name '*.js' -o -name '*.jsx' -o -name '*.md' -o -name '*.tsx' -o \ -name '*.ts' -o -name '*.json' && ls *.js *.jsx *.md *.tsx *.ts *.json) \ + | grep -Ev '(redirects-list\.json|\.test\.js)$' \ | xargs -n1 -P8 "$(dirname "$0")"/link-check.sh popd diff --git a/scripts/link-check-git-diff.sh b/scripts/link-check-git-diff.sh index bbbef2960f..94680f56c8 100755 --- a/scripts/link-check-git-diff.sh +++ b/scripts/link-check-git-diff.sh @@ -5,7 +5,7 @@ repo="$(dirname "$(realpath "$(dirname "$0")")")" pushd "$repo" differ="git diff $(git merge-base HEAD origin/master)" -changed="$($differ --name-only -- '*.css' '*.js' '*.jsx' '*.md' '*.tsx' '*.ts' '*.json' ':!redirects-list.json')" +changed="$($differ --name-only -- '*.css' '*.js' '*.jsx' '*.md' '*.tsx' '*.ts' '*.json' ':!redirects-list.json' ':!*.test.js')" [ -z "$changed" ] && exit 0 diff --git a/scripts/link-check.sh b/scripts/link-check.sh index f10c20793f..3596a15ea8 100755 --- a/scripts/link-check.sh +++ b/scripts/link-check.sh @@ -6,24 +6,12 @@ # link-check.sh [] set -euo pipefail +source "$(dirname "$0")"/utils.sh base_url="${CHECK_LINKS_RELATIVE_URL:-https://dvc.org}" exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname "$0")/exclude-links.txt}" [ -f "$exclude" ] && exclude="$(cat "$exclude")" user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:74.0) Gecko/20100101 Firefox/74.0" - -finder(){ # expects list of files - content="$(cat "$@")" # read once (could be file descriptors) - # explicit links not in markdown - echo "$content" | pcregrep -o '(?{}"'"'"'`]+' - # explicit links in markdown - echo "$content" | pcregrep -o '(?<=\])\(https?://[^[\]\s]+\)' | pcregrep -o '\((?:[^)(]*(?R)?)*+\)' | pcregrep -o '(?<=\().*(?=\))' - # relative links in markdown - echo "$content" | sed -nE 's/.*]\((\/[^)[:space:]]+).*/\1/p' | xargs -n1 -II echo ${base_url}I - # relative links in html - echo "$content" | sed -nE 's/.*href=["'"'"'](\/[^"'"'"']+)["'"'"'].*/\1/p' | xargs -n1 -II echo ${base_url}I -} - checker(){ # 
expects list of urls errors=0 for url in "$@"; do @@ -56,7 +44,9 @@ fails=0 for file in "$@"; do echo -n "$file:" prev=$fails - checker $(finder "$file" | sed 's/#.*//g' | sort -u | comm -23 - <(echo "$exclude" | sort -u)) || fails=$(($fails + 1)) + checker $(urlfinder "$base_url" "$file" | sed 's/#.*//g' | sort -u \ + | comm -23 - <(echo "$exclude" | sort -u)) \ + || fails=$(($fails + 1)) [ $prev -eq $fails ] && echo OK done
diff --git a/scripts/utils.sh b/scripts/utils.sh
new file mode 100644
index 0000000000..e0891c84f6
--- /dev/null
+++ b/scripts/utils.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Shared helpers for the link-check scripts; sourced by them, not run directly.
+
+# Prints every link found in the given files, one per line (not de-duplicated).
+# Usage: urlfinder <base_url> <files>...
+#   <base_url>  prefix prepended to relative links (those starting with "/")
+#   <files>     files to scan; read once up front (could be file descriptors)
+urlfinder(){
+  local base_url="$1" content link
+  content="$(cat "${@:2}")"
+  # explicit links not in markdown; the lookbehind skips URLs directly after "](",
+  # those are extracted by the next pipeline
+  printf '%s\n' "$content" | pcregrep -o '(?<!\]\()https?://[^\s<>{}"'"'"'`]+'
+  # explicit links in markdown: take "(http...)" following "]", cut it down to the
+  # balanced parenthesised group (recursive pattern), then drop the outer parens
+  printf '%s\n' "$content" \
+    | pcregrep -o '(?<=\])\(https?://[^[\]\s]+\)' \
+    | pcregrep -o '\((?:[^)(]*(?R)?)*+\)' \
+    | pcregrep -o '(?<=\().*(?=\))'
+  # relative links in markdown; a read loop adds the prefix so quotes or
+  # backslashes in a path are kept literally
+  printf '%s\n' "$content" | sed -nE 's/.*]\((\/[^)[:space:]]+).*/\1/p' | while IFS= read -r link; do printf '%s%s\n' "$base_url" "$link"; done
+  # relative links in html
+  printf '%s\n' "$content" | sed -nE 's/.*href=["'"'"'](\/[^"'"'"']+)["'"'"'].*/\1/p' | while IFS= read -r link; do printf '%s%s\n' "$base_url" "$link"; done
+}