Skip to content

Commit

Permalink
test: link-check: trim and manage exclusions (#1189)
Browse files Browse the repository at this point in the history
* link-check: abstract away finder

* link-check: add exclude-links-check.sh

* link-check: purge unused exclusions

* test: add link-check-exclude

* link-check: exclude *.test.js

also ensure link-check-git-all follows exclusions

* link-check: exclude more missing

* link-check: final exclude purge
  • Loading branch information
casperdcl authored Apr 26, 2020
1 parent 4b2868c commit f5bb7ea
Show file tree
Hide file tree
Showing 8 changed files with 43 additions and 51 deletions.
2 changes: 2 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ jobs:
- run: yarn lint-ts
- run: yarn format-check
- run: yarn link-check-diff
- run: yarn link-check-exclude

test_full:
<<: *defaults
Expand All @@ -69,6 +70,7 @@ jobs:
- run: yarn lint-ts
- run: yarn format-check
- run: yarn link-check
- run: yarn link-check-exclude

workflows:
version: 2
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
"lint-ts": "tsc --noEmit --skipLibCheck && eslint --ext .json,.js,.ts,.tsx .",
"lint-css": "stylelint \"src/**/*.css\"",
"link-check": "./scripts/link-check-git-all.sh",
"link-check-diff": "./scripts/link-check-git-diff.sh"
"link-check-diff": "./scripts/link-check-git-diff.sh",
"link-check-exclude": "./scripts/exclude-links-check.sh"
},
"repository": {
"type": "git",
Expand Down
19 changes: 19 additions & 0 deletions scripts/exclude-links-check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Checks that `exclude-links.txt` contains only used links.
#
# Every entry of the exclusion list must still occur as a link somewhere in
# the tracked source files; entries that no longer occur are reported on
# stderr and the script exits with status 1.
#
# Environment:
#   CHECK_LINKS_RELATIVE_URL  base URL prepended to relative links
#                             (default: https://dvc.org)
#   CHECK_LINKS_EXCLUDE_LIST  path to the exclusion list, or the list itself
#                             (default: exclude-links.txt next to this script)
set -euo pipefail

source "$(dirname "$0")"/utils.sh
# urlfinder expects the base URL as its first argument; without it the first
# tracked file would be consumed as the base URL and never scanned.
base_url="${CHECK_LINKS_RELATIVE_URL:-https://dvc.org}"
exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname "$0")/exclude-links.txt}"
if [[ -f "$exclude" ]]; then
  exclude="$(cat "$exclude")"
fi

# Collect tracked files NUL-delimited so that paths containing whitespace are
# not split into several arguments.
tracked=()
while IFS= read -r -d '' path; do
  tracked+=("$path")
done < <(git ls-files -z -- '*.css' '*.js' '*.jsx' '*.md' '*.tsx' '*.ts' '*.json' ':!redirects-list.json' ':!*.test.js')

# Links present in the code base, with #fragments stripped, sorted for comm.
# With no tracked files nothing is scanned (urlfinder would otherwise read
# from stdin).
found=""
if (( ${#tracked[@]} > 0 )); then
  found="$(urlfinder "$base_url" "${tracked[@]}" | sed 's/#.*//g' | sort -u)"
fi

# comm -13 keeps the lines that appear only in the exclusion list.
missing="$(
  comm -13 <(printf '%s\n' "$found") <(printf '%s\n' "$exclude" | sort -u)
)"

if [[ -n "$missing" ]]; then
  echo "ERROR:Exclusions not found in codebase:" >&2
  printf '%s\n' "$missing" | sed 's/^/ /' >&2
  exit 1
fi
35 changes: 0 additions & 35 deletions scripts/exclude-links.txt
Original file line number Diff line number Diff line change
@@ -1,38 +1,16 @@
http://127.0.0.1:10000/devstoreaccount1;
http://localhost:3000/
http://localhost:8000/
http://millionsongdataset.com/pages/getting-dataset/
http://ogp.me/ns
https://$
http://s3-external-1.amazonaws.com/bucket/path
https://accounts.google.com/o/oauth2/auth
https://api.cloudflare.com/client/v4/zones/$
https://api.github.com/repos/$
https://blog.$
https://blog.dataversioncontrol.com/some-random
https://circleci.com/gh/iterative/dvc.org
https://code.dvc.org/foo/bar
https://data.dvc.org/foo/bar
https://dataversioncontrol.com/some-random
https://discuss.$
https://discuss.dataversioncontrol.com/some-random
https://discuss.dvc.org/some-random
https://drive.google.com/drive/folders/0AIac4JZqHhKmUk9PDA
https://dvc.org$
https://dvc.org/blog/$1
https://dvc.org/blog$1
https://dvc.org/blog/some-random
https://dvc.org/doc/command-reference$1
https://dvc.org/doc/command-reference/foo
https://dvc.org/foo
https://dvc.org/foo/bar?baz
https://dvc.org/img/<filename>.gif
https://dvc.org/some-random
https://dvc.org/uploads/images/2020-02-10/image.png
https://example.com/data.txt
https://example.com/file.csv
https://example.com/foo
https://example.com/path/to/data
https://example.com/path/to/data.csv
https://example.com/path/to/dir
https://example.com/path/to/file
Expand All @@ -42,27 +20,14 @@ https://github.com/example/registry
https://github.com/iterative/dvc.org/blob/master/content$
https://github.com/iterative/dvc/releases/download/$
https://github.com/user/proj
https://man.dvc.org/foo
https://marketplace.visualstudio.com/items?itemName=stkb.rewrap
https://myendpoint.com
https://object-storage.example.com
https://remote.dvc.org/dataset-registry
https://remote.dvc.org/dataset-registry/a3/04af...
https://remote.dvc.org/dataset-registry/a3/04afb96060aad90176268345e10355
https://remote.dvc.org/foo/bar
https://remote.dvc.org/get-started
https://s3.eu.cloud-object-storage.appdomain.cloud
https://s3-us-east-2.amazonaws.com/dvc-public/$1/$2
https://s3-us-east-2.amazonaws.com/dvc-public/code/foo/bar
https://s3-us-east-2.amazonaws.com/dvc-public/data/foo/bar
https://s3-us-east-2.amazonaws.com/dvc-public/remote/foo/bar
https://s3-us-east-2.amazonaws.com/dvc-s3-repo/
https://s3-us-east-2.amazonaws.com/dvc-s3-repo/$1
https://s3-us-east-2.amazonaws.com/dvc-s3-repo/deb/foo
https://s3-us-east-2.amazonaws.com/dvc-s3-repo/rpm/foo
https://sweedom.us10.list-manage.com/subscribe/post?u=a08bf93caae4063c4e6a351f6&amp;id=24c0ecc49a
https://www.dataversioncontrol.com/some-random
https://www.dvc.org/foo
https://www.kaggle.com/rtatman/kerneld4769833fe
https://www.meetup.com/San-Francisco-Machine-Learning-Meetup/events/264846847/
https://www.reddit.com/r/MachineLearning/comments/bx0apm/d_how_do_you_manage_your_machine_learning/
Expand Down
1 change: 1 addition & 0 deletions scripts/link-check-git-all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ pushd "$repo"
(find .github/ content/docs/ src/ \
-name '*.css' -o -name '*.js' -o -name '*.jsx' -o -name '*.md' -o -name '*.tsx' -o \
-name '*.ts' -o -name '*.json' && ls *.js *.jsx *.md *.tsx *.ts *.json) \
| grep -Ev '(redirects-list\.json|\.test\.js)$' \
| xargs -n1 -P8 "$(dirname "$0")"/link-check.sh

popd
2 changes: 1 addition & 1 deletion scripts/link-check-git-diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ repo="$(dirname "$(realpath "$(dirname "$0")")")"
pushd "$repo"

differ="git diff $(git merge-base HEAD origin/master)"
changed="$($differ --name-only -- '*.css' '*.js' '*.jsx' '*.md' '*.tsx' '*.ts' '*.json' ':!redirects-list.json')"
changed="$($differ --name-only -- '*.css' '*.js' '*.jsx' '*.md' '*.tsx' '*.ts' '*.json' ':!redirects-list.json' ':!*.test.js')"

[ -z "$changed" ] && exit 0

Expand Down
18 changes: 4 additions & 14 deletions scripts/link-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,12 @@
# link-check.sh [<files>]
set -euo pipefail

source "$(dirname "$0")"/utils.sh
base_url="${CHECK_LINKS_RELATIVE_URL:-https://dvc.org}"
exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname "$0")/exclude-links.txt}"
[ -f "$exclude" ] && exclude="$(cat "$exclude")"
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:74.0) Gecko/20100101 Firefox/74.0"


finder(){ # expects list of files
content="$(cat "$@")" # read once (could be file descriptors)
# explicit links not in markdown
echo "$content" | pcregrep -o '(?<!\]\()https?://[^\s<>{}"'"'"'`]+'
# explicit links in markdown
echo "$content" | pcregrep -o '(?<=\])\(https?://[^[\]\s]+\)' | pcregrep -o '\((?:[^)(]*(?R)?)*+\)' | pcregrep -o '(?<=\().*(?=\))'
# relative links in markdown
echo "$content" | sed -nE 's/.*]\((\/[^)[:space:]]+).*/\1/p' | xargs -n1 -II echo ${base_url}I
# relative links in html
echo "$content" | sed -nE 's/.*href=["'"'"'](\/[^"'"'"']+)["'"'"'].*/\1/p' | xargs -n1 -II echo ${base_url}I
}

checker(){ # expects list of urls
errors=0
for url in "$@"; do
Expand Down Expand Up @@ -56,7 +44,9 @@ fails=0
for file in "$@"; do
echo -n "$file:"
prev=$fails
checker $(finder "$file" | sed 's/#.*//g' | sort -u | comm -23 - <(echo "$exclude" | sort -u)) || fails=$(($fails + 1))
checker $(urlfinder "$base_url" "$file" | sed 's/#.*//g' | sort -u \
| comm -23 - <(echo "$exclude" | sort -u)) \
|| fails=$(($fails + 1))
[ $prev -eq $fails ] && echo OK
done

Expand Down
14 changes: 14 additions & 0 deletions scripts/utils.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Prints every link found in the given files, one per line.
# Arguments:
#   $1   - base URL prepended to relative (/-rooted) links
#   $2.. - files to scan (could be file descriptors)
# Outputs:
#   stdout: absolute links not in markdown, absolute links in markdown,
#           then relative markdown links and relative html hrefs
#           (both prefixed with the base URL)
urlfinder(){ # expects <base_url> <files>...
  # Locals, so that the sourcing script's own variables are left untouched.
  local base_url content rel
  base_url="$1"
  content="$(cat "${@:2}")" # read once (could be file descriptors)
  # explicit links not in markdown
  printf '%s\n' "$content" | pcregrep -o '(?<!\]\()https?://[^\s<>{}"'"'"'`]+'
  # explicit links in markdown
  printf '%s\n' "$content" | pcregrep -o '(?<=\])\(https?://[^[\]\s]+\)' | pcregrep -o '\((?:[^)(]*(?R)?)*+\)' | pcregrep -o '(?<=\().*(?=\))'
  # Relative links are prefixed in a read loop rather than with
  # `xargs -I`: xargs would also substitute its placeholder inside the base
  # URL and fails on quotes or backslashes in a path.
  {
    # relative links in markdown
    printf '%s\n' "$content" | sed -nE 's/.*]\((\/[^)[:space:]]+).*/\1/p'
    # relative links in html
    printf '%s\n' "$content" | sed -nE 's/.*href=["'"'"'](\/[^"'"'"']+)["'"'"'].*/\1/p'
  } | while IFS= read -r rel; do
    printf '%s%s\n' "$base_url" "$rel"
  done
}

0 comments on commit f5bb7ea

Please sign in to comment.