Skip to content

Commit

Permalink
Make user-agent for linkchecks customisable (#2562)
Browse files Browse the repository at this point in the history
Co-authored-by: Morten Piibeleht <[email protected]>
  • Loading branch information
giordano and mortenpi authored Sep 1, 2024
1 parent 1201725 commit 30f6ba2
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 7 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## UNRELEASED

### Added

* The `User-Agent` header set in the linkcheck HTTP(S) requests can now be customized with the `linkcheck_useragent` option to `makedocs`. ([#2557], [#2562])

### Fixed

* The paths for `size_threshold_ignore` option of `Documenter.HTML` are now correctly normalized and no longer sensitive to platform-dependent differences in path separators. ([#2560], [#2561])
Expand Down Expand Up @@ -1880,8 +1884,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
[#2514]: https://github.com/JuliaDocs/Documenter.jl/issues/2514
[#2549]: https://github.com/JuliaDocs/Documenter.jl/issues/2549
[#2551]: https://github.com/JuliaDocs/Documenter.jl/issues/2551
[#2557]: https://github.com/JuliaDocs/Documenter.jl/issues/2557
[#2560]: https://github.com/JuliaDocs/Documenter.jl/issues/2560
[#2561]: https://github.com/JuliaDocs/Documenter.jl/issues/2561
[#2562]: https://github.com/JuliaDocs/Documenter.jl/issues/2562
[JuliaLang/julia#36953]: https://github.com/JuliaLang/julia/issues/36953
[JuliaLang/julia#38054]: https://github.com/JuliaLang/julia/issues/38054
[JuliaLang/julia#39841]: https://github.com/JuliaLang/julia/issues/39841
Expand Down
20 changes: 14 additions & 6 deletions src/docchecks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ function linkcheck(node::MarkdownAST.Node, element::MarkdownAST.AbstractElement,
return nothing
end

# Default `User-Agent` string for linkcheck requests; it is the default value of the
# `linkcheck_useragent` option. The string imitates a Chrome browser on Linux.
const _LINKCHECK_DEFAULT_USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"

function linkcheck(node::MarkdownAST.Node, link::MarkdownAST.Link, doc::Document; method::Symbol=:HEAD)

# first, make sure we're not supposed to ignore this link
Expand All @@ -203,6 +205,7 @@ function linkcheck(node::MarkdownAST.Node, link::MarkdownAST.Link, doc::Document

if !haskey(doc.internal.locallinks, link)
timeout = doc.user.linkcheck_timeout
useragent = doc.user.linkcheck_useragent
null_file = @static Sys.iswindows() ? "nul" : "/dev/null"
# In some cases, web servers (e.g. docs.github.com as of 2022) will reject requests
# that declare a non-browser user agent (curl specifically passes 'curl/X.Y'). In
Expand All @@ -212,12 +215,17 @@ function linkcheck(node::MarkdownAST.Node, link::MarkdownAST.Link, doc::Document
# Mozilla developer docs, but only if it's an HTTP(S) request.
#
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent#chrome_ua_string
fakebrowser = startswith(uppercase(link.destination), "HTTP") ? [
"--user-agent",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"-H",
"accept-encoding: gzip, deflate, br",
] : ""
# Extra curl arguments for HTTP(S) requests (header options, plus the user agent
# when one is configured). For any other protocol an empty string is used.
#
# The HTTP(S) branch must end with `headers` itself: an `if` without an `else`
# evaluates to `nothing` when its condition is false, so ending the branch on the
# inner `if` would make `fakebrowser` `nothing` (losing the accept-encoding header)
# whenever `useragent` is empty.
fakebrowser = if startswith(uppercase(link.destination), "HTTP")
    headers = [
        "-H",
        "accept-encoding: gzip, deflate, br",
    ]
    # Only pass `--user-agent` when a non-empty user agent string was configured.
    if !isempty(useragent)
        push!(headers, "--user-agent", useragent)
    end
    headers
else
    ""
end
cmd = `curl $(method === :HEAD ? "-sI" : "-s") --proto =http,https,ftp,ftps $(fakebrowser) $(link.destination) --max-time $timeout -o $null_file --write-out "%{http_code} %{url_effective} %{redirect_url}"`

local result
Expand Down
3 changes: 3 additions & 0 deletions src/documents.jl
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,7 @@ struct User
linkcheck::Bool # Check external links..
linkcheck_ignore::Vector{Union{String,Regex}} # ..and then ignore (some of) them.
linkcheck_timeout::Real # ..but only wait this many seconds for each one.
linkcheck_useragent::String # User agent to use for linkchecks.
checkdocs::Symbol # Check objects missing from `@docs` blocks. `:none`, `:exports`, or `:all`.
doctestfilters::Vector{Regex} # Filtering for doctests
warnonly::Vector{Symbol} # List of docerror groups that should only warn, rather than cause a build failure
Expand Down Expand Up @@ -385,6 +386,7 @@ function Document(;
linkcheck:: Bool = false,
linkcheck_ignore :: Vector = [],
linkcheck_timeout :: Real = 10,
linkcheck_useragent :: String= _LINKCHECK_DEFAULT_USERAGENT,
checkdocs::Symbol = :all,
doctestfilters::Vector{Regex}= Regex[],
warnonly :: Union{Bool,Symbol,Vector{Symbol}} = Symbol[],
Expand Down Expand Up @@ -450,6 +452,7 @@ function Document(;
linkcheck,
linkcheck_ignore,
linkcheck_timeout,
linkcheck_useragent,
checkdocs,
doctestfilters,
warnonly,
Expand Down
12 changes: 12 additions & 0 deletions src/makedocs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,18 @@ ignored.
**`linkcheck_timeout`** configures how long `curl` waits (in seconds) for a link request to
return a response before giving up. The default is 10 seconds.
**`linkcheck_useragent`** can be used to override the user agent string used by the HTTP and
HTTPS requests made when checking for broken links. Currently, the default user agent is
```
$(_LINKCHECK_DEFAULT_USERAGENT)
```
which is set to mimic a realistic web browser. However, the exact user agent string is subject
to change. As such, it is possible that breakages can occur when Documenter's version changes,
but the goal is to set the user agent such that it would be accepted by as many web servers as
possible.
**`warnonly`** can be used to control whether the `makedocs` build fails with an error, or
simply prints a warning if it detects any issues with the document. Additionally, a `Symbol`
or a `Vector` of `Symbol`s can be passed to make Documenter warn for only those specified
Expand Down
3 changes: 2 additions & 1 deletion test/examples/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ examples_html_local_doc = if "html-local" in EXAMPLE_BUILDS
repo = "https://dev.azure.com/org/project/_git/repo?path={path}&version={commit}{line}&lineStartColumn=1&lineEndColumn=1",
linkcheck = true,
linkcheck_ignore = [r"(x|y).md", "z.md", r":func:.*"],
linkcheck_useragent = "Documenter/1",
format = Documenter.HTML(
assets = [
"assets/custom.css",
Expand All @@ -434,7 +435,7 @@ else
end

# HTML: draft mode
examples_html_local_doc = if "html-draft" in EXAMPLE_BUILDS
examples_html_draft_doc = if "html-draft" in EXAMPLE_BUILDS
@info("Building mock package docs: HTMLWriter / draft build")
@quietly makedocs(
debug = true,
Expand Down
7 changes: 7 additions & 0 deletions test/examples/tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,9 @@ end
# .. but, crucially, Main.SVG_HTML did _not_ get written out.
@test !isfile(joinpath(build_dir, "example-output", "$(SVG_BIG.hash_slug)-002.svg"))
end

# Testing linkcheck_useragent default
@test doc.user.linkcheck_useragent == Documenter._LINKCHECK_DEFAULT_USERAGENT
end

@testset "HTML: local" begin
Expand Down Expand Up @@ -421,6 +424,10 @@ end
# .. but, crucially, Main.SVG_HTML did _not_ get written out.
@test !isfile(joinpath(build_dir, "example-output-$(SVG_BIG.hash_slug)-002.svg"))
end

# It doesn't actually test that the user agent was used correctly, but at least it tests that
# the option got set.
@test doc.user.linkcheck_useragent == "Documenter/1"
end

@testset "HTML: pagesonly" begin
Expand Down

0 comments on commit 30f6ba2

Please sign in to comment.