Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: option to fix canonical URLs of a multidocumenter build #60

Merged
merged 5 commits into from
Aug 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 45 additions & 11 deletions src/MultiDocumenter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ import Gumbo, AbstractTrees
using HypertextLiteral
import Git: git

module DocumenterTools
import Gumbo, AbstractTrees
include("documentertools/walkdocs.jl")
include("documentertools/canonical_urls.jl")
end

"""
SearchConfig(index_versions = ["stable"], engine = MultiDocumenter.FlexSearch, lowfi = false)

Expand All @@ -25,13 +31,22 @@ struct MultiDocRef
path::String
name::String

fix_canonical_url::Bool

# these are not actually used internally
giturl::String
branch::String
end

function MultiDocRef(; upstream, name, path, giturl = "", branch = "gh-pages")
MultiDocRef(upstream, path, name, giturl, branch)
function MultiDocRef(;
upstream,
name,
path,
giturl = "",
branch = "gh-pages",
fix_canonical_url = true,
)
MultiDocRef(upstream, path, name, fix_canonical_url, giturl, branch)
end

struct DropdownNav
Expand Down Expand Up @@ -60,13 +75,11 @@ function walk_outputs(f, root, docs::Vector{MultiDocRef}, dirs::Vector{String})
for dir in dirs
dirpath = joinpath(p, dir)
isdir(dirpath) || continue
for (r, _, files) in walkdir(dirpath)
for file in files
file == "index.html" || continue
pfitzseb marked this conversation as resolved.
Show resolved Hide resolved

# +1 for path separator
f(chop(r, head = length(root) + 1, tail = 0), joinpath(r, file))
end
DocumenterTools.walkdocs(
dirpath,
fileinfo -> fileinfo.filename == "index.html",
) do fileinfo
f(relpath(dirname(fileinfo.fullpath), root), fileinfo.fullpath)
end
break
end
Expand All @@ -76,6 +89,7 @@ end
include("renderers.jl")
include("search/flexsearch.jl")
include("search/stork.jl")
include("canonical.jl")

const DEFAULT_ENGINE = SearchConfig(index_versions = ["stable", "dev"], engine = FlexSearch)

Expand All @@ -91,6 +105,7 @@ const DEFAULT_ENGINE = SearchConfig(index_versions = ["stable", "dev"], engine =
prettyurls = true,
rootpath = "/",
hide_previews = true,
canonical = nothing,
)

Aggregates multiple Documenter.jl-based documentation pages `docs` into `outdir`.
Expand All @@ -105,6 +120,9 @@ Aggregates multiple Documenter.jl-based documentation pages `docs` into `outdir`
- `prettyurls` removes all `index.html` suffixes from links in the global navigation.
- `rootpath` is the path your site ends up being deployed at, e.g. `/foo/` if it's hosted at `https://bar.com/foo`
- `hide_previews` removes preview builds from the aggregated documentation.
- `canonical`: if set to the root URL of the MultiDocumenter site, will check and, if necessary, update the
canonical URL tags for each package site to point to the directory. Similar to the `canonical` argument of
`Documenter.HTML` constructor.
"""
function make(
outdir,
Expand All @@ -117,10 +135,19 @@ function make(
prettyurls = true,
rootpath = "/",
hide_previews = true,
canonical::Union{AbstractString,Nothing} = nothing,
)
maybe_clone(flatten_multidocrefs(docs))

dir = make_output_structure(flatten_multidocrefs(docs), prettyurls, hide_previews)
if !isnothing(canonical)
canonical = rstrip(canonical, '/')
end
dir = make_output_structure(
flatten_multidocrefs(docs),
prettyurls,
hide_previews;
canonical,
)
out_assets = joinpath(dir, "assets")
if assets_dir !== nothing && isdir(assets_dir)
cp(assets_dir, out_assets)
Expand Down Expand Up @@ -192,7 +219,12 @@ function maybe_clone(docs::Vector{MultiDocRef})
end
end

function make_output_structure(docs::Vector{MultiDocRef}, prettyurls, hide_previews)
function make_output_structure(
docs::Vector{MultiDocRef},
prettyurls,
hide_previews;
canonical::Union{AbstractString,Nothing},
)
dir = mktempdir()

for doc in docs
Expand All @@ -210,6 +242,8 @@ function make_output_structure(docs::Vector{MultiDocRef}, prettyurls, hide_previ
if hide_previews && isdir(previewpath)
rm(previewpath, recursive = true)
end

fix_canonical_url!(doc; canonical, root_dir = dir)
end

open(joinpath(dir, "index.html"), "w") do io
Expand Down
23 changes: 23 additions & 0 deletions src/canonical.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# This files contains the functions used to implement the canonical URL
# update functionality.
function fix_canonical_url!(
doc::MultiDocRef;
canonical::Union{AbstractString,Nothing},
root_dir::AbstractString,
)
# If the user didn't set `canonical`, then we don't need to do anything
isnothing(canonical) && return nothing
# The user can also disable the canonical URL fixing on a per-package basis
doc.fix_canonical_url || return nothing
# Determine the canonical URL and fix them in the HTML files
documenter_directory_root = joinpath(root_dir, doc.path)
try
DocumenterTools.update_canonical_links(
documenter_directory_root;
canonical = join((canonical, doc.path), '/'),
)
catch e
@error "Unable to update canonical URLs for this package" doc exception =
(e, catch_backtrace())
end
end
247 changes: 247 additions & 0 deletions src/documentertools/canonical_urls.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
# This is vendored version of code that should eventually moved into DocumenterTools.jl
# once the generic interface has crystallized, and then DocumenterTools should be added
# as a dependency here.
#
# WIP upstream PR: https://github.com/JuliaDocs/DocumenterTools.jl/pull/78
#
# Note: these functions are not part of MultiDocumenter public API.

"""
DocumenterTools.update_canonical_links_for_build(
docs_directory::AbstractString;
canonical::AbstractString,
)

- **`canonical`**: corresponds to the `canonical` attribute of `Documenter.HTML`,
specifying the root of the canonical URL.
"""
function update_canonical_links_for_version(
docs_directory::AbstractString;
canonical::AbstractString,
)
canonical = rstrip(canonical, '/')

walkdocs(docs_directory, isdochtml) do fileinfo
@debug "update_canonical_links: checking $(fileinfo.relpath)"
# Determine the
filepath = splitpath(fileinfo.relpath)
new_canonical_href = if filepath[end] == "index.html"
joinurl(canonical, filepath[1:end-1]...) * '/'
else
joinurl(canonical, filepath[1:end]...)
end

html = Gumbo.parsehtml(read(fileinfo.fullpath, String))
n_canonical_tags::Int = 0
dom_updated::Bool = false
for e in AbstractTrees.PreOrderDFS(html.root)
is_canonical_element(e) || continue
n_canonical_tags += 1
canonical_href = Gumbo.getattr(e, "href", nothing)
if canonical_href != new_canonical_href
Gumbo.setattr!(e, "href", new_canonical_href)
@debug "update_canonical_links_for_version: canonical_href updated" canonical_href new_canonical_href fileinfo.relpath
dom_updated = true
end
end
if n_canonical_tags == 0
for e in AbstractTrees.PreOrderDFS(html.root)
e isa Gumbo.HTMLElement || continue
Gumbo.tag(e) == :head || continue
canonical_href_element = Gumbo.HTMLElement{:link}(
[],
e,
Dict("rel" => "canonical", "href" => new_canonical_href),
)
push!(e.children, canonical_href_element)
@debug "update_canonical_links_for_version: added new canonical_href" new_canonical_href fileinfo.relpath
dom_updated = true
break
end
end
if dom_updated
open(io -> print(io, html), fileinfo.fullpath, "w")
end
if n_canonical_tags > 1
@error "Multiple canonical tags!" file = fileinfo.relpath
end
end
end

is_canonical_element(e) =
(e isa Gumbo.HTMLElement) &&
(Gumbo.tag(e) == :link) &&
(Gumbo.getattr(e, "rel", nothing) == "canonical")
joinurl(ps::AbstractString...) = join(ps, '/')

"""
Takes the multi-versioned Documenter site in `docs_directory` and updates the HTML canonical URLs
to point to `canonical`.
"""
function update_canonical_links(docs_directory::AbstractString; canonical::AbstractString)
canonical = rstrip(canonical, '/')
docs_directory = abspath(docs_directory)
isdir(docs_directory) || throw(ArgumentError("No such directory: $(docs_directory)"))

redirect_index_html_path = joinpath(docs_directory, "index.html")
canonical_path = if isfile(redirect_index_html_path)
redirect_url = get_meta_redirect_url(redirect_index_html_path)
splitpath(normpath(redirect_url))
else
canonical_version_from_versions_js(docs_directory)
end
canonical_full_root = joinurl(canonical, canonical_path...)
# If we have determined which version should be the canonical version, we can actually
# go and run update_canonical_links_for_version on each directory. First, we'll gather
# up the list of Documenter (or other) directories we actually want to run over.
docs_subdirectory_queue, docs_subdirectories = readdir(docs_directory), []
while !isempty(docs_subdirectory_queue)
docs_subdirectory = popfirst!(docs_subdirectory_queue)
path = joinpath(docs_directory, docs_subdirectory)
# We'll skip all files. This includes files such as index.html, which in this
# directory will likely be the redirect. Also, links should be pointing to other
# versions, so we'll skip them too.
if !isdir(path) || islink(path)
continue
end
# Preview directory is should contain other Documenter directories, so we just add
# the subdirectories into the queue and ignore the parent directory itself
if docs_subdirectory == "previews"
append!(docs_subdirectory_queue, joinpath.(docs_subdirectory, readdir(path)))
continue
end
# For other directories, we check for the presence of siteinfo.js, and warn if that
# is missing (but we still try to go and update the canonical URLs).
if !isfile(joinpath(path, "siteinfo.js"))
@warn "update_canonical_links: missing siteinfo.js file" path
end
push!(docs_subdirectories, path)
end
# Finally, we can run update_canonical_links_for_version on the directory.
for path in docs_subdirectories
@debug "Updating canonical URLs for a version" path canonical_full_root
update_canonical_links_for_version(path; canonical = canonical_full_root)
end
end

function canonical_directory_from_redirect_index_html(docs_directory::AbstractString)
redirect_index_html_path = joinpath(docs_directory, "index.html")
isfile(redirect_index_html_path) || return nothing
redirect_url = get_meta_redirect_url(redirect_index_html_path)
splitpath(normpath(redirect_url))
end

"""
Parses the HTML file at `indexhtml_path` and tries to extract the `url=...` value
of the redirect `<meta http-equiv="refresh" ...>` tag.
"""
function get_meta_redirect_url(indexhtml_path::AbstractString)
html = Gumbo.parsehtml(read(indexhtml_path, String))
for e in AbstractTrees.PreOrderDFS(html.root)
e isa Gumbo.HTMLElement || continue
Gumbo.tag(e) == :meta || continue
Gumbo.getattr(e, "http-equiv", nothing) == "refresh" || continue
content = Gumbo.getattr(e, "content", nothing)
if isnothing(content)
@warn "<meta http-equiv=\"refresh\" ...> with no content attribute" path =
indexhtml_path
continue
end
m = match(r"[0-9]+;\s*url=(.*)", content)
if isnothing(m)
@warn "Unable to parse content value of <meta http-equiv=\"refresh\" ...>" content path =
indexhtml_path
continue
end
return m.captures[1]
end
return nothing
end

function canonical_version_from_versions_js(docs_directory)
isdir(docs_directory) || throw(ArgumentError("Not a directory: $(docs_directory)"))
# Try to extract the list of versions from versions.js
versions_js = joinpath(docs_directory, "versions.js")
isfile(versions_js) ||
throw(ArgumentError("versions.js is missing in $(docs_directory)"))
versions = map(extract_versions_list(versions_js)) do version_str
isversion, version_number = if occursin(Base.VERSION_REGEX, version_str)
true, VersionNumber(version_str)
else
false, nothing
end
fullpath = joinpath(docs_directory, version_str)
return (;
path = version_str,
path_exists = isdir(fullpath) || islink(fullpath),
symlink = islink(fullpath),
isversion,
version_number,
fullpath,
)
end
# We'll filter out a couple of potential bad cases and issue warnings
filter(versions) do vi
if !vi.path_exists
@warn "update_canonical_links: path does not exists or is not a directory" docs_directory vi
return false
end
return true
end
# We need to determine the canonical path. This would usually be something like the stable/
# directory, but it can have a different name, including being a version number. So first we
# try to find a non-version directory _that is a symlink_ (so that it wouldn't get confused)
# previews/ or dev builds. If that fails, we try to find the directory matching `v[0-9]+`,
# with the highest version number. This does not cover all possible cases, but should be good
# enough for now.
if isempty(versions)
error("Unable to determine the canonical path. Found no version directories")
end

non_version_symlinks = filter(vi -> !vi.isversion && vi.symlink, versions)
canonical_version = if isempty(non_version_symlinks)
# We didn't find any non-version symlinks, so we'll try to find the vN directory now
# as a fallback.
version_symlinks = map(versions) do vi
m = match(r"^v([0-9]+)$", vi.path)
isnothing(m) && return nothing
parse(Int, m[1]) => vi
end
filter!(!isnothing, version_symlinks)
if isempty(version_symlinks)
error("Unable to determine the canonical path. Found no version directories")
end
# Note: findmax(first, version_symlinks) would be nicer, but is not supported
# on Julia 1.6
_, idx = findmax(first.(version_symlinks))
version_symlinks[idx][2]
elseif length(non_version_symlinks) > 1
error(
"Unable to determine the canonical path. Found multiple non-version symlinks.\n$(non_version_symlinks)",
)
else
only(non_version_symlinks)
end

return canonical_version.path
end

function extract_versions_list(versions_js::AbstractString)
versions_js = abspath(versions_js)
isfile(versions_js) || throw(ArgumentError("No such file: $(versions_js)"))
versions_js_content = read(versions_js, String)
m = match(r"var\s+DOC_VERSIONS\s*=\s*\[([0-9A-Za-z\"\s.,+-]+)\]", versions_js_content)
if isnothing(m)
throw(ArgumentError("""
Could not find DOC_VERSIONS in $(versions_js):
$(versions_js_content)"""))
end
versions = strip.(c -> isspace(c) || (c == '"'), split(m[1], ","))
filter!(!isempty, versions)
if isempty(versions)
throw(ArgumentError("""
DOC_VERSIONS empty in $(versions_js):
$(versions_js_content)"""))
end
return versions
end
Loading