Skip to content

Commit

Permalink
Add caching support to verify-links (#1562)
Browse files Browse the repository at this point in the history
- Update link checking pipeline to enable caching
- Add support for loading the cache file from an HTTP endpoint
- Add retry logic when hitting 429

Fixes most of Azure/azure-sdk#2403
Fixes #1439
  • Loading branch information
weshaggard authored Apr 22, 2021
1 parent d43f7dc commit ad902df
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 54 deletions.
5 changes: 3 additions & 2 deletions eng/common/pipelines/templates/steps/verify-links.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ steps:
arguments: >
-urls ${{ parameters.Urls }}
-rootUrl "file://${{ parameters.WorkingDirectory }}/${{ parameters.Directory }}"
-recursive: ${{ parameters.Recursive }}
-recursive: ${{ parameters.Recursive }}
-ignoreLinksFile ${{ parameters.IgnoreLinksFile }}
-branchReplaceRegex "${{ parameters.BranchReplaceRegex }}"
-branchReplacementName ${{ parameters.BranchReplacementName }}
-devOpsLogging: $true
-checkLinkGuidance: ${{ parameters.CheckLinkGuidance }}
-checkLinkGuidance: ${{ parameters.CheckLinkGuidance }}
-inputCacheFile "https://azuresdkartifacts.blob.core.windows.net/verify-links-cache/verify-links-cache.txt"
182 changes: 131 additions & 51 deletions eng/common/scripts/Verify-Links.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -12,47 +12,48 @@
Specifies the file that contains a set of links to ignore when verifying.
.PARAMETER devOpsLogging
Switch that will enable devops specific logging for warnings
Switch that will enable devops specific logging for warnings
.PARAMETER recursive
Check the links recursively based on recursivePattern.
Check the links recursively based on recursivePattern.
.PARAMETER baseUrl
Recursively check links for all links verified that begin with this baseUrl, defaults to the folder the url is contained in.
.PARAMETER rootUrl
Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
.PARAMETER errorStatusCodes
List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
.PARAMETER branchReplaceRegex
Regex to check if the link needs to be replaced. E.g. ^(https://github.com/.*/(?:blob|tree)/)master(/.*)$
.PARAMETER branchReplacementName
The substitute branch name or SHA commit.
The substitute branch name or SHA commit.
.PARAMETER checkLinkGuidance
Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
.PARAMETER userAgent
UserAgent to be configured for web requests. Defaults to current Chrome version.
.INPUTS
None. No required inputs.
.PARAMETER inputCacheFile
Path or http(s) URL to a file that contains a list of links that are known to be valid, so we can skip checking them.
.OUTPUTS
None. Verify-Links.ps1 does not generate any output.
.PARAMETER outputCacheFile
Path to a file that the script will output all the validated links after running all checks.
.EXAMPLE
PS> .\Verify-Links.ps1
PS> .\Verify-Links.ps1 C:\README.md
.EXAMPLE
PS> .\Verify-Links.ps1 -urls C:\README.md
PS> .\Verify-Links.ps1 https://azure.github.io/azure-sdk/index.html
.EXAMPLE
PS> .\Verify-Links -urls C:\README.md -checkLinkGuidance $true
PS> .\Verify-Links C:\README.md -checkLinkGuidance $true
#>
[CmdletBinding()]
param (
[string[]] $urls,
[string] $ignoreLinksFile = "$PSScriptRoot/ignore-links.txt",
Expand All @@ -64,7 +65,9 @@ param (
[string] $branchReplaceRegex = "",
[string] $branchReplacementName = "",
[bool] $checkLinkGuidance = $false,
[string] $userAgent
[string] $userAgent,
[string] $inputCacheFile,
[string] $outputCacheFile
)

$ProgressPreference = "SilentlyContinue"; # Disable invoke-webrequest progress dialog
Expand All @@ -88,7 +91,7 @@ function NormalizeUrl([string]$url){
}

if ($script:rootUrl -eq "") {
if ($uri.IsFile) {
if ($uri.IsFile) {
# for files default to the containing directory
$script:rootUrl = $script:baseUrl;
}
Expand Down Expand Up @@ -129,7 +132,7 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link)
# If the link is mailto, skip it.
if ($link.StartsWith("mailto:")) {
Write-Verbose "Skipping $link because it is a mailto link."
return $null
return
}

$linkUri = [System.Uri]$link;
Expand All @@ -156,12 +159,12 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link)
# If the link is not a web request, like mailto, skip it.
if (!$linkUri.Scheme.StartsWith("http") -and !$linkUri.IsFile) {
Write-Verbose "Skipping $linkUri because it is not http or file based."
return $null
return
}

if ($null -ne $ignoreLinks -and ($ignoreLinks.Contains($link) -or $ignoreLinks.Contains($linkUri.ToString()))) {
Write-Verbose "Ignoring invalid link $linkUri because it is in the ignore file."
return $null
return
}

return $linkUri;
Expand All @@ -177,28 +180,34 @@ function ParseLinks([string]$baseUri, [string]$htmlContent)
#$hrefs | Foreach-Object { Write-Host $_ }

Write-Verbose "Found $($hrefs.Count) raw href's in page $baseUri";
$links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value } | Sort-Object -Unique
$links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value }

#$links | Foreach-Object { Write-Host $_ }

return $links
}

function CheckLink ([System.Uri]$linkUri)
function CheckLink ([System.Uri]$linkUri, $allowRetry=$true)
{
if(!$linkUri.ToString().Trim()) {
LogWarning "Found Empty link. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
return $false
}
if ($checkedLinks.ContainsKey($linkUri)) {
if (!$checkedLinks[$linkUri]) {
LogWarning "broken link $linkUri"

$originalLinkUri = $linkUri
$linkUri = ReplaceGithubLink $linkUri

$link = $linkUri.ToString()

if ($checkedLinks.ContainsKey($link)) {
if (!$checkedLinks[$link]) {
LogWarning "broken link $link"
}
return $checkedLinks[$linkUri]
return $checkedLinks[$link]
}

$linkValid = $true
Write-Verbose "Checking link $linkUri..."
Write-Verbose "Checking link $linkUri..."

if ($linkUri.IsFile) {
if (!(Test-Path $linkUri.LocalPath)) {
Expand Down Expand Up @@ -234,27 +243,58 @@ function CheckLink ([System.Uri]$linkUri)
}

if ($statusCode -in $errorStatusCodes) {
LogWarning "[$statusCode] broken link $linkUri"
if ($originalLinkUri -ne $linkUri) {
LogWarning "[$statusCode] broken link $originalLinkUri (resolved to $linkUri)"
}
else {
LogWarning "[$statusCode] broken link $linkUri"
}

$linkValid = $false
}
else {
if ($null -ne $statusCode) {
Write-Host "[$statusCode] while requesting $linkUri"
# For 429 rate-limiting try to pause if possible
if ($allowRetry -and $_.Exception.Response -and $statusCode -eq 429) {
$retryAfter = $_.Exception.Response.Headers.RetryAfter.Delta.TotalSeconds

# Wait 60 (arbitrary) seconds if no Retry-After header is given, and cap longer waits at 60 seconds
if (!$retryAfter -or $retryAfter -gt 60) { $retryAfter = 60 }
Write-Host "Rate-Limited for $retryAfter seconds while requesting $linkUri"

Start-Sleep -Seconds $retryAfter
$linkValid = CheckLink $originalLinkUri -allowRetry $false
}
else {
Write-Host "[$statusCode] handled while requesting $linkUri"
# Override and set status code in the cache so it is truthy
# so we don't keep checking but we don't think it is valid either
$linkValid = $statusCode
}
}
else {
Write-Host "Exception while requesting $linkUri"
Write-Host $_.Exception.ToString()
# Override and set exception in the cache so it is truthy
# so we don't keep checking but we don't think it is valid either
$linkValid = "Exception"
}
}
}
}

elseif ($link.StartsWith("#")) {
# Ignore anchor links as we don't have a great way to check them.
}
else {
LogWarning "Link has invalid format $linkUri"
$linkValid = $false
}

if ($checkLinkGuidance) {
if ($linkUri.Scheme -eq 'http') {
LogWarning "DO NOT use 'http' in $linkUri. Please use secure link with https instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
$linkValid = $false
}
$link = $linkUri.ToString()
# Check whether the url is a relative link; anchor links (starting with "#") are exempt from this validation.
if (!$linkUri.IsAbsoluteUri -and !$link.StartsWith("#")) {
LogWarning "DO NOT use relative link $linkUri. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
Expand All @@ -272,16 +312,16 @@ function CheckLink ([System.Uri]$linkUri)
}
}

$checkedLinks[$linkUri] = $linkValid
$checkedLinks[$link] = $linkValid
return $linkValid
}

function ReplaceGithubLink([string]$originLink) {
if (!$branchReplacementName) {
if (!$branchReplacementName -or !$branchReplaceRegex) {
return $originLink
}
$ReplacementPattern = "`${1}$branchReplacementName`$2"
return $originLink -replace $branchReplaceRegex, $ReplacementPattern
return $originLink -replace $branchReplaceRegex, $ReplacementPattern
}

function GetLinks([System.Uri]$pageUri)
Expand Down Expand Up @@ -327,25 +367,55 @@ if ($urls) {
if ($urls.Count -eq 0) {
Write-Host "Usage $($MyInvocation.MyCommand.Name) <urls>";
exit 1;
}
}
}

if ($PSVersionTable.PSVersion.Major -lt 6)
{
LogWarning "Some web requests will not work in versions of PS earlier then 6. You are running version $($PSVersionTable.PSVersion)."
}
$ignoreLinks = @();
if (Test-Path $ignoreLinksFile)
if (Test-Path $ignoreLinksFile) {
$ignoreLinks = (Get-Content $ignoreLinksFile).Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })
}

# Use default hashtable constructor instead of @{} because we need them to be case sensitive
$checkedPages = New-Object Hashtable
$checkedLinks = New-Object Hashtable

if ($inputCacheFile)
{
$ignoreLinks = [Array](Get-Content $ignoreLinksFile | ForEach-Object { ($_ -replace "#.*", "").Trim() } | Where-Object { $_ -ne "" })
$cacheContent = ""
if ($inputCacheFile.StartsWith("http")) {
try {
$response = Invoke-WebRequest -Uri $inputCacheFile
$cacheContent = $response.Content
}
catch {
$statusCode = $_.Exception.Response.StatusCode.value__
Write-Error "Failed to read cache file from page [$statusCode] $inputCacheFile"
}
}
elseif (Test-Path $inputCacheFile) {
$cacheContent = Get-Content $inputCacheFile -Raw
}
$goodLinks = $cacheContent.Split("`n").Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })

foreach ($goodLink in $goodLinks) {
$checkedLinks[$goodLink] = $true
}
}

$checkedPages = @{};
$checkedLinks = @{};
$badLinks = @{};
$cachedLinksCount = $checkedLinks.Count

if ($cachedLinksCount) {
Write-Host "Skipping checks on $cachedLinksCount links found in the given cache of known good links."
}

$badLinks = New-Object Hashtable
$pageUrisToCheck = new-object System.Collections.Queue
foreach ($url in $urls) {
$uri = NormalizeUrl $url
$uri = NormalizeUrl $url
$pageUrisToCheck.Enqueue($uri);
}

Expand All @@ -359,8 +429,7 @@ while ($pageUrisToCheck.Count -ne 0)
Write-Host "Found $($linkUris.Count) links on page $pageUri";
$badLinksPerPage = @();
foreach ($linkUri in $linkUris) {
$replacedLink = ReplaceGithubLink $linkUri
$isLinkValid = CheckLink $replacedLink
$isLinkValid = CheckLink $linkUri
if (!$isLinkValid -and !$badLinksPerPage.Contains($linkUri)) {
if (!$linkUri.ToString().Trim()) {
$linkUri = $emptyLinkMessage
Expand Down Expand Up @@ -388,10 +457,21 @@ foreach ($pageLink in $badLinks.Keys) {
}
}

$linksChecked = $checkedLinks.Count - $cachedLinksCount

if ($badLinks.Count -gt 0) {
LogError "Found $($checkedLinks.Count) links with $($badLinks.Count) page(s) broken."
}
LogError "Checked $linksChecked links with $($badLinks.Count) page(s) broken."
}
else {
Write-Host "Found $($checkedLinks.Count) links. No broken links found."
Write-Host "Checked $linksChecked links. No broken links found."
}

if ($outputCacheFile)
{
$goodLinks = $checkedLinks.Keys.Where({ "True" -eq $checkedLinks[$_].ToString() }) | Sort-Object

Write-Host "Writing the list of validated links to $outputCacheFile"
$goodLinks | Set-Content $outputCacheFile
}

exit $badLinks.Count
Loading

0 comments on commit ad902df

Please sign in to comment.