From ad902df423c0a4704d22a2fc248450f850d755f2 Mon Sep 17 00:00:00 2001 From: Wes Haggard Date: Thu, 22 Apr 2021 14:17:21 -0700 Subject: [PATCH] Add caching support to verify-links (#1562) - Update link checking pipeline to enable caching - Add loading cache file from a http endpoint - Add retry logic when hitting 429 Fixes most of https://github.com/Azure/azure-sdk/issues/2403 Fixes https://github.com/Azure/azure-sdk-tools/issues/1439 --- .../templates/steps/verify-links.yml | 5 +- eng/common/scripts/Verify-Links.ps1 | 182 +++++++++++++----- eng/pipelines/githubio-linkcheck.yml | 37 +++- 3 files changed, 170 insertions(+), 54 deletions(-) diff --git a/eng/common/pipelines/templates/steps/verify-links.yml b/eng/common/pipelines/templates/steps/verify-links.yml index 45c68c45091..5c4e7402bbd 100644 --- a/eng/common/pipelines/templates/steps/verify-links.yml +++ b/eng/common/pipelines/templates/steps/verify-links.yml @@ -22,9 +22,10 @@ steps: arguments: > -urls ${{ parameters.Urls }} -rootUrl "file://${{ parameters.WorkingDirectory }}/${{ parameters.Directory }}" - -recursive: ${{ parameters.Recursive }} + -recursive: ${{ parameters.Recursive }} -ignoreLinksFile ${{ parameters.IgnoreLinksFile }} -branchReplaceRegex "${{ parameters.BranchReplaceRegex }}" -branchReplacementName ${{ parameters.BranchReplacementName }} -devOpsLogging: $true - -checkLinkGuidance: ${{ parameters.CheckLinkGuidance }} \ No newline at end of file + -checkLinkGuidance: ${{ parameters.CheckLinkGuidance }} + -inputCacheFile "https://azuresdkartifacts.blob.core.windows.net/verify-links-cache/verify-links-cache.txt" \ No newline at end of file diff --git a/eng/common/scripts/Verify-Links.ps1 b/eng/common/scripts/Verify-Links.ps1 index afcd8793b66..ca40793dfa6 100644 --- a/eng/common/scripts/Verify-Links.ps1 +++ b/eng/common/scripts/Verify-Links.ps1 @@ -12,47 +12,48 @@ Specifies the file that contains a set of links to ignore when verifying. .PARAMETER devOpsLogging - Switch that will enable devops specific logging for warnings + Switch that will enable devops specific logging for warnings .PARAMETER recursive - Check the links recurisvely based on recursivePattern. - + Check the links recurisvely based on recursivePattern. + .PARAMETER baseUrl Recursively check links for all links verified that begin with this baseUrl, defaults to the folder the url is contained in. - + .PARAMETER rootUrl - Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files. - + Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files. + .PARAMETER errorStatusCodes - List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004. - + List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004. + .PARAMETER branchReplaceRegex Regex to check if the link needs to be replaced. E.g. ^(https://github.com/.*/(?:blob|tree)/)master(/.*)$ - + .PARAMETER branchReplacementName - The substitute branch name or SHA commit. - + The substitute branch name or SHA commit. + .PARAMETER checkLinkGuidance - Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links. - + Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links. + .PARAMETER userAgent UserAgent to be configured for web requests. Defaults to current Chrome version. - .INPUTS - None. No required inputs. + .PARAMETER inputCacheFile + Path to a file that contains a list of links that are known valid so we can skip checking them. - .OUTPUTS - None. Verify-Links.ps1 does not generate any output. + .PARAMETER outputCacheFile + Path to a file that the script will output all the validated links after running all checks. .EXAMPLE - PS> .\Verify-Links.ps1 + PS> .\Verify-Links.ps1 C:\README.md .EXAMPLE - PS> .\Verify-Links.ps1 -urls C:\README.md + PS> .\Verify-Links.ps1 https://azure.github.io/azure-sdk/index.html .EXAMPLE - PS> .\Verify-Links -urls C:\README.md -checkLinkGuidance $true + PS> .\Verify-Links C:\README.md -checkLinkGuidance $true #> +[CmdletBinding()] param ( [string[]] $urls, [string] $ignoreLinksFile = "$PSScriptRoot/ignore-links.txt", @@ -64,7 +65,9 @@ param ( [string] $branchReplaceRegex = "", [string] $branchReplacementName = "", [bool] $checkLinkGuidance = $false, - [string] $userAgent + [string] $userAgent, + [string] $inputCacheFile, + [string] $outputCacheFile ) $ProgressPreference = "SilentlyContinue"; # Disable invoke-webrequest progress dialog @@ -88,7 +91,7 @@ function NormalizeUrl([string]$url){ } if ($script:rootUrl -eq "") { - if ($uri.IsFile) { + if ($uri.IsFile) { # for files default to the containing directory $script:rootUrl = $script:baseUrl; } @@ -129,7 +132,7 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link) # If the link is mailto, skip it. if ($link.StartsWith("mailto:")) { Write-Verbose "Skipping $link because it is a mailto link." - return $null + return } $linkUri = [System.Uri]$link; @@ -156,12 +159,12 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link) # If the link is not a web request, like mailto, skip it. if (!$linkUri.Scheme.StartsWith("http") -and !$linkUri.IsFile) { Write-Verbose "Skipping $linkUri because it is not http or file based." - return $null + return } if ($null -ne $ignoreLinks -and ($ignoreLinks.Contains($link) -or $ignoreLinks.Contains($linkUri.ToString()))) { Write-Verbose "Ignoring invalid link $linkUri because it is in the ignore file." - return $null + return } return $linkUri; @@ -177,28 +180,34 @@ function ParseLinks([string]$baseUri, [string]$htmlContent) #$hrefs | Foreach-Object { Write-Host $_ } Write-Verbose "Found $($hrefs.Count) raw href's in page $baseUri"; - $links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value } | Sort-Object -Unique + $links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value } #$links | Foreach-Object { Write-Host $_ } return $links } -function CheckLink ([System.Uri]$linkUri) +function CheckLink ([System.Uri]$linkUri, $allowRetry=$true) { if(!$linkUri.ToString().Trim()) { LogWarning "Found Empty link. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links" return $false } - if ($checkedLinks.ContainsKey($linkUri)) { - if (!$checkedLinks[$linkUri]) { - LogWarning "broken link $linkUri" + + $originalLinkUri = $linkUri + $linkUri = ReplaceGithubLink $linkUri + + $link = $linkUri.ToString() + + if ($checkedLinks.ContainsKey($link)) { + if (!$checkedLinks[$link]) { + LogWarning "broken link $link" } - return $checkedLinks[$linkUri] + return $checkedLinks[$link] } $linkValid = $true - Write-Verbose "Checking link $linkUri..." + Write-Verbose "Checking link $linkUri..." if ($linkUri.IsFile) { if (!(Test-Path $linkUri.LocalPath)) { @@ -234,27 +243,58 @@ function CheckLink ([System.Uri]$linkUri) } if ($statusCode -in $errorStatusCodes) { - LogWarning "[$statusCode] broken link $linkUri" + if ($originalLinkUri -ne $linkUri) { + LogWarning "[$statusCode] broken link $originalLinkUri (resolved to $linkUri)" + } + else { + LogWarning "[$statusCode] broken link $linkUri" + } + $linkValid = $false } else { if ($null -ne $statusCode) { - Write-Host "[$statusCode] while requesting $linkUri" + # For 429 rate-limiting try to pause if possible + if ($allowRetry -and $_.Exception.Response -and $statusCode -eq 429) { + $retryAfter = $_.Exception.Response.Headers.RetryAfter.Delta.TotalSeconds + + # Default retry after 60 (arbitrary) seconds if no header given + if (!$retryAfter -or $retryAfter -gt 60) { $retryAfter = 60 } + Write-Host "Rate-Limited for $retryAfter seconds while requesting $linkUri" + + Start-Sleep -Seconds $retryAfter + $linkValid = CheckLink $originalLinkUri -allowRetry $false + } + else { + Write-Host "[$statusCode] handled while requesting $linkUri" + # Override and set status code in the cache so it is truthy + # so we don't keep checking but we don't think it is valid either + $linkValid = $statusCode + } } else { Write-Host "Exception while requesting $linkUri" Write-Host $_.Exception.ToString() + # Override and set exception in the cache so it is truthy + # so we don't keep checking but we don't think it is valid either + $linkValid = "Exception" } } } } - + elseif ($link.StartsWith("#")) { + # Ignore anchor links as we don't have a great way to check them. + } + else { + LogWarning "Link has invalid format $linkUri" + $linkValid = $false + } + if ($checkLinkGuidance) { if ($linkUri.Scheme -eq 'http') { LogWarning "DO NOT use 'http' in $linkUri. Please use secure link with https instead. Check here for more information: https://aka.ms/azsdk/guideline/links" $linkValid = $false } - $link = $linkUri.ToString() # Check if the url is relative links, suppress the archor link validation. if (!$linkUri.IsAbsoluteUri -and !$link.StartsWith("#")) { LogWarning "DO NOT use relative link $linkUri. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links" @@ -272,16 +312,16 @@ function CheckLink ([System.Uri]$linkUri) } } - $checkedLinks[$linkUri] = $linkValid + $checkedLinks[$link] = $linkValid return $linkValid } function ReplaceGithubLink([string]$originLink) { - if (!$branchReplacementName) { + if (!$branchReplacementName -or !$branchReplaceRegex) { return $originLink } $ReplacementPattern = "`${1}$branchReplacementName`$2" - return $originLink -replace $branchReplaceRegex, $ReplacementPattern + return $originLink -replace $branchReplaceRegex, $ReplacementPattern } function GetLinks([System.Uri]$pageUri) @@ -327,7 +367,7 @@ if ($urls) { if ($urls.Count -eq 0) { Write-Host "Usage $($MyInvocation.MyCommand.Name) "; exit 1; - } + } } if ($PSVersionTable.PSVersion.Major -lt 6) @@ -335,17 +375,47 @@ if ($PSVersionTable.PSVersion.Major -lt 6) LogWarning "Some web requests will not work in versions of PS earlier then 6. You are running version $($PSVersionTable.PSVersion)." } $ignoreLinks = @(); -if (Test-Path $ignoreLinksFile) +if (Test-Path $ignoreLinksFile) { + $ignoreLinks = (Get-Content $ignoreLinksFile).Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") }) +} + +# Use default hashtable constructor instead of @{} because we need them to be case sensitive +$checkedPages = New-Object Hashtable +$checkedLinks = New-Object Hashtable + +if ($inputCacheFile) { - $ignoreLinks = [Array](Get-Content $ignoreLinksFile | ForEach-Object { ($_ -replace "#.*", "").Trim() } | Where-Object { $_ -ne "" }) + $cacheContent = "" + if ($inputCacheFile.StartsWith("http")) { + try { + $response = Invoke-WebRequest -Uri $inputCacheFile + $cacheContent = $response.Content + } + catch { + $statusCode = $_.Exception.Response.StatusCode.value__ + Write-Error "Failed to read cache file from page [$statusCode] $inputCacheFile" + } + } + elseif (Test-Path $inputCacheFile) { + $cacheContent = Get-Content $inputCacheFile -Raw + } + $goodLinks = $cacheContent.Split("`n").Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") }) + + foreach ($goodLink in $goodLinks) { + $checkedLinks[$goodLink] = $true + } } -$checkedPages = @{}; -$checkedLinks = @{}; -$badLinks = @{}; +$cachedLinksCount = $checkedLinks.Count + +if ($cachedLinksCount) { + Write-Host "Skipping checks on $cachedLinksCount links found in the given cache of known good links." +} + +$badLinks = New-Object Hashtable $pageUrisToCheck = new-object System.Collections.Queue foreach ($url in $urls) { - $uri = NormalizeUrl $url + $uri = NormalizeUrl $url $pageUrisToCheck.Enqueue($uri); } @@ -359,8 +429,7 @@ while ($pageUrisToCheck.Count -ne 0) Write-Host "Found $($linkUris.Count) links on page $pageUri"; $badLinksPerPage = @(); foreach ($linkUri in $linkUris) { - $replacedLink = ReplaceGithubLink $linkUri - $isLinkValid = CheckLink $replacedLink + $isLinkValid = CheckLink $linkUri if (!$isLinkValid -and !$badLinksPerPage.Contains($linkUri)) { if (!$linkUri.ToString().Trim()) { $linkUri = $emptyLinkMessage @@ -388,10 +457,21 @@ foreach ($pageLink in $badLinks.Keys) { } } +$linksChecked = $checkedLinks.Count - $cachedLinksCount + if ($badLinks.Count -gt 0) { - LogError "Found $($checkedLinks.Count) links with $($badLinks.Count) page(s) broken." -} + LogError "Checked $linksChecked links with $($badLinks.Count) page(s) broken." +} else { - Write-Host "Found $($checkedLinks.Count) links. No broken links found." + Write-Host "Checked $linksChecked links. No broken links found." } + +if ($outputCacheFile) +{ + $goodLinks = $checkedLinks.Keys.Where({ "True" -eq $checkedLinks[$_].ToString() }) | Sort-Object + + Write-Host "Writing the list of validated links to $outputCacheFile" + $goodLinks | Set-Content $outputCacheFile +} + exit $badLinks.Count diff --git a/eng/pipelines/githubio-linkcheck.yml b/eng/pipelines/githubio-linkcheck.yml index 0fc21d210db..c5f63a92954 100644 --- a/eng/pipelines/githubio-linkcheck.yml +++ b/eng/pipelines/githubio-linkcheck.yml @@ -4,6 +4,9 @@ pr: none pool: vmImage: 'ubuntu-18.04' +variables: + cachefile: verify-links-cache.txt + steps: - task: PowerShell@2 displayName: 'azure-sdk link check' @@ -13,6 +16,20 @@ steps: arguments: > -urls "https://azure.github.io/azure-sdk/index.html" -ignoreLinksFile "./eng/pipelines/githubio-linkcheck-ignore-links.txt" + -inputCacheFile "$(cachefile)" + -outputCacheFile "$(cachefile)" + -devOpsLogging: $true + +- task: PowerShell@2 + displayName: 'azure-sdk link check with caching' + inputs: + pwsh: true + filePath: eng/common/scripts/Verify-Links.ps1 + arguments: > + -urls "https://azure.github.io/azure-sdk/index.html" + -ignoreLinksFile "./eng/pipelines/githubio-linkcheck-ignore-links.txt" + -inputCacheFile "$(cachefile)" + -outputCacheFile "$(cachefile)" -devOpsLogging: $true - task: PowerShell@2 @@ -23,6 +40,8 @@ steps: arguments: > -urls "https://azure.github.io/azure-sdk-for-java/index.html" -ignoreLinksFile "./eng/pipelines/githubio-linkcheck-ignore-links.txt" + -inputCacheFile "$(cachefile)" + -outputCacheFile "$(cachefile)" -devOpsLogging: $true - task: PowerShell@2 @@ -33,6 +52,8 @@ steps: arguments: > -urls "https://azure.github.io/azure-sdk-for-js/index.html" -ignoreLinksFile "./eng/pipelines/githubio-linkcheck-ignore-links.txt" + -inputCacheFile "$(cachefile)" + -outputCacheFile "$(cachefile)" -devOpsLogging: $true - task: PowerShell@2 @@ -43,6 +64,8 @@ steps: arguments: > -urls "https://azure.github.io/azure-sdk-for-net/index.html" -ignoreLinksFile "./eng/pipelines/githubio-linkcheck-ignore-links.txt" + -inputCacheFile "$(cachefile)" + -outputCacheFile "$(cachefile)" -devOpsLogging: $true - task: PowerShell@2 @@ -53,4 +76,16 @@ steps: arguments: > -urls "https://azure.github.io/azure-sdk-for-python/index.html" -ignoreLinksFile "./eng/pipelines/githubio-linkcheck-ignore-links.txt" - -devOpsLogging: $true \ No newline at end of file + -inputCacheFile "$(cachefile)" + -outputCacheFile "$(cachefile)" + -devOpsLogging: $true + +- publish: $(cachefile) + artifact: verify-links-cache.txt + condition: succeededOrFailed() + displayName: Upload verified links + +- pwsh: | + azcopy copy '$(cachefile)' 'https://azuresdkartifacts.blob.core.windows.net/verify-links-cache$(azuresdkartifacts-verify-links-cache-sas)' + condition: succeededOrFailed() + displayName: Upload cache file to blob container