Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sync eng/common directory with azure-sdk-tools for PR 1562 #2150

Merged
merged 2 commits into from
Apr 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions eng/common/pipelines/templates/steps/verify-links.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ steps:
arguments: >
-urls ${{ parameters.Urls }}
-rootUrl "file://${{ parameters.WorkingDirectory }}/${{ parameters.Directory }}"
-recursive: ${{ parameters.Recursive }}
-recursive: ${{ parameters.Recursive }}
-ignoreLinksFile ${{ parameters.IgnoreLinksFile }}
-branchReplaceRegex "${{ parameters.BranchReplaceRegex }}"
-branchReplacementName ${{ parameters.BranchReplacementName }}
-devOpsLogging: $true
-checkLinkGuidance: ${{ parameters.CheckLinkGuidance }}
-checkLinkGuidance: ${{ parameters.CheckLinkGuidance }}
-inputCacheFile "https://azuresdkartifacts.blob.core.windows.net/verify-links-cache/verify-links-cache.txt"
182 changes: 131 additions & 51 deletions eng/common/scripts/Verify-Links.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -12,47 +12,48 @@
Specifies the file that contains a set of links to ignore when verifying.

.PARAMETER devOpsLogging
Switch that will enable devops specific logging for warnings
Switch that will enable devops specific logging for warnings

.PARAMETER recursive
Check the links recurisvely based on recursivePattern.
Check the links recurisvely based on recursivePattern.

.PARAMETER baseUrl
Recursively check links for all links verified that begin with this baseUrl, defaults to the folder the url is contained in.

.PARAMETER rootUrl
Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.

.PARAMETER errorStatusCodes
List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.

.PARAMETER branchReplaceRegex
Regex to check if the link needs to be replaced. E.g. ^(https://github.com/.*/(?:blob|tree)/)master(/.*)$

.PARAMETER branchReplacementName
The substitute branch name or SHA commit.
The substitute branch name or SHA commit.

.PARAMETER checkLinkGuidance
Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.

.PARAMETER userAgent
UserAgent to be configured for web requests. Defaults to current Chrome version.

.INPUTS
None. No required inputs.
.PARAMETER inputCacheFile
Path to a file that contains a list of links that are known valid so we can skip checking them.

.OUTPUTS
None. Verify-Links.ps1 does not generate any output.
.PARAMETER outputCacheFile
Path to a file that the script will output all the validated links after running all checks.

.EXAMPLE
PS> .\Verify-Links.ps1
PS> .\Verify-Links.ps1 C:\README.md

.EXAMPLE
PS> .\Verify-Links.ps1 -urls C:\README.md
PS> .\Verify-Links.ps1 https://azure.github.io/azure-sdk/index.html

.EXAMPLE
PS> .\Verify-Links -urls C:\README.md -checkLinkGuidance $true
PS> .\Verify-Links C:\README.md -checkLinkGuidance $true
#>
[CmdletBinding()]
param (
[string[]] $urls,
[string] $ignoreLinksFile = "$PSScriptRoot/ignore-links.txt",
Expand All @@ -64,7 +65,9 @@ param (
[string] $branchReplaceRegex = "",
[string] $branchReplacementName = "",
[bool] $checkLinkGuidance = $false,
[string] $userAgent
[string] $userAgent,
[string] $inputCacheFile,
[string] $outputCacheFile
)

$ProgressPreference = "SilentlyContinue"; # Disable invoke-webrequest progress dialog
Expand All @@ -88,7 +91,7 @@ function NormalizeUrl([string]$url){
}

if ($script:rootUrl -eq "") {
if ($uri.IsFile) {
if ($uri.IsFile) {
# for files default to the containing directory
$script:rootUrl = $script:baseUrl;
}
Expand Down Expand Up @@ -129,7 +132,7 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link)
# If the link is mailto, skip it.
if ($link.StartsWith("mailto:")) {
Write-Verbose "Skipping $link because it is a mailto link."
return $null
return
}

$linkUri = [System.Uri]$link;
Expand All @@ -156,12 +159,12 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link)
# If the link is not a web request, like mailto, skip it.
if (!$linkUri.Scheme.StartsWith("http") -and !$linkUri.IsFile) {
Write-Verbose "Skipping $linkUri because it is not http or file based."
return $null
return
}

if ($null -ne $ignoreLinks -and ($ignoreLinks.Contains($link) -or $ignoreLinks.Contains($linkUri.ToString()))) {
Write-Verbose "Ignoring invalid link $linkUri because it is in the ignore file."
return $null
return
}

return $linkUri;
Expand All @@ -177,28 +180,34 @@ function ParseLinks([string]$baseUri, [string]$htmlContent)
#$hrefs | Foreach-Object { Write-Host $_ }

Write-Verbose "Found $($hrefs.Count) raw href's in page $baseUri";
$links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value } | Sort-Object -Unique
$links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value }

#$links | Foreach-Object { Write-Host $_ }

return $links
}

function CheckLink ([System.Uri]$linkUri)
function CheckLink ([System.Uri]$linkUri, $allowRetry=$true)
{
if(!$linkUri.ToString().Trim()) {
LogWarning "Found Empty link. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
return $false
}
if ($checkedLinks.ContainsKey($linkUri)) {
if (!$checkedLinks[$linkUri]) {
LogWarning "broken link $linkUri"

$originalLinkUri = $linkUri
$linkUri = ReplaceGithubLink $linkUri

$link = $linkUri.ToString()

if ($checkedLinks.ContainsKey($link)) {
if (!$checkedLinks[$link]) {
LogWarning "broken link $link"
}
return $checkedLinks[$linkUri]
return $checkedLinks[$link]
}

$linkValid = $true
Write-Verbose "Checking link $linkUri..."
Write-Verbose "Checking link $linkUri..."

if ($linkUri.IsFile) {
if (!(Test-Path $linkUri.LocalPath)) {
Expand Down Expand Up @@ -234,27 +243,58 @@ function CheckLink ([System.Uri]$linkUri)
}

if ($statusCode -in $errorStatusCodes) {
LogWarning "[$statusCode] broken link $linkUri"
if ($originalLinkUri -ne $linkUri) {
LogWarning "[$statusCode] broken link $originalLinkUri (resolved to $linkUri)"
}
else {
LogWarning "[$statusCode] broken link $linkUri"
}

$linkValid = $false
}
else {
if ($null -ne $statusCode) {
Write-Host "[$statusCode] while requesting $linkUri"
# For 429 rate-limiting try to pause if possible
if ($allowRetry -and $_.Exception.Response -and $statusCode -eq 429) {
$retryAfter = $_.Exception.Response.Headers.RetryAfter.Delta.TotalSeconds

# Default retry after 60 (arbitrary) seconds if no header given
if (!$retryAfter -or $retryAfter -gt 60) { $retryAfter = 60 }
Write-Host "Rate-Limited for $retryAfter seconds while requesting $linkUri"

Start-Sleep -Seconds $retryAfter
$linkValid = CheckLink $originalLinkUri -allowRetry $false
}
else {
Write-Host "[$statusCode] handled while requesting $linkUri"
# Override and set status code in the cache so it is truthy
# so we don't keep checking but we don't think it is valid either
$linkValid = $statusCode
}
}
else {
Write-Host "Exception while requesting $linkUri"
Write-Host $_.Exception.ToString()
# Override and set exception in the cache so it is truthy
# so we don't keep checking but we don't think it is valid either
$linkValid = "Exception"
}
}
}
}

elseif ($link.StartsWith("#")) {
# Ignore anchor links as we don't have a great way to check them.
}
else {
LogWarning "Link has invalid format $linkUri"
$linkValid = $false
}

if ($checkLinkGuidance) {
if ($linkUri.Scheme -eq 'http') {
LogWarning "DO NOT use 'http' in $linkUri. Please use secure link with https instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
$linkValid = $false
}
$link = $linkUri.ToString()
# Check if the url is relative links, suppress the archor link validation.
if (!$linkUri.IsAbsoluteUri -and !$link.StartsWith("#")) {
LogWarning "DO NOT use relative link $linkUri. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
Expand All @@ -272,16 +312,16 @@ function CheckLink ([System.Uri]$linkUri)
}
}

$checkedLinks[$linkUri] = $linkValid
$checkedLinks[$link] = $linkValid
return $linkValid
}

function ReplaceGithubLink([string]$originLink) {
if (!$branchReplacementName) {
if (!$branchReplacementName -or !$branchReplaceRegex) {
return $originLink
}
$ReplacementPattern = "`${1}$branchReplacementName`$2"
return $originLink -replace $branchReplaceRegex, $ReplacementPattern
return $originLink -replace $branchReplaceRegex, $ReplacementPattern
}

function GetLinks([System.Uri]$pageUri)
Expand Down Expand Up @@ -327,25 +367,55 @@ if ($urls) {
if ($urls.Count -eq 0) {
Write-Host "Usage $($MyInvocation.MyCommand.Name) <urls>";
exit 1;
}
}
}

if ($PSVersionTable.PSVersion.Major -lt 6)
{
LogWarning "Some web requests will not work in versions of PS earlier then 6. You are running version $($PSVersionTable.PSVersion)."
}
$ignoreLinks = @();
if (Test-Path $ignoreLinksFile)
if (Test-Path $ignoreLinksFile) {
$ignoreLinks = (Get-Content $ignoreLinksFile).Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })
}

# Use default hashtable constructor instead of @{} because we need them to be case sensitive
$checkedPages = New-Object Hashtable
$checkedLinks = New-Object Hashtable

if ($inputCacheFile)
{
$ignoreLinks = [Array](Get-Content $ignoreLinksFile | ForEach-Object { ($_ -replace "#.*", "").Trim() } | Where-Object { $_ -ne "" })
$cacheContent = ""
if ($inputCacheFile.StartsWith("http")) {
try {
$response = Invoke-WebRequest -Uri $inputCacheFile
$cacheContent = $response.Content
}
catch {
$statusCode = $_.Exception.Response.StatusCode.value__
Write-Error "Failed to read cache file from page [$statusCode] $inputCacheFile"
}
}
elseif (Test-Path $inputCacheFile) {
$cacheContent = Get-Content $inputCacheFile -Raw
}
$goodLinks = $cacheContent.Split("`n").Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })

foreach ($goodLink in $goodLinks) {
$checkedLinks[$goodLink] = $true
}
}

$checkedPages = @{};
$checkedLinks = @{};
$badLinks = @{};
$cachedLinksCount = $checkedLinks.Count

if ($cachedLinksCount) {
Write-Host "Skipping checks on $cachedLinksCount links found in the given cache of known good links."
}

$badLinks = New-Object Hashtable
$pageUrisToCheck = new-object System.Collections.Queue
foreach ($url in $urls) {
$uri = NormalizeUrl $url
$uri = NormalizeUrl $url
$pageUrisToCheck.Enqueue($uri);
}

Expand All @@ -359,8 +429,7 @@ while ($pageUrisToCheck.Count -ne 0)
Write-Host "Found $($linkUris.Count) links on page $pageUri";
$badLinksPerPage = @();
foreach ($linkUri in $linkUris) {
$replacedLink = ReplaceGithubLink $linkUri
$isLinkValid = CheckLink $replacedLink
$isLinkValid = CheckLink $linkUri
if (!$isLinkValid -and !$badLinksPerPage.Contains($linkUri)) {
if (!$linkUri.ToString().Trim()) {
$linkUri = $emptyLinkMessage
Expand Down Expand Up @@ -388,10 +457,21 @@ foreach ($pageLink in $badLinks.Keys) {
}
}

$linksChecked = $checkedLinks.Count - $cachedLinksCount

if ($badLinks.Count -gt 0) {
LogError "Found $($checkedLinks.Count) links with $($badLinks.Count) page(s) broken."
}
LogError "Checked $linksChecked links with $($badLinks.Count) page(s) broken."
}
else {
Write-Host "Found $($checkedLinks.Count) links. No broken links found."
Write-Host "Checked $linksChecked links. No broken links found."
}

if ($outputCacheFile)
{
$goodLinks = $checkedLinks.Keys.Where({ "True" -eq $checkedLinks[$_].ToString() }) | Sort-Object

Write-Host "Writing the list of validated links to $outputCacheFile"
$goodLinks | Set-Content $outputCacheFile
}

exit $badLinks.Count