Skip to content

Commit

Permalink
Transcript import (#170)
Browse files Browse the repository at this point in the history
  • Loading branch information
MrHinsh authored Nov 26, 2024
2 parents fabd65c + 4cb8aba commit e781847
Show file tree
Hide file tree
Showing 169 changed files with 7,684 additions and 1,624 deletions.
146 changes: 146 additions & 0 deletions .powershell/_includes/YoutubeAPI.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@

# Fetches every video search result for a channel from the YouTube Data API,
# following nextPageToken until the last page, and returns the collected items.
# Nothing is written to disk here; saving the list is left to the caller.
#
# Parameters:
#   channelId  - channel whose videos are listed.
#   apiKey     - API key sent as the `key` query parameter.
#   maxResults - page size for each request (optional, defaults to 50). It used
#                to be read from an undeclared $maxResults variable, so the value
#                depended on whatever the calling scope happened to define.
#                NOTE(review): 1-50 is the range the search endpoint documents
#                for maxResults - confirm against the current API reference.
#
# Returns: the `items` of every page, in page order (nothing when no items exist).
function Get-YoutubePublicChannelVideos {
    param (
        [string]$channelId,
        [string]$apiKey,
        [ValidateRange(1, 50)]
        [int]$maxResults = 50
    )

    Write-Host "Getting Video List for $channelId"

    # Everything except the page token is identical for each request.
    $escapedChannelId = [uri]::EscapeDataString($channelId)
    $baseUrl = "https://www.googleapis.com/youtube/v3/search?key=$apiKey&part=snippet&channelId=$escapedChannelId&type=video&maxResults=$maxResults"

    $nextPageToken = $null
    $page = 1
    # A List grows in place; `+=` on an array copies the whole array on every page.
    $allVideosData = New-Object 'System.Collections.Generic.List[object]'

    do {
        # Only send pageToken once the API has handed one back.
        $searchApiUrl = $baseUrl
        if ($nextPageToken) {
            $searchApiUrl += "&pageToken=$([uri]::EscapeDataString($nextPageToken))"
        }

        # Fetch one page of the video list
        $searchResponse = Invoke-RestMethod -Uri $searchApiUrl -Method Get
        Write-Host " Parsing Page $page with $($searchResponse.items.Count) videos and etag: $($searchResponse.etag)"
        foreach ($item in $searchResponse.items) {
            $allVideosData.Add($item)
        }

        # A missing nextPageToken means this was the last page
        $nextPageToken = $searchResponse.nextPageToken
        $page++
    } while ($nextPageToken)

    Write-Host " Found $($allVideosData.Count) videos"
    return $allVideosData.ToArray()
}

# Reports whether a file was last written at least the given number of hours ago.
# A path that does not exist counts as old, so callers treat it as needing a refresh.
#
# Parameters:
#   filePath - path of the file to inspect.
#   hours    - age threshold in hours (inclusive).
#
# Returns: $true when the file is missing or its age is >= hours, otherwise $false.
function Test-FileAge {
    param (
        [Parameter(Mandatory = $true)]
        [string]$filePath,
        [Parameter(Mandatory = $true)]
        [int]$hours
    )

    if (Test-Path -Path $filePath) {
        # Age is measured from the last write, not from creation.
        $age = (Get-Date) - (Get-Item -Path $filePath).LastWriteTime
        return $age.TotalHours -ge $hours
    }

    return $true
}

# Fetches the snippet and contentDetails of a single video from the YouTube Data API.
# The data is only returned; writing it to data.json is left to the caller.
#
# The API key is not a parameter. It is taken from $apiKey in the calling scope and,
# when that is empty, from the YOUTUBE_API_KEY environment variable, so the function
# also works when the caller never assigns $apiKey.
#
# Parameters:
#   videoId - id of the video to look up.
#
# Returns: the first item of the API response, or $null when no key is available,
# the response has no items, or the request fails (the error is written to the host).
function Get-YoutubeVideoData {
    param (
        [Parameter(Mandatory = $true)]
        [string]$videoId
    )

    # Ensure an API key is available
    $resolvedKey = if ($apiKey) { $apiKey } else { $env:YOUTUBE_API_KEY }
    if (-not $resolvedKey) {
        Write-Host "API Key is missing. Please set the API Key." -ForegroundColor Red
        return $null
    }

    # Ensure videoId is valid
    if (-not $videoId) {
        Write-Host "Invalid videoId provided." -ForegroundColor Red
        return $null
    }

    Write-Host "Working on Data for: $videoId" -ForegroundColor Green
    # Escape the id so unexpected characters cannot alter the query string.
    $escapedVideoId = [uri]::EscapeDataString($videoId)
    $videoDetailsUrl = "https://www.googleapis.com/youtube/v3/videos?key=$resolvedKey&id=$escapedVideoId&part=snippet,contentDetails"

    try {
        $videoDetails = Invoke-RestMethod -Uri $videoDetailsUrl -Method Get -ErrorAction Stop
        if ($null -eq $videoDetails -or $null -eq $videoDetails.items -or $videoDetails.items.Count -eq 0) {
            Write-Host "No data found for video: $videoId" -ForegroundColor Yellow
            return $null
        }

        # Only one id was requested, so only the first item is used.
        $videoData = $videoDetails.items[0]

        Write-Host "Data found for video: $videoId" -ForegroundColor Green
        return $videoData
    }
    catch {
        Write-Host "Error fetching data for video: $videoId" -ForegroundColor Red
        Write-Host $_.Exception.Message -ForegroundColor Red
        return $null
    }
}

# Fetches the list of caption tracks for a single video and reduces each track to
# a small hashtable (captionId, language, trackKind, isDraft, status, lastUpdated).
#
# The API key is not a parameter. It is taken from $apiKey in the calling scope and,
# when that is empty, from the YOUTUBE_API_KEY environment variable.
# NOTE(review): the request is authenticated with the API key only - confirm that
# the captions endpoint accepts key-only requests for the videos being processed.
#
# Parameters:
#   videoId - id of the video whose caption tracks are listed.
#
# Returns: one hashtable per caption track (nothing when the video has none), or
# $null when no key is available or the request fails.
function Get-YouTubeCaptionsData {
    param (
        [Parameter(Mandatory = $true)]
        [string]$videoId
    )

    # Ensure an API key is available
    $resolvedKey = if ($apiKey) { $apiKey } else { $env:YOUTUBE_API_KEY }
    if (-not $resolvedKey) {
        Write-Host "API Key is missing. Please set the API Key." -ForegroundColor Red
        return $null
    }

    # Ensure videoId is valid
    if (-not $videoId) {
        Write-Host "Invalid videoId provided." -ForegroundColor Red
        return $null
    }

    Write-Host "Getting caption data for: $videoId" -ForegroundColor Green
    # Escape the id so unexpected characters cannot alter the query string.
    $escapedVideoId = [uri]::EscapeDataString($videoId)
    $captionsUrl = "https://www.googleapis.com/youtube/v3/captions?key=$resolvedKey&videoId=$escapedVideoId&part=snippet"

    try {
        # Get captions for the video
        $captionsResponse = Invoke-RestMethod -Uri $captionsUrl -Method Get -ErrorAction Stop
        $captionsData = @()

        if ($null -ne $captionsResponse -and $null -ne $captionsResponse.items -and $captionsResponse.items.Count -gt 0) {
            # Collect the loop output in one go instead of growing the array with `+=`.
            $captionsData = @(
                foreach ($caption in $captionsResponse.items) {
                    @{
                        "captionId"   = $caption.id
                        "language"    = $caption.snippet.language
                        "trackKind"   = $caption.snippet.trackKind
                        "isDraft"     = $caption.snippet.isDraft
                        "status"      = $caption.snippet.status
                        "lastUpdated" = $caption.snippet.lastUpdated
                    }
                }
            )
        }
        else {
            Write-Host "No captions found for video: $videoId" -ForegroundColor Yellow
        }

        return $captionsData
    }
    catch {
        Write-Host "Error fetching captions for video: $videoId" -ForegroundColor Red
        Write-Host $_.Exception.Message -ForegroundColor Red
        return $null
    }
}


Write-Host "YoutubeAPI.ps1 loaded" -ForegroundColor Green
185 changes: 116 additions & 69 deletions .powershell/build/Update-YoutubeChannelData.ps1
Original file line number Diff line number Diff line change
@@ -1,109 +1,156 @@
Write-Host "Running v2"
# Helpers
. ./.powershell/_includes/YoutubeAPI.ps1

# Define variables
Write-Host "Running v3"

$apiKey = $env:YOUTUBE_API_KEY
$channelId = "UCkYqhFNmhCzkefHsHS652hw"
$outputDir = "site\content\resources\videos\youtube"
$dataDirectory = ".\site\data"
$refreshData = $false

$maxResults = 800

# Create the output directory if it doesn't exist
if (-not (Test-Path $outputDir)) {
New-Item -Path $outputDir -ItemType Directory
}

# Function to fetch video list from YouTube API and save a single youtube.json file
function Fetch-YoutubeVideoList {
param ()

$nextPageToken = $null
$page = 1;
$allVideosData = @()
# Lists the caption tracks of a video, authenticating with a bearer access token.
#
# Parameters:
#   videoId     - id of the video whose caption tracks are requested.
#   accessToken - token placed in the Authorization header as "Bearer <token>".
#
# Returns: the `items` property of the captions response.
function Get-YouTubeCaptions {
    param (
        [Parameter(Mandatory = $true)]
        [string]$videoId,
        [string]$accessToken
    )

    # Gather the request arguments in one table and splat them into the call.
    $request = @{
        Uri     = "https://www.googleapis.com/youtube/v3/captions?part=id,snippet&videoId=$videoId"
        Headers = @{"Authorization" = "Bearer $accessToken" }
        Method  = 'Get'
    }

    return (Invoke-RestMethod @request).items
}

do {
# YouTube API endpoint to get videos from a channel, including nextPageToken
$searchApiUrl = "https://www.googleapis.com/youtube/v3/search?key=$apiKey&part=snippet&channelId=$channelId&type=video&maxResults=$maxResults&pageToken=$nextPageToken"
# Function to download a caption file with a check if $captionContent is empty
function Get-YouTubeCaption {
param (
[Parameter(Mandatory = $true)]
[string]$captionId,
[Parameter(Mandatory = $true)]
[string]$accessToken
)

# Fetch video list
$searchResponse = Invoke-RestMethod -Uri $searchApiUrl -Method Get
# Specify the format as SRT by adding 'tfmt=srt' to the URL
$downloadUrl = "https://www.googleapis.com/youtube/v3/captions/$captionId/?tfmt=srt"
$headers = @{"Authorization" = "Bearer $accessToken" }

$allVideosData += $searchResponse.items
# Use Invoke-WebRequest for binary or non-JSON/XML responses
$response = Invoke-WebRequest -Uri $downloadUrl -Headers $headers -Method Get

# Get the nextPageToken to continue fetching more videos
$nextPageToken = $searchResponse.nextPageToken
$page++
} while ($nextPageToken)
return $response.Content
}

# Define variables
$channelId = "UCkYqhFNmhCzkefHsHS652hw"
$outputDir = "site\content\resources\videos\youtube"
$dataDirectory = ".\site\data"
$refreshData = $false
$captionsDownloadLimit = 0
$videoUpdateLimit = 10
$captionsManafestUpdateLimit = 10

# 0. Get Youtube Video List
$dataFilePath = Join-Path $dataDirectory "youtube.json"
if (Test-FileAge -filePath $dataFilePath -hours 3) {
$allVideosData = Get-YoutubePublicChannelVideos -channelId $channelId -apiKey $env:YOUTUBE_API_KEY # Call this to fetch video list and save to youtube.json
# Save all video data to a single youtube.json file
$dataFilePath = Join-Path $dataDirectory "youtube.json"

$allVideosData | ConvertTo-Json -Depth 10 | Set-Content -Path $dataFilePath

Write-Host "All video data saved to youtube.json."
Write-Host "$dataFilePath saved with $($allVideosData.Count) videos." -ForegroundColor Green
}
else {
Write-Host "$dataFilePath is up to date." -ForegroundColor Yellow
}

# Function to update data.json for a single video
function Update-YoutubeDataFile {
param (
[string]$videoId
)

$videoUpdateCount = 0
$captionsManafestUpdateCount = 0
$captionsDownloadCount = 0
foreach ($video in $allVideosData) {

Write-Host "Processing $($video.id.videoId)" -ForegroundColor Green

$videoId = $video.id.videoId
# Create the directory named after the video ID
$videoDir = Join-Path $outputDir $videoId
if (-not (Test-Path $videoDir)) {
New-Item -Path $videoDir -ItemType Directory
}

# File path for data.json
$jsonFilePath = Join-Path $videoDir "data.json"
if ($videoId -eq "xo4jMxupIM0") {
Write-Host "Updating data.json for video: $videoId"
# 1. Get Youtube Video Data
$jsonFilePathVideos = Join-Path $videoDir "data.json"
if ($refreshData -or -not (Test-Path $jsonFilePathVideos)) {
if ($videoUpdateCount -lt $videoUpdateLimit) {
# Call the function to update the data for a single video
$videoData = Get-YoutubeVideoData -videoId $videoId
# Save updated video data to data.json
if ($videoData) {
$videoData | ConvertTo-Json -Depth 10 | Set-Content -Path $jsonFilePathVideos
Write-Host " Updated data.json for video: $videoId"
$videoUpdateCount++;
}
}
else {
Write-Host " Reached video update limit of $videoUpdateLimit. skipping."
}
}
# Only update if $refreshData is true or data.json doesn't exist
if ($refreshData -or -not (Test-Path $jsonFilePath)) {
# Fetch full video details from YouTube API
$videoDetailsUrl = "https://www.googleapis.com/youtube/v3/videos?key=$apiKey&id=$videoId&part=snippet,contentDetails"
$videoDetails = Invoke-RestMethod -Uri $videoDetailsUrl -Method Get
$videoData = $videoDetails.items[0]

if ($videoData) {

# 2. Get Youtube Captions List
$jsonFilePathCaptions = Join-Path $videoDir "data.captions.json"
if ($refreshData -or -not (Test-Path $jsonFilePathCaptions)) {
if ($captionsManafestUpdateCount -lt $captionsManafestUpdateLimit) {
# Call the function to update the data for a single video
$captionListData = Get-YouTubeCaptionsData -videoId $videoId
# Save updated video data to data.json
$videoData | ConvertTo-Json -Depth 10 | Set-Content -Path $jsonFilePath
Write-Host "Updated data.json for video: $videoId"
if ($captionListData) {
$captionListData | ConvertTo-Json -Depth 10 | Set-Content -Path $jsonFilePathCaptions
Write-Host " Updated data.captions.json for video: $videoId"
$captionsManafestUpdateCount++;
}
}
else {
Write-Host "No data found for video: $videoId"
Write-Host " Reached capations manafest update limit of $captionsManafestUpdateLimit. skipping."
}

}

# 3. Download Captions
if (Test-Path $jsonFilePathCaptions) {
$captionsData = Get-Content -Path $jsonFilePathCaptions | ConvertFrom-Json
foreach ($caption in $captionsData) {
$captionId = $caption.captionId
$language = $caption.language
$captionsFileName = "data.captions.$language.srt"
$captionFilePath = Join-Path $videoDir $captionsFileName
if (-not (Test-Path $captionFilePath)) {
if ($captionsDownloadCount -lt $captionsDownloadLimit) {
$captionData = Get-YouTubeCaption -captionId $captionId -accessToken $env:GOOGLE_ACCESS_TOKEN
$captionData | Set-Content -Path $captionFilePath
Write-Host " Updated $captionsFileName for video: $videoId"
$captionsDownloadCount++
}
else {
Write-Host " Reached capations download limit of $captionsDownloadLimit. skipping."
}

}
}

}
else {
Write-Host "Data for video $videoId is already up to date."
Write-Host " No caption list data manafest. skipping."
}
}

# Function to iterate through youtube.json and update data.json for each video
function Update-YoutubeDataFilesFromJson {
param ()

$dataFilePath = Join-Path $dataDirectory "youtube.json"
if (-not (Test-Path $dataFilePath)) {
Write-Host "youtube.json file not found. Please run Fetch-YoutubeVideoList first."
return
}

# Load video list from youtube.json
$allVideosData = Get-Content -Path $dataFilePath | ConvertFrom-Json

foreach ($video in $allVideosData) {
$videoId = $video.id.videoId

# Call the function to update the data for a single video
Update-YoutubeDataFile -videoId $videoId
}
# Update-YoutubeDataFilesFromJson # Call this to update data.json files from youtube.json

Write-Host "All video data files updated from youtube.json."
}
# # Set a limit for the number of transcripts to download

#Fetch-YoutubeVideoList # Call this to fetch video list and save to youtube.json
Update-YoutubeDataFilesFromJson # Call this to update data.json files from youtube.json
# Download-AllYouTubeCaptions -accessToken $env:GOOGLE_ACCESS_TOKEN
File renamed without changes.
Loading

0 comments on commit e781847

Please sign in to comment.