-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
175 lines (151 loc) · 5.64 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
const { exec, execFile } = require('child_process');
const fs = require('fs');
const path = require('path');

const createCsvWriter = require('csv-writer').createObjectCsvWriter;
const csv = require('csv-parser');
const puppeteer = require('puppeteer');
// Multipliers for the abbreviated magnitude suffixes used in view-count labels.
const VIEW_COUNT_MULTIPLIERS = Object.freeze({
  K: 1_000,
  M: 1_000_000,
  B: 1_000_000_000,
});

/**
 * Convert a view-count label such as "1.2M views", "15K views" or
 * "1,234 views" into a whole number.
 *
 * @param {string} viewsText - Label text to parse.
 * @returns {number} The view count as an integer. Returns 0 when the text
 *   contains no number (e.g. "No views") or is not a string.
 */
const parseViewCount = (viewsText) => {
  if (typeof viewsText !== 'string') {
    return 0;
  }

  // Drop thousands separators, then capture the number and an optional K/M/B
  // suffix. The look-ahead keeps the first letter of an ordinary word (the
  // "m" in "5 members") from being read as a magnitude suffix.
  const match = /(\d+(?:\.\d+)?)\s*(?:([KMB])(?![a-z]))?/i.exec(
    viewsText.replace(/,/g, ''),
  );
  if (match === null) {
    return 0;
  }

  const [, digits, suffix] = match;
  if (suffix === undefined) {
    return Number.parseInt(digits, 10);
  }

  // Round so binary floating-point error does not leak into the result:
  // 1.1 * 1_000_000 evaluates to 1100000.0000000002.
  const multiplier = VIEW_COUNT_MULTIPLIERS[suffix.toUpperCase()];
  return Math.round(Number.parseFloat(digits) * multiplier);
};
/**
 * Scrape title, link and view count for the items on a channel's Shorts tab.
 *
 * @param {object} browser - Puppeteer browser instance; only `newPage()` is used.
 * @param {string} channelLink - Channel URL, with or without a "/shorts" suffix.
 * @returns {Promise<Array<{title: string, link: string, views: number}>>}
 *   One entry per `ytd-rich-item-renderer` element found. Resolves to an
 *   empty array when navigation or extraction fails (the error is logged).
 */
const scrapeChannelShorts = async (browser, channelLink) => {
  // Strip surrounding whitespace (including a stray "\r" from CRLF input) and
  // trailing slashes so that appending "/shorts" never produces "//shorts".
  const baseLink = String(channelLink).trim().replace(/\/+$/, '');
  const shortsLink = baseLink.includes('/shorts') ? baseLink : `${baseLink}/shorts`;

  const page = await browser.newPage();
  try {
    await page.goto(shortsLink, { waitUntil: 'networkidle2' });

    // Scroll until the page stops growing. Reaching the bottom once is not
    // enough: lazily loaded items extend the page after the scroll, so the
    // bottom must be hit with an unchanged height on several consecutive
    // ticks. A tick cap keeps a never-ending feed from scrolling forever.
    await page.evaluate(async () => {
      const SCROLL_STEP_PX = 1000;
      const TICK_MS = 1000;
      const IDLE_TICKS_TO_STOP = 3;
      const MAX_TICKS = 300;

      await new Promise((resolve) => {
        let ticks = 0;
        let idleTicks = 0;
        let previousHeight = document.body.scrollHeight;

        const timer = setInterval(() => {
          window.scrollBy(0, SCROLL_STEP_PX);
          ticks += 1;

          const currentHeight = document.body.scrollHeight;
          const atBottom = window.innerHeight + window.scrollY >= currentHeight;
          idleTicks = atBottom && currentHeight === previousHeight ? idleTicks + 1 : 0;
          previousHeight = currentHeight;

          if (idleTicks >= IDLE_TICKS_TO_STOP || ticks >= MAX_TICKS) {
            clearInterval(timer);
            resolve();
          }
        }, TICK_MS);
      });
    });

    // Extract the raw fields inside the page; view counts stay as label text
    // here and are converted to numbers after the data is returned.
    const videoData = await page.evaluate(() => {
      const videos = Array.from(document.querySelectorAll('ytd-rich-item-renderer'));
      return videos.map((video) => {
        const titleElement = video.querySelector('#video-title');
        const linkElement = video.querySelector('a#thumbnail');
        const viewsElement = video.querySelector(
          '.inline-metadata-item.style-scope.ytd-video-meta-block',
        );
        return {
          title: titleElement ? titleElement.textContent.trim() : 'No title',
          link: linkElement ? linkElement.href : 'No link',
          views: viewsElement ? viewsElement.textContent.trim() : '0 views',
        };
      });
    });

    return videoData.map((video) => ({
      ...video,
      views: parseViewCount(video.views),
    }));
  } catch (error) {
    console.error(`Failed to scrape ${channelLink}:`, error);
    return [];
  } finally {
    // Always release the tab, whether scraping succeeded or failed.
    await page.close();
  }
};
/**
 * Download a video with yt-dlp, merging the selected video and audio streams
 * into a single MP4 file.
 *
 * yt-dlp is started directly with an argument array (no shell), so characters
 * such as "&" in the URL or spaces in the output path are passed through
 * literally instead of being interpreted as shell syntax.
 *
 * @param {string} url - http(s) URL of the video to download.
 * @param {string} filePath - Output path handed to yt-dlp's `-o` option.
 * @returns {Promise<string>} Resolves with `filePath` once yt-dlp exits
 *   successfully. Rejects with an `Error` when the URL is not a valid http(s)
 *   URL or when yt-dlp fails (the message carries yt-dlp's stderr).
 */
const downloadVideo = (url, filePath) => {
  return new Promise((resolve, reject) => {
    // Refuse placeholders such as "No link" and anything that could be read
    // as a command-line option before a process is spawned.
    let parsedUrl;
    try {
      parsedUrl = new URL(url);
    } catch (cause) {
      reject(new Error(`Invalid video URL: ${url}`, { cause }));
      return;
    }
    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
      reject(new Error(`Invalid video URL: ${url}`));
      return;
    }

    const args = [
      '-f',
      'bestvideo+bestaudio[ext=m4a]/best',
      '--merge-output-format',
      'mp4',
      url,
      '-o',
      filePath,
    ];
    execFile('yt-dlp', args, (error, stdout, stderr) => {
      if (error) {
        // stderr is empty when the binary could not be started at all, so
        // fall back to the spawn error's own message.
        reject(new Error(stderr || error.message, { cause: error }));
        return;
      }
      console.log(`Downloaded video to ${filePath}`);
      resolve(filePath);
    });
  });
};
/**
 * Extract the audio track of a video file into a sibling file with the given
 * extension, using ffmpeg.
 *
 * The output path is the input path with its extension replaced by
 * `audioFormat` (e.g. "videos/clip.mp4" -> "videos/clip.wav").
 *
 * @param {string} videoPath - Path of the input video file.
 * @param {string} audioFormat - Output extension without the dot, e.g. "mp3"
 *   or "wav"; letters and digits only.
 * @returns {Promise<string>} Resolves with the audio file path. Rejects with
 *   an `Error` when `audioFormat` is not alphanumeric, when the output path
 *   would equal the input path, or when ffmpeg fails (message carries stderr).
 */
const convertAudioFormat = (videoPath, audioFormat) => {
  return new Promise((resolve, reject) => {
    // The format becomes part of a file name, so restrict it to a plain extension.
    if (typeof audioFormat !== 'string' || !/^[a-z0-9]+$/i.test(audioFormat)) {
      reject(new Error(`Unsupported audio format: ${audioFormat}`));
      return;
    }

    // Swap only the real extension; a textual replace of ".mp4" would also
    // hit a directory name and would do nothing for other extensions.
    const { dir, name } = path.parse(videoPath);
    const audioPath = path.format({ dir, name, ext: `.${audioFormat}` });
    if (audioPath === videoPath) {
      reject(new Error(`Audio output would overwrite the input file: ${videoPath}`));
      return;
    }

    // -y overwrites an existing output file; without it ffmpeg stops to ask
    // for confirmation and a repeated run never finishes.
    const args = ['-y', '-i', videoPath, '-q:a', '0', '-map', 'a', audioPath];
    execFile('ffmpeg', args, (error, stdout, stderr) => {
      if (error) {
        // stderr is empty when ffmpeg could not be started at all.
        reject(new Error(stderr || error.message, { cause: error }));
        return;
      }
      console.log(`Converted audio to ${audioFormat} format: ${audioPath}`);
      resolve(audioPath);
    });
  });
};
/**
 * Read `video_data.csv`, print the links of its first three rows, download
 * each of those videos into `<script dir>/videos` and extract their audio.
 *
 * The first rows of the CSV are treated as the "latest" videos; nothing here
 * sorts by date. NOTE(review): confirm the CSV is written newest-first.
 *
 * @param {string} audioFormat - Audio extension passed to `convertAudioFormat`.
 * @returns {Promise<void>} Resolves after every selected video has been
 *   processed. A failure on one video is logged and does not stop the others.
 *   Rejects when the CSV cannot be read or parsed.
 */
const printAndDownloadLatest3Links = async (audioFormat) => {
  const LATEST_COUNT = 3;

  // Wrap the stream in a promise so the caller's `await` really waits for the
  // file to be read, and so a missing or unreadable CSV becomes a rejection
  // instead of an unhandled stream error.
  const rows = await new Promise((resolve, reject) => {
    const collected = [];
    fs.createReadStream('video_data.csv')
      .on('error', reject)
      .pipe(csv())
      .on('error', reject)
      .on('data', (row) => collected.push(row))
      .on('end', () => resolve(collected));
  });

  const latestLinks = rows.slice(0, LATEST_COUNT).map((row) => row.Link);

  console.log('Latest 3 video links:');
  for (const link of latestLinks) {
    console.log(link);
  }

  // `recursive: true` makes this a no-op when the directory already exists.
  const videoDir = path.join(__dirname, 'videos');
  fs.mkdirSync(videoDir, { recursive: true });

  // Process one video at a time; each failure is reported and skipped.
  for (const [index, link] of latestLinks.entries()) {
    const videoPath = path.join(videoDir, `latest_video_${index + 1}.mp4`);
    try {
      await downloadVideo(link, videoPath);
      await convertAudioFormat(videoPath, audioFormat);
    } catch (error) {
      console.error(`Failed to process video from ${link}: ${error}`);
    }
  }
};
/**
 * Entry point: scrape every channel listed in `channels.txt`, write the
 * combined results to `video_data.csv`, then download the first videos from
 * that CSV and extract their audio as WAV.
 *
 * @returns {Promise<void>} Resolves when the whole pipeline has finished, or
 *   early (after logging) when no video data was scraped.
 */
const main = async () => {
  // Read the channel list before launching the browser so a missing file
  // fails fast without leaving a browser process behind. Splitting on
  // /\r?\n/ and trimming keeps CRLF files from leaving "\r" on each link.
  const channelLinks = fs
    .readFileSync('channels.txt', 'utf-8')
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean);

  const allVideoData = [];
  const browser = await puppeteer.launch({ headless: true });
  try {
    // Channels are scraped one after another, each in its own page.
    for (const channelLink of channelLinks) {
      console.log(`Scraping channel: ${channelLink}`);
      const videoData = await scrapeChannelShorts(browser, channelLink);
      allVideoData.push(...videoData);
      console.log('Extracted Data:', videoData);
    }
  } finally {
    // Close the browser even if scraping throws, so the process can exit.
    await browser.close();
  }

  if (allVideoData.length === 0) {
    console.error('No video data found.');
    return;
  }

  // Column ids match the keys of the scraped records; titles become the CSV
  // header row that the download step reads back (it uses the "Link" column).
  const csvWriter = createCsvWriter({
    path: 'video_data.csv',
    header: [
      { id: 'title', title: 'Title' },
      { id: 'link', title: 'Link' },
      { id: 'views', title: 'Views' },
    ],
  });
  await csvWriter.writeRecords(allVideoData);
  console.log('CSV file was written successfully');

  // Pass 'mp3' instead to extract MP3 audio.
  await printAndDownloadLatest3Links('wav');
};

main().catch((error) => {
  console.error(error);
  // Report failure to the calling shell without cutting off pending output.
  process.exitCode = 1;
});