-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
executable file
·261 lines (220 loc) · 9.37 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#!/usr/bin/env node
const puppeteer = require("puppeteer");
const yargs = require("yargs/yargs");
const { hideBin } = require("yargs/helpers");
const chalk = require("chalk");
const jsonToCsv = require("json-2-csv");
const fs = require("fs");
const config = require("./config.json");
/**
 * Registers the CLI commands with yargs and parses the process arguments.
 *
 * Commands:
 *  - `list-categories`: prints the name of every category.
 *  - `scrape-all <output>`: scrapes every product (optionally enriched with
 *    the extra per-product data) and writes the result to `<output>` as
 *    JSON or CSV.
 *
 * Each command handler ends with `process.exit(0)`.
 */
async function run() {
  yargs(hideBin(process.argv))
    .command("list-categories", "list all categories", () => { }, async () => {
      console.log(chalk.green("Fetching categories..."));
      const page = await setupBrowser();
      const categories = await listCategories(page);
      for (const category of categories) {
        console.log(` ${category.name}`);
      }
      process.exit(0);
    })
    .command("scrape-all <output>", "scrape all products", (yargs) => {
      return yargs
        .positional("output", {
          type: "string",
          description: "Specify output file"
        })
        .option("format", {
          alias: "f",
          type: "string",
          choices: ["json", "csv"],
          default: "json",
          // Previously a copy of the positional's description ("Specify output file")
          description: "Specify output format"
        })
        .option("with-extra-data", {
          alias: "e",
          type: "boolean",
          description: "Include extra data such as product description"
        });
    }, async (argv) => {
      const withExtraData = argv["with-extra-data"];
      console.log(chalk.green(`Fetching products${withExtraData ? " with extra data" : ""} (this may take a while)...`));
      if (config.debug) console.log(chalk.gray(`> Selected format: ${argv.format}`));

      const page = await setupBrowser();
      const products = await scrapeAllProducts(page);
      console.log(chalk.green(`Fetched ${products.length} products`));

      if (withExtraData) {
        console.log(chalk.green("Fetching extra data..."));
        for (const product of products) {
          if (config.debug) console.log(chalk.gray(`> Retrieving extra data for ${product.name}`));
          try {
            const extraData = await scrapeProductExtraInfo(page, product.link);
            Object.assign(product, extraData);
          } catch (e) {
            // Best effort: a product whose extra data cannot be fetched is
            // still written, just without the extra fields.
            console.error(e.message);
          }
        }
      }

      // Write file
      const output = argv.format === "json"
        ? JSON.stringify(products, null, 2)
        : await jsonToCsv.json2csv(products);
      fs.writeFileSync(argv.output, output);
      console.log(chalk.green("Done"));
      process.exit(0);
    })
    .demandCommand(1)
    .parse();
}
/**
 * Launches a puppeteer browser, opens a new page and prepares it for scraping.
 *
 * Two cookies are set for `config.siteUrl`: `PHPSESSID` (taken from
 * `config.token`) and `USE_LISTVIEW` (`"true"`). Console messages emitted
 * inside the page are relayed to the Node log with a `[page]` prefix.
 *
 * @return {Promise<puppeteer.Page>} The page instance
 */
async function setupBrowser() {
  const launchOptions = { args: ["--no-sandbox", "--disable-setuid-sandbox"] };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();

  // Cookies as [name, value] pairs; all of them are scoped to the site URL
  const cookies = [
    ["PHPSESSID", config.token],
    ["USE_LISTVIEW", "true"],
  ];
  for (const [name, value] of cookies) {
    await page.setCookie({ url: config.siteUrl, name, value });
  }

  // Forward browser-side console output to the Node console
  const relayConsoleMessage = (message) => {
    console.log(`[page] ${message.text()}`);
  };
  page.on("console", relayConsoleMessage);

  return page;
}
/**
 * Returns a list of all categories on the website.
 *
 * Opens the home page and reads the first child element of every child of
 * the `.cat-menu` element. Menu children that do not wrap a link (no first
 * child, or a first child without an `href`) are skipped.
 *
 * @param {puppeteer.Page} page The page instance from {@link setupBrowser}.
 * @returns {Promise<{name: string, link: string}[]>} A list of categories
 * @throws {Error} If the home page has no `.cat-menu` element.
 */
async function listCategories(page) {
  // Go to the home page
  const homeUrl = `${config.siteUrl}/ordering/pages/default.php`;
  await page.goto(homeUrl);

  const categoriesElement = await page.$(".cat-menu");
  if (!categoriesElement) {
    // Without this guard the failure surfaces as an opaque in-browser
    // TypeError ("Cannot read properties of null").
    throw new Error(`Category menu ".cat-menu" not found on ${homeUrl}; the session token may be missing or expired`);
  }

  // Retrieve all category links
  return page.evaluate((menu) => {
    return Array.from(menu.children)
      .map((item) => item.children[0])
      .filter((anchor) => anchor && anchor.href)
      .map((anchor) => ({ name: anchor.text, link: anchor.href }));
  }, categoriesElement);
}
/**
 * Returns information about all products on the website.
 *
 * Walks every category, every sub category inside it, and every page of each
 * sub category's product listing. This does not include information such as
 * product description or allergens. For that, use
 * {@link scrapeProductExtraInfo}.
 *
 * @param {puppeteer.Page} page The page instance from {@link setupBrowser}.
 * @returns {Promise<Object[]>} A list of products
 */
async function scrapeAllProducts(page) {
  const completeProductData = [];

  // listCategories() opens the home page and reads the category menu
  const categories = await listCategories(page);

  // Loop through the category links to get the sub category links
  for (const category of categories) {
    await page.waitForNetworkIdle();
    await page.goto(category.link);

    // Find the sub category links on the page
    const subCategoryLinks = await page.evaluate(() => {
      const cards = document.getElementsByClassName("category-card-item");
      return Array.from(cards, (card) => card.children[0].href);
    });

    // Collect the products of every listing page in each sub category
    for (const subLink of subCategoryLinks) {
      const products = await scrapeProductListing(page, subLink);
      for (const product of products) {
        completeProductData.push(product);
      }
    }
  }

  return completeProductData;
}

/**
 * Scrapes one product listing, following its "Next Page" link until there is
 * none left or it points at a page that was already visited.
 *
 * @param {puppeteer.Page} page The page instance from {@link setupBrowser}.
 * @param {String} firstPageLink Link of the first page of the listing
 * @returns {Promise<Object[]>} The products found on all pages of the listing
 */
async function scrapeProductListing(page, firstPageLink) {
  const products = [];
  // Guards against a "Next Page" link that loops back to an earlier page
  const visited = new Set();
  let pageLink = firstPageLink;

  while (pageLink && !visited.has(pageLink)) {
    visited.add(pageLink);
    await page.goto(pageLink);
    // The in-page code below relies on jQuery's `$`
    await page.addScriptTag({ url: "https://code.jquery.com/jquery-3.2.1.min.js" });
    if (config.debug) console.log(chalk.gray(`> Retrieving products on page ${pageLink}`));

    // Retrieve the data we need from each product listing
    const listing = await page.evaluate((siteUrl) => {
      // The first two rows of the table are skipped; they are not read as products
      const rows = $("#product_listing_table_in_form tr").toArray().slice(2);
      const data = rows.map((row) => {
        const cells = $(row);
        const firstText = (selector) => cells.find(selector).first().text().trim();
        const stockText = cells.find(".pl_instock").text();
        const sku = firstText(".pl_code");
        return {
          name: firstText(".pl_name"),
          sku,
          price: firstText(".pl_incvat"),
          inStock: !stockText.includes("-"),
          unit: cells.find(".pl_units").text().trim(), // TODO: Check if I need to convert this
          stockCount: stockText.trim(),
          link: `${siteUrl}/ordering/pages/product_info.php?products_id=${sku}`,
          brand: firstText(".pl_brand"),
        };
      });

      // Resolve the link so a relative href still yields a URL goto() accepts
      const nextHref = $(".prods[title=\" Next Page \"]").attr("href");
      const next = nextHref ? new URL(nextHref, document.baseURI).href : null;
      return { data, next };
    }, config.siteUrl);

    for (const product of listing.data) {
      products.push(product);
    }
    pageLink = listing.next;
  }

  return products;
}
/**
 * Returns extra information about a specific product.
 *
 * This only includes information that {@link scrapeAllProducts}
 * does not contain. Currently the returned object only holds the product
 * description; allergen entries whose image URL contains "red" are logged
 * from inside the page but are not part of the return value.
 *
 * @param {puppeteer.Page} page The page instance from {@link setupBrowser}.
 * @param {String} link The product page link to scrape
 * @returns {Promise<{description: string}>} Product information
 */
async function scrapeProductExtraInfo(page, link) {
  await page.goto(link);
  // The in-page code below relies on jQuery's `$`
  await page.addScriptTag({ url: "https://code.jquery.com/jquery-3.2.1.min.js" });

  return page.evaluate(() => {
    const allergenRow = $(".middle_column_div > table > tbody > tr:eq(2)");
    const allergenEntries = allergenRow.first().find("div").children();

    for (const entry of allergenEntries) {
      const allergen = $(entry);
      if (!allergen.hasClass("allergens")) {
        continue;
      }
      // An entry without an <img> (or without a src) yields undefined here;
      // it must not abort the evaluation and lose the description.
      const imgUrl = allergen.find("img").attr("src");
      if (typeof imgUrl === "string" && imgUrl.includes("red")) {
        console.log(`Allergen: ${allergen.text().trim()}`);
      }
    }

    return {
      description: $(".product_info_description").first().text().trim()
    };
  });
}
// Entry point: register the CLI commands and parse the process arguments.
run();