Crawler completes then cancels the output of "crawledUrls"? #60

Open
sbr2567 opened this issue May 4, 2020 · 0 comments

sbr2567 commented May 4, 2020

I found that when crawling a site with the depth set to 2, the crawler finishes and logs crawledUrls correctly. But with a higher depth such as 4 or 6 (which of course takes longer), the crawler finishes and then fails, never printing the console.log(crawledUrls) output.

Here is the script:

var Crawler = require("js-crawler");

var mm_url = "{the url}";
var page_nbr = 0;

function ismmUrl(url) {
    return !!url.match({the regex url});
}

var crawler = new Crawler().configure({
    maxRequestsPerSecond: 100,
    maxConcurrentRequests: 40,
    ignoreRelative: false,
    depth: 4,

    shouldCrawl: function(url) {
        return ismmUrl(url) || url == mm_url;
    }
});

crawler.crawl({
    url: mm_url,
    success: function(page) {
        console.log(`Loaded page ${page_nbr++}. URL = ${page.url} content length = ${page.content.length} status = ${page.status}`);
        // console.log(page.content)
    },
    failure: function(page) {
        console.log(`Could not load page. URL = ${page.url} status = ${page.status}`);
    },
    finished: function(crawledUrls) {
        console.log('Forgetting all crawled...');
        crawler.forgetCrawled();
        console.log('Complete');
        console.log(crawledUrls);
    }
});

This is what the crawler outputs in the console at depth 4:

Loaded page 9078. URL = {theurl}/{morestuff} content length = 8850 status = 200
Loaded page 9079. URL = {theurl}/{morestuff} content length = 19070 status = 200
Loaded page 9080. URL = {theurl}/{morestuff} content length = 15481 status = 200
Loaded page 9081. URL = {theurl}/{morestuff} content length = 15776 status = 200
Forgetting all crawled...
Complete
Canceled

This is what the crawler outputs when the depth is 2. As you can see, the array is logged.

Loaded page 7. URL = {theurl}/{morestuff} content length = 8850 status = 200
Loaded page 8. URL = {theurl}/{morestuff} content length = 19070 status = 200
Loaded page 9. URL = {theurl}/{morestuff} content length = 15481 status = 200
Loaded page 10. URL = {theurl}/{morestuff} content length = 15776 status = 200
Forgetting all crawled...
Complete
Array(11) ["{theurl}/{morestuff}", "{theurl}/{morestuff}", "{theurl}/{morestuff}", "{theurl}/{morestuff}", "{theurl}/{morestuff}", "{theurl}/{morestuff}", "{theurl}/{morestuff}", "{theurl}/{morestuff}", …]
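
A possible workaround, sketched below on the assumption (not confirmed in this report) that the "Canceled" line means the console is cutting off the very large array (9000+ entries at depth 4) rather than the crawl itself failing: write crawledUrls to a file inside the finished callback instead of logging it. The file name "crawled-urls.json" is arbitrary, and the configure(...) options are abbreviated here; the rest of the script is unchanged.

var fs = require("fs");
var Crawler = require("js-crawler");

var mm_url = "{the url}";

// Same configure(...) options as in the script above.
var crawler = new Crawler().configure({ depth: 4 });

crawler.crawl({
    url: mm_url,
    success: function(page) {},
    failure: function(page) {},
    finished: function(crawledUrls) {
        // Persist the full list to disk rather than printing
        // thousands of entries to the console; "crawled-urls.json"
        // is an arbitrary file name.
        fs.writeFileSync("crawled-urls.json",
                         JSON.stringify(crawledUrls, null, 2));
        console.log("Wrote " + crawledUrls.length + " URLs to crawled-urls.json");
    }
});

If the file is written with the full URL list while the console still shows "Canceled", that would point to console truncation rather than the crawler; if the file is also missing or incomplete, the crawl itself is being cut short.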