diff --git a/Readme.md b/Readme.md index e2f162a..24adc32 100644 --- a/Readme.md +++ b/Readme.md @@ -330,6 +330,84 @@ To run the job simply use the job.run() method, please keep in mind jobs are not job.run(); // Job will start running ``` +Hooks +===== +Hooks are run at specific moments of an instanced `task`'s life (before emitting events to the outside), and they can modify the scraper's default behavior. +To specify a `task`'s hooks use its `setup` method. + +```javascript +Yakuza.task('scraper', 'agent', 'someTask').setup(function (config) { + config.hooks = { + 'onFail': function (task) { + // ... do stuff + }, + 'onSuccess': function (task) { + // ... do stuff + } + }; +}); +``` + +onFail +------ +Runs when a task fails. `onFail` can be used to do some fancy stuff like retrying failed tasks right away. + +The `task` object passed to the `onFail` hook has the following properties: +- runs: Number of times the task has run (starts from 1) +- params: Parameters with which the task was instanced for the first time (doesn't change) +- rerun([params]): Re-runs the task with its original parameters (passed by the builder). If an object is provided, it will replace the task's parameters with the object passed. +- error: Error thrown by the task's `fail` event (if passed) + +onSuccess +--------- +Runs when a task succeeds. `onSuccess` can be used to stop the job's execution even though the task was successful. This can be useful when we need to stop our execution depending on the data we receive. 
+ +The `task` object passed to the `onSuccess` hook has the following properties: +- data: Data returned by the `task`'s success() method +- stopJob(): Method which, if called, stops the job execution once the current `executionBlock` is done + +Here's an example of when this could be useful: + +```javascript + Yakuza.task('scraper', 'agent', 'login').setup(function (config) { + config.hooks = { + 'onSuccess': function (task) { + // We stop the job if the loginStatus returns `wrongPassword` + // remember: in many cases wrongPassword might NOT be an error, identifying what's the login status + // can be part of a successful scraping process as well. + + if (task.data.loginStatus === 'wrongPassword') { + task.stopJob(); + } + } + }; + }).main(function (task, http, params) { + var opts; + + opts = { + url: 'http://someurl.com', + data: { + username: 'foo', + password: 'bar' + } + }; + + http.post(opts) + .then(function (res, body) { + if (body === 'wrong password') { + task.success({loginStatus: 'wrongPassword'}); + } else { + task.success({loginStatus: 'authorized'}); + } + }) + .fail(function (error) { + task.fail(error); + }) + .done(); + }); +``` + +When calling `task.stopJob()` the `task::success` event is, of course, still fired. Advanced ======== @@ -484,6 +562,27 @@ Yakuza.task('scraper', 'agent', 'login').main(function (task, http, params) { Any new task will now have its `http` object initialized with the cookies that were present at the time `saveCookies` was called. Notice that only tasks from the next **execution block** will be afected. +Retrying tasks +-------------- +In many cases the websites we scrape are sloppy, implemented in very wrong ways or simply unstable. This will cause our tasks to sometimes fail without warning. For this reason `Yakuza` provides a way of re-running tasks when this happens via its `onFail` hook. + +When a task is rerun, it restarts to the point in which it was instanced. 
(Except for some properties like `startTime`, which marks the moment when the task was first run.) + +```javascript +Yakuza.task('scraper', 'agent', 'login').setup(function (config) { + config.hooks = { + onFail: function (task) { + if (task.runs <= 5) { + // Will retry the task a maximum amount of 5 times + task.rerun(); + } + } + }; +}); +``` + +You can find the `task` object's properties in the **Hooks** section + Glossary ======== diff --git a/agent.js b/agent.js index ea007cf..95f9533 100644 --- a/agent.js +++ b/agent.js @@ -23,12 +23,6 @@ function Agent (id) { */ this.__applied = false; - /** - * List of functions which modify the Agent's configuration (provided by setup()) - * @private - */ - this.__configCallbacks = []; - /** * Agent's configuration object (set by running all configCallback functions) * @private @@ -58,17 +52,6 @@ function Agent (id) { this.id = id; } -/** -* Run functions passed via config(), thus applying their config logic -* @private -*/ -Agent.prototype.__applyConfigCallbacks = function () { - var _this = this; - _.each(_this.__configCallbacks, function (configCallback) { - configCallback(_this.__config); - }); -}; - /** * Turns every element in the execution plan into an array for type consistency * @private @@ -108,16 +91,6 @@ Agent.prototype.__formatPlan = function () { this._plan = formattedPlan; }; -/** -* Applies all task definitions -* @private -*/ -Agent.prototype.__applyTaskDefinitions = function () { - _.each(this._taskDefinitions, function (taskDefinition) { - taskDefinition._applySetup(); - }); -}; - /** * Applies all necessary processes regarding the setup stage of the agent */ @@ -125,24 +98,22 @@ Agent.prototype._applySetup = function () { if (this.__applied) { return; } - this.__applyConfigCallbacks(); - this.__applyTaskDefinitions(); + this.__formatPlan(); this.__applied = true; }; /** -* Saves a configuration function into the config callbacks array -* @param {function} cbConfig method which modifies the agent's config 
object (passed as argument) +* Sets the task's execution plan +* @param {Array} executionPlan array representing the execution plan for this agent */ -Agent.prototype.setup = function (cbConfig) { - if (!_.isFunction(cbConfig)) { - throw new Error('Setup argument must be a function'); +Agent.prototype.plan = function (executionPlan) { + // TODO: Validate execution plan format right away + if (!_.isArray(executionPlan)) { + throw new Error('Agent plan must be an array of task ids'); } - this.__configCallbacks.push(cbConfig); - - return this; + this.__config.plan = executionPlan; }; /** diff --git a/job.js b/job.js index 872f84e..438c710 100644 --- a/job.js +++ b/job.js @@ -631,7 +631,6 @@ Job.prototype.__applyComponents = function () { return; } - this._scraper._applySetup(); this.__agent._applySetup(); this.__componentsApplied = true; diff --git a/package.json b/package.json index beaf89e..5680523 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "yakuza", - "version": "0.2.1", + "version": "1.0.0", "description": "", "main": "yakuza.js", "repository": { diff --git a/scraper.js b/scraper.js index d59788a..6a2943f 100644 --- a/scraper.js +++ b/scraper.js @@ -16,24 +16,6 @@ Agent = require('./agent'); * @class */ function Scraper () { - /** - * Determines if the setup processes have been applied - * @private - */ - this.__applied = false; - - /** - * Array of callbacks provided via config() which set the Scraper's configuration variables - * @private - */ - this.__configCallbacks = []; - - /** - * Config object, contains configuration data and is exposed via the setup() method - * @private - */ - this.__config = {}; - /** * Object which contains scraper-wide routine definitions, routines are set via the routine() * method @@ -70,43 +52,6 @@ Scraper.prototype.__createAgent = function (agentId) { return this._agents[agentId]; }; -/** -* Run functions passed via config(), thus applying their config logic -* @private -*/ 
-Scraper.prototype.__applyConfigCallbacks = function () { - var _this = this; - _.each(_this.__configCallbacks, function (configCallback) { - configCallback(_this.__config); - }); -}; - -/** -* Applies all necessary processes regarding the setup stage of the scraper -*/ -Scraper.prototype._applySetup = function () { - if (this.__applied) { - return; - } - this.__applyConfigCallbacks(); - this.__applied = true; -}; - -/** -* Used to configure the scraper, it enqueues each configuration function meaning it -* allows a scraper to be configured in multiple different places -* @param {function} cbConfig function which will modify config parameters -*/ -Scraper.prototype.setup = function (cbConfig) { - if (!_.isFunction(cbConfig)) { - throw new Error('Config argument must be a function'); - } - - this.__configCallbacks.push(cbConfig); - - return Scraper; -}; - /** * Creates or gets an agent based on the id passed * @param {string} agentId Id of the agent to retrieve/create diff --git a/spec/agent.spec.js b/spec/agent.spec.js index 2669477..c6b1e29 100644 --- a/spec/agent.spec.js +++ b/spec/agent.spec.js @@ -20,37 +20,36 @@ beforeEach(function () { }); describe('Agent', function () { - describe('#setup', function () { - var error; + describe('#plan', function () { + it('should set the execution plan', function () { + var agent; - error = 'Setup argument must be a function'; + agent = yakuza.agent('Scraper', 'Agent'); + + agent.plan([ + 'Task1' + ]); + + agent.__config.plan.should.eql(['Task1']); + }); + + it('should throw if argument is not an array', function () { + var error; + + error = 'Agent plan must be an array of task ids'; - it('should throw if argument is not a function', function () { (function () { - yakuza.agent('Scraper', 'Agent').setup('foo'); + yakuza.agent('Scraper', 'Agent').plan(123); }).should.throw(error); + (function () { - yakuza.agent('Scraper', 'Agent').setup(['foo']); + yakuza.agent('Scraper', 'Agent').plan({foo: 'bar'}); }).should.throw(error); 
+ (function () { - yakuza.agent('Scraper', 'Agent').setup(123); + yakuza.agent('Scraper', 'Agent').plan('foo'); }).should.throw(error); }); - - it('it should add a config callback', function (done) { - yakuza.agent('Scraper', 'Agent').setup(function (config) { - config.plan = [ - 'Task1' - ]; - done(); - }); - - yakuza.task('Scraper', 'Agent', 'Task1').main(function (task) { - task.success(); - }); - - yakuza.ready(); - }); }); describe('#task', function () { @@ -65,13 +64,9 @@ describe('Agent', function () { beforeEach(function () { agent = yakuza.agent('Scraper', 'Agent'); }); + it('should create an agent-level routine', function () { - agent.setup(function (config) { - config.plan = [ - 'Task1', - 'Task2' - ]; - }); + agent.plan(['Task1', 'Task2']); agent.routine('OnlyOne', ['Task1']); yakuza.job('Scraper', 'Agent').routine('OnlyOne'); }); diff --git a/spec/job.spec.js b/spec/job.spec.js index 7f34d83..7c4de45 100644 --- a/spec/job.spec.js +++ b/spec/job.spec.js @@ -14,13 +14,11 @@ beforeEach(function () { yakuza = new YakuzaBase(); yakuza.scraper('Scraper'); - yakuza.agent('Scraper', 'Parallel').setup(function (config) { - config.plan = [ - 'Task1', - ['Task2', 'Task3'], - 'Task4' - ]; - }); + yakuza.agent('Scraper', 'Parallel').plan([ + 'Task1', + ['Task2', 'Task3'], + 'Task4' + ]); yakuza.task('Scraper', 'Parallel', 'Task1').main(function (task) { task.success(1); @@ -182,12 +180,10 @@ describe('Job', function () { beforeEach(function () { newYakuza = new YakuzaBase(); - newYakuza.agent('FooScraper', 'FooAgent').setup(function (config) { - config.plan = [ - 'FailTask', - 'SuccessTask' - ]; - }); + newYakuza.agent('FooScraper', 'FooAgent').plan([ + 'FailTask', + 'SuccessTask' + ]); newYakuza.task('FooScraper', 'FooAgent', 'FailTask').main(function (task) { task.fail(new Error('Error!')); }); @@ -360,11 +356,7 @@ describe('Job', function () { it('should throw if enqueued tasks are not defined', function () { var invalidJob; - yakuza.agent('Scraper', 
'InvalidAgent').setup(function (config) { - config.plan = [ - 'FakeTask' - ]; - }); + yakuza.agent('Scraper', 'InvalidAgent').plan(['FakeTask']); invalidJob = yakuza.job('Scraper', 'InvalidAgent'); invalidJob.enqueue('FakeTask'); @@ -377,12 +369,10 @@ describe('Job', function () { describe('execution queue', function () { beforeEach(function () { - yakuza.scraper('QueueTest').agent('SyncTest').setup(function (config) { - config.plan = [ - {taskId: 'SyncTask', selfSync: true}, - 'AsyncTask' - ]; - }); + yakuza.scraper('QueueTest').agent('SyncTest').plan([ + {taskId: 'SyncTask', selfSync: true}, + 'AsyncTask' + ]); yakuza.task('QueueTest', 'SyncTest', 'SyncTask').builder(function () { return [1, 2]; }) diff --git a/spec/scraper.spec.js b/spec/scraper.spec.js index 1dfa51c..73b7966 100644 --- a/spec/scraper.spec.js +++ b/spec/scraper.spec.js @@ -13,51 +13,15 @@ chai.use(sinonChai); beforeEach(function () { yakuza = new YakuzaBase(); yakuza.scraper('Scraper'); - yakuza.agent('Scraper', 'Agent').setup(function (config) { - config.plan = [ - 'Task1' - ]; - }); + yakuza.agent('Scraper', 'Agent').plan([ + 'Task1' + ]); yakuza.task('Scraper', 'Agent', 'Task1').main(function (task) { task.success(); }); }); describe('Scraper', function () { - describe('#setup', function () { - it('should add a config callback', function (done) { - var job; - - yakuza.scraper('Scraper').setup(function () { - done(); - }); - - job = yakuza.job('Scraper', 'Agent'); - job.enqueue('Task1'); - - job.run(); - }); - - it('should throw if argument is not a function', function () { - var error; - - error = 'Config argument must be a function'; - - (function () { - yakuza.scraper('Scraper').setup('foo'); - }).should.throw(error); - (function () { - yakuza.scraper('Scraper').setup('foo'); - }).should.throw(error); - (function () { - yakuza.scraper('Scraper').setup([123, 456]); - }).should.throw(error); - (function () { - yakuza.scraper('Scraper').setup({foo: 'bar'}); - }).should.throw(error); - }); - 
}); - describe('#agent', function () { it('should throw if argument is not a non-empty string', function () { var error; @@ -147,12 +111,10 @@ describe('Scraper', function () { return newValue; }); - yakuza.agent('Scraper', 'OtherAgent').setup(function (config) { - config.plan = [ - 'ConcatTask', - 'FinalTask' - ]; - }); + yakuza.agent('Scraper', 'OtherAgent').plan([ + 'ConcatTask', + 'FinalTask' + ]); yakuza.task('Scraper', 'OtherAgent', 'ConcatTask').builder(function () { return ['this', ' is', ' con', 'catenated']; diff --git a/spec/task-definition.spec.js b/spec/task-definition.spec.js index a795378..ede6b87 100644 --- a/spec/task-definition.spec.js +++ b/spec/task-definition.spec.js @@ -19,31 +19,11 @@ describe('TaskDefinition', function () { yakuza.agent('Scraper', 'Agent'); }); - describe('#_applySetup', function () { - it('should only apply once', function () { - var task; - - task = yakuza.task('Scraper', 'Agent', 'Task'); - - task.setup(function (config) { - return; - }); - - sinon.stub(task, '__applyConfigCallbacks'); - - task._applySetup(); - task._applySetup(); - task.__applyConfigCallbacks.callCount.should.eql(1); - }); - }); - describe('#_build', function () { it('should throw if no main method is set', function () { var job; - yakuza.agent('Scraper', 'Agent').setup(function (config) { - config.plan = ['SomeTask']; - }); + yakuza.agent('Scraper', 'Agent').plan(['SomeTask']); yakuza.task('Scraper', 'Agent', 'SomeTask'); job = yakuza.job('Scraper', 'Agent'); job.enqueue('SomeTask'); @@ -54,43 +34,42 @@ describe('TaskDefinition', function () { }); }); - describe('#setup', function () { - it('should push config callbacks', function () { - var calls; + describe('#hooks', function () { + beforeEach(function () { + yakuza.agent('Scraper', 'Agent').plan([ + 'Task', + 'OtherTask' + ]); + }); - calls = 0; - yakuza.task('Scraper', 'Agent', 'Task').setup(function () { - calls += 1; - }).setup(function () { - calls += 1; - }); + it('should provide method 
chaining', function () { + var task; - yakuza.task('Scraper', 'Agent', 'Task')._applySetup(); + task = yakuza.task('Scraper', 'Agent', 'Foo'); - calls.should.eql(2); + task.hooks({ + onFail: function () { + // NOOP + } + }).should.eql(task); }); - it('should throw if argument is not a function', function () { - (function () { - yakuza.task('Scraper', 'Agent', 'SomeTask').setup('foo'); - }).should.throw(); + it('should throw if argument is not an object', function () { + var error; + + error = 'Hooks argument must be an object'; + (function () { - yakuza.task('Scraper', 'Agent', 'SomeTask').setup(['foo']); - }).should.throw(); + yakuza.task('Scraper', 'Agent', 'Foo').hooks(); + }).should.throw(error); + (function () { - yakuza.task('Scraper', 'Agent', 'SomeTask').setup(123); - }).should.throw(); - }); - }); + yakuza.task('Scraper', 'Agent', 'Foo').hooks(123); + }).should.throw(error); - describe('hooks', function () { - beforeEach(function () { - yakuza.agent('Scraper', 'Agent').setup(function (config) { - config.plan = [ - 'Task', - 'OtherTask' - ]; - }); + (function () { + yakuza.task('Scraper', 'Agent', 'Foo').hooks(['foo', 'bar']); + }).should.throw(error); }); describe('onFail', function () { @@ -101,16 +80,14 @@ describe('TaskDefinition', function () { hookCalled = false; params = {foo: 'bar'}; - yakuza.task('Scraper', 'Agent', 'Task').setup(function (config) { - config.hooks = { - onFail: function (task) { - task.error.should.equal(error); - task.runs.should.eql(1); - task.params.should.equal(params); + yakuza.task('Scraper', 'Agent', 'Task').hooks({ + onFail: function (task) { + task.error.should.equal(error); + task.runs.should.eql(1); + task.params.should.equal(params); - hookCalled = true; - } - }; + hookCalled = true; + } }) .builder(function () { return params; @@ -132,13 +109,11 @@ describe('TaskDefinition', function () { it('should be able to rerun a task', function (done) { var job; - yakuza.task('Scraper', 'Agent', 'Task').setup(function (config) { 
- config.hooks = { - onFail: function (task) { - // Rerun task with its param increased by 1 - task.rerun(task.params + 1); - } - }; + yakuza.task('Scraper', 'Agent', 'Task').hooks({ + onFail: function (task) { + // Rerun task with its param increased by 1 + task.rerun(task.params + 1); + } }) .builder(function () { return 0; @@ -164,16 +139,14 @@ describe('TaskDefinition', function () { runs = 0; - yakuza.task('Scraper', 'Agent', 'Task').setup(function (config) { - config.hooks = { - onFail: function (task) { - if (task.runs === 2) { - return; - } - - task.rerun(); + yakuza.task('Scraper', 'Agent', 'Task').hooks({ + onFail: function (task) { + if (task.runs === 2) { + return; } - }; + + task.rerun(); + } }) .builder(function () { return 10; @@ -203,12 +176,10 @@ describe('TaskDefinition', function () { hookCalled = false; - yakuza.task('Scraper', 'Agent', 'Task').setup(function (config) { - config.hooks = { - onSuccess: function () { - hookCalled = true; - } - }; + yakuza.task('Scraper', 'Agent', 'Task').hooks({ + onSuccess: function () { + hookCalled = true; + } }).main(function (task) { task.success(); }); @@ -226,12 +197,10 @@ describe('TaskDefinition', function () { it('should finish the job if stopJob() is called', function (done) { var job; - yakuza.task('Scraper', 'Agent', 'Task').setup(function (config) { - config.hooks = { - onSuccess: function (event) { - event.stopJob(); - } - }; + yakuza.task('Scraper', 'Agent', 'Task').hooks({ + onSuccess: function (event) { + event.stopJob(); + } }).main(function (task) { task.success(); }); diff --git a/spec/task.spec.js b/spec/task.spec.js index ae92b0b..a579806 100644 --- a/spec/task.spec.js +++ b/spec/task.spec.js @@ -19,12 +19,10 @@ beforeEach(function () { yakuza = new YakuzaBase(); yakuza.scraper('Scraper'); - yakuza.agent('Scraper', 'Agent').setup(function (config) { - config.plan = [ - 'Task1', - {taskId: 'Task2'} - ]; - }); + yakuza.agent('Scraper', 'Agent').plan([ + 'Task1', + {taskId: 'Task2'} + ]); }); 
describe('Task', function () { diff --git a/spec/yakuza.spec.js b/spec/yakuza.spec.js index d81c77c..29c6f63 100644 --- a/spec/yakuza.spec.js +++ b/spec/yakuza.spec.js @@ -154,17 +154,15 @@ describe('Yakuza', function () { }); describe('#ready', function () { - it('should call scraper\'s and agent\'s _applySetup method', function () { + it('should apply ALL agents', function () { var scraper, agent; scraper = yakuza.scraper('scraper'); agent = scraper.agent('agent'); - sinon.stub(scraper, '_applySetup'); sinon.stub(agent, '_applySetup'); yakuza.ready(); - scraper._applySetup.callCount.should.eql(1); agent._applySetup.callCount.should.eql(1); }); }); diff --git a/task-definition.js b/task-definition.js index 4dc0713..4019eef 100644 --- a/task-definition.js +++ b/task-definition.js @@ -22,12 +22,6 @@ function TaskDefinition (id) { */ this.__id = id; - /** - * List of function which modify the Task definition's configuration (provided by config()) - * @private - */ - this.__configCallbacks = []; - /** * Task definition's configuration object (set by running all configCallback functions) * Property names are pre-defined just for maintainability @@ -55,18 +49,6 @@ function TaskDefinition (id) { }; } -/** -* Run functions passed via config(), thus applying their config logic -* @private -*/ -TaskDefinition.prototype.__applyConfigCallbacks = function () { - var _this = this; - - _.each(_this.__configCallbacks, function (configCallback) { - configCallback(_this.__config); - }); -}; - /** * Executes the builder function and builds the Task instances * Note: This is called by the job @@ -93,15 +75,16 @@ TaskDefinition.prototype._build = function (builderParams, cookieJar, job) { }; /** -* Applies the current task setup +* Sets task hooks */ -TaskDefinition.prototype._applySetup = function () { - if (this._applied) { - return; +TaskDefinition.prototype.hooks = function (hooks) { + if (!_.isObject(hooks) || _.isArray(hooks)) { + throw new Error('Hooks argument must be an 
object'); } - this.__applyConfigCallbacks(); - this._applied = true; + this.__config.hooks = hooks; + + return this; }; /** @@ -135,19 +118,5 @@ TaskDefinition.prototype.builder = function (builderMethod) { return this; }; -/** -* Saves a configuration function into the config callbacks array -* @param {function} cbConfig method which modifies the Task definition's config object (passed as -* argument) -*/ -TaskDefinition.prototype.setup = function (cbConfig) { - if (!_.isFunction(cbConfig)) { - throw new Error('Setup argument must be a function'); - } - - this.__configCallbacks.push(cbConfig); - - return this; -}; module.exports = TaskDefinition; diff --git a/yakuza-base.js b/yakuza-base.js index 6e91ca8..c060193 100644 --- a/yakuza-base.js +++ b/yakuza-base.js @@ -131,7 +131,6 @@ YakuzaBase.prototype.job = function (scraperId, agentId, params) { */ YakuzaBase.prototype.ready = function () { _.each(this.__scrapers, function (scraper) { - scraper._applySetup(); _.each(scraper._agents, function (agent) { agent._applySetup(); });