diff --git a/docs/ops-reference.md b/docs/ops-reference.md
index d2b77a1b7fd..1b218297472 100644
--- a/docs/ops-reference.md
+++ b/docs/ops-reference.md
@@ -253,6 +253,7 @@ size | The limit to the number of docs pulled in a chunk, if the number of docs
 full_response | If set to true, it will return the native response from elasticsearch with all meta-data included. If set to false it will return an array of the actual documents, no meta data included | Boolean | optional, defaults to false
 key_type | Used to specify the key type of the \_ids of the documents being queryed | String | optional, defaults to elasticsearch id generator (base64url)
 key_range | if provided, slicer will only recurse on these given keys | Array | optional
+starting_key_depth | if provided, the slicer will only produce keys with a minimum length determined by this setting | Number | optional
 fields | Used to restrict what is returned from elasticsearch. If used, only these fields on the documents are returned | Array | optional |
 query | specify any valid lucene query for elasticsearch to use in filtering| String | optional |
 
diff --git a/integration-tests/spec/cases/data/id-reader.js b/integration-tests/spec/cases/data/id-reader.js
index e0bede8b5ff..007b4a11d48 100644
--- a/integration-tests/spec/cases/data/id-reader.js
+++ b/integration-tests/spec/cases/data/id-reader.js
@@ -29,11 +29,11 @@ module.exports = function() {
     });
 
     it('should support reindexing by hex id', function(done) {
-        var job_spec = misc.newJob('id')
-        job_spec.name = 'reindex by hex id'
-        job_spec.operations[0].key_type = 'hexadecimal'
-        job_spec.operations[0].index = 'example-logs-10000-hex'
-        job_spec.operations[1].index = "test-hexadecimal-logs"
+        var job_spec = misc.newJob('id');
+        job_spec.name = 'reindex by hex id';
+        job_spec.operations[0].key_type = 'hexadecimal';
+        job_spec.operations[0].index = 'example-logs-10000-hex';
+        job_spec.operations[1].index = 'test-hexadecimal-logs';
 
         teraslice.jobs.submit(job_spec)
             .then(function(job) {
diff --git a/lib/readers/id_reader.js b/lib/readers/id_reader.js
index 9b0a25375c6..7187d01489b 100644
--- a/lib/readers/id_reader.js
+++ b/lib/readers/id_reader.js
@@ -71,6 +71,20 @@ function schema() {
                 }
             }
         },
+        starting_key_depth: {
+            doc: 'if provided, the slicer will only produce keys with a minimum length determined by this setting',
+            default: null,
+            format: function(val) {
+                if (val) {
+                    if (isNaN(val)) {
+                        throw new Error('starting_key_depth parameter for id_reader must be a number');
+                    }
+                    else if (val <= 0) {
+                        throw new Error('starting_key_depth parameter for id_reader must be greater than zero');
+                    }
+                }
+            }
+        },
         query: {
             doc: 'You may place a lucene query here, and the slicer will use it when making slices',
             default: "",
diff --git a/lib/readers/id_slicer.js b/lib/readers/id_slicer.js
index 4572c947c71..a5494a4ce85 100644
--- a/lib/readers/id_slicer.js
+++ b/lib/readers/id_slicer.js
@@ -18,6 +18,7 @@ module.exports = function(client, job, opConfig, logger, retryData, range) {
     var keyRange = opConfig.key_range;
    var baseKeyArray = getKeyArray(opConfig);
     var keyArray = opConfig.key_range ? opConfig.key_range : baseKeyArray.slice();
+    var starting_key_depth = opConfig.starting_key_depth;
     var elasticsearch = require('elasticsearch_api')(client, logger, opConfig);
     var retryError = retryModule(logger, job.max_retries);
 
@@ -106,6 +107,25 @@ module.exports = function(client, job, opConfig, logger, retryData, range) {
         return
     }
 
+    // like recurse(), but withholds keys shorter than starting_key_depth, expanding them instead
+    function* recurseDepth(baseArray, str) {
+        for (let key of baseArray) {
+            let newStr = str + key;
+
+            if (newStr.length >= starting_key_depth) {
+                let resp = yield newStr;
+
+                if (!resp) {
+                    yield* recurse(baseArray, newStr);
+                }
+            }
+            else {
+                yield* recurseDepth(baseArray, newStr);
+            }
+        }
+        return
+    }
+
     function* generateKeys(baseArray, keyArray) {
         for (let startKey of keyArray) {
             let processKey = yield startKey;
@@ -118,6 +138,14 @@ module.exports = function(client, job, opConfig, logger, retryData, range) {
         return null
     }
 
+    function* generateKeyDepth(baseArray, keyArray) {
+        for (let startKey of keyArray) {
+            yield* recurseDepth(baseArray, startKey);
+        }
+
+        return null
+    }
+
     function divideKeyArray(keyArray, num) {
         let results = [];
         let len = num;
@@ -136,7 +164,8 @@ module.exports = function(client, job, opConfig, logger, retryData, range) {
     }
 
     function keyGenerator(baseArray, keyArray, retryKey, range) {
-        let gen = generateKeys(baseArray, keyArray);
+        // if a starting depth is set, use the depth-aware key generator; otherwise use the default
+        let gen = starting_key_depth ? generateKeyDepth(baseArray, keyArray) : generateKeys(baseArray, keyArray);
         let closePath = false;
 
         if (retryKey) {
diff --git a/spec/readers/id_slicer-spec.js b/spec/readers/id_slicer-spec.js
index 40347e3643a..bea9a88e3cb 100644
--- a/spec/readers/id_slicer-spec.js
+++ b/spec/readers/id_slicer-spec.js
@@ -174,7 +174,49 @@ describe('id_reader', function() {
                         done();
                     });
             })
+            .catch(function(err) {
+                fail(err);
+            });
+
+    });
+
+    it('it produces values starting at a specific depth', function(done) {
+        var retryData = [];
+        var job1 = {
+            jobConfig: {
+                slicers: 1,
+                operations: [{
+                    _op: 'id_reader',
+                    type: 'events-',
+                    key_type: 'hexadecimal',
+                    key_range: ['a', 'b', 'c', 'd'],
+                    starting_key_depth: 3,
+                    size: 200
+                }]
+            }
+        };
+        var slicer = id_reader.newSlicer(context, job1, retryData, slicerAnalytics, logger);
+
+        Promise.resolve(slicer)
+            .then(function(slicers) {
+                return Promise.resolve(slicers[0]())
+                    .then(function(results) {
+                        expect(results).toEqual({count: 100, key: 'events-#a00*'});
+                        return Promise.resolve(slicers[0]());
+                    })
+                    .then(function(results) {
+                        expect(results).toEqual({count: 100, key: 'events-#a01*'});
+                        return Promise.resolve(slicers[0]());
+                    })
+                    .then(function(results) {
+                        expect(results).toEqual({count: 100, key: 'events-#a02*'});
+                        done();
+                    });
+            })
+            .catch(function(err) {
+                fail(err);
+            });
     });
 
     it('it produces values even with an initial search error', function(done) {
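
For reviewers unfamiliar with the slicer's generator protocol, here is a minimal standalone sketch of the behaviour this patch adds. It is not part of the patch: `startingKeyDepth` is passed as an explicit parameter instead of being closed over from `opConfig`, and the depth-met branch recurses with the same depth-aware generator rather than the module's plain `recurse()` (equivalent once the minimum length is reached, since child keys are only ever longer).

```js
// Sketch only — simplified from the patched id_slicer.js.
function* recurseDepth(baseArray, str, startingKeyDepth) {
    for (const key of baseArray) {
        const newStr = str + key;

        if (newStr.length >= startingKeyDepth) {
            // Yield the key; the caller replies truthy when the slice was
            // small enough to take whole, which stops subdividing this prefix.
            const resp = yield newStr;
            if (!resp) {
                yield* recurseDepth(baseArray, newStr, startingKeyDepth);
            }
        } else {
            // Too short to emit yet: expand without yielding.
            yield* recurseDepth(baseArray, newStr, startingKeyDepth);
        }
    }
}

const hex = '0123456789abcdef'.split('');
const gen = recurseDepth(hex, 'a', 3);

console.log(gen.next().value);      // 'a00' — nothing shorter is ever yielded
console.log(gen.next(true).value);  // 'a01' — truthy reply: 'a00' taken whole
console.log(gen.next(false).value); // 'a010' — falsy reply: subdivide 'a01'
```

The motivation mirrors the new spec above: with `key_range: ['a', 'b', 'c', 'd']` and `starting_key_depth: 3`, the first keys handed to the slicer are already `a00`, `a01`, ..., skipping shallow prefixes like `a` and `a0`, whose document counts would exceed `size` and would otherwise each cost a count query before being subdivided.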