From 1f40d7cc5f019cb98efc3fae897ee2297291197b Mon Sep 17 00:00:00 2001
From: Gautam Botrel
Date: Tue, 6 Jun 2023 10:35:01 -0500
Subject: [PATCH] msm: semaphore to limit CPUs + better split strategy (up to 25% perf boost on 96 cores) (#403)

* feat: add semaphore in msm and better split strategy
* fix: close the chan semaphore
* perf: reduced default value of nbTasks
---
 ecc/bls12-377/multiexp.go | 185 +++++++++++++-----
 ecc/bls12-377/multiexp_affine.go | 28 ++-
 ecc/bls12-377/multiexp_jacobian.go | 28 ++-
 ecc/bls12-377/multiexp_test.go | 4 +-
 ecc/bls12-378/multiexp.go | 185 +++++++++++++-----
 ecc/bls12-378/multiexp_affine.go | 28 ++-
 ecc/bls12-378/multiexp_jacobian.go | 28 ++-
 ecc/bls12-378/multiexp_test.go | 4 +-
 ecc/bls12-381/multiexp.go | 185 +++++++++++++-----
 ecc/bls12-381/multiexp_affine.go | 28 ++-
 ecc/bls12-381/multiexp_jacobian.go | 28 ++-
 ecc/bls12-381/multiexp_test.go | 4 +-
 ecc/bls24-315/multiexp.go | 185 +++++++++++++-----
 ecc/bls24-315/multiexp_affine.go | 28 ++-
 ecc/bls24-315/multiexp_jacobian.go | 28 ++-
 ecc/bls24-315/multiexp_test.go | 4 +-
 ecc/bls24-317/multiexp.go | 185 +++++++++++++-----
 ecc/bls24-317/multiexp_affine.go | 28 ++-
 ecc/bls24-317/multiexp_jacobian.go | 28 ++-
 ecc/bls24-317/multiexp_test.go | 4 +-
 ecc/bn254/multiexp.go | 185 +++++++++++++-----
 ecc/bn254/multiexp_affine.go | 28 ++-
 ecc/bn254/multiexp_jacobian.go | 28 ++-
 ecc/bn254/multiexp_test.go | 4 +-
 ecc/bw6-633/multiexp.go | 185 +++++++++++++-----
 ecc/bw6-633/multiexp_affine.go | 28 ++-
 ecc/bw6-633/multiexp_jacobian.go | 28 ++-
 ecc/bw6-633/multiexp_test.go | 4 +-
 ecc/bw6-756/multiexp.go | 185 +++++++++++++-----
 ecc/bw6-756/multiexp_affine.go | 28 ++-
 ecc/bw6-756/multiexp_jacobian.go | 28 ++-
 ecc/bw6-756/multiexp_test.go | 4 +-
 ecc/bw6-761/multiexp.go | 185 +++++++++++++-----
 ecc/bw6-761/multiexp_affine.go | 28 ++-
 ecc/bw6-761/multiexp_jacobian.go | 28 ++-
 ecc/bw6-761/multiexp_test.go | 4 +-
 ecc/secp256k1/multiexp.go | 95 ++++++---
 ecc/secp256k1/multiexp_affine.go | 14 +-
 ecc/secp256k1/multiexp_jacobian.go | 14 +-
 ecc/secp256k1/multiexp_test.go | 2 +-
 .../generator/ecc/template/multiexp.go.tmpl | 95 ++++++---
 .../ecc/template/multiexp_affine.go.tmpl | 15 +-
 .../ecc/template/multiexp_jacobian.go.tmpl | 14 +-
 .../ecc/template/tests/multiexp.go.tmpl | 2 +-
 44 files changed, 1895 insertions(+), 561 deletions(-)

diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go
index 10699d84a7..52fdc682c1 100644
--- a/ecc/bls12-377/multiexp.go
+++ b/ecc/bls12-377/multiexp.go
@@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.
 //
 // This call return an error if len(scalars) != len(points) or if provided config is invalid.
 func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability.
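// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] The "semaphore to limit CPUs" in the
// commit title is the classic Go counting semaphore built from a buffered
// channel: receive to acquire a token, send to release one. A minimal,
// self-contained sketch of the pattern; all names below are illustrative and
// not from gnark-crypto:

package main

import (
	"fmt"
	"sync"
)

func main() {
	const nbTasks = 4 // stand-in for config.NbTasks
	sem := make(chan struct{}, nbTasks)
	for i := 0; i < nbTasks; i++ {
		sem <- struct{}{} // pre-fill with nbTasks tokens, as the patch does
	}

	var wg sync.WaitGroup
	for job := 0; job < 16; job++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			<-sem                                // acquire a token
			defer func() { sem <- struct{}{} }() // release it when done
			fmt.Println("processing chunk", id)  // at most nbTasks goroutines run this section at once
		}(job)
	}
	wg.Wait()
}
// ---------------------------------------------------------------------------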
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
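// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] A worked example of the heuristic
// implemented just below, with illustrative numbers: take nbCpus = 8.
// Pre-split, suppose the msm yields 10 chunks costing 100 group ops each;
// they run in ceil(10/8) = 2 waves, so the modeled wall time is
// 2*100 + 10 (reduction) = 210. Post-split, suppose we get 16 cheaper chunks
// costing 70 each: ceil(16/8) = 2 waves, i.e. 2*70 + 16 = 156. Total work
// grew (1120 vs 1000 ops) but the modeled wall time dropped, because the
// 6 CPUs idle during the pre-split second wave now do useful work; so
// costPostSplit < costPreSplit and the msm splits.
// ---------------------------------------------------------------------------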
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -298,6 +338,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -332,7 +373,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -362,29 +403,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
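// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] When the cost model favors
// splitting, the msm forks into two half-size msms: one half runs on a new
// goroutine, the other on the current one, a channel signals completion, and
// the two partial results are combined with AddAssign. A generic sketch of
// that fork/join shape, with illustrative names rather than gnark-crypto
// APIs:

// sumHalves adds up xs using the same fork/join structure as the split msm.
func sumHalves(xs []int) int {
	half := len(xs) / 2
	chDone := make(chan int, 1)
	go func() { // left half on its own goroutine
		s := 0
		for _, x := range xs[:half] {
			s += x
		}
		chDone <- s
	}()
	s := 0 // right half on the current goroutine
	for _, x := range xs[half:] {
		s += x
	}
	return s + <-chDone // join; analogous to <-chDone then p.AddAssign(&_p)
}

// Note the patch also halves config.NbTasks with math.Ceil before recursing,
// so each half works with roughly half the CPU budget and the recursion
// bottoms out.
// ---------------------------------------------------------------------------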
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -406,6 +469,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -418,8 +494,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -429,7 +509,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -437,7 +517,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -582,6 +662,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index c6c022df46..d8ed9f8c4d 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index 8226a8d912..f766d8adb1 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -97,7 +109,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -133,6 +151,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 1166d52a2e..712b8f7d33 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 926c013cc4..494177fb58 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
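// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] Every processChunk variant in the
// hunks above follows the same token discipline: acquire on entry (a nil
// semaphore, as passed by the reference tests, means "no limit") and release
// before sending the result on chRes. Because each release happens before the
// corresponding result send, every token is back by the time the reducer has
// consumed all results, so the deferred close(sem) cannot race with a late
// release. A distilled sketch with illustrative names:

func worker(sem chan struct{}, chRes chan<- int, work func() int) {
	if sem != nil {
		<-sem // acquire a token, or block until one is free
	}
	res := work()
	if sem != nil {
		sem <- struct{}{} // release the token before the send on chRes
	}
	chRes <- res
}

// For an overweight last chunk, _innerMsmG1/_innerMsmG2 first send one extra
// token (that chunk now occupies two workers) and join the halves in a small
// aggregator goroutine; schematically:
//
//	if sem != nil {
//		sem <- struct{}{} // extra token, since this chunk splits in two
//	}
//	go worker(sem, chSplit, leftHalf)
//	go worker(sem, chSplit, rightHalf)
//	go func() { chChunk <- (<-chSplit) + (<-chSplit) }()
// ---------------------------------------------------------------------------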
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -300,6 +340,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -334,7 +375,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -364,29 +405,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -408,6 +471,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -420,8 +496,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -431,7 +511,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -439,7 +519,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -586,6 +666,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 22a42a7ef2..6fe1672180 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index fe3cd412a3..86b7dbc9ae 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -99,7 +111,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -135,6 +153,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 25a035e5f2..ad7d9474d1 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index cdc96a2b6a..df673c4c02 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
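// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] Two NbTasks changes in the
// surrounding hunks work as a pair: MultiExp now defaults config.NbTasks to
// runtime.NumCPU()*2, presumably oversubscribing the chunk workers so a
// finishing goroutine always has a queued successor, while partitionScalars
// clamps nbTasks back to runtime.NumCPU(), since (per its comment) the
// digit-decomposition pass gains nothing from more tasks than CPUs.
// ---------------------------------------------------------------------------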
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -298,6 +338,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -332,7 +373,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -362,29 +403,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -406,6 +469,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -418,8 +494,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -429,7 +509,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -437,7 +517,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -582,6 +662,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 285da72848..14a1dc29d6 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index 9abe78b336..b5af1128a3 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -97,7 +109,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -133,6 +151,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 8cd076b090..92dd4c7eb4 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 7c708411fd..4619bee3f8 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) {
+func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) {
 
     switch c {
 
     case 2:
@@ -298,6 +338,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.
 //
 // This call return an error if len(scalars) != len(points) or if provided config is invalid.
 func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+    // TODO @gbotrel replace the ecc.MultiExpConfig with an Option pattern for maintainability.
     // note:
     // each of the msmCX method is the same, except for the c constant it declares
     // duplicating (through template generation) these methods allows to declare the buckets on the stack
@@ -332,7 +373,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
     // if nbTasks is not set, use all available CPUs
     if config.NbTasks <= 0 {
-        config.NbTasks = runtime.NumCPU()
+        config.NbTasks = runtime.NumCPU() * 2
     } else if config.NbTasks > 1024 {
         return nil, errors.New("invalid config: config.NbTasks > 1024")
     }
@@ -362,29 +403,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
     C := bestC(nbPoints)
     nbChunks := int(computeNbChunks(C))
 
-    // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
-    if config.NbTasks > 1 && nbChunks < config.NbTasks {
-        // before splitting, let's see if we end up with more tasks than thread;
-        cSplit := bestC(nbPoints / 2)
-        nbChunksPostSplit := int(computeNbChunks(cSplit))
-        nbTasksPostSplit := nbChunksPostSplit * 2
-        if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) {
-            // if postSplit we still have less tasks than available CPU
-            // or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
-            config.NbTasks /= 2
-            var _p G2Jac
-            chDone := make(chan struct{}, 1)
-            go func() {
-                _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
-                close(chDone)
-            }()
-            p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
-            <-chDone
-            p.AddAssign(&_p)
-            return p, nil
+    // should we recursively split the msm in half? (see below)
+    // we want to minimize the execution time of the algorithm;
+    // splitting the msm will **add** operations, but if it allows us to use more CPUs, it might be worth it.
+
+    // costFunction returns a metric that represents the "wall time" of the algorithm
+    costFunction := func(nbTasks, nbCpus, costPerTask int) int {
+        // cost for the reduction of all tasks (msmReduceChunk)
+        totalCost := nbTasks
+
+        // cost for the computation of each task (msmProcessChunk)
+        for nbTasks >= nbCpus {
+            nbTasks -= nbCpus
+            totalCost += costPerTask
+        }
+        if nbTasks > 0 {
+            totalCost += costPerTask
+        }
+        return totalCost
+    }
+
+    // costPerTask is the approximate number of group ops per task
+    costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) }
+
+    costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))
+
+    cPostSplit := bestC(nbPoints / 2)
+    nbChunksPostSplit := int(computeNbChunks(cPostSplit))
+    costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))
+
+    // if the cost of the split msm is lower than the cost of the non-split msm, we split
+    if costPostSplit < costPreSplit {
+        config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
+        var _p G2Jac
+        chDone := make(chan struct{}, 1)
+        go func() {
+            _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
+            close(chDone)
+        }()
+        p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
+        <-chDone
+        p.AddAssign(&_p)
+        return p, nil
     }
+
+    // if we don't split, we use the best C we found
     _innerMsmG2(p, C, points, scalars, config)
 
     return p, nil
@@ -406,6 +469,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co
         chChunks[i] = make(chan g2JacExtended, 1)
     }
 
+    // we use a semaphore to limit the number of goroutines running concurrently
+    // (only if nbTasks < nbCPU)
+    var sem chan struct{}
+    if config.NbTasks < runtime.NumCPU() {
+        sem = make(chan struct{}, config.NbTasks*2) // *2 because if a chunk is overweight we split it in two
+        for i := 0; i < config.NbTasks; i++ {
+            sem <- struct{}{}
+        }
+        defer func() {
+            close(sem)
+        }()
+    }
+
     // the last chunk may be processed with a different method than the rest, as it could be smaller.
     n := len(points)
     for j := int(nbChunks - 1); j >= 0; j-- {
@@ -418,8 +494,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co
             // else what would happen is this go routine would finish much later than the others.
             chSplit := make(chan g2JacExtended, 2)
             split := n / 2
-            go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split])
-            go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n])
+
+            if sem != nil {
+                sem <- struct{}{} // add another token to the semaphore, since we split in two.
+            }
+            go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
+            go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
             go func(chunkID int) {
                 s1 := <-chSplit
                 s2 := <-chSplit
@@ -429,7 +509,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co
             }(j)
             continue
         }
-        go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
+        go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
     }
 
     return msmReduceChunkG2Affine(p, int(c), chChunks[:])
@@ -437,7 +517,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co
 
 // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk
 // to return the best algorithm to process the chunk.
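The split branch is a small fork-join: one half of the MSM runs on a fresh goroutine, the other half on the calling goroutine, and closing chDone publishes the first half's result before the final AddAssign. A minimal sketch of the same shape, with an integer sum standing in for MultiExp:

```go
package main

import "fmt"

func sum(xs []int) int {
	t := 0
	for _, x := range xs {
		t += x
	}
	return t
}

func main() {
	xs := []int{1, 2, 3, 4, 5, 6, 7, 8}
	half := len(xs) / 2

	var left int // plays the role of _p
	chDone := make(chan struct{}, 1)
	go func() {
		left = sum(xs[:half]) // first half on its own goroutine
		close(chDone)         // happens-before the receive below
	}()
	right := sum(xs[half:]) // second half on the calling goroutine
	<-chDone                // join, then combine (p.AddAssign(&_p) in the diff)
	fmt.Println(left + right)
}
```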
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -582,6 +662,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index c804991099..7d27c125c6 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index eeeb196809..c6f10cc799 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -97,7 +109,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -133,6 +151,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 0492475b39..7961763cf1 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index e4909fa9e1..b3c1f10e3f 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
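The processChunk*Jacobian bodies only gain the semaphore handshake; the pre-existing runningSum/total reduction between the acquire and the release appears here only as fragments. For context, the trick those fragments implement, computing the weighted sum of buckets with a single extra addition per bucket, looks like this as a toy integer analogue (integer adds in place of group adds; an illustration, not the library code):

```go
package main

import "fmt"

func main() {
	// bucket[k] holds the sum of all points whose current digit is k
	buckets := []int{0, 5, 0, 2, 7}

	runningSum, total := 0, 0
	for k := len(buckets) - 1; k > 0; k-- {
		runningSum += buckets[k] // suffix sum: bucket[k] + bucket[k+1] + ...
		total += runningSum      // adds bucket[k] exactly k times overall
	}
	fmt.Println(total) // 1*5 + 3*2 + 4*7 = 39
}
```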
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
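The semaphore set up in _innerMsmG1/_innerMsmG2 is the classic counting-semaphore-as-buffered-channel idiom: pre-fill NbTasks tokens, and have each worker receive a token before it starts and send it back before publishing its result. When sem is nil (NbTasks >= NumCPU, or the reference path in the tests) all of this is skipped. Stripped of the MSM details, the discipline is the following sketch:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	const nbTasks, nbChunks = 2, 8

	sem := make(chan struct{}, nbTasks)
	for i := 0; i < nbTasks; i++ {
		sem <- struct{}{} // pre-fill: at most nbTasks workers run at once
	}

	var wg sync.WaitGroup
	results := make([]int, nbChunks)
	for j := 0; j < nbChunks; j++ {
		wg.Add(1)
		go func(j int) {
			defer wg.Done()
			<-sem              // acquire a token; block if nbTasks are busy
			results[j] = j * j // stand-in for msmProcessChunk
			sem <- struct{}{}  // release before handing off the result
		}(j)
	}
	wg.Wait()
	fmt.Println(results)
}
```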
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -298,6 +338,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -332,7 +373,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -362,29 +403,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -406,6 +469,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -418,8 +494,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -429,7 +509,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -437,7 +517,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
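When the last chunk is overweight, the loop injects one extra token before forking the two halves; that is also why the channel is allocated with capacity config.NbTasks*2 rather than config.NbTasks, so a release never blocks even while split chunks temporarily hold more tokens than the budget. A toy rendering of that flow (integer sums instead of bucket accumulation):

```go
package main

import "fmt"

func processHalf(xs []int, ch chan<- int, sem chan struct{}) {
	<-sem // same token discipline as a regular chunk
	t := 0
	for _, x := range xs {
		t += x
	}
	sem <- struct{}{} // release before publishing, as in the diff
	ch <- t
}

func main() {
	const nbTasks = 1
	sem := make(chan struct{}, nbTasks*2) // *2 leaves room for the extra token
	sem <- struct{}{}                     // pre-fill nbTasks (here, one) tokens

	xs := []int{3, 1, 4, 1, 5, 9, 2, 6}
	split := len(xs) / 2

	chSplit := make(chan int, 2)
	sem <- struct{}{} // extra token: the split doubles the goroutine count
	go processHalf(xs[:split], chSplit, sem)
	go processHalf(xs[split:], chSplit, sem)

	s1, s2 := <-chSplit, <-chSplit
	fmt.Println(s1 + s2) // both halves reduce into a single result
}
```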
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -582,6 +662,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index ba8135b40f..cbbb297004 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index a07c9e874f..a1eb686cb7 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -97,7 +109,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -133,6 +151,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index e48db59fd5..efd3fb3709 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 3650ae1e8b..9204858aef 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
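Seen from a caller, the change is behavioral: leaving NbTasks unset now defaults to 2*runtime.NumCPU(), and an explicit NbTasks genuinely caps concurrency through the semaphore instead of acting as a loose hint. A usage sketch against the bn254 package (illustrative; the generator-copies input exists only to make the snippet self-contained):

```go
package main

import (
	"fmt"
	"runtime"

	"github.com/consensys/gnark-crypto/ecc"
	"github.com/consensys/gnark-crypto/ecc/bn254"
	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
)

func main() {
	const n = 1 << 10

	// n copies of the generator with random scalars; a real caller would
	// bring its own points (an SRS, a commitment basis, ...).
	_, _, g, _ := bn254.Generators()
	points := make([]bn254.G1Affine, n)
	scalars := make([]fr.Element, n)
	for i := range points {
		points[i] = g
		scalars[i].SetRandom()
	}

	var p bn254.G1Jac
	// cap the MSM at half the machine; NbTasks <= 0 would default to 2*NumCPU
	if _, err := p.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: runtime.NumCPU() / 2}); err != nil {
		panic(err)
	}
	fmt.Println(p.Z.IsZero()) // false for a non-trivial result
}
```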
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -300,6 +340,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -334,7 +375,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -364,29 +405,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -408,6 +471,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -420,8 +496,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -431,7 +511,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -439,7 +519,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
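The slice expression digits[j*n:(j+1)*n] in the chunk loops relies on partitionScalars laying the decomposition out chunk-major: for n points, chunk j's digit of every point sits in one contiguous window. A toy decomposition showing just that layout (unsigned digits only; the real partitionScalars also applies the negative-digit trick described in its comment and gathers chunkStat):

```go
package main

import "fmt"

func main() {
	scalars := []uint{27, 6, 13} // n = 3 "scalars"
	const c, nbChunks = 2, 3     // three chunks of c bits each

	n := len(scalars)
	digits := make([]uint16, nbChunks*n)
	for i, s := range scalars {
		for j := 0; j < nbChunks; j++ {
			// chunk-major: chunk j's digits for all points are contiguous
			digits[j*n+i] = uint16((s >> (c * j)) & ((1 << c) - 1))
		}
	}
	for j := 0; j < nbChunks; j++ {
		// exactly the window a processChunk goroutine receives
		fmt.Println("chunk", j, digits[j*n:(j+1)*n])
	}
}
```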
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -586,6 +666,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 81078a1438..e4b3bfced9 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 53490b9cc0..f9b8a901bd 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -99,7 +111,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -135,6 +153,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 17989d3689..e1f848e9f2 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index cbab47917f..3e0ad3498e 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 4: @@ -247,6 +287,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -281,7 +322,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -311,29 +352,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -355,6 +418,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -367,8 +443,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -378,7 +458,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -386,7 +466,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
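getChunkProcessorG1/getChunkProcessorG2 change only in their signatures: every specialization the switch can return now takes the semaphore as a trailing parameter, and callers that want no limit, such as the reference implementations in the tests, pass nil. A toy version of that dispatch shape (the processors are stand-ins, not the real jacobian or batch-affine variants):

```go
package main

import "fmt"

type processor func(chunkID uint64, chRes chan<- int, sem chan struct{})

func getProcessor(c uint64) processor {
	// stand-in specialization; the real code picks a bucket type per window size
	jacobian := func(chunkID uint64, chRes chan<- int, sem chan struct{}) {
		if sem != nil {
			<-sem // acquire only when a limit is in force
		}
		result := int(chunkID) // placeholder for the bucket accumulation
		if sem != nil {
			sem <- struct{}{} // release before publishing
		}
		chRes <- result
	}
	switch {
	case c <= 9:
		return jacobian // small windows: plain jacobian buckets
	default:
		return jacobian // larger windows would select a batch-affine variant
	}
}

func main() {
	chRes := make(chan int, 1)
	getProcessor(16)(7, chRes, nil) // nil sem = unlimited, as in the tests
	fmt.Println(<-chRes)
}
```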
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 4: @@ -480,6 +560,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 2c9eaa8ac1..afa4a2bf31 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -36,7 +36,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -225,6 +231,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -292,7 +304,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -481,6 +499,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index a79e632607..e8d707ec00 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -81,7 +93,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -117,6 +135,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 74f2ef86eb..1e45861b77 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 3e691db6c9..9129701454 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
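The sem parameter threaded through the processChunk* variants above is a plain counting semaphore built from a buffered channel of empty structs. As a standalone illustration (not part of the patch itself), the acquire/release discipline looks like this:

package main

import (
	"fmt"
	"sync"
)

func main() {
	nbTasks := 2
	// a counting semaphore: a buffered channel pre-filled with nbTasks tokens
	sem := make(chan struct{}, nbTasks)
	for i := 0; i < nbTasks; i++ {
		sem <- struct{}{}
	}

	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			<-sem // acquire: at most nbTasks chunks run at once
			fmt.Println("processing chunk", id)
			sem <- struct{}{} // release before publishing the result
		}(i)
	}
	wg.Wait()
}

Releasing before the result is sent matters: the reducer drains result channels in chunk order, so a goroutine that is blocked on delivery should not keep holding a CPU slot.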
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
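For intuition about the heuristic that follows, here is a small, runnable walk-through of the cost model with toy numbers (the function body mirrors the costFunction introduced by this patch; the numbers are illustrative):

package main

import "fmt"

// costFunction mirrors the heuristic in this patch: reduction costs ~1 unit
// per task, and tasks execute in rounds of nbCpus, each round costing
// costPerTask of "wall time".
func costFunction(nbTasks, nbCpus, costPerTask int) int {
	totalCost := nbTasks // msmReduceChunk part
	for nbTasks >= nbCpus {
		nbTasks -= nbCpus
		totalCost += costPerTask
	}
	if nbTasks > 0 {
		totalCost += costPerTask
	}
	return totalCost
}

func main() {
	// 10 chunks on 4 CPUs, 100 "group ops" per chunk:
	// 3 rounds of work (4+4+2) -> 10 + 3*100 = 310
	fmt.Println(costFunction(10, 4, 100))
	// after a hypothetical split: 20 smaller chunks of 60 ops each:
	// 5 full rounds -> 20 + 5*60 = 320, so splitting would not pay off here
	fmt.Println(costFunction(20, 4, 60))
}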
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -247,6 +287,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -281,7 +322,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -311,29 +352,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
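The split branch itself is a plain fork-join: one half runs on a fresh goroutine, the other on the current one, and the partial results are merged (p.AddAssign(&_p) in the real code). A minimal, self-contained sketch of the pattern, with integer sums standing in for the Jacobian accumulators:

package main

import "fmt"

// sum sketches the fork-join used when splitting pays off.
func sum(xs []int) int {
	if len(xs) <= 2 {
		total := 0
		for _, x := range xs {
			total += x
		}
		return total
	}
	half := len(xs) / 2
	chDone := make(chan int, 1)
	go func() { chDone <- sum(xs[:half]) }()
	total := sum(xs[half:])
	return total + <-chDone // merge, like p.AddAssign(&_p)
}

func main() {
	fmt.Println(sum([]int{1, 2, 3, 4, 5})) // 15
}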
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -355,6 +418,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -367,8 +443,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -378,7 +458,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -386,7 +466,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -480,6 +560,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 4a5df9652f..edcf8f1b8d 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -36,7 +36,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -225,6 +231,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -292,7 +304,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -481,6 +499,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 9251324d15..503e51b341 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -81,7 +93,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -117,6 +135,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index d454ef78df..64b515445d 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 47aa334cc0..161132e0b5 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
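Just below, NbTasks is normalised before the split heuristic runs. A minimal sketch of that defaulting logic, mirroring the checks in this diff (normalizeNbTasks is our name for illustration, not a library function):

package main

import (
	"errors"
	"fmt"
	"runtime"
)

// normalizeNbTasks sketches the NbTasks defaulting done by MultiExp: unset
// values now default to twice the CPU count (giving the split heuristic
// headroom), and absurd values are rejected.
func normalizeNbTasks(nbTasks int) (int, error) {
	if nbTasks <= 0 {
		return runtime.NumCPU() * 2, nil
	}
	if nbTasks > 1024 {
		return 0, errors.New("invalid config: config.NbTasks > 1024")
	}
	return nbTasks, nil
}

func main() {
	n, _ := normalizeNbTasks(0)
	fmt.Println("default NbTasks:", n)
}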
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -249,6 +289,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -283,7 +324,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -313,29 +354,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
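The costPerTask helper used below reflects the bucket method: each chunk costs roughly one bucket addition per point, plus on the order of 2^c operations to fold the 2^c buckets at the end. This is why halving nbPoints (and usually c with it) shrinks each task, at the price of more tasks overall. A tiny, runnable illustration:

package main

import "fmt"

// costPerTask approximates the group operations in one chunk of the bucket
// method: ~1 bucket addition per point + ~2^c ops for the bucket reduction.
func costPerTask(c uint64, nbPoints int) int { return nbPoints + int(1<<c) }

func main() {
	// for 2^20 points and a window of c=16: ~1M + 65k ops per chunk
	fmt.Println(costPerTask(16, 1<<20)) // 1114112
}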
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -357,6 +420,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -369,8 +445,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -380,7 +460,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -388,7 +468,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -484,6 +564,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 1e67370777..38f11c5b03 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -36,7 +36,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -225,6 +231,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -292,7 +304,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -481,6 +499,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 16d7e6c8d3..2401259a33 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -83,7 +95,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -119,6 +137,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 66e6e17241..d98bd688b6 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/secp256k1/multiexp.go b/ecc/secp256k1/multiexp.go index 02febdfef6..9977c2b345 100644 --- a/ecc/secp256k1/multiexp.go +++ b/ecc/secp256k1/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
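One subtlety in the _innerMsm* loops above: when the last, overweight chunk is split in two, one extra token is pushed so the two halves together consume a single task slot, and the semaphore capacity of NbTasks*2 guarantees that push can never block. A standalone sketch of the token accounting (toy workload, not the patch's code):

package main

import (
	"fmt"
	"sync"
)

func main() {
	nbTasks := 2
	// capacity nbTasks*2: the extra token sent on a split never blocks,
	// even if every regular token is already back in the channel.
	sem := make(chan struct{}, nbTasks*2)
	for i := 0; i < nbTasks; i++ {
		sem <- struct{}{}
	}

	var wg sync.WaitGroup
	process := func(name string) {
		defer wg.Done()
		<-sem
		fmt.Println("processing", name)
		sem <- struct{}{}
	}

	wg.Add(2)
	sem <- struct{}{} // the two halves of a split chunk count as one extra task
	go process("first half")
	go process("second half")
	wg.Wait()
}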
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -318,6 +358,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/secp256k1/multiexp_affine.go b/ecc/secp256k1/multiexp_affine.go index a7a6c0c16a..9cb8e2e167 100644 --- a/ecc/secp256k1/multiexp_affine.go +++ b/ecc/secp256k1/multiexp_affine.go @@ -36,7 +36,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -225,6 +231,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/secp256k1/multiexp_jacobian.go b/ecc/secp256k1/multiexp_jacobian.go index 788e646617..40a34f8fae 100644 --- a/ecc/secp256k1/multiexp_jacobian.go +++ b/ecc/secp256k1/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/secp256k1/multiexp_test.go b/ecc/secp256k1/multiexp_test.go index e0ad2cf8d1..87cbd1575c 100644 --- a/ecc/secp256k1/multiexp_test.go +++ b/ecc/secp256k1/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC15] - go processChunk(uint64(j), chChunks[j], 15, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 15, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(15), chChunks[:]) diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index b79994d7f5..1096439278 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -69,6 +69,11 @@ type chunkStat struct { // negative digits can be 
processed in a later step as adding -G into the bucket instead of G
 // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
 func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) {
+	// no benefit here to have more tasks than CPUs
+	if nbTasks > runtime.NumCPU() {
+		nbTasks = runtime.NumCPU()
+	}
+
 	// number of c-bit radixes in a scalar
 	nbChunks := computeNbChunks(c)

@@ -240,6 +245,7 @@ func (p *{{ $.TAffine }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elemen
 //
 // This call return an error if len(scalars) != len(points) or if provided config is invalid.
 func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TJacobian }}, error) {
+	// TODO @gbotrel replace the ecc.MultiExpConfig with an Option pattern for maintainability.
 	// note:
 	// each of the msmCX method is the same, except for the c constant it declares
 	// duplicating (through template generation) these methods allows to declare the buckets on the stack
@@ -274,7 +280,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem

 	// if nbTasks is not set, use all available CPUs
 	if config.NbTasks <= 0 {
-		config.NbTasks = runtime.NumCPU()
+		config.NbTasks = runtime.NumCPU() * 2
 	} else if config.NbTasks > 1024 {
 		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
@@ -306,29 +312,51 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem
 	C := bestC(nbPoints)
 	nbChunks := int(computeNbChunks(C))

-	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
-	if config.NbTasks > 1 && nbChunks < config.NbTasks {
-		// before splitting, let's see if we end up with more tasks than thread;
-		cSplit := bestC(nbPoints/2)
-		nbChunksPostSplit := int(computeNbChunks(cSplit))
-		nbTasksPostSplit := nbChunksPostSplit*2
-		if (nbTasksPostSplit <= config.NbTasks /2 ) || ( nbTasksPostSplit - config.NbTasks/2 ) <= ( config.NbTasks - nbChunks) {
-			// if postSplit we still have less tasks than available CPU
-			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
-			config.NbTasks /= 2
-			var _p {{ $.TJacobian }}
-			chDone := make(chan struct{}, 1)
-			go func() {
-				_p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
-				close(chDone)
-			}()
-			p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
-			<-chDone
-			p.AddAssign(&_p)
-			return p, nil
+	// should we recursively split the msm in half? (see below)
+	// we want to minimize the execution time of the algorithm;
+	// splitting the msm will **add** operations, but if it allows us to use more CPUs, it might be worth it.
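Since the per-curve files above are generated from this template, here is an end-to-end toy of the decision the template encodes. wallTime condenses the patch's costFunction into its closed form, and bestC/computeNbChunks are hypothetical stand-ins (the real ones are curve-specific lookups), so only the shape of the trade-off carries over:

package main

import (
	"fmt"
	"math"
)

// wallTime condenses costFunction: tasks run in rounds of nbCpus, each round
// costing costPerTask, plus ~1 unit per task for the reduction.
func wallTime(nbTasks, nbCpus, costPerTask int) int {
	rounds := int(math.Ceil(float64(nbTasks) / float64(nbCpus)))
	return nbTasks + rounds*costPerTask
}

// Toy stand-ins: a ~254-bit scalar field and a fixed window size.
func bestC(nbPoints int) uint64       { return 16 }
func computeNbChunks(c uint64) uint64 { return (254 + c - 1) / c }
func costPerTask(c uint64, n int) int { return n + int(1<<c) }

func shouldSplit(nbPoints, nbTasks int) bool {
	c := bestC(nbPoints)
	pre := wallTime(int(computeNbChunks(c)), nbTasks, costPerTask(c, nbPoints))
	cs := bestC(nbPoints / 2)
	post := wallTime(int(computeNbChunks(cs))*2, nbTasks, costPerTask(cs, nbPoints/2))
	return post < pre
}

func main() {
	for _, nbTasks := range []int{4, 48, 192} {
		// prints: 4 false, 48 true, 192 true
		fmt.Println(nbTasks, "tasks, 2^20 points -> split:", shouldSplit(1<<20, nbTasks))
	}
}

With few CPUs the split only doubles the number of rounds, so it loses; once chunks no longer fill the machine, halving each task wins. That is the 96-core case the commit message targets.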
+
+	// costFunction returns a metric that represents the "wall time" of the algorithm
+	costFunction := func(nbTasks, nbCpus, costPerTask int) int {
+		// cost for the reduction of all tasks (msmReduceChunk)
+		totalCost := nbTasks
+
+		// cost for the computation of each task (msmProcessChunk)
+		for nbTasks >= nbCpus {
+			nbTasks -= nbCpus
+			totalCost += costPerTask
+		}
+		if nbTasks > 0 {
+			totalCost += costPerTask
+		}
+		return totalCost
+	}
+
+	// costPerTask is the approximate number of group ops per task
+	costPerTask := func(c uint64, nbPoints int) int {return (nbPoints + int((1 << c)))}
+
+	costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))
+
+	cPostSplit := bestC(nbPoints/2)
+	nbChunksPostSplit := int(computeNbChunks(cPostSplit))
+	costPostSplit := costFunction(nbChunksPostSplit * 2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))
+
+	// if the cost of the split msm is lower than the cost of the non split msm, we split
+	if costPostSplit < costPreSplit {
+		config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
+		var _p {{ $.TJacobian }}
+		chDone := make(chan struct{}, 1)
+		go func() {
+			_p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
+			close(chDone)
+		}()
+		p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
+		<-chDone
+		p.AddAssign(&_p)
+		return p, nil
 	}
+	// if we don't split, we use the best C we found
 	_innerMsm{{ $.UPointName }}(p, C, points, scalars, config)
 	return p, nil
@@ -350,6 +378,19 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 		chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1)
 	}

+	// we use a semaphore to limit the number of goroutines running concurrently
+	// (only if nbTasks < nbCPU)
+	var sem chan struct{}
+	if config.NbTasks < runtime.NumCPU() {
+		sem = make(chan struct{}, config.NbTasks * 2) // *2 because if a chunk is overweight, we split it in two
+		for i:=0; i < config.NbTasks; i++ {
+			sem <- struct{}{}
+		}
+		defer func() {
+			close(sem)
+		}()
+	}
+
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
 	for j := int(nbChunks - 1); j >= 0; j-- {
@@ -362,8 +403,12 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 			// else what would happen is this go routine would finish much later than the others.
 			chSplit := make(chan {{ $.TJacobianExtended }}, 2)
 			split := n / 2
-			go processChunk(uint64(j),chSplit, c, points[:split], digits[j*n:(j*n)+split])
-			go processChunk(uint64(j),chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n])
+
+			if sem != nil {
+				sem <- struct{}{} // add another token to the semaphore, since we split in two.
+			}
+			go processChunk(uint64(j),chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
+			go processChunk(uint64(j),chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
 			go func(chunkID int) {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -373,7 +418,7 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 			}(j)
 			continue
 		}
-		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
+		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
 	}

 	return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:])
@@ -382,7 +427,7 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T

 // getChunkProcessor{{ $.UPointName }} decides, depending on c window size and statistics for the chunk
 // to return the best algorithm to process the chunk.
-func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) {
+func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16, sem chan struct{}) {
 	switch c {
 	{{- range $c := $.LastCRange}}
 	case {{$c}}:
diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl
index 6c4672b3de..a1c4dd80aa 100644
--- a/internal/generator/ecc/template/multiexp_affine.go.tmpl
+++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl
@@ -38,7 +38,13 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B
 	chRes chan<- {{ $.TJacobianExtended }},
 	c uint64,
 	points []{{ $.TAffine }},
-	digits []uint16) {
+	digits []uint16,
+	sem chan struct{}) {
+
+	if sem != nil {
+		// if we are limited, wait for a token in the semaphore
+		<-sem
+	}

 	// the batch affine addition needs independent points; in other words, for a window of batchSize
 	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
@@ -230,6 +236,13 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B
 		total.add(&runningSum)
 	}
+
+	if sem != nil {
+		// release a token to the semaphore
+		// before sending to chRes
+		sem <- struct{}{}
+	}
+
 	chRes <- total
 }

diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl
index 89b4fbe80e..95dfcdeb39 100644
--- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl
+++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl
@@ -21,9 +21,13 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk
 	chRes chan<- {{ $.TJacobianExtended }},
 	c uint64,
 	points []{{ $.TAffine }},
-	digits []uint16) {
-
+	digits []uint16,
+	sem chan struct{}) {
+	if sem != nil {
+		// if we are limited, wait for a token in the semaphore
+		<-sem
+	}
 	var buckets B
 	for i := 0 ; i < len(buckets); i++ {
@@ -60,6 +64,12 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk
 		total.add(&runningSum)
 	}
+	if sem != nil {
+		// release a token to the semaphore
+		// before sending to chRes
+		sem <- struct{}{}
+	}
+
 	chRes <- total
 }

diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl
index 0201ada492..3feb22ffe2 100644
--- a/internal/generator/ecc/template/tests/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl
@@ -333,7 +333,7 @@ func _innerMsm{{ $.UPointName }}Reference(p *{{ $.TJacobian }}, points []{{ $.TA
 	n := len(points)
 	for j := int(nbChunks - 1); j >= 0; j-- {
 		processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$.cmax}}]
-		go processChunk(uint64(j), chChunks[j], {{$.cmax}}, points, digits[j*n:(j+1)*n])
+		go processChunk(uint64(j), chChunks[j], {{$.cmax}}, points, digits[j*n:(j+1)*n], nil)
 	}

 	return msmReduceChunk{{ $.TAffine }}(p, int({{$.cmax}}), chChunks[:])
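Finally, a usage sketch from the caller's side: with the semaphore in place, capping NbTasks now actually bounds concurrency instead of only sizing the work split. The sizes below are illustrative; the imports are the library's public bn254 packages:

package main

import (
	"fmt"
	"runtime"

	"github.com/consensys/gnark-crypto/ecc"
	"github.com/consensys/gnark-crypto/ecc/bn254"
	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
)

func main() {
	const n = 1 << 10
	_, _, g1Aff, _ := bn254.Generators()

	points := make([]bn254.G1Affine, n)
	scalars := make([]fr.Element, n)
	for i := 0; i < n; i++ {
		points[i] = g1Aff // toy input: every entry is the generator
		scalars[i].SetRandom()
	}

	var p bn254.G1Affine
	// cap the MSM at half the cores; the semaphore added by this patch keeps
	// the rest of the machine available for other work.
	if _, err := p.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: runtime.NumCPU() / 2}); err != nil {
		panic(err)
	}
	fmt.Println(p.X.String())
}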