From 1f40d7cc5f019cb98efc3fae897ee2297291197b Mon Sep 17 00:00:00 2001
From: Gautam Botrel
Date: Tue, 6 Jun 2023 10:35:01 -0500
Subject: [PATCH] msm: semaphore to limit CPUs + better split strategy (up to 25% perf boost on 96 cores) (#403)

* feat: add semaphore in msm and better split strategy
* fix: close the chan semaphore
* perf: reduced default value of nbTasks
---
 ecc/bls12-377/multiexp.go | 185 +++++++++++++-----
 ecc/bls12-377/multiexp_affine.go | 28 ++-
 ecc/bls12-377/multiexp_jacobian.go | 28 ++-
 ecc/bls12-377/multiexp_test.go | 4 +-
 ecc/bls12-378/multiexp.go | 185 +++++++++++++-----
 ecc/bls12-378/multiexp_affine.go | 28 ++-
 ecc/bls12-378/multiexp_jacobian.go | 28 ++-
 ecc/bls12-378/multiexp_test.go | 4 +-
 ecc/bls12-381/multiexp.go | 185 +++++++++++++-----
 ecc/bls12-381/multiexp_affine.go | 28 ++-
 ecc/bls12-381/multiexp_jacobian.go | 28 ++-
 ecc/bls12-381/multiexp_test.go | 4 +-
 ecc/bls24-315/multiexp.go | 185 +++++++++++++-----
 ecc/bls24-315/multiexp_affine.go | 28 ++-
 ecc/bls24-315/multiexp_jacobian.go | 28 ++-
 ecc/bls24-315/multiexp_test.go | 4 +-
 ecc/bls24-317/multiexp.go | 185 +++++++++++++-----
 ecc/bls24-317/multiexp_affine.go | 28 ++-
 ecc/bls24-317/multiexp_jacobian.go | 28 ++-
 ecc/bls24-317/multiexp_test.go | 4 +-
 ecc/bn254/multiexp.go | 185 +++++++++++++-----
 ecc/bn254/multiexp_affine.go | 28 ++-
 ecc/bn254/multiexp_jacobian.go | 28 ++-
 ecc/bn254/multiexp_test.go | 4 +-
 ecc/bw6-633/multiexp.go | 185 +++++++++++++-----
 ecc/bw6-633/multiexp_affine.go | 28 ++-
 ecc/bw6-633/multiexp_jacobian.go | 28 ++-
 ecc/bw6-633/multiexp_test.go | 4 +-
 ecc/bw6-756/multiexp.go | 185 +++++++++++++-----
 ecc/bw6-756/multiexp_affine.go | 28 ++-
 ecc/bw6-756/multiexp_jacobian.go | 28 ++-
 ecc/bw6-756/multiexp_test.go | 4 +-
 ecc/bw6-761/multiexp.go | 185 +++++++++++++-----
 ecc/bw6-761/multiexp_affine.go | 28 ++-
 ecc/bw6-761/multiexp_jacobian.go | 28 ++-
 ecc/bw6-761/multiexp_test.go | 4 +-
 ecc/secp256k1/multiexp.go | 95 ++++++---
 ecc/secp256k1/multiexp_affine.go | 14 +-
 ecc/secp256k1/multiexp_jacobian.go | 14 +-
 ecc/secp256k1/multiexp_test.go | 2 +-
 .../generator/ecc/template/multiexp.go.tmpl | 95 ++++++---
 .../ecc/template/multiexp_affine.go.tmpl | 15 +-
 .../ecc/template/multiexp_jacobian.go.tmpl | 14 +-
 .../ecc/template/tests/multiexp.go.tmpl | 2 +-
 44 files changed, 1895 insertions(+), 561 deletions(-)

diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go
index 10699d84a7..52fdc682c1 100644
--- a/ecc/bls12-377/multiexp.go
+++ b/ecc/bls12-377/multiexp.go
@@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.
 //
 // This call return an error if len(scalars) != len(points) or if provided config is invalid.
 func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability.
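// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] The "semaphore to limit CPUs" in the
// commit title is the classic Go counting semaphore built from a buffered
// channel: receive to acquire a token, send to release one. A minimal,
// self-contained sketch of the pattern; all names below are illustrative and
// not from gnark-crypto:

package main

import (
	"fmt"
	"sync"
)

func main() {
	const nbTasks = 4 // stand-in for config.NbTasks
	sem := make(chan struct{}, nbTasks)
	for i := 0; i < nbTasks; i++ {
		sem <- struct{}{} // pre-fill with nbTasks tokens, as the patch does
	}

	var wg sync.WaitGroup
	for job := 0; job < 16; job++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			<-sem                                // acquire a token
			defer func() { sem <- struct{}{} }() // release it when done
			fmt.Println("processing chunk", id)  // at most nbTasks goroutines run this section at once
		}(job)
	}
	wg.Wait()
}
// ---------------------------------------------------------------------------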
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
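// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] A worked example of the heuristic
// implemented just below, with illustrative numbers: take nbCpus = 8.
// Pre-split, suppose the msm yields 10 chunks costing 100 group ops each;
// they run in ceil(10/8) = 2 waves, so the modeled wall time is
// 2*100 + 10 (reduction) = 210. Post-split, suppose we get 16 cheaper chunks
// costing 70 each: ceil(16/8) = 2 waves, i.e. 2*70 + 16 = 156. Total work
// grew (1120 vs 1000 ops) but the modeled wall time dropped, because the
// 6 CPUs idle during the pre-split second wave now do useful work; so
// costPostSplit < costPreSplit and the msm splits.
// ---------------------------------------------------------------------------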
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -298,6 +338,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -332,7 +373,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -362,29 +403,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
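// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] When the cost model favors
// splitting, the msm forks into two half-size msms: one half runs on a new
// goroutine, the other on the current one, a channel signals completion, and
// the two partial results are combined with AddAssign. A generic sketch of
// that fork/join shape, with illustrative names rather than gnark-crypto
// APIs:

// sumHalves adds up xs using the same fork/join structure as the split msm.
func sumHalves(xs []int) int {
	half := len(xs) / 2
	chDone := make(chan int, 1)
	go func() { // left half on its own goroutine
		s := 0
		for _, x := range xs[:half] {
			s += x
		}
		chDone <- s
	}()
	s := 0 // right half on the current goroutine
	for _, x := range xs[half:] {
		s += x
	}
	return s + <-chDone // join; analogous to <-chDone then p.AddAssign(&_p)
}

// Note the patch also halves config.NbTasks with math.Ceil before recursing,
// so each half works with roughly half the CPU budget and the recursion
// bottoms out.
// ---------------------------------------------------------------------------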
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -406,6 +469,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -418,8 +494,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -429,7 +509,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -437,7 +517,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -582,6 +662,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index c6c022df46..d8ed9f8c4d 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index 8226a8d912..f766d8adb1 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -97,7 +109,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -133,6 +151,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 1166d52a2e..712b8f7d33 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 926c013cc4..494177fb58 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
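// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] Every processChunk variant in the
// hunks above follows the same token discipline: acquire on entry (a nil
// semaphore, as passed by the reference tests, means "no limit") and release
// before sending the result on chRes. Because each release happens before the
// corresponding result send, every token is back by the time the reducer has
// consumed all results, so the deferred close(sem) cannot race with a late
// release. A distilled sketch with illustrative names:

func worker(sem chan struct{}, chRes chan<- int, work func() int) {
	if sem != nil {
		<-sem // acquire a token, or block until one is free
	}
	res := work()
	if sem != nil {
		sem <- struct{}{} // release the token before the send on chRes
	}
	chRes <- res
}

// For an overweight last chunk, _innerMsmG1/_innerMsmG2 first send one extra
// token (that chunk now occupies two workers) and join the halves in a small
// aggregator goroutine; schematically:
//
//	if sem != nil {
//		sem <- struct{}{} // extra token, since this chunk splits in two
//	}
//	go worker(sem, chSplit, leftHalf)
//	go worker(sem, chSplit, rightHalf)
//	go func() { chChunk <- (<-chSplit) + (<-chSplit) }()
// ---------------------------------------------------------------------------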
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -300,6 +340,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -334,7 +375,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -364,29 +405,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -408,6 +471,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -420,8 +496,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -431,7 +511,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -439,7 +519,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -586,6 +666,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 22a42a7ef2..6fe1672180 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index fe3cd412a3..86b7dbc9ae 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -99,7 +111,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -135,6 +153,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 25a035e5f2..ad7d9474d1 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index cdc96a2b6a..df673c4c02 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
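// ---------------------------------------------------------------------------
// [editor's note, not part of the patch] Two NbTasks changes in the
// surrounding hunks work as a pair: MultiExp now defaults config.NbTasks to
// runtime.NumCPU()*2, presumably oversubscribing the chunk workers so a
// finishing goroutine always has a queued successor, while partitionScalars
// clamps nbTasks back to runtime.NumCPU(), since (per its comment) the
// digit-decomposition pass gains nothing from more tasks than CPUs.
// ---------------------------------------------------------------------------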
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -298,6 +338,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -332,7 +373,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -362,29 +403,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -406,6 +469,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -418,8 +494,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -429,7 +509,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -437,7 +517,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -582,6 +662,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 285da72848..14a1dc29d6 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index 9abe78b336..b5af1128a3 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -97,7 +109,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -133,6 +151,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 8cd076b090..92dd4c7eb4 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 7c708411fd..4619bee3f8 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) {
+func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) {
 
     switch c {
 
     case 2:
@@ -298,6 +338,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.
 //
 // This call return an error if len(scalars) != len(points) or if provided config is invalid.
 func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+    // TODO @gbotrel replace the ecc.MultiExpConfig with an Option pattern for maintainability.
     // note:
     // each of the msmCX method is the same, except for the c constant it declares
     // duplicating (through template generation) these methods allows to declare the buckets on the stack
@@ -332,7 +373,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
     // if nbTasks is not set, use all available CPUs
     if config.NbTasks <= 0 {
-        config.NbTasks = runtime.NumCPU()
+        config.NbTasks = runtime.NumCPU() * 2
     } else if config.NbTasks > 1024 {
         return nil, errors.New("invalid config: config.NbTasks > 1024")
     }
@@ -362,29 +403,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
     C := bestC(nbPoints)
     nbChunks := int(computeNbChunks(C))
 
-    // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
-    if config.NbTasks > 1 && nbChunks < config.NbTasks {
-        // before splitting, let's see if we end up with more tasks than thread;
-        cSplit := bestC(nbPoints / 2)
-        nbChunksPostSplit := int(computeNbChunks(cSplit))
-        nbTasksPostSplit := nbChunksPostSplit * 2
-        if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) {
-            // if postSplit we still have less tasks than available CPU
-            // or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
-            config.NbTasks /= 2
-            var _p G2Jac
-            chDone := make(chan struct{}, 1)
-            go func() {
-                _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
-                close(chDone)
-            }()
-            p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
-            <-chDone
-            p.AddAssign(&_p)
-            return p, nil
+    // should we recursively split the msm in half? (see below)
+    // we want to minimize the execution time of the algorithm;
+    // splitting the msm will **add** operations, but if it allows us to use more CPUs, it might be worth it.
+
+    // costFunction returns a metric that represents the "wall time" of the algorithm
+    costFunction := func(nbTasks, nbCpus, costPerTask int) int {
+        // cost for the reduction of all tasks (msmReduceChunk)
+        totalCost := nbTasks
+
+        // cost for the computation of each task (msmProcessChunk)
+        for nbTasks >= nbCpus {
+            nbTasks -= nbCpus
+            totalCost += costPerTask
+        }
+        if nbTasks > 0 {
+            totalCost += costPerTask
+        }
+        return totalCost
+    }
+
+    // costPerTask is the approximate number of group ops per task
+    costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) }
+
+    costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))
+
+    cPostSplit := bestC(nbPoints / 2)
+    nbChunksPostSplit := int(computeNbChunks(cPostSplit))
+    costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))
+
+    // if the cost of the split msm is lower than the cost of the non-split msm, we split
+    if costPostSplit < costPreSplit {
+        config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
+        var _p G2Jac
+        chDone := make(chan struct{}, 1)
+        go func() {
+            _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
+            close(chDone)
+        }()
+        p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
+        <-chDone
+        p.AddAssign(&_p)
+        return p, nil
     }
+
+    // if we don't split, we use the best C we found
     _innerMsmG2(p, C, points, scalars, config)
 
     return p, nil
@@ -406,6 +469,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co
         chChunks[i] = make(chan g2JacExtended, 1)
     }
 
+    // we use a semaphore to limit the number of goroutines running concurrently
+    // (only if nbTasks < nbCPU)
+    var sem chan struct{}
+    if config.NbTasks < runtime.NumCPU() {
+        sem = make(chan struct{}, config.NbTasks*2) // *2 because if a chunk is overweight we split it in two
+        for i := 0; i < config.NbTasks; i++ {
+            sem <- struct{}{}
+        }
+        defer func() {
+            close(sem)
+        }()
+    }
+
     // the last chunk may be processed with a different method than the rest, as it could be smaller.
     n := len(points)
     for j := int(nbChunks - 1); j >= 0; j-- {
@@ -418,8 +494,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co
             // else what would happen is this go routine would finish much later than the others.
             chSplit := make(chan g2JacExtended, 2)
             split := n / 2
-            go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split])
-            go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n])
+
+            if sem != nil {
+                sem <- struct{}{} // add another token to the semaphore, since we split in two.
+            }
+            go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
+            go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
             go func(chunkID int) {
                 s1 := <-chSplit
                 s2 := <-chSplit
@@ -429,7 +509,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co
             }(j)
             continue
         }
-        go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
+        go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
     }
 
     return msmReduceChunkG2Affine(p, int(c), chChunks[:])
@@ -437,7 +517,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co
 
 // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk
 // to return the best algorithm to process the chunk.
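The split branch is a small fork-join: one half of the MSM runs on a fresh goroutine, the other half on the calling goroutine, and closing chDone publishes the first half's result before the final AddAssign. A minimal sketch of the same shape, with an integer sum standing in for MultiExp:

```go
package main

import "fmt"

func sum(xs []int) int {
	t := 0
	for _, x := range xs {
		t += x
	}
	return t
}

func main() {
	xs := []int{1, 2, 3, 4, 5, 6, 7, 8}
	half := len(xs) / 2

	var left int // plays the role of _p
	chDone := make(chan struct{}, 1)
	go func() {
		left = sum(xs[:half]) // first half on its own goroutine
		close(chDone)         // happens-before the receive below
	}()
	right := sum(xs[half:]) // second half on the calling goroutine
	<-chDone                // join, then combine (p.AddAssign(&_p) in the diff)
	fmt.Println(left + right)
}
```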
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -582,6 +662,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index c804991099..7d27c125c6 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index eeeb196809..c6f10cc799 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -97,7 +109,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -133,6 +151,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 0492475b39..7961763cf1 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index e4909fa9e1..b3c1f10e3f 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
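The processChunk*Jacobian bodies only gain the semaphore handshake; the pre-existing runningSum/total reduction between the acquire and the release appears here only as fragments. For context, the trick those fragments implement, computing the weighted sum of buckets with a single extra addition per bucket, looks like this as a toy integer analogue (integer adds in place of group adds; an illustration, not the library code):

```go
package main

import "fmt"

func main() {
	// bucket[k] holds the sum of all points whose current digit is k
	buckets := []int{0, 5, 0, 2, 7}

	runningSum, total := 0, 0
	for k := len(buckets) - 1; k > 0; k-- {
		runningSum += buckets[k] // suffix sum: bucket[k] + bucket[k+1] + ...
		total += runningSum      // adds bucket[k] exactly k times overall
	}
	fmt.Println(total) // 1*5 + 3*2 + 4*7 = 39
}
```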
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
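The semaphore set up in _innerMsmG1/_innerMsmG2 is the classic counting-semaphore-as-buffered-channel idiom: pre-fill NbTasks tokens, and have each worker receive a token before it starts and send it back before publishing its result. When sem is nil (NbTasks >= NumCPU, or the reference path in the tests) all of this is skipped. Stripped of the MSM details, the discipline is the following sketch:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	const nbTasks, nbChunks = 2, 8

	sem := make(chan struct{}, nbTasks)
	for i := 0; i < nbTasks; i++ {
		sem <- struct{}{} // pre-fill: at most nbTasks workers run at once
	}

	var wg sync.WaitGroup
	results := make([]int, nbChunks)
	for j := 0; j < nbChunks; j++ {
		wg.Add(1)
		go func(j int) {
			defer wg.Done()
			<-sem              // acquire a token; block if nbTasks are busy
			results[j] = j * j // stand-in for msmProcessChunk
			sem <- struct{}{}  // release before handing off the result
		}(j)
	}
	wg.Wait()
	fmt.Println(results)
}
```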
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -298,6 +338,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -332,7 +373,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -362,29 +403,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -406,6 +469,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -418,8 +494,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -429,7 +509,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -437,7 +517,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
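When the last chunk is overweight, the loop injects one extra token before forking the two halves; that is also why the channel is allocated with capacity config.NbTasks*2 rather than config.NbTasks, so a release never blocks even while split chunks temporarily hold more tokens than the budget. A toy rendering of that flow (integer sums instead of bucket accumulation):

```go
package main

import "fmt"

func processHalf(xs []int, ch chan<- int, sem chan struct{}) {
	<-sem // same token discipline as a regular chunk
	t := 0
	for _, x := range xs {
		t += x
	}
	sem <- struct{}{} // release before publishing, as in the diff
	ch <- t
}

func main() {
	const nbTasks = 1
	sem := make(chan struct{}, nbTasks*2) // *2 leaves room for the extra token
	sem <- struct{}{}                     // pre-fill nbTasks (here, one) tokens

	xs := []int{3, 1, 4, 1, 5, 9, 2, 6}
	split := len(xs) / 2

	chSplit := make(chan int, 2)
	sem <- struct{}{} // extra token: the split doubles the goroutine count
	go processHalf(xs[:split], chSplit, sem)
	go processHalf(xs[split:], chSplit, sem)

	s1, s2 := <-chSplit, <-chSplit
	fmt.Println(s1 + s2) // both halves reduce into a single result
}
```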
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -582,6 +662,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index ba8135b40f..cbbb297004 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index a07c9e874f..a1eb686cb7 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -97,7 +109,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -133,6 +151,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index e48db59fd5..efd3fb3709 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 3650ae1e8b..9204858aef 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
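Seen from a caller, the change is behavioral: leaving NbTasks unset now defaults to 2*runtime.NumCPU(), and an explicit NbTasks genuinely caps concurrency through the semaphore instead of acting as a loose hint. A usage sketch against the bn254 package (illustrative; the generator-copies input exists only to make the snippet self-contained):

```go
package main

import (
	"fmt"
	"runtime"

	"github.com/consensys/gnark-crypto/ecc"
	"github.com/consensys/gnark-crypto/ecc/bn254"
	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
)

func main() {
	const n = 1 << 10

	// n copies of the generator with random scalars; a real caller would
	// bring its own points (an SRS, a commitment basis, ...).
	_, _, g, _ := bn254.Generators()
	points := make([]bn254.G1Affine, n)
	scalars := make([]fr.Element, n)
	for i := range points {
		points[i] = g
		scalars[i].SetRandom()
	}

	var p bn254.G1Jac
	// cap the MSM at half the machine; NbTasks <= 0 would default to 2*NumCPU
	if _, err := p.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: runtime.NumCPU() / 2}); err != nil {
		panic(err)
	}
	fmt.Println(p.Z.IsZero()) // false for a non-trivial result
}
```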
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -300,6 +340,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -334,7 +375,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -364,29 +405,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -408,6 +471,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -420,8 +496,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -431,7 +511,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -439,7 +519,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
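The slice expression digits[j*n:(j+1)*n] in the chunk loops relies on partitionScalars laying the decomposition out chunk-major: for n points, chunk j's digit of every point sits in one contiguous window. A toy decomposition showing just that layout (unsigned digits only; the real partitionScalars also applies the negative-digit trick described in its comment and gathers chunkStat):

```go
package main

import "fmt"

func main() {
	scalars := []uint{27, 6, 13} // n = 3 "scalars"
	const c, nbChunks = 2, 3     // three chunks of c bits each

	n := len(scalars)
	digits := make([]uint16, nbChunks*n)
	for i, s := range scalars {
		for j := 0; j < nbChunks; j++ {
			// chunk-major: chunk j's digits for all points are contiguous
			digits[j*n+i] = uint16((s >> (c * j)) & ((1 << c) - 1))
		}
	}
	for j := 0; j < nbChunks; j++ {
		// exactly the window a processChunk goroutine receives
		fmt.Println("chunk", j, digits[j*n:(j+1)*n])
	}
}
```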
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -586,6 +666,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 81078a1438..e4b3bfced9 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -37,7 +37,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -226,6 +232,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -353,7 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -542,6 +560,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 53490b9cc0..f9b8a901bd 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -99,7 +111,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -135,6 +153,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 17989d3689..e1f848e9f2 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index cbab47917f..3e0ad3498e 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 4: @@ -247,6 +287,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -281,7 +322,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -311,29 +352,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -355,6 +418,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -367,8 +443,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -378,7 +458,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -386,7 +466,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
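getChunkProcessorG1/getChunkProcessorG2 change only in their signatures: every specialization the switch can return now takes the semaphore as a trailing parameter, and callers that want no limit, such as the reference implementations in the tests, pass nil. A toy version of that dispatch shape (the processors are stand-ins, not the real jacobian or batch-affine variants):

```go
package main

import "fmt"

type processor func(chunkID uint64, chRes chan<- int, sem chan struct{})

func getProcessor(c uint64) processor {
	// stand-in specialization; the real code picks a bucket type per window size
	jacobian := func(chunkID uint64, chRes chan<- int, sem chan struct{}) {
		if sem != nil {
			<-sem // acquire only when a limit is in force
		}
		result := int(chunkID) // placeholder for the bucket accumulation
		if sem != nil {
			sem <- struct{}{} // release before publishing
		}
		chRes <- result
	}
	switch {
	case c <= 9:
		return jacobian // small windows: plain jacobian buckets
	default:
		return jacobian // larger windows would select a batch-affine variant
	}
}

func main() {
	chRes := make(chan int, 1)
	getProcessor(16)(7, chRes, nil) // nil sem = unlimited, as in the tests
	fmt.Println(<-chRes)
}
```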
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 4: @@ -480,6 +560,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 2c9eaa8ac1..afa4a2bf31 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -36,7 +36,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -225,6 +231,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -292,7 +304,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -481,6 +499,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index a79e632607..e8d707ec00 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -81,7 +93,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -117,6 +135,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 74f2ef86eb..1e45861b77 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 3e691db6c9..9129701454 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
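The sem parameter threaded through the processChunk* variants above is a plain counting semaphore built from a buffered channel of empty structs. As a standalone illustration (not part of the patch itself), the acquire/release discipline looks like this:

package main

import (
	"fmt"
	"sync"
)

func main() {
	nbTasks := 2
	// a counting semaphore: a buffered channel pre-filled with nbTasks tokens
	sem := make(chan struct{}, nbTasks)
	for i := 0; i < nbTasks; i++ {
		sem <- struct{}{}
	}

	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			<-sem // acquire: at most nbTasks chunks run at once
			fmt.Println("processing chunk", id)
			sem <- struct{}{} // release before publishing the result
		}(i)
	}
	wg.Wait()
}

Releasing before the result is sent matters: the reducer drains result channels in chunk order, so a goroutine that is blocked on delivery should not keep holding a CPU slot.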
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
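For intuition about the heuristic that follows, here is a small, runnable walk-through of the cost model with toy numbers (the function body mirrors the costFunction introduced by this patch; the numbers are illustrative):

package main

import "fmt"

// costFunction mirrors the heuristic in this patch: reduction costs ~1 unit
// per task, and tasks execute in rounds of nbCpus, each round costing
// costPerTask of "wall time".
func costFunction(nbTasks, nbCpus, costPerTask int) int {
	totalCost := nbTasks // msmReduceChunk part
	for nbTasks >= nbCpus {
		nbTasks -= nbCpus
		totalCost += costPerTask
	}
	if nbTasks > 0 {
		totalCost += costPerTask
	}
	return totalCost
}

func main() {
	// 10 chunks on 4 CPUs, 100 "group ops" per chunk:
	// 3 rounds of work (4+4+2) -> 10 + 3*100 = 310
	fmt.Println(costFunction(10, 4, 100))
	// after a hypothetical split: 20 smaller chunks of 60 ops each:
	// 5 full rounds -> 20 + 5*60 = 320, so splitting would not pay off here
	fmt.Println(costFunction(20, 4, 60))
}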
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -247,6 +287,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -281,7 +322,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -311,29 +352,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
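The split branch itself is a plain fork-join: one half runs on a fresh goroutine, the other on the current one, and the partial results are merged (p.AddAssign(&_p) in the real code). A minimal, self-contained sketch of the pattern, with integer sums standing in for the Jacobian accumulators:

package main

import "fmt"

// sum sketches the fork-join used when splitting pays off.
func sum(xs []int) int {
	if len(xs) <= 2 {
		total := 0
		for _, x := range xs {
			total += x
		}
		return total
	}
	half := len(xs) / 2
	chDone := make(chan int, 1)
	go func() { chDone <- sum(xs[:half]) }()
	total := sum(xs[half:])
	return total + <-chDone // merge, like p.AddAssign(&_p)
}

func main() {
	fmt.Println(sum([]int{1, 2, 3, 4, 5})) // 15
}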
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -355,6 +418,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -367,8 +443,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -378,7 +458,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -386,7 +466,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 3: @@ -480,6 +560,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 4a5df9652f..edcf8f1b8d 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -36,7 +36,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -225,6 +231,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -292,7 +304,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -481,6 +499,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 9251324d15..503e51b341 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -81,7 +93,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -117,6 +135,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index d454ef78df..64b515445d 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 47aa334cc0..161132e0b5 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
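Just below, NbTasks is normalised before the split heuristic runs. A minimal sketch of that defaulting logic, mirroring the checks in this diff (normalizeNbTasks is our name for illustration, not a library function):

package main

import (
	"errors"
	"fmt"
	"runtime"
)

// normalizeNbTasks sketches the NbTasks defaulting done by MultiExp: unset
// values now default to twice the CPU count (giving the split heuristic
// headroom), and absurd values are rejected.
func normalizeNbTasks(nbTasks int) (int, error) {
	if nbTasks <= 0 {
		return runtime.NumCPU() * 2, nil
	}
	if nbTasks > 1024 {
		return 0, errors.New("invalid config: config.NbTasks > 1024")
	}
	return nbTasks, nil
}

func main() {
	n, _ := normalizeNbTasks(0)
	fmt.Println("default NbTasks:", n)
}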
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -249,6 +289,7 @@ func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. // note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -283,7 +324,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -313,29 +354,51 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G2Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
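The costPerTask helper used below reflects the bucket method: each chunk costs roughly one bucket addition per point, plus on the order of 2^c operations to fold the 2^c buckets at the end. This is why halving nbPoints (and usually c with it) shrinks each task, at the price of more tasks overall. A tiny, runnable illustration:

package main

import "fmt"

// costPerTask approximates the group operations in one chunk of the bucket
// method: ~1 bucket addition per point + ~2^c ops for the bucket reduction.
func costPerTask(c uint64, nbPoints int) int { return nbPoints + int(1<<c) }

func main() {
	// for 2^20 points and a window of c=16: ~1M + 65k ops per chunk
	fmt.Println(costPerTask(16, 1<<20)) // 1114112
}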
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost + } + + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil } + // if we don't split, we use the best C we found _innerMsmG2(p, C, points, scalars, config) return p, nil @@ -357,6 +420,19 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co chChunks[i] = make(chan g2JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -369,8 +445,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -380,7 +460,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -388,7 +468,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co // getChunkProcessorG2 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -484,6 +564,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 1e67370777..38f11c5b03 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -36,7 +36,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -225,6 +231,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -292,7 +304,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying @@ -481,6 +499,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 16d7e6c8d3..2401259a33 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } @@ -83,7 +95,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -119,6 +137,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 66e6e17241..d98bd688b6 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(16), chChunks[:]) @@ -718,7 +718,7 @@ func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] - go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG2Affine(p, int(16), chChunks[:]) diff --git a/ecc/secp256k1/multiexp.go b/ecc/secp256k1/multiexp.go index 02febdfef6..9977c2b345 100644 --- a/ecc/secp256k1/multiexp.go +++ b/ecc/secp256k1/multiexp.go @@ -41,6 +41,7 @@ func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc. // // This call return an error if len(scalars) != len(points) or if provided config is invalid. func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // TODO @gbotrel replace the ecc.MultiExpConfig by a Option pattern for maintainability. 
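One subtlety in the _innerMsm* loops above: when the last, overweight chunk is split in two, one extra token is pushed so the two halves together consume a single task slot, and the semaphore capacity of NbTasks*2 guarantees that push can never block. A standalone sketch of the token accounting (toy workload, not the patch's code):

package main

import (
	"fmt"
	"sync"
)

func main() {
	nbTasks := 2
	// capacity nbTasks*2: the extra token sent on a split never blocks,
	// even if every regular token is already back in the channel.
	sem := make(chan struct{}, nbTasks*2)
	for i := 0; i < nbTasks; i++ {
		sem <- struct{}{}
	}

	var wg sync.WaitGroup
	process := func(name string) {
		defer wg.Done()
		<-sem
		fmt.Println("processing", name)
		sem <- struct{}{}
	}

	wg.Add(2)
	sem <- struct{}{} // the two halves of a split chunk count as one extra task
	go process("first half")
	go process("second half")
	wg.Wait()
}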
// note: // each of the msmCX method is the same, except for the c constant it declares // duplicating (through template generation) these methods allows to declare the buckets on the stack @@ -75,7 +76,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // if nbTasks is not set, use all available CPUs if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() + config.NbTasks = runtime.NumCPU() * 2 } else if config.NbTasks > 1024 { return nil, errors.New("invalid config: config.NbTasks > 1024") } @@ -105,29 +106,51 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C := bestC(nbPoints) nbChunks := int(computeNbChunks(C)) - // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split - if config.NbTasks > 1 && nbChunks < config.NbTasks { - // before splitting, let's see if we end up with more tasks than thread; - cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(computeNbChunks(cSplit)) - nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { - // if postSplit we still have less tasks than available CPU - // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. - config.NbTasks /= 2 - var _p G1Jac - chDone := make(chan struct{}, 1) - go func() { - _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) - close(chDone) - }() - p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) - <-chDone - p.AddAssign(&_p) - return p, nil + // should we recursively split the msm in half? (see below) + // we want to minimize the execution time of the algorithm; + // splitting the msm will **add** operations, but if it allows to use more CPU, it might be worth it. 
+ + // costFunction returns a metric that represent the "wall time" of the algorithm + costFunction := func(nbTasks, nbCpus, costPerTask int) int { + // cost for the reduction of all tasks (msmReduceChunk) + totalCost := nbTasks + + // cost for the computation of each task (msmProcessChunk) + for nbTasks >= nbCpus { + nbTasks -= nbCpus + totalCost += costPerTask + } + if nbTasks > 0 { + totalCost += costPerTask } + return totalCost } + // costPerTask is the approximate number of group ops per task + costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) } + + costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints)) + + cPostSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(computeNbChunks(cPostSplit)) + costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2)) + + // if the cost of the split msm is lower than the cost of the non split msm, we split + if costPostSplit < costPreSplit { + config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0)) + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + + // if we don't split, we use the best C we found _innerMsmG1(p, C, points, scalars, config) return p, nil @@ -149,6 +172,19 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co chChunks[i] = make(chan g1JacExtended, 1) } + // we use a semaphore to limit the number of go routines running concurrently + // (only if nbTasks < nbCPU) + var sem chan struct{} + if config.NbTasks < runtime.NumCPU() { + sem = make(chan struct{}, config.NbTasks*2) // *2 because if chunk is overweight we split it in two + for i := 0; i < config.NbTasks; i++ { + sem <- struct{}{} + } + defer func() { + close(sem) + }() + } + // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { @@ -161,8 +197,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + + if sem != nil { + sem <- struct{}{} // add another token to the semaphore, since we split in two. + } + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem) go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit @@ -172,7 +212,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co }(j) continue } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -180,7 +220,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co // getChunkProcessorG1 decides, depending on c window size and statistics for the chunk // to return the best algorithm to process the chunk. 
-func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) { switch c { case 2: @@ -318,6 +358,11 @@ type chunkStat struct { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) { + // no benefit here to have more tasks than CPUs + if nbTasks > runtime.NumCPU() { + nbTasks = runtime.NumCPU() + } + // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) diff --git a/ecc/secp256k1/multiexp_affine.go b/ecc/secp256k1/multiexp_affine.go index a7a6c0c16a..9cb8e2e167 100644 --- a/ecc/secp256k1/multiexp_affine.go +++ b/ecc/secp256k1/multiexp_affine.go @@ -36,7 +36,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } // the batch affine addition needs independent points; in other words, for a window of batchSize // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying @@ -225,6 +231,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/secp256k1/multiexp_jacobian.go b/ecc/secp256k1/multiexp_jacobian.go index 788e646617..40a34f8fae 100644 --- a/ecc/secp256k1/multiexp_jacobian.go +++ b/ecc/secp256k1/multiexp_jacobian.go @@ -20,7 +20,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint16) { + digits []uint16, + sem chan struct{}) { + + if sem != nil { + // if we are limited, wait for a token in the semaphore + <-sem + } var buckets B for i := 0; i < len(buckets); i++ { @@ -56,6 +62,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, total.add(&runningSum) } + if sem != nil { + // release a token to the semaphore + // before sending to chRes + sem <- struct{}{} + } + chRes <- total } diff --git a/ecc/secp256k1/multiexp_test.go b/ecc/secp256k1/multiexp_test.go index e0ad2cf8d1..87cbd1575c 100644 --- a/ecc/secp256k1/multiexp_test.go +++ b/ecc/secp256k1/multiexp_test.go @@ -306,7 +306,7 @@ func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, con n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { processChunk := processChunkG1Jacobian[bucketg1JacExtendedC15] - go processChunk(uint64(j), chChunks[j], 15, points, digits[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], 15, points, digits[j*n:(j+1)*n], nil) } return msmReduceChunkG1Affine(p, int(15), chChunks[:]) diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index b79994d7f5..1096439278 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -69,6 +69,11 @@ type chunkStat struct { // negative digits can be 
processed in a later step as adding -G into the bucket instead of G
 // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
 func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) {
+	// no benefit here to have more tasks than CPUs
+	if nbTasks > runtime.NumCPU() {
+		nbTasks = runtime.NumCPU()
+	}
+
 	// number of c-bit radixes in a scalar
 	nbChunks := computeNbChunks(c)

@@ -240,6 +245,7 @@ func (p *{{ $.TAffine }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elemen
 //
 // This call return an error if len(scalars) != len(points) or if provided config is invalid.
 func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TJacobian }}, error) {
+	// TODO @gbotrel replace the ecc.MultiExpConfig with an Option pattern for maintainability.
 	// note:
 	// each of the msmCX method is the same, except for the c constant it declares
 	// duplicating (through template generation) these methods allows to declare the buckets on the stack
@@ -274,7 +280,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem

 	// if nbTasks is not set, use all available CPUs
 	if config.NbTasks <= 0 {
-		config.NbTasks = runtime.NumCPU()
+		config.NbTasks = runtime.NumCPU() * 2
 	} else if config.NbTasks > 1024 {
 		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
@@ -306,29 +312,51 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem
 	C := bestC(nbPoints)
 	nbChunks := int(computeNbChunks(C))

-	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
-	if config.NbTasks > 1 && nbChunks < config.NbTasks {
-		// before splitting, let's see if we end up with more tasks than thread;
-		cSplit := bestC(nbPoints/2)
-		nbChunksPostSplit := int(computeNbChunks(cSplit))
-		nbTasksPostSplit := nbChunksPostSplit*2
-		if (nbTasksPostSplit <= config.NbTasks /2 ) || ( nbTasksPostSplit - config.NbTasks/2 ) <= ( config.NbTasks - nbChunks) {
-			// if postSplit we still have less tasks than available CPU
-			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
-			config.NbTasks /= 2
-			var _p {{ $.TJacobian }}
-			chDone := make(chan struct{}, 1)
-			go func() {
-				_p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
-				close(chDone)
-			}()
-			p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
-			<-chDone
-			p.AddAssign(&_p)
-			return p, nil
+	// should we recursively split the msm in half? (see below)
+	// we want to minimize the execution time of the algorithm;
+	// splitting the msm will **add** operations, but if it allows us to use more CPUs, it might be worth it.
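Since the per-curve files above are generated from this template, here is an end-to-end toy of the decision the template encodes. wallTime condenses the patch's costFunction into its closed form, and bestC/computeNbChunks are hypothetical stand-ins (the real ones are curve-specific lookups), so only the shape of the trade-off carries over:

package main

import (
	"fmt"
	"math"
)

// wallTime condenses costFunction: tasks run in rounds of nbCpus, each round
// costing costPerTask, plus ~1 unit per task for the reduction.
func wallTime(nbTasks, nbCpus, costPerTask int) int {
	rounds := int(math.Ceil(float64(nbTasks) / float64(nbCpus)))
	return nbTasks + rounds*costPerTask
}

// Toy stand-ins: a ~254-bit scalar field and a fixed window size.
func bestC(nbPoints int) uint64       { return 16 }
func computeNbChunks(c uint64) uint64 { return (254 + c - 1) / c }
func costPerTask(c uint64, n int) int { return n + int(1<<c) }

func shouldSplit(nbPoints, nbTasks int) bool {
	c := bestC(nbPoints)
	pre := wallTime(int(computeNbChunks(c)), nbTasks, costPerTask(c, nbPoints))
	cs := bestC(nbPoints / 2)
	post := wallTime(int(computeNbChunks(cs))*2, nbTasks, costPerTask(cs, nbPoints/2))
	return post < pre
}

func main() {
	for _, nbTasks := range []int{4, 48, 192} {
		// prints: 4 false, 48 true, 192 true
		fmt.Println(nbTasks, "tasks, 2^20 points -> split:", shouldSplit(1<<20, nbTasks))
	}
}

With few CPUs the split only doubles the number of rounds, so it loses; once chunks no longer fill the machine, halving each task wins. That is the 96-core case the commit message targets.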
+
+	// costFunction returns a metric that represents the "wall time" of the algorithm
+	costFunction := func(nbTasks, nbCpus, costPerTask int) int {
+		// cost for the reduction of all tasks (msmReduceChunk)
+		totalCost := nbTasks
+
+		// cost for the computation of each task (msmProcessChunk)
+		for nbTasks >= nbCpus {
+			nbTasks -= nbCpus
+			totalCost += costPerTask
+		}
+		if nbTasks > 0 {
+			totalCost += costPerTask
+		}
+		return totalCost
+	}
+
+	// costPerTask is the approximate number of group ops per task
+	costPerTask := func(c uint64, nbPoints int) int {return (nbPoints + int((1 << c)))}
+
+	costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))
+
+	cPostSplit := bestC(nbPoints/2)
+	nbChunksPostSplit := int(computeNbChunks(cPostSplit))
+	costPostSplit := costFunction(nbChunksPostSplit * 2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))
+
+	// if the cost of the split msm is lower than the cost of the non split msm, we split
+	if costPostSplit < costPreSplit {
+		config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
+		var _p {{ $.TJacobian }}
+		chDone := make(chan struct{}, 1)
+		go func() {
+			_p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
+			close(chDone)
+		}()
+		p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
+		<-chDone
+		p.AddAssign(&_p)
+		return p, nil
 	}
+	// if we don't split, we use the best C we found
 	_innerMsm{{ $.UPointName }}(p, C, points, scalars, config)
 	return p, nil
@@ -350,6 +378,19 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 		chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1)
 	}

+	// we use a semaphore to limit the number of goroutines running concurrently
+	// (only if nbTasks < nbCPU)
+	var sem chan struct{}
+	if config.NbTasks < runtime.NumCPU() {
+		sem = make(chan struct{}, config.NbTasks * 2) // *2 because if a chunk is overweight, we split it in two
+		for i:=0; i < config.NbTasks; i++ {
+			sem <- struct{}{}
+		}
+		defer func() {
+			close(sem)
+		}()
+	}
+
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
 	for j := int(nbChunks - 1); j >= 0; j-- {
@@ -362,8 +403,12 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 			// else what would happen is this go routine would finish much later than the others.
 			chSplit := make(chan {{ $.TJacobianExtended }}, 2)
 			split := n / 2
-			go processChunk(uint64(j),chSplit, c, points[:split], digits[j*n:(j*n)+split])
-			go processChunk(uint64(j),chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n])
+
+			if sem != nil {
+				sem <- struct{}{} // add another token to the semaphore, since we split in two.
+			}
+			go processChunk(uint64(j),chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
+			go processChunk(uint64(j),chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
 			go func(chunkID int) {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -373,7 +418,7 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 			}(j)
 			continue
 		}
-		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
+		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
 	}

 	return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:])
@@ -382,7 +427,7 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T

 // getChunkProcessor{{ $.UPointName }} decides, depending on c window size and statistics for the chunk
 // to return the best algorithm to process the chunk.
-func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) {
+func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16, sem chan struct{}) {
 	switch c {
 	{{- range $c := $.LastCRange}}
 	case {{$c}}:
diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl
index 6c4672b3de..a1c4dd80aa 100644
--- a/internal/generator/ecc/template/multiexp_affine.go.tmpl
+++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl
@@ -38,7 +38,13 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B
 	chRes chan<- {{ $.TJacobianExtended }},
 	c uint64,
 	points []{{ $.TAffine }},
-	digits []uint16) {
+	digits []uint16,
+	sem chan struct{}) {
+
+	if sem != nil {
+		// if we are limited, wait for a token in the semaphore
+		<-sem
+	}

 	// the batch affine addition needs independent points; in other words, for a window of batchSize
 	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
@@ -230,6 +236,13 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B
 		total.add(&runningSum)
 	}
+
+	if sem != nil {
+		// release a token to the semaphore
+		// before sending to chRes
+		sem <- struct{}{}
+	}
+
 	chRes <- total
 }

diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl
index 89b4fbe80e..95dfcdeb39 100644
--- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl
+++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl
@@ -21,9 +21,13 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk
 	chRes chan<- {{ $.TJacobianExtended }},
 	c uint64,
 	points []{{ $.TAffine }},
-	digits []uint16) {
-
+	digits []uint16,
+	sem chan struct{}) {
+	if sem != nil {
+		// if we are limited, wait for a token in the semaphore
+		<-sem
+	}
 	var buckets B
 	for i := 0 ; i < len(buckets); i++ {
@@ -60,6 +64,12 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk
 		total.add(&runningSum)
 	}
+	if sem != nil {
+		// release a token to the semaphore
+		// before sending to chRes
+		sem <- struct{}{}
+	}
+
 	chRes <- total
 }

diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl
index 0201ada492..3feb22ffe2 100644
--- a/internal/generator/ecc/template/tests/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl
@@ -333,7 +333,7 @@ func _innerMsm{{ $.UPointName }}Reference(p *{{ $.TJacobian }}, points []{{ $.TA
 	n := len(points)
 	for j := int(nbChunks - 1); j >= 0; j-- {
 		processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$.cmax}}]
-		go processChunk(uint64(j), chChunks[j], {{$.cmax}}, points, digits[j*n:(j+1)*n])
+		go processChunk(uint64(j), chChunks[j], {{$.cmax}}, points, digits[j*n:(j+1)*n], nil)
 	}

 	return msmReduceChunk{{ $.TAffine }}(p, int({{$.cmax}}), chChunks[:])
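Finally, a usage sketch from the caller's side: with the semaphore in place, capping NbTasks now actually bounds concurrency instead of only sizing the work split. The sizes below are illustrative; the imports are the library's public bn254 packages:

package main

import (
	"fmt"
	"runtime"

	"github.com/consensys/gnark-crypto/ecc"
	"github.com/consensys/gnark-crypto/ecc/bn254"
	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
)

func main() {
	const n = 1 << 10
	_, _, g1Aff, _ := bn254.Generators()

	points := make([]bn254.G1Affine, n)
	scalars := make([]fr.Element, n)
	for i := 0; i < n; i++ {
		points[i] = g1Aff // toy input: every entry is the generator
		scalars[i].SetRandom()
	}

	var p bn254.G1Affine
	// cap the MSM at half the cores; the semaphore added by this patch keeps
	// the rest of the machine available for other work.
	if _, err := p.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: runtime.NumCPU() / 2}); err != nil {
		panic(err)
	}
	fmt.Println(p.X.String())
}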