Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

connection pool: max idle connections implementation #17443

Merged
merged 8 commits into from
Jan 10, 2025
3 changes: 3 additions & 0 deletions go/flags/endtoend/vtcombo.txt
Original file line number Diff line number Diff line change
Expand Up @@ -282,18 +282,21 @@ Flags:
--queryserver-config-pool-conn-max-lifetime duration query server connection max lifetime, vttablet manages various mysql connection pools. This config means if a connection has lived at least this long, it connection will be removed from pool upon the next time it is returned to the pool.
--queryserver-config-pool-size int query server read pool size, connection pool is used by regular queries (non streaming, not in a transaction) (default 16)
--queryserver-config-query-cache-memory int query server query cache size in bytes, maximum amount of memory to be used for caching. vttablet analyzes every incoming query and generate a query plan, these plans are being cached in a lru cache. This config controls the capacity of the lru cache. (default 33554432)
--queryserver-config-query-pool-max-idle-count int query server query pool - maximum number of idle connections to retain in the pool. Use this to balance between faster response times during traffic bursts and resource efficiency during low-traffic periods.
--queryserver-config-query-pool-timeout duration query server query pool timeout, it is how long vttablet waits for a connection from the query pool. If set to 0 (default) then the overall query timeout is used instead.
--queryserver-config-query-timeout duration query server query timeout, this is the query timeout in vttablet side. If a query takes more than this timeout, it will be killed. (default 30s)
--queryserver-config-schema-change-signal query server schema signal, will signal connected vtgates that schema has changed whenever this is detected. VTGates will need to have -schema_change_signal enabled for this to work (default true)
--queryserver-config-schema-reload-time duration query server schema reload time, how often vttablet reloads schemas from underlying MySQL instance. vttablet keeps table schemas in its own memory and periodically refreshes it from MySQL. This config controls the reload time. (default 30m0s)
--queryserver-config-stream-buffer-size int query server stream buffer size, the maximum number of bytes sent from vttablet for each stream call. It's recommended to keep this value in sync with vtgate's stream_buffer_size. (default 32768)
--queryserver-config-stream-pool-max-idle-count int query server stream pool - maximum number of idle connections to retain in the pool. Use this to balance between faster response times during traffic bursts and resource efficiency during low-traffic periods.
--queryserver-config-stream-pool-size int query server stream connection pool size, stream pool is used by stream queries: queries that return results to client in a streaming fashion (default 200)
--queryserver-config-stream-pool-timeout duration query server stream pool timeout, it is how long vttablet waits for a connection from the stream pool. If set to 0 (default) then there is no timeout.
--queryserver-config-strict-table-acl only allow queries that pass table acl checks
--queryserver-config-terse-errors prevent bind vars from escaping in client error messages
--queryserver-config-transaction-cap int query server transaction cap is the maximum number of transactions allowed to happen at any given point of a time for a single vttablet. E.g. by setting transaction cap to 100, there are at most 100 transactions will be processed by a vttablet and the 101th transaction will be blocked (and fail if it cannot get connection within specified timeout) (default 20)
--queryserver-config-transaction-timeout duration query server transaction timeout, a transaction will be killed if it takes longer than this value (default 30s)
--queryserver-config-truncate-error-len int truncate errors sent to client if they are longer than this value (0 means do not truncate)
--queryserver-config-txpool-max-idle-count int query server transaction pool - maximum number of idle connections to retain in the pool. Use this to balance between faster response times during traffic bursts and resource efficiency during low-traffic periods.
--queryserver-config-txpool-timeout duration query server transaction pool timeout, it is how long vttablet waits if tx pool is full (default 1s)
--queryserver-config-warn-result-size int query server result size warning threshold, warn if number of rows returned from vttablet for non-streaming queries exceeds this
--queryserver-enable-views Enable views support in vttablet.
Expand Down
3 changes: 3 additions & 0 deletions go/flags/endtoend/vttablet.txt
Original file line number Diff line number Diff line change
Expand Up @@ -274,18 +274,21 @@ Flags:
--queryserver-config-pool-conn-max-lifetime duration query server connection max lifetime, vttablet manages various mysql connection pools. This config means if a connection has lived at least this long, it connection will be removed from pool upon the next time it is returned to the pool.
--queryserver-config-pool-size int query server read pool size, connection pool is used by regular queries (non streaming, not in a transaction) (default 16)
--queryserver-config-query-cache-memory int query server query cache size in bytes, maximum amount of memory to be used for caching. vttablet analyzes every incoming query and generate a query plan, these plans are being cached in a lru cache. This config controls the capacity of the lru cache. (default 33554432)
--queryserver-config-query-pool-max-idle-count int query server query pool - maximum number of idle connections to retain in the pool. Use this to balance between faster response times during traffic bursts and resource efficiency during low-traffic periods.
--queryserver-config-query-pool-timeout duration query server query pool timeout, it is how long vttablet waits for a connection from the query pool. If set to 0 (default) then the overall query timeout is used instead.
--queryserver-config-query-timeout duration query server query timeout, this is the query timeout in vttablet side. If a query takes more than this timeout, it will be killed. (default 30s)
--queryserver-config-schema-change-signal query server schema signal, will signal connected vtgates that schema has changed whenever this is detected. VTGates will need to have -schema_change_signal enabled for this to work (default true)
--queryserver-config-schema-reload-time duration query server schema reload time, how often vttablet reloads schemas from underlying MySQL instance. vttablet keeps table schemas in its own memory and periodically refreshes it from MySQL. This config controls the reload time. (default 30m0s)
--queryserver-config-stream-buffer-size int query server stream buffer size, the maximum number of bytes sent from vttablet for each stream call. It's recommended to keep this value in sync with vtgate's stream_buffer_size. (default 32768)
--queryserver-config-stream-pool-max-idle-count int query server stream pool - maximum number of idle connections to retain in the pool. Use this to balance between faster response times during traffic bursts and resource efficiency during low-traffic periods.
--queryserver-config-stream-pool-size int query server stream connection pool size, stream pool is used by stream queries: queries that return results to client in a streaming fashion (default 200)
--queryserver-config-stream-pool-timeout duration query server stream pool timeout, it is how long vttablet waits for a connection from the stream pool. If set to 0 (default) then there is no timeout.
--queryserver-config-strict-table-acl only allow queries that pass table acl checks
--queryserver-config-terse-errors prevent bind vars from escaping in client error messages
--queryserver-config-transaction-cap int query server transaction cap is the maximum number of transactions allowed to happen at any given point of a time for a single vttablet. E.g. by setting transaction cap to 100, there are at most 100 transactions will be processed by a vttablet and the 101th transaction will be blocked (and fail if it cannot get connection within specified timeout) (default 20)
--queryserver-config-transaction-timeout duration query server transaction timeout, a transaction will be killed if it takes longer than this value (default 30s)
--queryserver-config-truncate-error-len int truncate errors sent to client if they are longer than this value (0 means do not truncate)
--queryserver-config-txpool-max-idle-count int query server transaction pool - maximum number of idle connections to retain in the pool. Use this to balance between faster response times during traffic bursts and resource efficiency during low-traffic periods.
--queryserver-config-txpool-timeout duration query server transaction pool timeout, it is how long vttablet waits if tx pool is full (default 1s)
--queryserver-config-warn-result-size int query server result size warning threshold, warn if number of rows returned from vttablet for non-streaming queries exceeds this
--queryserver-enable-views Enable views support in vttablet.
Expand Down
48 changes: 48 additions & 0 deletions go/pools/smartconnpool/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ type RefreshCheck func() (bool, error)

type Config[C Connection] struct {
Capacity int64
MaxIdleCount int64
IdleTimeout time.Duration
MaxLifetime time.Duration
RefreshInterval time.Duration
Expand Down Expand Up @@ -123,6 +124,8 @@ type ConnPool[C Connection] struct {
active atomic.Int64
// capacity is the maximum number of connections that this pool can open
capacity atomic.Int64
// idleCount is the number of idle connections currently allowed in the pool
idleCount atomic.Int64

// workers is a waitgroup for all the currently running worker goroutines
workers sync.WaitGroup
Expand All @@ -138,6 +141,8 @@ type ConnPool[C Connection] struct {
// maxCapacity is the maximum value to which capacity can be set; when the pool
// is re-opened, it defaults to this capacity
maxCapacity int64
// maxIdleCount is the maximum idle connections in the pool
maxIdleCount int64
// maxLifetime is the maximum time a connection can be open
maxLifetime atomic.Int64
// idleTimeout is the maximum time a connection can remain idle
Expand All @@ -158,6 +163,7 @@ func NewPool[C Connection](config *Config[C]) *ConnPool[C] {
pool := &ConnPool[C]{}
pool.freshSettingsStack.Store(-1)
pool.config.maxCapacity = config.Capacity
pool.config.maxIdleCount = config.MaxIdleCount
pool.config.maxLifetime.Store(config.MaxLifetime.Nanoseconds())
pool.config.idleTimeout.Store(config.IdleTimeout.Nanoseconds())
pool.config.refreshInterval.Store(config.RefreshInterval.Nanoseconds())
Expand Down Expand Up @@ -192,6 +198,7 @@ func (pool *ConnPool[C]) runWorker(close <-chan struct{}, interval time.Duration
func (pool *ConnPool[C]) open() {
pool.close = make(chan struct{})
pool.capacity.Store(pool.config.maxCapacity)
pool.setIdleCount()

// The expire worker takes care of removing from the waiter list any clients whose
// context has been cancelled.
Expand Down Expand Up @@ -315,6 +322,16 @@ func (pool *ConnPool[C]) MaxCapacity() int64 {
return pool.config.maxCapacity
}

// setIdleCount recomputes the allowed number of idle connections from the
// pool's current capacity and the configured maxIdleCount. When maxIdleCount is
// zero, or larger than the capacity, the allowed idle count is the capacity
// itself; otherwise it is maxIdleCount.
func (pool *ConnPool[C]) setIdleCount() {
	limit := pool.Capacity()
	// Only a non-zero configured value that fits within the capacity overrides
	// the capacity-based default.
	if configured := pool.config.maxIdleCount; configured != 0 && configured <= limit {
		limit = configured
	}
	pool.idleCount.Store(limit)
}

// InUse returns the number of connections that the pool has lent out to clients and that
// haven't been returned yet.
func (pool *ConnPool[C]) InUse() int64 {
Expand All @@ -340,6 +357,10 @@ func (pool *ConnPool[C]) SetIdleTimeout(duration time.Duration) {
pool.config.idleTimeout.Store(duration.Nanoseconds())
}

// IdleCount returns the number of idle connections the pool is currently
// allowed to retain, as stored in the pool's idleCount field.
func (pool *ConnPool[D]) IdleCount() int64 {
	allowed := pool.idleCount.Load()
	return allowed
}

func (pool *ConnPool[D]) RefreshInterval() time.Duration {
return time.Duration(pool.config.refreshInterval.Load())
}
Expand Down Expand Up @@ -396,6 +417,10 @@ func (pool *ConnPool[C]) put(conn *Pooled[C]) {
}

if !pool.wait.tryReturnConn(conn) {
if pool.closeOnIdleLimitReached(conn) {
return
}

connSetting := conn.Conn.Setting()
if connSetting == nil {
pool.clean.Push(conn)
Expand All @@ -407,6 +432,23 @@ func (pool *ConnPool[C]) put(conn *Pooled[C]) {
}
}

// closeOnIdleLimitReached closes a connection if the number of idle connections (active - inuse) in the pool
// exceeds the idleCount limit. It returns true if the connection is closed, false otherwise.
func (pool *ConnPool[C]) closeOnIdleLimitReached(conn *Pooled[C]) bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be better to not always immediately close, or just close with some random chance, so the connection churn is not really aggressive when there's high active transaction churn?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have to maintain a pool of some free available connections; otherwise time will be spent acquiring a connection before the query can execute.
Even under high transaction load, there will be a lot of get and put calls on the pool, so we have to wait until the number of free connections exceeds a certain limit before we start closing them.
Otherwise the application will see degraded performance at high QPS.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I said "not always". By which I meant: randomize closing after the idle limit has been reached.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another way to look at it would be to make the idle time random, say between 1x and 2x of the config value, so we don't try and close lots of connections at the same time, after a spike in load.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have support for idle time. This is the implementation for idle count. You can choose to use either or both settings on the pool.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but the idle time and counts are hard thresholds, which will cause large numbers of connections to get closed at the same time, if a large number were opened at the same time, which is not necessary, and could be smoothed out a little.

Copy link
Member Author

@harshit-gangal harshit-gangal Jan 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not like maximum connection lifetime which we already smooth out.
In case of idle connection, it is reasonable to churn them as and when they exceed the limit.
There is no active user looking for those connections, therefore they became idle.

for {
open := pool.active.Load()
idle := open - pool.borrowed.Load()
if idle <= pool.idleCount.Load() {
return false
}
if pool.active.CompareAndSwap(open, open-1) {
pool.Metrics.idleClosed.Add(1)
conn.Close()
return true
}
}
}

func (pool *ConnPool[D]) extendedMaxLifetime() time.Duration {
maxLifetime := pool.config.maxLifetime.Load()
if maxLifetime == 0 {
Expand Down Expand Up @@ -629,6 +671,9 @@ func (pool *ConnPool[C]) setCapacity(ctx context.Context, newcap int64) error {
if oldcap == newcap {
return nil
}
// Update the idle count to match the new capacity, if necessary. This is deferred so that
// it happens only after we have waited for connections to be returned to the pool when reducing the capacity.
defer pool.setIdleCount()

const delay = 10 * time.Millisecond

Expand Down Expand Up @@ -732,6 +777,9 @@ func (pool *ConnPool[C]) RegisterStats(stats *servenv.Exporter, name string) {
// the smartconnpool doesn't have a maximum capacity
return pool.Capacity()
})
stats.NewGaugeFunc(name+"IdleAllowed", "Tablet server conn pool idle allowed limit", func() int64 {
return pool.IdleCount()
})
stats.NewCounterFunc(name+"WaitCount", "Tablet server conn pool wait count", func() int64 {
return pool.Metrics.WaitCount()
})
Expand Down
45 changes: 45 additions & 0 deletions go/pools/smartconnpool/pool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,51 @@ func TestExtendedLifetimeTimeout(t *testing.T) {
}
}

// TestMaxIdleCount checks the MaxIdleCount setting: after borrowing every
// connection of a pool and returning them all, the number of connections that
// were closed must match the expectation for the configured limit, and the
// IdleClosed metric must agree. The scenarios run both with and without a
// connection setting.
func TestMaxIdleCount(t *testing.T) {
	const poolSize = 5

	scenarios := []struct {
		name         string
		setting      *Setting
		maxIdleCount int64
		wantClosed   int
	}{
		{name: "WithoutSettings", setting: nil, maxIdleCount: 2, wantClosed: 3},
		{name: "WithSettings", setting: sFoo, maxIdleCount: 2, wantClosed: 3},
		{name: "WithoutSettings-MaxIdleCount-Zero", setting: nil, maxIdleCount: 0, wantClosed: 0},
		{name: "WithSettings-MaxIdleCount-Zero", setting: sFoo, maxIdleCount: 0, wantClosed: 0},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			var state TestState

			ctx := context.Background()
			p := NewPool(&Config[*TestConn]{
				Capacity:     poolSize,
				MaxIdleCount: sc.maxIdleCount,
				LogWait:      state.LogWait,
			}).Open(newConnector(&state), nil)

			defer p.Close()

			// Borrow every connection; nothing may be closed for idleness
			// while connections are only being handed out.
			borrowed := make([]*Pooled[*TestConn], 0, poolSize)
			for len(borrowed) < poolSize {
				r, err := p.Get(ctx, sc.setting)
				require.NoError(t, err)
				assert.EqualValues(t, len(borrowed)+1, state.open.Load())
				assert.EqualValues(t, 0, p.Metrics.IdleClosed())

				borrowed = append(borrowed, r)
			}

			// Return all of them before inspecting which ones got closed.
			for i := range borrowed {
				p.put(borrowed[i])
			}

			var closed int
			for i := range borrowed {
				if borrowed[i].Conn.IsClosed() {
					closed++
				}
			}
			assert.EqualValues(t, sc.wantClosed, closed)
			assert.EqualValues(t, sc.wantClosed, p.Metrics.IdleClosed())
		})
	}
}

func TestCreateFail(t *testing.T) {
var state TestState
state.chaos.failConnect = true
Expand Down
1 change: 1 addition & 0 deletions go/vt/vttablet/tabletserver/connpool/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ func NewPool(env tabletenv.Env, name string, cfg tabletenv.ConnPoolConfig) *Pool
config := smartconnpool.Config[*Conn]{
Capacity: int64(cfg.Size),
IdleTimeout: cfg.IdleTimeout,
MaxIdleCount: int64(cfg.MaxIdleCount),
MaxLifetime: cfg.MaxLifetime,
RefreshInterval: mysqlctl.PoolDynamicHostnameResolution,
}
Expand Down
58 changes: 55 additions & 3 deletions go/vt/vttablet/tabletserver/connpool/pool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ func TestConnPoolTimeout(t *testing.T) {
defer db.Close()

cfg := tabletenv.ConnPoolConfig{
Size: 1,
Size: 1,
Timeout: time.Second,
IdleTimeout: 10 * time.Second,
}
cfg.Timeout = time.Second
cfg.IdleTimeout = 10 * time.Second
connPool := NewPool(tabletenv.NewEnv(vtenv.NewTestEnv(), nil, "PoolTest"), "TestPool", cfg)
params := dbconfigs.New(db.ConnParams())
connPool.Open(params, params, params)
Expand Down Expand Up @@ -135,6 +135,58 @@ func TestConnPoolSetCapacity(t *testing.T) {
}
}

// TestConnPoolMaxIdleCount exercises the pool's max idle count.
// Recycling more connections than the allowed idle count must close the excess,
// and lowering the pool capacity below the configured max idle count must lower
// the allowed idle count to the new capacity.
func TestConnPoolMaxIdleCount(t *testing.T) {
	db := fakesqldb.New(t)
	defer db.Close()

	env := tabletenv.NewEnv(vtenv.NewTestEnv(), nil, "PoolTest")
	connPool := NewPool(env, "TestPool", tabletenv.ConnPoolConfig{
		Size:         5,
		MaxIdleCount: 2,
	})
	params := dbconfigs.New(db.ConnParams())
	connPool.Open(params, params, params)
	defer connPool.Close()

	assert.EqualValues(t, 5, connPool.Capacity(), "pool capacity should be 5")
	assert.EqualValues(t, 2, connPool.IdleCount(), "pool idle count should be 2")

	ctx := context.Background()
	conns := make([]*PooledConn, 0, 3)
	for len(conns) < 3 {
		conn, err := connPool.Get(ctx, nil)
		require.NoError(t, err)
		conns = append(conns, conn)
	}

	// Recycling the first two connections leaves 1 and then 2 idle connections,
	// which is within the allowed idle count, so nothing is closed.
	for _, conn := range conns[:2] {
		conn.Recycle()
		assert.Zero(t, connPool.Metrics.IdleClosed(), "pool idle closed should be 0")
	}

	// Recycling the third connection would make 3 idle connections, so 1 is closed.
	conns[2].Recycle()
	assert.EqualValues(t, 1, connPool.Metrics.IdleClosed(), "pool idle closed should be 1")

	// A capacity that is still at least the max idle count leaves the allowed
	// idle count unchanged.
	require.NoError(t, connPool.SetCapacity(ctx, 4))
	assert.EqualValues(t, 4, connPool.Capacity(), "pool capacity should be 4")
	assert.EqualValues(t, 2, connPool.IdleCount(), "pool idle count should be 2")

	// A capacity lower than the max idle count lowers the allowed idle count to
	// the new capacity.
	require.NoError(t, connPool.SetCapacity(ctx, 1))
	assert.EqualValues(t, 1, connPool.Capacity(), "pool capacity should be 1")
	assert.EqualValues(t, 1, connPool.IdleCount(), "pool idle count should be changed to 1")
}

func TestConnPoolStatJSON(t *testing.T) {
db := fakesqldb.New(t)
defer db.Close()
Expand Down
Loading
Loading