zfs_new_dirty_delay.stp

/* This script is used to see if there are any delays being posted to
 *
 * There are two aspects of it which is based on zfs_dirty_max parameter
 * which is set to 10%RAM. The soft limit is set to 60% of zfs_dirty_max
 * 1. Between soft_limit/2 and soft_limit, async writes are given more
 *    preference increasing linearly and set to max the ASYNC queue supports
 *    beyond soft limit.
 * 2. Once the above try fails to keep up with the incoming I/O, start delaying the
 *    client. The delay
 *         a) is max of 100msec as long as we are under the hard limit
 *         b) is non-deterministic and is based on the time it takes the dirty buffer size
 *            to shrink below the hard limit.
 *
 * USAGE: stap -v -d zfs zfs_dirtybuf_delay.stp <Display Frequency>
 *
 * Tunables:
 *        a) zfs_vdev_async_write_active_max_dirty_percent
 *        b) zfs_vdev_async_write_active_min_dirty_percent
 *        c) zfs_dirty_max
 *        d) zfs_vdev_async_write_min_active
 *        e) zfs_vdev_async_write_max_active
 *        f) zfs_delay_scale
 *      Goal: Ideal is to not delay but if client is really pumping hard
 *            try to make sure delays are consistent and less.
 */

/* IN MB's */
global hard_limit=0
global soft_limit=0
global write_c
global write_t

global dmu_tx_delay_hist
global flushed_dirty_amt_hist
global write_hist
global sync_freq_hist
global sync_time_hist
global sync_actual_ts

global trim_hist

global probe_start
global dirty=0;
global dmu_tx_delay_t_start;
global first_wr_ts = 0;
global last_wr_ts = 0;
global cnt=0;
global sync_ts=0;

global last_txg;
global trim_latency_ts;
global trim_latency_hist;
global block_alloc_latency_hist;

//probe module("zfs").function("gk_timestamp")
//{
//      block_alloc_latency_hist <<< $diff
//}

probe module("zfs").function("zio_trim") {
        trim_latency_ts = gettimeofday_us();
}

probe module("zfs").function("metaslab_trim_done") {
        trim_latency_hist <<< (gettimeofday_us() - trim_latency_ts)
}

probe module("zfs").function("spa_tx_trim_add_bytes") {
        trim_hist <<< ($bytes/(1024))
}
/* Write Latency */
probe module("zfs").function("zpl_write")
{
        if(cnt == 0) {
                first_wr_ts = gettimeofday_ms();
                cnt++;
        }

        write_t[tid()] = gettimeofday_ms();
        write_c[tid()]++;
}

probe module("zfs").function("zpl_write").return
{
        last_wr_ts = gettimeofday_ms();
        diff = last_wr_ts - write_t[tid()];
        write_c[tid()]++;
        write_hist <<< diff;
}

/* CLient Side Delay - Phase 2 */
probe module("zfs").function("dmu_tx_wait") {
        if($tx->tx_wait_dirty) {
                hard_limit = $zfs_dirty_data_max;
                soft_limit = ($zfs_dirty_data_max * $zfs_delay_min_dirty_percent) / 100
                dmu_tx_delay_t_start[tid()] = gettimeofday_ms();
        }
}

probe module("zfs").function("dmu_tx_wait").return {
        if($tx->tx_wait_dirty) {
                diff = gettimeofday_ms() - dmu_tx_delay_t_start[tid()];
                dmu_tx_delay_hist <<< diff;
        }
}

probe module("zfs").function("spa_sync")
{
        if(sync_ts == 0) {
                sync_ts = gettimeofday_ms();
        } else {
                ts_in_msec = gettimeofday_ms();
                sync_freq = ts_in_msec - sync_ts;
                sync_ts = ts_in_msec;
                sync_freq_hist[$spa->spa_name] <<< sync_freq
        }

        last_txg[$spa->spa_name] = $txg;

        dirty = $spa->spa_dsl_pool->dp_dirty_pertxg[$txg & 3];
        dirty = dirty/(1024*1024);
        if(dirty != 0)
                flushed_dirty_amt_hist[$spa->spa_name] <<< dirty;
}

probe module("zfs").function("spa_sync").return
{
        diff = (gettimeofday_ms() - sync_ts);
        sync_time_hist[$spa->spa_name] <<< diff
        if(diff >= 20000)
                sync_actual_ts[$spa->spa_name, diff] = sync_ts;
}
function stats:long() {
        wr_throughput = 0;
        intv = (gettimeofday_s() - probe_start);
        wr_intv = last_wr_ts - first_wr_ts;
        printf("\n\n\n -----> Run Interval %d [%s to %s] \n", intv, tz_ctime(probe_start), tz_ctime(gettimeofday_s()))

        foreach (thread_id in write_c)
                wr_throughput += write_c[thread_id]/2;

        if(hard_limit != 0)
                printf("Soft Limit: %d, Hard Limit: %d] \n", soft_limit, hard_limit)

        if (@count(dmu_tx_delay_hist)) {
                printf("== Client Delay \n")
                printf("ms \n");
                print(@hist_log(dmu_tx_delay_hist))
        }


        foreach([pool] in flushed_dirty_amt_hist) {
                if(@count(flushed_dirty_amt_hist[pool])) {
                        printf("== Data flushed in each txg_sync (%s) \n",  kernel_string(pool));
                        printf("MB \n");
                        print(@hist_log(flushed_dirty_amt_hist[pool]))
                }
        }

        foreach([pool] in sync_freq_hist) {
                if(@count(sync_freq_hist[pool])) {
                        printf("== txg sync (txg %ld) freq for pool (%s) \n", last_txg[pool], kernel_string(pool));
                        printf("msec \n");
                        print(@hist_log(sync_freq_hist[pool]))
                }
        }

        foreach([pool] in sync_time_hist) {
                if(@count(sync_time_hist[pool])) {
                        printf("== txg sync time for pool (%s) \n",  kernel_string(pool));
                        printf("msec \n");
                        print(@hist_log(sync_time_hist[pool]))
                }
        }

        foreach([pool, t] in sync_actual_ts) {
                        printf("== pool (%s) \n",  kernel_string(pool));
                        printf("[%s]", tz_ctime(sync_actual_ts[pool,t]/1000))
        }
        printf("\n");

        if(@count(trim_hist)) {
                printf("TRIMS executed %d times \n", @count(trim_hist));
                printf("KB\n");
                print(@hist_log(trim_hist))
        }

        if(@count(trim_latency_hist)) {
                printf("TRIMS Latency   (min/avg/max) --> %d/%d/%d usec\n", @min(trim_latency_hist), @avg(trim_latency_hist), @max(trim_latency_hist));
                printf("us\n");
                print(@hist_log(trim_latency_hist))
        }
        //if(@count(block_alloc_latency_hist)) {
        //      printf("Block Alloc delay (min/avg/max) --> %d/%d/%d nsec \n", @min(block_alloc_latency_hist), @avg(block_alloc_latency_hist), @max(block_alloc_latency_hist));
          //      printf("ns\n");
            //    print(@hist_log(block_alloc_latency_hist))
        //}

        if (@count(write_hist) && wr_intv) {
                printf("Write: Latency    (min/avg/max) --> %d/%d/%d msec\n", @min(write_hist), @avg(write_hist), @max(write_hist));
                printf("Write: Throughput --> %d IOPS\n", (wr_throughput*1000)/wr_intv);
                print(@hist_log(write_hist))
        }


}

probe timer.s($1) {
        stats()
}

function init:long() {

        delete dmu_tx_delay_hist
        delete flushed_dirty_amt_hist
        delete write_hist
        delete probe_start
        delete dirty
        delete dmu_tx_delay_t_start
        delete write_c
        delete write_t

}

probe begin {
        # Start fresh
        init()

        # Keep an account of when the test was started
        probe_start =  gettimeofday_s()

}

probe end {
        stats()
}