-
Notifications
You must be signed in to change notification settings - Fork 0
/
zfs_new_dirty_delay.stp
236 lines (198 loc) · 7.23 KB
/
zfs_new_dirty_delay.stp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
/* This script is used to see if there are any delays being posted to
*
* There are two aspects of it which is based on zfs_dirty_max parameter
* which is set to 10%RAM. The soft limit is set to 60% of zfs_dirty_max
* 1. Between soft_limit/2 and soft_limit, async writes are given more
* preference increasing linearly and set to max the ASYNC queue supports
* beyond soft limit.
* 2. Once the above try fails to keep up with the incoming I/O, start delaying the
* client. The delay
* a) is max of 100msec as long as we are under the hard limit
* b) is non-deterministic and is based on the time it takes the dirty buffer size
* to shrink below the hard limit.
*
* USAGE: stap -v -d zfs zfs_dirtybuf_delay.stp <Display Frequency>
*
* Tunables:
* a) zfs_vdev_async_write_active_max_dirty_percent
* b) zfs_vdev_async_write_active_min_dirty_percent
* c) zfs_dirty_max
* d) zfs_vdev_async_write_min_active
* e) zfs_vdev_async_write_max_active
* f) zfs_delay_scale
* Goal: Ideal is to not delay but if client is really pumping hard
* try to make sure delays are consistent and less.
*/
/* IN MB's */
global hard_limit=0
global soft_limit=0
global write_c
global write_t
global dmu_tx_delay_hist
global flushed_dirty_amt_hist
global write_hist
global sync_freq_hist
global sync_time_hist
global sync_actual_ts
global trim_hist
global probe_start
global dirty=0;
global dmu_tx_delay_t_start;
global first_wr_ts = 0;
global last_wr_ts = 0;
global cnt=0;
global sync_ts=0;
global last_txg;
global trim_latency_ts;
global trim_latency_hist;
global block_alloc_latency_hist;
//probe module("zfs").function("gk_timestamp")
//{
// block_alloc_latency_hist <<< $diff
//}
probe module("zfs").function("zio_trim") {
trim_latency_ts = gettimeofday_us();
}
probe module("zfs").function("metaslab_trim_done") {
trim_latency_hist <<< (gettimeofday_us() - trim_latency_ts)
}
probe module("zfs").function("spa_tx_trim_add_bytes") {
trim_hist <<< ($bytes/(1024))
}
/* Write Latency */
probe module("zfs").function("zpl_write")
{
if(cnt == 0) {
first_wr_ts = gettimeofday_ms();
cnt++;
}
write_t[tid()] = gettimeofday_ms();
write_c[tid()]++;
}
probe module("zfs").function("zpl_write").return
{
last_wr_ts = gettimeofday_ms();
diff = last_wr_ts - write_t[tid()];
write_c[tid()]++;
write_hist <<< diff;
}
/* CLient Side Delay - Phase 2 */
probe module("zfs").function("dmu_tx_wait") {
if($tx->tx_wait_dirty) {
hard_limit = $zfs_dirty_data_max;
soft_limit = ($zfs_dirty_data_max * $zfs_delay_min_dirty_percent) / 100
dmu_tx_delay_t_start[tid()] = gettimeofday_ms();
}
}
probe module("zfs").function("dmu_tx_wait").return {
if($tx->tx_wait_dirty) {
diff = gettimeofday_ms() - dmu_tx_delay_t_start[tid()];
dmu_tx_delay_hist <<< diff;
}
}
probe module("zfs").function("spa_sync")
{
if(sync_ts == 0) {
sync_ts = gettimeofday_ms();
} else {
ts_in_msec = gettimeofday_ms();
sync_freq = ts_in_msec - sync_ts;
sync_ts = ts_in_msec;
sync_freq_hist[$spa->spa_name] <<< sync_freq
}
last_txg[$spa->spa_name] = $txg;
dirty = $spa->spa_dsl_pool->dp_dirty_pertxg[$txg & 3];
dirty = dirty/(1024*1024);
if(dirty != 0)
flushed_dirty_amt_hist[$spa->spa_name] <<< dirty;
}
probe module("zfs").function("spa_sync").return
{
diff = (gettimeofday_ms() - sync_ts);
sync_time_hist[$spa->spa_name] <<< diff
if(diff >= 20000)
sync_actual_ts[$spa->spa_name, diff] = sync_ts;
}
function stats:long() {
wr_throughput = 0;
intv = (gettimeofday_s() - probe_start);
wr_intv = last_wr_ts - first_wr_ts;
printf("\n\n\n -----> Run Interval %d [%s to %s] \n", intv, tz_ctime(probe_start), tz_ctime(gettimeofday_s()))
foreach (thread_id in write_c)
wr_throughput += write_c[thread_id]/2;
if(hard_limit != 0)
printf("Soft Limit: %d, Hard Limit: %d] \n", soft_limit, hard_limit)
if (@count(dmu_tx_delay_hist)) {
printf("== Client Delay \n")
printf("ms \n");
print(@hist_log(dmu_tx_delay_hist))
}
foreach([pool] in flushed_dirty_amt_hist) {
if(@count(flushed_dirty_amt_hist[pool])) {
printf("== Data flushed in each txg_sync (%s) \n", kernel_string(pool));
printf("MB \n");
print(@hist_log(flushed_dirty_amt_hist[pool]))
}
}
foreach([pool] in sync_freq_hist) {
if(@count(sync_freq_hist[pool])) {
printf("== txg sync (txg %ld) freq for pool (%s) \n", last_txg[pool], kernel_string(pool));
printf("msec \n");
print(@hist_log(sync_freq_hist[pool]))
}
}
foreach([pool] in sync_time_hist) {
if(@count(sync_time_hist[pool])) {
printf("== txg sync time for pool (%s) \n", kernel_string(pool));
printf("msec \n");
print(@hist_log(sync_time_hist[pool]))
}
}
foreach([pool, t] in sync_actual_ts) {
printf("== pool (%s) \n", kernel_string(pool));
printf("[%s]", tz_ctime(sync_actual_ts[pool,t]/1000))
}
printf("\n");
if(@count(trim_hist)) {
printf("TRIMS executed %d times \n", @count(trim_hist));
printf("KB\n");
print(@hist_log(trim_hist))
}
if(@count(trim_latency_hist)) {
printf("TRIMS Latency (min/avg/max) --> %d/%d/%d usec\n", @min(trim_latency_hist), @avg(trim_latency_hist), @max(trim_latency_hist));
printf("us\n");
print(@hist_log(trim_latency_hist))
}
//if(@count(block_alloc_latency_hist)) {
// printf("Block Alloc delay (min/avg/max) --> %d/%d/%d nsec \n", @min(block_alloc_latency_hist), @avg(block_alloc_latency_hist), @max(block_alloc_latency_hist));
// printf("ns\n");
// print(@hist_log(block_alloc_latency_hist))
//}
if (@count(write_hist) && wr_intv) {
printf("Write: Latency (min/avg/max) --> %d/%d/%d msec\n", @min(write_hist), @avg(write_hist), @max(write_hist));
printf("Write: Throughput --> %d IOPS\n", (wr_throughput*1000)/wr_intv);
print(@hist_log(write_hist))
}
}
probe timer.s($1) {
stats()
}
function init:long() {
delete dmu_tx_delay_hist
delete flushed_dirty_amt_hist
delete write_hist
delete probe_start
delete dirty
delete dmu_tx_delay_t_start
delete write_c
delete write_t
}
probe begin {
# Start fresh
init()
# Keep an account of when the test was started
probe_start = gettimeofday_s()
}
probe end {
stats()
}