-
Notifications
You must be signed in to change notification settings - Fork 3
/
drive_km.R
166 lines (154 loc) · 5.19 KB
/
drive_km.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# 10 june 2017 - removed my versions of ggsurv because GGally has now been patched
# updated june 2017 to generate km curves for restricted periods to make sense of
# drives with short observation periods
# for data from https://www.backblaze.com/hard-drive-test-data.html
# ross lazarus me fecit february 18 2016
# ggsurv modified from
# http://www.r-statistics.com/2013/07/creating-good-looking-survival-curves-the-ggsurv-function/
# the legend now corresponds to the order of the very last survival point so
# it's easier to see which is which...
library(survival)
library(rms)
library(GGally)
library(gdata)
setwd('/data/drivefail')
fixres = function(modkm)
{
## reformat and reorder the km model results
oediff = modkm$obs - modkm$exp
oesign = sign(oediff)
oegood = oesign < 0
oechi = (oediff) ^ 2 / modkm$exp
oesort = oechi * oesign
resd = data.frame(
modkm$n,
'Observed' = modkm$obs,
'Expected' = modkm$exp,
'chisq' = oechi,
'sort' = oesort
)
rownames(resd) = names(modkm$n)
sresd = resd[order(resd$sort), ]
sresd$rank = c(1:nrow(resd))
return(sresd)
}
# changeme!!!
setwd('/data/drivefail')
options(width=512)
runtag = 'july6_no_resurrection'
fn = paste('drivefail_res',runtag,sep='')
titl = 'KM plots - Backblaze drive data to end Q1 2018'
subtitl = paste('Run on',runtag,sep=' ')
# changeme!!!
f = paste(fn, 'xls', sep = '.')
d = read.delim(f, sep = '\t', head = T)
tm = table(d$manufact)
ds = subset(d, tm[d$manufact] > 200)
#toobig = (ds$serial == 'MJ0351YNGA723A')
#ds$obsdays[toobig] = 1610 # most are censored around this time up to q1 2018 MJ0351YNGA723A
# doesn't help - so many censored about the same time...
s = with(ds, Surv(time = obsdays, event = status))
km.manu = npsurv(s ~ manufact, data = ds)
ofnpdf = paste(fn, '_manufacturer.pdf', sep = '')
ofnpng = paste(fn, '_manufacturer.png', sep = '')
ofnsvg = paste(fn, '_manufacturer.svg', sep = '')
svg(ofnsvg)
ggsurv(km.manu, main = titl)
dev.off()
png(ofnpng, height = 1000, width = 1600)
ggsurv(km.manu, main = titl)
dev.off()
pdf(ofnpdf, height = 10, width = 20)
ggsurv(km.manu, main = titl)
dev.off()
tmo = table(d$model)
dm = subset(d, tmo[d$model] > 200)
# dm = subset(dm,)
sm = with(dm, Surv(time = obsdays, event = status))
km.mod = npsurv(sm ~ model, data = dm)
ofnpdf = paste(fn, '_model.pdf', sep = '')
ofnpng = paste(fn, '_model.png', sep = '')
ofnsvg = paste(fn, '_model.svg', sep = '')
svg(ofnsvg)
ggsurv(km.mod, main = titl)
dev.off()
png(ofnpng, height = 1000, width = 1600)
ggsurv(km.mod, main = titl)
dev.off()
pdf(ofnpdf, height = 10, width = 20)
ggsurv(km.mod, main = titl)
dev.off()
survdiff(sm ~ model, data = dm, rho = 0)
survdiff(s ~ manufact, data = ds, rho = 0)
# km assumes similar observation duration records for all strata
# some drives (eg seagate 8TB) only have 30 days at best with Q2 2016 data.
# this makes the right hand side of the usual full period KM plots misleading,
# because there's no new information being added about some of the drives over time
# so try only modelling the first week and so on
# some interesting urban myths about early vs late failures?
#
# eeesh - big data = big data problems
# one HDS5C3030ALA630 has a fail at 1801 days but most drives are censored at about 1600 days
# makes plot misleading...
#
# smart9 hours also broken - 9269 of 14345 values exceed 10 years worth of days!!
# as at end Q1 2018
#> max(dm$obsdays)
#[1] 1816
#> max(dm$smart9hours,na.rm=T)
#[1] 274169
#> 1816*24 # total possible hours
#[1] 43584
# > sum(dt$smart9hours > 43600)
#[1] 9296
cutps = c(3, 7, 15, 30, 60, 90, 120, 360, 720, 1080, 1440, 1802)
ncut = length(cutps)
for (i in c(1:ncut))
{
nmax = cutps[i]
titl = paste('KM first',
nmax,
'days of observation curves from Backblaze drive data to Q1 2017')
dti = dm
fixme = (dti$obsdays > nmax) # ignore all data, failing or not beyond nmax, by censoring at nmax.
dti$status[fixme] = 0
dti$obsdays[fixme] = nmax
okmodels = subset(dti,fixme,select=c(model))
models = levels(factor(as.character(okmodels$model)))
modcol = models[order(models)]
if (i == 1) {
mdf = data.frame('model' = modcol)
}
inmodels = (dti$model %in% models)
dt = subset(dti, inmodels)
stm = with(dt, Surv(time = obsdays, event = status))
km.mod = npsurv(stm ~ model, data = dt)
kmmod = survdiff(stm ~ model, data = dt, rho = 0)
pres = fixres(kmmod)
rnk = data.frame(rank=pres[order(pres$groups), ]$rank)
mdf = cbindX(mdf, rnk)# cbind(mdf, rnk)
s = paste('*** KM statistics for first',nmax,'days')
print.noquote(s)
print(pres)
ofnroot = paste('km_first',
nmax,
'_days_model_',
runtag,
sep = '')
ofnpdf = paste(ofnroot,'.pdf',sep = '')
ofnpng = paste(ofnroot,'.png',sep = '')
png(ofnpng, height = 1000, width = 1600)
print(ggsurv(km.mod, main = titl))
## ggplot requires explicit printing inside loops
dev.off()
pdf(ofnpdf, height = 10, width = 20)
print(ggsurv(km.mod, main = titl))
dev.off()
}
colnames(mdf) = c('Model', paste('Rank_day', cutps, sep = '_'))
mdfs = mdf[, 2:ncut]
vmdf = apply(mdfs, 1, var,na.rm=TRUE)
mmdf = apply(mdfs, 1, mean,na.rm=TRUE)
mdfo = cbind(mdf, 'mean' = mmdf, 'var' = vmdf)
mdfo = mdfo[order(mdfo$mean), ]
print(mdfo)