# Cos2MinTorchFunctionOptimizer.py (2024)
# rename: Cos2MinTorchFunctionOptimizer -> ELRA_class.py (tbd 2024)
import torch
# import lib_grad_solve
# import numpy as np
# from math import isnan, isinf, inf
compile_cnt: int = 0
def setModelParameter(theta: torch.Tensor, model: torch.nn.Module) -> None:
    "copy the flat parameter vector theta into model.parameters()"
s = e = 0
for p in model.parameters():
e += torch.numel(p)
# t, s = theta[s:e], p.size()
# params = torch.reshape(t, p.size())
p.data = torch.reshape(theta[s:e], p.size())
# p.data.copy_(torch.reshape(theta[s:e], p.size()))
s = e
return
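# Usage sketch: setModelParameter() is the inverse of flattening the model
# parameters into one vector (as done below in InitialParam()):
#   theta = torch.cat([p.data.view(-1) for p in model.parameters()])
#   setModelParameter(theta, model)  # writes theta back into the original shapes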
# was: SelfConstOptimTorch
class ElraOptimizer(torch.optim.Optimizer):
from enum import Enum
class Mode(Enum):
c2min = 1
# c2min_check = 2
p2min = 3 # default
def __init__(self, params, model, lr:float = 1e-5, mode:Mode = Mode.p2min, loss = None, wd:float = 0.0) -> None:
defaults = {}
super().__init__(params, defaults)
if (mode != ElraOptimizer.Mode.c2min):
import lib_grad_solve # P2M (default)
self.__optim_step = lib_grad_solve.SelfConstOptim.p2min_step
else:
import lib_grad_solve_c2m as lib_grad_solve # C2M
self.__optim_step = lib_grad_solve.SelfConstOptim.c2min_pv_step
self.SCO = lib_grad_solve.SelfConstOptim
self.__optim_instance = lib_grad_solve.SelfConstOptim(lr=lr, wd=wd)
#if (mode == ElraOptimizer.Mode.c2min):
# self.__optim_step = lib_grad_solve.SelfConstOptim.c2min_pv_step
# elif mode == ElraOptimizer.Mode.c2min_check:
# self.__optim_step = lib_grad_solve.SelfConstOptim.c2min_greedy_step
#elif (mode == ElraOptimizer.Mode.p2min):
# self.__optim_step = lib_grad_solve.SelfConstOptim.p2min_step
self.step_call = getattr(self.__optim_instance, self.__optim_step.__name__)
assert callable(self.step_call), "function needed: step()"
assert(model is not None), "need torch model"
self.__model = model
self.__loss = loss
# print(params) # generator object Module.parameters
        self.elra_TFO_dim: int|None = None
        self.param_bak: torch.Tensor|None = None # backup
        # self.device = torch.device("cpu") # default
        self.CalcParaAvg: bool|None = None # Booster on/off (False=None)
        self.BoosterTarget: int = 240 # 0 = off (even)
        self.ParaLossSum: float = 0.0
        self.ParaAvgCount: int = 0
        self.ParaLossCount: int = 0
        self.ParaAvgTensor: torch.Tensor|None = None
        self.FullBoostTensor: torch.Tensor|None = None # last complete
        self.FullBoostLoss: float = 0.0
        self.LastEpochSteps: int = 0
        self.EpochSteps: int = 0 # since last Booster
        self.RetraceCount: int = 0
        self.ChangedParam: bool = False
return
#def revert_step(self, loss: float) -> bool:
# "UNUSED: P2min: revert step check (future)"
# if (self.elra_TFO_dim is None):
# return False # do initial step anyway
#
# if not (loss < 1e999): # isnan(loss) or isinf(loss), 1e999==inf (fastest)
# self.__skip_next_grad = True
# return True # never allow NaN
#
# ret = getattr(self.__optim_instance, self.__optim_step.__name__) (
# None, loss, None )
# self.__skip_next_grad = ret
# return ret # (default=False)
@staticmethod
def CopyModel(model: torch.nn.Module, mode:str='default', device=None) -> torch.nn.Module:
"create 2nd model, for booster"
from copy import deepcopy
from os import path
import torch._dynamo
from torch._dynamo.testing import CompileCounter
assert(model is not None), "needs model"
fn: str = "compile_skip"
model_boost = deepcopy(model) # ELRA-Booster
device1 = next(model.parameters()).device
if (device is not None):
print("model.device =", str(device1), ", boost =", str(device))
model_boost.to(device)
if (mode == 'off'):
print("deepcopy(model), no compile(), mode =", mode)
return model_boost
cc_fc: int = CompileCounter().frame_count
#if (cc_fc < 1): # 0 even for compiled models ?
# print("deepcopy(model), no compile(),", mode, cc_fc)
# return model_boost
if path.isfile(fn):
# https://github.com/pytorch/pytorch/issues/128121
# 2nd compile of deepcopy(model) fails on multiple ubuntu-pc (fatal error: Python.h: file not found)
print("Detected Compile Blocker (for issue #128121) !", fn)
return model_boost
assert(hasattr(torch, 'compile')), "requires PyTorch 2.x (2024)"
if False: # path.exists("/etc/"): # Windows not supported (May 2024)
global compile_cnt
n: int = sum(p.numel() for p in model.parameters())
if not len(mode): mode = 'default'
print("compiling 2nd model... (takes a ~minute)", n)
torch._dynamo.config.suppress_errors = True
compile_crash: bool = False
torch._dynamo.reset()
# backend='cudagraphs' ['cudagraphs', 'inductor', 'onnxrt', 'openxla', 'tvm']
compile_cnt += 1
try:
model_boost = torch.compile(model_boost, mode=mode) # mode='reduce-overhead') # reduce-overhead fails for ImgNet + lama2
print("CompileCounter() =", compile_cnt, CompileCounter().frame_count)
except Exception as inst:
compile_crash = True
print("Exception in torch.compile() !!", inst)
model_boost = deepcopy(model) # again, w/o compile
if compile_crash:
print("Create Compile Blocker!", fn)
f = open(fn, "x")
f.close()
return model_boost
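    # CopyModel() usage sketch (hypothetical caller code, names are illustrative):
    #   model_boost = ElraOptimizer.CopyModel(model, mode='default')
    #   # model_boost later receives the averaged ("boosted") parameters, see GetParamAvg()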
# def GetLossLimit(self) -> float:
# "unused"
# return self.__optim_instance.GetLossLimit()
def GetParam(self, only_new:bool, device:torch.device) -> torch.Tensor|None:
"get param x (used for MultiGpu only)"
if (self.param_bak is None):
self.InitialParam()
        assert(self.param_bak is not None), "model w/o param"
if (only_new and not self.ChangedParam): return None
self.ChangedParam = False
return self.param_bak.to(device, non_blocking=True, copy=True)
def TellTrainBoostLoss(self, train_loss:float) -> None:
"tell train_loss of boost-params (if avail)"
self.__optim_instance.LGS_TellTrainBoostLoss(train_loss)
return
def SetClasses(self, classes: int, gpubatchsize: int) -> None:
"inform solver about class-count (helpful for noise estim.)"
assert(self.elra_TFO_dim is None), "set before first solver.step()"
assert(type (self.__optim_instance).__name__ == "SelfConstOptim"), "ELRA-LGS only"
getattr(self.__optim_instance, self.SCO.LGS_SetClasses.__name__) (classes, gpubatchsize)
return
def CheckNextStep(self) -> tuple[bool, int]:
"check if next step is real vs. collect-only (only for MultiGpu SMP/DDP)"
return self.__optim_instance.LGS_CheckNextStep()
def SetLgsDevice(self, dev: torch.device = None) -> None:
"move LGS-Tensors to device to free GPU ram during fullbatch (end of epoch)"
assert(type (self.__optim_instance).__name__ == "SelfConstOptim"), "ELRA-LGS only"
if (self.elra_TFO_dim is None): return
if self.elra_TFO_dim > (20 << 20): # RN34 = 21mio
getattr(self.__optim_instance, self.SCO.LGS_SetDevice.__name__) (dev)
return
def CalcBoostTensor(self) -> None:
"epoch x average (internal)"
if (self.ParaAvgCount < 2): # reset: should never happen
self.ParaAvgTensor, self.ParaAvgCount = None, 0
self.FullBoostTensor = None
return
self.FullBoostLoss = 0.0 if (self.ParaLossCount < 1) else \
(self.ParaLossSum / self.ParaLossCount)
self.ParaAvgTensor *= 1.0 / self.ParaAvgCount # in-place
self.ParaAvgTensor, self.FullBoostTensor = None, self.ParaAvgTensor # .to(device=torch.device('cpu'))
self.ParaLossCount, self.ParaLossSum = 0, 0.0
# print("BoostCalc:", self.ParaAvgCount, self.LastEpochSteps) # debug
self.ParaAvgCount = 0
return
def GetParamAvg(self, enable: bool):
"control local param averaging + reset avg. + return vector (for booster)"
assert(type (self.__optim_instance).__name__ == "SelfConstOptim"), "ELRA only"
loss: float = self.FullBoostLoss
count: int = self.EpochSteps
self.CalcParaAvg = True if enable else None
if enable:
assert(self.BoosterTarget >= 2), "averaged step count"
if (count > 0): # once per epoch
print("GetParamAvg: c=%d, l=%.6f, r=%d" % (count, loss, self.RetraceCount))
self.EpochSteps, self.LastEpochSteps = 0, count
if (not enable) or (count < 4): # <4 steps/epoch
self.FullBoostTensor = None
return 0, loss, None
if (self.FullBoostTensor is None) and (self.ParaAvgCount > 0): # not enough real steps
if (self.BoosterTarget < 999999999):
print("GetParamAvg:BoostEpochSwitch(%d<%d,rm=%d)" %
(count, self.BoosterTarget, self.ParaAvgCount))
self.BoosterTarget = 999999999 # never within epoch (internal constant, e.g. INT_MAX)
# self.ParaAvgTensor, self.ParaAvgCount = None, 0 # forget incomplete sum
# return count, loss, None # this epoch w/o boost
if (self.ParaAvgCount >= 2): # minimum guess (only for very few steps/epoch)
self.CalcBoostTensor() # create FullBoostTensor if possible
else:
print("Warn:SkipLowBoost(pac=%d, epoS=%d)" % (self.ParaAvgCount, count))
ret_vect, self.FullBoostTensor = self.FullBoostTensor, None
return count, loss, ret_vect # .to(self.param_bak.device), None = NoNewBoostAvail
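    # End-of-epoch booster sketch (hypothetical caller code): fetch the averaged
    # parameter vector and load it into the booster copy of the model for evaluation:
    #   steps, boost_loss, x_avg = optim.GetParamAvg(enable=True)
    #   if x_avg is not None:
    #       setModelParameter(x_avg, model_boost)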
def InitialParam(self) -> None:
"internal: initial export from model to this class"
assert(self.__model is not None), "need torch model"
if True: # (self.param_bak is None):
params = [param.data.view(-1) for param in self.__model.parameters()]
params_n = torch.cat(params)
params = None
print("ParamCount = %d, norm = %.4g, dev=%s" % (len(params_n), params_n.norm().item(), str(params_n.device))) # device=CPU
self.elra_TFO_dim = len(params_n)
self.param_bak = params_n
self.ChangedParam = True
return
def FirstCycle(self, loss:float, scale:float) -> None:
"internal: first step is special"
self.InitialParam()
grads = [param.grad.data.view(-1) for param in self.__model.parameters()]
# params, grads = [], []
# for param in self.__model.parameters(): # TODO make this more torch like, see e.g. https://github.com/rahulkidambi/AccSGD/blob/master/AccSGD.py
# params.append(param.data.view(-1)) # why we get back params here (only init)
# grads.append(param.grad.data.view(-1))
# self.__model.zero_grad() # (still empty first run)
assert(scale > 0.0), "first step exploded"
# Calls our solver with x (params), function value (loss), gradient (grads)
params_n, _, _ = self.step_call(
self.param_bak, loss, torch.cat(grads) * scale)
if (params_n is not None):
# print("ParamCount = %d, loss = %.6f, dev=%s" % (len(params_n), loss, str(params_n.device))) # device=CPU
self.param_bak = params_n
self.ChangedParam = True
return
def BoosterUpdate(self, param: torch.Tensor) -> None:
"internal: average x-param"
if (self.ParaAvgTensor is not None): # (self.ParaAvgCount >= 1):
self.ParaAvgTensor += param
else:
self.ParaAvgTensor = param.clone() # clone needed
self.ParaAvgCount += 1
if (self.ParaAvgCount >= self.BoosterTarget): # > 3 and even
self.CalcBoostTensor()
return
def step(self, loss: float, scale: float|None = None) -> None:
"ELRA (C2M + P2M) step (usualy float16)"
# get X (params) and G (grads)
retrace: bool = False
if (self.elra_TFO_dim is not None): # normal cycle (faster)
# self.__optim_step.__name__ = "p2min_step"
if not (loss < 1e999): # 1e999=inf, check outside
print("NoGrad: loss=%.3e, x.norm=%.3g !" % (loss, self.param_bak.norm()))
params_n, _, retrace = self.step_call( # getattr(..) (
self.param_bak, loss, None )
else:
# grads = [] # only grad (reuse param)
grads = [param.grad.data.view(-1) for param in self.__model.parameters()]
if (scale is not None): # (scale != 1.0):
params_n, _, retrace = self.step_call( # self.__optim_instance.p2min_step(
self.param_bak, loss, torch.cat(grads) * scale)
else:
params_n, _, retrace = self.step_call(
self.param_bak, loss, torch.cat(grads) )
grads = None
else: # first cycle
self.FirstCycle(loss, 1.0 if (scale is None) else scale)
return
# self.__skip_next_grad = False # optional
self.ParaLossSum += loss
self.ParaLossCount += 1
        if (params_n is not None): # params_n None => collect-only step (no update)
setModelParameter(params_n, self.__model) # set __model.parameters
self.ChangedParam = True
self.EpochSteps += 1 # moving steps
if (retrace):
self.RetraceCount += 1
self.param_bak = params_n
# self.param_bak.copy_(params_n)
return
# (not retrace)
if self.CalcParaAvg is not None: # and (self.param_bak is not None): # Booster
self.BoosterUpdate(self.param_bak)
self.param_bak = params_n
# self.param_bak.copy_(params_n)
return # self.ParaAvgCount # BoosterFill
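    # step() call sketch with AMP/GradScaler (hypothetical caller code; passing
    # scale = 1/scaler.get_scale() to unscale the gradients is an assumption here):
    #   with torch.autocast('cuda', dtype=torch.float16):
    #       loss = loss_fn(model(x), y)
    #   scaler.scale(loss).backward()
    #   optim.step(loss.item(), scale=1.0/scaler.get_scale())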
def step_retrace(self, loss: float) -> None:
"ELRA (C2min + P2min) retrace step (no gradient)"
# return self.step(loss, None)
assert (self.elra_TFO_dim is not None), "normal cycle (not first)"
print("NoGrad: loss=%.3e, x.norm=%.3g !" % (loss, self.param_bak.norm()))
params_n, _, retrace = self.step_call( self.param_bak, loss, None )
        assert (params_n is not None), "retrace is not collect-only"
setModelParameter(params_n, self.__model) # set __model.parameters
self.EpochSteps += 1 # moving steps
        assert(retrace), "solver must report a retrace step here"
self.RetraceCount += 1
self.param_bak = params_n
self.ChangedParam = True
# self.param_bak.copy_(params_n)
return
    def step_noscale(self, loss: float, grad: torch.Tensor|None = None) -> None:
"ELRA (C2min + P2min) normal step (no GradScaler, scale=1, mainly float32)"
# return self.step(loss, 1.0)
retrace: bool = False
if (self.elra_TFO_dim is not None): # normal cycle (faster)
# self.__optim_step.__name__ = "p2min_step"
if not (loss < 1e999): # 1e999=inf, (self.__skip_next_grad)
return self.step_retrace(loss)
else:
if (grad is None):
# grads = [] # only grad (reuse param)
# for param in self.__model.parameters():
# grads.append(param.grad.data.view(-1))
grads = [param.grad.data.view(-1) for param in self.__model.parameters()]
params_n, _, retrace = self.step_call( # self.__optim_instance.p2min_step(
self.param_bak, loss, torch.cat(grads))
grads = None
else: # MultiGpu SMP only
params_n, _, retrace = self.step_call(self.param_bak, loss, grad )
else: # first cycle
self.FirstCycle(loss, 1.0)
return
# self.__skip_next_grad = False # optional
self.ParaLossSum += loss
self.ParaLossCount += 1
        if (params_n is not None): # params_n None => collect-only step (no update)
setModelParameter(params_n, self.__model) # set __model.parameters
self.ChangedParam = True
self.EpochSteps += 1 # moving steps
if (retrace):
self.RetraceCount += 1
self.param_bak = params_n
# self.param_bak.copy_(params_n)
return
# (not retrace)
if self.CalcParaAvg is not None: # and (self.param_bak is not None): # Booster
self.BoosterUpdate(self.param_bak)
self.param_bak = params_n
return
# class ElraOptimizer
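
if __name__ == "__main__":
    # Minimal usage sketch (assumption: lib_grad_solve is importable; the toy
    # model and random data below are illustrative, not part of ELRA itself).
    torch.manual_seed(0)
    model = torch.nn.Sequential(torch.nn.Linear(8, 16), torch.nn.ReLU(), torch.nn.Linear(16, 2))
    loss_fn = torch.nn.CrossEntropyLoss()
    optim = ElraOptimizer(model.parameters(), model, lr=1e-5, mode=ElraOptimizer.Mode.p2min)
    x = torch.randn(32, 8)
    y = torch.randint(0, 2, (32,))
    for i in range(10):
        model.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optim.step_noscale(loss.item()) # float32 path (no GradScaler)
        print("iter %d, loss %.4f" % (i, loss.item()))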