update icezee model param
mieskolainen committed Jul 22, 2024
1 parent e36c840 commit 90bb5ad
Showing 2 changed files with 51 additions and 37 deletions.
61 changes: 30 additions & 31 deletions configs/zee/models.yml
@@ -77,9 +77,9 @@ xgb0:
tree_method: 'hist'
device: 'auto' # 'auto', 'cpu', 'cuda'

learning_rate: 0.05
learning_rate: 0.08
gamma: 1.5
max_depth: 15
max_depth: 13
min_child_weight: 1.0
max_delta_step: 1.0
subsample: 1
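The updated xgb0 hyperparameters above (learning_rate 0.05 -> 0.08, max_depth 15 -> 13) are standard XGBoost booster parameters. A hedged sketch of how they could be passed to XGBoost's native API follows; the objective, the placeholder data, and the resolution of device: 'auto' are assumptions here, not icenet's actual training loop.

import numpy as np
import xgboost as xgb

# placeholder data, only to make the sketch runnable
X = np.random.rand(1000, 10)
y = np.random.randint(0, 2, size=1000)
dtrain = xgb.DMatrix(X, label=y)

params = {
    'objective':        'binary:logistic',  # assumed; set elsewhere in the icenet config
    'tree_method':      'hist',
    'device':           'cpu',               # the config's 'auto' is presumably resolved upstream (XGBoost >= 2.0)
    'learning_rate':    0.08,                # was 0.05
    'gamma':            1.5,
    'max_depth':        13,                  # was 15
    'min_child_weight': 1.0,
    'max_delta_step':   1.0,
    'subsample':        1,
}
booster = xgb.train(params, dtrain, num_boost_round=100)

Read together, the change trades per-tree depth for a larger step size per boosting round.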
@@ -123,9 +123,9 @@ iceboost0: &ICEBOOST0
tree_method: 'hist'
device: 'auto' # 'auto', 'cpu', 'cuda'

learning_rate: 0.05
learning_rate: 0.08
gamma: 1.5
max_depth: 15
max_depth: 13
min_child_weight: 1.0
max_delta_step: 1.0
subsample: 1
@@ -154,8 +154,8 @@ iceboost0: &ICEBOOST0
plot_trees: false

# Read/Write of epochs
evalmode: 10 # Evaluation and saving of the model every n-th epoch (int) during training
readmode: -1 # -1 takes the minimum loss model
evalmode: 10 # Evaluation and saving of the model every n-th epoch (int) during training
readmode: -1 # -1 takes the minimum loss model
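A toy illustration of the readmode convention described in the comments (hypothetical variable names; the actual checkpoint bookkeeping lives in icenet): a non-negative value picks that saved epoch, while -1 picks the checkpoint with the lowest recorded loss.

losses = {10: 0.52, 20: 0.47, 30: 0.49}   # epoch -> evaluated loss, saved every `evalmode`-th epoch (toy values)
readmode = -1
best_epoch = min(losses, key=losses.get) if readmode == -1 else readmode
print(best_epoch)   # -> 20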


# ICEBOOST with custom loss [BCE + Sliced Wasserstein]
@@ -172,15 +172,15 @@ iceboost_swd:
beta: 1.0
classes: [0,1]
#set_filter: *MAIN_DOMAIN_FILTER # Comment out for 'inclusive'
label_eps: 0.0 # label smoothing epsilon (regularization)
label_eps: 0.0 # label smoothing epsilon (regularization)

# Sliced Wasserstein distance [use with custom:binary_cross_entropy and custom:sliced_wasserstein]
SWD_param:
beta: 0.2
p: 1 # p-norm (1,2, ...)
num_slices: 4000 # Number of MC projections (Higher the better)
mode: 'EBSW'
max_N: 30000 # Max events limit (30k & 4000 slices works with 32 GB Nvidia V100)
beta: 0.01
p: 1 # p-norm (1,2, ...)
num_slices: 500 # Number of MC projections (Higher the better)
mode: 'SWD' # 'SWD' (basic), 'EBSW' (see icefit/transport.py)
max_N: 500000 # Max events limit (500k & 500 slices works with 32 GB Nvidia V100)
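The retuned SWD_param block switches from the 'EBSW' estimator to the basic 'SWD' mode with fewer slices, a smaller beta, and a larger event cap. Below is a minimal sketch of the basic sliced Wasserstein distance these knobs control, assuming two equally sized, equally weighted samples; the actual implementation (the 'EBSW' variant, per-event weights, and the max_N cap) is in icefit/transport.py.

import torch

def sliced_wasserstein(x0, x1, p=1, num_slices=500):
    """Monte Carlo sliced p-Wasserstein distance between two same-size samples."""
    D = x0.shape[1]
    theta = torch.randn(num_slices, D)
    theta = theta / theta.norm(dim=1, keepdim=True)    # random unit projection directions
    proj0 = torch.sort(x0 @ theta.T, dim=0).values     # sorted 1D projections, shape (N, num_slices)
    proj1 = torch.sort(x1 @ theta.T, dim=0).values
    return (proj0 - proj1).abs().pow(p).mean().pow(1.0 / p)

x0 = torch.randn(1024, 3)         # e.g. sample from class 0
x1 = torch.randn(1024, 3) + 0.5   # e.g. sample from class 1, shifted
print(sliced_wasserstein(x0, x1, p=1, num_slices=500))

More slices reduce the Monte Carlo error of the projection average, but the (max_N x num_slices) projection matrix has to fit in GPU memory, which is the trade-off the max_N comment refers to.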


# ICEBOOST with an additional re-weighting in-the-loop regularization
@@ -242,14 +242,14 @@ lzmlp0: &LZMLP

# Optimization
opt_param:
#lossfunc: 'binary_cross_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
#lossfunc: 'binary_Lq_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
#q: 0.8 # Lq exponent (q < 1 -> high density vals emphasized, q > 1 then low emphasized)
lossfunc: 'binary_cross_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
#lossfunc: 'binary_Lq_entropy'
#q: 0.8 # Lq exponent (q < 1 -> high density vals emphasized, q > 1 then low emphasized)

lossfunc: 'SWD' # Sliced Wasserstein
SWD_beta: 0.01 # Sliced Wasserstein [reweighting regularization]
SWD_p: 1 # p-norm (1,2,..), 1 perhaps more robust
SWD_num_slices: 10000 # Number of MC projections (higher the better)
SWD_mode: 'EBSW'
SWD_num_slices: 1000 # Number of MC projections (higher the better)
SWD_mode: 'SWD' # 'SWD' (basic), 'EBSW' (see icefit/transport.py)

lipschitz_beta: 5.0e-5 # lipschitz regularization (use with 'lzmlp')
#logit_L1_beta: 1.0e-2 # logit norm reg. ~ beta * torch.sum(|logits|)
@@ -262,7 +262,7 @@ lzmlp0: &LZMLP
clip_norm: 1.0

epochs: 300
batch_size: 16384
batch_size: 8096
lr: 5.0e-4
weight_decay: 1.0e-4 # L2-regularization
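A hedged sketch of how the optimization values above (lr, weight_decay, clip_norm) map to PyTorch; the optimizer choice (AdamW here) and the placeholder model are assumptions, and the real training loop lives in icenet/deep.

import torch

model = torch.nn.Linear(4, 1)                                    # placeholder model
x, y = torch.randn(8, 4), torch.randint(0, 2, (8, 1)).float()    # placeholder batch

optimizer = torch.optim.AdamW(model.parameters(), lr=5.0e-4, weight_decay=1.0e-4)

loss = torch.nn.functional.binary_cross_entropy_with_logits(model(x), y)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clip_norm: 1.0
optimizer.step()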

@@ -334,10 +334,10 @@ fastkan0: &FASTKAN
#lossfunc: 'binary_Lq_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
#q: 0.8 # Lq exponent (q < 1 -> high density vals emphasized, q > 1 then low emphasized)

#lossfunc: 'SWD' # Sliced Wasserstein
#SWD_p: 1 # p-norm (1,2,..), 1 perhaps more robust
#SWD_num_slices: 10000 # Number of MC projections (higher the better)
#SWD_mode: 'EBSW'
SWD_beta: 0.01 # Sliced Wasserstein [reweighting regularization]
SWD_p: 1 # p-norm (1,2,..), 1 perhaps more robust
SWD_num_slices: 1000 # Number of MC projections (higher the better)
SWD_mode: 'SWD' # 'SWD' (basic), 'EBSW' (see icefit/transport.py)

#lipschitz_beta: 1.0e-4 # Lipschitz regularization (use with 'lzmlp')
#logit_L1_beta: 1.0e-2 # logit norm reg. ~ beta * torch.sum(|logits|)
@@ -419,18 +419,17 @@ dmlp0: &DMLP
skip_connections: False
last_tanh: True # Extra tanh layer
last_tanh_scale: 10.0 # Scale after tanh()

# Optimization
opt_param:
#lossfunc: 'binary_cross_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy

#lossfunc: 'binary_Lq_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
lossfunc: 'binary_cross_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
#lossfunc: 'binary_Lq_entropy'
#q: 0.8 # Lq exponent (q < 1 -> high density vals emphasized, q > 1 then low emphasized)

lossfunc: 'SWD' # Sliced Wasserstein
SWD_beta: 0.01 # Sliced Wasserstein [reweighting regularization]
SWD_p: 1 # p-norm (1,2,..), 1 perhaps more robust
SWD_num_slices: 10000 # Number of MC projections (higher the better)
SWD_mode: 'EBSW'
SWD_num_slices: 1000 # Number of MC projections (higher the better)
SWD_mode: 'SWD' # 'SWD' (basic), 'EBSW' (see icefit/transport.py)

#logit_L1_beta: 1.0e-2 # logit norm reg. ~ lambda * torch.sum(|logits|)
logit_L2_beta: 5.0e-3 # logit norm reg. ~ lambda * torch.sum(logits**2)
@@ -442,7 +441,7 @@ dmlp0: &DMLP
clip_norm: 1.0

epochs: 300
batch_size: 16384
batch_size: 8096
lr: 5.0e-4
weight_decay: 1.0e-4 # L2-regularization

27 changes: 21 additions & 6 deletions icenet/deep/losstools.py
@@ -103,6 +103,21 @@ def loss_wrapper(model, x, y, num_classes, weights, param, y_DA=None, w_DA=None,
weights = None # TBD. Could re-compute a new set of edge weights
# --------------------------------------------

def SWD_helper(logits):
"""
Sliced Wasserstein reweight regularization
"""
if 'SWD_beta' in param and param['SWD_beta'] > 0:

beta = param['SWD_beta']
value = beta * SWD_reweight_loss(logits=logits, x=x, y=y, weights=weights,
p=param['SWD_p'], num_slices=param['SWD_num_slices'],
mode=param['SWD_mode'])

return {f'SWD x $\\beta = {beta}$': value}
else:
return {}
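# --- Illustration only (not part of losstools.py) ---------------------------
# SWD_helper reads SWD_beta, SWD_p, SWD_num_slices and SWD_mode from the model's
# opt_param block (e.g. 0.01 / 1 / 1000 / 'SWD' for lzmlp0, fastkan0 and dmlp0 in
# configs/zee/models.yml) and returns one extra named loss term. The named terms
# are presumably reduced to a single scalar downstream, roughly like this toy sketch:
example_terms = {'BCE': torch.tensor(0.70, requires_grad=True),          # toy values
                 'SWD x $\\beta = 0.01$': torch.tensor(0.02, requires_grad=True)}
example_total = sum(example_terms.values())   # single scalar objective for backprop
example_total.backward()
# -----------------------------------------------------------------------------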

def MI_helper(output):
"""
Mutual Information regularization
@@ -159,21 +174,21 @@ def LM_helper(logits):
logits = model.forward(x)
loss = BCE_loss(logits=logits, y=y, weights=weights)

loss = {'BCE': loss, **LZ_helper(), **LM_helper(logits), **MI_helper(torch.sigmoid(logits))}
loss = {'BCE': loss, **SWD_helper(logits), **LZ_helper(), **LM_helper(logits), **MI_helper(torch.sigmoid(logits))}

elif param['lossfunc'] == 'binary_focal_entropy':

logits = model.forward(x)
loss = binary_focal_loss(logits=logits, y=y, gamma=param['gamma'], weights=weights)

loss = {f"FE ($\\gamma = {param['gamma']}$)": loss, **LZ_helper(), **LM_helper(logits), **MI_helper(torch.sigmoid(logits))}
loss = {f"FE ($\\gamma = {param['gamma']}$)": loss, **SWD_helper(logits), **LZ_helper(), **LM_helper(logits), **MI_helper(torch.sigmoid(logits))}

elif param['lossfunc'] == 'binary_Lq_entropy':

logits = model.forward(x)
loss = Lq_binary_loss(logits=logits, y=y, q=param['q'], weights=weights)

loss = {f"LQ ($\\gamma = {param['q']}$)": loss, **LZ_helper(), **LM_helper(logits), **MI_helper(torch.sigmoid(logits))}
loss = {f"LQ ($\\gamma = {param['q']}$)": loss, **SWD_helper(logits), **LZ_helper(), **LM_helper(logits), **MI_helper(torch.sigmoid(logits))}

elif param['lossfunc'] == 'SWD':

@@ -192,22 +207,22 @@ def LM_helper(logits):
y_hat = model.forward(x)
loss = MSE_loss(y_hat=y_hat, y=y, weights=weights)

loss = {'MSE': loss, **LZ_helper(), **LM_helper(y_hat), **MI_helper(y_hat)}
loss = {'MSE': loss, **SWD_helper(y_hat), **LZ_helper(), **LM_helper(y_hat), **MI_helper(y_hat)}

elif param['lossfunc'] == 'MSE_prob':

logits = model.forward(x)
y_hat = torch.sigmoid(logits)
loss = MSE_loss(y_hat=y_hat, y=y, weights=weights)

loss = {'MSE': loss, **LZ_helper(), **LM_helper(logits), **MI_helper(y_hat)}
loss = {'MSE': loss, **SWD_helper(logits), **LZ_helper(), **LM_helper(logits), **MI_helper(y_hat)}

elif param['lossfunc'] == 'MAE':

y_hat = model.forward(x)
loss = MSE_loss(y_hat=y_hat, y=y, weights=weights)

loss = {'MAE': loss, **LZ_helper(), **LM_helper(y_hat), **MI_helper(y_hat)}
loss = {'MAE': loss, **SWD_helper(y_hat), **LZ_helper(), **LM_helper(y_hat), **MI_helper(y_hat)}

elif param['lossfunc'] == 'cross_entropy':
"""
