Merge pull request #7213 from JiayiFeng/dev_add_callback_for_backward
Error Clip Design Doc
JiayiFeng authored Jan 9, 2018
2 parents 1dad4bb + 8ab59da commit 6ece41e
Showing 5 changed files with 157 additions and 5 deletions.
87 changes: 87 additions & 0 deletions doc/design/error_clip.md
@@ -0,0 +1,87 @@
# Error Clip

## Overview

Error clip is widely used in model training to prevent gradient explosion. It applies specific rules to adjust variables' gradients and keep them from becoming too large. With error clip, the values of a gradient are checked before they are consumed by the next `grad_op` and are shrunk if necessary.

## Usage

Users are allowed to assign different error clip methods or attributes to different `Variable`s, specified through the `error_clip` parameter of `Variable`'s constructor:

```python
var = framework.Variable(..., error_clip=myErrorClip, ...)
```

The default value of `error_clip` is `None`, which means no error clip is employed. When it is not `None`, it should be an instance of a class derived from `BaseErrorClipAttr`. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is:

```python
ErrorClipByValue(max, min=None)
```

`max` and `min` represent the maximal and minimal clip thresholds, respectively. In the backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min`, respectively. When `min` is `None`, the minimal threshold is automatically set to `-max`.

So we can enable the error clip with threshold `[-5.0, 5.0]` for variable `var` by:

```python
var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
```
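
To make the clipping rule concrete, here is a minimal, framework-free sketch of what the appended clip operation computes elementwise. NumPy is used purely for illustration, and the names `clip_gradient` and `grad` are made up for this example; the real clipping is performed by a `clip_op` in the backward pass.

```python
import numpy as np

def clip_gradient(grad, max, min=None):
    # Elementwise rule used by error clip: values above `max` become `max`,
    # values below `min` become `min`; `min` defaults to -max.
    if min is None:
        min = -max
    return np.clip(grad, min, max)

# A gradient with exploding entries is shrunk into [-5.0, 5.0]:
grad = np.array([0.3, -7.2, 12.5, 4.9])
print(clip_gradient(grad, max=5.0))  # [ 0.3 -5.   5.   4.9]
```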

## Implementation

The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*.

```python
class BaseErrorClipAttr(object):
    def append_clip_op(self, block, grad_name):
        raise NotImplementedError()


class ErrorClipByValue(BaseErrorClipAttr):
    def __init__(self, max, min=None):
        max = float(max)
        if min is None:
            min = -max
        else:
            min = float(min)
        self.max = max
        self.min = min

    def append_clip_op(self, block, grad_name):
        block.append_op(
            type="clip",
            inputs={"X": grad_name},
            outputs={"Out": grad_name},
            attrs={"min": self.min,
                   "max": self.max})
```

`BaseErrorClipAttr` has one main member function: `append_clip_op(self, block, grad_name)`.

This function creates a `clip_op` and appends it to the end of the given `block`. Because different error clip algorithms require different `clip_op`s, the function is left unimplemented in the base class, and all derived classes must implement their own versions of it.
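
As an illustration of this extension point only, a new clipping rule could be plugged in by deriving another class that emits a different operator. The sketch below is hypothetical: it assumes an operator named `clip_by_norm` with a `max_norm` attribute exists; the actual operator name and attributes must match what the framework provides.

```python
class ErrorClipByNorm(BaseErrorClipAttr):
    # Hypothetical example; "clip_by_norm" and "max_norm" are assumptions,
    # shown only to illustrate how append_clip_op is meant to be overridden.
    def __init__(self, max_norm):
        self.max_norm = float(max_norm)

    def append_clip_op(self, block, grad_name):
        # Rewrite the gradient in place with a norm-based clip op.
        block.append_op(
            type="clip_by_norm",
            inputs={"X": grad_name},
            outputs={"Out": grad_name},
            attrs={"max_norm": self.max_norm})
```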

These `clip_op`s should be inserted right after the `grad_op`s whose output gradients need to be clipped. This is equivalent to appending the needed `clip_op`s to the end of the target block every time a new `grad_op` is added:

```python
for op_desc in grad_op_descs:
    new_op_desc = target_block.desc.append_op()
    new_op_desc.copy_from(op_desc)
    callback(block=target_block, context=grad_to_var)
```

Here we employ a callback function to do this job. In the `_append_backward_ops_` function, each time a `grad_op` is appended to the `target_block`, the callback function is invoked. The logic of appending `clip_op`s can then be implemented inside the callback function.
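
The hook itself is independent of any concrete clipping logic. The following toy, framework-free sketch (all names invented for illustration) shows the contract: any callable taking `(block, context)` is invoked right after each newly appended `grad_op` and may append extra ops of its own.

```python
def append_ops_with_callback(grad_ops, target_block, grad_to_var, callback=None):
    # Mimics the loop above: append each grad op, then let the callback
    # append whatever follow-up ops (e.g. clip ops) it needs.
    if callback is None:
        callback = lambda block, context: None  # default: do nothing
    for op in grad_ops:
        target_block.append(op)
        callback(block=target_block, context=grad_to_var)
    return target_block

def clip_callback(block, context):
    # Pretend the just-appended op's output needs clipping.
    block.append(("clip", block[-1]))

print(append_ops_with_callback(["mul_grad", "add_grad"], [], {}, clip_callback))
# ['mul_grad', ('clip', 'mul_grad'), 'add_grad', ('clip', 'add_grad')]
```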

The callback function for `clip_op` appending is defined in *clip.py*:

```python
def error_clip_callback(block, context):
    # the context is a grad_to_var map
    grad_to_var = context
    op_desc = block.desc.op(block.desc.op_size() - 1)
    for grad_n in filter(lambda n: grad_to_var.has_key(n),
                         op_desc.output_arg_names()):
        fwd_var = block.var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if error_clip is not None:
            error_clip.append_clip_op(block, grad_n)
```

This function takes a `block` and a `context` (which is actually a `grad_to_var` map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op`, and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an `error_clip` attribute, `error_clip_callback` will call that attribute's `append_clip_op` function to append the required `clip_op` to the `block`.
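
Putting the pieces together, a hedged end-to-end sketch might look as follows. The layer calls and the way `error_clip` is attached here are assumptions about the fluid Python API of that time (layers do not expose an `error_clip` argument, so the attribute is set on the returned `Variable` directly); only `ErrorClipByValue`, `error_clip_callback`, and the `callback` parameter of `append_backward` come from this change.

```python
import paddle.v2.fluid as fluid
from paddle.v2.fluid.backward import append_backward
from paddle.v2.fluid.clip import ErrorClipByValue, error_clip_callback

x = fluid.layers.data(name="x", shape=[13], dtype="float32")
y = fluid.layers.data(name="y", shape=[1], dtype="float32")

predict = fluid.layers.fc(input=x, size=1)
# Attach the error clip attribute (assumption: set directly on the Variable);
# gradients flowing back through `predict` will be clipped into [-5.0, 5.0].
predict.error_clip = ErrorClipByValue(max=5.0)

cost = fluid.layers.square_error_cost(input=predict, label=y)
loss = fluid.layers.mean(x=cost)

# Either pass the callback to append_backward directly...
params_grads = append_backward(loss, callback=error_clip_callback)
# ...or use an optimizer: Optimizer.minimize() already forwards
# error_clip_callback to append_backward (see the optimizer.py diff below).
```
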
15 changes: 13 additions & 2 deletions python/paddle/v2/fluid/backward.py
@@ -188,7 +188,17 @@ def _append_backward_ops_(target,
        grad_to_var(dict)(output argument):
            key(str): grad variable name
            val(str): corresponding forward variable name
        callback(callable object): a callable object used to decorate new generated grad ops
    """
    if callback is None:

        def empty_callback(block, context):
            pass

        callback = empty_callback
    elif not hasattr(callback, '__call__'):
        raise ValueError("'callback' must be a callable object.")

    # grad_op_descs holds created grad_op, and will be appended to target_block
    grad_op_descs = []
    program = block.program
@@ -226,6 +236,7 @@ def _append_backward_ops_(target,
    for op_desc in grad_op_descs:
        new_op_desc = target_block.desc.append_op()
        new_op_desc.copy_from(op_desc)
        callback(block=target_block, context=grad_to_var)


def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
@@ -268,7 +279,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
_infer_var_data_type_(arg, block)


def append_backward(loss, parameter_list=None, no_grad_set=None):
def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
"""
Append backward part to main_program
@@ -312,7 +323,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None):
    grad_to_var = dict()

    _append_backward_ops_(loss, root_block, root_block, no_grad_dict,
                          grad_to_var)
                          grad_to_var, callback)
    _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)

    program.current_block_idx = current_block_idx
41 changes: 40 additions & 1 deletion python/paddle/v2/fluid/clip.py
@@ -1,7 +1,46 @@
import functools
import layers
from . import core

__all__ = ['GradientClipByValue', 'append_gradient_clip_ops']
__all__ = [
    'GradientClipByValue', 'append_gradient_clip_ops', 'error_clip_callback'
]


class BaseErrorClipAttr(object):
    def append_clip_op(self, block, grad_name):
        raise NotImplementedError()


class ErrorClipByValue(BaseErrorClipAttr):
    def __init__(self, max, min=None):
        max = float(max)
        if min is None:
            min = -max
        else:
            min = float(min)
        self.max = max
        self.min = min

    def append_clip_op(self, block, grad_name):
        block.append_op(
            type="clip",
            inputs={"X": grad_name},
            outputs={"Out": grad_name},
            attrs={"min": self.min,
                   "max": self.max})


def error_clip_callback(block, context):
    # the context is a grad_to_var map
    grad_to_var = context
    op_desc = block.desc.op(block.desc.op_size() - 1)
    for grad_n in filter(lambda n: grad_to_var.has_key(n),
                         op_desc.output_arg_names()):
        fwd_var = block.var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if error_clip is not None:
            error_clip.append_clip_op(block, grad_n)


class BaseGradientClipAttr(object):
14 changes: 14 additions & 0 deletions python/paddle/v2/fluid/framework.py
@@ -143,9 +143,11 @@ def __init__(self,
                 dtype=None,
                 lod_level=None,
                 persistable=None,
                 error_clip=None,
                 stop_gradient=False,
                 **kwargs):
        self.block = block
        self.error_clip = error_clip

        if name is None:
            name = Variable._unique_var_name_()
@@ -622,6 +624,17 @@ def var(self, name):
raise ValueError("var %s not in this block" % name)
return v

def var_recursive(self, name):
if self.has_var(name):
return self.var(name)
else:
if self.idx == 0:
raise ValueError("var %s is not in block(%d) nor its parents." %
name, self.idx)
else:
parent_block = self.program.block(self.parent_idx)
return parent_block.var_recursive(name)

def all_parameters(self):
return list(self.iter_parameters())

@@ -740,6 +753,7 @@ def copy_param_info_from(self, other):
                optimize_attr=p.optimize_attr,
                regularizer=p.regularizer,
                clip_attr=p.clip_attr,
                error_clip=p.error_clip,
                name=v.name)
            self.vars[new_p.name] = new_p

5 changes: 3 additions & 2 deletions python/paddle/v2/fluid/optimizer.py
@@ -6,7 +6,7 @@
from initializer import Constant
from layer_helper import LayerHelper
from regularizer import append_regularization_ops
from clip import append_gradient_clip_ops
from clip import append_gradient_clip_ops, error_clip_callback

__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']

@@ -197,7 +197,8 @@ def minimize(self,
        This method combines interface `append_backward()` and
        `create_optimization_pass()` into one.
        """
        params_grads = append_backward(loss, parameter_list, no_grad_set)
        params_grads = append_backward(loss, parameter_list, no_grad_set,
                                       error_clip_callback)

        params_grads = append_gradient_clip_ops(params_grads)

