Skip to content

Commit

Permalink
fix config helper.
Browse files Browse the repository at this point in the history
  • Loading branch information
lcy-seso committed Aug 23, 2017
1 parent 25083de commit 5e59ca7
Show file tree
Hide file tree
Showing 7 changed files with 162 additions and 62 deletions.
10 changes: 10 additions & 0 deletions paddle/gserver/layers/CrossEntropyOverBeam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,17 @@ real CostForOneSequence::forward() {
}

void CostForOneSequence::backward() {
/*
* when softmax layer is the output layer, and it is combined with
* cross-entropy as cost. The derivate with regard to softmax's input
* is simply:
*
* grad_i = softmax_out_i - target_i,
*
* and here hard label is used.
*/
softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;

MatrixPtr tmp = Matrix::create(
softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);

Expand Down
16 changes: 11 additions & 5 deletions paddle/gserver/layers/CrossEntropyOverBeam.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ limitations under the License. */

namespace paddle {

/* This struct stores the beams in all search steps for a single sequence. */
struct BeamExpansion {
// store the entire beam expansion for a single sequence
std::vector<MatrixPtr> scores;
std::vector<IVectorPtr> seqInfo;

Expand Down Expand Up @@ -111,18 +111,24 @@ class CrossEntropyOverBeam : public Layer {
size_t batchSize_;
size_t beamSize_;

// Currently, this layer only works on CPU, if its inputs is on GPU,
// copy them to CPU memory.
/*
* the process of constructing beams is not friendly to GPU, currently, this
* layer only runs on CPU, if any of its inputs is on GPU memory, then copy
* it to CPU memory.
*/
std::vector<MatrixPtr> candidateScores_;
std::vector<MatrixPtr> candidateScoreGrad_;
std::vector<MatrixPtr> candidateInBeam_;
std::vector<MatrixPtr> gradToInputs_;
std::vector<IVectorPtr> goldSequence_;
std::vector<std::vector<int>> beamSplitPos_;

// split entire bath of beams into beam per sequnence.
/*
* split entire bath of beams into beam per sequnence and store the result
* into this member.
*/
std::vector<BeamExpansion> beamPerSeq_;
// beamCosts_ is used to propagate error in one sequence.
/* beamCosts_ is used to propagate error in one sequence. */
std::vector<CostForOneSequence> beamCosts_;
};

Expand Down
22 changes: 8 additions & 14 deletions paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,10 @@ using namespace paddle; // NOLINT
DECLARE_int32(gpu_id);
DECLARE_bool(thread_local_rand_use_global_seed);

// const size_t MAX_SEQ_NUM = 5;
// const size_t MAX_SEQ_LEN = 10;
// const size_t MAX_BEAM_SIZE = 3;

const size_t MAX_SEQ_NUM = 23;
const size_t MAX_SEQ_LEN = 50;
const size_t MAX_BEAM_SIZE = 27;

// const size_t SEED = 1503391792;
// const size_t SEED = 1;
const size_t SEED = (size_t)(time(NULL));

struct SingleBeamExpansion {
Expand Down Expand Up @@ -176,10 +170,12 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
beam.resetGroundTruth(seqNum);
for (size_t i = 0; i < seqNum; ++i) {
if (randFloat() > 0.5) {
// force the randomly generated label falls in the beam by chance 0.5.
// otherwise, when sequence length is relatively long and beam size is
// relatively small, the gold sequences falls off the beam at in
// the first search.
/*
* force the randomly generated label falls in the beam by chance 0.5.
* otherwise, when sequence length is relatively long and beam size is
* relatively small, the gold sequences falls off the beam at in the
* first search.
*/
real* begPos = beam.selectedIndices.data() + i * beamSize;
beam.colIdxInBeam[i] =
rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
Expand Down Expand Up @@ -222,9 +218,7 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,

if (randFloat() > 0.5) {
// force the randomly generated label falls in the beam by chance 0.5.
// otherwise, when sequence length is relatively long and beam size is
// relatively small, the gold sequences falls off the beam at in
// the first search.

real* start =
curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
int n = rand() % count_if(start, start + beamSize, [](const real& val) {
Expand Down Expand Up @@ -339,7 +333,7 @@ TEST(Layer, CrossEntropyOverBeam) {
const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
LOG(INFO) << "beamSize = " << beamSize;

// TODO(caoying): test with more beam expansions.
// TODO(caoying): test with random beam expansions.
const size_t expansionCount = 3;
vector<SingleBeamExpansion> beams;
genRandomBeamExpansion(expansionCount, beamSize, beams);
Expand Down
12 changes: 6 additions & 6 deletions python/paddle/trainer/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1605,16 +1605,16 @@ def __init__(self, name, inputs, softmax_selfnorm_alpha=0.1, **xargs):
@config_layer('cross_entropy_over_beam')
class CrossEntropyOverBeamLayer(LayerBase):
def __init__(self, name, inputs, **xargs):
config_assert(len(inputs) % 3 == 0, "Error input numbers.")
config_assert(len(inputs) % 3 == 0, "Error input number.")
super(CrossEntropyOverBeamLayer, self).__init__(
name, 'cross_entropy_over_beam', 0, inputs, **xargs)
input_num = len(inputs) / 3
for i in range(input_num):
input_layer = self.get_input_layer(i * 2)
config_assert(
input_layer.size == 1, "Inputs for this layer are made up of "
"several pairs and the first one in a pair is scores for "
"all the candidates, so its size should be equal to 1.")
input_layer = self.get_input_layer(i * 3)
config_assert(input_layer.size == 1, (
"Inputs for this layer are made up of "
"several triples, in which the first one is scores over "
"all candidate paths, whose size should be equal to 1."))


@config_layer('fc')
Expand Down
129 changes: 107 additions & 22 deletions python/paddle/trainer_config_helpers/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
'nce_layer',
'cross_entropy_with_selfnorm',
'cross_entropy',
'BeamInput',
'cross_entropy_over_beam',
'multi_binary_label_cross_entropy',
'sum_cost',
Expand Down Expand Up @@ -5681,10 +5682,10 @@ def multi_binary_label_cross_entropy(input,

if input.activation is None or \
not isinstance(input.activation, SigmoidActivation):
logger.log(
logging.WARN,
"%s is not recommend for multi_binary_label_cross_entropy's activation, "
"maybe the sigmoid is better" % repr(input.activation))
logger.log(logging.WARN,
("%s is not a recommended activation for "
"multi_binary_label_cross_entropy, sigmoid is better") %
repr(input.activation))

Layer(
name=name,
Expand All @@ -5699,26 +5700,110 @@ def multi_binary_label_cross_entropy(input,
size=1)


class BeamInput(object):
"""
Define the input for cross_entropy_over_beam layer.
A beam is made up of a triple: the first one is scores over all
candidates; the second one is indices of top k selected candidates; the
third one is the index of ground truth, which is also always called
gold.
"""

def __init__(self, candidate_scores, selected_candidates, gold):
assert isinstance(candidate_scores, LayerOutput)
self.candidate_scores = candidate_scores
assert candidate_scores.size == 1

assert isinstance(selected_candidates, LayerOutput)
self.selected_candidates = selected_candidates

assert isinstance(gold, LayerOutput)
self.gold = gold


@wrap_name_default()
@layer_support()
def cross_entropy_over_beam(input, label, name=None, coeff=1.0, weight=None):
"""
TODO(caoying) add comments.
def cross_entropy_over_beam(input, name=None):
"""
This layer is used in learning to search models, which is to solve complex
joint prediction problems based on learning to search through a
problem-defined search space.
assert len(input) / 2 == len(label), "Error input numbers."
for i in range(0, len(input), 2):
assert (input[i].size == 1), (
"Inputs for this layer are made up of "
"several pairs and the first one in a pair is scores for "
"all the candidates, so its size should be equal to 1.")
Specifically, the learning to search process for this layer begins with
searching a target sequence from a nested sequence. In the first search
step, top beam size sequences with highest scores, indices of these top k
sequences in the original nested sequence, and the ground truth (also
called gold) altogether (a triple) make up of the first beam.
ipts, parents = __cost_input__(input, label, weight)
Layer(
name=name,
type=LayerType.CROSS_ENTROPY_OVER_BEAM,
inputs=ipts,
coeff=coeff)
Then, several special positions, for example, start and end positions
that define meaningful segments are searched. In these searches, top k
positions with highest scores are selected, and then sequence, starting
from the selected starts till ends of the sequences (or a fixed position)
are taken to search next.
We call the possible top k results returned in one search the beam. This
search process can be repeated for pre-defined turns and leads to several
beam expansions.
Finally, the layer cross_entropy_over_beam takes all the beam expansions
which contain several candidate targets found along the multi-step search.
cross_entropy_over_beam calculates cross entropy over the expanded beams
which all the candidates in the beam as the normalized factor.
Note that, if gold falls off the beam at search step t, then the cost is
calculated over the beam at step t.
This cost layer always works together with kmax_sequence_score_layer,
sub_nested_seq_layer, and sequence_slice_layer to trim the input to form a
sub-search space.
The example usage is:
.. code-block:: python
cost = cross_entropy_over_beam(input=[
BeamInput(
candidate_scores=beam1_candidates,
selected_candidates=beam1_topk,
gold=gold1),
BeamInput(
candidate_scores=beam2_candidates,
selected_candidates=beam2_topk,
gold=gold2),
])
:param input: input beams for this layer.
:type input: BeamInput
:param name: input beams for this layer.
:type name: basestring
:return: LayerOutput object.
:rtype: LayerOutput
"""

if isinstance(input, BeamInput):
input = [input]
else:
assert isinstance(input, list), (
'input for cross_entropy_over_beam shold be a python list '
'of BeamInput object.')
for ipt in input:
assert isinstance(ipt, BeamInput), (
'input for cross_entropy_over_beam '
'should be a BeamInput object.')

ipts = []
parents = []
for beam in input:
parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
ipts += [
beam.candidate_scores.name, beam.selected_candidates.name,
beam.gold.name
]

Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)


Expand Down Expand Up @@ -6247,11 +6332,11 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
@wrap_bias_attr_default()
def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
"""
A layer applies a linear transformation to each element in each row of
the input matrix. For each element, the layer first re-scale it and then
A layer applies a linear transformation to each element in each row of
the input matrix. For each element, the layer first re-scale it and then
adds a bias to it.
This layer is very like the SlopeInterceptLayer, except the scale and
This layer is very like the SlopeInterceptLayer, except the scale and
bias are trainable.
.. math::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,27 +114,26 @@ layers {
input_layer_name: "__kmax_sequence_score_layer_0__"
}
inputs {
input_layer_name: "__fc_layer_0__"
input_layer_name: "sentences_ids"
}
inputs {
input_layer_name: "__kmax_sequence_score_layer_1__"
input_layer_name: "__fc_layer_0__"
}
inputs {
input_layer_name: "__fc_layer_1__"
input_layer_name: "__kmax_sequence_score_layer_1__"
}
inputs {
input_layer_name: "__kmax_sequence_score_layer_2__"
input_layer_name: "start_ids"
}
inputs {
input_layer_name: "sentences_ids"
input_layer_name: "__fc_layer_1__"
}
inputs {
input_layer_name: "start_ids"
input_layer_name: "__kmax_sequence_score_layer_2__"
}
inputs {
input_layer_name: "end_ids"
}
coeff: 1.0
}
parameters {
name: "___fc_layer_0__.w0"
Expand Down Expand Up @@ -177,8 +176,8 @@ parameters {
initial_smart: false
}
input_layer_names: "sentence_scores"
input_layer_names: "sentence_states"
input_layer_names: "sentences_ids"
input_layer_names: "sentence_states"
input_layer_names: "start_ids"
input_layer_names: "end_ids"
output_layer_names: "__cross_entropy_over_beam_0__"
Expand All @@ -198,8 +197,8 @@ sub_models {
layer_names: "end_ids"
layer_names: "__cross_entropy_over_beam_0__"
input_layer_names: "sentence_scores"
input_layer_names: "sentence_states"
input_layer_names: "sentences_ids"
input_layer_names: "sentence_states"
input_layer_names: "start_ids"
input_layer_names: "end_ids"
output_layer_names: "__cross_entropy_over_beam_0__"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,17 @@
sentence_idx = data_layer(name="sentences_ids", size=1)
start_idx = data_layer(name="start_ids", size=1)
end_idx = data_layer(name="end_ids", size=1)
cost = cross_entropy_over_beam(
input=[
sentence_scores, topk_sentence_ids, start_pos_scores,
topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
],
label=[sentence_idx, start_idx, end_idx])
cost = cross_entropy_over_beam(input=[
BeamInput(
candidate_scores=sentence_scores,
selected_candidates=topk_sentence_ids,
gold=sentence_idx), BeamInput(
candidate_scores=start_pos_scores,
selected_candidates=topk_start_pos_ids,
gold=start_idx), BeamInput(
candidate_scores=end_pos_scores,
selected_candidates=topk_end_pos_ids,
gold=end_idx)
])

outputs(cost)

0 comments on commit 5e59ca7

Please sign in to comment.