Skip to content

Commit

Permalink
Merge pull request #620 from hal3/sharedDataBug
Browse files Browse the repository at this point in the history
fixed shared data/csoaa bugs and static analysis bug
  • Loading branch information
JohnLangford committed May 11, 2015
2 parents 9511aad + 405de85 commit bb68807
Show file tree
Hide file tree
Showing 10 changed files with 111 additions and 71 deletions.
2 changes: 1 addition & 1 deletion test/RunTests
Original file line number Diff line number Diff line change
Expand Up @@ -1052,7 +1052,7 @@ __DATA__
train-sets/ref/search_er.stderr
# Test 66: Train a dependency parser with search (dagger) on wsj_small.dparser.vw.gz for 6 passes
{VW} -k -c -d train-sets/wsj_small.dparser.vw.gz --passes 2 --search_task dep_parser --search 3 --search_alpha 1e-4 --search_rollout oracle --holdout_off
{VW} -k -c -d train-sets/wsj_small.dparser.vw.gz --passes 6 --search_task dep_parser --search 12 --search_alpha 1e-4 --search_rollout oracle --holdout_off
train-sets/ref/search_dep_parser.stderr
# Test 67: classification with data from dictionaries (eg embeddings or gazetteers) -- note that this is impossible without dictionaries because --ignore w
Expand Down
13 changes: 7 additions & 6 deletions test/train-sets/ref/search_dep_parser.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,15 @@ num sources = 1
average since instance current true current predicted cur cur predic cache examples
loss last counter output prefix output prefix pass pol made hits gener beta
88.000000 88.000000 1 [43:1 5:2 5:2 5:2 1..] [0:8 1:1 2:1 3:1 4:..] 0 0 144 0 144 0.014199
47.500000 7.000000 2 [2:2 3:5 0:8 3:7 3:4 ] [2:2 0:8 2:5 2:3 2:4 ] 0 0 157 0 156 0.015381
47.500000 7.000000 2 [2:2 3:5 0:8 3:7 3:4 ] [2:2 5:2 2:4 2:4 0:8 ] 0 0 157 0 156 0.015381
38.250000 29.000000 4 [4:2 4:2 4:2 7:5 6:..] [2:2 0:8 2:4 2:1 4:..] 0 0 248 0 246 0.024204
28.125000 18.000000 8 [4:2 4:2 4:2 5:5 0:..] [3:2 3:2 4:2 5:5 0:..] 1 0 551 0 543 0.052760
29.375000 20.500000 8 [4:2 4:2 4:2 5:5 0:..] [4:2 3:2 4:2 5:5 0:..] 1 0 551 0 543 0.052760
19.812500 10.250000 16 [43:1 5:2 5:2 5:2 1..] [30:1 5:2 5:2 5:2 1..] 3 0 1187 0 1134 0.107122

finished run
number of examples per pass = 5
passes used = 2
weighted example sum = 10
passes used = 6
weighted example sum = 30
weighted label sum = 0
average loss = 23.9
total feature number = 275880
average loss = 10.5667
total feature number = 827640
4 changes: 2 additions & 2 deletions vowpalwabbit/cb_adf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ void do_actual_learning(cb_adf& data, base_learner& base)
if (CB::ec_is_example_header(*data.ec_seq[0])) {
start_K = 1;
for (size_t k=1; k<K; k++)
LabelDict::add_example_namespaces_from_example(*data.ec_seq[k], *data.ec_seq[0]);
LabelDict::add_example_namespaces_from_example(*data.ec_seq[k], *data.ec_seq[0], data.all->audit || data.all->hash_inv);
}
bool isTest = check_cb_adf_sequence(data, start_K);

Expand Down Expand Up @@ -328,7 +328,7 @@ void do_actual_learning(cb_adf& data, base_learner& base)
/////////////////////// remove header
if (start_K > 0)
for (size_t k=1; k<K; k++)
LabelDict::del_example_namespaces_from_example(*data.ec_seq[k], *data.ec_seq[0]);
LabelDict::del_example_namespaces_from_example(*data.ec_seq[k], *data.ec_seq[0], data.all->audit || data.all->hash_inv);
}

void global_print_newline(vw& all)
Expand Down
41 changes: 26 additions & 15 deletions vowpalwabbit/csoaa.cc
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,13 @@ void make_single_prediction(ldf& data, base_learner& base, example& ec) {
simple_label.weight = 0.;
ec.partial_prediction = 0.;

LabelDict::add_example_namespace_from_memory(data.label_features, ec, ld.costs[0].class_index);
LabelDict::add_example_namespace_from_memory(data.label_features, ec, ld.costs[0].class_index, data.all->audit || data.all->hash_inv);

ec.l.simple = simple_label;
base.predict(ec); // make a prediction
ld.costs[0].partial_prediction = ec.partial_prediction;

LabelDict::del_example_namespace_from_memory(data.label_features, ec, ld.costs[0].class_index);
LabelDict::del_example_namespace_from_memory(data.label_features, ec, ld.costs[0].class_index, data.all->audit || data.all->hash_inv);
ec.l.cs = ld;
}

Expand Down Expand Up @@ -241,7 +241,7 @@ void do_actual_learning_wap(ldf& data, base_learner& base, size_t start_K)
v_array<COST_SENSITIVE::wclass> costs1 = save_cs_label.costs;
if (costs1[0].class_index == (uint32_t)-1) continue;

LabelDict::add_example_namespace_from_memory(data.label_features, *ec1, costs1[0].class_index);
LabelDict::add_example_namespace_from_memory(data.label_features, *ec1, costs1[0].class_index, data.all->audit || data.all->hash_inv);

for (size_t k2=k1+1; k2<K; k2++) {
example *ec2 = data.ec_seq[k2];
Expand All @@ -253,7 +253,7 @@ void do_actual_learning_wap(ldf& data, base_learner& base, size_t start_K)
if (value_diff < 1e-6)
continue;

LabelDict::add_example_namespace_from_memory(data.label_features, *ec2, costs2[0].class_index);
LabelDict::add_example_namespace_from_memory(data.label_features, *ec2, costs2[0].class_index, data.all->audit || data.all->hash_inv);

// learn
ec1->example_t = data.csoaa_example_t;
Expand All @@ -265,9 +265,9 @@ void do_actual_learning_wap(ldf& data, base_learner& base, size_t start_K)
base.learn(*ec1);
unsubtract_example(*data.all, ec1);

LabelDict::del_example_namespace_from_memory(data.label_features, *ec2, costs2[0].class_index);
LabelDict::del_example_namespace_from_memory(data.label_features, *ec2, costs2[0].class_index, data.all->audit || data.all->hash_inv);
}
LabelDict::del_example_namespace_from_memory(data.label_features, *ec1, costs1[0].class_index);
LabelDict::del_example_namespace_from_memory(data.label_features, *ec1, costs1[0].class_index, data.all->audit || data.all->hash_inv);

// restore original cost-sensitive label, sum of importance weights
ec1->l.cs = save_cs_label;
Expand Down Expand Up @@ -317,9 +317,9 @@ void do_actual_learning_oaa(ldf& data, base_learner& base, size_t start_K)
ec->l.simple = simple_label;

// learn
LabelDict::add_example_namespace_from_memory(data.label_features, *ec, costs[0].class_index);
LabelDict::add_example_namespace_from_memory(data.label_features, *ec, costs[0].class_index, data.all->audit || data.all->hash_inv);
base.learn(*ec);
LabelDict::del_example_namespace_from_memory(data.label_features, *ec, costs[0].class_index);
LabelDict::del_example_namespace_from_memory(data.label_features, *ec, costs[0].class_index, data.all->audit || data.all->hash_inv);

// restore original cost-sensitive label, sum of importance weights and partial_prediction
ec->l.cs = save_cs_label;
Expand All @@ -338,15 +338,21 @@ void do_actual_learning(ldf& data, base_learner& base)
if (ec_seq_is_label_definition(data.ec_seq)) {
for (size_t i=0; i<data.ec_seq.size(); i++) {
v_array<feature> features = v_init<feature>();
v_array<audit_data> audit = v_init<audit_data>();
for (feature*f=data.ec_seq[i]->atomics[data.ec_seq[i]->indices[0]].begin; f!=data.ec_seq[i]->atomics[data.ec_seq[i]->indices[0]].end; f++) {
feature fnew = { f->x, f->weight_index };
features.push_back(fnew);
}
if ((data.all->audit || data.all->hash_inv))
for (audit_data*f=data.ec_seq[i]->audit_features[data.ec_seq[i]->indices[0]].begin; f!=data.ec_seq[i]->audit_features[data.ec_seq[i]->indices[0]].end; f++) {
audit_data f2 = { f->space, f->feature, f->weight_index, f->x, false };
audit.push_back(f2);
}

v_array<COST_SENSITIVE::wclass> costs = data.ec_seq[i]->l.cs.costs;
v_array<COST_SENSITIVE::wclass>& costs = data.ec_seq[i]->l.cs.costs;
for (size_t j=0; j<costs.size(); j++) {
size_t lab = (size_t)costs[j].x;
LabelDict::set_label_features(data.label_features, lab, features);
LabelDict::set_label_features(data.label_features, lab, features, (data.all->audit || data.all->hash_inv) ? &audit : nullptr);
}
}
return;
Expand All @@ -358,7 +364,7 @@ void do_actual_learning(ldf& data, base_learner& base)
if (ec_is_example_header(*data.ec_seq[0])) {
start_K = 1;
for (size_t k=1; k<K; k++)
LabelDict::add_example_namespaces_from_example(*data.ec_seq[k], *data.ec_seq[0]);
LabelDict::add_example_namespaces_from_example(*data.ec_seq[k], *data.ec_seq[0], (data.all->audit || data.all->hash_inv));
}
bool isTest = check_ldf_sequence(data, start_K);

Expand Down Expand Up @@ -387,7 +393,7 @@ void do_actual_learning(ldf& data, base_learner& base)
/////////////////////// remove header
if (start_K > 0)
for (size_t k=1; k<K; k++)
LabelDict::del_example_namespaces_from_example(*data.ec_seq[k], *data.ec_seq[0]);
LabelDict::del_example_namespaces_from_example(*data.ec_seq[k], *data.ec_seq[0], (data.all->audit || data.all->hash_inv));
}

void global_print_newline(vw& all)
Expand Down Expand Up @@ -457,8 +463,10 @@ void output_example_seq(vw& all, ldf& data)
for (example** ecc=data.ec_seq.begin; ecc!=data.ec_seq.end; ecc++)
output_example(all, **ecc, hit_loss, &(data.ec_seq));

if (!data.is_singleline && (all.raw_prediction > 0))
all.print_text(all.raw_prediction, "", data.ec_seq[0]->tag);
if (!data.is_singleline && (all.raw_prediction > 0)) {
v_array<char> empty = { nullptr, nullptr, nullptr, 0 };
all.print_text(all.raw_prediction, "", empty);
}
}
}

Expand Down Expand Up @@ -603,7 +611,10 @@ base_learner* csldf_setup(vw& all)
if (all.add_constant) {
all.add_constant = false;
}
ld.label_features.init(256, v_array<feature>(), LabelDict::size_t_eq);
v_array<feature> empty_f = { nullptr, nullptr, nullptr, 0 };
v_array<audit_data> empty_a = { nullptr, nullptr, nullptr, 0 };
LabelDict::feature_audit empty_fa = { empty_f, empty_a };
ld.label_features.init(256, empty_fa, LabelDict::size_t_eq);
ld.label_features.get(1, 94717244); // TODO: figure this out

ld.read_example_this_loop = 0;
Expand Down
4 changes: 2 additions & 2 deletions vowpalwabbit/csoaa.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ LEARNER::base_learner* csldf_setup(vw& all);

namespace LabelDict {
// True when `ec` is a multiline-example header; headers look like "0:-1" or
// just "shared" and carry features shared across the whole example sequence.
bool ec_is_example_header(example& ec);// example headers look like "0:-1" or just "shared"
// Copy (add) / remove every namespace of `source` onto/from `target`.
// When `audit` is true the audit (human-readable) features are kept in sync
// as well, so --audit / --invert_hash output stays consistent.
void add_example_namespaces_from_example(example& target, example& source, bool audit);
void del_example_namespaces_from_example(example& target, example& source, bool audit);
}
4 changes: 2 additions & 2 deletions vowpalwabbit/example.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ const size_t autolink_namespace = 130;
// Reserved namespace indices. Values above 127 cannot collide with a
// namespace the user names on the command line (printable ASCII), so they
// are safe for internal use.
const size_t neighbor_namespace = 131; // this is \x83 -- to do quadratic, say "-q a`printf "\x83"` on the command line
const size_t affix_namespace = 132; // this is \x84
const size_t spelling_namespace = 133; // this is \x85
const size_t conditioning_namespace = 134;// this is \x86
const size_t dictionary_namespace = 135; // this is \x87

struct feature {
float x;
Expand Down
53 changes: 34 additions & 19 deletions vowpalwabbit/label_dictionary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
namespace LabelDict {
// Deterministic bucket key for label `lab` in the label_feature_map
// (affine map with a large odd multiplier; wraps modulo SIZE_MAX+1).
size_t hash_lab(size_t lab) {
  size_t h = lab * 94389193;
  return h + 328051;
}

void del_example_namespace(example& ec, char ns, v_array<feature> features) {
void del_example_namespace(example& ec, char ns, v_array<feature>& features, bool audit) {
size_t numf = features.size();
// print_update is called after this del_example_namespace,
// so we need to keep the ec.num_features correct,
Expand All @@ -20,15 +20,19 @@ namespace LabelDict {
ec.total_sum_feat_sq -= ec.sum_feat_sq[(size_t)ns];
ec.atomics[(size_t)ns].erase();
ec.sum_feat_sq[(size_t)ns] = 0.;
if (audit)
ec.audit_features[(size_t)ns].erase();
} else { // DID have ns
for (feature*f=features.begin; f!=features.end; f++) {
ec.sum_feat_sq[(size_t)ns] -= f->x * f->x;
ec.atomics[(size_t)ns].pop();
if (audit)
ec.audit_features[(size_t)ns].pop();
}
}
}

void add_example_namespace(example& ec, char ns, v_array<feature> features) {
void add_example_namespace(example& ec, char ns, v_array<feature>& features, v_array<audit_data>* audit) {
bool has_ns = false;
for (size_t i=0; i<ec.indices.size(); i++) {
if (ec.indices[i] == (size_t)ns) {
Expand All @@ -50,51 +54,62 @@ namespace LabelDict {

ec.num_features += features.size();
ec.total_sum_feat_sq += ec.sum_feat_sq[(size_t)ns];

if (audit != nullptr)
for (audit_data*f = audit->begin; f != audit->end; ++f) {
audit_data f2 = { f->space, f->feature, f->weight_index, f->x, false };
ec.audit_features[(size_t)ns].push_back(f2);
}
}

void add_example_namespaces_from_example(example& target, example& source) {
void add_example_namespaces_from_example(example& target, example& source, bool audit) {
for (unsigned char* idx=source.indices.begin; idx!=source.indices.end; idx++) {
if (*idx == constant_namespace) continue;
add_example_namespace(target, (char)*idx, source.atomics[*idx]);
add_example_namespace(target, (char)*idx, source.atomics[*idx],
audit ? &source.audit_features[*idx] : nullptr);
}
}

void del_example_namespaces_from_example(example& target, example& source) {
void del_example_namespaces_from_example(example& target, example& source, bool audit) {
//for (size_t*idx=source.indices.begin; idx!=source.indices.end; idx++) {
unsigned char* idx = source.indices.end;
idx--;
for (; idx>=source.indices.begin; idx--) {
if (*idx == constant_namespace) continue;
del_example_namespace(target, (char)*idx, source.atomics[*idx]);
del_example_namespace(target, (char)*idx, source.atomics[*idx], audit);
}
}

void add_example_namespace_from_memory(label_feature_map& lfm, example& ec, size_t lab) {
void add_example_namespace_from_memory(label_feature_map& lfm, example& ec, size_t lab, bool audit) {
size_t lab_hash = hash_lab(lab);
v_array<feature> features = lfm.get(lab, lab_hash);
if (features.size() == 0) return;
add_example_namespace(ec, 'l', features);
feature_audit& res = lfm.get(lab, lab_hash);
if (res.features.size() == 0) return;
add_example_namespace(ec, 'l', res.features, audit ? &res.audit : nullptr);
}

void del_example_namespace_from_memory(label_feature_map& lfm, example& ec, size_t lab) {
void del_example_namespace_from_memory(label_feature_map& lfm, example& ec, size_t lab, bool audit) {
size_t lab_hash = hash_lab(lab);
v_array<feature> features = lfm.get(lab, lab_hash);
if (features.size() == 0) return;
del_example_namespace(ec, 'l', features);
feature_audit& res = lfm.get(lab, lab_hash);
if (res.features.size() == 0) return;
del_example_namespace(ec, 'l', res.features, audit ? &res.audit : nullptr);
}

void set_label_features(label_feature_map& lfm, size_t lab, v_array<feature>features) {
void set_label_features(label_feature_map& lfm, size_t lab, v_array<feature>&features, v_array<audit_data>* audit) {
size_t lab_hash = hash_lab(lab);
if (lfm.contains(lab, lab_hash)) { return; }
lfm.put_after_get(lab, lab_hash, features);
const v_array<audit_data> empty = { nullptr, nullptr, nullptr, 0 };
feature_audit fa = { features, audit ? (*audit) : empty };
lfm.put_after_get(lab, lab_hash, fa);
}

void free_label_features(label_feature_map& lfm) {
void* label_iter = lfm.iterator();
while (label_iter != nullptr) {
v_array<feature> *features = lfm.iterator_get_value(label_iter);
features->erase();
features->delete_v();
feature_audit *res = lfm.iterator_get_value(label_iter);
res->features.erase();
res->features.delete_v();
res->audit.erase();
res->audit.delete_v();

label_iter = lfm.iterator_next(label_iter);
}
Expand Down
22 changes: 13 additions & 9 deletions vowpalwabbit/label_dictionary.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
namespace LabelDict {
typedef v_hashmap< size_t, v_array<feature> > label_feature_map;
struct feature_audit {
v_array<feature> features;
v_array<audit_data> audit;
};
typedef v_hashmap< size_t, feature_audit > label_feature_map;
inline bool size_t_eq(size_t &a, size_t &b) { return (a==b); }

void add_example_namespace(example& ec, char ns, v_array<feature> features);
void del_example_namespace(example& ec, char ns, v_array<feature> features);
void add_example_namespace(example& ec, char ns, v_array<feature>& features, v_array<audit_data>* audit);
void del_example_namespace(example& ec, char ns, v_array<feature>& features, bool audit);

void set_label_features(label_feature_map& data, size_t lab, v_array<feature>features);
void set_label_features(label_feature_map& lfm, size_t lab, v_array<feature>& features, v_array<audit_data>* audit);

void add_example_namespaces_from_example(example& target, example& source);
void del_example_namespaces_from_example(example& target, example& source);
void add_example_namespace_from_memory(label_feature_map& data, example& ec, size_t lab);
void del_example_namespace_from_memory(label_feature_map& lfm, example& ec, size_t lab);
void add_example_namespaces_from_example(example& target, example& source, bool audit);
void del_example_namespaces_from_example(example& target, example& source, bool audit);
void add_example_namespace_from_memory(label_feature_map& lfm, example& ec, size_t lab, bool audit);
void del_example_namespace_from_memory(label_feature_map& lfm, example& ec, size_t lab, bool audit);

void free_label_features(label_feature_map& data);
void free_label_features(label_feature_map& lfm);
}
Loading

0 comments on commit bb68807

Please sign in to comment.