Source features support for V2.0 #2090

Merged · 24 commits · Sep 9, 2021
30 changes: 30 additions & 0 deletions .github/workflows/push.yml
@@ -42,6 +42,16 @@ jobs:
-src_vocab /tmp/onmt.vocab.src \
-tgt_vocab /tmp/onmt.vocab.tgt \
&& rm -rf /tmp/sample
- name: Test vocabulary build with features
run: |
python onmt/bin/build_vocab.py \
-config data/features_data.yaml \
-save_data /tmp/onmt_feat \
-src_vocab /tmp/onmt_feat.vocab.src \
-tgt_vocab /tmp/onmt_feat.vocab.tgt \
-src_feats_vocab '{"feat0": "/tmp/onmt_feat.vocab.feat0"}' \
-n_sample -1 \
&& rm -rf /tmp/sample
- name: Test field/transform dump
run: |
# The dumped fields are used later when testing tools
@@ -169,6 +179,26 @@ jobs:
-state_dim 256 \
-n_steps 10 \
-n_node 64
- name: Testing training with features
run: |
python onmt/bin/train.py \
-config data/features_data.yaml \
-src_vocab /tmp/onmt_feat.vocab.src \
-tgt_vocab /tmp/onmt_feat.vocab.tgt \
-src_feats_vocab '{"feat0": "/tmp/onmt_feat.vocab.feat0"}' \
-src_vocab_size 1000 -tgt_vocab_size 1000 \
-rnn_size 2 -batch_size 10 \
-word_vec_size 5 -rnn_size 10 \
-report_every 5 -train_steps 10 \
-save_model /tmp/onmt.model \
-save_checkpoint_steps 10
- name: Testing translation with features
run: |
python translate.py \
-model /tmp/onmt.model_step_10.pt \
-src data/data_features/src-test.txt \
-src_feats "{'feat0': 'data/data_features/src-test.feat0'}" \
-verbose
- name: Test RNN translation
run: |
head data/src-test.txt > /tmp/src-test.txt
1 change: 1 addition & 0 deletions data/data_features/src-test.feat0
@@ -0,0 +1 @@
C B A B
1 change: 1 addition & 0 deletions data/data_features/src-test.txt
@@ -0,0 +1 @@
she is a hard-working.
3 changes: 3 additions & 0 deletions data/data_features/src-train.feat0
@@ -0,0 +1,3 @@
A A A A B A A A C
A B C D E
C B A B
3 changes: 3 additions & 0 deletions data/data_features/src-train.txt
@@ -0,0 +1,3 @@
however, according to the logs, she is a hard-working.
however, according to the logs,
she is a hard-working.
1 change: 1 addition & 0 deletions data/data_features/src-val.feat0
@@ -0,0 +1 @@
C B A B
1 change: 1 addition & 0 deletions data/data_features/src-val.txt
@@ -0,0 +1 @@
she is a hard-working.
3 changes: 3 additions & 0 deletions data/data_features/tgt-train.txt
@@ -0,0 +1,3 @@
however, according to the logs, she is a hard-working.
however, according to the logs,
she is a hard-working.
1 change: 1 addition & 0 deletions data/data_features/tgt-val.txt
@@ -0,0 +1 @@
she is a hard-working.
11 changes: 11 additions & 0 deletions data/features_data.yaml
@@ -0,0 +1,11 @@
# Corpus opts:
data:
corpus_1:
path_src: data/data_features/src-train.txt
path_tgt: data/data_features/tgt-train.txt
src_feats:
feat0: data/data_features/src-train.feat0
transforms: [filterfeats, inferfeats]
valid:
path_src: data/data_features/src-val.txt
path_tgt: data/data_features/tgt-val.txt
70 changes: 70 additions & 0 deletions docs/source/FAQ.md
@@ -477,3 +477,73 @@ Training options to perform vocabulary update are:
* `-update_vocab`: set this option
* `-reset_optim`: set the value to "states"
* `-train_from`: checkpoint path


## How can I use source word features?

Extra information can be added to the words in the source sentences by defining word features.

Features should be defined in a separate file, using blank spaces as separators, with each row corresponding to a source sentence. Example input files:

data.src
```
however, according to the logs, she is hard-working.
```

feat0.txt
```
A C C C C A A B
```

**Notes**
- Prior tokenization is not necessary; features will be inferred using the `FeatInferTransform` transform.
- `FilterFeatsTransform` and `FeatInferTransform` are required for the functionality to work properly (see the sketch after these notes).
- Shared embeddings are not possible (at least with the `feat_merge: concat` method).
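For intuition, here is a minimal standalone sketch (plain Python, not OpenNMT-py code; file names are placeholders) of the alignment constraint that `FilterFeatsTransform` enforces: each feature line must carry exactly one label per whitespace-separated source token.

```python
# Standalone sketch of the feature/token alignment check.
# File names below are illustrative placeholders.

def check_feature_alignment(src_path, feat_path):
    with open(src_path, encoding="utf-8") as src_file, \
         open(feat_path, encoding="utf-8") as feat_file:
        for line_no, (src_line, feat_line) in enumerate(
                zip(src_file, feat_file), start=1):
            n_tokens = len(src_line.split())
            n_labels = len(feat_line.split())
            if n_tokens != n_labels:
                raise ValueError(
                    f"line {line_no}: {n_tokens} tokens vs "
                    f"{n_labels} feature labels")

check_feature_alignment("data.src", "feat0.txt")
```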

Sample config file:

```yaml
data:
dummy:
path_src: data/train/data.src
path_tgt: data/train/data.tgt
src_feats:
feat_0: data/train/data.src.feat_0
feat_1: data/train/data.src.feat_1
transforms: [filterfeats, onmt_tokenize, inferfeats, filtertoolong]
weight: 1
valid:
path_src: data/valid/data.src
path_tgt: data/valid/data.tgt
src_feats:
feat_0: data/valid/data.src.feat_0
feat_1: data/valid/data.src.feat_1
transforms: [filterfeats, onmt_tokenize, inferfeats]

  # Vocab opts
src_vocab: exp/data.vocab.src
tgt_vocab: exp/data.vocab.tgt
src_feats_vocab:
feat_0: exp/data.vocab.feat_0
feat_1: exp/data.vocab.feat_1
feat_merge: "sum"

```
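Assuming the config above is saved as `config.yaml` (a placeholder name), the vocabularies, including one per feature, can then be built with `build_vocab.py`, mirroring the CI test added in this PR:

```bash
python onmt/bin/build_vocab.py \
    -config config.yaml \
    -src_vocab exp/data.vocab.src \
    -tgt_vocab exp/data.vocab.tgt \
    -src_feats_vocab '{"feat_0": "exp/data.vocab.feat_0", "feat_1": "exp/data.vocab.feat_1"}' \
    -n_sample -1
```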

During inference you can pass features using the `--src_feats` argument. Its value is expected to be a Python-like dict mapping each feature name to its data file:

```
{'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}
```

**Important note!** During inference, the input sentence is expected to be already tokenized, so feature inference must be handled before running the translate command. Example:

```bash
python translate.py -model model_step_10.pt -src ../data.txt.tok -output ../data.out --src_feats "{'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}"
```

When using the Transformer architecture, make sure the following options are set appropriately (a sketch follows this list):

- `src_word_vec_size` and `tgt_word_vec_size`, or `word_vec_size`
- `feat_merge`: how to merge the feature embeddings with the word embeddings
- `feat_vec_size` and maybe `feat_vec_exponent`
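A minimal sketch of these options in the config, with illustrative values only (assumptions, not tuned recommendations):

```yaml
# Illustrative values; adjust to your architecture.
word_vec_size: 512   # embedding size for source and target words
feat_merge: sum      # merge feature embeddings into word embeddings
feat_vec_size: 512   # embedding size for each feature
```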
7 changes: 6 additions & 1 deletion onmt/bin/build_vocab.py
@@ -32,11 +32,13 @@ def build_vocab_main(opts):
transforms = make_transforms(opts, transforms_cls, fields)

logger.info(f"Counter vocab from {opts.n_sample} samples.")
src_counter, tgt_counter = build_vocab(
src_counter, tgt_counter, src_feats_counter = build_vocab(
opts, transforms, n_sample=opts.n_sample)

logger.info(f"Counters src:{len(src_counter)}")
logger.info(f"Counters tgt:{len(tgt_counter)}")
for feat_name, feat_counter in src_feats_counter.items():
logger.info(f"Counters {feat_name}:{len(feat_counter)}")

def save_counter(counter, save_path):
check_path(save_path, exist_ok=opts.overwrite, log=logger.warning)
@@ -52,6 +54,9 @@ def save_counter(counter, save_path):
else:
save_counter(src_counter, opts.src_vocab)
save_counter(tgt_counter, opts.tgt_vocab)

for k, v in src_feats_counter.items():
save_counter(v, opts.src_feats_vocab[k])


def _get_parser():
16 changes: 13 additions & 3 deletions onmt/bin/translate.py
@@ -6,6 +6,7 @@

import onmt.opts as opts
from onmt.utils.parse import ArgumentParser
from collections import defaultdict


def translate(opt):
@@ -15,12 +16,21 @@ def translate(opt):
translator = build_translator(opt, logger=logger, report_score=True)
src_shards = split_corpus(opt.src, opt.shard_size)
tgt_shards = split_corpus(opt.tgt, opt.shard_size)
shard_pairs = zip(src_shards, tgt_shards)

for i, (src_shard, tgt_shard) in enumerate(shard_pairs):
features_shards = []
features_names = []
for feat_name, feat_path in opt.src_feats.items():
features_shards.append(split_corpus(feat_path, opt.shard_size))
features_names.append(feat_name)
shard_pairs = zip(src_shards, tgt_shards, *features_shards)

for i, (src_shard, tgt_shard, *features_shard) in enumerate(shard_pairs):
features_shard_ = defaultdict(list)
for j, x in enumerate(features_shard):
features_shard_[features_names[j]] = x
logger.info("Translating shard %d." % i)
translator.translate(
src=src_shard,
src_feats=features_shard_,
tgt=tgt_shard,
batch_size=opt.batch_size,
batch_type=opt.batch_type,
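For readers following the diff: a simplified, self-contained sketch of the sharding logic introduced above. `split_corpus` is reimplemented here as a stand-in, and all paths are placeholders; only the zip-based alignment of feature shards with source shards mirrors the actual change.

```python
def split_corpus(path, shard_size):
    # Stand-in for the real helper: yield lists of at most shard_size lines.
    with open(path, encoding="utf-8") as f:
        shard = []
        for line in f:
            shard.append(line)
            if len(shard) == shard_size:
                yield shard
                shard = []
        if shard:
            yield shard

src_feats = {"feat0": "src-test.feat0"}  # placeholder paths
features_names = list(src_feats)
features_shards = [split_corpus(p, 32) for p in src_feats.values()]
src_shards = split_corpus("src-test.txt", 32)

# Each iteration yields one source shard plus one aligned shard per feature.
for i, (src_shard, *features_shard) in enumerate(
        zip(src_shards, *features_shards)):
    feats = dict(zip(features_names, features_shard))
    print(f"shard {i}: {len(src_shard)} sentences, features: {list(feats)}")
```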
1 change: 1 addition & 0 deletions onmt/constants.py
@@ -22,6 +22,7 @@ class CorpusName(object):
class SubwordMarker(object):
SPACER = '▁'
JOINER = '■'
CASE_MARKUP = ["⦅mrk_case_modifier_C⦆", "⦅mrk_begin_case_region_U⦆", "⦅mrk_end_case_region_U⦆"]


class ModelTask(object):