Wordpiece tokenizer (#111)

* initial commit of SplitFunctionTokenizer
* added TokenType values to support Wordpiece and BPE style tokenization
* Initial commit of Wordpiece and WordpieceTokenizer and WordpiecePreprocessTokenizer
* initial commit of bert-base-uncased along with regression test data
* removed WordpieceBuilder in favor of directly making Wordpiece configurable; replaced tabs with spaces; updated tests - all passing now. WordpiecePreprocessTokenizer is configurable. Added config param 'tokenizeChineseChars'
* resolves issues raised in comments in the pull request - additionally, fixed an issue with the neverSplit strings. - copyrights, javadocs, config params, etc.
* fixes misc. typos and a problem with clone() method
oracle · Feb 4, 2021 · a10ec50 · a10ec50
1 parent bdd257c
commit a10ec50
Show file tree

Hide file tree

Showing 16 changed files with 51,843 additions and 157 deletions.
diff --git a/Util/Tokenization/src/main/java/org/tribuo/util/tokens/Token.java b/Util/Tokenization/src/main/java/org/tribuo/util/tokens/Token.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,8 +30,9 @@ public class Token {
 
     /**
      * Constructs a token.
-     * @param text  should be equivalent to the substring of the original
-     *              tokenized text for the given character offsets start and end
+     * 
+     * @param text  should be equivalent to the substring of the original tokenized
+     *              text for the given character offsets start and end
      * @param start the starting offset of the token
      * @param end   the ending offset of the token (exclusive or inclusive?)
      */
@@ -41,8 +42,9 @@ public Token(String text, int start, int end) {
 
     /**
      * Constructs a token.
-     * @param text  should be equivalent to the substring of the original
-     *              tokenized text for the given character offsets start and end
+     * 
+     * @param text  should be equivalent to the substring of the original tokenized
+     *              text for the given character offsets start and end
      * @param start the starting offset of the token
      * @param end   the ending offset of the token (exclusive or inclusive?)
      * @param type  the type of the token
@@ -56,6 +58,7 @@ public Token(String text, int start, int end, TokenType type) {
 
     /**
      * The number of characters in this token.
+     * 
      * @return The number of characters.
      */
     public int length() {
@@ -68,18 +71,59 @@ public String toString() {
     }
 
     /**
-     * Tokenizers may product multiple kinds of tokens, depending on the
-     * application to which they're being put. For example, when processing a
-     * document for highlighting during querying, we need to send through
-     * whitespace and punctuation so that the document looks as it did in it's
-     * original form. For most tokenizer applications, they will only send word
-     * tokens.
+     * Tokenizers may produce multiple kinds of tokens, depending on the application
+     * to which they're being put. For example, when processing a document for
+     * highlighting during querying, we need to send through whitespace and
+     * punctuation so that the document looks as it did in its original form. For
+     * most tokenizer applications, they will only send word tokens.
      */
     public enum TokenType {
+        /**
+         * A WORD corresponds to a token that does not consist of or contain whitespace
+         * and may correspond to a regular "word" that could be looked up in a
+         * dictionary. Some tokenizers do not distinguish between different kinds of
+         * tokens and may use this as a default type for all generated tokens.
+         */
         WORD,
+        /**
+         * An NGRAM is a token that might correspond to a character ngram - i.e. some
+         * portion / sub-span of a regular word token (for example).
+         */
         NGRAM,
+        /**
+         * A PUNCTUATION corresponds to tokens consisting of punctuation characters. In
+         * some applications, PUNCTUATION tokens may be treated differently because
+         * they may have less semantic content than regular word tokens.
+         */
         PUNCTUATION,
-        WHITESPACE
+        /**
+         * Some tokenizers may produce tokens corresponding to whitespace (e.g. space,
+         * tab, newline). It may be important for consumers of tokens generated by
+         * a tokenizer to ignore/skip WHITESPACE tokens to avoid unexpected behavior.
+         */
+        WHITESPACE,
+        /**
+         * Some tokenizers produce "sub-word" tokens. A PREFIX corresponds to a sub-word
+         * word-prefix token.
+         */
+        PREFIX,
+        /**
+         * Some tokenizers produce "sub-word" tokens. A SUFFIX corresponds to a sub-word
+         * word-suffix token.
+         */
+        SUFFIX,
+        /**
+         * Some tokenizers produce "sub-word" tokens. An INFIX corresponds to a sub-word
+         * "infix" token (i.e. from the middle).
+         */
+        INFIX,
+        /**
+         * Some tokenizers may work in concert with vocabulary data. Some applications
+         * may treat out-of-vocabulary tokens differently than other tokens. An UNKNOWN
+         * token corresponds to a token that is out-of-vocabulary or has never been seen
+         * before.
+         */
+        UNKNOWN
     }
 
 }