Skip to content

Commit

Permalink
[tokenizers] Upgrade huggingface tokenizers to 0.14.1 (#2818)
Browse files Browse the repository at this point in the history
  • Loading branch information
frankfliu authored Oct 23, 2023
1 parent 1c5aef8 commit 23e07cf
Show file tree
Hide file tree
Showing 6 changed files with 12 additions and 8 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/native_s3_huggingface.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
- name: Install Environment
run: |
yum -y update
- yum -y install centos-release-scl-rh epel-release
+ yum -y install centos-release-scl-rh epel-release perl-core
yum -y install devtoolset-7 git patch cmake3 libstdc++-static
ln -s /usr/bin/cmake3 /usr/bin/cmake
curl https://sh.rustup.rs -sSf | sh -s -- -y
Expand Down Expand Up @@ -184,7 +184,7 @@ jobs:
- name: Install Environment
run: |
yum -y update
- yum -y install centos-release-scl-rh epel-release
+ yum -y install centos-release-scl-rh epel-release perl-core
yum -y install devtoolset-7 git patch cmake3 libstdc++-static
ln -s /usr/bin/cmake3 /usr/bin/cmake
curl https://sh.rustup.rs -sSf | sh -s -- -y
Expand Down
2 changes: 1 addition & 1 deletion extensions/tokenizers/build.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
@rem choco install rust -y
@rem choco install jdk8 -y

- set VERSION=python-v"%1"
+ set VERSION=v"%1"

if exist "tokenizers" (
echo Found "tokenizers"
Expand Down
2 changes: 1 addition & 1 deletion extensions/tokenizers/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ elif [[ -n $(command -v sysctl) ]]; then
fi
PLATFORM=$(uname | tr '[:upper:]' '[:lower:]')

- VERSION=python-v$1
+ VERSION=v$1
ARCH=$2

pushd $WORK_DIR
Expand Down
2 changes: 1 addition & 1 deletion extensions/tokenizers/rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ edition = "2018"

[dependencies]
jni = "0.19.0"
- tokenizers = { path = "../tokenizers/tokenizers", version = "*" }
+ tokenizers = { path = "../tokenizers/tokenizers", version = "*", features = ["http"] }

[target.'cfg(target_os = "linux")'.dependencies]
openssl = { version = "0.10", features = ["vendored"] }
Expand Down
8 changes: 6 additions & 2 deletions extensions/tokenizers/rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ pub extern "system" fn Java_ai_djl_huggingface_tokenizers_jni_TokenizersLibrary_
}
}
let decoding: String = tokenizer
- .decode(decode_ids, skip_special_tokens == JNI_TRUE)
+ .decode(&*decode_ids, skip_special_tokens == JNI_TRUE)
.unwrap();
let ret = env
.new_string(decoding)
Expand Down Expand Up @@ -527,8 +527,12 @@ pub extern "system" fn Java_ai_djl_huggingface_tokenizers_jni_TokenizersLibrary_
}
batch_decode_input.push(decode_ids);
}
+ let mut references: Vec<&[u32]> = Vec::new();
+ for reference in batch_decode_input.iter() {
+ references.push(reference);
+ }
let decoding: Vec<String> = tokenizer
- .decode_batch(batch_decode_input, skip_special_tokens == JNI_TRUE)
+ .decode_batch(&references, skip_special_tokens == JNI_TRUE)
.unwrap();
let ret: jobjectArray = env
.new_object_array(batch_len, "java/lang/String", JObject::null())
Expand Down
2 changes: 1 addition & 1 deletion gradle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ trt_version=8.4.1
onnxruntime_version=1.16.0
paddlepaddle_version=2.3.2
sentencepiece_version=0.1.97
- tokenizers_version=0.13.3
+ tokenizers_version=0.14.1
fasttext_version=0.9.2
xgboost_version=1.7.5
lightgbm_version=3.2.110
Expand Down

0 comments on commit 23e07cf

Please sign in to comment.