From bffee1f7169e29e5f96e8d16dccb09876474e476 Mon Sep 17 00:00:00 2001
From: Karthik
Date: Sun, 25 Feb 2024 09:28:48 -0700
Subject: [PATCH] Initialize tokenizer and simplify str_lookup prototype

- new function to initialize the tokenizer with a given vocab_size
- removed vocab_size from the arguments of build_tokenizer
- applied the changes in run.c, runq.c, test.c
- pass the Tokenizer object to str_lookup, which makes the call easy to
  follow: "oh! we will be checking for the string in the Tokenizer"
- passes: unit test (test.c)
- passes: integration test (./run stories15M.bin on MacBook M1)
---
 run.c  | 34 +++++++++++++++++++---------------
 runq.c | 26 +++++++++++++++-----------
 test.c |  3 ++-
 3 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/run.c b/run.c
index 2fcd687a..91a4f7f0 100644
--- a/run.c
+++ b/run.c
@@ -378,27 +378,30 @@ typedef struct {
     unsigned char byte_pieces[512]; // stores all single-byte strings
 } Tokenizer;
 
-int compare_tokens(const void *a, const void *b) {
-    return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
-}
-
-void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) {
-    // i should have written the vocab_size into the tokenizer file... sigh
+void init_tokenizer(Tokenizer* t, int vocab_size){
+    // allocate memory based on the specified vocab_size
     t->vocab_size = vocab_size;
     // malloc space to hold the scores and the strings
-    t->vocab = (char**)malloc(vocab_size * sizeof(char*));
-    t->vocab_scores = (float*)malloc(vocab_size * sizeof(float));
+    t->vocab = (char**)malloc(t->vocab_size * sizeof(char*));
+    t->vocab_scores = (float*)malloc(t->vocab_size * sizeof(float));
     t->sorted_vocab = NULL; // initialized lazily
     for (int i = 0; i < 256; i++) {
         t->byte_pieces[i * 2] = (unsigned char)i;
         t->byte_pieces[i * 2 + 1] = '\0';
     }
+}
+
+int compare_tokens(const void *a, const void *b) {
+    return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
+}
+
+void build_tokenizer(Tokenizer* t, char* tokenizer_path) {
     // read in the file
     FILE *file = fopen(tokenizer_path, "rb");
     if (!file) { fprintf(stderr, "couldn't load %s\n", tokenizer_path); exit(EXIT_FAILURE); }
     if (fread(&t->max_token_length, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
     int len;
-    for (int i = 0; i < vocab_size; i++) {
+    for (int i = 0; i < t->vocab_size; i++) {
         if (fread(t->vocab_scores + i, sizeof(float), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE);}
         if (fread(&len, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
         t->vocab[i] = (char *)malloc(len + 1);
@@ -442,10 +445,10 @@ void safe_printf(char *piece) {
     printf("%s", piece);
 }
 
-int str_lookup(char *str, TokenIndex *sorted_vocab, int vocab_size) {
+int str_lookup(char *str, Tokenizer* t) {
     // efficiently find the perfect match for str in vocab, return its index or -1 if not found
     TokenIndex tok = { .str = str }; // acts as the key to search for
-    TokenIndex *res = bsearch(&tok, sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens);
+    TokenIndex *res = bsearch(&tok, t->sorted_vocab, t->vocab_size, sizeof(TokenIndex), compare_tokens);
     return res != NULL ? res->id : -1;
 }
 
@@ -480,7 +483,7 @@ void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) {
     // TODO: pretty sure this isn't correct in the general case but I don't have the
     // energy to read more of the sentencepiece code to figure out what it's doing
     if (text[0] != '\0') {
-        int dummy_prefix = str_lookup(" ", t->sorted_vocab, t->vocab_size);
+        int dummy_prefix = str_lookup(" ", t);
         tokens[(*n_tokens)++] = dummy_prefix;
     }
 
@@ -517,7 +520,7 @@ void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) {
         }
 
         // ok c+1 is not a continuation byte, so we've read in a full codepoint
-        int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);
+        int id = str_lookup(str_buffer, t);
 
         if (id != -1) {
             // we found this codepoint in vocab, add it as a token
@@ -542,7 +545,7 @@ void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) {
         for (int i=0; i < (*n_tokens-1); i++) {
             // check if we can merge the pair (tokens[i], tokens[i+1])
             sprintf(str_buffer, "%s%s", t->vocab[tokens[i]], t->vocab[tokens[i+1]]);
-            int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);
+            int id = str_lookup(str_buffer, t);
             if (id != -1 && t->vocab_scores[id] > best_score) {
                 // this merge pair exists in vocab! record its score and position
                 best_score = t->vocab_scores[id];
@@ -948,7 +951,8 @@ int main(int argc, char *argv[]) {
 
     // build the Tokenizer via the tokenizer .bin file
     Tokenizer tokenizer;
-    build_tokenizer(&tokenizer, tokenizer_path, transformer.config.vocab_size);
+    init_tokenizer(&tokenizer, transformer.config.vocab_size);
+    build_tokenizer(&tokenizer, tokenizer_path);
 
     // build the Sampler
     Sampler sampler;
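Note on the new contract in run.c: init_tokenizer() now owns every
vocab_size-dependent allocation, while build_tokenizer() only reads the
file and loops over t->vocab_size. That leaves an implicit ordering
requirement: build_tokenizer() is only valid after init_tokenizer() has
run. Below is a minimal standalone sketch of that two-phase pattern; the
MiniTokenizer struct and the assert guard are illustrative assumptions,
not part of the patch, which relies on call order alone.

#include <assert.h>
#include <stdlib.h>

typedef struct {
    char **vocab;         // stand-ins for the fields run.c allocates
    float *vocab_scores;
    int vocab_size;
} MiniTokenizer;

void init_tokenizer(MiniTokenizer *t, int vocab_size) {
    // phase 1: every allocation sized by vocab_size
    t->vocab_size = vocab_size;
    t->vocab = malloc(vocab_size * sizeof(char*));
    t->vocab_scores = malloc(vocab_size * sizeof(float));
}

void build_tokenizer(MiniTokenizer *t, const char *path) {
    // phase 2: fill from the file; depends on phase 1 having run
    assert(t->vocab_size > 0 && t->vocab != NULL); // hypothetical guard
    (void)path; // fread loop elided; see the run.c hunk above
}

int main(void) {
    MiniTokenizer t = {0};
    init_tokenizer(&t, 32000);            // must come first
    build_tokenizer(&t, "tokenizer.bin"); // then read the file
    return 0;
}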
diff --git a/runq.c b/runq.c
index 42360ae9..c7ebe9d0 100644
--- a/runq.c
+++ b/runq.c
@@ -501,13 +501,16 @@ int compare_tokens(const void *a, const void *b) {
     return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
 }
 
-void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) {
-    // i should have written the vocab_size into the tokenizer file... sigh
+void init_tokenizer(Tokenizer* t, int vocab_size){
+    // allocate memory based on the specified vocab_size
     t->vocab_size = vocab_size;
     // malloc space to hold the scores and the strings
-    t->vocab = (char**)malloc(vocab_size * sizeof(char*));
-    t->vocab_scores = (float*)malloc(vocab_size * sizeof(float));
+    t->vocab = (char**)malloc(t->vocab_size * sizeof(char*));
+    t->vocab_scores = (float*)malloc(t->vocab_size * sizeof(float));
     t->sorted_vocab = NULL; // initialized lazily
+}
+
+void build_tokenizer(Tokenizer* t, char* tokenizer_path) {
     for (int i = 0; i < 256; i++) {
         t->byte_pieces[i * 2] = (unsigned char)i;
         t->byte_pieces[i * 2 + 1] = '\0';
@@ -517,7 +520,7 @@ void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) {
     if (!file) { fprintf(stderr, "couldn't load %s\n", tokenizer_path); exit(EXIT_FAILURE); }
     if (fread(&t->max_token_length, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
     int len;
-    for (int i = 0; i < vocab_size; i++) {
+    for (int i = 0; i < t->vocab_size; i++) {
         if (fread(t->vocab_scores + i, sizeof(float), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE);}
         if (fread(&len, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
         t->vocab[i] = (char *)malloc(len + 1);
@@ -561,10 +564,10 @@ void safe_printf(char *piece) {
     printf("%s", piece);
 }
 
-int str_lookup(char *str, TokenIndex *sorted_vocab, int vocab_size) {
+int str_lookup(char *str, Tokenizer* t) {
     // efficiently find the perfect match for str in vocab, return its index or -1 if not found
     TokenIndex tok = { .str = str }; // acts as the key to search for
-    TokenIndex *res = bsearch(&tok, sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens);
+    TokenIndex *res = bsearch(&tok, t->sorted_vocab, t->vocab_size, sizeof(TokenIndex), compare_tokens);
     return res != NULL ? res->id : -1;
 }
 
@@ -599,7 +602,7 @@ void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) {
     // TODO: pretty sure this isn't correct in the general case but I don't have the
     // energy to read more of the sentencepiece code to figure out what it's doing
     if (text[0] != '\0') {
-        int dummy_prefix = str_lookup(" ", t->sorted_vocab, t->vocab_size);
+        int dummy_prefix = str_lookup(" ", t);
         tokens[(*n_tokens)++] = dummy_prefix;
     }
 
@@ -636,7 +639,7 @@ void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) {
         }
 
         // ok c+1 is not a continuation byte, so we've read in a full codepoint
-        int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);
+        int id = str_lookup(str_buffer, t);
 
         if (id != -1) {
             // we found this codepoint in vocab, add it as a token
@@ -661,7 +664,7 @@ void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) {
         for (int i=0; i < (*n_tokens-1); i++) {
             // check if we can merge the pair (tokens[i], tokens[i+1])
             sprintf(str_buffer, "%s%s", t->vocab[tokens[i]], t->vocab[tokens[i+1]]);
-            int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);
+            int id = str_lookup(str_buffer, t);
             if (id != -1 && t->vocab_scores[id] > best_score) {
                 // this merge pair exists in vocab! record its score and position
                 best_score = t->vocab_scores[id];
@@ -1067,7 +1070,8 @@ int main(int argc, char *argv[]) {
 
     // build the Tokenizer via the tokenizer .bin file
     Tokenizer tokenizer;
-    build_tokenizer(&tokenizer, tokenizer_path, transformer.config.vocab_size);
+    init_tokenizer(&tokenizer, transformer.config.vocab_size);
+    build_tokenizer(&tokenizer, tokenizer_path);
 
     // build the Sampler
     Sampler sampler;
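Note on the str_lookup() change: the sorted_vocab/vocab_size pair now
travels inside the Tokenizer, while compare_tokens() and the bsearch()
call are unchanged. A self-contained sketch of the lookup follows; the
MiniTokenizer struct and the three-entry vocab are made up for
illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct { char *str; int id; } TokenIndex;
typedef struct { TokenIndex *sorted_vocab; int vocab_size; } MiniTokenizer;

int compare_tokens(const void *a, const void *b) {
    return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
}

// same shape as the patched prototype: the Tokenizer carries the table
int str_lookup(char *str, MiniTokenizer *t) {
    TokenIndex tok = { .str = str }; // acts as the key to search for
    TokenIndex *res = bsearch(&tok, t->sorted_vocab, t->vocab_size,
                              sizeof(TokenIndex), compare_tokens);
    return res != NULL ? res->id : -1;
}

int main(void) {
    TokenIndex vocab[] = { {"cat", 0}, {"hat", 1}, {"at", 2} };
    qsort(vocab, 3, sizeof(TokenIndex), compare_tokens); // bsearch needs sorted input
    MiniTokenizer t = { vocab, 3 };
    printf("%d %d\n", str_lookup("hat", &t), str_lookup("dog", &t)); // prints: 1 -1
    return 0;
}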
diff --git a/test.c b/test.c
index 4203efde..decf0c51 100644
--- a/test.c
+++ b/test.c
@@ -43,7 +43,8 @@ void test_prompt_encodings() {
     char *tokenizer_path = "tokenizer.bin";
     int vocab_size = 32000;
     Tokenizer tokenizer;
-    build_tokenizer(&tokenizer, tokenizer_path, vocab_size);
+    init_tokenizer(&tokenizer, vocab_size);
+    build_tokenizer(&tokenizer, tokenizer_path);
 
     // test 0 (test the empty string) (I added this as a simple case)
     char *prompt0 = "";
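For quick reference, the calling convention changes at every call site
as follows (lines quoted from the hunks above; t is the Tokenizer*
inside encode(), tokenizer the stack object in main() and test.c):

/* setup in main() / test.c */
build_tokenizer(&tokenizer, tokenizer_path, transformer.config.vocab_size); // old
init_tokenizer(&tokenizer, transformer.config.vocab_size);                  // new
build_tokenizer(&tokenizer, tokenizer_path);                                // new

/* lookups inside encode() */
id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size); // old
id = str_lookup(str_buffer, t);                              // new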