Tokenizer in C #15

Closed

wants to merge 2 commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
**/.DS_Store
run
43 changes: 43 additions & 0 deletions export_tokenizer.py
@@ -0,0 +1,43 @@
from sentencepiece import SentencePieceProcessor

sp_model = SentencePieceProcessor(model_file="tokenizer.model")

vocab_size = sp_model.vocab_size()
bos_id = sp_model.bos_id()
eos_id = sp_model.eos_id()
pad_id = sp_model.pad_id()
print(vocab_size, bos_id, eos_id, pad_id)

# for i in range(vocab_size): print(i, repr(sp_model.id_to_piece(i)))

token_blob = []
offsets = []
offset = 0
for i in range(vocab_size):
    t = sp_model.id_to_piece(i)
    if i == bos_id:
        t = '\n<s>\n'
    elif i == eos_id:
        t = '\n</s>\n'
    # SentencePiece byte-fallback pieces are literal strings like '<0x0A>';
    # convert them back to the single raw byte they represent
    elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
Owner:
??? some comments around here could be nice. i haven't dug into sp too much but this looks odd
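For context on the piece handling flagged here: SentencePiece keeps byte-fallback entries in the vocabulary as literal pieces of the form '<0xNN>', and this branch maps them back to the raw byte they encode. A quick way to see them (a sketch only, assuming the same Llama tokenizer.model; the specific id used is an assumption) would be:

from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor(model_file="tokenizer.model")
# in the Llama tokenizer the byte-fallback pieces typically occupy ids 3..258
print(repr(sp.id_to_piece(13)))                        # expected: '<0x0A>', i.e. the newline byte
print(chr(int(sp.id_to_piece(13)[3:5], 16)) == '\n')   # expected: True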

        t = chr(int(t[3:5], 16))

    t = t.replace('▁', ' ')
    t = t.encode('utf-8')
    offsets.append(offset)
    token_blob.append(t)
    offset += len(t)

# append one extra dummy offset so the token length can always be
# calculated as offsets[i + 1] - offsets[i]
offsets.append(offset)

with open('tokenizer.bin', 'wb') as f:
    f.write((vocab_size + 1).to_bytes(4, 'little'))

    for offset in offsets:
        f.write(offset.to_bytes(4, 'little'))
    for token in token_blob:
        f.write(token)

print('tokenizer.bin is created')
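A small read-back sketch (not part of the diff; it just assumes the layout written above: a little-endian uint32 offset count, then that many uint32 offsets, then the concatenated UTF-8 token bytes) can be used to sanity-check tokenizer.bin:

import struct

with open('tokenizer.bin', 'rb') as f:
    (count,) = struct.unpack('<I', f.read(4))              # number of offsets = vocab_size + 1
    offsets = struct.unpack(f'<{count}I', f.read(4 * count))
    blob = f.read()                                         # concatenated token bytes

# each token's bytes sit between consecutive offsets
for i in range(5):
    print(i, repr(blob[offsets[i]:offsets[i + 1]]))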
33 changes: 32 additions & 1 deletion run.c
@@ -383,8 +383,35 @@ int argmax(float* v, int n) {

// ----------------------------------------------------------------------------

int *token_offsets = NULL;
char *tokens_blob = NULL;

void load_tokenizer() {
    FILE *file = fopen("tokenizer.bin", "rb");
    if (file == NULL) {
        printf("Cannot open tokenizer file\n");
        exit(EXIT_FAILURE);
    }

    int offsets_len;
    fread(&offsets_len, sizeof(int), 1, file);

    token_offsets = (int*) malloc(offsets_len * sizeof(int));
    fread(token_offsets, sizeof(int), offsets_len, file);

    long pos = ftell(file);
    fseek(file, 0, SEEK_END);
    long tokens_blob_size = ftell(file) - pos;
    fseek(file, pos, SEEK_SET);

    tokens_blob = (char*) malloc(tokens_blob_size);
    fread(tokens_blob, 1, tokens_blob_size, file);
    fclose(file);
}

int main(int argc, char *argv[]) {
    setbuf(stdout, NULL); // disable stdout buffering
    load_tokenizer();

    // poor man's C argparse
    char *checkpoint = NULL;
@@ -449,13 +476,17 @@ int main(int argc, char *argv[]) {
            // we now want to sample from this distribution to get the next token
            next = sample(state.logits, config.vocab_size);
        }
        printf("%d\n", next);
        int offset = token_offsets[next];
        int next_offset = token_offsets[next + 1];
        fwrite(tokens_blob + offset, 1, next_offset - offset, stdout);

        // advance forward
        token = next;
        pos++;
    }

    free(token_offsets);
    free(tokens_blob);
    free_run_state(&state, &config);
    free_weights(&weights, &config);
    return 0;
Binary file added tokenizer.bin
Binary file not shown.