Skip to content

Commit

Permalink
Allow using -1 (`INVALID_TOKEN`) as the EOS token when the tokenizer has none; untested
Browse files Browse the repository at this point in the history
  • Loading branch information
mmoskal committed Jan 8, 2025
1 parent 5198689 commit b40e1c7
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 6 deletions.
7 changes: 4 additions & 3 deletions parser/src/earley/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use derivre::{AlphabetInfo, RegexAst, StateID};
use hashbrown::HashSet;
use instant::Instant;
use serde::{Deserialize, Serialize};
use toktrie::{Recognizer, SimpleVob, TokEnv, TokTrie};
use toktrie::{Recognizer, SimpleVob, TokEnv, TokTrie, INVALID_TOKEN};

use crate::{
api::{ParserLimits, StopReason},
Expand Down Expand Up @@ -663,8 +663,9 @@ impl ParserState {
let _ = self.flush_lexer();
}

if start.is_empty() && self.lexer_allows_eos() {
set.allow_token(computer.trie().eos_token());
let eos = computer.trie().eos_token();
if eos != INVALID_TOKEN && start.is_empty() && self.lexer_allows_eos() {
set.allow_token(eos);
}

self.stats.compute_time_us += t0.elapsed().as_micros() as u64;
Expand Down
4 changes: 2 additions & 2 deletions parser/src/tokenparser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::{
};
use anyhow::{ensure, Result};
use serde_json::json;
use toktrie::{InferenceCapabilities, SimpleVob, TokEnv, TokenId};
use toktrie::{InferenceCapabilities, SimpleVob, TokEnv, TokenId, INVALID_TOKEN};

#[derive(Clone)]
pub struct TokenParser {
Expand Down Expand Up @@ -386,7 +386,7 @@ impl TokenParser {
return Err(self.stop_for_parser_error("", s));
}

if self.is_accepting() {
if self.eos_token != INVALID_TOKEN && self.is_accepting() {
allowed_tokens.allow_token(self.eos_token);
}

Expand Down
1 change: 1 addition & 0 deletions toktrie/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ mod toktree;
pub use svob::{SimpleVob, SimpleVobIter};
pub use toktree::{
Recognizer, TokEnv, TokEnvWithTrie, TokRxInfo, TokTrie, TokenId, TokenizerEnv, TrieNode,
INVALID_TOKEN,
};

/// Defines what is allowed in Branch
Expand Down
4 changes: 3 additions & 1 deletion toktrie/src/toktree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,8 @@ pub struct TrieNode {
bits2: u32,
}

/// Sentinel [`TokenId`] standing for "no such token" (all 32 bits set).
///
/// Code that handles the EOS token compares it against this value before
/// allowing it in a token set or checking whether it is allowed, so an EOS
/// token equal to `INVALID_TOKEN` is simply never offered.
pub const INVALID_TOKEN: TokenId = 0xffff_ffff;

// NOTE(review): 24-bit all-ones value, deliberately or accidentally distinct
// from the 32-bit `INVALID_TOKEN` above — the two are not interchangeable.
// Presumably marks an absent token inside the packed `TrieNode` fields; confirm.
const NO_TOKEN: u32 = 0xffffff;

impl TrieNode {
Expand Down Expand Up @@ -345,7 +347,7 @@ impl TokTrie {
let max_tok = std::cmp::min(max_examples, num_set);
let mut token_names = Vec::new();
// make sure we include EOS first if it's allowed
if ts1.is_allowed(self.info.tok_eos) {
if self.info.tok_eos != INVALID_TOKEN && ts1.is_allowed(self.info.tok_eos) {
token_names.push("EOS".to_string());
}
for idx in 0..self.vocab_size() {
Expand Down

0 comments on commit b40e1c7

Please sign in to comment.