Skip to content

Commit

Permalink
Change type from char to String in processors
Browse files Browse the repository at this point in the history
  • Loading branch information
torymur committed Jan 15, 2025
1 parent 73e4bfe commit bf6170c
Showing 1 changed file with 13 additions and 10 deletions.
23 changes: 13 additions & 10 deletions src/vocabulary/processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,21 +90,22 @@ pub(crate) enum TokenProcessorLevel {
/// Modifications to be applied by `TokenProcessor` of `ByteFallback` level.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct Mods {
// NOTE(review): this is a rendered diff without +/- markers. The two lines
// below look like the removed (`char`) and added (`String`) declarations of
// the same field rather than two separate fields — confirm against the file.
/// Substring that `apply_default` replaces with the default space string.
spacechar: char,
spacechar: String,
}

impl Default for Mods {
/// Default string modification to be applied by `TokenProcessor` of `ByteFallback` level.
///
/// The default `spacechar` is a single plain space (`' '`).
fn default() -> Self {
// NOTE(review): rendered diff without +/- markers. The one-line initializer
// is presumably the removed `char` form and the multi-line initializer its
// `String` replacement — only one of them exists in the real file.
Self { spacechar: ' ' }
Self {
spacechar: ' '.to_string(),
}
}
}

impl Mods {
/// Apply default modifications to a token.
///
/// Returns a new `String` in which every occurrence of `self.spacechar` in
/// `token` is replaced by the `spacechar` of `Mods::default()` (a plain space).
fn apply_default(&self, token: &str) -> String {
// NOTE(review): rendered diff without +/- markers. The first two statements
// appear to be the removed `char`-based body; the last line appears to be the
// added `String`-based body, borrowing both pattern and replacement.
let to = Self::default().spacechar.to_string();
token.replace(self.spacechar, &to)
token.replace(&self.spacechar, &Self::default().spacechar)
}
}

Expand All @@ -116,7 +117,7 @@ struct ReplaceDecoder {
}

impl ReplaceDecoder {
fn space_replacement(&self) -> Option<char> {
fn space_replacement(&self) -> Option<String> {
if self.content != " " {
return None;
}
Expand All @@ -126,7 +127,7 @@ impl ReplaceDecoder {
let char = chars.next();
if let Some(replacement) = char {
if chars.next().is_none() {
return Some(replacement);
return Some(replacement.to_string());
}
}
None
Expand Down Expand Up @@ -157,7 +158,7 @@ impl TokenProcessor {
}),
DecoderWrapper::Sequence(decoding_sequence) => {
let mut is_byte_fallback = false;
let mut spacechar = ' ';
let mut spacechar = ' '.to_string();

for decoder in decoding_sequence.get_decoders() {
match decoder {
Expand Down Expand Up @@ -285,16 +286,18 @@ mod tests {
let model = "hf-internal-testing/llama-tokenizer";
let tokenizer = Tokenizer::from_pretrained(model, None).expect("Tokenizer failed");
let processor = TokenProcessor::new(&tokenizer).expect("Processor failed");
let spacechar = '▁';
let mods = Mods { spacechar };
let spacechar = '▁'.to_string();
let mods = Mods {
spacechar: spacechar.clone(),
};

assert_eq!(processor.level, TokenProcessorLevel::ByteFallback(mods));

for (input, expected) in [
("abc", vec![0x61, 0x62, 0x63]),
("<0x61>", vec![0x61]),
("<0x61>a", vec![0x3C, 0x30, 0x78, 0x36, 0x31, 0x3E, 0x61]),
(&spacechar.to_string(), vec![0x20]),
(&spacechar, vec![0x20]),
(
&format!("{}{}abc", spacechar, spacechar),
vec![0x20, 0x20, 0x61, 0x62, 0x63],
Expand Down

0 comments on commit bf6170c

Please sign in to comment.