Skip to content

Commit

Permalink
No Vocabulary is insufficient for Index
Browse files Browse the repository at this point in the history
  • Loading branch information
torymur committed Jan 15, 2025
1 parent bf6170c commit a6a88da
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 26 deletions.
2 changes: 0 additions & 2 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ pub type Result<T, E = crate::Error> = std::result::Result<T, E>;
#[derive(Error, Debug)]
pub enum Error {
// Index Errors
#[error("The vocabulary does not allow to build an index that matches the input")]
InsufficientVocabulary,
#[error("Failed to build DFA {0}")]
IndexDfaError(#[from] Box<regex_automata::dfa::dense::BuildError>),
#[error("Index failed since anchored universal start state doesn't exist")]
Expand Down
65 changes: 41 additions & 24 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,44 @@ pub struct Index {
initial_state: StateId,
/// A collection of states considered as terminal states.
final_states: HashSet<StateId>,
/// A mapping of state transitions, defined by tokens ids and their corresponding state changes:
/// - The outer map's keys are the state IDs.
/// - The inner map's keys are token IDs.
/// - The inner map's values are state IDs, indicating transitions to the next state.
/// A mapping of state transitions, defined by token IDs and their corresponding state changes.
///
/// ### Example
/// ```text
/// transitions = {
/// 1: {10: 2, 15: 3},
/// 2: {20: 4, 25: 3},
/// 3: {30: 4},
/// 4: {40: 4},
/// }
/// +--------------------------------------+
/// | State 1 |
/// | Initial State |
/// +--------------------------------------+
/// | |
/// + |
/// Token ID 10 |
/// +-----------------------+ |
/// | State 2 | |
/// +-----------------------+ |
/// | | |
/// | + +
/// | Token ID 25 Token ID 15
/// | +------------------------+
/// | | State 3 |
/// | +------------------------+
/// | |
/// + +
/// Token ID 20 Token ID 30
/// +--------------------------------------+
/// | State 4 |
/// | Final state |
/// +--------------------------------------+
/// ```
transitions: HashMap<StateId, HashMap<TokenId, StateId>>,
/// The token ID reserved for the "end-of-sequence" token.
eos_token_id: TokenId,
}

/// The `Index` structure is designed to efficiently map tokens from a given vocabulary
/// to state transitions within a finite-state automaton.
///
Expand Down Expand Up @@ -122,30 +151,19 @@ impl Index {
.insert(eos_token_id, final_state);
}

// Check if there is at least one valid mapping
let is_valid = transitions.values().any(|mapping| {
mapping
.values()
.any(|end_state| final_states.contains(end_state))
});

if is_valid {
Ok(Self {
initial_state: start_state.as_u32(),
final_states,
transitions,
eos_token_id,
})
} else {
Err(Error::InsufficientVocabulary)
}
Ok(Self {
initial_state: start_state.as_u32(),
final_states,
transitions,
eos_token_id,
})
}

/// Lists the token IDs allowed for a given state ID, or `None` if the state is not found in `Index`.
///
/// The returned tokens are the keys of the state's entry in `transitions`. Since they are
/// collected from a `HashMap`, their order is unspecified.
pub fn allowed_tokens(&self, state: &StateId) -> Option<Vec<TokenId>> {
self.transitions
.get(state)
.map_or_else(|| None, |res| Some(res.keys().cloned().collect()))
.map(|res| res.keys().cloned().collect())
}

/// Returns transition state for a given state and token id or `None` otherwise.
Expand Down Expand Up @@ -259,7 +277,6 @@ mod tests {
}

let index = Index::new(regex, &vocabulary).expect("Index failed");

assert_eq!(index.final_states(), &HashSet::from_iter([208, 128]));

let expected = HashMap::from_iter([
Expand Down

0 comments on commit a6a88da

Please sign in to comment.