//! Tokenizer for the `om` interface language.
//! Source: Interface-Language/src/om/tokenizer.rs (2024-12-09)
/// Streaming tokenizer: consumes the input character by character and
/// yields `Token`s through its `Iterator` implementation.
pub struct Tokenizer {
    // Current state of the tokenizing state machine.
    state: State,
    // Index of the next character to consume from `input`.
    pos: usize,
    // When true, the previously consumed character is re-processed on the
    // next loop iteration (pos is stepped back by one).
    reconsume: bool,
    // Token under construction (a `Token::Define` being filled with
    // behavior entries) that has not been emitted yet.
    latest: Option<Token>,
    // The whole input, pre-split into characters for indexed access.
    input: Vec<char>,
    // Accumulates the characters of the name/prefix/key/value currently
    // being read.
    buffer: String,
}
impl Tokenizer {
    /// Builds a tokenizer over the given source text, starting in the
    /// `Data` state with the cursor at the first character.
    pub fn new(input: String) -> Self {
        let chars: Vec<char> = input.chars().collect();
        Self {
            input: chars,
            pos: 0,
            state: State::Data,
            reconsume: false,
            latest: None,
            buffer: String::new(),
        }
    }

    /// True once every input character has been consumed.
    fn is_eof(&self) -> bool {
        self.input.len() <= self.pos
    }

    /// Returns the character at the cursor and advances the cursor by one.
    /// Callers must check `is_eof` first; indexing past the end panics.
    fn consume_input(&mut self) -> char {
        let current = self.input[self.pos];
        self.pos += 1;
        current
    }
}
impl Iterator for Tokenizer {
    type Item = Token;

    /// Advances the state machine until a complete token is produced.
    ///
    /// Returns `None` once the input is fully consumed. A construct that is
    /// truncated by end-of-input yields a final `Token::EOF` instead of
    /// panicking with an out-of-bounds index.
    fn next(&mut self) -> Option<Self::Item> {
        if self.is_eof() {
            return None;
        }
        loop {
            if self.reconsume {
                // Step back so the previously consumed character is re-read
                // by the (new) current state.
                self.pos -= 1;
                self.reconsume = false;
            }
            // BUGFIX: the machine can loop back here after consuming the
            // last character mid-token (e.g. input "(a"); consume_input()
            // then indexed past the end and panicked. Report EOF instead.
            if self.is_eof() {
                return Some(Token::EOF);
            }
            let c = self.consume_input();
            match self.state {
                State::Data => match c {
                    '(' => self.state = State::NodeOpen,
                    // Whitespace between tokens is skipped.
                    ' ' | '\n' | '\t' => {}
                    // BUGFIX: emit the character even when it is the final
                    // one in the input; the old arm order put an
                    // `_ if self.is_eof()` guard first, which returned EOF
                    // and silently dropped the last character.
                    _ => return Some(Token::Character(c)),
                },
                State::NodeOpen => match c {
                    ' ' | '\n' => {}
                    _ if c.is_ascii_alphanumeric() => {
                        // Start of the node name: let Define re-read it.
                        self.state = State::Define;
                        self.reconsume = true;
                    }
                    _ if self.is_eof() => return Some(Token::EOF),
                    _ => panic!("Unexpected character: {}", c),
                },
                State::Define => match c {
                    _ if c.is_ascii_alphanumeric() => self.buffer.push(c),
                    ' ' | '\n' => {
                        // Name complete; keep the token pending so
                        // AfterDefine can attach an optional behavior list.
                        self.state = State::AfterDefine;
                        self.latest = Some(Token::Define {
                            name: std::mem::take(&mut self.buffer),
                            behavior: Vec::new(),
                        });
                    }
                    ')' => {
                        // BUGFIX: "(name)" previously discarded the buffered
                        // name and emitted nothing (then panicked on the next
                        // iteration); emit a Define with no behavior instead.
                        self.state = State::Data;
                        return Some(Token::Define {
                            name: std::mem::take(&mut self.buffer),
                            behavior: Vec::new(),
                        });
                    }
                    _ if self.is_eof() => return Some(Token::EOF),
                    _ => panic!("Unexpected character: {}", c),
                },
                State::AfterDefine => match c {
                    ' ' | '\n' => {}
                    '[' => self.state = State::Behavior,
                    _ if self.is_eof() => return Some(Token::EOF),
                    _ => {
                        // No behavior list follows: emit the pending Define
                        // and re-process this character in the Data state.
                        self.state = State::Data;
                        self.reconsume = true;
                        return self.latest.take();
                    }
                },
                State::Behavior => match c {
                    _ if c.is_ascii_alphanumeric() => {
                        // Start of a new `prefix:key=value` entry: append an
                        // empty item and let BehaviorKey re-read the char.
                        self.reconsume = true;
                        if let Some(Token::Define { behavior, .. }) = self.latest.as_mut() {
                            behavior.push(BehaviorItem::new());
                        }
                        self.state = State::BehaviorKey;
                    }
                    ' ' | '\n' => {}
                    ']' => {
                        // Behavior list closed: emit the completed Define.
                        self.state = State::Data;
                        return self.latest.take();
                    }
                    _ if self.is_eof() => return Some(Token::EOF),
                    _ => panic!("Unexpected character: {}", c),
                },
                State::BehaviorKey => match c {
                    _ if c.is_ascii_alphanumeric() => self.buffer.push(c),
                    ':' => {
                        // Everything before ':' was a prefix; keep collecting
                        // the key into the (now cleared) buffer.
                        if let Some(Token::Define { behavior, .. }) = self.latest.as_mut() {
                            behavior
                                .last_mut()
                                .unwrap()
                                .set_prefix(std::mem::take(&mut self.buffer));
                        }
                    }
                    '=' => {
                        // Key finished; switch to collecting the value.
                        if let Some(Token::Define { behavior, .. }) = self.latest.as_mut() {
                            behavior
                                .last_mut()
                                .unwrap()
                                .set_key(std::mem::take(&mut self.buffer));
                            self.state = State::BehaviorValue;
                        }
                    }
                    _ if self.is_eof() => return Some(Token::EOF),
                    _ => panic!("Unexpected character: {}", c),
                },
                State::BehaviorValue => match c {
                    _ if c.is_ascii_alphanumeric() => self.buffer.push(c),
                    ']' => {
                        // List closed right after a value: store it and emit
                        // the completed Define token.
                        if let Some(Token::Define { behavior, .. }) = self.latest.as_mut() {
                            behavior
                                .last_mut()
                                .unwrap()
                                .set_value(std::mem::take(&mut self.buffer));
                            self.state = State::Behavior;
                        }
                        return self.latest.take();
                    }
                    ' ' | '\n' => {}
                    _ if self.is_eof() => return Some(Token::EOF),
                    _ => panic!("Unexpected character: {}", c),
                },
            }
        }
    }
}
/// States of the tokenizing state machine driven by `Tokenizer::next`.
#[derive(Clone, Copy)]
enum State {
    // Outside any construct; emits Character tokens, '(' opens a node.
    Data,
    // Just after '('; skips whitespace until the node name starts.
    NodeOpen,
    // Reading the node name into the buffer.
    Define,
    // Name finished; deciding whether a '[' behavior list follows.
    AfterDefine,
    // Inside '[' ... ']', between behavior entries.
    Behavior,
    // Reading the prefix/key portion of a behavior entry.
    BehaviorKey,
    // Reading the value portion (after '=') of a behavior entry.
    BehaviorValue,
}
use crate::om::behavior::BehaviorItem;
/// Tokens emitted by the tokenizer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A `(name [prefix:key=value ...])` node definition.
    Define {
        name: String,
        behavior: Vec<BehaviorItem>,
    },
    /// A single character outside any node construct.
    Character(char),
    /// End of input reached while (or right after) tokenizing a construct.
    EOF,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty source yields no tokens at all.
    #[test]
    fn test_empty_input() {
        let mut tokenizer = Tokenizer::new(String::new());
        assert_eq!(tokenizer.next(), None);
    }

    /// A node with one prefixed behavior entry tokenizes into a single
    /// Define token followed by EOF.
    #[test]
    fn test_tokenizer() {
        let tokenizer = Tokenizer::new("(foo [prefix:key=value])".to_string());
        let item = BehaviorItem {
            prefix: Some("prefix".to_string()),
            key: "key".to_string(),
            value: "value".to_string(),
        };
        let expected = vec![
            Token::Define {
                name: "foo".to_string(),
                behavior: vec![item],
            },
            Token::EOF,
        ];
        let actual: Vec<Token> = tokenizer.collect();
        assert_eq!(expected, actual);
    }
}