use alloc::{string::String, vec::Vec};

use crate::{Diagnostic, Span, diagnostic::Result};

/// A lexed token: its kind plus the byte range of the source it was read from.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}

/// The kinds of token the [`Lexer`] produces.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    /// An ASCII letter followed by ASCII letters, digits or `_`, that is not a keyword.
    Ident(String),
    /// A run of decimal digits that fits in an `i64`.
    Int(i64),
    /// Decimal digits, a `.`, and at least one more decimal digit.
    Float(f64),
    /// The unescaped contents of a double-quoted string literal.
    String(String),
    /// The raw text between the slashes of a `/.../` literal; escapes are kept verbatim.
    Regex(String),
    /// Keyword `true`.
    True,
    /// Keyword `false`.
    False,
    /// Keyword `let`.
    Let,
    /// Keyword `in`.
    In,
    /// Keyword `match`.
    Match,
    /// Keyword `import`.
    Import,
    /// Keyword `default`.
    Default,
    /// `_`
    Underscore,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `=`
    Equal,
    /// `=>`
    Arrow,
    /// `&`
    Amp,
    /// `//`
    SlashSlash,
    /// `>`
    Gt,
    /// `>=`
    Gte,
    /// `<`
    Lt,
    /// `<=`
    Lte,
    /// End of input; always the last token returned by [`Lexer::tokenize`].
    Eof,
}

/// A byte-oriented lexer over a UTF-8 source string.
///
/// All syntax is ASCII, so scanning works on bytes; text that ends up inside a
/// token (string and regex contents, identifiers) is always copied out of
/// `source` as `str` slices so that non-ASCII characters survive intact.
pub struct Lexer<'a> {
    source: &'a str,
    bytes: &'a [u8],
    pos: usize,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `source`.
    pub fn new(source: &'a str) -> Self {
        Self {
            source,
            bytes: source.as_bytes(),
            pos: 0,
        }
    }

    /// Lexes the whole input.
    ///
    /// On success the returned vector ends with a [`TokenKind::Eof`] token.
    ///
    /// # Errors
    ///
    /// Returns the first syntax diagnostic encountered: an unexpected
    /// character, an unterminated string, escape or regex, or a numeric
    /// literal that does not parse.
    pub fn tokenize(mut self) -> Result<Vec<Token>> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token()?;
            let is_eof = token.kind == TokenKind::Eof;
            tokens.push(token);
            if is_eof {
                return Ok(tokens);
            }
        }
    }

    /// Skips whitespace and comments, then lexes exactly one token.
    fn next_token(&mut self) -> Result<Token> {
        self.skip_ws_and_comments();
        let start = self.pos;
        let Some(ch) = self.peek() else {
            return Ok(Token {
                kind: TokenKind::Eof,
                span: Span::empty(self.pos),
            });
        };

        let kind = match ch {
            b'{' => self.single(TokenKind::LBrace),
            b'}' => self.single(TokenKind::RBrace),
            b'[' => self.single(TokenKind::LBracket),
            b']' => self.single(TokenKind::RBracket),
            b'(' => self.single(TokenKind::LParen),
            b')' => self.single(TokenKind::RParen),
            b';' => self.single(TokenKind::Semicolon),
            b',' => self.single(TokenKind::Comma),
            b'.' => self.single(TokenKind::Dot),
            b':' => self.single(TokenKind::Colon),
            b'_' => self.single(TokenKind::Underscore),
            b'&' => self.single(TokenKind::Amp),
            b'=' => self.one_or_two(b'>', TokenKind::Arrow, TokenKind::Equal),
            b'>' => self.one_or_two(b'=', TokenKind::Gte, TokenKind::Gt),
            b'<' => self.one_or_two(b'=', TokenKind::Lte, TokenKind::Lt),
            b'/' => {
                self.pos += 1;
                // `//` is its own operator; any other `/` opens a regex literal.
                if self.consume(b'/') {
                    TokenKind::SlashSlash
                } else {
                    self.lex_regex(start)?
                }
            }
            b'"' => self.lex_string()?,
            b'0'..=b'9' => self.lex_number()?,
            c if is_ident_start(c) => self.lex_ident_or_keyword(),
            _ => {
                // Cover the whole offending character so the span never ends
                // in the middle of a multi-byte UTF-8 sequence.
                let width = self
                    .source
                    .get(start..)
                    .and_then(|rest| rest.chars().next())
                    .map_or(1, char::len_utf8);
                return Err(Diagnostic::syntax(
                    Span::new(start, start + width),
                    "unexpected character",
                ));
            }
        };

        Ok(Token {
            kind,
            span: Span::new(start, self.pos),
        })
    }

    /// Consumes one byte and yields `kind`.
    fn single(&mut self, kind: TokenKind) -> TokenKind {
        self.pos += 1;
        kind
    }

    /// Consumes one byte, then yields `long` if the next byte is `follow`
    /// (consuming it too) and `short` otherwise.
    fn one_or_two(&mut self, follow: u8, long: TokenKind, short: TokenKind) -> TokenKind {
        self.pos += 1;
        if self.consume(follow) { long } else { short }
    }

    /// Advances past spaces, tabs, CR, LF and `#` comments (which run through
    /// the end of the line).
    fn skip_ws_and_comments(&mut self) {
        loop {
            while matches!(self.peek(), Some(b' ' | b'\t' | b'\r' | b'\n')) {
                self.pos += 1;
            }
            if self.peek() != Some(b'#') {
                break;
            }
            while let Some(c) = self.peek() {
                self.pos += 1;
                if c == b'\n' {
                    break;
                }
            }
        }
    }

    /// Lexes a double-quoted string literal starting at the opening quote.
    ///
    /// Recognised escapes are `\"`, `\\`, `\n`, `\r` and `\t`; a backslash
    /// before any other character yields that character unchanged.
    ///
    /// # Errors
    ///
    /// Fails with "unterminated escape" if the input ends right after a
    /// backslash and "unterminated string" if no closing quote is found.
    fn lex_string(&mut self) -> Result<TokenKind> {
        let start = self.pos;
        self.pos += 1;
        let mut value = String::new();
        // Start of the current run of literal (unescaped) text. Runs are copied
        // as `str` slices rather than byte-by-byte so multi-byte characters are
        // preserved. Both ends of every run sit next to an ASCII delimiter, so
        // they are always on character boundaries.
        let mut run_start = self.pos;

        while let Some(c) = self.peek() {
            match c {
                b'"' => {
                    value.push_str(&self.source[run_start..self.pos]);
                    self.pos += 1;
                    return Ok(TokenKind::String(value));
                }
                b'\\' => {
                    value.push_str(&self.source[run_start..self.pos]);
                    self.pos += 1;
                    // Decode the full character after the backslash, not just
                    // its first byte.
                    let Some(escaped) =
                        self.source.get(self.pos..).and_then(|rest| rest.chars().next())
                    else {
                        return Err(Diagnostic::syntax(
                            Span::new(start, self.pos),
                            "unterminated escape",
                        ));
                    };
                    self.pos += escaped.len_utf8();
                    value.push(match escaped {
                        'n' => '\n',
                        'r' => '\r',
                        't' => '\t',
                        // `\"`, `\\` and unknown escapes stand for the character itself.
                        other => other,
                    });
                    run_start = self.pos;
                }
                _ => self.pos += 1,
            }
        }

        Err(Diagnostic::syntax(
            Span::new(start, self.pos),
            "unterminated string",
        ))
    }

    /// Lexes the remainder of a regex literal whose opening `/` (at byte
    /// offset `start`) has already been consumed.
    ///
    /// The pattern is the raw source text up to the first unescaped `/`;
    /// backslash escapes are kept verbatim in the pattern.
    ///
    /// # Errors
    ///
    /// Fails with "unterminated regex" if no closing `/` is found.
    fn lex_regex(&mut self, start: usize) -> Result<TokenKind> {
        let body_start = self.pos;
        let mut escaped = false;

        while let Some(c) = self.peek() {
            self.pos += 1;
            if escaped {
                // Whatever follows a backslash is part of the pattern, even `/`.
                escaped = false;
                continue;
            }
            match c {
                b'\\' => escaped = true,
                b'/' => {
                    // The closing slash is ASCII, so `self.pos - 1` is a
                    // character boundary and the slice is valid UTF-8.
                    let pattern = &self.source[body_start..self.pos - 1];
                    return Ok(TokenKind::Regex(String::from(pattern)));
                }
                _ => {}
            }
        }

        Err(Diagnostic::syntax(
            Span::new(start, self.pos),
            "unterminated regex",
        ))
    }

    /// Lexes an integer or float literal starting at a decimal digit.
    ///
    /// A `.` only belongs to the number when a digit follows it, so `1.foo`
    /// lexes as the integer `1` followed by a separate `.` token.
    ///
    /// # Errors
    ///
    /// Fails if the digits do not parse as `i64` / `f64` (for example an
    /// integer that is out of range).
    fn lex_number(&mut self) -> Result<TokenKind> {
        let start = self.pos;
        while matches!(self.peek(), Some(b'0'..=b'9')) {
            self.pos += 1;
        }

        let is_float =
            self.peek() == Some(b'.') && matches!(self.peek_n(1), Some(b'0'..=b'9'));
        if is_float {
            self.pos += 1;
            while matches!(self.peek(), Some(b'0'..=b'9')) {
                self.pos += 1;
            }
        }

        let text = &self.source[start..self.pos];
        if is_float {
            text.parse::<f64>().map(TokenKind::Float).map_err(|_| {
                Diagnostic::syntax(Span::new(start, self.pos), "invalid float literal")
            })
        } else {
            text.parse::<i64>()
                .map(TokenKind::Int)
                .map_err(|_| Diagnostic::syntax(Span::new(start, self.pos), "invalid int literal"))
        }
    }

    /// Lexes an identifier and maps reserved words to their keyword tokens.
    fn lex_ident_or_keyword(&mut self) -> TokenKind {
        let start = self.pos;
        self.pos += 1;
        while matches!(self.peek(), Some(c) if is_ident_continue(c)) {
            self.pos += 1;
        }

        match &self.source[start..self.pos] {
            "true" => TokenKind::True,
            "false" => TokenKind::False,
            "let" => TokenKind::Let,
            "in" => TokenKind::In,
            "match" => TokenKind::Match,
            "import" => TokenKind::Import,
            "default" => TokenKind::Default,
            text => TokenKind::Ident(String::from(text)),
        }
    }

    /// Returns the byte at the current position, if any.
    fn peek(&self) -> Option<u8> {
        self.bytes.get(self.pos).copied()
    }

    /// Returns the byte `n` positions past the current one, if any.
    fn peek_n(&self, n: usize) -> Option<u8> {
        self.bytes.get(self.pos + n).copied()
    }

    /// Consumes the next byte only if it equals `expected`.
    fn consume(&mut self, expected: u8) -> bool {
        if self.peek() == Some(expected) {
            self.pos += 1;
            true
        } else {
            false
        }
    }
}

/// Identifiers must begin with an ASCII letter.
fn is_ident_start(c: u8) -> bool {
    c.is_ascii_alphabetic()
}

/// After the first byte, identifiers may contain ASCII letters, digits and `_`.
fn is_ident_continue(c: u8) -> bool {
    c.is_ascii_alphanumeric() || c == b'_'
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizes_basic_source() {
        let tokens = Lexer::new("port = Int & >= 1;").tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Ident(_)));
        assert_eq!(tokens[1].kind, TokenKind::Equal);
        assert_eq!(tokens[3].kind, TokenKind::Amp);
        assert_eq!(tokens[4].kind, TokenKind::Gte);
    }

    #[test]
    fn string_keeps_non_ascii_and_escapes() {
        let tokens = Lexer::new("\"h\u{e9}llo \\\"w\u{f6}rld\\\"\\n\"")
            .tokenize()
            .unwrap();
        assert!(matches!(
            &tokens[0].kind,
            TokenKind::String(s) if s == "h\u{e9}llo \"w\u{f6}rld\"\n"
        ));
        assert_eq!(tokens[1].kind, TokenKind::Eof);
    }

    #[test]
    fn string_escape_of_non_ascii_char_yields_that_char() {
        let tokens = Lexer::new("\"\\\u{e9}\"").tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\u{e9}"));
    }

    #[test]
    fn regex_is_raw_source_text() {
        let tokens = Lexer::new("/\\d+\u{e9}\\//").tokenize().unwrap();
        assert!(matches!(
            &tokens[0].kind,
            TokenKind::Regex(p) if p == "\\d+\u{e9}\\/"
        ));
        assert_eq!(tokens[1].kind, TokenKind::Eof);
    }

    #[test]
    fn reports_unterminated_literals() {
        assert!(Lexer::new("\"abc").tokenize().is_err());
        assert!(Lexer::new("\"abc\\").tokenize().is_err());
        assert!(Lexer::new("/abc").tokenize().is_err());
    }

    #[test]
    fn distinguishes_int_float_and_trailing_dot() {
        let tokens = Lexer::new("1 2.5 3.x").tokenize().unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Int(1));
        assert_eq!(tokens[1].kind, TokenKind::Float(2.5));
        assert_eq!(tokens[2].kind, TokenKind::Int(3));
        assert_eq!(tokens[3].kind, TokenKind::Dot);
    }
}