Decodal/crates/decodal-core/src/lexer.rs

343 lines
8.8 KiB
Rust

use alloc::{string::String, vec::Vec};
use crate::{Diagnostic, Span, diagnostic::Result};
/// A single lexical token together with the region of source it came from.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
/// What was recognised (punctuation, keyword, literal, identifier, or EOF).
pub kind: TokenKind,
/// Source region of the token. The lexer builds it from the byte offset
/// where the token starts and the byte offset just past its last byte; the
/// end-of-input token gets an empty span at the final offset.
pub span: Span,
}
/// The category of a [`Token`], carrying the decoded payload for literals
/// and identifiers.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
/// An identifier: an ASCII letter followed by ASCII letters, digits or
/// underscores, that is not one of the reserved words below.
Ident(String),
/// An integer literal: a run of ASCII digits parsed as `i64`.
Int(i64),
/// A float literal: digits, a `.`, then at least one more digit, parsed as `f64`.
Float(f64),
/// A double-quoted string literal with its escape sequences already resolved.
String(String),
/// The text between the slashes of a `/.../` regex literal. Backslash
/// escapes are kept verbatim (the backslash is part of the pattern).
Regex(String),
/// The keyword `true`.
True,
/// The keyword `false`.
False,
/// The keyword `let`.
Let,
/// The keyword `in`.
In,
/// The keyword `match`.
Match,
/// The keyword `import`.
Import,
/// The keyword `default`.
Default,
/// A `_` character. The lexer emits this for every `_` that begins a token,
/// so `_` never starts an identifier.
Underscore,
/// `{`
LBrace,
/// `}`
RBrace,
/// `[`
LBracket,
/// `]`
RBracket,
/// `(`
LParen,
/// `)`
RParen,
/// `;`
Semicolon,
/// `,`
Comma,
/// `.`
Dot,
/// `:`
Colon,
/// `=` (when not followed by `>`).
Equal,
/// `=>`
Arrow,
/// `&`
Amp,
/// `//`
SlashSlash,
/// `>` (when not followed by `=`).
Gt,
/// `>=`
Gte,
/// `<` (when not followed by `=`).
Lt,
/// `<=`
Lte,
/// End of input; always the last token produced by `Lexer::tokenize`.
Eof,
}
/// Hand-written lexer over a borrowed source string.
///
/// Build one with `Lexer::new` and consume it with `Lexer::tokenize`.
pub struct Lexer<'a> {
/// The full source text; sliced to recover number and identifier text.
source: &'a str,
/// The same text viewed as bytes (`source.as_bytes()`), used for lookahead.
bytes: &'a [u8],
/// Byte offset into `bytes` of the next unread byte.
pos: usize,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the first byte of `source`.
    pub fn new(source: &'a str) -> Self {
        Self {
            source,
            bytes: source.as_bytes(),
            pos: 0,
        }
    }

    /// Consumes the lexer and tokenizes the whole source.
    ///
    /// On success the returned vector is never empty: its last element is
    /// always a [`TokenKind::Eof`] token with an empty span at the end of
    /// the input.
    ///
    /// # Errors
    ///
    /// Returns the first syntax diagnostic encountered: an unexpected
    /// character, an unterminated string / escape / regex, or a numeric
    /// literal that does not fit its type.
    pub fn tokenize(mut self) -> Result<Vec<Token>> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token()?;
            let is_eof = token.kind == TokenKind::Eof;
            tokens.push(token);
            if is_eof {
                return Ok(tokens);
            }
        }
    }

    /// Skips insignificant input and lexes exactly one token.
    ///
    /// Dispatch is on the first byte; every token starts with an ASCII
    /// byte, so any non-ASCII lead byte falls through to the error arm.
    fn next_token(&mut self) -> Result<Token> {
        self.skip_ws_and_comments();
        let start = self.pos;
        let Some(ch) = self.peek() else {
            return Ok(Token {
                kind: TokenKind::Eof,
                span: Span::empty(self.pos),
            });
        };
        let kind = match ch {
            b'{' => self.single(TokenKind::LBrace),
            b'}' => self.single(TokenKind::RBrace),
            b'[' => self.single(TokenKind::LBracket),
            b']' => self.single(TokenKind::RBracket),
            b'(' => self.single(TokenKind::LParen),
            b')' => self.single(TokenKind::RParen),
            b';' => self.single(TokenKind::Semicolon),
            b',' => self.single(TokenKind::Comma),
            b'.' => self.single(TokenKind::Dot),
            b':' => self.single(TokenKind::Colon),
            b'_' => self.single(TokenKind::Underscore),
            b'&' => self.single(TokenKind::Amp),
            b'=' => self.one_or_two(b'>', TokenKind::Arrow, TokenKind::Equal),
            b'>' => self.one_or_two(b'=', TokenKind::Gte, TokenKind::Gt),
            b'<' => self.one_or_two(b'=', TokenKind::Lte, TokenKind::Lt),
            b'/' => {
                self.pos += 1;
                // `//` is its own operator; any other `/` opens a regex literal.
                if self.consume(b'/') {
                    TokenKind::SlashSlash
                } else {
                    self.lex_regex(start)?
                }
            }
            b'"' => self.lex_string()?,
            b'0'..=b'9' => self.lex_number()?,
            c if is_ident_start(c) => self.lex_ident_or_keyword(),
            _ => {
                // Cover the whole (possibly multi-byte) character so the
                // reported span never ends in the middle of a code point.
                let width = self.peek_char().map_or(1, char::len_utf8);
                return Err(Diagnostic::syntax(
                    Span::new(start, start + width),
                    "unexpected character",
                ));
            }
        };
        Ok(Token {
            kind,
            span: Span::new(start, self.pos),
        })
    }

    /// Consumes one byte and yields `kind` (single-character punctuation).
    fn single(&mut self, kind: TokenKind) -> TokenKind {
        self.pos += 1;
        kind
    }

    /// Consumes one byte, then yields `two` if the next byte is `second`
    /// (consuming it as well) or `one` otherwise.
    fn one_or_two(&mut self, second: u8, two: TokenKind, one: TokenKind) -> TokenKind {
        self.pos += 1;
        if self.consume(second) {
            two
        } else {
            one
        }
    }

    /// Skips spaces, tabs, carriage returns, newlines and `#` line comments.
    ///
    /// A comment runs from `#` through the next `\n` (inclusive) or to the
    /// end of input.
    fn skip_ws_and_comments(&mut self) {
        loop {
            while matches!(self.peek(), Some(b' ' | b'\t' | b'\r' | b'\n')) {
                self.pos += 1;
            }
            if self.peek() != Some(b'#') {
                return;
            }
            while let Some(c) = self.peek() {
                self.pos += 1;
                if c == b'\n' {
                    break;
                }
            }
        }
    }

    /// Lexes a double-quoted string literal; `self.pos` must be on the
    /// opening quote.
    ///
    /// Recognised escapes are `\"`, `\\`, `\n`, `\r` and `\t`; a backslash
    /// before any other character yields that character unchanged. The
    /// literal is decoded one Unicode scalar value at a time so non-ASCII
    /// text is preserved exactly.
    ///
    /// # Errors
    ///
    /// `"unterminated escape"` if the input ends right after a backslash,
    /// `"unterminated string"` if it ends before the closing quote.
    fn lex_string(&mut self) -> Result<TokenKind> {
        let start = self.pos;
        self.pos += 1; // opening quote
        let mut value = String::new();
        while let Some(c) = self.peek_char() {
            self.pos += c.len_utf8();
            match c {
                '"' => return Ok(TokenKind::String(value)),
                '\\' => {
                    let Some(escaped) = self.peek_char() else {
                        return Err(Diagnostic::syntax(
                            Span::new(start, self.pos),
                            "unterminated escape",
                        ));
                    };
                    self.pos += escaped.len_utf8();
                    value.push(match escaped {
                        'n' => '\n',
                        'r' => '\r',
                        't' => '\t',
                        // Covers `\"`, `\\` and any unknown escape.
                        other => other,
                    });
                }
                other => value.push(other),
            }
        }
        Err(Diagnostic::syntax(
            Span::new(start, self.pos),
            "unterminated string",
        ))
    }

    /// Lexes the remainder of a `/.../` regex literal.
    ///
    /// `start` is the byte offset of the opening `/`, which the caller has
    /// already consumed. Backslash escapes are copied into the pattern
    /// verbatim (backslash included), so `\/` does not end the literal.
    /// Decoding is per Unicode scalar value so non-ASCII patterns survive.
    ///
    /// # Errors
    ///
    /// `"unterminated regex"` if the input ends before the closing `/`.
    fn lex_regex(&mut self, start: usize) -> Result<TokenKind> {
        let mut pattern = String::new();
        let mut escaped = false;
        while let Some(c) = self.peek_char() {
            self.pos += c.len_utf8();
            if escaped {
                pattern.push(c);
                escaped = false;
                continue;
            }
            match c {
                '\\' => {
                    pattern.push('\\');
                    escaped = true;
                }
                '/' => return Ok(TokenKind::Regex(pattern)),
                other => pattern.push(other),
            }
        }
        Err(Diagnostic::syntax(
            Span::new(start, self.pos),
            "unterminated regex",
        ))
    }

    /// Lexes an integer or float literal starting at an ASCII digit.
    ///
    /// A `.` only makes the literal a float when a digit follows it, so
    /// `1.foo` lexes as `Int`, `Dot`, `Ident`.
    ///
    /// # Errors
    ///
    /// `"invalid int literal"` / `"invalid float literal"` when the text
    /// cannot be parsed (for example an integer that overflows `i64`).
    fn lex_number(&mut self) -> Result<TokenKind> {
        let start = self.pos;
        while matches!(self.peek(), Some(b'0'..=b'9')) {
            self.pos += 1;
        }
        let is_float =
            self.peek() == Some(b'.') && matches!(self.peek_n(1), Some(b'0'..=b'9'));
        if is_float {
            self.pos += 1; // the '.'
            while matches!(self.peek(), Some(b'0'..=b'9')) {
                self.pos += 1;
            }
        }
        // Only ASCII bytes were consumed, so both ends are char boundaries.
        let text = &self.source[start..self.pos];
        let span = Span::new(start, self.pos);
        if is_float {
            text.parse::<f64>()
                .map(TokenKind::Float)
                .map_err(|_| Diagnostic::syntax(span, "invalid float literal"))
        } else {
            text.parse::<i64>()
                .map(TokenKind::Int)
                .map_err(|_| Diagnostic::syntax(span, "invalid int literal"))
        }
    }

    /// Lexes an identifier and maps reserved words to their keyword kinds.
    ///
    /// The caller guarantees the current byte satisfies `is_ident_start`.
    fn lex_ident_or_keyword(&mut self) -> TokenKind {
        let start = self.pos;
        self.pos += 1;
        while matches!(self.peek(), Some(c) if is_ident_continue(c)) {
            self.pos += 1;
        }
        // Only ASCII bytes were consumed, so both ends are char boundaries.
        let text = &self.source[start..self.pos];
        match text {
            "true" => TokenKind::True,
            "false" => TokenKind::False,
            "let" => TokenKind::Let,
            "in" => TokenKind::In,
            "match" => TokenKind::Match,
            "import" => TokenKind::Import,
            "default" => TokenKind::Default,
            _ => TokenKind::Ident(String::from(text)),
        }
    }

    /// Returns the byte at the cursor without consuming it.
    fn peek(&self) -> Option<u8> {
        self.bytes.get(self.pos).copied()
    }

    /// Returns the byte `n` positions past the cursor without consuming it.
    fn peek_n(&self, n: usize) -> Option<u8> {
        self.bytes.get(self.pos + n).copied()
    }

    /// Returns the Unicode scalar value at the cursor without consuming it.
    ///
    /// Yields `None` at end of input (or if the cursor were ever not on a
    /// character boundary, rather than panicking).
    fn peek_char(&self) -> Option<char> {
        self.source.get(self.pos..)?.chars().next()
    }

    /// Consumes the next byte if it equals `expected`; reports whether it did.
    fn consume(&mut self, expected: u8) -> bool {
        if self.peek() == Some(expected) {
            self.pos += 1;
            true
        } else {
            false
        }
    }
}
/// Reports whether `c` may begin an identifier: an ASCII letter only.
///
/// Underscore is deliberately absent here; the lexer turns a leading `_`
/// into its own token before this predicate is consulted.
fn is_ident_start(c: u8) -> bool {
    matches!(c, b'A'..=b'Z' | b'a'..=b'z')
}
/// Reports whether `c` may appear after the first byte of an identifier:
/// an ASCII letter, an ASCII digit, or `_`.
fn is_ident_continue(c: u8) -> bool {
    matches!(c, b'_' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z')
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a small constraint expression lexes into the expected
    /// identifier and operator kinds at the checked positions.
    #[test]
    fn tokenizes_basic_source() {
        let lexed = Lexer::new("port = Int & >= 1;").tokenize().unwrap();
        let kind_at = |index: usize| &lexed[index].kind;

        assert!(matches!(kind_at(0), TokenKind::Ident(_)));
        assert_eq!(*kind_at(1), TokenKind::Equal);
        assert_eq!(*kind_at(3), TokenKind::Amp);
        assert_eq!(*kind_at(4), TokenKind::Gte);
    }
}