343 lines
8.8 KiB
Rust
343 lines
8.8 KiB
Rust
use alloc::{string::String, vec::Vec};
|
|
|
|
use crate::{Diagnostic, Span, diagnostic::Result};
|
|
|
|
#[derive(Debug, Clone, PartialEq)]
|
|
pub struct Token {
|
|
pub kind: TokenKind,
|
|
pub span: Span,
|
|
}
|
|
|
|
/// Every kind of token the lexer can produce.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // --- Literals and names -------------------------------------------------
    /// An identifier that is not one of the reserved keywords.
    Ident(String),
    /// An integer literal.
    Int(i64),
    /// A floating-point literal.
    Float(f64),
    /// The contents of a double-quoted string literal.
    String(String),
    /// The pattern text of a `/.../` regex literal.
    Regex(String),

    // --- Keywords -----------------------------------------------------------
    /// The keyword `true`.
    True,
    /// The keyword `false`.
    False,
    /// The keyword `let`.
    Let,
    /// The keyword `in`.
    In,
    /// The keyword `match`.
    Match,
    /// The keyword `import`.
    Import,
    /// The keyword `default`.
    Default,
    /// A lone `_`.
    Underscore,

    // --- Delimiters ---------------------------------------------------------
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `(`
    LParen,
    /// `)`
    RParen,

    // --- Punctuation and operators ------------------------------------------
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `=`
    Equal,
    /// `=>`
    Arrow,
    /// `&`
    Amp,
    /// `//`
    SlashSlash,
    /// `>`
    Gt,
    /// `>=`
    Gte,
    /// `<`
    Lt,
    /// `<=`
    Lte,

    /// Marks the end of the input.
    Eof,
}
|
|
|
|
/// Scanner state for turning borrowed source text into tokens.
pub struct Lexer<'a> {
    /// The text being scanned; literal and identifier text is sliced from it.
    source: &'a str,
    /// Byte-level view used for single-byte lookahead.
    bytes: &'a [u8],
    /// Byte offset of the next unread byte.
    pos: usize,
}
|
|
|
|
impl<'a> Lexer<'a> {
|
|
pub fn new(source: &'a str) -> Self {
|
|
Self {
|
|
source,
|
|
bytes: source.as_bytes(),
|
|
pos: 0,
|
|
}
|
|
}
|
|
|
|
pub fn tokenize(mut self) -> Result<Vec<Token>> {
|
|
let mut tokens = Vec::new();
|
|
loop {
|
|
let token = self.next_token()?;
|
|
let is_eof = token.kind == TokenKind::Eof;
|
|
tokens.push(token);
|
|
if is_eof {
|
|
return Ok(tokens);
|
|
}
|
|
}
|
|
}
|
|
|
|
fn next_token(&mut self) -> Result<Token> {
|
|
self.skip_ws_and_comments();
|
|
let start = self.pos;
|
|
let Some(ch) = self.peek() else {
|
|
return Ok(Token {
|
|
kind: TokenKind::Eof,
|
|
span: Span::empty(self.pos),
|
|
});
|
|
};
|
|
|
|
let kind = match ch {
|
|
b'{' => {
|
|
self.pos += 1;
|
|
TokenKind::LBrace
|
|
}
|
|
b'}' => {
|
|
self.pos += 1;
|
|
TokenKind::RBrace
|
|
}
|
|
b'[' => {
|
|
self.pos += 1;
|
|
TokenKind::LBracket
|
|
}
|
|
b']' => {
|
|
self.pos += 1;
|
|
TokenKind::RBracket
|
|
}
|
|
b'(' => {
|
|
self.pos += 1;
|
|
TokenKind::LParen
|
|
}
|
|
b')' => {
|
|
self.pos += 1;
|
|
TokenKind::RParen
|
|
}
|
|
b';' => {
|
|
self.pos += 1;
|
|
TokenKind::Semicolon
|
|
}
|
|
b',' => {
|
|
self.pos += 1;
|
|
TokenKind::Comma
|
|
}
|
|
b'.' => {
|
|
self.pos += 1;
|
|
TokenKind::Dot
|
|
}
|
|
b':' => {
|
|
self.pos += 1;
|
|
TokenKind::Colon
|
|
}
|
|
b'_' => {
|
|
self.pos += 1;
|
|
TokenKind::Underscore
|
|
}
|
|
b'&' => {
|
|
self.pos += 1;
|
|
TokenKind::Amp
|
|
}
|
|
b'=' => {
|
|
self.pos += 1;
|
|
if self.consume(b'>') {
|
|
TokenKind::Arrow
|
|
} else {
|
|
TokenKind::Equal
|
|
}
|
|
}
|
|
b'>' => {
|
|
self.pos += 1;
|
|
if self.consume(b'=') {
|
|
TokenKind::Gte
|
|
} else {
|
|
TokenKind::Gt
|
|
}
|
|
}
|
|
b'<' => {
|
|
self.pos += 1;
|
|
if self.consume(b'=') {
|
|
TokenKind::Lte
|
|
} else {
|
|
TokenKind::Lt
|
|
}
|
|
}
|
|
b'/' => {
|
|
self.pos += 1;
|
|
if self.consume(b'/') {
|
|
TokenKind::SlashSlash
|
|
} else {
|
|
self.lex_regex(start)?
|
|
}
|
|
}
|
|
b'"' => self.lex_string()?,
|
|
b'0'..=b'9' => self.lex_number()?,
|
|
c if is_ident_start(c) => self.lex_ident_or_keyword(),
|
|
_ => {
|
|
return Err(Diagnostic::syntax(
|
|
Span::new(start, start + 1),
|
|
"unexpected character",
|
|
));
|
|
}
|
|
};
|
|
|
|
Ok(Token {
|
|
kind,
|
|
span: Span::new(start, self.pos),
|
|
})
|
|
}
|
|
|
|
fn skip_ws_and_comments(&mut self) {
|
|
loop {
|
|
while matches!(self.peek(), Some(b' ' | b'\t' | b'\r' | b'\n')) {
|
|
self.pos += 1;
|
|
}
|
|
if self.peek() == Some(b'#') {
|
|
while let Some(c) = self.peek() {
|
|
self.pos += 1;
|
|
if c == b'\n' {
|
|
break;
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
fn lex_string(&mut self) -> Result<TokenKind> {
|
|
let start = self.pos;
|
|
self.pos += 1;
|
|
let mut value = String::new();
|
|
while let Some(c) = self.peek() {
|
|
self.pos += 1;
|
|
match c {
|
|
b'"' => return Ok(TokenKind::String(value)),
|
|
b'\\' => {
|
|
let Some(escaped) = self.peek() else {
|
|
return Err(Diagnostic::syntax(
|
|
Span::new(start, self.pos),
|
|
"unterminated escape",
|
|
));
|
|
};
|
|
self.pos += 1;
|
|
let ch = match escaped {
|
|
b'"' => '"',
|
|
b'\\' => '\\',
|
|
b'n' => '\n',
|
|
b'r' => '\r',
|
|
b't' => '\t',
|
|
other => other as char,
|
|
};
|
|
value.push(ch);
|
|
}
|
|
other => value.push(other as char),
|
|
}
|
|
}
|
|
Err(Diagnostic::syntax(
|
|
Span::new(start, self.pos),
|
|
"unterminated string",
|
|
))
|
|
}
|
|
|
|
fn lex_regex(&mut self, start: usize) -> Result<TokenKind> {
|
|
let mut pattern = String::new();
|
|
let mut escaped = false;
|
|
while let Some(c) = self.peek() {
|
|
self.pos += 1;
|
|
if escaped {
|
|
pattern.push(c as char);
|
|
escaped = false;
|
|
continue;
|
|
}
|
|
match c {
|
|
b'\\' => {
|
|
pattern.push('\\');
|
|
escaped = true;
|
|
}
|
|
b'/' => return Ok(TokenKind::Regex(pattern)),
|
|
other => pattern.push(other as char),
|
|
}
|
|
}
|
|
Err(Diagnostic::syntax(
|
|
Span::new(start, self.pos),
|
|
"unterminated regex",
|
|
))
|
|
}
|
|
|
|
fn lex_number(&mut self) -> Result<TokenKind> {
|
|
let start = self.pos;
|
|
while matches!(self.peek(), Some(b'0'..=b'9')) {
|
|
self.pos += 1;
|
|
}
|
|
let mut is_float = false;
|
|
if self.peek() == Some(b'.') && matches!(self.peek_n(1), Some(b'0'..=b'9')) {
|
|
is_float = true;
|
|
self.pos += 1;
|
|
while matches!(self.peek(), Some(b'0'..=b'9')) {
|
|
self.pos += 1;
|
|
}
|
|
}
|
|
let text = &self.source[start..self.pos];
|
|
if is_float {
|
|
text.parse::<f64>().map(TokenKind::Float).map_err(|_| {
|
|
Diagnostic::syntax(Span::new(start, self.pos), "invalid float literal")
|
|
})
|
|
} else {
|
|
text.parse::<i64>()
|
|
.map(TokenKind::Int)
|
|
.map_err(|_| Diagnostic::syntax(Span::new(start, self.pos), "invalid int literal"))
|
|
}
|
|
}
|
|
|
|
fn lex_ident_or_keyword(&mut self) -> TokenKind {
|
|
let start = self.pos;
|
|
self.pos += 1;
|
|
while matches!(self.peek(), Some(c) if is_ident_continue(c)) {
|
|
self.pos += 1;
|
|
}
|
|
let text = &self.source[start..self.pos];
|
|
match text {
|
|
"true" => TokenKind::True,
|
|
"false" => TokenKind::False,
|
|
"let" => TokenKind::Let,
|
|
"in" => TokenKind::In,
|
|
"match" => TokenKind::Match,
|
|
"import" => TokenKind::Import,
|
|
"default" => TokenKind::Default,
|
|
_ => TokenKind::Ident(String::from(text)),
|
|
}
|
|
}
|
|
|
|
fn peek(&self) -> Option<u8> {
|
|
self.bytes.get(self.pos).copied()
|
|
}
|
|
|
|
fn peek_n(&self, n: usize) -> Option<u8> {
|
|
self.bytes.get(self.pos + n).copied()
|
|
}
|
|
|
|
fn consume(&mut self, expected: u8) -> bool {
|
|
if self.peek() == Some(expected) {
|
|
self.pos += 1;
|
|
true
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Reports whether `c` may begin an identifier: an ASCII letter only.
fn is_ident_start(c: u8) -> bool {
    matches!(c, b'a'..=b'z' | b'A'..=b'Z')
}
|
|
|
|
/// Reports whether `c` may appear after the first character of an
/// identifier: an ASCII letter, an ASCII digit, or `_`.
fn is_ident_continue(c: u8) -> bool {
    matches!(c, b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes a small constraint expression and spot-checks, by position, the
    /// kinds of the leading identifier and the `=`, `&` and `>=` tokens.
    #[test]
    fn tokenizes_basic_source() {
        let tokens = Lexer::new("port = Int & >= 1;").tokenize().unwrap();
        let kind_at = |index: usize| &tokens[index].kind;

        assert!(matches!(kind_at(0), TokenKind::Ident(_)));
        assert_eq!(kind_at(1), &TokenKind::Equal);
        assert_eq!(kind_at(3), &TokenKind::Amp);
        assert_eq!(kind_at(4), &TokenKind::Gte);
    }
}
|