Skip to content

Commit

Permalink
impl lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
indierusty committed May 25, 2023
1 parent 2ecfe29 commit da0d82e
Show file tree
Hide file tree
Showing 6 changed files with 373 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/target
/Cargo.lock
8 changes: 8 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[package]
name = "rox"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
289 changes: 289 additions & 0 deletions src/lexer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
use crate::token::Token;
use crate::token::WithSpan;

/// Hand-written scanner that turns source text into a stream of tokens.
///
/// The source is held as a `Vec<char>` so the cursor can index by
/// character rather than by byte (safe for multi-byte UTF-8 input).
pub struct Lexer {
    source: Vec<char>,
    start_pos: usize, // index of the first char of the token currently being scanned
    cursor: usize,    // index of next char will be scanned
}

impl Lexer {
    /// Creates a lexer positioned at the first character of `source`.
    pub fn new(source: &str) -> Self {
        Self {
            source: source.chars().collect(),
            start_pos: 0,
            cursor: 0,
        }
    }

    /// Scans a number literal: digits with an optional fractional part.
    ///
    /// The leading digit was already consumed by `next_token`. The decimal
    /// point is consumed only when a digit follows it, so `1.` lexes as
    /// `Number` then `Dot` (the previous version swallowed the dot even
    /// with no fraction behind it).
    fn number(&mut self) -> Token {
        self.consume_while(|c| c.is_ascii_digit());
        // Look one char PAST the '.' before committing to a fraction.
        let fraction_follows = self.peek() == Some('.')
            && self
                .source
                .get(self.cursor + 1)
                .map_or(false, |c| c.is_ascii_digit());
        if fraction_follows {
            self.advance_cursor(); // consume '.'
            self.consume_while(|c| c.is_ascii_digit());
        }
        Token::Number
    }

    /// Scans an identifier or keyword; `ch` is the already-consumed first
    /// character.
    ///
    /// Chars after the first may include digits, so `x1` is a single
    /// identifier (the previous alphabetic-only scan split it into
    /// `Identifier` + `Number`). Keywords are matched as `&str` instead of
    /// char-slice patterns.
    fn identifier(&mut self, ch: char) -> Token {
        let mut word = String::from(ch);
        word.extend(self.consume_while(|c| c.is_alphanumeric() || c == '_'));

        match word.as_str() {
            "and" => Token::And,
            "else" => Token::Else,
            "false" => Token::False,
            "for" => Token::For,
            "fun" => Token::Fun,
            "if" => Token::If,
            "in" => Token::In,
            "nil" => Token::Nil,
            "or" => Token::Or,
            "print" => Token::Print,
            "return" => Token::Return,
            "true" => Token::True,
            "let" => Token::Let,
            _ => Token::Identifier,
        }
    }

    /// Scans a string literal; the opening '"' was already consumed.
    fn string(&mut self) -> Token {
        self.consume_while(|c| c != '"');
        if !self.consume_if('"') {
            println!("Unterminated String.");
            // TODO: produce good error for Unterminated String.
        }
        Token::String
    }

    /// Skips any run of ASCII whitespace (spaces, tabs, newlines).
    fn skip_whitespace(&mut self) {
        self.consume_while(|c| c.is_ascii_whitespace());
    }

    /// Skips `//` line comments.
    ///
    /// Loops so that consecutive comment lines are all skipped; the
    /// previous single-shot version left a second `//` line unconsumed and
    /// emitted two spurious `Slash` tokens for it.
    fn skip_comment(&mut self) {
        while self.check_comment() {
            self.consume_while(|c| c != '\n');
            self.skip_whitespace(); // skip newline at end if present
        }
    }

    /// Produces the next token, or `None` at end of input.
    ///
    /// Records the token's starting index in `self.start_pos` so
    /// `tokenize_with_context` can attach a span.
    fn next_token(&mut self) -> Option<Token> {
        self.skip_whitespace();
        self.skip_comment();

        if let Some(ch) = self.next() {
            // `next` already advanced the cursor past `ch`.
            self.start_pos = self.cursor - 1;

            match ch {
                '(' => Some(Token::LeftParen),
                ')' => Some(Token::RightParen),
                '[' => Some(Token::LeftBracket),
                ']' => Some(Token::RightBracket),
                '{' => Some(Token::LeftBrace),
                '}' => Some(Token::RightBrace),
                ',' => Some(Token::Comma),
                '.' => Some(Token::Dot),
                '-' => Some(Token::Minus),
                '+' => Some(Token::Plus),
                ';' => Some(Token::Semicolon),
                '/' => Some(Token::Slash),
                '*' => Some(Token::Star),
                '!' => Some(self.if_match('=', Token::NotEqual, Token::Not)),
                '=' => Some(self.if_match('=', Token::EqualEqual, Token::Equal)),
                '>' => Some(self.if_match('=', Token::GreaterEqual, Token::Greater)),
                '<' => Some(self.if_match('=', Token::LessEqual, Token::Less)),
                '"' => Some(self.string()),
                '0'..='9' => Some(self.number()),
                'a'..='z' | 'A'..='Z' | '_' => Some(self.identifier(ch)),
                _ => Some(Token::Error),
            }
        } else {
            None
        }
    }

    /// Lexes the whole input, attaching inclusive `[start, end]` spans to
    /// each token and terminating the stream with `Token::Eof`.
    pub fn tokenize_with_context(&mut self) -> Vec<WithSpan<Token>> {
        let mut tokens = vec![];
        while let Some(token) = self.next_token() {
            tokens.push(WithSpan::new(
                token,
                self.start_pos as u32,
                self.cursor as u32 - 1,
            ));
        }
        tokens.push(WithSpan::new(
            Token::Eof,
            self.cursor as u32,
            self.cursor as u32,
        ));
        tokens
    }
}

impl Lexer {
    /// True once every source char has been consumed.
    fn is_at_end(&self) -> bool {
        self.cursor == self.source.len()
    }

    /// Returns the next char without consuming it; `None` at end of input.
    /// Read-only, so it takes `&self` (was needlessly `&mut self`).
    fn peek(&self) -> Option<char> {
        self.source.get(self.cursor).copied()
    }

    /// True when the next two chars begin a `//` line comment.
    /// (Previously the `if cond { true } else { false }` anti-pattern.)
    fn check_comment(&self) -> bool {
        self.source.get(self.cursor..self.cursor + 2) == Some(&['/', '/'][..])
    }

    /// Moves the cursor past one char. Callers must check bounds first.
    fn advance_cursor(&mut self) {
        self.cursor += 1;
    }

    /// Consumes and returns the next char, or `None` at end of input.
    fn next(&mut self) -> Option<char> {
        let ch = self.peek()?;
        self.advance_cursor();
        Some(ch)
    }

    /// If the next char equals `ch`, consumes it and returns `then`;
    /// otherwise consumes nothing and returns `else_`. Used for the
    /// one-or-two-char operators (`!=`, `==`, `>=`, `<=`).
    fn if_match(&mut self, ch: char, then: Token, else_: Token) -> Token {
        if self.consume_if(ch) {
            then
        } else {
            else_
        }
    }

    /// Consumes the next char only if it equals `ch`; reports whether it did.
    fn consume_if(&mut self, ch: char) -> bool {
        if self.peek() == Some(ch) {
            self.advance_cursor();
            true
        } else {
            false
        }
    }

    /// Consumes chars while `pred` holds, returning everything consumed.
    fn consume_while<F>(&mut self, pred: F) -> Vec<char>
    where
        F: Fn(char) -> bool,
    {
        let mut consumed = vec![];
        while let Some(c) = self.peek() {
            if !pred(c) {
                break;
            }
            consumed.push(c);
            self.advance_cursor();
        }
        consumed
    }
}

/// Convenience wrapper: lexes `buf` in one call and returns the spanned
/// token stream (always terminated by `Token::Eof`).
pub fn tokenize_with_context(buf: &str) -> Vec<WithSpan<Token>> {
    Lexer::new(buf).tokenize_with_context()
}

#[cfg(test)]
mod tests {
    use super::super::token::Token;
    use super::tokenize_with_context;

    // Lexes `src` and strips the spans, leaving only the token kinds.
    fn tokenize(src: &str) -> Vec<Token> {
        tokenize_with_context(src)
            .iter()
            .map(|t| t.value())
            .collect()
    }

    #[test]
    fn test() {
        // Smallest non-empty input: a matched pair plus the Eof terminator.
        assert_eq!(
            tokenize("()"),
            vec![Token::LeftParen, Token::RightParen, Token::Eof]
        );

        // One- vs two-char operators: `!=`/`==`/`<=`/`>=` must win over
        // their single-char prefixes when adjacent.
        assert_eq!(
            tokenize("=(!) != ! == > < <= >=[[]]"),
            vec![
                Token::Equal,
                Token::LeftParen,
                Token::Not,
                Token::RightParen,
                Token::NotEqual,
                Token::Not,
                Token::EqualEqual,
                Token::Greater,
                Token::Less,
                Token::LessEqual,
                Token::GreaterEqual,
                Token::LeftBracket,
                Token::LeftBracket,
                Token::RightBracket,
                Token::RightBracket,
                Token::Eof,
            ]
        );

        // Keywords, identifiers, and a string literal in one statement.
        assert_eq!(
            tokenize("fun sayhello() { print \"hello\"; }"),
            vec![
                Token::Fun,
                Token::Identifier,
                Token::LeftParen,
                Token::RightParen,
                Token::LeftBrace,
                Token::Print,
                Token::String,
                Token::Semicolon,
                Token::RightBrace,
                Token::Eof,
            ]
        );

        // A full function with a for-loop: exercises keywords, numbers,
        // operators, and nesting together.
        assert_eq!(
            tokenize("fun _count() { for (let i = 0; i < 10; i = i + 1) { print i; } }"),
            vec![
                Token::Fun,          // fun
                Token::Identifier,   // _count
                Token::LeftParen,    // (
                Token::RightParen,   // )
                Token::LeftBrace,    // {
                Token::For,          // for
                Token::LeftParen,    // (
                Token::Let,          // let
                Token::Identifier,   // i
                Token::Equal,        // =
                Token::Number,       // 0
                Token::Semicolon,    // ;
                Token::Identifier,   // i
                Token::Less,         // <
                Token::Number,       // 10
                Token::Semicolon,    // ;
                Token::Identifier,   // i
                Token::Equal,        // =
                Token::Identifier,   // i
                Token::Plus,         // +
                Token::Number,       // 1
                Token::RightParen,   // )
                Token::LeftBrace,    // {
                Token::Print,        // print
                Token::Identifier,   // i
                Token::Semicolon,    // ;
                Token::RightBrace,   // }
                Token::RightBrace,   // }
                Token::Eof,
            ]
        )
    }
}
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pub mod lexer;
pub mod token;
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// NOTE(review): entry point is still a stub; the lexer is currently only
// reachable through the library crate (src/lib.rs).
fn main() {}
71 changes: 71 additions & 0 deletions src/token.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/// Every lexical token kind the lexer can produce.
///
/// Literal tokens (`Identifier`, `String`, `Number`) carry no payload; the
/// original text can be recovered from the span stored in [`WithSpan`].
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum Token {
    // Single-character tokens.
    LeftParen,
    RightParen,
    LeftBracket,
    RightBracket,
    LeftBrace,
    RightBrace,
    Comma,
    Dot,
    Minus,
    Plus,
    Semicolon,
    Slash,
    Star,
    // One or two character tokens.
    Not,
    NotEqual,
    Equal,
    EqualEqual,
    Greater,
    GreaterEqual,
    Less,
    LessEqual,
    // Literals.
    Identifier,
    String,
    Number,
    // Keywords.
    And,
    Else,
    False,
    For,
    Fun,
    If,
    In,
    Nil,
    Or,
    Print,
    Return,
    True,
    Let,

    // `Error` marks a character the lexer did not recognize; `Eof`
    // terminates every token stream.
    Error,
    Eof,
}

/// A value paired with its inclusive source span `[start_pos, end_pos]`
/// (indices into the lexer's char sequence).
#[derive(Debug, Clone, Copy)]
pub struct WithSpan<T> {
    value: T,
    start_pos: u32,
    end_pos: u32,
}

impl<T> WithSpan<T>
where
    T: Copy + Clone,
{
    /// Wraps `value` with the span it was scanned from.
    pub fn new(value: T, start_pos: u32, end_pos: u32) -> Self {
        Self {
            value,
            start_pos,
            end_pos,
        }
    }

    /// Returns a copy of the wrapped value.
    pub fn value(&self) -> T {
        self.value
    }

    /// Start of the span. Previously the struct stored the span but gave
    /// callers no way to read it, making it useless for diagnostics.
    pub fn start_pos(&self) -> u32 {
        self.start_pos
    }

    /// End of the span (inclusive).
    pub fn end_pos(&self) -> u32 {
        self.end_pos
    }
}

0 comments on commit da0d82e

Please sign in to comment.