
improve vire tokenizer and parser

master
Alison Watson, 1 week ago
commit e85391de28
6 changed files with 160 additions and 141 deletions
  1. Cargo.lock  +11 -4
  2. Cargo.toml  +7 -7
  3. fw/types.rs  +2 -0
  4. fw/vire.rs  +22 -4
  5. fw/vire/parser.rs  +72 -65
  6. fw/vire/parser/tok.rs  +46 -61

Cargo.lock  +11 -4

@@ -48,6 +48,7 @@ dependencies = [
"easy-cast",
"glam",
"half",
"intaglio",
"log",
"naga",
"serde",
@@ -127,6 +128,12 @@ dependencies = [
"unicode-segmentation",
]

[[package]]
name = "intaglio"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2730d2ed3105fa1e7e3fc8de29bf8dd37dd58c7eb18bceb15880e3361a2cfd2"

[[package]]
name = "libloading"
version = "0.7.0"
@@ -189,9 +196,9 @@ dependencies = [

[[package]]
name = "quote"
version = "1.0.9"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05"
dependencies = [
"proc-macro2",
]
@@ -271,9 +278,9 @@ checksum = "71cc4b8f7ec707459fdeddb4f137109947045592f5b0c139f7bf1360058bac6b"

[[package]]
name = "syn"
version = "1.0.77"
version = "1.0.78"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5239bc68e0fef57495900cfea4e8dc75596d9a319d7e16b1e0a440d24e6fe0a0"
checksum = "a4eac2e6c19f5c3abc0c229bea31ff0b9b091c7b14990e8924b92902a303a0c0"
dependencies = [
"proc-macro2",
"quote",


Cargo.toml  +7 -7

@@ -17,19 +17,19 @@ build = "fw/build.rs"
[dependencies]
# macros:
# - blonkus-ma for our macros
# - thiserror for implementing error types
blonkus-ma = { path = "ma" }
thiserror = "~1.0"

# types:
# - bitflags for FFI usage
# - smallvec for potentially small dynamic arrays
# - smol_str for potentially small immutable strings
# - thiserror for implementing error types
# - intaglio for symbol interning?
bitflags = "~1.3"
smallvec = { version = "~1.6", features = ["const_generics", "union"] }
smol_str = "~0.1"
thiserror = "~1.0"
#intaglio = "~1.2"
# - intaglio for symbol interning
bitflags = "~1.3"
smallvec = { version = "~1.6", features = ["const_generics", "union"] }
smol_str = "~0.1"
intaglio = "~1.3"

# i/o:
# - serde for config / description files


fw/types.rs  +2 -0

@@ -4,6 +4,8 @@ pub mod ffi;
pub mod io;
pub mod iter;

pub use intaglio as sym;

pub use easy_cast::{Cast, CastFloat, Conv, ConvFloat};

pub use smallvec::smallvec as stkvec;
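
The two lines added to fw/types.rs re-export the new intaglio dependency under the shorter name `sym`, which the parser below uses for symbol interning. Purely as an illustration (not part of this commit), interning with intaglio ~1.3 looks roughly like the sketch below; `SymbolTable::new`, `intern`, and `get` are intaglio's own API, and interning the same text twice yields the same `Symbol`.

    use intaglio as sym; // same alias as the `pub use intaglio as sym;` above

    fn intern_demo() -> Result<(), sym::SymbolOverflowError> {
        let mut syms = sym::SymbolTable::new();

        let a = syms.intern("lambda")?;               // &'static str is accepted
        let b = syms.intern(String::from("lambda"))?; // so is an owned String
        assert_eq!(a, b);                             // same text, same Symbol

        assert_eq!(syms.get(a), Some("lambda"));      // resolve a Symbol back to text
        Ok(())
    }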


fw/vire.rs  +22 -4

@@ -1,8 +1,26 @@
/*
#[doc(hidden)]
pub mod llvm;
use crate::types::sym;

pub mod parser;
pub mod rt;
*/

//mod llvm;

#[derive(Clone, Debug)]
pub enum Cell {
Null,
Bool(bool),
Char(char),
Inte(i64),
//Rtio(i32, i32), // this even necessary?
//Fixd(Fixed),
//Angl(Angle),
Strn(String),
Symb(sym::Symbol),
//Func(usize),
Cons { cdr: Box<Cell>, car: Box<Cell> },
}

// struct Fixed(i64); // Q47.16s
// struct Angle(i32); // Q0.31s

// EOF
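
The new Cell enum replaces the parser-local Datum type: atoms carry their values directly, symbols carry an interned sym::Symbol, and lists are chains of Cons cells terminated by Null (note the cdr-before-car field order). As an illustration of that layout only, not code from the commit, the list (1 2 3) would be built like this:

    // Illustrative sketch: (1 2 3) as nested Cons cells, using the Cell enum
    // above. Each car holds an element, each cdr the rest of the list, and
    // Null terminates it.
    fn one_two_three() -> Cell {
        let tail = Cell::Cons { cdr: Box::new(Cell::Null), car: Box::new(Cell::Inte(3)) };
        let mid = Cell::Cons { cdr: Box::new(tail), car: Box::new(Cell::Inte(2)) };
        Cell::Cons { cdr: Box::new(mid), car: Box::new(Cell::Inte(1)) }
    }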

fw/vire/parser.rs  +72 -65

@@ -2,89 +2,96 @@ pub mod tki;
pub mod tok;

use self::{
tki::{Err, TokenIter},
tok::{Data, Token, Type::*},
tki::TokenIter,
tok::{Token, Type::*},
};
use super::Cell;
use crate::types::sym;

#[derive(Debug)]
pub enum Datum {
Null,
Bool(bool),
Char(char),
Numb(i64),
Strn(String),
Symb(String),
Cons { cdr: Box<Datum>, car: Box<Datum> },
#[derive(thiserror::Error, Debug)]
pub enum Err {
#[error(transparent)]
Iter(#[from] tki::Err),
#[error(transparent)]
Symb(#[from] sym::SymbolOverflowError),
}

impl Datum {
pub fn parse_all(mut tki: TokenIter) -> Result<Vec<Self>, Err> {
let mut datum = Vec::new();
pub fn parse(
tki: &mut TokenIter, syms: &mut sym::SymbolTable,
) -> Result<Vec<Cell>, Err> {
let mut cells = Vec::new();

while tki.peek().is_some() {
datum.push(Self::parse(&mut tki)?);
}

Ok(datum)
while tki.peek().is_some() {
cells.push(cell(tki, syms)?);
}

pub fn parse(tki: &mut TokenIter) -> Result<Self, Err> {
match tki.err_next()? {
// atoms
| Token { typ: Bool, dat: Data::Bool(b), .. } => Ok(Self::Bool(b)),
| Token { typ: Char, dat: Data::Char(c), .. } => Ok(Self::Char(c)),
| Token { typ: Numb, dat: Data::Numb(n), .. } => Ok(Self::Numb(n)),
| Token { typ: Strn, dat: Data::Strn(s), .. } => Ok(Self::Strn(s)),
| Token { typ: Symb, dat: Data::Strn(s), .. } => Ok(Self::Symb(s)),
Ok(cells)
}

// lists
| Token { typ: Br1O, .. } => Self::list(tki, Br1C),
| Token { typ: Br3O, .. } => Self::list(tki, Br3C),
fn cell(
tki: &mut TokenIter, syms: &mut sym::SymbolTable,
) -> Result<Cell, Err> {
match tki.err_next()? {
// atoms
| Token { typ: Bool, cel, .. }
| Token { typ: Char, cel, .. }
| Token { typ: Numb, cel, .. }
| Token { typ: Strn, cel, .. } => Ok(cel),
| Token { typ: Symb, cel: Cell::Strn(s), .. } => {
Ok(Cell::Symb(syms.intern(s)?))
}

// abbreviations
| Token { typ: QQuo, .. } => Self::abbrev(tki, "quasiquote"),
| Token { typ: QSyn, .. } => Self::abbrev(tki, "quasisyntax"),
| Token { typ: Quot, .. } => Self::abbrev(tki, "quote"),
| Token { typ: Synt, .. } => Self::abbrev(tki, "syntax"),
| Token { typ: UnQS, .. } => Self::abbrev(tki, "unquote-splicing"),
| Token { typ: UnQu, .. } => Self::abbrev(tki, "unquote"),
| Token { typ: UnSS, .. } => Self::abbrev(tki, "unsyntax-splicing"),
| Token { typ: UnSy, .. } => Self::abbrev(tki, "unsyntax"),
// lists
| Token { typ: Br1O, .. } => list(tki, syms, Br1C),
| Token { typ: Br3O, .. } => list(tki, syms, Br3C),

| Token { typ, pos, .. } => Err(Err::Unexpected(typ, pos)),
}
}
// abbreviations
| Token { typ: QQuo, .. } => abbrev(tki, syms, "quasiquote"),
| Token { typ: QSyn, .. } => abbrev(tki, syms, "quasisyntax"),
| Token { typ: Quot, .. } => abbrev(tki, syms, "quote"),
| Token { typ: Synt, .. } => abbrev(tki, syms, "syntax"),
| Token { typ: UnQS, .. } => abbrev(tki, syms, "unquote-splicing"),
| Token { typ: UnQu, .. } => abbrev(tki, syms, "unquote"),
| Token { typ: UnSS, .. } => abbrev(tki, syms, "unsyntax-splicing"),
| Token { typ: UnSy, .. } => abbrev(tki, syms, "unsyntax"),

fn abbrev(tki: &mut TokenIter, name: &str) -> Result<Self, Err> {
let cdr = {
let cdr = Box::new(Self::Null);
let car = Box::new(Self::parse(tki)?);
Box::new(Self::Cons { cdr, car })
};
let car = Box::new(Self::Symb(name.to_owned()));
Ok(Self::Cons { cdr, car })
| Token { typ, pos, .. } => Err(tki::Err::Unexpected(typ, pos).into()),
}
}

fn list(tki: &mut TokenIter, end: tok::Type) -> Result<Self, Err> {
let mut data_v = Vec::new();
let mut cdr = Self::Null;
fn abbrev(
tki: &mut TokenIter, syms: &mut sym::SymbolTable, name: &'static str,
) -> Result<Cell, Err> {
let cdr = {
let cdr = Box::new(Cell::Null);
let car = Box::new(cell(tki, syms)?);
Box::new(Cell::Cons { cdr, car })
};
let car = Box::new(Cell::Symb(syms.intern(name)?));
Ok(Cell::Cons { cdr, car })
}

while !tki.drop(end)? {
if tki.drop(Peri)? {
cdr = Self::parse(tki)?;
tki.expect(end)?;
break;
} else {
data_v.push(Self::parse(tki)?);
}
}
fn list(
tki: &mut TokenIter, syms: &mut sym::SymbolTable, end: tok::Type,
) -> Result<Cell, Err> {
let mut data_v = Vec::new();
let mut cdr = Cell::Null;

for car in data_v.into_iter().rev() {
cdr = Self::Cons { car: Box::new(car), cdr: Box::new(cdr) };
while !tki.drop(end)? {
if tki.drop(Peri)? {
cdr = cell(tki, syms)?;
tki.expect(end)?;
break;
} else {
data_v.push(cell(tki, syms)?);
}
}

Ok(cdr)
for car in data_v.into_iter().rev() {
cdr = Cell::Cons { car: Box::new(car), cdr: Box::new(cdr) };
}

Ok(cdr)
}

// EOF
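
The parser now produces Cell values directly and interns symbol names through the sym::SymbolTable passed in by the caller. To make the shapes concrete (an illustration, not code from the commit; the `crate::vire` path is assumed from the file layout, while `crate::types::sym` is taken from the diff): abbrev() turns 'foo into the two-element list (quote foo).

    use crate::types::sym;  // path taken from the diff above
    use crate::vire::Cell;  // assumed path for the Cell enum in fw/vire.rs

    // Hand-built equivalent of what abbrev(tki, syms, "quote") produces for 'foo.
    fn quoted_foo(syms: &mut sym::SymbolTable) -> Result<Cell, sym::SymbolOverflowError> {
        let foo = Cell::Symb(syms.intern("foo")?);
        let inner = Cell::Cons { cdr: Box::new(Cell::Null), car: Box::new(foo) };
        let quote = Cell::Symb(syms.intern("quote")?);
        Ok(Cell::Cons { cdr: Box::new(inner), car: Box::new(quote) })
    }

list() works the same way for bracketed forms: it collects elements until the closing bracket, then folds them right-to-left onto the cdr, which stays Null for a proper list and becomes the datum after a `.` (Peri) for a dotted pair.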

fw/vire/parser/tok.rs  +46 -61

@@ -1,5 +1,6 @@
use super::Cell;
use crate::data::text::{self, PosReader, Position};
use std::fmt;
use std::{fmt, ops::ControlFlow};

#[derive(thiserror::Error, Debug)]
pub enum Err {
@@ -21,7 +22,7 @@ pub enum Err {
pub struct Token {
pub typ: Type,
pub pos: Position,
pub dat: Data,
pub cel: Cell,
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
@@ -46,15 +47,6 @@ pub enum Type {
UnSy,
}

#[derive(Clone, Debug)]
pub enum Data {
Bool(bool),
Char(char),
None,
Numb(i64),
Strn(String),
}

impl fmt::Display for Type {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
@@ -80,8 +72,8 @@ impl fmt::Display for Type {
}
}

const fn new_tok(rd: &PosReader, typ: Type, dat: Data) -> Token {
Token { typ, pos: rd.pos(), dat }
const fn new_tok(rd: &PosReader, typ: Type, cel: Cell) -> Token {
Token { typ, pos: rd.pos(), cel }
}

fn is_delim(c: Option<char>) -> bool {
@@ -98,7 +90,7 @@ fn delim_end<'a, 'b>(
match rd.peek() {
| c if is_delim(c) => Ok(rd),
| Some(c) => Err(Err::Delim(rd.pos(), c)),
| None => unsafe { std::hint::unreachable_unchecked() },
| None => Err(Err::Eof),
}
}

@@ -106,9 +98,9 @@ fn delim_end<'a, 'b>(
fn unquote(rd: &mut PosReader, ty1: Type, ty2: Type) -> Token {
if let Some('@') = rd.peek() {
rd.next();
new_tok(rd, ty1, Data::None)
new_tok(rd, ty1, Cell::Null)
} else {
new_tok(rd, ty2, Data::None)
new_tok(rd, ty2, Cell::Null)
}
}

@@ -136,7 +128,7 @@ fn read_uni_char(rd: &mut PosReader) -> Result<char, Err> {

fn char_pnt(rd: &mut PosReader) -> Result<Token, Err> {
let c = read_uni_char(rd)?;
Ok(new_tok(rd, Type::Char, Data::Char(c)))
Ok(new_tok(rd, Type::Char, Cell::Char(c)))
}

fn char_txt(rd: &mut PosReader) -> Result<Token, Err> {
@@ -146,7 +138,7 @@ fn char_txt(rd: &mut PosReader) -> Result<Token, Err> {
return Err(Err::Expected(rd.pos(), "⟨'⟩ after character"));
}

Ok(new_tok(rd, Type::Char, Data::Char(c)))
Ok(new_tok(rd, Type::Char, Cell::Char(c)))
}

fn char_lit(rd: &mut PosReader) -> Result<Token, Err> {
@@ -175,7 +167,7 @@ fn strn_lit(rd: &mut PosReader) -> Result<Token, Err> {
});
}

Ok(new_tok(rd, Type::Strn, Data::Strn(s)))
Ok(new_tok(rd, Type::Strn, Cell::Strn(s)))
}

fn integer_rad(rd: &mut PosReader, rad: u32) -> Result<Token, Err> {
@@ -199,16 +191,19 @@ fn integer_rad(rd: &mut PosReader, rad: u32) -> Result<Token, Err> {
let res = if sign { n.checked_sub(c) } else { n.checked_add(c) };
n = res.ok_or_else(|| Err::Numb(rd.pos()))?;
}
| Some('_') => {
rd.next();
}
| Some(c) => return Err(Err::Delim(rd.pos(), c)),
| None => return Err(Err::Eof),
}
}

Ok(new_tok(rd, Type::Numb, Data::Numb(n)))
Ok(new_tok(rd, Type::Numb, Cell::Inte(n)))
}

fn is_sym_init(c: char) -> bool {
c.is_alphabetic()
c.is_alphanumeric()
|| matches!(
c,
'!' | '$'
@@ -216,13 +211,12 @@ fn is_sym_init(c: char) -> bool {
| '/' | ':' | '<'
| '=' | '>' | '?'
| '~' | '_' | '^'
| '+' | '-'
)
}

fn is_sym_subs(c: char) -> bool {
is_sym_init(c)
|| c.is_numeric()
|| matches!(c, '0'..='9' | '.' | '@' | '+' | '-')
is_sym_init(c) || matches!(c, '.' | '@')
}

fn symbol(rd: &mut PosReader, c: char) -> Result<Token, Err> {
@@ -237,11 +231,11 @@ fn symbol(rd: &mut PosReader, c: char) -> Result<Token, Err> {
s.push(c);
}
| Some(c) => return Err(Err::Delim(rd.pos(), c)),
| None => unsafe { std::hint::unreachable_unchecked() },
| None => return Err(Err::Eof),
}
}

Ok(new_tok(rd, Type::Symb, Data::Strn(s)))
Ok(new_tok(rd, Type::Symb, Cell::Strn(s)))
}

fn line_comment(rd: &mut PosReader) -> Result<(), Err> {
@@ -283,29 +277,32 @@ fn block_comment(rd: &mut PosReader) -> Result<(), Err> {
}

impl Token {
fn read_from_char(
rd: &mut PosReader, c: char,
) -> Result<Option<Self>, Err> {
pub fn read(
rd: &mut PosReader,
) -> Result<ControlFlow<(), Option<Self>>, Err> {
use self::Type::*;

let c = match rd.next() {
| Some(c) => c,
| None => return Ok(ControlFlow::Break(())),
};
let tk = match c {
// line comments
| ';' => {
line_comment(rd)?;
return Ok(None);
return Ok(ControlFlow::Continue(None));
}

// basic tokens
| '(' => new_tok(rd, Br1O, Data::None),
| ')' => new_tok(rd, Br1C, Data::None),
| '[' => new_tok(rd, Br3O, Data::None),
| ']' => new_tok(rd, Br3C, Data::None),
| '(' => new_tok(rd, Br1O, Cell::Null),
| ')' => new_tok(rd, Br1C, Cell::Null),
| '[' => new_tok(rd, Br3O, Cell::Null),
| ']' => new_tok(rd, Br3C, Cell::Null),

| '.' => new_tok(delim_end(rd)?, Peri, Data::None),
| '.' => new_tok(delim_end(rd)?, Peri, Cell::Null),

// quote abbreviations
| '\'' => new_tok(rd, Quot, Data::None),
| '`' => new_tok(rd, QQuo, Data::None),
| '\'' => new_tok(rd, Quot, Cell::Null),
| '`' => new_tok(rd, QQuo, Cell::Null),
| ',' => unquote(rd, UnQS, UnQu),

// tokens preceded by #
@@ -313,7 +310,7 @@ impl Token {
// block comments
| '|' => {
block_comment(rd)?;
return Ok(None);
return Ok(ControlFlow::Continue(None));
}

// integers
@@ -323,12 +320,12 @@ impl Token {
| 'x' | 'X' => integer_rad(rd, 16)?,

// booleans
| 't' | 'T' => new_tok(delim_end(rd)?, Bool, Data::Bool(true)),
| 'f' | 'F' => new_tok(delim_end(rd)?, Bool, Data::Bool(false)),
| 't' | 'T' => new_tok(delim_end(rd)?, Bool, Cell::Bool(true)),
| 'f' | 'F' => new_tok(delim_end(rd)?, Bool, Cell::Bool(false)),

// syntax abbreviations
| '\'' => new_tok(rd, Synt, Data::None),
| '`' => new_tok(rd, QSyn, Data::None),
| '\'' => new_tok(rd, Synt, Cell::Null),
| '`' => new_tok(rd, QSyn, Cell::Null),
| ',' => unquote(rd, UnSS, UnSy),

// character literals
@@ -344,33 +341,21 @@ impl Token {
| c if is_sym_init(c) => symbol(rd, c)?,

// skip whitespace
| c if c.is_whitespace() => return Ok(None),
| c if c.is_whitespace() => return Ok(ControlFlow::Continue(None)),

| c => return Err(Err::Char(rd.pos(), c)),
};

Ok(Some(tk))
Ok(ControlFlow::Continue(Some(tk)))
}

pub fn read_all(name: &str, data: &str) -> Result<Vec<Self>, Err> {
let mut tokens = Vec::new();
let rd = &mut PosReader::new(data, text::ellipsize_small_str(name));

while let Some(c) = rd.peek() {
let tk = if c.is_digit(10) || c == '-' {
// integers without prefixes are a special case and must
// be handled before actually parsing a token
integer_rad(rd, 10)?
} else {
rd.next();
if let Some(tk) = Self::read_from_char(rd, c)? {
tk
} else {
continue;
}
};

tokens.push(tk);
while let ControlFlow::Continue(maybe_tk) = Self::read(rd)? {
if let Some(tk) = maybe_tk {
tokens.push(tk);
}
}

Ok(tokens)
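
Token::read now drives the whole loop through std::ops::ControlFlow: Break(()) means end of input, Continue(None) means something was consumed that produced no token (whitespace or a comment), and Continue(Some(tk)) yields a token, which lets read_all shrink to a single while-let. A stand-alone sketch of that pattern (generic, not the commit's code):

    use std::ops::ControlFlow;

    // Generic version of the pull loop read_all now uses: Break ends the loop,
    // Continue(None) is skipped, Continue(Some(item)) is collected.
    fn drain<T, E>(
        mut step: impl FnMut() -> Result<ControlFlow<(), Option<T>>, E>,
    ) -> Result<Vec<T>, E> {
        let mut out = Vec::new();
        while let ControlFlow::Continue(maybe_item) = step()? {
            if let Some(item) = maybe_item {
                out.push(item);
            }
        }
        Ok(out)
    }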

