wip: parser

This commit is contained in:
2025-11-02 21:33:16 +08:00
parent 7ac0070589
commit ee7619c405
2 changed files with 458 additions and 93 deletions

View File

@@ -1,10 +1,6 @@
///|
suberror ParseError String
// region Context
// region Type
///|
pub(all) enum Type {
Unit
@@ -18,23 +14,14 @@ pub(all) enum Type {
Generic(String, Type)
} derive(Show)
// region Components
///|
pub(all) enum Literal {
enum Literal {
Unit
Bool(Bool)
Int(Int)
Double(Double)
} derive(Show)
///|
enum LeftValue {
Identifier(String)
FieldAccess(LeftValue, String)
IndexAccess(LeftValue, Expr)
} derive(Show)
///|
enum AddSubOp {
Add
@@ -48,6 +35,18 @@ enum MulDivRemOp {
Rem
} derive(Show)
///|
/// A block: an array of statements together with an optional expression.
/// NOTE(review): the optional expression is presumably the block's trailing
/// result expression — confirm against `parse_block_expr`.
struct Block(Array[Stmt], Expr?) derive(Show)
///|
/// A pattern, as used on the left side of each arm in `Expr::Match`
/// (`Match(Expr, Array[(Pattern, Expr)])`).
enum Pattern {
// The `_` pattern.
Wildcard
// A pattern consisting of a single name.
Identifier(String)
// A literal value used as a pattern.
Literal(Literal)
// A tuple of sub-patterns.
Tuple(Array[Pattern])
// Optional first name, a second name, and sub-patterns.
// NOTE(review): presumably (enum type name, variant name, payload patterns),
// mirroring `Expr::EnumConstruct` — confirm once the pattern parser exists.
Enum(String?, String, Array[Pattern])
} derive(Show)
///|
enum Expr {
Or(Expr, Expr)
@@ -55,28 +54,78 @@ enum Expr {
Compare(CompareOperator, Expr, Expr)
AddSub(AddSubOp, Expr, Expr)
MulDivRem(MulDivRemOp, Expr, Expr)
Neg(Expr)
Not(Expr)
If(Expr, Expr, Expr?)
Match(Expr, Array[(Pattern, Expr)])
IndexAccess(Expr, Expr)
FieldAccess(Expr, String)
FunctionCall(Expr, Array[Expr])
ArrayMake(Expr, Expr)
StructConstruct(String, Array[(String, Expr)])
EnumConstruct(String?, String, Array[Expr])
Literal(Literal)
Tuple(Array[Expr])
Array(Array[Expr])
Identifier(String)
Block(Block)
} derive(Show)
///|
enum TopLevel {
TopLetDecl(id~ : String, type_~ : Type?, expr~ : Expr)
TopFn(Function)
Struct(
id~ : String,
user_defined_type~ : Type?,
fields~ : Array[(String, Type)]
)
Enum(
id~ : String,
user_defined_type~ : Type?,
variants~ : Array[(String, Array[Type])]
)
}
/// A function definition. Stored in `Program.top_functions` and carried by
/// `Stmt::LocalFunction`.
struct Function {
// Function name.
id : String
// NOTE(review): presumably an optional bracketed type parameter, as parsed
// for struct/enum declarations — confirm.
user_defined_type : Type?
// Parameters as (name, optional type) pairs.
params : Array[(String, Type?)]
// Return type, if one was given.
return_type : Type?
// Function body.
body : Block
} derive(Show)
///|
struct Program(Array[TopLevel])
/// The target of a `let` binding (see `Stmt::Let` and `Stmt::LetTuple`):
/// either a name or the wildcard.
enum Binding {
// Binds the value to the given name.
Identifier(String)
// The `_` target.
Wildcard
} derive(Show)
///|
/// A statement inside a block or loop body.
enum Stmt {
// Single binding, optional type, and an expression.
Let(Binding, Type?, Expr)
// Several bindings, optional type, and an expression.
LetTuple(Array[Binding], Type?, Expr)
// Named binding with optional type and an expression.
// NOTE(review): presumably the `let mut` form — confirm in the statement parser.
LetMut(String, Type?, Expr)
// Two expressions; NOTE(review): presumably (target, value) — confirm.
Assign(Expr, Expr)
// An expression and an array of statements;
// NOTE(review): presumably (condition, loop body) — confirm.
While(Expr, Array[Stmt])
// A bare expression used as a statement.
Expr(Expr)
// `return` with an optional expression.
Return(Expr?)
// A function definition nested inside a block.
LocalFunction(Function)
} derive(Show)
///|
/// A top-level `let` declaration, stored in `Program.top_lets`.
struct TopLet {
// Declared name.
id : String
// Type annotation; `None` when the declaration has no `: type` part.
type_ : Type?
// Expression on the right of `=`.
expr : Expr
} derive(Show)
///|
/// A struct declaration as produced by `parse_struct_decl`.
struct StructDef {
// Struct name (an upper-case identifier).
id : String
// NOTE(review): presumably the optional `[T]` type parameter following the
// struct name — confirm against `parse_struct_decl`.
user_defined_type : Type?
// Fields as (name, type) pairs, in declaration order.
fields : Array[(String, Type)]
} derive(Show)
///|
/// An enum declaration as produced by `parse_enum_decl`.
struct EnumDef {
// Enum name (an upper-case identifier).
id : String
// NOTE(review): presumably the optional `[T]` type parameter following the
// enum name — confirm against `parse_enum_decl`.
user_defined_type : Type?
// Variants as (variant name, payload types) pairs; the type array is empty
// for a variant without a parenthesised type list.
variants : Array[(String, Array[Type])]
} derive(Show)
///|
/// A parsed program: top-level declarations grouped by kind, each map keyed
/// by the declaration's name.
/// NOTE(review): `parse_program` stores entries with plain map assignment, so
/// a later declaration with the same name appears to replace an earlier one
/// — confirm whether duplicates should be rejected.
struct Program {
// Top-level `let` declarations.
top_lets : Map[String, TopLet]
// Top-level functions.
top_functions : Map[String, Function]
// Struct declarations.
struct_defs : Map[String, StructDef]
// Enum declarations.
enum_defs : Map[String, EnumDef]
} derive(Show)
///|
fn parse_type(
@@ -95,6 +144,7 @@ fn parse_type(
(Array(elem_type), rest)
}
[LParen, .. rest] => {
// XXX: function_type has at least one type in the argument list?
let (first_type, rest) = parse_type(rest)
let types = [first_type]
loop rest {
@@ -127,11 +177,11 @@ fn parse_type(
}
///|
fn parse_struct(
fn parse_struct_decl(
tokens : ArrayView[Token],
) -> (TopLevel, ArrayView[Token]) raise ParseError {
) -> (StructDef, ArrayView[Token]) raise ParseError {
guard tokens is [Struct, UpperIdentifier(id), .. rest] else {
raise ParseError("Expected 'struct' followed by struct name")
raise ParseError("Expected upper case struct name after 'struct'")
}
let (user_defined_type, rest) = if rest
is [LBracket, UpperIdentifier(type_), RBracket, .. r] {
@@ -144,7 +194,7 @@ fn parse_struct(
}
let fields = []
loop rest {
[RCurlyBracket, .. r] => (Struct(id~, user_defined_type~, fields~), r)
[RCurlyBracket, .. r] => ({ id, user_defined_type, fields }, r)
[UpperIdentifier(field_name) | LowerIdentifier(field_name), Colon, .. r] => {
let (field_type, r) = parse_type(r)
fields.push((field_name, field_type))
@@ -159,11 +209,11 @@ fn parse_struct(
}
///|
fn parse_enum(
fn parse_enum_decl(
tokens : ArrayView[Token],
) -> (TopLevel, ArrayView[Token]) raise ParseError {
) -> (EnumDef, ArrayView[Token]) raise ParseError {
guard tokens is [Enum, UpperIdentifier(id), .. rest] else {
raise ParseError("Expected 'enum' followed by enum name")
raise ParseError("Expected upper case enum name after 'enum'")
}
let (user_defined_type, rest) = if rest
is [LBracket, UpperIdentifier(type_), RBracket, .. r] {
@@ -176,7 +226,7 @@ fn parse_enum(
}
let variants = []
loop rest {
[RCurlyBracket, .. r] => (Enum(id~, user_defined_type~, variants~), r)
[RCurlyBracket, .. r] => ({ id, user_defined_type, variants }, r)
[UpperIdentifier(variant_name), .. r] => {
let variant_types = []
if r is [LParen, .. r] {
@@ -186,10 +236,6 @@ fn parse_enum(
let (variant_type, r) = parse_type(r)
variant_types.push(variant_type)
match r {
[Comma, RParen, ..] =>
raise ParseError(
"Trailing comma in enum variant type list is not allowed",
)
[Comma, .. r] => continue r
[RParen, ..] => break
_ =>
@@ -211,39 +257,324 @@ fn parse_enum(
}
}
///|
/// Parses an `if` expression: `if`, a condition, a block, and optionally
/// `else` followed by another `if` expression or a block.
///
/// Returns `If(cond, then_branch, else_branch)` together with the tokens that
/// follow it. Raises `ParseError` when `tokens` does not start with `If` or
/// when a nested parse fails.
fn parse_if_expr(
  tokens : ArrayView[Token],
) -> (Expr, ArrayView[Token]) raise ParseError {
  let after_if = match tokens {
    [If, .. tail] => tail
    _ => raise ParseError("Expected 'if' at start of if expression")
  }
  let (condition, after_cond) = parse_expr(after_if)
  let (then_expr, after_then) = parse_block_expr(after_cond)
  match after_then {
    [Else, .. after_else] =>
      match after_else {
        // `else if ...`: the chained `if` becomes the else branch.
        [If, ..] => {
          let (nested_if, remaining) = parse_if_expr(after_else)
          (Expr::If(condition, then_expr, Some(nested_if)), remaining)
        }
        // `else { ... }`
        [LCurlyBracket, ..] => {
          let (else_block, remaining) = parse_block_expr(after_else)
          (Expr::If(condition, then_expr, Some(else_block)), remaining)
        }
        // An `else` followed by anything else is not consumed here; it stays
        // at the front of the returned tokens.
        _ => (Expr::If(condition, then_expr, None), after_then)
      }
    _ => (Expr::If(condition, then_expr, None), after_then)
  }
}
///|
/// Parses a value-level (atomic or prefix) expression from the front of
/// `tokens`.
///
/// Forms recognised, in match order: `Array::make(size, init)`,
/// `Name::{ field: expr, ... }` struct construction, the literals `()`,
/// booleans, integers and doubles, prefix `-` and `!`, parenthesised groups
/// and tuples, `[...]` array literals, block expressions, and bare
/// identifiers.
///
/// Returns the parsed expression and the tokens that follow it.
/// Raises `ParseError` when `tokens` is empty, starts with a token that opens
/// none of these forms, or a recognised form is malformed.
fn parse_value_level_expr(
  tokens : ArrayView[Token],
) -> (Expr, ArrayView[Token]) raise ParseError {
  match tokens {
    // array_make_expr
    [Array, DoubleColon, LowerIdentifier("make"), LParen, .. rest] => {
      let (size_expr, rest) = parse_expr(rest)
      guard rest is [Comma, .. rest] else {
        raise ParseError("Expected ',' after size expression in array make")
      }
      let (init_expr, rest) = parse_expr(rest)
      guard rest is [RParen, .. rest] else {
        raise ParseError("Expected ')' after init expression in array make")
      }
      (ArrayMake(size_expr, init_expr), rest)
    }
    // struct_construct_expr
    [UpperIdentifier(struct_type), DoubleColon, LCurlyBracket, .. rest] => {
      let fields = []
      loop rest {
        [RCurlyBracket, .. r] => (StructConstruct(struct_type, fields), r)
        [LowerIdentifier(field_name) | UpperIdentifier(field_name), Colon, .. r] => {
          let (field_expr, r) = parse_expr(r)
          fields.push((field_name, field_expr))
          match r {
            [Comma, .. r] => continue r
            // Leave the `}` in place; the first arm consumes it.
            [RCurlyBracket, ..] => continue r
            _ => raise ParseError("Expected ',' or '}' after struct field")
          }
        }
        _ => raise ParseError("Unexpected token in struct construction")
      }
    }
    // TODO: enum_construct_expr
    // unit_expr, bool_expr, int_expr, floating_point_expr
    [LParen, RParen, .. rest] => (Literal(Unit), rest)
    [BoolLiteral(value), .. rest] => (Literal(Bool(value)), rest)
    [IntLiteral(value), .. rest] => (Literal(Int(value)), rest)
    [DoubleLiteral(value), .. rest] => (Literal(Double(value)), rest)
    // neg_expr, not_expr
    [Sub, .. rest] => {
      let (expr, rest) = parse_value_level_expr(rest)
      (Neg(expr), rest)
    }
    // NOTE(review): `!` takes a full expression as its operand, whereas `-`
    // takes only a value-level one — confirm this asymmetry is intended.
    [Not, .. rest] => {
      let (expr, rest) = parse_expr(rest)
      (Not(expr), rest)
    }
    // group_expr, tuple_expr
    [LParen, .. r] => {
      let (first_expr, r) = parse_expr(r)
      let exprs = [first_expr]
      loop r {
        [Comma, .. r] => {
          let (next_expr, r) = parse_expr(r)
          exprs.push(next_expr)
          continue r
        }
        [RParen, .. r] =>
          // A single parenthesised expression is a group, not a 1-tuple.
          if exprs.length() == 1 {
            (exprs[0], r)
          } else {
            (Tuple(exprs), r)
          }
        _ =>
          raise ParseError("Expected ',' or ')' in tuple or grouped expression")
      }
    }
    // array_expr
    [LBracket, .. r] => {
      let elements = []
      loop r {
        [RBracket, .. r] => (Array(elements), r)
        r => {
          let (element, r) = parse_expr(r)
          elements.push(element)
          match r {
            [Comma, .. r] => continue r
            // Loop again with the `]` still at the front so the first arm
            // consumes it; returning from here would leave the closing `]`
            // in the remaining tokens.
            [RBracket, ..] => continue r
            _ =>
              raise ParseError(
                "Expected ',' or ']' in array literal expression",
              )
          }
        }
      }
    }
    // block_expr
    [LCurlyBracket, ..] => parse_block_expr(tokens)
    // identifier_expr
    [LowerIdentifier(name) | UpperIdentifier(name), .. rest] =>
      (Identifier(name), rest)
    // Anything else cannot start an expression: report it as a parse error
    // rather than falling through a partial match.
    [] => raise ParseError("Unexpected end of token stream in expression")
    [t, ..] => raise ParseError("Unexpected token: \{t} in expression")
  }
}
///|
/// Parses a value-level expression followed by any number of postfix
/// operations: `[index]` access, `.field` access, and (not yet implemented)
/// `(...)` application.
///
/// Postfix operations nest left to right around the base expression.
/// Returns the resulting expression and the remaining tokens; raises
/// `ParseError` if an index expression is not closed by `]` or a nested parse
/// fails.
fn parse_get_or_apply_level_expr(
  tokens : ArrayView[Token],
) -> (Expr, ArrayView[Token]) raise ParseError {
  let (base, after_base) = parse_value_level_expr(tokens)
  let mut acc = base
  loop after_base {
    // `.field`
    [Dot, UpperIdentifier(field_name) | LowerIdentifier(field_name), .. tail] => {
      acc = FieldAccess(acc, field_name)
      continue tail
    }
    // Function application is still a placeholder.
    [LParen, ..] => ...
    // `[index]`
    [LBracket, .. after_open] => {
      let (index, after_index) = parse_expr(after_open)
      let tail = match after_index {
        [RBracket, .. t] => t
        _ => raise ParseError("Expected ']' after index expression")
      }
      acc = IndexAccess(acc, index)
      continue tail
    }
    leftover => break (acc, leftover)
  }
}
///|
/// Dispatches on the first token: `if` goes to `parse_if_expr`, `match` is
/// not implemented yet, and everything else is parsed as a
/// get-or-apply-level expression.
fn parse_if_level_expr(
  tokens : ArrayView[Token],
) -> (Expr, ArrayView[Token]) raise ParseError {
  if tokens is [If, ..] {
    parse_if_expr(tokens)
  } else if tokens is [Match, ..] {
    // Match expressions are still a placeholder.
    ...
  } else {
    parse_get_or_apply_level_expr(tokens)
  }
}
///|
/// Parses a left-associative chain of `*`, `/` and `%` over if-level
/// operands.
///
/// Returns the folded `MulDivRem` expression (or the single operand when no
/// operator follows) and the remaining tokens.
fn parse_mul_div_level_expr(
  tokens : ArrayView[Token],
) -> (Expr, ArrayView[Token]) raise ParseError {
  let (head, after_head) = parse_if_level_expr(tokens)
  let mut acc = head
  loop after_head {
    [Mul | Div | Rem as op_token, .. after_op] => {
      let (operand, after_operand) = parse_if_level_expr(after_op)
      // Translate the operator token into the AST operator.
      let op : MulDivRemOp = match op_token {
        Mul => MulDivRemOp::Mul
        Div => MulDivRemOp::Div
        _ => MulDivRemOp::Rem
      }
      acc = MulDivRem(op, acc, operand)
      continue after_operand
    }
    leftover => break (acc, leftover)
  }
}
///|
/// Parses a left-associative chain of `+` and `-` over mul/div-level
/// operands.
///
/// Returns the folded `AddSub` expression (or the single operand when no
/// operator follows) and the remaining tokens.
fn parse_add_sub_level_expr(
  tokens : ArrayView[Token],
) -> (Expr, ArrayView[Token]) raise ParseError {
  let (head, after_head) = parse_mul_div_level_expr(tokens)
  let mut acc = head
  loop after_head {
    [Add | Sub as op_token, .. after_op] => {
      let (operand, after_operand) = parse_mul_div_level_expr(after_op)
      // Translate the operator token into the AST operator.
      let op : AddSubOp = match op_token {
        Add => AddSubOp::Add
        _ => AddSubOp::Sub
      }
      acc = AddSub(op, acc, operand)
      continue after_operand
    }
    leftover => break (acc, leftover)
  }
}
///|
/// Parses an add/sub-level expression optionally followed by a single
/// comparison operator and a second add/sub-level expression.
///
/// Comparisons do not chain: at most one operator is consumed. Returns the
/// expression and the remaining tokens.
fn parse_compare_level_expr(
  tokens : ArrayView[Token],
) -> (Expr, ArrayView[Token]) raise ParseError {
  let (lhs, after_lhs) = parse_add_sub_level_expr(tokens)
  if after_lhs is [CompareOperator(op), .. after_op] {
    let (rhs, after_rhs) = parse_add_sub_level_expr(after_op)
    (Compare(op, lhs, rhs), after_rhs)
  } else {
    (lhs, after_lhs)
  }
}
///|
/// Parses a left-associative chain of `And` tokens over compare-level
/// operands.
///
/// Returns the folded `And` expression (or the single operand when no `And`
/// follows) and the remaining tokens.
fn parse_and_level_expr(
  tokens : ArrayView[Token],
) -> (Expr, ArrayView[Token]) raise ParseError {
  let (lhs, after_lhs) = parse_compare_level_expr(tokens)
  let mut acc = lhs
  let mut cursor = after_lhs
  // Keep folding while the next token is `And`.
  while cursor is [And, .. after_and] {
    let (rhs, after_rhs) = parse_compare_level_expr(after_and)
    acc = And(acc, rhs)
    cursor = after_rhs
  }
  (acc, cursor)
}
///|
/// Parses a left-associative chain of `Or` tokens over and-level operands.
///
/// Returns the folded `Or` expression (or the single operand when no `Or`
/// follows) and the remaining tokens.
fn parse_or_level_expr(
  tokens : ArrayView[Token],
) -> (Expr, ArrayView[Token]) raise ParseError {
  let (lhs, after_lhs) = parse_and_level_expr(tokens)
  let mut acc = lhs
  let mut cursor = after_lhs
  // Keep folding while the next token is `Or`.
  while cursor is [Or, .. after_or] {
    let (rhs, after_rhs) = parse_and_level_expr(after_or)
    acc = Or(acc, rhs)
    cursor = after_rhs
  }
  (acc, cursor)
}
///|
/// Entry point for expression parsing: delegates to the lowest-precedence
/// level, `parse_or_level_expr`, and returns its result unchanged.
fn parse_expr(
  tokens : ArrayView[Token],
) -> (Expr, ArrayView[Token]) raise ParseError {
  let (expr, remaining) = parse_or_level_expr(tokens)
  (expr, remaining)
}
///|
pub fn parse_program(tokens : Array[Token]) -> Program raise ParseError {
let program = []
let top_lets = Map::new()
let top_functions = Map::new()
let struct_defs = Map::new()
let enum_defs = Map::new()
loop tokens[:] {
[EOF] => program
[Let, LowerIdentifier(id) | UpperIdentifier(id), Colon, .. rest] => {
let (type_, rest) = parse_type(rest)
[EOF] => { top_lets, top_functions, struct_defs, enum_defs }
[Let, LowerIdentifier(id) | UpperIdentifier(id), .. rest] => {
let (type_, rest) = match rest {
[Colon, .. r] => {
let (t, r) = parse_type(r)
(Some(t), r)
}
[Assign, ..] => (None, rest)
_ =>
raise ParseError(
"Expected ':' or '=' after identifier in let declaration",
)
}
guard rest is [Assign, .. rest] else {
raise ParseError(
"Expected '=' after type annotation in let declaration",
)
}
let (expr, rest) = parse_expr(rest)
program.push(TopLetDecl(id~, type_=Some(type_), expr~))
top_lets[id] = { id, type_, expr }
guard rest is [Semicolon, .. rest] else {
raise ParseError("Expected ';' after top let declaration")
}
continue rest
}
[Let, LowerIdentifier(id) | UpperIdentifier(id), Assign, .. rest] => {
let (expr, rest) = parse_expr(rest)
program.push(TopLetDecl(id~, type_=None, expr~))
[Fn, LowerIdentifier("main"), ..] => {
let (body, rest) = parse_block_expr(tokens)
top_functions["main"] = {
id: "main",
user_defined_type: None,
params: [],
return_type: Some(Unit),
body,
}
continue rest
}
[Fn, LowerIdentifier("main"), ..] => ...
[Fn, ..] => ...
[Struct, ..] as tokens => {
let (struct_, rest) = parse_struct(tokens)
program.push(struct_)
let (struct_, rest) = parse_struct_decl(tokens)
struct_defs[struct_.id] = struct_
continue rest
}
[Enum, ..] as tokens => {
let (enum_, rest) = parse_enum(tokens)
program.push(enum_)
let (enum_, rest) = parse_enum_decl(tokens)
enum_defs[enum_.id] = enum_
continue rest
}
_ => raise ParseError("Unexpected token at top level")
[] => raise ParseError("Unexpected end of token stream")
[t, ..] => raise ParseError("Unexpected token: \{t} at top level")
}
}

View File

@@ -1,3 +1,6 @@
///|
suberror TokenizeError String
///|
pub(all) enum CompareOperator {
Equal
@@ -11,7 +14,6 @@ pub(all) enum CompareOperator {
///|
pub(all) enum Token {
EOF
BoolLiteral(Bool)
Unit
Bool
Int
@@ -20,11 +22,17 @@ pub(all) enum Token {
Not
If
Else
Match
While
Fn
Return
Let
Mut
Struct
Enum
Number(Int)
BoolLiteral(Bool)
IntLiteral(Int)
DoubleLiteral(Double)
UpperIdentifier(String)
LowerIdentifier(String)
Wildcard
@@ -36,6 +44,7 @@ pub(all) enum Token {
Sub
Mul
Div
Rem
Assign
LParen
RParen
@@ -44,13 +53,15 @@ pub(all) enum Token {
LCurlyBracket
RCurlyBracket
Arrow
MatchArrow
DoubleColon
Colon
Semicolon
Comma
} derive(Show)
///|
pub fn tokenize(input : String) -> Array[Token] {
pub fn tokenize(input : String) -> Array[Token] raise TokenizeError {
let tokens = []
loop input[:] {
[' ' | '\n' | '\r' | '\t', .. rest] => continue rest
@@ -64,17 +75,25 @@ pub fn tokenize(input : String) -> Array[Token] {
}
['0'..='9', ..] as pattern => {
let number_str = StringBuilder::new()
let mut float_point = false
let rest = loop pattern {
['0'..='9' as c, .. r] => {
number_str.write_char(c)
continue r
}
r => {
let number = try! @strconv.parse_int(number_str.to_string())
tokens.push(Number(number))
r
['.', .. r] if !float_point => {
number_str.write_char('.')
float_point = true
continue r
}
r => break r
}
let number = if float_point {
DoubleLiteral(try! @strconv.parse_double(number_str.to_string()))
} else {
IntLiteral(try! @strconv.parse_int(number_str.to_string()))
}
tokens.push(number)
continue rest
}
['A'..='Z', ..] as pattern => {
@@ -84,19 +103,17 @@ pub fn tokenize(input : String) -> Array[Token] {
ident_str.write_char(c)
continue r
}
r => {
let ident : Token = match ident_str.to_string() {
"Unit" => Unit
"Bool" => Bool
"Int" => Int
"Double" => Double
"Array" => Array
s => UpperIdentifier(s)
}
tokens.push(ident)
r
}
r => break r
}
let ident : Token = match ident_str.to_string() {
"Unit" => Unit
"Bool" => Bool
"Int" => Int
"Double" => Double
"Array" => Array
s => UpperIdentifier(s)
}
tokens.push(ident)
continue rest
}
['a'..='z', ..] | ['_', ..] as pattern => {
@@ -106,30 +123,35 @@ pub fn tokenize(input : String) -> Array[Token] {
ident_str.write_char(c)
continue r
}
r => {
let ident = match ident_str.to_string() {
"_" => Wildcard
"true" => BoolLiteral(true)
"false" => BoolLiteral(false)
"not" => Not
"if" => If
"else" => Else
"fn" => Fn
"let" => Let
"struct" => Struct
"enum" => Enum
s => LowerIdentifier(s)
}
tokens.push(ident)
r
}
r => break r
}
let ident : Token = match ident_str.to_string() {
"_" => Wildcard
"true" => BoolLiteral(true)
"false" => BoolLiteral(false)
"if" => If
"else" => Else
"match" => Match
"while" => While
"fn" => Fn
"return" => Return
"let" => Let
"mut" => Mut
"struct" => Struct
"enum" => Enum
s => LowerIdentifier(s)
}
tokens.push(ident)
continue rest
}
[.. "->", .. rest] => {
tokens.push(Arrow)
continue rest
}
[.. "=>", .. rest] => {
tokens.push(MatchArrow)
continue rest
}
[.. "==", .. rest] => {
tokens.push(CompareOperator(Equal))
continue rest
@@ -182,6 +204,14 @@ pub fn tokenize(input : String) -> Array[Token] {
tokens.push(Div)
continue rest
}
['%', .. rest] => {
tokens.push(Rem)
continue rest
}
['!', .. rest] => {
tokens.push(Not)
continue rest
}
['=', .. rest] => {
tokens.push(Assign)
continue rest
@@ -210,6 +240,10 @@ pub fn tokenize(input : String) -> Array[Token] {
tokens.push(RCurlyBracket)
continue rest
}
[.. "::", .. rest] => {
tokens.push(DoubleColon)
continue rest
}
[':', .. rest] => {
tokens.push(Colon)
continue rest
@@ -223,7 +257,7 @@ pub fn tokenize(input : String) -> Array[Token] {
continue rest
}
[] => tokens.push(EOF)
[c, ..] => abort("Unexpected character: " + c.to_string())
[c, ..] => raise TokenizeError("Unexpected character: \{c}")
}
tokens
}
@@ -238,7 +272,7 @@ test "tokenize" {
inspect(
tokenize(input),
content=(
#|[Let, LowerIdentifier("x"), Assign, Number(42), If, LowerIdentifier("x"), CompareOperator(Greater), Number(0), LCurlyBracket, LowerIdentifier("x"), Assign, LowerIdentifier("x"), Sub, Number(1), RCurlyBracket, Else, LCurlyBracket, LowerIdentifier("x"), Assign, Number(0), RCurlyBracket, EOF]
#|[Let, LowerIdentifier("x"), Assign, IntLiteral(42), If, LowerIdentifier("x"), CompareOperator(Greater), IntLiteral(0), LCurlyBracket, LowerIdentifier("x"), Assign, LowerIdentifier("x"), Sub, IntLiteral(1), RCurlyBracket, Else, LCurlyBracket, LowerIdentifier("x"), Assign, IntLiteral(0), RCurlyBracket, EOF]
),
)
}