From 688121c5a154e047dc38b34e97ccdfeb54a8ba9d Mon Sep 17 00:00:00 2001 From: dkanus Date: Tue, 16 Sep 2025 08:22:54 +0700 Subject: [PATCH] Add first version of parser --- rottlib/Cargo.toml | 3 +- rottlib/src/arena.rs | 277 ++++++++++++++++ rottlib/src/ast.rs | 376 +++++++++++++++++++++ rottlib/src/diagnostics.rs | 251 ++++++++++++++ rottlib/src/lib.rs | 4 + rottlib/src/parser/cursor.rs | 230 +++++++++++++ rottlib/src/parser/errors.rs | 95 ++++++ rottlib/src/parser/grammar/block.rs | 60 ++++ rottlib/src/parser/grammar/control.rs | 180 ++++++++++ rottlib/src/parser/grammar/flow.rs | 99 ++++++ rottlib/src/parser/grammar/mod.rs | 7 + rottlib/src/parser/grammar/pratt.rs | 406 +++++++++++++++++++++++ rottlib/src/parser/grammar/precedence.rs | 185 +++++++++++ rottlib/src/parser/grammar/statements.rs | 185 +++++++++++ rottlib/src/parser/grammar/switch.rs | 227 +++++++++++++ rottlib/src/parser/mod.rs | 66 ++++ rottlib/src/parser/pretty.rs | 353 ++++++++++++++++++++ rottlib/src/parser/recovery.rs | 253 ++++++++++++++ rottlib/src/parser/trivia.rs | 297 +++++++++++++++++ 19 files changed, 3553 insertions(+), 1 deletion(-) create mode 100644 rottlib/src/arena.rs create mode 100644 rottlib/src/ast.rs create mode 100644 rottlib/src/diagnostics.rs create mode 100644 rottlib/src/parser/cursor.rs create mode 100644 rottlib/src/parser/errors.rs create mode 100644 rottlib/src/parser/grammar/block.rs create mode 100644 rottlib/src/parser/grammar/control.rs create mode 100644 rottlib/src/parser/grammar/flow.rs create mode 100644 rottlib/src/parser/grammar/mod.rs create mode 100644 rottlib/src/parser/grammar/pratt.rs create mode 100644 rottlib/src/parser/grammar/precedence.rs create mode 100644 rottlib/src/parser/grammar/statements.rs create mode 100644 rottlib/src/parser/grammar/switch.rs create mode 100644 rottlib/src/parser/mod.rs create mode 100644 rottlib/src/parser/pretty.rs create mode 100644 rottlib/src/parser/recovery.rs create mode 100644 rottlib/src/parser/trivia.rs diff 
--git a/rottlib/Cargo.toml b/rottlib/Cargo.toml index 1d879c8..9a0146a 100644 --- a/rottlib/Cargo.toml +++ b/rottlib/Cargo.toml @@ -8,4 +8,5 @@ default = [] debug = [] [dependencies] -logos = "0.15" \ No newline at end of file +logos = "0.15" +bumpalo = { version = "3", features = ["boxed", "collections"] } \ No newline at end of file diff --git a/rottlib/src/arena.rs b/rottlib/src/arena.rs new file mode 100644 index 0000000..f12111a --- /dev/null +++ b/rottlib/src/arena.rs @@ -0,0 +1,277 @@ +//! Arena submodule defining types that exist in their own memory space and +//! allow multiple cheap allocations (both performance- and fragmentation-wise). +//! +//! ## Memory safety +//! +//! Dropping the [`Arena`] frees all its memory at once and does not run +//! [`Drop`] for values allocated within it. Avoid storing types that implement +//! [`Drop`] or own external resources inside [`ArenaNode`], [`ArenaVec`], or +//! [`ArenaString`]. If you must, arrange an explicit "drain/drop" pass before +//! the arena is dropped. + +use core::fmt::{Debug, Display, Formatter, Result}; +use core::ops::{Deref, DerefMut}; + +use bumpalo::{Bump, boxed, collections}; + +use crate::ast::AstSpan; +use crate::lexer::TokenLocation; + +/// Object that manages a separate memory space, which can be deallocated all +/// at once after use. +/// +/// All allocations borrow the arena immutably. +/// +/// Dropping the [`Arena`] does not run [`Drop`] for values allocated within it +/// (including values contained in [`ArenaNode`], [`ArenaVec`] +/// and [`ArenaString`]). +/// +/// This arena is not thread-safe (`!Send`, `!Sync`). Values borrow the arena +/// and therefore cannot be sent across threads independently. +#[derive(Debug)] +pub struct Arena { + bump: Bump, +} + +impl Arena { + /// Creates a new, empty arena. + #[must_use] + pub fn new() -> Self { + Self { bump: Bump::new() } + } + + /// Constructs an empty [`ArenaVec`] allocated in this arena. 
+    ///
+    /// The returned vector borrows this arena and cannot outlive it.
+    #[must_use]
+    pub fn vec<T>(&self) -> ArenaVec<'_, T> {
+        ArenaVec(collections::Vec::new_in(&self.bump))
+    }
+
+    /// Allocates a copy of `string` in this arena and returns
+    /// an [`ArenaString`].
+    #[must_use]
+    pub fn string(&self, string: &str) -> ArenaString<'_> {
+        ArenaString(collections::String::from_str_in(string, &self.bump))
+    }
+
+    /// Allocates `value` in this arena with the given `span`,
+    /// returning an [`ArenaNode`].
+    ///
+    /// The node's storage borrows this arena and cannot outlive it.
+    ///
+    /// Note: `T`'s [`Drop`] is not run when the arena is dropped.
+    #[must_use]
+    pub fn alloc<T>(&self, value: T, span: AstSpan) -> ArenaNode<'_, T> {
+        ArenaNode {
+            inner: boxed::Box::new_in(value, &self.bump),
+            span,
+        }
+    }
+
+    pub fn alloc_between<T>(
+        &self,
+        value: T,
+        from: TokenLocation,
+        to: TokenLocation,
+    ) -> ArenaNode<'_, T> {
+        self.alloc(value, AstSpan { from, to })
+    }
+
+    pub fn alloc_at<T>(&self, value: T, at: TokenLocation) -> ArenaNode<'_, T> {
+        self.alloc(value, AstSpan { from: at, to: at })
+    }
+}
+
+impl Default for Arena {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// An arena-allocated box with an attached source span.
+///
+/// Equality and hashing take into account both the contained `T` and the `span`
+/// (when `T: Eq + Hash`).
+///
+/// Note: `T`'s [`Drop`] is not run when the arena is dropped.
+#[derive(Hash, PartialEq, Eq)]
+pub struct ArenaNode<'arena, T> {
+    /// Value allocated in the arena; this node owns it.
+    inner: boxed::Box<'arena, T>,
+    /// Token range covered by the value.
+    span: AstSpan,
+}
+
+impl<'arena, T> ArenaNode<'arena, T> {
+    /// Creates a new [`ArenaNode`] by allocating `value` in `arena`.
+    #[must_use]
+    pub fn new_in(value: T, span: AstSpan, arena: &'arena Arena) -> Self {
+        Self {
+            inner: boxed::Box::new_in(value, &arena.bump),
+            span,
+        }
+    }
+
+    /// Creates a new [`ArenaNode`] for an AST node that spans a single token.
+ pub fn from_token_location( + value: T, + token_location: crate::lexer::TokenLocation, + arena: &'arena Arena, + ) -> Self { + Self { + inner: boxed::Box::new_in(value, &arena.bump), + span: AstSpan { + from: token_location, + to: token_location, + }, + } + } + + pub fn span_mut(&mut self) -> &mut AstSpan { + &mut self.span + } + + pub fn extend_to(&mut self, to: TokenLocation) { + self.span.to = to; + } + + pub fn extend_from(&mut self, from: TokenLocation) { + self.span.from = from; + } + + /// Returns the token span covered by this node. + pub fn span(&self) -> &AstSpan { + &self.span + } +} + +impl<'arena, T> Deref for ArenaNode<'arena, T> { + type Target = T; + + fn deref(&self) -> &T { + &self.inner + } +} + +impl<'arena, T> DerefMut for ArenaNode<'arena, T> { + fn deref_mut(&mut self) -> &mut T { + &mut self.inner + } +} + +impl<'arena, T: Debug> Debug for ArenaNode<'arena, T> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + f.debug_struct("ArenaNode") + .field("inner", &**self) + .field("span", &self.span()) + .finish() + } +} + +/// Version of [`Vec`] that can be safely used inside a memory arena. +/// +/// Elements do not have their destructors run when the arena is dropped. +/// +/// This type dereferences to `[T]` and supports iteration by reference +/// (`&ArenaVec` and `&mut ArenaVec` implement [`IntoIterator`]). +#[derive(Clone, Debug, Hash, PartialEq, Eq)] +pub struct ArenaVec<'arena, T>(collections::Vec<'arena, T>); + +impl<'arena, T> ArenaVec<'arena, T> { + /// Creates an empty `ArenaVec` allocated in `arena`. + #[must_use] + pub fn new_in(arena: &'arena Arena) -> Self { + Self(collections::Vec::new_in(&arena.bump)) + } + + /// Appends an element to the end of the vector. + /// + /// Growth is backed by the arena; increasing capacity allocates new space + /// in the arena and never frees previous blocks. 
+    pub fn push(&mut self, value: T) {
+        self.0.push(value)
+    }
+
+    pub fn reserve(&mut self, additional: usize) {
+        self.0.reserve(additional)
+    }
+    pub fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
+        self.0.extend(it)
+    }
+}
+
+impl<'arena, T> Deref for ArenaVec<'arena, T> {
+    type Target = [T];
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<'arena, T> DerefMut for ArenaVec<'arena, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+impl<'arena, 's, T> IntoIterator for &'s ArenaVec<'arena, T> {
+    type Item = &'s T;
+    type IntoIter = core::slice::Iter<'s, T>;
+    fn into_iter(self) -> Self::IntoIter {
+        self.0.iter()
+    }
+}
+
+impl<'arena, 's, T> IntoIterator for &'s mut ArenaVec<'arena, T> {
+    type Item = &'s mut T;
+    type IntoIter = core::slice::IterMut<'s, T>;
+    fn into_iter(self) -> Self::IntoIter {
+        self.0.iter_mut()
+    }
+}
+
+/// Version of [`String`] that can be safely used inside a memory arena.
+///
+/// This type dereferences to [`str`] and implements [`AsRef`] and
+/// [`core::borrow::Borrow`] for ergonomic use with APIs expecting string
+/// slices.
+///
+/// The string borrows the arena and cannot outlive it. Dropping the arena
+/// frees its memory without running `Drop` for the string contents.
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+pub struct ArenaString<'arena>(collections::String<'arena>);
+
+impl<'arena> ArenaString<'arena> {
+    /// Allocates a copy of `string` in `arena` and returns an [`ArenaString`].
+    #[must_use]
+    pub fn from_str_in(string: &str, arena: &'arena Arena) -> Self {
+        Self(collections::String::from_str_in(string, &arena.bump))
+    }
+}
+
+impl<'arena> Deref for ArenaString<'arena> {
+    type Target = str;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<'arena> AsRef<str> for ArenaString<'arena> {
+    fn as_ref(&self) -> &str {
+        &self.0
+    }
+}
+
+impl<'arena> core::borrow::Borrow<str> for ArenaString<'arena> {
+    fn borrow(&self) -> &str {
+        &self.0
+    }
+}
+
+impl<'arena> Display for ArenaString<'arena> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
+        Display::fmt(&self.0, f)
+    }
+}
diff --git a/rottlib/src/ast.rs b/rottlib/src/ast.rs
new file mode 100644
index 0000000..31d670a
--- /dev/null
+++ b/rottlib/src/ast.rs
@@ -0,0 +1,376 @@
+use crate::arena::ArenaVec;
+
+use super::lexer::TokenLocation;
+
+use core::fmt;
+
+use crate::arena::{Arena, ArenaNode, ArenaString};
+
+// Source range of an AST node; both `from` and `to` are inclusive.
+#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
+pub struct AstSpan {
+    pub from: TokenLocation,
+    pub to: TokenLocation,
+}
+
+impl AstSpan {
+    pub fn merge(left_span: &AstSpan, right_span: &AstSpan) -> AstSpan {
+        AstSpan {
+            from: left_span.from,
+            to: right_span.to,
+        }
+    }
+
+    pub fn new(single_location: TokenLocation) -> AstSpan {
+        AstSpan {
+            from: single_location,
+            to: single_location,
+        }
+    }
+
+    pub fn range(from: TokenLocation, to: TokenLocation) -> AstSpan {
+        AstSpan { from, to }
+    }
+
+    pub fn extend_to(&mut self, right_most_location: TokenLocation) {
+        if right_most_location > self.to {
+            self.to = right_most_location
+        }
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum PrefixOperator {
+    Not,
+    Minus,
+    BitwiseNot,
+    Increment,
+    Decrement,
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum PostfixOperator {
+    Increment,
+    Decrement,
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum InfixOperator {
+    // Assignments
+    Assign,
+    MultiplyAssign,
+    DivideAssign,
+    ModuloAssign,
+    PlusAssign,
+    MinusAssign,
+    ConcatAssign,
ConcatSpaceAssign, + // String operations + ConcatSpace, + Concat, + // Logical + And, + Xor, + Or, + // Bit-wise + BitwiseAnd, + BitwiseOr, + BitwiseXor, + // Not-equal + NotEqual, + // Comparison + Equal, + ApproximatelyEqual, + Less, + LessEqual, + Greater, + GreaterEqual, + ClockwiseFrom, + // Shifts + LeftShift, + LogicalRightShift, + RightShift, + // Terms + Plus, + Minus, + // Modulo + Modulo, + // Factor + Multiply, + Divide, + Dot, + Cross, + // Exponentiation + Exponentiation, +} + +#[allow(clippy::large_enum_variant)] +#[derive(Debug)] +pub enum Expression<'src, 'arena> { + Binary( + ExpressionRef<'src, 'arena>, + InfixOperator, + ExpressionRef<'src, 'arena>, + ), + LeftUnary(PrefixOperator, ExpressionRef<'src, 'arena>), + RightUnary(ExpressionRef<'src, 'arena>, PostfixOperator), + + Identifier(&'src str), + String(ArenaString<'arena>), + Integer(i128), + Float(f64), + + Bool(bool), + None, + Parentheses(ExpressionRef<'src, 'arena>), + + Block { + // All these end with `;` + statements: ArenaVec<'arena, StatementRef<'src, 'arena>>, + // Last statement, but only if it doesn't end with `;` + tail: Option>, + }, + If { + condition: ExpressionRef<'src, 'arena>, + body: ExpressionRef<'src, 'arena>, + else_body: Option>, + }, + While { + condition: ExpressionRef<'src, 'arena>, + body: ExpressionRef<'src, 'arena>, + }, + DoUntil { + condition: ExpressionRef<'src, 'arena>, + body: ExpressionRef<'src, 'arena>, + }, + ForEach { + iterator: ExpressionRef<'src, 'arena>, + body: ExpressionRef<'src, 'arena>, + }, + For { + init: Option>, + condition: Option>, + step: Option>, + body: ExpressionRef<'src, 'arena>, + }, + Switch { + selector: ExpressionRef<'src, 'arena>, + cases: ArenaVec<'arena, CaseRef<'src, 'arena>>, + // default case + default_arm: Option>>, + // last statement of the case block + tail: Option>, + }, + Goto(ArenaString<'arena>), + Continue, + Break(Option>), + Return(Option>), + // For injecting in place of parts that couldn't be parsed + // (along 
with text that wasn't able to be parsed)
+    Error,
+}
+
+pub type ExpressionRef<'src, 'arena> = ArenaNode<'arena, Expression<'src, 'arena>>;
+
+#[derive(Debug)]
+pub struct VariableDeclarator<'src, 'arena> {
+    pub name: ArenaString<'arena>,
+    pub initializer: Option<ExpressionRef<'src, 'arena>>,
+}
+
+#[derive(Debug)]
+pub struct SwitchCase<'src, 'arena> {
+    pub labels: ArenaVec<'arena, ExpressionRef<'src, 'arena>>, // UScript allows expressions; multiple labels ok
+    pub body: ArenaVec<'arena, StatementRef<'src, 'arena>>, // allow fallthrough unless a Break/Goto ends it
+}
+
+pub type CaseRef<'src, 'arena> = ArenaNode<'arena, SwitchCase<'src, 'arena>>;
+
+#[derive(Debug)]
+pub enum Statement<'src, 'arena> {
+    // For the cases where user just used too many semi-colons `;;;;`
+    Empty,
+    Expression(ExpressionRef<'src, 'arena>),
+    // Just declarations without assignment:
+    // `local int i, j, k`
+    LocalVariableDeclaration {
+        type_name: ArenaString<'arena>,
+        identifiers: ArenaVec<'arena, ArenaString<'arena>>,
+    },
+    // Just `int i, j = 3, k = 0`
+    VariableDeclaration {
+        type_name: ArenaString<'arena>,
+        declarations: ArenaVec<'arena, VariableDeclarator<'src, 'arena>>,
+    },
+    Label(ArenaString<'arena>),
+    // For injecting in place of parts that couldn't be parsed
+    // (along with text that wasn't able to be parsed)
+    Error,
+}
+
+pub type StatementRef<'src, 'arena> = ArenaNode<'arena, Statement<'src, 'arena>>;
+
+impl<'src, 'arena> Expression<'src, 'arena> {
+    pub fn new_prefix(
+        arena: &'arena Arena,
+        op_position: TokenLocation,
+        op: PrefixOperator,
+        rhs: ArenaNode<'arena, Self>,
+    ) -> ArenaNode<'arena, Self> {
+        let span = AstSpan {
+            from: op_position,
+            to: rhs.span().to,
+        };
+        ArenaNode::new_in(Self::LeftUnary(op, rhs), span, arena)
+    }
+    pub fn new_postfix(
+        arena: &'arena Arena,
+        lhs: ArenaNode<'arena, Self>,
+        op: PostfixOperator,
+        op_position: TokenLocation,
+    ) -> ArenaNode<'arena, Self> {
+        let span = AstSpan {
+            from: lhs.span().from,
+            to: op_position,
+        };
+        ArenaNode::new_in(Self::RightUnary(lhs, op), span, arena)
+    }
+    pub fn new_binary(
+        arena: &'arena Arena,
+        lhs: ArenaNode<'arena, Self>,
+        op: InfixOperator,
+        rhs: ArenaNode<'arena, Self>,
+    ) -> ArenaNode<'arena, Self> {
+        let span = AstSpan::merge(lhs.span(), rhs.span());
+        ArenaNode::new_in(Self::Binary(lhs, op, rhs), span, arena)
+    }
+}
+
+/// Reports whether a node must be followed by `;` when used as a statement
+/// (`false` for block-like control flow and for empty/label/error nodes).
+pub trait NeedsSemi {
+    fn needs_semicolon(&self) -> bool;
+}
+
+impl<'src, 'arena> NeedsSemi for Expression<'src, 'arena> {
+    #[inline]
+    fn needs_semicolon(&self) -> bool {
+        match self {
+            Expression::Block { .. }
+            | Expression::If { .. }
+            | Expression::While { .. }
+            | Expression::DoUntil { .. }
+            | Expression::ForEach { .. }
+            | Expression::For { .. }
+            | Expression::Error => false,
+            // NOTE(review): `Switch` is not listed, so it needs `;` - confirm this is intended.
+            // All other expressions require `;` when used as a statement.
+            _ => true,
+        }
+    }
+}
+
+// `ArenaNode` implements `Deref<Target = T>`, so `**self` is the wrapped
+// `Expression` and the check is simply delegated to it.
+impl<'src, 'arena> NeedsSemi for ExpressionRef<'src, 'arena> {
+    #[inline]
+    fn needs_semicolon(&self) -> bool {
+        (**self).needs_semicolon()
+    }
+}
+
+impl<'src, 'arena> NeedsSemi for Statement<'src, 'arena> {
+    #[inline]
+    fn needs_semicolon(&self) -> bool {
+        match self {
+            Statement::Empty | Statement::Label(_) | Statement::Error => false,
+            // All other statements require `;`.
+            _ => true,
+        }
+    }
+}
+
+// `ArenaNode` implements `Deref<Target = T>`, so `**self` is the wrapped
+// `Statement` and the check is simply delegated to it.
+impl<'src, 'arena> NeedsSemi for StatementRef<'src, 'arena> {
+    #[inline]
+    fn needs_semicolon(&self) -> bool {
+        (**self).needs_semicolon()
+    }
+}
+
+impl fmt::Display for PrefixOperator {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match self {
+            PrefixOperator::Not => "!",
+            PrefixOperator::Minus => "-",
+            PrefixOperator::BitwiseNot => "~",
+            PrefixOperator::Increment => "++.",
+            PrefixOperator::Decrement => "--.",
+        };
+        f.write_str(s)
+    }
+}
+impl fmt::Display for PostfixOperator {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match self {
+            PostfixOperator::Increment => ".++",
+            PostfixOperator::Decrement => ".--",
+        };
+        f.write_str(s)
+    }
+}
+impl fmt::Display for InfixOperator {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use InfixOperator::*;
+        let s = match self {
+            // Assignments
+            Assign => "=",
+            MultiplyAssign => "*=",
+            DivideAssign => "/=",
+            ModuloAssign => "%=",
+            PlusAssign => "+=",
+            MinusAssign => "-=",
+            ConcatAssign => "$=",
+            ConcatSpaceAssign => "@=",
+            // String operations
+            ConcatSpace => "@",
+            Concat => "$",
+            // Logical
+            And => "&&",
+            Xor => "^^",
+            Or => "||",
+            // Bitwise
+            BitwiseAnd => "&",
+            BitwiseOr => "|",
+            BitwiseXor => "^",
+            // Not equal
+            NotEqual => "!=",
+            // Comparison
+            Equal => "==",
+            ApproximatelyEqual => "~=",
+            Less => "<",
+            LessEqual => "<=",
+            Greater => ">",
+            GreaterEqual => ">=",
+            ClockwiseFrom => "ClockwiseFrom",
+            // Shift
+            LeftShift => "<<",
+            LogicalRightShift => ">>>",
+            RightShift => ">>",
+            // Term
+            Plus => "+",
+            Minus => "-",
+            // Modulo
+            Modulo => "%",
+            // Factor
+            Multiply => "*",
+            Divide => "/",
+            Dot => "Dot",
+            Cross => "Cross",
+            // Exp
+            Exponentiation => "**",
+        };
+        f.write_str(s)
+    }
+}
diff --git a/rottlib/src/diagnostics.rs b/rottlib/src/diagnostics.rs
new file mode 100644
index 0000000..57106b8
--- /dev/null
+++ b/rottlib/src/diagnostics.rs
@@ -0,0 +1,251 @@
+//!
Diagnostics primitives for all stages of compiler and frontend code. +//! +//! These types describe what to show the user when something goes wrong while +//! parsing or doing lightweight frontend checks. They are intentionally small, +//! depend only on [`AstSpan`], and are easy to construct and store. + +use crate::ast::AstSpan; + +/// Classification of a diagnostic by its impact. +/// +/// Choose the most restrictive level that reflects the state of the source and +/// the compiler's ability to continue. +/// +/// - `Error`: use when the source is invalid according to the language rules or +/// the parser cannot make a sound interpretation. Errors typically prevent +/// code generation. Examples: mismatched delimiters, +/// missing required tokens, invalid escapes, unrecoverable ambiguity. +/// - `Warning`: use when the source is valid but likely unintended, obsolete, +/// or suboptimal. Warnings should not change program semantics if ignored +/// and must not block compilation. Examples: deprecated syntax, shadowing +/// that is allowed but suspicious, unreachable code after a return. +/// Do not use warnings to paper over true syntax errors. If the construct +/// is invalid, prefer [`Severity::Error`] even if recovery is possible. +#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)] +#[non_exhaustive] +pub enum Severity { + /// Fatal to the current compilation unit or requires recovery. + Error, + /// Non-fatal advisory about suspicious but valid code. + Warning, +} + +/// A labeled source span with a short inline message. +/// +/// Message should be one sentence, start lowercase, and omit the final period. +#[derive(Clone, Debug, Hash, PartialEq, Eq)] +pub struct Label { + /// Span to highlight in source coordinates. + pub span: AstSpan, + /// Short inline text shown next to the caret line. + pub message: String, +} + +/// A single pure data diagnostic message with optional structured context. 
+#[derive(Clone, Debug, PartialEq, Eq)] +#[must_use] +pub struct Diagnostic { + /// Headline, e.g. "Mismatched closing delimiter: `}`". + headline: String, + /// Impact of the diagnostic. See [`Severity`] for guidance. + severity: Severity, + /// Optional stable identifier, e.g. "P0007" or "L0001". + /// + /// Codes must match `^[LPTSXD][0-9]{4}$` where the prefix is the domain: + /// `L` lexer, `P` parser, `T` type check, `S` semantics, `X` lints, + /// `D` deprecations. + /// + /// Codes help users search documentation and suppress or elevate specific + /// diagnostics. Keep codes stable across releases once published. + code: Option, + /// Marks the main location the user should look at first. + /// + /// Typically the exact token or span that triggered the diagnostic. + primary_label: Option