Compare commits

..

11 Commits

28 changed files with 3798 additions and 72 deletions

View File

@ -11,6 +11,10 @@ path = "src/dump_tokens.rs"
name = "uc_lexer_verify" name = "uc_lexer_verify"
path = "src/uc_lexer_verify.rs" path = "src/uc_lexer_verify.rs"
[[bin]]
name = "temp"
path = "src/temp.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]

View File

@ -70,7 +70,7 @@ fn main() {
let (decoded_str, _, _) = encoding.decode(&raw_bytes); let (decoded_str, _, _) = encoding.decode(&raw_bytes);
let source_text = decoded_str.to_string(); let source_text = decoded_str.to_string();
let tokenized_file = TokenizedFile::from_source(&source_text); let tokenized_file = TokenizedFile::from_str(&source_text);
tokenized_file.dump_debug_layout(); tokenized_file.dump_debug_layout();
} }

129
dev_tests/src/temp.rs Normal file
View File

@ -0,0 +1,129 @@
//! src/main.rs
//! --------------------------------------------
//! Build & run:
//! cargo run
//! --------------------------------------------
use std::env;
use std::fs;
use std::io::{self, Read, Write};
use std::path::Path;
use rottlib::arena::Arena;
use rottlib::lexer::TokenizedFile;
use rottlib::parser::{ParseError, Parser, pretty::ExprTree};
/*
- Convenient array definitions: [1, 3, 5, 2, 4]
- Boolean dynamic arrays
- Structures in default properties
- Auto conversion of arrays into strings
- Making 'var' and 'local' unnecessary
- Allowing variable creation in 'for' loops
- Allowing variable creation at any place inside a function
- Default parameters for functions
- Function overloading?
- repeat/until
- The syntax of the default properties block is pretty strict for an arcane reason. Particularly adding spaces before or after the "=" will lead to errors in pre-UT2003 versions.
- Scopes
- different names for variables and in config file
- anonymous pairs (objects?) and value destruction
>>> AST > HIR > MIR > byte code
*/
/// Closest plan:
/// - Add top-level declaration parsing
/// - Handle pretty.rs shit somehow
/// - COMMITS
/// ---------------------------------------
/// - Add fancy error reporting
/// - Make a fancy REPL
/// - Add evaluation
///
/// WARNINGS:
/// - Empty code/switch blocks
fn parse_and_print(src: &str) -> Result<(), ParseError> {
let tokenized = TokenizedFile::from_str(src);
let arena = Arena::new();
let mut parser = Parser::new(&tokenized, &arena);
let expr = parser.parse_expression(); // ArenaNode<Expression>
println!("{}", ExprTree(&*expr)); // if ArenaNode<Deref>
// or: println!("{}", ExprTree(expr.as_ref())); // if no Deref
Ok(())
}
fn repl_once() -> Result<(), ParseError> {
print!("Enter an statement > ");
io::stdout().flush().unwrap();
let mut input = String::new();
if io::stdin().read_line(&mut input).is_err() {
eprintln!("failed to read input");
return Ok(());
}
if input.trim().is_empty() {
return Ok(());
}
parse_and_print(&input)
}
fn read_stdin_all() -> io::Result<String> {
let mut buf = String::new();
io::stdin().read_to_string(&mut buf)?;
Ok(buf)
}
fn read_file_to_string(path: &Path) -> io::Result<String> {
fs::read_to_string(path)
}
fn main() -> Result<(), ParseError> {
// Accept a single positional arg as the input path.
// "-" means read all of stdin.
let mut args = env::args().skip(1);
if let Some(arg1) = args.next() {
if arg1 == "-h" || arg1 == "--help" {
println!("Usage:");
println!(
" {} # REPL",
env::args().next().unwrap_or_else(|| "prog".into())
);
println!(
" {} <file> # parse file",
env::args().next().unwrap_or_else(|| "prog".into())
);
println!(
" {} - # read source from stdin",
env::args().next().unwrap_or_else(|| "prog".into())
);
return Ok(());
}
if arg1 == "-" {
match read_stdin_all() {
Ok(src) => return parse_and_print(&src),
Err(e) => {
eprintln!("stdin read error: {}", e);
return Ok(());
}
}
} else {
let path = Path::new(&arg1);
match read_file_to_string(path) {
Ok(src) => return parse_and_print(&src),
Err(e) => {
eprintln!("file read error ({}): {}", path.display(), e);
return Ok(());
}
}
}
}
// No filename provided -> keep REPL behavior
repl_once()
}

View File

@ -75,7 +75,7 @@ fn main() {
let path = entry.path(); let path = entry.path();
match fs::read(path) { match fs::read(path) {
Ok(raw_bytes) => { Ok(raw_bytes) => {
// Autodetect encoding for old Unreal script sources // Auto-detect encoding for old Unreal script sources
let (encoding_label, _, _) = chardet::detect(&raw_bytes); let (encoding_label, _, _) = chardet::detect(&raw_bytes);
let encoding = encoding_rs::Encoding::for_label(encoding_label.as_bytes()) let encoding = encoding_rs::Encoding::for_label(encoding_label.as_bytes())
.unwrap_or(encoding_rs::UTF_8); .unwrap_or(encoding_rs::UTF_8);
@ -95,8 +95,8 @@ fn main() {
let tokenized_files: Vec<(PathBuf, TokenizedFile)> = uc_files let tokenized_files: Vec<(PathBuf, TokenizedFile)> = uc_files
.iter() .iter()
.map(|(path, source_code)| { .map(|(path, source_code)| {
let tokenized_file = TokenizedFile::from_source(source_code); let tokenized_file = TokenizedFile::from_str(source_code);
if tokenized_file.had_errors() { if tokenized_file.has_errors() {
println!("TK: {}", path.display()); println!("TK: {}", path.display());
} }
(path.clone(), tokenized_file) (path.clone(), tokenized_file)
@ -109,7 +109,7 @@ fn main() {
elapsed_time elapsed_time
); );
// Roundtrip check // Round-trip check
for ((path, original), (_, tokenized_file)) in uc_files.iter().zip(tokenized_files.iter()) { for ((path, original), (_, tokenized_file)) in uc_files.iter().zip(tokenized_files.iter()) {
let reconstructed = tokenized_file.reconstruct_source(); let reconstructed = tokenized_file.reconstruct_source();
if original != &reconstructed { if original != &reconstructed {

View File

@ -8,4 +8,5 @@ default = []
debug = [] debug = []
[dependencies] [dependencies]
logos = "0.15" logos = "0.15"
bumpalo = { version = "3", features = ["boxed", "collections"] }

277
rottlib/src/arena.rs Normal file
View File

@ -0,0 +1,277 @@
//! Arena submodule defining types that exist in their own memory space and
//! allow multiple cheap allocations (both performance- and fragmentation-wise).
//!
//! ## Memory safety
//!
//! Dropping the [`Arena`] frees all its memory at once and does not run
//! [`Drop`] for values allocated within it. Avoid storing types that implement
//! [`Drop`] or own external resources inside [`ArenaNode`], [`ArenaVec`], or
//! [`ArenaString`]. If you must, arrange an explicit "drain/drop" pass before
//! the arena is dropped.
use core::fmt::{Debug, Display, Formatter, Result};
use core::ops::{Deref, DerefMut};
use bumpalo::{Bump, boxed, collections};
use crate::ast::AstSpan;
use crate::lexer::TokenLocation;
/// Object that manages a separate memory space, which can be deallocated all
/// at once after use.
///
/// All allocations borrow the arena immutably.
///
/// Dropping the [`Arena`] does not run [`Drop`] for values allocated within it
/// (including values contained in [`ArenaNode`], [`ArenaVec`]
/// and [`ArenaString`]).
///
/// This arena is not thread-safe (`!Send`, `!Sync`). Values borrow the arena
/// and therefore cannot be sent across threads independently.
#[derive(Debug)]
pub struct Arena {
bump: Bump,
}
impl Arena {
/// Creates a new, empty arena.
#[must_use]
pub fn new() -> Self {
Self { bump: Bump::new() }
}
/// Constructs an empty [`ArenaVec`] allocated in this arena.
///
/// The returned vector borrows this arena and cannot outlive it.
#[must_use]
pub fn vec<T>(&self) -> ArenaVec<'_, T> {
ArenaVec(collections::Vec::new_in(&self.bump))
}
///Allocates a copy of `string` in this arena and returns
/// an [`ArenaString`].
#[must_use]
pub fn string(&self, string: &str) -> ArenaString<'_> {
ArenaString(collections::String::from_str_in(string, &self.bump))
}
/// Allocates `value` in this arena with the given `span`,
/// returning an [`ArenaNode`].
///
/// The node's storage borrows this arena and cannot outlive it.
///
/// Note: `T`'s [`Drop`] is not run when the arena is dropped.
#[must_use]
pub fn alloc<T>(&self, value: T, span: AstSpan) -> ArenaNode<'_, T> {
ArenaNode {
inner: boxed::Box::new_in(value, &self.bump),
span,
}
}
pub fn alloc_between<T>(
&self,
value: T,
from: TokenLocation,
to: TokenLocation,
) -> ArenaNode<'_, T> {
self.alloc(value, AstSpan { from, to })
}
pub fn alloc_at<T>(&self, value: T, at: TokenLocation) -> ArenaNode<'_, T> {
self.alloc(value, AstSpan { from: at, to: at })
}
}
impl Default for Arena {
fn default() -> Self {
Self::new()
}
}
/// An arena-allocated box with an attached source span.
///
/// Equality and hashing take into account both the contained `T` and the `span`
/// (when `T: Eq + Hash`).
///
/// Note: `T`'s [`Drop`] is not run when the arena is dropped.
#[derive(Hash, PartialEq, Eq)]
pub struct ArenaNode<'arena, T> {
/// Value allocated in the arena; this node owns it.
inner: boxed::Box<'arena, T>,
/// Token range covered by the value.
span: AstSpan,
}
impl<'arena, T> ArenaNode<'arena, T> {
/// Creates a new [`ArenaNode`] by allocating `value` in `arena`.
#[must_use]
pub fn new_in(value: T, span: AstSpan, arena: &'arena Arena) -> Self {
Self {
inner: boxed::Box::new_in(value, &arena.bump),
span,
}
}
/// Creates a new [`ArenaNode`] for an AST node that spans a single token.
pub fn from_token_location(
value: T,
token_location: crate::lexer::TokenLocation,
arena: &'arena Arena,
) -> Self {
Self {
inner: boxed::Box::new_in(value, &arena.bump),
span: AstSpan {
from: token_location,
to: token_location,
},
}
}
pub fn span_mut(&mut self) -> &mut AstSpan {
&mut self.span
}
pub fn extend_to(&mut self, to: TokenLocation) {
self.span.to = to;
}
pub fn extend_from(&mut self, from: TokenLocation) {
self.span.from = from;
}
/// Returns the token span covered by this node.
pub fn span(&self) -> &AstSpan {
&self.span
}
}
impl<'arena, T> Deref for ArenaNode<'arena, T> {
type Target = T;
fn deref(&self) -> &T {
&self.inner
}
}
impl<'arena, T> DerefMut for ArenaNode<'arena, T> {
fn deref_mut(&mut self) -> &mut T {
&mut self.inner
}
}
impl<'arena, T: Debug> Debug for ArenaNode<'arena, T> {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
f.debug_struct("ArenaNode")
.field("inner", &**self)
.field("span", &self.span())
.finish()
}
}
/// Version of [`Vec`] that can be safely used inside a memory arena.
///
/// Elements do not have their destructors run when the arena is dropped.
///
/// This type dereferences to `[T]` and supports iteration by reference
/// (`&ArenaVec` and `&mut ArenaVec` implement [`IntoIterator`]).
#[derive(Clone, Debug, Hash, PartialEq, Eq)]
pub struct ArenaVec<'arena, T>(collections::Vec<'arena, T>);
impl<'arena, T> ArenaVec<'arena, T> {
/// Creates an empty `ArenaVec` allocated in `arena`.
#[must_use]
pub fn new_in(arena: &'arena Arena) -> Self {
Self(collections::Vec::new_in(&arena.bump))
}
/// Appends an element to the end of the vector.
///
/// Growth is backed by the arena; increasing capacity allocates new space
/// in the arena and never frees previous blocks.
pub fn push(&mut self, value: T) {
self.0.push(value)
}
pub fn reserve(&mut self, additional: usize) {
self.0.reserve(additional)
}
pub fn extend<I: IntoIterator<Item = T>>(&mut self, it: I) {
self.0.extend(it)
}
}
impl<'arena, T> Deref for ArenaVec<'arena, T> {
type Target = [T];
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<'arena, T> DerefMut for ArenaVec<'arena, T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl<'arena, 's, T> IntoIterator for &'s ArenaVec<'arena, T> {
type Item = &'s T;
type IntoIter = core::slice::Iter<'s, T>;
fn into_iter(self) -> Self::IntoIter {
self.0.iter()
}
}
impl<'arena, 's, T> IntoIterator for &'s mut ArenaVec<'arena, T> {
type Item = &'s mut T;
type IntoIter = core::slice::IterMut<'s, T>;
fn into_iter(self) -> Self::IntoIter {
self.0.iter_mut()
}
}
/// Version of [`String`] that can be safely used inside a memory arena.
///
/// This type dereferences to [`str`] and implements [`AsRef<str>`] and
/// [`core::borrow::Borrow<str>`] for ergonomic use with APIs expecting string
/// slices.
///
/// The string borrows the arena and cannot outlive it. Dropping the arena
/// frees its memory without running `Drop` for the string contents.
#[derive(Clone, Debug, Hash, PartialEq, Eq)]
pub struct ArenaString<'arena>(collections::String<'arena>);
impl<'arena> ArenaString<'arena> {
/// Allocates a copy of `string` in `arena` and returns an [`ArenaString`].
#[must_use]
pub fn from_str_in(string: &str, arena: &'arena Arena) -> Self {
Self(collections::String::from_str_in(string, &arena.bump))
}
}
impl<'arena> Deref for ArenaString<'arena> {
type Target = str;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<'arena> AsRef<str> for ArenaString<'arena> {
fn as_ref(&self) -> &str {
&self.0
}
}
impl<'arena> core::borrow::Borrow<str> for ArenaString<'arena> {
fn borrow(&self) -> &str {
&self.0
}
}
impl<'arena> Display for ArenaString<'arena> {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
Display::fmt(&self.0, f)
}
}

376
rottlib/src/ast.rs Normal file
View File

@ -0,0 +1,376 @@
use crate::arena::ArenaVec;
use super::lexer::TokenLocation;
use core::fmt;
use crate::arena::{Arena, ArenaNode, ArenaString};
// All inclusive!
#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub struct AstSpan {
pub from: TokenLocation,
pub to: TokenLocation,
}
impl AstSpan {
pub fn merge(left_span: &AstSpan, right_span: &AstSpan) -> AstSpan {
AstSpan {
from: left_span.from,
to: right_span.to,
}
}
pub fn new(single_location: TokenLocation) -> AstSpan {
AstSpan {
from: single_location,
to: single_location,
}
}
pub fn range(from: TokenLocation, to: TokenLocation) -> AstSpan {
AstSpan { from, to }
}
pub fn extend_to(&mut self, right_most_location: TokenLocation) {
if right_most_location > self.to {
self.to = right_most_location
}
}
}
#[derive(Clone, Copy, Debug)]
pub enum PrefixOperator {
Not,
Minus,
BitwiseNot,
Increment,
Decrement,
}
#[derive(Clone, Copy, Debug)]
pub enum PostfixOperator {
Increment,
Decrement,
}
#[derive(Clone, Copy, Debug)]
pub enum InfixOperator {
// Assignments
Assign,
MultiplyAssign,
DivideAssign,
ModuloAssign,
PlusAssign,
MinusAssign,
ConcatAssign,
ConcatSpaceAssign,
// String operations
ConcatSpace,
Concat,
// Logical
And,
Xor,
Or,
// Bit-wise
BitwiseAnd,
BitwiseOr,
BitwiseXor,
// Not-equal
NotEqual,
// Comparison
Equal,
ApproximatelyEqual,
Less,
LessEqual,
Greater,
GreaterEqual,
ClockwiseFrom,
// Shifts
LeftShift,
LogicalRightShift,
RightShift,
// Terms
Plus,
Minus,
// Modulo
Modulo,
// Factor
Multiply,
Divide,
Dot,
Cross,
// Exponentiation
Exponentiation,
}
#[allow(clippy::large_enum_variant)]
#[derive(Debug)]
pub enum Expression<'src, 'arena> {
Binary(
ExpressionRef<'src, 'arena>,
InfixOperator,
ExpressionRef<'src, 'arena>,
),
LeftUnary(PrefixOperator, ExpressionRef<'src, 'arena>),
RightUnary(ExpressionRef<'src, 'arena>, PostfixOperator),
Identifier(&'src str),
String(ArenaString<'arena>),
Integer(i128),
Float(f64),
Bool(bool),
None,
Parentheses(ExpressionRef<'src, 'arena>),
Block {
// All these end with `;`
statements: ArenaVec<'arena, StatementRef<'src, 'arena>>,
// Last statement, but only if it doesn't end with `;`
tail: Option<ExpressionRef<'src, 'arena>>,
},
If {
condition: ExpressionRef<'src, 'arena>,
body: ExpressionRef<'src, 'arena>,
else_body: Option<ExpressionRef<'src, 'arena>>,
},
While {
condition: ExpressionRef<'src, 'arena>,
body: ExpressionRef<'src, 'arena>,
},
DoUntil {
condition: ExpressionRef<'src, 'arena>,
body: ExpressionRef<'src, 'arena>,
},
ForEach {
iterator: ExpressionRef<'src, 'arena>,
body: ExpressionRef<'src, 'arena>,
},
For {
init: Option<ExpressionRef<'src, 'arena>>,
condition: Option<ExpressionRef<'src, 'arena>>,
step: Option<ExpressionRef<'src, 'arena>>,
body: ExpressionRef<'src, 'arena>,
},
Switch {
selector: ExpressionRef<'src, 'arena>,
cases: ArenaVec<'arena, CaseRef<'src, 'arena>>,
// default case
default_arm: Option<ArenaVec<'arena, StatementRef<'src, 'arena>>>,
// last statement of the case block
tail: Option<ExpressionRef<'src, 'arena>>,
},
Goto(ArenaString<'arena>),
Continue,
Break(Option<ExpressionRef<'src, 'arena>>),
Return(Option<ExpressionRef<'src, 'arena>>),
// For injecting in place of parts that couldn't be parsed
// (along with text that wasn't able to be parsed)
Error,
}
pub type ExpressionRef<'src, 'arena> = ArenaNode<'arena, Expression<'src, 'arena>>;
#[derive(Debug)]
pub struct VariableDeclarator<'src, 'arena> {
pub name: ArenaString<'arena>,
pub initializer: Option<ExpressionRef<'src, 'arena>>,
}
#[derive(Debug)]
pub struct SwitchCase<'src, 'arena> {
pub labels: ArenaVec<'arena, ExpressionRef<'src, 'arena>>, // UScript allows expressions; multiple labels ok
pub body: ArenaVec<'arena, StatementRef<'src, 'arena>>, // allow fallthrough unless a Break/Goto ends it
}
pub type CaseRef<'src, 'arena> = ArenaNode<'arena, SwitchCase<'src, 'arena>>;
#[derive(Debug)]
pub enum Statement<'src, 'arena> {
// For the cases where user just used too many semi-colons `;;;;`
Empty,
Expression(ExpressionRef<'src, 'arena>),
// Just declarations without assignment:
// `local int i, j, k`
LocalVariableDeclaration {
type_name: ArenaString<'arena>,
identifiers: ArenaVec<'arena, ArenaString<'arena>>,
},
// Just `int i, j = 3, k = 0`
VariableDeclaration {
type_name: ArenaString<'arena>,
declarations: ArenaVec<'arena, VariableDeclarator<'src, 'arena>>,
},
Label(ArenaString<'arena>),
// For injecting in place of parts that couldn't be parsed
// (along with text that wasn't able to be parsed)
Error,
}
pub type StatementRef<'src, 'arena> = ArenaNode<'arena, Statement<'src, 'arena>>;
impl<'src, 'arena> Expression<'src, 'arena> {
pub fn new_prefix(
arena: &'arena Arena,
op_position: TokenLocation,
op: PrefixOperator,
rhs: ArenaNode<'arena, Self>,
) -> ArenaNode<'arena, Self> {
let span = AstSpan {
from: op_position,
to: rhs.span().to,
};
ArenaNode::new_in(Self::LeftUnary(op, rhs), span, arena)
}
pub fn new_postfix(
arena: &'arena Arena,
lhs: ArenaNode<'arena, Self>,
op: PostfixOperator,
op_position: TokenLocation,
) -> ArenaNode<'arena, Self> {
let span = AstSpan {
from: lhs.span().from,
to: op_position,
};
ArenaNode::new_in(Self::RightUnary(lhs, op), span, arena)
}
pub fn new_binary(
arena: &'arena Arena,
lhs: ArenaNode<'arena, Self>,
op: InfixOperator,
rhs: ArenaNode<'arena, Self>,
) -> ArenaNode<'arena, Self> {
let span = AstSpan::merge(&lhs.span(), &rhs.span());
ArenaNode::new_in(Self::Binary(lhs, op, rhs), span, arena)
}
}
/// Returns `true` for expressions that require `;` when used as a statement
/// (i.e., everything except blocky control-flow forms).
pub trait NeedsSemi {
fn needs_semicolon(&self) -> bool;
}
impl<'src, 'arena> NeedsSemi for Expression<'src, 'arena> {
#[inline]
fn needs_semicolon(&self) -> bool {
match self {
Expression::Block { .. }
| Expression::If { .. }
| Expression::While { .. }
| Expression::DoUntil { .. }
| Expression::ForEach { .. }
| Expression::For { .. }
| Expression::Error => false,
// All other expressions require `;` when used as a statement.
_ => true,
}
}
}
// If `ArenaNode<T>` derefs to `T`, this works as-is.
// Otherwise, replace `(**self)` with your accessor, e.g. `self.value()` or `self.get()`.
impl<'src, 'arena> NeedsSemi for ExpressionRef<'src, 'arena> {
#[inline]
fn needs_semicolon(&self) -> bool {
(**self).needs_semicolon()
}
}
impl<'src, 'arena> NeedsSemi for Statement<'src, 'arena> {
#[inline]
fn needs_semicolon(&self) -> bool {
match self {
Statement::Empty | Statement::Label { .. } | Statement::Error { .. } => false,
// All other expressions require `;` when used as a statement.
_ => true,
}
}
}
// If `ArenaNode<T>` derefs to `T`, this works as-is.
// Otherwise, replace `(**self)` with your accessor, e.g. `self.value()` or `self.get()`.
impl<'src, 'arena> NeedsSemi for StatementRef<'src, 'arena> {
#[inline]
fn needs_semicolon(&self) -> bool {
(**self).needs_semicolon()
}
}
impl fmt::Display for PrefixOperator {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let s = match self {
PrefixOperator::Not => "!",
PrefixOperator::Minus => "-",
PrefixOperator::BitwiseNot => "~",
PrefixOperator::Increment => "++.",
PrefixOperator::Decrement => "--.",
};
write!(f, "{s}")
}
}
impl fmt::Display for PostfixOperator {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let s = match self {
PostfixOperator::Increment => ".++",
PostfixOperator::Decrement => ".--",
};
write!(f, "{s}")
}
}
impl fmt::Display for InfixOperator {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use InfixOperator::*;
let s = match self {
// Assignments
Assign => "=",
MultiplyAssign => "*=",
DivideAssign => "/=",
ModuloAssign => "%=",
PlusAssign => "+=",
MinusAssign => "-=",
ConcatAssign => "$=",
ConcatSpaceAssign => "@=",
// String operations
ConcatSpace => "@",
Concat => "$",
// Logical
And => "&&",
Xor => "^^",
Or => "||",
// Bitwise
BitwiseAnd => "&",
BitwiseOr => "|",
BitwiseXor => "^",
// Not equal
NotEqual => "!=",
// Comparison
Equal => "==",
ApproximatelyEqual => "~+",
Less => "<",
LessEqual => "<=",
Greater => ">",
GreaterEqual => ">=",
ClockwiseFrom => "ClockwiseFrom",
// Shift
LeftShift => "<<",
LogicalRightShift => ">>>",
RightShift => ">>",
// Term
Plus => "+",
Minus => "-",
// Modulo
Modulo => "%",
// Factor
Multiply => "*",
Divide => "/",
Dot => "Dot",
Cross => "Cross",
// Exp
Exponentiation => "**",
};
write!(f, "{s}")
}
}

251
rottlib/src/diagnostics.rs Normal file
View File

@ -0,0 +1,251 @@
//! Diagnostics primitives for all stages of compiler and frontend code.
//!
//! These types describe what to show the user when something goes wrong while
//! parsing or doing lightweight frontend checks. They are intentionally small,
//! depend only on [`AstSpan`], and are easy to construct and store.
use crate::ast::AstSpan;
/// Classification of a diagnostic by its impact.
///
/// Choose the most restrictive level that reflects the state of the source and
/// the compiler's ability to continue.
///
/// - `Error`: use when the source is invalid according to the language rules or
/// the parser cannot make a sound interpretation. Errors typically prevent
/// code generation. Examples: mismatched delimiters,
/// missing required tokens, invalid escapes, unrecoverable ambiguity.
/// - `Warning`: use when the source is valid but likely unintended, obsolete,
/// or suboptimal. Warnings should not change program semantics if ignored
/// and must not block compilation. Examples: deprecated syntax, shadowing
/// that is allowed but suspicious, unreachable code after a return.
/// Do not use warnings to paper over true syntax errors. If the construct
/// is invalid, prefer [`Severity::Error`] even if recovery is possible.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
#[non_exhaustive]
pub enum Severity {
/// Fatal to the current compilation unit or requires recovery.
Error,
/// Non-fatal advisory about suspicious but valid code.
Warning,
}
/// A labeled source span with a short inline message.
///
/// Message should be one sentence, start lowercase, and omit the final period.
#[derive(Clone, Debug, Hash, PartialEq, Eq)]
pub struct Label {
/// Span to highlight in source coordinates.
pub span: AstSpan,
/// Short inline text shown next to the caret line.
pub message: String,
}
/// A single pure data diagnostic message with optional structured context.
#[derive(Clone, Debug, PartialEq, Eq)]
#[must_use]
pub struct Diagnostic {
/// Headline, e.g. "Mismatched closing delimiter: `}`".
headline: String,
/// Impact of the diagnostic. See [`Severity`] for guidance.
severity: Severity,
/// Optional stable identifier, e.g. "P0007" or "L0001".
///
/// Codes must match `^[LPTSXD][0-9]{4}$` where the prefix is the domain:
/// `L` lexer, `P` parser, `T` type check, `S` semantics, `X` lints,
/// `D` deprecations.
///
/// Codes help users search documentation and suppress or elevate specific
/// diagnostics. Keep codes stable across releases once published.
code: Option<String>,
/// Marks the main location the user should look at first.
///
/// Typically the exact token or span that triggered the diagnostic.
primary_label: Option<Label>,
/// Supplemental locations that add context or cross-reference the primary
/// site.
///
/// Examples: "matching delimiter was opened here", "declared here",
/// "previous use here". Secondary labels should not compete with
/// the primary.
secondary_labels: Vec<Label>,
/// A single actionable suggestion aimed at a quick fix. Keep it concise.
help: Option<String>,
/// Additional free-form lines not intended as a fix suggestion.
///
/// Use for technical details, references, or rationale. Keep each string to
/// a single paragraph.
notes: Vec<String>,
}
impl Diagnostic {
/// Construct a new error diagnostic with the given headline.
///
/// Use for invalid constructs that prevent a sound interpretation.
pub fn error(headline: impl Into<String>) -> Self {
Self {
headline: headline.into(),
severity: Severity::Error,
code: None,
primary_label: None,
secondary_labels: Vec::new(),
notes: Vec::new(),
help: None,
}
}
/// Construct a new warning diagnostic with the given headline.
///
/// Use for valid but suspicious or suboptimal constructs.
pub fn warning(headline: impl Into<String>) -> Self {
Self {
headline: headline.into(),
severity: Severity::Warning,
code: None,
primary_label: None,
secondary_labels: Vec::new(),
notes: Vec::new(),
help: None,
}
}
/// Returns `true` iff severity is [`Severity::Error`].
pub fn stops_compilation(&self) -> bool {
self.severity == Severity::Error
}
/// Returns the diagnostic code if present.
///
/// See [DiagnosticBuilder::code] for code scheme.
pub fn code(&self) -> Option<&str> {
self.code.as_deref()
}
/// Returns the primary label, if any.
pub fn primary_label(&self) -> Option<&Label> {
self.primary_label.as_ref()
}
/// Returns the secondary labels in insertion order.
pub fn secondary_labels(&self) -> &[Label] {
&self.secondary_labels
}
/// Returns the headline.
pub fn headline(&self) -> &str {
&self.headline
}
/// Returns the severity.
pub fn severity(&self) -> Severity {
self.severity
}
/// Returns the notes.
pub fn notes(&self) -> &[String] {
&self.notes
}
/// Returns the help message, if any.
pub fn help(&self) -> Option<&str> {
self.help.as_deref()
}
}
/// A convenient diagnostic builder.
#[derive(Debug)]
#[must_use]
pub struct DiagnosticBuilder {
diagnostic: Diagnostic,
}
impl DiagnosticBuilder {
/// Creates a new builder for an error diagnostic with a given headline.
pub fn error(headline: impl Into<String>) -> Self {
Self {
diagnostic: Diagnostic::error(headline),
}
}
/// Creates a new builder for a warning diagnostic with a given headline.
pub fn warning(headline: impl Into<String>) -> Self {
Self {
diagnostic: Diagnostic::warning(headline),
}
}
/// Attach or replace the stable diagnostic code.
///
/// Codes identify classes of diagnostics across versions.
/// Keep them short, ASCII-only, and stable. Prefer the scheme:
///
/// 1. Prefix = domain:
/// - `L`: lexer (invalid char, unterminated string);
/// - `P`: parser (mismatched delimiters, expected X found Y);
/// - `T`: type check;
/// - `S`: semantic analysis (name resolution, visibility);
/// - `X`: style/lints (shadowing, dead code);
/// - `D`: deprecations.
///
/// 2. Suffix = 4 digits, zero-padded: `0001`..`9999`.
/// Example codes: `L0001`, `P0007`, `T0123`.
///
/// Codes are optional, but once published should not change.
pub fn code(mut self, code: impl Into<String>) -> Self {
self.diagnostic.code = Some(code.into());
self
}
/// Attach or replace a primary label.
///
/// One sentence, starting with lowercase letter, no period at the end.
/// Since only one primary label can be specified, the previous primary is
/// replaced.
pub fn primary_label(mut self, span: AstSpan, message: impl Into<String>) -> Self {
self.diagnostic.primary_label = Some(Label {
span,
message: message.into(),
});
self
}
/// Add a secondary label.
///
/// One sentence, starting with lowercase letter, no period at the end.
pub fn secondary_label(mut self, span: AstSpan, message: impl Into<String>) -> Self {
self.diagnostic.secondary_labels.push(Label {
span,
message: message.into(),
});
self
}
/// Add a free-form note line.
///
/// Can be several sentences, starting with uppercase letter and with period
/// at the end.
pub fn note(mut self, message: impl Into<String>) -> Self {
self.diagnostic.notes.push(message.into());
self
}
/// Set the help message.
///
/// Can be several sentences, starting with uppercase letter and with period
/// at the end.
pub fn help(mut self, message: impl Into<String>) -> Self {
self.diagnostic.help = Some(message.into());
self
}
/// Finishes building and returns the diagnostic.
pub fn build(self) -> Diagnostic {
self.diagnostic
}
}
impl From<DiagnosticBuilder> for Diagnostic {
fn from(diagnostic_builder: DiagnosticBuilder) -> Self {
diagnostic_builder.build()
}
}

View File

@ -47,7 +47,7 @@ impl<'src> DebugTools for super::TokenizedFile<'src> {
// the remainder of a multi-line token that started earlier. // the remainder of a multi-line token that started earlier.
(Some(origin_row), None) => { (Some(origin_row), None) => {
println!( println!(
"\t[Continued from line {} no new tokens here]", "\t[Continued from line {} - no new tokens here]",
origin_row + 1 origin_row + 1
); );
} }
@ -69,13 +69,13 @@ impl<'src> DebugTools for super::TokenizedFile<'src> {
/// Helper that prints every span in `spans` together with its UTF-16 /// Helper that prints every span in `spans` together with its UTF-16
/// column boundaries. /// column boundaries.
fn dump_spans<'a>(spans: &[super::TokenPiece<'a>]) { fn dump_spans<'src>(spans: &[super::TokenPiece<'src>]) {
let mut col_utf16 = 0usize; let mut col_utf16 = 0usize;
for span in spans { for span in spans {
let start = col_utf16; let start = col_utf16;
let end = start + span.length_utf16; let end = start + span.length_utf16;
println!( println!(
"\t\t{:?} @ {}{}: {:?}", "\t\t{:?} @ {}-{}: {:?}",
span.token, start, end, span.lexeme span.token, start, end, span.lexeme
); );
col_utf16 = end; col_utf16 = end;

View File

@ -28,9 +28,7 @@ use super::{TokenLocation, TokenPiece, TokenizedFile};
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct Tokens<'src> { pub struct Tokens<'src> {
/// [`TokenLocation`] of the next token to be returned. /// [`TokenLocation`] of the next token to be returned.
/// cursor: TokenLocation,
/// [`None`] means the iterator has been exhausted.
cursor: Option<TokenLocation>,
/// [`TokenizedFile`] whose tokens we're iterating over. /// [`TokenizedFile`] whose tokens we're iterating over.
source_file: &'src TokenizedFile<'src>, source_file: &'src TokenizedFile<'src>,
/// When `true`, whitespace tokens are skipped. /// When `true`, whitespace tokens are skipped.
@ -51,50 +49,57 @@ impl<'src> Tokens<'src> {
// Returns the position of the next new token, skipping carried-over pieces // Returns the position of the next new token, skipping carried-over pieces
// and blank lines. // and blank lines.
fn advance_position(&self, mut position: TokenLocation) -> Option<TokenLocation> { fn advance_position(&self, position: TokenLocation) -> TokenLocation {
if let Some(current_line) = self.source_file.lines.get(position.line) { let TokenLocation::Position {
mut line,
mut column,
} = position
else {
return TokenLocation::EndOfFile;
};
if let Some(current_line) = self.source_file.lines.get(line) {
// `Line::len()` also counts a possible token that continued from // `Line::len()` also counts a possible token that continued from
// the previous line. // the previous line.
if position.column + 1 < current_line.len() { if column + 1 < current_line.len() {
position.column += 1; column += 1;
return Some(position); return TokenLocation::Position { line, column };
} }
} }
// Current line is exhausted: walk downward until we find the first line // Current line is exhausted: walk downward until we find the first line
// that **owns local tokens**, because we only want *new* token, // that **owns local tokens**, because we only want *new* token,
// not continued from previous lines (they were already iterated over). // not continued from previous lines (they were already iterated over).
position.line += 1; line += 1;
while let Some(next_line) = self.source_file.lines.get(position.line) { while let Some(next_line) = self.source_file.lines.get(line) {
if next_line.local_range().is_some() { if next_line.local_range().is_some() {
// Start at the first *local* token, // Start at the first *local* token,
// skipping any carried-over one // skipping any carried-over one
position.column = if next_line.continued_from.is_some() { column = if next_line.continued_from.is_some() {
1 1
} else { } else {
0 0
}; };
return Some(position); return TokenLocation::Position { line, column };
} }
position.line += 1; // keep skipping empty / pure-carried lines line += 1; // keep skipping empty / pure-carried lines
} }
// No more tokens. // No more tokens.
None TokenLocation::EndOfFile
} }
// Creates a new iterator. // Creates a new iterator.
fn new(source_file: &'src TokenizedFile) -> Tokens<'src> { fn new(source_file: &'src TokenizedFile) -> Tokens<'src> {
let mut new_iterator = Tokens { let mut new_iterator = Tokens {
source_file, source_file,
cursor: Some(TokenLocation { line: 0, column: 0 }), cursor: TokenLocation::Position { line: 0, column: 0 },
skip_whitespace: false, skip_whitespace: false,
}; };
// We need to land on the first existing token so [`Iterator::next`] // We need to land on the first existing token so [`Iterator::next`]
// can assume cursor is valid. // can assume cursor is valid.
while let Some(token_position) = new_iterator.cursor { while new_iterator.cursor != TokenLocation::EndOfFile {
if new_iterator.source_file.get(token_position).is_some() { if new_iterator.source_file.get(new_iterator.cursor).is_some() {
break; break;
} }
new_iterator.cursor = new_iterator.advance_position(token_position); new_iterator.cursor = new_iterator.advance_position(new_iterator.cursor);
} }
new_iterator new_iterator
} }
@ -105,16 +110,17 @@ impl<'src> Iterator for Tokens<'src> {
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
// We only ever loop to discard whitespaces when the flag is on // We only ever loop to discard whitespaces when the flag is on
loop { while self.cursor != TokenLocation::EndOfFile {
let current_cursor = self.cursor?; let token_location = self.cursor;
let token_piece = *self.source_file.get(current_cursor)?; let token_piece = *self.source_file.get(self.cursor)?;
self.cursor = self.advance_position(current_cursor); self.cursor = self.advance_position(self.cursor);
// Optional whitespace-skip // Optional whitespace-skip
if !self.skip_whitespace || !token_piece.token.is_whitespace() { if !self.skip_whitespace || !token_piece.token.is_whitespace() {
return Some((current_cursor, token_piece)); return Some((token_location, token_piece));
} }
} }
None
} }
} }
@ -139,7 +145,7 @@ impl<'src> TokenizedFile<'src> {
/// ## Examples /// ## Examples
/// ///
/// ```rust /// ```rust
/// use mycrate::{TokenizedFile, TokenLocation, Token}; /// use super::{TokenizedFile, TokenLocation, Token};
/// let file = TokenizedFile::from_str("0 / 0"); /// let file = TokenizedFile::from_str("0 / 0");
/// assert_eq!( /// assert_eq!(
/// file.get(TokenLocation { line: 0, column: 2 }).map(|p| p.token), /// file.get(TokenLocation { line: 0, column: 2 }).map(|p| p.token),
@ -148,8 +154,11 @@ impl<'src> TokenizedFile<'src> {
/// ``` /// ```
#[track_caller] #[track_caller]
pub fn get(&self, position: TokenLocation) -> Option<&TokenPiece> { pub fn get(&self, position: TokenLocation) -> Option<&TokenPiece> {
let line = self.lines.get(position.line)?; let TokenLocation::Position { line, column } = position else {
let column = position.column; return None;
};
let line = self.lines.get(line)?;
let column = column;
if column >= line.len() { if column >= line.len() {
return None; return None;
} }

View File

@ -2,15 +2,15 @@
//! //!
//! ## Notable details //! ## Notable details
//! //!
//! Lexer for UnrealScript that recognizes inline `cpptext { }` blocks. //! Lexer for UnrealScript that recognizes inline `cpptext { ... }` blocks.
//! //!
//! In UnrealScript, `cpptext` lets authors embed raw C++ between braces. //! In UnrealScript, `cpptext` lets authors embed raw C++ between braces.
//! Because whitespace, newlines, or comments may appear between the //! Because whitespace, newlines, or comments may appear between the
//! `cpptext` keyword and the opening `{`, the lexer must remember that //! `cpptext` keyword and the opening `{`, the lexer must remember that
//! it has just seen `cpptext` - hence a state machine. //! it has just seen `cpptext` - hence a state machine.
//! //!
//! Modes //! ## Modes
//! ------ //!
//! - **Normal** - ordinary UnrealScript tokens. //! - **Normal** - ordinary UnrealScript tokens.
//! - **AwaitingCppBlock** - after `cpptext`, waiting for the next `{`. //! - **AwaitingCppBlock** - after `cpptext`, waiting for the next `{`.
//! //!
@ -170,6 +170,8 @@ pub enum Token {
NativeReplication, NativeReplication,
// # Control-flow keywords // # Control-flow keywords
#[regex("(?i)goto")]
Goto,
#[regex("(?i)if")] #[regex("(?i)if")]
If, If,
#[regex("(?i)else")] #[regex("(?i)else")]
@ -265,9 +267,9 @@ pub enum Token {
Minus, Minus,
// ## String manipulation // ## String manipulation
#[token("@")] #[token("@")]
AtChar, ConcatSpace,
#[token("$")] #[token("$")]
DollarChar, Concat,
// ## Shifts // ## Shifts
#[token("<<")] #[token("<<")]
LeftShift, LeftShift,
@ -326,9 +328,9 @@ pub enum Token {
// # Punctuation & delimiters // # Punctuation & delimiters
#[token("(")] #[token("(")]
LeftParen, LeftParenthesis,
#[token(")")] #[token(")")]
RightParen, RightParenthesis,
#[token("{", handle_brace)] #[token("{", handle_brace)]
Brace(BraceKind), Brace(BraceKind),
#[token("}")] #[token("}")]
@ -356,7 +358,7 @@ pub enum Token {
#[regex(r"/\*", handle_block_comment)] #[regex(r"/\*", handle_block_comment)]
BlockComment, BlockComment,
#[regex(r"\r\n|\n|\r")] #[regex(r"\r\n|\n|\r")]
NewLine, Newline,
#[regex(r"[ \t]+")] #[regex(r"[ \t]+")]
Whitespace, Whitespace,
@ -367,7 +369,7 @@ pub enum Token {
impl Token { impl Token {
/// Returns `true` if this token is a newline (`Token::NewLine`). /// Returns `true` if this token is a newline (`Token::NewLine`).
pub fn is_newline(&self) -> bool { pub fn is_newline(&self) -> bool {
matches!(self, Token::NewLine) matches!(self, Token::Newline)
} }
/// Returns `true` if this token is trivia whitespace /// Returns `true` if this token is trivia whitespace
@ -375,7 +377,7 @@ impl Token {
/// ///
/// Note: comments are **not** considered whitespace. /// Note: comments are **not** considered whitespace.
pub fn is_whitespace(&self) -> bool { pub fn is_whitespace(&self) -> bool {
matches!(&self, Token::Whitespace | Token::NewLine) matches!(&self, Token::Whitespace | Token::Newline)
} }
/// Returns `true` if this token may span multiple physical lines /// Returns `true` if this token may span multiple physical lines
@ -386,6 +388,22 @@ impl Token {
Token::BlockComment | Token::Brace(BraceKind::CppBlock) | Token::Error Token::BlockComment | Token::Brace(BraceKind::CppBlock) | Token::Error
) )
} }
/// Returns `true` if this token can appear in type position
/// (either a built-in type keyword or an identifier).
pub fn is_valid_type_name_token(&self) -> bool {
matches!(
self,
Token::Int
| Token::Float
| Token::Bool
| Token::Byte
| Token::String
| Token::Array
| Token::Name
| Token::Identifier
)
}
} }
/// Consume a /* ... */ block comment with arbitrary nesting /// Consume a /* ... */ block comment with arbitrary nesting
@ -476,7 +494,7 @@ fn consume_cpp_block(lexer: &mut Lexer<Token>) {
} }
} }
/// Consume over a C-style `/* */` comment (without nesting). /// Consume over a C-style `/* ... */` comment (without nesting).
/// ///
/// Assumes that opener `/*` is already consumed. /// Assumes that opener `/*` is already consumed.
fn consume_c_comment(lexer: &mut Lexer<Token>) { fn consume_c_comment(lexer: &mut Lexer<Token>) {

View File

@ -23,11 +23,11 @@
//! compiled with `debug` feature enabled. They live in the [`debug_tools`] //! compiled with `debug` feature enabled. They live in the [`debug_tools`]
//! extension trait, implemented for [`TokenizedFile`]. //! extension trait, implemented for [`TokenizedFile`].
//! //!
//! ``` //! ```rust
//! // bring the trait into scope //! // bring the trait into scope
//! use lexer::DebugTools; //! use lexer::DebugTools;
//! //!
//! let file = TokenizedFile::from_str(src); //! let file = TokenizedFile::from_str("local int myValue;");
//! file.debug_dump(); // pretty-print token layout //! file.debug_dump(); // pretty-print token layout
//! let text = file.to_source(); // reconstruct original text //! let text = file.to_source(); // reconstruct original text
//! ``` //! ```
@ -43,7 +43,7 @@ use logos::Logos;
#[cfg(any(debug_assertions, feature = "debug"))] #[cfg(any(debug_assertions, feature = "debug"))]
pub use debug_tools::DebugTools; pub use debug_tools::DebugTools;
pub use iterator::Tokens; pub use iterator::Tokens;
pub use lexing::Token; pub use lexing::{BraceKind, Token};
/// Empirically chosen starting size for token buffer (used during tokenization) /// Empirically chosen starting size for token buffer (used during tokenization)
/// that provides good performance. /// that provides good performance.
@ -64,15 +64,22 @@ pub struct TokenPiece<'src> {
pub length_utf16: usize, pub length_utf16: usize,
} }
/// Defines location of a token inside [`TokenizedFile`] in a way, convenient /// Defines location of a token inside [`TokenizedFile`] in a form convenient
/// for communicating through LSP. /// for communicating through LSP.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct TokenLocation { pub enum TokenLocation {
/// 0-based line number. /// Actual position of some token in the file.
pub line: usize, Position {
/// 0-based index of a token in the line, possibly including the token that /// 0-based line number.
/// has continued from the previous line. line: usize,
pub column: usize, /// 0-based index of a token in the line, possibly including the token that
/// has continued from the previous line.
///
/// Columns count tokens, not bytes or chars.
column: usize,
},
/// Position af the end-of-file.
EndOfFile,
} }
/// A tokenized, lossless representation of an UnrealScript source file. /// A tokenized, lossless representation of an UnrealScript source file.
@ -102,6 +109,10 @@ struct Tokenizer<'src> {
slice_start_index: usize, slice_start_index: usize,
/// When a multi-line token is being scanned, stores the 0-based line /// When a multi-line token is being scanned, stores the 0-based line
/// on which it started; [`None`] otherwise. /// on which it started; [`None`] otherwise.
///
/// `Some(line_idx)` iff the current line is within a multi-line token that
/// started on `line_idx`; it is consumed exactly once by
/// [`Self::commit_current_line`].
multi_line_start: Option<usize>, multi_line_start: Option<usize>,
/// Set to [`true`] if the lexer reported any error tokens. /// Set to [`true`] if the lexer reported any error tokens.
had_errors: bool, had_errors: bool,
@ -141,7 +152,7 @@ impl<'src> TokenizedFile<'src> {
/// ```rust /// ```rust
/// let tokenized_file = TokenizedFile::from_str("function test() {}"); /// let tokenized_file = TokenizedFile::from_str("function test() {}");
/// if tokenized_file.has_errors() { /// if tokenized_file.has_errors() {
/// println!("Error while parsing file: {}", path.display()); /// println!("Error while parsing file.");
/// } /// }
/// ``` /// ```
#[inline] #[inline]
@ -170,7 +181,7 @@ type TokenIdx = usize;
/// Representation of a single physical line of the source file. /// Representation of a single physical line of the source file.
/// ///
/// [`Range<TokenIndex>`] are used instead of slices to avoid creating /// [`Range<TokenIdx>`] are used instead of slices to avoid creating
/// a self-referential struct (with [`TokenizedFile`]), which rust forbids. /// a self-referential struct (with [`TokenizedFile`]), which rust forbids.
#[derive(Clone, Debug, Hash, PartialEq, Eq)] #[derive(Clone, Debug, Hash, PartialEq, Eq)]
struct Line { struct Line {
@ -214,7 +225,7 @@ impl Line {
/// Returns a range of tokens inside [`TokenizedFile::buffer`] that start /// Returns a range of tokens inside [`TokenizedFile::buffer`] that start
/// on this line. /// on this line.
/// ///
/// [`None`] means there is no such tokens. Otherwise range is guaranteed /// [`None`] means there are no such tokens. Otherwise range is guaranteed
/// to not be empty. /// to not be empty.
#[inline] #[inline]
fn local_range(&self) -> Option<Range<TokenIdx>> { fn local_range(&self) -> Option<Range<TokenIdx>> {
@ -225,7 +236,7 @@ impl Line {
} }
} }
/// Returns amount of tokens of the line. /// Returns the number of tokens on this line.
/// ///
/// Counts both tokens that started on this line and tokens that continued /// Counts both tokens that started on this line and tokens that continued
/// from previous one. /// from previous one.
@ -246,7 +257,8 @@ impl<'src> Tokenizer<'src> {
} }
} }
/// Handles tokens that never span multiple lines. /// Handles simple tokens that *never* span multiple lines, allowing us to
/// skip a lot of work.
fn process_single_line_token(&mut self, token_piece: TokenPiece<'src>) { fn process_single_line_token(&mut self, token_piece: TokenPiece<'src>) {
if token_piece.token.is_newline() { if token_piece.token.is_newline() {
self.line_number += 1; self.line_number += 1;
@ -257,7 +269,7 @@ impl<'src> Tokenizer<'src> {
} }
} }
/// Handles tokens that may contain one or more newline characters. /// Handles tokens that might contain one or more newline characters.
fn process_multi_line_token(&mut self, token_piece: TokenPiece<'src>) { fn process_multi_line_token(&mut self, token_piece: TokenPiece<'src>) {
let start_line = self.line_number; let start_line = self.line_number;
let newline_count = count_line_breaks(token_piece.lexeme); let newline_count = count_line_breaks(token_piece.lexeme);
@ -271,12 +283,15 @@ impl<'src> Tokenizer<'src> {
// We only need to commit the line if this token actually ended the line // We only need to commit the line if this token actually ended the line
if newline_count > 0 { if newline_count > 0 {
self.commit_current_line(); self.commit_current_line();
// We only need to insert one `Line::Spanned(base)` per *interior* // We only need to insert one `Line::spanned(start_line)` per
// newline, so `newline_count - 1` such lines // *interior* line:
// (e.g. 2 line breaks in block comment -> it has //
// exactly `1` interior line) // standalone | local int i = /* Now we start long comment
let insert_count = newline_count - 1; // spanned | with three line breaks and *exactly* two
for _ in 0..insert_count { // spanned | inner lines that contain nothing but
// spanned_with_tokens | comment bytes! */ = 0;
let inner_lines_count = newline_count - 1;
for _ in 0..inner_lines_count {
self.lines.push(Line::spanned(start_line)); self.lines.push(Line::spanned(start_line));
} }
// This is called *after* `commit_current_line()` cleared previous // This is called *after* `commit_current_line()` cleared previous
@ -313,7 +328,7 @@ impl<'src> Tokenizer<'src> {
/// Finishes tokenization, converting accumulated data into /// Finishes tokenization, converting accumulated data into
/// [`TokenizedFile`]. /// [`TokenizedFile`].
fn into_tokenized_file(mut self) -> TokenizedFile<'src> { fn into_tokenized_file(mut self) -> TokenizedFile<'src> {
// Commit any trailing tokens // Flush trailing tokens for which `commit` wasn't auto triggered
self.commit_current_line(); self.commit_current_line();
// If we still have a `multi_line_start` // If we still have a `multi_line_start`
// (i.e. a pure multi-line token with no local tokens on its last line), // (i.e. a pure multi-line token with no local tokens on its last line),
@ -322,7 +337,6 @@ impl<'src> Tokenizer<'src> {
self.lines.push(Line::spanned(from)); self.lines.push(Line::spanned(from));
} }
// Optimize for size
self.buffer.shrink_to_fit(); self.buffer.shrink_to_fit();
self.lines.shrink_to_fit(); self.lines.shrink_to_fit();
@ -343,7 +357,7 @@ fn make_token_piece<'src>(token: Token, text: &'src str) -> TokenPiece<'src> {
} }
} }
/// Counts the number of new lines in given text. /// Counts the number of newlines in given text.
fn count_line_breaks(text: &str) -> usize { fn count_line_breaks(text: &str) -> usize {
let mut bytes_iterator = text.as_bytes().iter().peekable(); let mut bytes_iterator = text.as_bytes().iter().peekable();
let mut newline_count = 0; let mut newline_count = 0;

View File

@ -1,3 +1,7 @@
#![allow(clippy::doc_overindented_list_items)] #![allow(clippy::doc_overindented_list_items)]
pub mod arena;
pub mod ast;
pub mod diagnostics;
pub mod lexer; pub mod lexer;
pub mod parser;

View File

@ -0,0 +1,230 @@
//! Cursor utilities for a token stream.
//!
//! Provides memoized lookahead over significant tokens and attaches
//! trivia to [`TriviaComponent`]. Significant tokens exclude whitespace and
//! comments; see [`crate::parser::TriviaKind`].
use crate::lexer::{Token, TokenLocation};
use crate::parser::trivia::TriviaComponent;
/// Cursor over a token stream with memoized lookahead and trivia attachment.
#[derive(Clone, Debug)]
pub(crate) struct CursorComponent<'src> {
/// Underlying token stream.
tokens: crate::lexer::Tokens<'src>,
/// Significant-token lookahead buffer.
lookahead_buffer: std::collections::VecDeque<(TokenLocation, crate::lexer::TokenPiece<'src>)>,
/// Location of the last consumed token.
previous_location: Option<TokenLocation>,
/// Location of the last significant token.
///
/// Used to associate following trivia with the correct token.
last_significant_location: Option<TokenLocation>,
/// Scratch space for [`CursorComponent::buffer_next_significant_token`],
/// used to avoid reallocations.
trivia_buffer: Vec<crate::parser::trivia::TriviaToken<'src>>,
}
impl<'src> CursorComponent<'src> {
/// Create a [`CursorComponent`] over the tokens of `file`.
pub(crate) fn new(tokenized_file: &'src crate::lexer::TokenizedFile<'src>) -> Self {
Self {
tokens: tokenized_file.tokens(),
lookahead_buffer: std::collections::VecDeque::new(),
previous_location: None,
last_significant_location: None,
trivia_buffer: Vec::new(),
}
}
/// Ensure the lookahead buffer contains at least `lookahead + 1`
/// significant tokens.
///
/// May consume trivia from the underlying stream.
/// Does not consume significant tokens.
fn ensure_min_lookahead(&mut self, lookahead: usize, trivia: &mut TriviaComponent<'src>) {
while self.lookahead_buffer.len() <= lookahead {
if !self.buffer_next_significant_token(trivia) {
break;
}
}
}
/// Scan to the next significant token, recording intervening trivia.
///
/// Returns `true` if a significant token was buffered,
/// `false` on end of file.
fn buffer_next_significant_token(&mut self, trivia: &mut TriviaComponent<'src>) -> bool {
self.trivia_buffer.clear();
while let Some((token_location, token_piece)) = self.tokens.next() {
if let Ok(trivia_kind) = crate::parser::TriviaKind::try_from(token_piece.token) {
self.trivia_buffer.push(crate::parser::TriviaToken {
kind: trivia_kind,
text: token_piece.lexeme,
location: token_location,
});
} else {
// Attach trivia found after the previous significant token
if !self.trivia_buffer.is_empty() {
trivia.record_between_locations(
self.last_significant_location,
token_location,
&mut self.trivia_buffer,
);
}
self.lookahead_buffer
.push_back((token_location, token_piece));
self.last_significant_location = Some(token_location);
return true;
}
}
// Reached end-of-file: attach trailing trivia
if !self.trivia_buffer.is_empty() {
trivia.record_between_locations(
self.last_significant_location,
TokenLocation::EndOfFile,
&mut self.trivia_buffer,
);
}
false
}
}
impl<'src, 'arena> crate::parser::Parser<'src, 'arena> {
/// Returns the next token without consuming it.
///
/// Returns [`None`] if no tokens remain.
#[must_use]
pub(crate) fn peek_token(&mut self) -> Option<Token> {
self.peek_entry().map(|(_, token_piece)| token_piece.token)
}
/// Returns the next token, its lexeme, and its location
/// without consuming it.
///
/// Returns [`None`] if no tokens remain.
#[must_use]
pub(crate) fn peek_token_lexeme_and_location(
&mut self,
) -> Option<(Token, &'src str, TokenLocation)> {
self.peek_entry().map(|(token_location, token_piece)| {
(token_piece.token, token_piece.lexeme, *token_location)
})
}
/// Returns the next token and its lexeme without consuming it.
///
/// Returns [`None`] if no tokens remain.
#[must_use]
pub(crate) fn peek_token_and_lexeme(&mut self) -> Option<(Token, &'src str)> {
self.peek_entry()
.map(|(_, token_piece)| (token_piece.token, token_piece.lexeme))
}
/// Returns the next token and its location without consuming it.
///
/// Returns [`None`] if no tokens remain.
#[must_use]
pub(crate) fn peek_token_and_location(&mut self) -> Option<(Token, TokenLocation)> {
self.peek_entry()
.map(|(token_location, token_piece)| (token_piece.token, *token_location))
}
/// Returns the location of the next token, or [`TokenLocation::EndOfFile`]
/// if none remain.
#[must_use]
pub(crate) fn peek_location(&mut self) -> TokenLocation {
self.peek_entry()
.map(|(token_location, _)| *token_location)
.unwrap_or(TokenLocation::EndOfFile)
}
/// Returns the location of the last token that was actually consumed
/// by [`crate::parser::Parser::advance`].
///
/// Returns [`None`] if no tokens have been consumed yet.
#[must_use]
pub(crate) fn last_consumed_location(&self) -> Option<TokenLocation> {
self.cursor.previous_location
}
/// Returns the most recent location the parser is "at".
///
/// If at least one token has been consumed, this is the location of the
/// last consumed token. Otherwise it falls back to the location of the
/// first significant token in the stream (or [`TokenLocation::EndOfFile`]
/// if the stream is empty).
#[must_use]
pub(crate) fn last_visited_location(&mut self) -> TokenLocation {
// Only has to `unwrap` before *any* characters were consumed
self.last_consumed_location()
.unwrap_or_else(|| self.peek_location())
}
/// Peeks the token at `lookahead` (`0` is the next token)
/// without consuming.
///
/// Returns `None` if the stream ends before that position.
#[must_use]
pub(crate) fn peek_token_at(&mut self, lookahead: usize) -> Option<Token> {
self.cursor
.ensure_min_lookahead(lookahead, &mut self.trivia);
self.cursor
.lookahead_buffer
.get(lookahead)
.map(|(_, token_piece)| token_piece.token)
}
/// Advances by one significant token.
///
/// Trivia is internally handled and recorded.
/// Does nothing at the end-of-file.
pub(crate) fn advance(&mut self) {
self.cursor.ensure_min_lookahead(0, &mut self.trivia);
if let Some((location, _)) = self.cursor.lookahead_buffer.pop_front() {
self.cursor.previous_location = Some(location);
}
}
/// If the next token equals `token`, consumes it and returns `true`.
///
/// Otherwise leaves the cursor unchanged and returns `false`.
/// Trivia is recorded automatically.
pub(crate) fn eat(&mut self, token: Token) -> bool {
let correct_token = self.peek_token() == Some(token);
if correct_token {
self.advance();
}
correct_token
}
/// Centralized peek used by public peekers.
fn peek_entry(&mut self) -> Option<&(TokenLocation, crate::lexer::TokenPiece<'src>)> {
self.cursor.ensure_min_lookahead(0, &mut self.trivia);
self.cursor.lookahead_buffer.front()
}
/// Expects `expected` at the current position.
///
/// On match consumes the token and returns its [`TokenLocation`].
/// Otherwise returns a [`crate::parser::ParseError`] of
/// the given [`crate::parser::ParseErrorKind`] that carries the current
/// span for diagnostics.
pub(crate) fn expect(
&mut self,
expected: Token,
error_kind: crate::parser::ParseErrorKind,
) -> crate::parser::ParseResult<'src, 'arena, TokenLocation> {
let token_position = self.peek_location();
// `Token` only includes type information, so comparison is valid
if self.peek_token() == Some(expected) {
self.advance();
Ok(token_position)
} else {
Err(crate::parser::ParseError {
kind: error_kind,
source_span: crate::ast::AstSpan::new(token_position),
})
}
}
}

View File

@ -0,0 +1,95 @@
//! Submodule with parsing related errors.
use crate::ast::AstSpan;
/// Internal parse error kinds.
///
/// Used by the parser as a compact signal for later construction of user-facing
/// diagnostics.
///
/// Naming convention:
/// - Prefix identifies the syntactic construct
/// (`Expression`, `For`, `Switch`, etc.).
/// - Suffix describes the exact problem (`MissingClosingParenthesis`,
/// `UnexpectedToken`, `MultipleDefaults`, etc.).
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub enum ParseErrorKind {
/// Expression inside `(...)` could not be parsed and no closing `)`
/// was found.
ExpressionMissingClosingParenthesis,
/// A `do` block was not followed by a matching `until`.
DoMissingUntil,
/// Found an unexpected token while parsing an expression.
ExpressionUnexpectedToken,
/// A `for` loop is missing its opening `(`.
ForMissingOpeningParenthesis,
/// The first `;` in `for (init; cond; step)` is missing.
ForMissingInitializationSemicolon,
/// The second `;` in `for (init; cond; step)` is missing.
ForMissingConditionSemicolon,
/// The closing `)` of a `for` loop is missing.
ForMissingClosingParenthesis,
/// An expression inside a block is not terminated with `;`.
BlockMissingSemicolonAfterExpression,
/// A statement inside a block is not terminated with `;`.
BlockMissingSemicolonAfterStatement,
/// `switch` has no body (missing matching braces).
SwitchMissingBody,
/// The first top-level item in a `switch` body is not a `case`.
SwitchTopLevelItemNotCase,
/// A `case` arm is missing the trailing `:`.
SwitchCaseMissingColon,
/// Found more than one `default` branch.
SwitchDuplicateDefault,
/// Found `case` arms after a `default` branch.
SwitchCasesAfterDefault,
/// A `goto` was not followed by a label.
GotoMissingLabel,
/// Unexpected end of input while parsing.
UnexpectedEndOfFile,
/// Token looked like a numeric literal but could not be parsed as one.
InvalidNumericLiteral,
/// A bare expression appeared in a `switch` arm but was not the final arm.
///
/// Such an expression must be terminated with `;` or be the final arm.
SwitchBareExpressionBeforeNextArm,
/// A `local` declaration is missing its first identifier.
///
/// At least one variable name must follow the type.
LocalMissingIdentifier,
/// A `local` declaration was followed by a token that cannot serve
/// as a type name.
LocalInvalidTypeName,
/// Invalid variable name identifier in `local` variable definition.
LocalBadVariableIdentifier,
/// An initializer appears in a `local` variable declaration.
LocalInitializerNotAllowed,
/// A non-`local` variable declaration is missing its first identifier.
///
/// At least one variable name must follow the type.
DeclMissingIdentifier,
/// Invalid variable name identifier in non-`local` variable definition.
DeclBadVariableIdentifier,
}
/// A parse error emitted by the parser: the specific kind of failure together
/// with the source span where it was detected.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
#[must_use]
pub struct ParseError {
/// The specific kind of parse error that occurred.
pub kind: ParseErrorKind,
/// The source span in which the error was detected.
pub source_span: AstSpan,
}
pub type ParseResult<'src, 'arena, T> = Result<T, ParseError>;
impl<'src, 'arena> crate::parser::Parser<'src, 'arena> {
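/// Builds a [`ParseError`] of the given kind, spanning the parser's
/// current position (the next significant token, or end-of-file).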
#[must_use]
pub(crate) fn make_error_here(&mut self, error_kind: ParseErrorKind) -> ParseError {
ParseError {
kind: error_kind,
source_span: AstSpan::new(self.peek_location()),
}
}
}

View File

@ -0,0 +1,60 @@
use crate::ast::Expression;
use crate::lexer::Token;
use crate::parser::ParseErrorKind;
impl<'src, 'arena> crate::parser::Parser<'src, 'arena> {
/// Parses a block `{ ... }` after `{`.
///
/// Consumes tokens until the matching `}` and returns
/// an [`Expression::Block`] spanning from the opening `{` to
/// the closing `}`.
/// Returns a best-effort block on premature end-of-file.
#[must_use]
pub(crate) fn parse_block_cont(
&mut self,
block_start_location: crate::lexer::TokenLocation,
) -> crate::ast::ExpressionRef<'src, 'arena> {
let mut statements = self.arena.vec();
let mut tail = None;
loop {
let Some((token, token_location)) = self.peek_token_and_location() else {
self.report_error_here(ParseErrorKind::UnexpectedEndOfFile);
return self.arena.alloc(
Expression::Block { statements, tail },
crate::ast::AstSpan {
from: block_start_location,
to: self.peek_location(),
},
);
};
if let Token::RightBrace = token {
self.advance(); // '}'
let block_span = crate::ast::AstSpan {
from: block_start_location,
to: token_location,
};
return self
.arena
.alloc(Expression::Block { statements, tail }, block_span);
}
// We know that at this point:
// 1. There is still a token and it is not end-of-file;
// 2. It isn't end of the block.
// So a pending tail expression here means its `;` is missing!
if let Some(tail_expression) = tail {
self.report_error_here(ParseErrorKind::BlockMissingSemicolonAfterExpression);
let tail_span = *tail_expression.span();
let node = self.arena.alloc(
crate::ast::Statement::Expression(tail_expression),
tail_span,
);
statements.push(node);
}
tail = self.parse_block_item(&mut statements);
// Ensure forward progress under errors to avoid infinite loops.
if self.peek_location() <= token_location {
self.advance();
}
}
}
}

View File

@ -0,0 +1,180 @@
use crate::ast::{AstSpan, Expression, ExpressionRef};
use crate::lexer::{Token, TokenLocation};
use crate::parser::{ParseErrorKind, ResultRecoveryExt};
impl<'src, 'arena> crate::parser::Parser<'src, 'arena> {
/// Parses an `if` expression, assuming the `if` token was consumed.
///
/// Produces an [`Expression::If`] spanning from the `if` keyword to
/// the end of the last arm (`else` body if present,
/// otherwise the `if` body).
#[must_use]
pub(crate) fn parse_if_cont(
&mut self,
if_start_location: TokenLocation,
) -> ExpressionRef<'src, 'arena> {
let condition = self.parse_expression();
let body = self.parse_expression();
let (else_body, if_end_location) = if let Some(Token::Else) = self.peek_token() {
self.advance(); // else
let else_body = self.parse_expression();
// Capture end before moving `else_body` to build the full `if` span
let body_end = else_body.span().to;
(Some(else_body), body_end)
} else {
(None, body.span().to)
};
let span = AstSpan {
from: if_start_location,
to: if_end_location,
};
self.arena.alloc(
Expression::If {
condition,
body,
else_body,
},
span,
)
}
/// Parses a `while` loop, assuming the `while` token was consumed.
///
/// Produces an [`Expression::While`] spanning from the `while` keyword
/// to the end of the body.
#[must_use]
pub(crate) fn parse_while_cont(
&mut self,
while_start_location: TokenLocation,
) -> ExpressionRef<'src, 'arena> {
let condition = self.parse_expression();
let body = self.parse_expression();
let span = AstSpan {
from: while_start_location,
to: body.span().to,
};
self.arena
.alloc(Expression::While { condition, body }, span)
}
/// Parses a `do ... until ...` loop, assuming the `do` token was consumed.
///
/// On a missing `until`, returns an error
/// [`ParseErrorKind::DoMissingUntil`].
/// On success, produces an [`Expression::DoUntil`] spanning from `do`
/// to the end of the condition.
#[must_use]
pub(crate) fn parse_do_until_cont(
&mut self,
do_start_location: TokenLocation,
) -> crate::parser::ParseExpressionResult<'src, 'arena> {
let body = self.parse_expression();
self.expect(Token::Until, ParseErrorKind::DoMissingUntil)
.widen_error_span_from(do_start_location)?;
let condition = self.parse_expression();
let span = AstSpan {
from: do_start_location,
to: condition.span().to,
};
Ok(self
.arena
.alloc(Expression::DoUntil { condition, body }, span))
}
/// Parses a `foreach` loop, assuming the `foreach` token was consumed.
///
/// Produces an [`Expression::ForEach`] spanning from `foreach`
/// to the end of the body.
#[must_use]
pub(crate) fn parse_foreach_cont(
&mut self,
foreach_start_location: TokenLocation,
) -> ExpressionRef<'src, 'arena> {
let iterator = self.parse_expression();
let body = self.parse_expression();
let span = AstSpan {
from: foreach_start_location,
to: body.span().to,
};
self.arena
.alloc(Expression::ForEach { iterator, body }, span)
}
/// Parses a `for` loop, assuming the `for` token was consumed.
///
/// Grammar: `for (init?; condition?; step?) body`.
/// Any of `init`, `condition`, or `step` may be omitted.
/// Emits specific `ParseErrorKind` values for missing
/// delimiters/separators.
/// On success returns an [`Expression::For`] spanning from `for` to
/// the end of the body.
#[must_use]
pub(crate) fn parse_for_cont(
&mut self,
for_start_location: TokenLocation,
) -> crate::parser::ParseResult<'src, 'arena, ExpressionRef<'src, 'arena>> {
self.expect(
Token::LeftParenthesis,
ParseErrorKind::ForMissingOpeningParenthesis,
)
.widen_error_span_from(for_start_location)?;
let init = if let Some(Token::Semicolon) = self.peek_token() {
self.advance();
None
} else {
let init = self.parse_expression();
self.expect(
Token::Semicolon,
ParseErrorKind::ForMissingInitializationSemicolon,
)?;
Some(init)
};
let condition = if let Some(Token::Semicolon) = self.peek_token() {
self.advance();
None
} else {
let condition = self.parse_expression();
self.expect(
Token::Semicolon,
ParseErrorKind::ForMissingConditionSemicolon,
)?;
Some(condition)
};
let step = if let Some(Token::RightParenthesis) = self.peek_token() {
self.advance();
None
} else {
let step = self.parse_expression();
self.expect(
Token::RightParenthesis,
ParseErrorKind::ForMissingClosingParenthesis,
)
.widen_error_span_from(for_start_location)
.sync_error_until(self, crate::parser::SyncLevel::CloseParenthesis)?;
Some(step)
};
let body = self.parse_expression();
let span = AstSpan {
from: for_start_location,
to: body.span().to,
};
Ok(self.arena.alloc(
Expression::For {
init,
condition,
step,
body,
},
span,
))
}
}

View File

@ -0,0 +1,99 @@
use crate::ast::{AstSpan, Expression};
use crate::lexer::{Token, TokenLocation};
use crate::parser::ParseErrorKind;
impl<'src, 'arena> crate::parser::Parser<'src, 'arena> {
/// Parses the continuation of a `return` after its keyword was consumed.
///
/// Doesn't consume the terminating `;`.
/// If the next token is not `;`, parses an expression as the optional
/// value. Produces an [`Expression::Return`] whose span runs from
/// the `return` keyword to the end of the value if present, otherwise to
/// the `return` keyword.
#[must_use]
pub(crate) fn parse_return_cont(
&mut self,
return_start_location: TokenLocation,
) -> crate::ast::ExpressionRef<'src, 'arena> {
let (value, span) = if self.peek_token() != Some(Token::Semicolon) {
let value = self.parse_expression();
let span = AstSpan {
from: return_start_location,
to: value.span().to,
};
(Some(value), span)
} else {
(
None,
AstSpan {
from: return_start_location,
to: return_start_location,
},
)
};
self.arena.alloc(Expression::Return(value), span)
}
/// Parses the continuation of a `break` after its keyword was consumed.
///
/// Doesn't consume the terminating `;`.
/// If the next token is not `;`, parses an optional value expression.
/// Produces an [`Expression::Break`] spanning from `break` to the end
/// of the value if present, otherwise to the `break` keyword.
#[must_use]
pub(crate) fn parse_break_cont(
&mut self,
break_start_location: TokenLocation,
) -> crate::ast::ExpressionRef<'src, 'arena> {
let (value, span) = if self.peek_token() != Some(Token::Semicolon) {
let value = self.parse_expression();
let span = AstSpan {
from: break_start_location,
to: value.span().to,
};
(Some(value), span)
} else {
(
None,
AstSpan {
from: break_start_location,
to: break_start_location,
},
)
};
self.arena.alloc(Expression::Break(value), span)
}
/// Parses a `goto` expression, assuming the `goto` token was consumed.
///
/// Requires the next token to be an identifier label.
/// On missing token, returns [`ParseErrorKind::UnexpectedEndOfFile`].
/// On a non-identifier next token,
/// returns [`ParseErrorKind::GotoMissingLabel`].
/// On success, produces an [`Expression::Goto`] spanning from `goto`
/// to the label token.
#[must_use]
pub(crate) fn parse_goto_cont(
&mut self,
goto_start_location: TokenLocation,
) -> crate::parser::ParseExpressionResult<'src, 'arena> {
let Some((token, text, token_location)) = self.peek_token_lexeme_and_location() else {
return Err(self.make_error_here(ParseErrorKind::UnexpectedEndOfFile));
};
if token == Token::Identifier {
let span = AstSpan {
from: goto_start_location,
to: token_location,
};
self.advance();
Ok(self
.arena
.alloc(Expression::Goto(self.arena.string(text)), span))
} else {
Err(self.make_error_here(ParseErrorKind::GotoMissingLabel))
}
}
}

View File

@ -0,0 +1,7 @@
mod block;
mod control;
mod flow;
mod pratt;
mod precedence;
mod statements;
mod switch;

View File

@ -0,0 +1,406 @@
//! Expression parsing for the language front-end.
//!
//! This module implements a Pratt-style parser for the language's expression
//! grammar, supporting:
//!
//! * Primary expressions (literals, identifiers, parenthesized expressions)
//! * Prefix operators
//! * Postfix operators
//! * Infix operators with precedence and associativity
//!
//! Parsing is driven by [`PrecedenceRank`], which controls how tightly
//! operators bind. Infix parsing uses the pair of binding powers returned by
//! [`super::precedence::infix_precedence_ranks`] to encode associativity.
//! The parser infrastructure supports both left- and right-associative
//! operators, but Fermented UnrealScript currently defines only
//! left-associative ones.
//!
//! ## See also
//!
//! - [`crate::parser::Parser::parse_expression`] - main entry point
//! - [`PrecedenceRank`] - operator binding strengths
//! - [`super::precedence`] - operator precedence definitions
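//!
//! A sketch of how precedence and associativity shape the resulting tree
//! (rendering is illustrative):
//!
//! ```text
//! 1 + 2 * 3    parses as    1 + (2 * 3)     // `*` binds tighter than `+`
//! a - b - c    parses as    (a - b) - c     // left-associative
//! ```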
use crate::ast::{Expression, ExpressionRef, NeedsSemi};
use crate::lexer::{Token, TokenLocation};
use crate::parser::{
ParseErrorKind, ParseExpressionResult, ParseResult, ResultRecoveryExt, SyncLevel,
};
pub(crate) use super::precedence::PrecedenceRank;
impl<'src, 'arena> crate::parser::Parser<'src, 'arena> {
/// Parses an expression.
pub fn parse_expression(&mut self) -> ExpressionRef<'src, 'arena> {
self.parse_expression_with_precedence(PrecedenceRank::LOOSEST)
}
/// Parses an expression with operators of at least `min_precedence_rank`
/// (as tight or tighter).
fn parse_expression_with_precedence(
&mut self,
min_precedence_rank: PrecedenceRank,
) -> ExpressionRef<'src, 'arena> {
// Intentional order: (1) prefix/primary, (2) postfix (tighter than
// any infix), (3) infix. We don't run a second postfix pass;
// `(a+b)++` works because the parenthesized sub-expression had its own
// postfix pass before returning.
let mut left_hand_side = self
.parse_prefix_or_primary()
.sync_error_until(self, SyncLevel::Expression)
.unwrap_or_fallback(self);
// Postfix operators are tighter than any infix ones
left_hand_side = self.parse_postfix_into(left_hand_side);
left_hand_side = self.parse_infix_into(left_hand_side, min_precedence_rank);
left_hand_side
}
/// Parses a prefix or primary expression (Pratt parser's "nud" or
/// null denotation).
///
/// Errors with [`ParseErrorKind::UnexpectedEndOfFile`] if the stream ends
/// before a valid prefix/primary.
fn parse_prefix_or_primary(&mut self) -> ParseExpressionResult<'src, 'arena> {
let Some((token, token_location)) = self.peek_token_and_location() else {
return Err(self.make_error_here(ParseErrorKind::UnexpectedEndOfFile));
};
if let Ok(operator) = crate::ast::PrefixOperator::try_from(token) {
self.advance();
let right_hand_side = self.parse_expression_with_precedence(PrecedenceRank::TIGHTEST);
Ok(Expression::new_prefix(
self.arena,
token_location,
operator,
right_hand_side,
))
} else {
self.parse_primary()
}
}
/// Parses a primary expression: literals, identifiers, or a parenthesized
/// sub-expression.
///
/// # Errors
///
/// [`ParseErrorKind::ExpressionUnexpectedToken`] if the next token
/// cannot start a primary; [`ParseErrorKind::UnexpectedEndOfFile`]
/// at end of input.
fn parse_primary(&mut self) -> ParseExpressionResult<'src, 'arena> {
// For diagnostics, we only advance *after* fully parsing the current
// literal/token.
let Some((token, token_text, token_location)) = self.peek_token_lexeme_and_location()
else {
return Err(self.make_error_here(ParseErrorKind::UnexpectedEndOfFile));
};
match token {
Token::IntegerLiteral => {
let value = self.parse_integer_literal(token_text)?;
self.advance();
Ok(self
.arena
.alloc_at(Expression::Integer(value), token_location))
}
Token::FloatLiteral => {
let value = self.parse_float_literal(token_text)?;
self.advance();
Ok(self
.arena
.alloc_at(Expression::Float(value), token_location))
}
Token::StringLiteral => {
let value = unescape_string_literal(self.arena, token_text);
self.advance();
Ok(self
.arena
.alloc_at(Expression::String(value), token_location))
}
Token::True => {
self.advance();
Ok(self.arena.alloc_at(Expression::Bool(true), token_location))
}
Token::False => {
self.advance();
Ok(self.arena.alloc_at(Expression::Bool(false), token_location))
}
Token::None => {
self.advance();
Ok(self.arena.alloc_at(Expression::None, token_location))
}
Token::Identifier => {
self.advance();
Ok(self
.arena
.alloc_at(Expression::Identifier(token_text), token_location))
}
Token::LeftParenthesis => {
self.advance();
self.parse_parenthesized_expression_cont(token_location)
}
Token::If => {
self.advance();
Ok(self.parse_if_cont(token_location))
}
Token::While => {
self.advance();
Ok(self.parse_while_cont(token_location))
}
Token::Do => {
self.advance();
self.parse_do_until_cont(token_location)
}
Token::ForEach => {
self.advance();
Ok(self.parse_foreach_cont(token_location))
}
Token::For => {
self.advance();
self.parse_for_cont(token_location)
}
Token::Brace(crate::lexer::BraceKind::Normal) => {
self.advance();
Ok(self.parse_block_cont(token_location))
}
Token::Return => {
self.advance();
Ok(self.parse_return_cont(token_location))
}
Token::Break => {
self.advance();
Ok(self.parse_break_cont(token_location))
}
Token::Continue => {
self.advance();
Ok(self.arena.alloc_at(Expression::Continue, token_location))
}
Token::Goto => {
self.advance();
self.parse_goto_cont(token_location)
}
Token::Switch => {
self.advance();
self.parse_switch_cont(token_location)
}
_ => {
// Unexpected token in expression.
Err(self.make_error_here(ParseErrorKind::ExpressionUnexpectedToken))
}
}
}
/// Parses an expression in parentheses.
///
/// Assumes the `(` was already consumed; its location is
/// `left_parenthesis_location`.
/// On success, allocates a [`Expression::Parentheses`] node with a span
/// covering from `(` to `)`.
///
/// Errors with [`ParseErrorKind::ExpressionMissingClosingParenthesis`] if
/// a closing `)` is missing; the diagnostic is associated with
/// the opening `(` via `left_parenthesis_location`.
fn parse_parenthesized_expression_cont(
&mut self,
left_parenthesis_location: TokenLocation,
) -> ParseExpressionResult<'src, 'arena> {
let inner_expression = self.parse_expression();
let right_parenthesis_location = self
.expect(
Token::RightParenthesis,
ParseErrorKind::ExpressionMissingClosingParenthesis,
)
.widen_error_span_from(left_parenthesis_location)
.sync_error_at(self, SyncLevel::CloseParenthesis)?;
Ok(self.arena.alloc_between(
Expression::Parentheses(inner_expression),
left_parenthesis_location,
right_parenthesis_location,
))
}
/// Parses all postfix operators it can, creating a tree with
/// `left_hand_side` as a child.
fn parse_postfix_into(
&mut self,
mut left_hand_side: ExpressionRef<'src, 'arena>,
) -> ExpressionRef<'src, 'arena> {
// Single peek that yields `(postfix_op, location)` so the postfix loop
// can advance once per operator without extra matching/unwraps.
while let Some((operator, operator_location)) = self.peek_postfix_with_location() {
self.advance();
left_hand_side =
Expression::new_postfix(self.arena, left_hand_side, operator, operator_location);
}
left_hand_side
}
/// Parses infix operators binding at least as tight as
/// `min_precedence_rank`.
///
/// Associativity is encoded by
/// [`super::precedence::infix_precedence_ranks`]: the right-hand
/// side is parsed with `right_precedence_rank`, so `a - b - c` vs
/// `a ^ b ^ c` associate correctly based on the pair
/// `(left_rank, right_rank)`.
///
/// Stops when the next operator is looser than `min_precedence_rank`.
fn parse_infix_into(
&mut self,
mut left_hand_side: ExpressionRef<'src, 'arena>,
min_precedence_rank: PrecedenceRank,
) -> ExpressionRef<'src, 'arena> {
while let Some((operator, right_precedence_rank)) =
self.peek_infix_at_least(min_precedence_rank)
{
self.advance();
let right_hand_side = self.parse_expression_with_precedence(right_precedence_rank);
left_hand_side =
Expression::new_binary(self.arena, left_hand_side, operator, right_hand_side);
}
left_hand_side
}
/// Parses an integer literal as [`i128`].
///
/// Chosen to cover FerUS's integer range so constant folding
/// remains precise.
///
/// Errors with [`ParseErrorKind::InvalidNumericLiteral`] if `text` is
/// not a valid integer.
fn parse_integer_literal(&mut self, text: &str) -> ParseResult<i128> {
text.parse::<i128>()
.map_err(|_| self.make_error_here(ParseErrorKind::InvalidNumericLiteral))
}
/// Parses a float literal as [`f64`].
///
/// Chosen to cover FerUS's float range so constant folding remains
/// precise.
///
/// Errors with [`ParseErrorKind::InvalidNumericLiteral`] if `text` is
/// not a valid float.
fn parse_float_literal(&mut self, text: &str) -> ParseResult<f64> {
text.parse::<f64>()
.map_err(|_| self.make_error_here(ParseErrorKind::InvalidNumericLiteral))
}
/// Returns the next postfix operator and its location if present.
///
/// Helper to avoid peeking and mapping twice; used to drive the postfix
/// loop without unwraps.
fn peek_postfix_with_location(
&mut self,
) -> Option<(crate::ast::PostfixOperator, TokenLocation)> {
let (token, token_location) = self.peek_token_and_location()?;
let operator = crate::ast::PostfixOperator::try_from(token).ok()?;
Some((operator, token_location))
}
/// If the next token is an infix operator with left binding power at least
/// `min_precedence_rank`, returns its operator and precedence rank.
///
/// Otherwise return [`None`].
fn peek_infix_at_least(
&mut self,
min_precedence_rank: PrecedenceRank,
) -> Option<(crate::ast::InfixOperator, PrecedenceRank)> {
let (left_precedence_rank, operator, right_precedence_rank) = self
.peek_token()
.and_then(super::precedence::infix_precedence_ranks)?;
if left_precedence_rank.is_looser_than(min_precedence_rank) {
return None;
}
Some((operator, right_precedence_rank))
}
/// Parses one item inside a `{ ... }` block.
///
/// The item can be a statement (e.g. a variable declaration) or an
/// expression. If the item is an expression without a following
/// semicolon, it is returned as the block's current tail expression
/// - the value considered to be the block's result. In well-formed
/// code such a tail expression appears only at the very end of the block.
///
/// This method never consumes the closing `}` and is only meant to be
/// called while parsing inside a block.
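///
/// For intuition (illustrative FerUS snippet): in `{ x = 1; y }`, the item
/// `x = 1;` is pushed as a statement, while `y`, which has no trailing `;`,
/// is returned as the tail - the block's resulting value.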
pub(crate) fn parse_block_item(
&mut self,
statements: &mut crate::arena::ArenaVec<'arena, crate::ast::StatementRef<'src, 'arena>>,
) -> Option<crate::ast::ExpressionRef<'src, 'arena>> {
if let Some(mut next_statement) = self.parse_statement() {
if next_statement.needs_semicolon() {
// For statements we immediately know if lack of
// semicolon is an issue
if let Some(Token::Semicolon) = self.peek_token() {
next_statement.span_mut().to = self.peek_location();
self.advance(); // ';'
} else {
self.report_error_here(ParseErrorKind::BlockMissingSemicolonAfterStatement);
}
}
statements.push(next_statement);
} else {
let mut next_expression = self.parse_expression();
if let Expression::Error = *next_expression {
self.recover_until(SyncLevel::Statement);
next_expression.span_mut().to = self.peek_location();
}
if let Some((Token::Semicolon, semicolon_location)) = self.peek_token_and_location() {
self.advance(); // ;
let span = crate::ast::AstSpan {
from: next_expression.span().from,
to: semicolon_location,
};
let expression_statement_node = self
.arena
.alloc(crate::ast::Statement::Expression(next_expression), span);
statements.push(expression_statement_node);
} else {
return Some(next_expression);
}
}
None
}
}
/// Unescapes a tokenized string literal into an arena string.
///
/// Supported escapes: `\n`, `\t`, `\"`, `\\`.
/// Unknown escape sequences are preserved as-is (UnrealScript behavior).
///
/// Note: this function assumes `raw` is the token text without surrounding
/// quotes.
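///
/// For example (illustrative): `\n` becomes a newline, `\"` a quote, and
/// an unknown escape such as `\q` is preserved as a bare `q`.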
fn unescape_string_literal<'arena>(
arena: &'arena crate::arena::Arena,
raw: &str,
) -> crate::arena::ArenaString<'arena> {
let mut buffer = String::with_capacity(raw.len());
let mut characters = raw.chars();
while let Some(next_character) = characters.next() {
if next_character == '\\' {
// The lexer never produces a trailing backslash in a string token,
// so there's always a following character to inspect.
if let Some(escaped_character) = characters.next() {
match escaped_character {
'n' => buffer.push('\n'),
't' => buffer.push('\t'),
'"' => buffer.push('"'),
'\\' => buffer.push('\\'),
// Simply leaving escaped character as-is is an expected
// behavior by UnrealScript
other => buffer.push(other),
}
}
} else {
buffer.push(next_character);
}
}
arena.string(&buffer)
}

View File

@ -0,0 +1,185 @@
//! Precedence tables for Fermented UnrealScript operators.
//!
//! These values don't follow the usual *binding power* convention for
//! a Pratt parser, where tighter binding corresponds to a larger number.
//! Here, the smaller the number, the tighter the binding power.
//! For this reason, we use the term *precedence rank* instead.
//!
//! ## Operators sorted by precedence (lowest number = tighter binding)
//!
//! ### Infix operators
//!
//! All infix operators in UnrealScript are
//! [left-associative](https://wiki.beyondunreal.com/Operators).
//!
//! 12: `**`
//! 16: `*`, `/`, `Cross`, `Dot`
//! 18: `%`
//! 20: `+`, `-`
//! 22: `<<`, `>>`, `>>>`
//! 24: `<`, `>`, `<=`, `>=`, `==`, `~=`, `ClockwiseFrom`
//! 26: `!=`
//! 28: `&`, `^`, `|`
//! 30: `&&`, `^^`
//! 32: `||`
//! 34: `*=`, `/=`, `+=`, `-=`
//! 40: `$`, `*`, `@`
//! 44: `$=`, `*=`, `@=`
//! 45: `-=`
//!
//! Some operators, such as `*`, appear twice with different precedence
//! ranks because they were defined with different values for different types
//! in separate script source files (as in the Killing Floor sources).
//! However, UnrealScript uses only the first definition it encounters in
//! `Object.uc`, which corresponds to the lower value.
//!
//! ### Prefix operators
//!
//! `!`, `~`, `-`, `++`, `--`.
//!
//! ### Postfix operators
//!
//! `++`, `--`.
use crate::ast::{InfixOperator, PostfixOperator, PrefixOperator};
use crate::lexer::Token;
/// Compact precedence rank used by the Pratt parser.
///
/// A smaller number means tighter binding, and a larger number means looser
/// binding. This inverted scale matches how UnrealScript tables were recorded.
#[must_use]
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct PrecedenceRank(u8);
impl PrecedenceRank {
/// The loosest possible precedence rank.
///
/// In this inverted scale (smaller number = tighter binding),
/// this is represented by the maximum [`u8`] value.
pub const LOOSEST: Self = PrecedenceRank(u8::MAX);
/// The tightest possible precedence rank.
///
/// In this inverted scale (smaller number = tighter binding),
/// this is represented by zero.
pub const TIGHTEST: PrecedenceRank = PrecedenceRank(0);
/// Returns `true` if `other` has a looser binding than `self`.
///
/// # Examples
///
/// ```ignore
/// # use crate::parser::grammar::precedence::PrecedenceRank;
/// let a = PrecedenceRank(40);
/// let b = PrecedenceRank(34);
/// assert!(a.is_looser_than(b)); // 40 is looser than 34
///
/// let c = PrecedenceRank(20);
/// let d = PrecedenceRank(24);
/// assert!(!c.is_looser_than(d)); // 20 is tighter than 24
/// ```
pub fn is_looser_than(self, other: Self) -> bool {
self.0 > other.0
}
}
impl TryFrom<Token> for PrefixOperator {
type Error = ();
fn try_from(token: Token) -> Result<Self, Self::Error> {
use PrefixOperator::*;
Ok(match token {
Token::Not => Not,
Token::Minus => Minus,
Token::BitwiseNot => BitwiseNot,
Token::Increment => Increment,
Token::Decrement => Decrement,
_ => return Err(()),
})
}
}
impl TryFrom<Token> for PostfixOperator {
type Error = ();
fn try_from(token: Token) -> Result<Self, Self::Error> {
use PostfixOperator::*;
Ok(match token {
Token::Increment => Increment,
Token::Decrement => Decrement,
_ => return Err(()),
})
}
}
/// Maps a token to its infix operator along with its left and right binding
/// ranks: `(left_precedence_rank, operator, right_precedence_rank)`.
///
/// Returns [`None`] if and only if `token` is not an infix operator.
pub(crate) fn infix_precedence_ranks(
token: Token,
) -> Option<(PrecedenceRank, InfixOperator, PrecedenceRank)> {
use crate::ast::InfixOperator::*;
let (left_precedence_rank, operator) = match token {
// 12: `**`
Token::Exponentiation => (12, Exponentiation),
// 16: `*`, `/`, `Cross`, `Dot` (left-assoc)
Token::Multiply => (16, Multiply),
Token::Divide => (16, Divide),
Token::Cross => (16, Cross),
Token::Dot => (16, Dot),
// 18: `%`
Token::Modulo => (18, Modulo),
// 20: `+`, `-`
Token::Plus => (20, Plus),
Token::Minus => (20, Minus),
// 22: `<<`, `>>`, `>>>`
Token::LeftShift => (22, LeftShift),
Token::RightShift => (22, RightShift),
Token::LogicalRightShift => (22, LogicalRightShift),
// 24: comparison operators
Token::Less => (24, Less),
Token::LessEqual => (24, LessEqual),
Token::Greater => (24, Greater),
Token::GreaterEqual => (24, GreaterEqual),
Token::Equal => (24, Equal),
Token::ApproximatelyEqual => (24, ApproximatelyEqual),
Token::ClockwiseFrom => (24, ClockwiseFrom),
// 26: `!=`
Token::NotEqual => (26, NotEqual),
// 28: bit-wise `&`, `^`, `|`
Token::BitwiseAnd => (28, BitwiseAnd),
Token::BitwiseXor => (28, BitwiseXor),
Token::BitwiseOr => (28, BitwiseOr),
// 30: logical `&&`, `^^`
Token::And => (30, And),
Token::Xor => (30, Xor),
// 32: logical `||`
Token::Or => (32, Or),
// 34: `*=`, `/=`, `+=`, `-=`
Token::MultiplyAssign => (34, MultiplyAssign),
Token::DivideAssign => (34, DivideAssign),
Token::PlusAssign => (34, PlusAssign),
Token::MinusAssign => (34, MinusAssign),
// Simple '=' treated with same precedence
Token::Assign => (34, Assign),
Token::ModuloAssign => (34, ModuloAssign),
// 40: `$`, `@`
Token::Concat => (40, Concat),
Token::ConcatSpace => (40, ConcatSpace),
// 44: `$=`, `@=`
Token::ConcatAssign => (44, ConcatAssign),
Token::ConcatSpaceAssign => (44, ConcatSpaceAssign),
_ => return None,
};
// All operators are left-associative, so `right_precedence_rank` is set to
// `left_precedence_rank - 1` (with our "smaller is tighter" scale, this
// enforces left associativity in Pratt parsing).
//
// Since all precedences are even, subtracting one won't actually cross
// any boundary between operator groups.
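//
// Worked example (illustrative): in `a - b - c` the first `-` has left
// rank 20, so its right-hand side is parsed with min rank 19; the second
// `-` (left rank 20) is looser than 19, parsing stops there, and the
// expression associates as `(a - b) - c`.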
Some((
PrecedenceRank(left_precedence_rank),
operator,
PrecedenceRank(left_precedence_rank - 1),
))
}

View File

@ -0,0 +1,185 @@
//! Statement parsing for the language front-end.
//!
//! Implements a simple recursive-descent parser for
//! *Fermented UnrealScript statements*.
use crate::ast::{AstSpan, Statement, StatementRef};
use crate::lexer::Token;
use crate::parser::{ParseErrorKind, ResultRecoveryExt, SyncLevel};
impl<'src, 'arena> crate::parser::Parser<'src, 'arena> {
/// Parses a single statement.
///
/// Does not consume a trailing `;` except for [`Statement::Empty`].
/// The caller handles semicolons. Returns [`Some`] if a statement is
/// recognized; otherwise [`None`].
#[must_use]
pub(crate) fn parse_statement(&mut self) -> Option<StatementRef<'src, 'arena>> {
let Some((token, lexeme, location)) = self.peek_token_lexeme_and_location() else {
self.report_error_here(ParseErrorKind::UnexpectedEndOfFile);
return None;
};
match token {
// Empty statement
Token::Semicolon => {
self.advance(); // `;`
Some(self.arena.alloc(Statement::Empty, AstSpan::new(location)))
}
// UnrealScript's standard `local` variable declaration
Token::Local => {
self.advance(); // `local`
Some(
self.parse_local_variable_declaration_cont()
.widen_error_span_from(location)
.sync_error_until(self, SyncLevel::Statement)
.unwrap_or_fallback(self),
)
}
// Label definition
Token::Identifier if matches!(self.peek_token_at(1), Some(Token::Colon)) => {
self.advance(); // `Token::Identifier`
self.advance(); // `:`
Some(self.arena.alloc(
Statement::Label(self.arena.string(lexeme)),
AstSpan::range(location, self.last_visited_location()),
))
}
// C-like variable declaration
token
if token.is_valid_type_name_token()
&& Some(Token::Identifier) == self.peek_token_at(1) =>
{
self.advance(); // `TYPE_NAME`
// Next token is guaranteed to exist by the arm condition
Some(self.parse_variable_declaration_cont(lexeme))
}
// Not a statement
_ => None,
}
}
/// Parses a local variable declaration after `local` has been consumed.
///
/// Requires the next token to be a type name. Initializers are not allowed.
/// Reports and recovers from errors; the identifier list may be empty if
/// recovery fails.
fn parse_local_variable_declaration_cont(
&mut self,
) -> crate::parser::ParseResult<'src, 'arena, StatementRef<'src, 'arena>> {
let Some((type_token, type_name)) = self.peek_token_and_lexeme() else {
return Err(self.make_error_here(ParseErrorKind::UnexpectedEndOfFile));
};
if !type_token.is_valid_type_name_token() {
return Err(self.make_error_here(ParseErrorKind::LocalInvalidTypeName));
}
let declaration_start_location = self.last_visited_location();
self.advance(); // `TYPE_NAME`
let type_name = self.arena.string(type_name);
let identifiers = self.parse_local_identifier_list();
if identifiers.is_empty() {
self.make_error_here(ParseErrorKind::LocalMissingIdentifier)
.widen_error_span_from(declaration_start_location)
.report_error(self);
}
Ok(self.arena.alloc(
Statement::LocalVariableDeclaration {
type_name,
identifiers,
},
AstSpan::range(declaration_start_location, self.last_visited_location()),
))
}
/// Parses a comma-separated list of identifiers for a local declaration.
///
/// Best-effort recovery from errors. Returns an empty list if no valid
/// identifiers are found.
fn parse_local_identifier_list(
&mut self,
) -> crate::arena::ArenaVec<'arena, crate::arena::ArenaString<'arena>> {
let mut identifiers = self.arena.vec();
while let Some((token, next_variable_name)) = self.peek_token_and_lexeme() {
if token == Token::Identifier {
identifiers.push(self.arena.string(next_variable_name));
self.advance(); // `Token::Identifier`
} else {
self.report_error_here(ParseErrorKind::LocalBadVariableIdentifier);
// Try to recover to the next variable name
self.recover_until(SyncLevel::ListSeparator);
}
// Disallow initializers in `local`.
if let Some(Token::Assign) = self.peek_token() {
self.report_error_here(ParseErrorKind::LocalInitializerNotAllowed);
self.recover_until(SyncLevel::ListSeparator);
}
// Can the list continue?
// Loop cannot stall: each iteration consumes a token or breaks
if !self.eat(Token::Comma) {
break;
}
}
// Reached at end-of-file or after the last identifier
identifiers
}
/// Parses a non-local variable declaration after the type name token
/// has been consumed.
///
/// The caller must guarantee that at least one declarator follows.
/// Optional initializers are allowed.
fn parse_variable_declaration_cont(
&mut self,
type_name: &'src str,
) -> StatementRef<'src, 'arena> {
let declaration_start_location = self.last_visited_location();
let type_name = self.arena.string(type_name);
let declarations = self.parse_variable_declaration_list();
// At least one identifier is guaranteed by the caller's condition
debug_assert!(!declarations.is_empty());
self.arena.alloc(
Statement::VariableDeclaration {
type_name,
declarations,
},
AstSpan::range(declaration_start_location, self.last_visited_location()),
)
}
/// Parses a comma-separated list of declarators with optional `=`
/// initializers.
///
/// Best-effort recovery on errors.
/// The caller should invoke this when the next token starts a declarator.
fn parse_variable_declaration_list(
&mut self,
) -> crate::arena::ArenaVec<'arena, crate::ast::VariableDeclarator<'src, 'arena>> {
let mut variables = self.arena.vec();
while let Some((token, next_variable_name)) = self.peek_token_and_lexeme() {
if token == Token::Identifier {
self.advance(); // `Token::Identifier`
let name = self.arena.string(next_variable_name);
let initializer = if self.eat(Token::Assign) {
Some(self.parse_expression())
} else {
None
};
variables.push(crate::ast::VariableDeclarator { name, initializer });
} else {
self.report_error_here(ParseErrorKind::DeclBadVariableIdentifier);
// Try to recover to the next variable name
self.recover_until(SyncLevel::ListSeparator);
}
// Can the list continue?
// Loop cannot stall: each iteration consumes a token or breaks
if !self.eat(Token::Comma) {
break;
}
}
// Reached at end-of-file or after the last declarator
variables
}
}

View File

@ -0,0 +1,227 @@
use crate::arena::ArenaVec;
use crate::ast::{AstSpan, ExpressionRef, StatementRef};
use crate::lexer::{Token, TokenLocation};
use crate::parser::{ParseErrorKind, ResultRecoveryExt};
impl<'src, 'arena> crate::parser::Parser<'src, 'arena> {
/// Parses a `switch` expression after the `switch` keyword was consumed.
///
/// Arm bodies accept statements and expressions. A final expression without
/// `;` in the last arm becomes the switch's tail value if none was
/// captured yet.
/// Only one `default` case arm is allowed.
/// Returns a best-effort switch node on premature EOF.
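///
/// Shape being parsed (illustrative FerUS snippet):
///
/// ```text
/// switch (selector) {
/// case 1:
/// case 2:
///     x = 1;
/// default:
///     x = 2;
/// }
/// ```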
#[must_use]
pub(crate) fn parse_switch_cont(
&mut self,
switch_start_location: TokenLocation,
) -> crate::parser::ParseExpressionResult<'src, 'arena> {
let selector = self.parse_expression();
self.expect(
Token::Brace(crate::lexer::BraceKind::Normal),
ParseErrorKind::SwitchMissingBody,
)
.report_error(self);
let (mut cases, mut default_arm, mut tail) = (self.arena.vec(), None, None);
let mut span = AstSpan::new(switch_start_location);
loop {
let Some((token, token_location)) = self.peek_token_and_location() else {
self.report_error_here(ParseErrorKind::UnexpectedEndOfFile);
span.extend_to(self.peek_location());
return Ok(self.alloc_switch_node(selector, cases, default_arm, tail, span));
};
match token {
Token::RightBrace => {
self.advance(); // '}'
span.extend_to(token_location);
return Ok(self.alloc_switch_node(selector, cases, default_arm, tail, span));
}
Token::Case => {
if default_arm.is_some() {
self.report_error_here(ParseErrorKind::SwitchCasesAfterDefault);
}
let case_node = self.parse_switch_case_group(token_location, &mut tail);
cases.push(case_node);
}
Token::Default => {
if default_arm.is_some() {
self.report_error_here(ParseErrorKind::SwitchDuplicateDefault);
}
// We still parse a duplicate default to surface all errors.
// Bodies are effectively fused for error reporting;
// compilation stops anyway, so this trades AST correctness
// for diagnostics.
self.parse_switch_default_arm(
token_location,
default_arm.get_or_insert_with(|| self.arena.vec()),
&mut tail,
);
}
// This can only be triggered before parsing any `case` or
// `default` arms, since they stop either at the start of
// another arm declaration (e.g. at `case`/`default`) or
// at the `}` that ends switch body.
_ => self.parse_switch_preamble_items(&mut tail),
}
// Ensure forward progress under errors to avoid infinite loops.
if self.peek_location() <= token_location {
self.advance();
}
}
}
/// Parses a stacked `case` group and its body:
/// `case <expr>: (case <expr>:)* <arm-body-until-boundary>`.
///
/// Returns the allocated [`crate::ast::CaseRef`] node.
#[must_use]
fn parse_switch_case_group(
&mut self,
first_case_location: TokenLocation,
tail: &mut Option<ExpressionRef<'src, 'arena>>,
) -> crate::ast::CaseRef<'src, 'arena> {
let mut labels = self.arena.vec();
while let Some((Token::Case, case_location)) = self.peek_token_and_location() {
// Guaranteed progress: we entered on `Token::Case`.
self.advance(); // 'case'
labels.push(self.parse_expression());
// Enforce `:` after each case with statement-level recovery.
self.expect(Token::Colon, ParseErrorKind::SwitchCaseMissingColon)
.widen_error_span_from(case_location)
.sync_error_until(self, crate::parser::SyncLevel::Statement)
.report_error(self);
}
let mut body = self.arena.vec();
self.parse_switch_arm_body(&mut body, tail);
let case_span = compute_case_span(first_case_location, &labels, &body);
self.arena
.alloc(crate::ast::SwitchCase { labels, body }, case_span)
}
/// Parses the `default :` arm and its body.
///
/// Does not consume a boundary token after the body.
fn parse_switch_default_arm(
&mut self,
default_location: TokenLocation,
statements: &mut ArenaVec<'arena, StatementRef<'src, 'arena>>,
tail: &mut Option<ExpressionRef<'src, 'arena>>,
) {
self.advance(); // 'default'
self.expect(Token::Colon, ParseErrorKind::SwitchCaseMissingColon)
.widen_error_span_from(default_location)
.sync_error_until(self, crate::parser::SyncLevel::Statement)
.report_error(self);
self.parse_switch_arm_body(statements, tail);
}
/// Parses items of a single switch arm body until a boundary token or EOF.
///
/// Boundary tokens: `case`, `default`, `}`.
fn parse_switch_arm_body(
&mut self,
statements: &mut ArenaVec<'arena, StatementRef<'src, 'arena>>,
tail: &mut Option<ExpressionRef<'src, 'arena>>,
) {
// No need to report end-of-file as it'll be done by
// `parse_switch_cont`.
while let Some((token, token_location)) = self.peek_token_and_location() {
match token {
// Complain about tail instruction if `switch` body
// doesn't end here
Token::Case | Token::Default => {
if let Some(tail_expression) = tail.take() {
self.report_error_here(ParseErrorKind::SwitchBareExpressionBeforeNextArm);
let span = *tail_expression.span();
let stmt = self
.arena
.alloc(crate::ast::Statement::Expression(tail_expression), span);
statements.push(stmt);
}
break;
}
Token::RightBrace => break,
_ => (),
}
// We know that at this point:
// 1. There is still a token and it is not EOF;
// 2. It isn't end of the block.
// So a pending tail expression here means its `;` is missing!
if let Some(tail_expression) = tail.take() {
self.report_error_here(ParseErrorKind::BlockMissingSemicolonAfterExpression);
let tail_span = *tail_expression.span();
let node = self.arena.alloc(
crate::ast::Statement::Expression(tail_expression),
tail_span,
);
statements.push(node);
}
*tail = self.parse_block_item(statements);
// Ensure forward progress under errors to avoid infinite loops.
if self.peek_location() <= token_location {
self.advance();
}
}
}
/// Parses items that were found in code *before* any arm (`case`/`default`)
/// declaration.
///
/// These aren't allowed, but we still want to perform a proper parsing step
/// to report whatever errors we can, in case the programmer simply forgot
/// to put an arm declaration.
///
/// Boundary tokens: `case`, `default`, `}`.
fn parse_switch_preamble_items(&mut self, tail: &mut Option<ExpressionRef<'src, 'arena>>) {
// Report the spurious token.
self.report_error_here(ParseErrorKind::SwitchTopLevelItemNotCase);
// Discard parsed statements into a sink vector.
// This is a bit "hacky", but I don't want to adapt code to skip
// production of AST nodes just to report errors in
// one problematic case.
let mut sink = self.arena.vec();
self.parse_switch_arm_body(&mut sink, tail);
}
/// Helper to allocate a `Switch` expression with the given span.
#[must_use]
fn alloc_switch_node(
&mut self,
selector: ExpressionRef<'src, 'arena>,
cases: ArenaVec<'arena, crate::ast::CaseRef<'src, 'arena>>,
default_arm: Option<ArenaVec<'arena, StatementRef<'src, 'arena>>>,
tail: Option<ExpressionRef<'src, 'arena>>,
span: AstSpan,
) -> ExpressionRef<'src, 'arena> {
self.arena.alloc(
crate::ast::Expression::Switch {
selector,
cases,
default_arm,
tail,
},
span,
)
}
}
/// Computes [`AstSpan`] covering all labels and the body.
#[must_use]
fn compute_case_span(
labels_start_location: TokenLocation,
labels: &[ExpressionRef],
body: &[StatementRef],
) -> AstSpan {
let mut span = AstSpan {
from: labels_start_location,
to: labels_start_location,
};
if let Some(last_statement) = body.last() {
span.extend_to(last_statement.span().to);
} else if let Some(last_label) = labels.last() {
span.extend_to(last_label.span().to);
}
span
}

66
rottlib/src/parser/mod.rs Normal file
View File

@ -0,0 +1,66 @@
//! Parser for Fermented UnrealScript (FerUS).
//!
//! Consumes tokens from [`crate::lexer::TokenizedFile`] and allocates AST
//! nodes in [`crate::arena::Arena`]. Basic expressions use a Pratt parser;
//! the rest rely on recursive descent in [`crate::parser::grammar`].
//! Non-fatal errors accumulate in `Parser::diagnostics` as
//! [`crate::diagnostics::Diagnostic`]; recovery skips to sync points defined by
//! [`crate::parser::recovery::SyncLevel`] and synthesizes error nodes while
//! keeping the parse going.
//!
//! Components:
//! - `cursor`: token I/O, `peek`/`advance`, and lazy trivia capture;
//! - `trivia`: trailing comments and newline counts keyed to
//! the previous significant token and BOF;
//! - `recovery`: panic-mode skipping and recovery adapters for results;
//! - `pretty`: printable trees (`ExprTree`, `StmtTree`) for messages and dumps;
//! - `errors`: [`ParseError`] and [`ParseErrorKind`].
//!
//! Lifetimes: `'src` ties to lexer slices; `'arena` ties to AST allocation.
//!
//! Guarantees:
//!
//! - Parser does not abort on user input. It emits diagnostics and error nodes.
//! - Trivia is recorded as you scan and can be queried by formatters/linters.
//! - Public surface keeps [`Parser`] small;
//! low-level plumbing lives in submodules.
use super::lexer;
pub use lexer::{TokenPiece, Tokens};
mod cursor;
mod errors;
mod grammar;
pub mod pretty;
mod recovery;
mod trivia;
pub use pretty::{ExprTree, StmtTree};
pub use errors::ParseError;
pub(crate) use errors::{ParseErrorKind, ParseResult};
pub(crate) use recovery::{ResultRecoveryExt, SyncLevel};
pub(crate) use trivia::{TriviaKind, TriviaToken};
pub type ParseExpressionResult<'src, 'arena> =
ParseResult<'src, 'arena, crate::ast::ExpressionRef<'src, 'arena>>;
/// A recursive-descent parser over tokens from [`crate::lexer::TokenizedFile`].
pub struct Parser<'src, 'arena> {
arena: &'arena crate::arena::Arena,
pub diagnostics: Vec<crate::diagnostics::Diagnostic>,
cursor: cursor::CursorComponent<'src>,
trivia: trivia::TriviaComponent<'src>,
}
impl<'src, 'arena> Parser<'src, 'arena> {
pub fn new(file: &'src lexer::TokenizedFile<'src>, arena: &'arena crate::arena::Arena) -> Self {
Self {
arena,
diagnostics: Vec::new(),
cursor: cursor::CursorComponent::new(file),
trivia: trivia::TriviaComponent::default(),
}
}
}

View File

@ -0,0 +1,353 @@
use crate::ast::{Expression, Statement, SwitchCase, VariableDeclarator};
use core::fmt;
/// A borrow of either a statement or an expression node,
/// plus helpers to enrich the printed tree.
enum AnyNode<'src, 'a, 'b> {
Stmt(&'b Statement<'src, 'a>),
Expr(&'b Expression<'src, 'a>),
Case(&'b SwitchCase<'src, 'a>),
/// A leaf line with a preformatted label (e.g., variable names).
Text(String),
/// Wraps a child with a tag like "cond", "body", "else", "init".
Tagged(&'static str, Box<AnyNode<'src, 'a, 'b>>),
}
/// Public wrappers to print trees starting from either kind of node.
pub struct StmtTree<'src, 'a, 'b>(pub &'b Statement<'src, 'a>);
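/// A sketch of the rendered tree for `1 + 2` (operator spelling is
/// illustrative; it depends on the operator's `Display` impl):
///
/// ```text
/// └─ Binary +
///    ├─ Int 1
///    └─ Int 2
/// ```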
pub struct ExprTree<'src, 'a, 'b>(pub &'b Expression<'src, 'a>);
impl<'src, 'a, 'b> fmt::Display for StmtTree<'src, 'a, 'b> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt_node(AnyNode::Stmt(self.0), f, "", true)
}
}
impl<'src, 'a, 'b> fmt::Display for ExprTree<'src, 'a, 'b> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt_node(AnyNode::Expr(self.0), f, "", true)
}
}
fn fmt_node<'src, 'a, 'b>(
node: AnyNode<'src, 'a, 'b>,
f: &mut fmt::Formatter<'_>,
prefix: &str,
is_last: bool,
) -> fmt::Result {
write!(f, "{}{}─ ", prefix, if is_last { "└" } else { "├" })?;
writeln!(f, "{}", label(&node))?;
let new_prefix = format!("{}{}", prefix, if is_last { "   " } else { "│  " });
let kids = children(node);
let len = kids.len();
for (i, child) in kids.into_iter().enumerate() {
let last = i + 1 == len;
fmt_node(child, f, &new_prefix, last)?;
}
Ok(())
}
/// ----- Labeling -----
fn label<'src, 'a, 'b>(node: &AnyNode<'src, 'a, 'b>) -> String {
match node {
AnyNode::Expr(e) => expr_label(e),
AnyNode::Stmt(s) => stmt_label(s),
AnyNode::Case(c) => case_label(c),
AnyNode::Text(s) => s.clone(),
AnyNode::Tagged(tag, inner) => format!("{tag}: {}", label(inner)),
}
}
fn quote_str(s: &str) -> String {
let mut out = String::with_capacity(s.len() + 2);
out.push('"');
for ch in s.chars() {
match ch {
'\\' => out.push_str("\\\\"),
'"' => out.push_str("\\\""),
'\n' => out.push_str("\\n"),
'\r' => out.push_str("\\r"),
'\t' => out.push_str("\\t"),
c => out.push(c),
}
}
out.push('"');
out
}
fn expr_label<'src, 'a>(e: &Expression<'src, 'a>) -> String {
match e {
Expression::Binary(_, op, _) => format!("Binary {op}"),
Expression::LeftUnary(op, _) => format!("UnaryL {op}"),
Expression::RightUnary(_, op) => format!("UnaryR {op}"),
Expression::Identifier(s) => format!("Ident {s}"),
Expression::String(s) => {
// Avoid assuming ArenaString exposes &str; go via Display -> String.
format!("String {}", quote_str(&s.to_string()))
}
Expression::Integer(i) => format!("Int {i}"),
Expression::Float(x) => format!("Float {x}"),
Expression::Bool(true) => "Bool true".into(),
Expression::Bool(false) => "Bool false".into(),
Expression::None => "None".into(),
Expression::Parentheses(_) => "Parentheses".into(),
Expression::Block { statements, tail } => {
let n = statements.len() + usize::from(tail.is_some());
let tail_s = if tail.is_some() { " tail" } else { "" };
format!("BlockExpr ({n} items{tail_s})")
}
Expression::If { .. } => "IfExpr".into(),
Expression::While { .. } => "WhileExpr".into(),
Expression::DoUntil { .. } => "DoUntilExpr".into(),
Expression::ForEach { .. } => "ForEachExpr".into(),
Expression::For { .. } => "ForExpr".into(),
Expression::Switch {
cases,
default_arm: default,
..
} => {
let d = if default.is_some() { " yes" } else { " no" };
format!("SwitchExpr cases={} default:{}", cases.len(), d)
}
Expression::Goto(label) => format!("Goto {label}"),
Expression::Continue => "Continue".into(),
Expression::Break(Some(_)) => "Break value".into(),
Expression::Break(None) => "Break".into(),
Expression::Return(Some(_)) => "Return value".into(),
Expression::Return(None) => "Return".into(),
Expression::Error => "Error".into(),
}
}
/// ----- Children collection -----
fn children<'src, 'a, 'b>(node: AnyNode<'src, 'a, 'b>) -> Vec<AnyNode<'src, 'a, 'b>> {
match node {
AnyNode::Expr(e) => expr_children(e),
AnyNode::Stmt(s) => stmt_children(s),
AnyNode::Case(c) => case_children(c),
AnyNode::Text(_) => vec![],
AnyNode::Tagged(_, inner) => children(*inner),
}
}
/// Expression children can include statements inside Block/Switch.
fn expr_children<'src, 'a, 'b>(e: &'b Expression<'src, 'a>) -> Vec<AnyNode<'src, 'a, 'b>> {
match e {
// Purely expression subtrees
Expression::Binary(lhs, _, rhs) => vec![AnyNode::Expr(&*lhs), AnyNode::Expr(&*rhs)],
Expression::LeftUnary(_, expr) => vec![AnyNode::Expr(&*expr)],
Expression::RightUnary(expr, _) => vec![AnyNode::Expr(&*expr)],
Expression::Parentheses(expr) => vec![AnyNode::Expr(&*expr)],
// Structured expression forms
Expression::Block { statements, tail } => {
let mut out: Vec<AnyNode<'src, 'a, 'b>> = statements
.iter()
.map(|s| AnyNode::Tagged("stmt", Box::new(AnyNode::Stmt(&*s))))
.collect();
if let Some(t) = tail.as_ref() {
out.push(AnyNode::Tagged("tail", Box::new(AnyNode::Expr(&*t))));
}
out
}
Expression::If {
condition,
body,
else_body,
} => {
let mut out = vec![
AnyNode::Tagged("cond", Box::new(AnyNode::Expr(&*condition))),
AnyNode::Tagged("body", Box::new(AnyNode::Expr(&*body))),
];
if let Some(e) = else_body {
out.push(AnyNode::Tagged("else", Box::new(AnyNode::Expr(&*e))));
}
out
}
Expression::While { condition, body } => vec![
AnyNode::Tagged("cond", Box::new(AnyNode::Expr(&*condition))),
AnyNode::Tagged("body", Box::new(AnyNode::Expr(&*body))),
],
Expression::DoUntil { condition, body } => vec![
AnyNode::Tagged("body", Box::new(AnyNode::Expr(&*body))),
AnyNode::Tagged("until", Box::new(AnyNode::Expr(&*condition))),
],
Expression::ForEach { iterator, body } => vec![
AnyNode::Tagged("iter", Box::new(AnyNode::Expr(&*iterator))),
AnyNode::Tagged("body", Box::new(AnyNode::Expr(&*body))),
],
Expression::For {
init,
condition,
step,
body,
} => {
let mut out = Vec::with_capacity(4);
if let Some(i) = init {
out.push(AnyNode::Tagged("init", Box::new(AnyNode::Expr(&*i))));
}
if let Some(c) = condition {
out.push(AnyNode::Tagged("cond", Box::new(AnyNode::Expr(&*c))));
}
if let Some(s) = step {
out.push(AnyNode::Tagged("step", Box::new(AnyNode::Expr(&*s))));
}
out.push(AnyNode::Tagged("body", Box::new(AnyNode::Expr(&*body))));
out
}
Expression::Switch {
selector,
cases,
default_arm: default,
tail,
} => {
let mut out: Vec<AnyNode<'src, 'a, 'b>> = vec![AnyNode::Tagged(
"selector",
Box::new(AnyNode::Expr(&*selector)),
)];
for case in cases.iter() {
out.push(AnyNode::Tagged("case", Box::new(AnyNode::Case(&*case))));
}
if let Some(d) = default.as_ref() {
for stmt in d.iter() {
out.push(AnyNode::Tagged("default", Box::new(AnyNode::Stmt(&*stmt))));
}
}
if let Some(t) = tail.as_ref() {
out.push(AnyNode::Tagged("tail", Box::new(AnyNode::Expr(&*t))));
}
out
}
// Leaves
Expression::Identifier(_)
| Expression::String(_)
| Expression::Integer(_)
| Expression::Float(_)
| Expression::Bool(_)
| Expression::None
| Expression::Goto(_)
| Expression::Continue
| Expression::Break(None)
| Expression::Return(None)
| Expression::Error => vec![],
// Single optional-child leaves
Expression::Break(Some(v)) => vec![AnyNode::Tagged("value", Box::new(AnyNode::Expr(&*v)))],
Expression::Return(Some(v)) => {
vec![AnyNode::Tagged("value", Box::new(AnyNode::Expr(&*v)))]
}
}
}
fn stmt_label<'src, 'a>(s: &Statement<'src, 'a>) -> String {
use Statement::*;
match s {
Empty => "Empty ;".into(),
Expression(_) => "Expression".into(),
LocalVariableDeclaration {
type_name,
identifiers: variable_names,
} => {
let count = variable_names.len();
let names = variable_names
.iter()
.map(|n| n.to_string())
.collect::<Vec<_>>()
.join(", ");
format!("LocalVarDecl type={type_name} count={count} names=[{names}]")
}
VariableDeclaration {
type_name,
declarations: variable_names,
} => {
let total = variable_names.len();
let inits = variable_names
.iter()
.filter(|v| v.initializer.is_some())
.count();
let names = variable_names
.iter()
.map(|VariableDeclarator { name, .. }| name.to_string())
.collect::<Vec<_>>()
.join(", ");
format!("VarDecl type={type_name} vars={total} inits={inits} names=[{names}]")
}
Label(name) => format!("Label {name}"),
Error => "Error".into(),
}
}
fn stmt_children<'src, 'a, 'b>(s: &'b Statement<'src, 'a>) -> Vec<AnyNode<'src, 'a, 'b>> {
use Statement::*;
match s {
Empty | Label(_) | Error => vec![],
Expression(expr) => vec![AnyNode::Expr(&*expr)],
LocalVariableDeclaration {
identifiers: variable_names,
..
} => variable_names
.iter()
.map(|n| AnyNode::Text(format!("name: {n}")))
.collect(),
VariableDeclaration {
declarations: variable_names,
..
} => {
let mut out = Vec::new();
for VariableDeclarator {
name,
initializer: initial_value,
} in variable_names.iter()
{
out.push(AnyNode::Text(format!("var: {name}")));
if let Some(init_expr) = initial_value {
out.push(AnyNode::Tagged(
"init",
Box::new(AnyNode::Expr(&*init_expr)),
));
}
}
out
}
}
}
fn case_children<'src, 'a, 'b>(c: &'b SwitchCase<'src, 'a>) -> Vec<AnyNode<'src, 'a, 'b>> {
let mut out = Vec::new();
for lbl in c.labels.iter() {
out.push(AnyNode::Tagged("label", Box::new(AnyNode::Expr(&*lbl))));
}
for stmt in c.body.iter() {
out.push(AnyNode::Tagged("stmt", Box::new(AnyNode::Stmt(&*stmt))));
}
out
}
fn case_label<'src, 'a>(c: &SwitchCase<'src, 'a>) -> String {
let l = c.labels.len();
let b = c.body.len();
format!("Case labels={l} body_items={b}")
}

View File

@ -0,0 +1,253 @@
//! Best-effort error recovery utilities.
//!
//! The parser recovers from errors by skipping tokens until a synchronization
//! token is found. The sync target is chosen from [`SyncLevel`] based on
//! the error kind. Methods on [`ParseResult`] let callers widen the error span,
//! synchronize, report, and produce fallback values.
use crate::lexer::{Token, TokenLocation};
use crate::parser::{ParseError, ParseResult, Parser};
/// Synchronization groups the parser can stop at during recovery.
///
/// Stronger levels subsume weaker ones. The enum's variant order defines this
/// ordering of strength via [`Ord`]; changing it changes recovery behavior.
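///
/// For example (illustrative), recovery that targets `Statement` also stops
/// at `BlockBoundary` and `TopDeclaration` tokens, but skips past weaker
/// sync points such as list separators:
///
/// ```ignore
/// assert!(SyncLevel::Statement > SyncLevel::ListSeparator);
/// assert!(SyncLevel::TopDeclaration > SyncLevel::Statement);
/// ```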
#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq, PartialOrd, Ord)]
pub(crate) enum SyncLevel {
/// Tokens that appear inside expressions.
///
/// Includes operators, member access `.`, ternary `? :`, an opening `(`,
/// and identifiers.
Expression,
/// List separator `,`.
ListSeparator,
/// Close of a parenthesized subexpression `)`.
CloseParenthesis,
/// Close of an index or list `]`.
CloseBracket,
/// Statement boundary or starter.
Statement,
/// Block boundary braces (both `{` and `}`).
BlockBoundary,
/// Start of a top-level or class-level declaration.
TopDeclaration,
}
impl SyncLevel {
/// Converts [`Token`] to its [`SyncLevel`], if it has one.
fn for_token(token: Token) -> Option<SyncLevel> {
use SyncLevel::*;
use Token::*;
match token {
Exponentiation | Increment | Decrement | Not | BitwiseNot | Dot | Cross | Multiply
| Divide | Modulo | Plus | Minus | ConcatSpace | Concat | LeftShift
| LogicalRightShift | RightShift | Less | LessEqual | Greater | GreaterEqual
| Equal | NotEqual | ApproximatelyEqual | ClockwiseFrom | BitwiseAnd | BitwiseOr
| BitwiseXor | And | Xor | Or | Assign | MultiplyAssign | DivideAssign
| ModuloAssign | PlusAssign | MinusAssign | ConcatAssign | ConcatSpaceAssign
| Period | Question | Colon | LeftParenthesis | Identifier => Some(Expression),
Comma => Some(ListSeparator),
RightParenthesis => Some(CloseParenthesis),
RightBracket => Some(CloseBracket),
Case | Default | If | Else | Switch | For | ForEach | While | Do | Return | Break
| Continue | Local | Semicolon => Some(Statement),
Brace(_) | RightBrace => Some(BlockBoundary),
Class | Struct | Enum | State | Function | Event | Delegate | Operator | Var
| Replication | NativeReplication | DefaultProperties | CppText | ExecDirective => {
Some(TopDeclaration)
}
_ => Option::None,
}
}
}
impl<'src, 'arena> Parser<'src, 'arena> {
/// Converts a parse error into a diagnostic and queues it.
///
/// Placeholder implementation.
fn handle_error(&mut self, error: ParseError) {
let diagnostic = crate::diagnostics::DiagnosticBuilder::error(format!(
"error {:?} while parsing",
error.kind
))
.primary_label(error.source_span, "happened here")
.build();
self.diagnostics.push(diagnostic);
}
/// Reports a parser error with [`crate::parser::ParseErrorKind`] at
/// the current location and queues an appropriate diagnostic.
pub fn report_error_here(&mut self, error_kind: crate::parser::ParseErrorKind) {
let new_error = self.make_error_here(error_kind);
self.handle_error(new_error);
}
/// Skips tokens until a token with `min_sync` level or stronger is found.
///
/// Reaches end-of-file if no qualifying token is found.
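    ///
    /// A usage sketch (illustrative): after a malformed expression, a caller
    /// might skip ahead to the next statement boundary or anything stronger.
    ///
    /// ```ignore
    /// // Stops at `;`, `if`, `return`, a brace, a declaration, or EOF.
    /// self.recover_until(SyncLevel::Statement);
    /// ```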
pub(crate) fn recover_until(&mut self, min_sync: SyncLevel) {
while let Some(next_token) = self.peek_token() {
if let Some(next_token_sync_level) = SyncLevel::for_token(next_token)
&& next_token_sync_level >= min_sync
{
break;
}
// Always advances when `peek_token()` is `Some(...)`,
// so the loop cannot be infinite.
self.advance();
}
}
}
/// Supplies a fallback value after a parse error so parsing can continue and
/// reveal further errors.
pub(crate) trait RecoveryFallback<'src, 'arena>: Sized {
fn fallback_value(parser: &Parser<'src, 'arena>, err: &ParseError) -> Self;
}
/// Extends [`ParseResult`] with recovery-related methods for
/// fluent error handling.
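///
/// A sketch of the intended fluent style (hypothetical call site; assumes
/// `parse_expression` returns a [`ParseResult`] whose value type implements
/// [`RecoveryFallback`]):
///
/// ```ignore
/// let start = self.peek_location();
/// let expr = self
///     .parse_expression()
///     .widen_error_span_from(start)
///     .sync_error_until(self, SyncLevel::Statement)
///     .unwrap_or_fallback(self);
/// ```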
pub(crate) trait ResultRecoveryExt<'src, 'arena, T>: Sized {
    /// Extends the left end of the error span to `from`, if `from` lies
    /// earlier than the current start.
///
/// Does nothing if `Self` is `Ok(...)`.
#[must_use]
fn widen_error_span_from(self, from: TokenLocation) -> Self;
/// Extends the right end of the error span up to but not including
/// the next token of the given sync `level`.
///
/// Does nothing if `Self` is `Ok(...)`.
#[must_use]
fn sync_error_until(self, parser: &mut Parser<'src, 'arena>, level: SyncLevel) -> Self;
/// Extends the right end of the error span to include the next token of
/// the given sync `level`.
///
/// Does nothing if `Self` is `Ok(...)`.
#[must_use]
fn sync_error_at(self, parser: &mut Parser<'src, 'arena>, level: SyncLevel) -> Self;
    /// Returns the parsed value on success, or a best-effort fallback after
    /// reporting the error.
#[must_use]
fn unwrap_or_fallback(self, parser: &mut Parser<'src, 'arena>) -> T;
    /// Reports the error, if any, and discards the result.
fn report_error(self, parser: &mut Parser<'src, 'arena>);
}
impl<'src, 'arena, T> ResultRecoveryExt<'src, 'arena, T> for ParseResult<'src, 'arena, T>
where
T: RecoveryFallback<'src, 'arena>,
{
fn widen_error_span_from(mut self, from: TokenLocation) -> Self {
if let Err(ref mut error) = self {
error.source_span.from = std::cmp::min(error.source_span.from, from);
}
self
}
fn sync_error_until(mut self, parser: &mut Parser<'src, 'arena>, level: SyncLevel) -> Self {
if let Err(ref mut error) = self {
parser.recover_until(level);
error.source_span.to = parser.last_visited_location();
}
self
}
fn sync_error_at(mut self, parser: &mut Parser<'src, 'arena>, level: SyncLevel) -> Self {
if let Err(ref mut error) = self {
parser.recover_until(level);
error.source_span.to = parser.peek_location();
// If we're at end-of-file, this'll simply do nothing.
parser.advance();
}
self
}
fn unwrap_or_fallback(self, parser: &mut Parser<'src, 'arena>) -> T {
self.unwrap_or_else(|error| {
let value = T::fallback_value(parser, &error);
parser.handle_error(error);
value
})
}
fn report_error(self, parser: &mut Parser<'src, 'arena>) {
if let Err(error) = self {
parser.handle_error(error);
}
}
}
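// The same extension methods are implemented directly on `ParseError`, so a
// freshly built error can be widened, synchronized, and reported without first
// being wrapped in a `ParseResult`. A hedged sketch (hypothetical `kind`):
//
//     self.make_error_here(kind)
//         .sync_error_at(self, SyncLevel::CloseParenthesis)
//         .report_error(self);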
impl<'src, 'arena> ResultRecoveryExt<'src, 'arena, ()> for ParseError {
fn widen_error_span_from(mut self, from: TokenLocation) -> Self {
self.source_span.from = std::cmp::min(self.source_span.from, from);
self
}
fn sync_error_until(mut self, parser: &mut Parser<'src, 'arena>, level: SyncLevel) -> Self {
parser.recover_until(level);
self.source_span.to = parser.last_visited_location();
self
}
fn sync_error_at(mut self, parser: &mut Parser<'src, 'arena>, level: SyncLevel) -> Self {
parser.recover_until(level);
self.source_span.to = parser.peek_location();
// If we're at end-of-file, this'll simply do nothing.
parser.advance();
self
}
    fn unwrap_or_fallback(self, parser: &mut Parser<'src, 'arena>) {
parser.handle_error(self);
}
fn report_error(self, parser: &mut Parser<'src, 'arena>) {
parser.handle_error(self);
}
}
impl<'src, 'arena> RecoveryFallback<'src, 'arena> for TokenLocation {
fn fallback_value(_: &Parser<'src, 'arena>, error: &ParseError) -> Self {
error.source_span.to
}
}
impl<'src, 'arena> RecoveryFallback<'src, 'arena> for crate::ast::ExpressionRef<'src, 'arena> {
fn fallback_value(parser: &Parser<'src, 'arena>, error: &ParseError) -> Self {
crate::arena::ArenaNode::new_in(
crate::ast::Expression::Error,
error.source_span,
parser.arena,
)
}
}
impl<'src, 'arena> RecoveryFallback<'src, 'arena> for crate::ast::StatementRef<'src, 'arena> {
fn fallback_value(parser: &Parser<'src, 'arena>, error: &ParseError) -> Self {
crate::arena::ArenaNode::new_in(
crate::ast::Statement::Error,
error.source_span,
parser.arena,
)
}
}
impl<'src, 'arena, T> RecoveryFallback<'src, 'arena> for Option<T>
where
T: RecoveryFallback<'src, 'arena>,
{
fn fallback_value(parser: &Parser<'src, 'arena>, error: &ParseError) -> Self {
Some(T::fallback_value(parser, error))
}
}

View File

@ -0,0 +1,297 @@
//! This module provides a trivia-token collection mechanism that lets parser
//! code iterate over significant tokens while ignoring trivia, yet preserves
//! full trivia information for linting, formatting, and documentation.
//!
//! Tokens considered *trivia* are:
//!
//! 1. [`crate::lexer::Token::LineComment`];
//! 2. [`crate::lexer::Token::BlockComment`];
//! 3. [`crate::lexer::Token::Newline`];
//! 4. [`crate::lexer::Token::Whitespace`].
//!
//! Every other token is considered *significant*.
use crate::lexer::TokenLocation;
/// Types of trivia tokens, corresponding directly to the matching variants of
/// [`crate::lexer::Token`].
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub(crate) enum TriviaKind {
Whitespace,
Newline,
LineComment,
BlockComment,
}
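// Illustrative: `TriviaKind::try_from(Token::Newline)` yields
// `Ok(TriviaKind::Newline)`, while any significant token such as
// `Token::Identifier` yields `Err(())`.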
impl std::convert::TryFrom<crate::lexer::Token> for TriviaKind {
type Error = ();
fn try_from(token: crate::lexer::Token) -> Result<Self, Self::Error> {
use crate::lexer::Token;
match token {
Token::Whitespace => Ok(TriviaKind::Whitespace),
Token::Newline => Ok(TriviaKind::Newline),
Token::LineComment => Ok(TriviaKind::LineComment),
Token::BlockComment => Ok(TriviaKind::BlockComment),
_ => Err(()),
}
}
}
/// Complete description of a trivia token.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub(crate) struct TriviaToken<'src> {
/// Specific type of the trivia.
pub kind: TriviaKind,
/// Actual content of the token.
pub text: &'src str,
/// Location of this trivia token in the token stream.
pub location: TokenLocation,
}
type TriviaRange = std::ops::Range<usize>;
type TriviaMap = std::collections::HashMap<TriviaLocation, TriviaRange>;
/// Immutable index over all recorded trivia.
///
/// Enables O(1) access to trivia immediately before/after any significant
/// token, plus file-leading and file-trailing trivia. Returned slices borrow
/// the index's internal storage; the trivia text inside them borrows
/// the original source for `'src`.
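///
/// A usage sketch (hypothetical `loc` of a significant token):
///
/// ```ignore
/// let index = component.into_index();
/// for trivia in index.before_token(loc) {
///     println!("{:?}: {}", trivia.kind, trivia.text);
/// }
/// ```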
#[derive(Clone, Debug, Default)]
#[allow(dead_code)]
pub(crate) struct TriviaIndex<'src> {
/// All trivia tokens, stored contiguously in file order.
tokens: Vec<TriviaToken<'src>>,
/// Maps token location to the trivia tokens stored right after it.
after_map: TriviaMap,
/// Maps token location to the trivia tokens stored right before it.
before_map: TriviaMap,
}
/// Extends [`TokenLocation`] with a *start of file* value.
///
/// Regular [`TokenLocation`] does not need this value, but trivia requires
/// a way to express "trivia before any significant token".
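///
/// Variant order matters here: the derived [`Ord`] makes `StartOfFile`
/// compare less than every `At(...)` value, which the debug-time
/// monotonicity checks in [`TriviaComponent`] rely on.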
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
enum TriviaLocation {
/// Position before any tokens, trivia or otherwise.
StartOfFile,
/// This variant can also express "end of file" through
/// [`TokenLocation::EndOfFile`].
At(TokenLocation),
}
/// Mutable builder for `TriviaIndex`.
///
/// Used inside the parser to record trivia between successive significant
/// tokens in file order, then frozen via `into_index`.
#[derive(Debug, Default)]
#[allow(dead_code)]
pub(crate) struct TriviaComponent<'src> {
/// All trivia tokens, stored contiguously in file order.
tokens: Vec<TriviaToken<'src>>,
/// Maps token location to the trivia tokens stored right after it.
after_map: TriviaMap,
/// Maps token location to the trivia tokens stored right before it.
before_map: TriviaMap,
/// Location of the last gap's right boundary,
/// for debug-time invariant checks.
#[cfg(debug_assertions)]
last_right_boundary: Option<TriviaLocation>,
}
impl<'src> TriviaComponent<'src> {
/// Records trivia tokens that lie strictly between
/// `previous_token_location` and `next_token_location`.
///
/// [`None`] for `previous_token_location` means beginning of file;
/// `next_token_location` may be [`TokenLocation::EndOfFile`].
///
/// Empties `gap_trivia` without changing its capacity.
///
    /// Requirements (checked in debug builds):
    /// - `previous_token_location < next_token_location`;
    /// - calls are monotonic: each gap starts at or after the last end;
    /// - `gap_trivia` is nonempty and strictly ordered by `location`;
    /// - all tokens in `gap_trivia` lie strictly inside `(prev, next)`.
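    ///
    /// A sketch of a single recording step (hypothetical locations; `gap`
    /// holds the trivia lexed between the two tokens):
    ///
    /// ```ignore
    /// component.record_between_locations(Some(prev_loc), next_loc, &mut gap);
    /// assert!(gap.is_empty()); // drained into the component
    /// ```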
pub(crate) fn record_between_locations(
&mut self,
previous_token_location: Option<TokenLocation>,
next_token_location: TokenLocation,
gap_trivia: &mut Vec<TriviaToken<'src>>,
) {
#[cfg(debug_assertions)]
self.debug_assert_valid_recording_batch(
previous_token_location,
next_token_location,
            gap_trivia,
);
if gap_trivia.is_empty() {
return;
}
let previous_token_location = previous_token_location
.map(TriviaLocation::At)
.unwrap_or(TriviaLocation::StartOfFile);
let next_token_location = TriviaLocation::At(next_token_location);
let trivia_start = self.tokens.len();
self.tokens.append(gap_trivia);
let trivia_end = self.tokens.len();
self.after_map
.insert(previous_token_location, trivia_start..trivia_end);
self.before_map
.insert(next_token_location, trivia_start..trivia_end);
}
/// Freezes into an immutable, shareable index.
#[must_use]
#[allow(dead_code)]
pub(crate) fn into_index(self) -> TriviaIndex<'src> {
TriviaIndex {
tokens: self.tokens,
after_map: self.after_map,
before_map: self.before_map,
}
}
/// Trivia immediately after the significant token at `location`.
///
/// Returns an empty slice if `location` is not pointing at
/// a significant token or if no trivia was recorded after it.
#[must_use]
#[allow(dead_code)]
pub(crate) fn after_token(&self, location: TokenLocation) -> &[TriviaToken<'src>] {
self.slice_for(TriviaLocation::At(location), &self.after_map)
}
/// Trivia immediately before the significant token at `location`.
///
/// Returns an empty slice if `location` is not pointing at
/// a significant token or if no trivia was recorded before it.
#[must_use]
#[allow(dead_code)]
pub(crate) fn before_token(&self, location: TokenLocation) -> &[TriviaToken<'src>] {
self.slice_for(TriviaLocation::At(location), &self.before_map)
}
/// Trivia before any significant token.
#[must_use]
#[allow(dead_code)]
pub(crate) fn after_file_start(&self) -> &[TriviaToken<'src>] {
self.slice_for(TriviaLocation::StartOfFile, &self.after_map)
}
/// Trivia after the last significant token.
#[must_use]
#[allow(dead_code)]
pub(crate) fn before_file_end(&self) -> &[TriviaToken<'src>] {
self.slice_for(
TriviaLocation::At(TokenLocation::EndOfFile),
&self.before_map,
)
}
// Helper: return the recorded slice or an empty slice if none.
#[track_caller]
#[allow(dead_code)]
fn slice_for(&self, key: TriviaLocation, map: &TriviaMap) -> &[TriviaToken<'src>] {
if let Some(range) = map.get(&key) {
// Ranges are guaranteed to be valid by construction
&self.tokens[range.start..range.end]
} else {
&[]
}
}
/// Debug-only validation for `record_between_locations`'s contract.
#[cfg(debug_assertions)]
fn debug_assert_valid_recording_batch(
&mut self,
previous_token_location: Option<TokenLocation>,
next_token_location: TokenLocation,
collected: &[TriviaToken<'src>],
) {
// Prevent zero-width or reversed gaps
debug_assert!(previous_token_location < Some(next_token_location));
let previous_token_location = previous_token_location
.map(TriviaLocation::At)
.unwrap_or(TriviaLocation::StartOfFile);
let next_token_location = TriviaLocation::At(next_token_location);
// Enforce monotonic gaps: we record in file order
if let Some(last_right) = self.last_right_boundary {
debug_assert!(previous_token_location >= last_right);
}
self.last_right_boundary = Some(next_token_location);
let first_trivia_location = collected
.first()
.map(|token| TriviaLocation::At(token.location))
.expect("Provided trivia tokens array should not be empty.");
let last_trivia_location = collected
.last()
.map(|token| TriviaLocation::At(token.location))
.expect("Provided trivia tokens array should not be empty.");
// Ensure trivia lies strictly inside the gap
debug_assert!(previous_token_location < first_trivia_location);
debug_assert!(next_token_location > last_trivia_location);
// Ensure trivia locations are strictly increasing
debug_assert!(
collected
.windows(2)
.all(|window| window[0].location < window[1].location)
);
}
}
impl<'src> TriviaIndex<'src> {
/// Trivia immediately after the significant token at `location`.
///
/// Returns an empty slice if `location` is not pointing at
/// a significant token or if no trivia was recorded after it.
#[must_use]
#[allow(dead_code)]
pub(crate) fn after_token(&self, location: TokenLocation) -> &[TriviaToken<'src>] {
self.slice_for(TriviaLocation::At(location), &self.after_map)
}
/// Trivia immediately before the significant token at `location`.
///
/// Returns an empty slice if `location` is not pointing at
/// a significant token or if no trivia was recorded before it.
#[must_use]
#[allow(dead_code)]
pub(crate) fn before_token(&self, location: TokenLocation) -> &[TriviaToken<'src>] {
self.slice_for(TriviaLocation::At(location), &self.before_map)
}
/// Trivia before any significant token.
#[must_use]
#[allow(dead_code)]
pub(crate) fn after_file_start(&self) -> &[TriviaToken<'src>] {
self.slice_for(TriviaLocation::StartOfFile, &self.after_map)
}
/// Trivia after the last significant token.
#[must_use]
#[allow(dead_code)]
pub(crate) fn before_file_end(&self) -> &[TriviaToken<'src>] {
self.slice_for(
TriviaLocation::At(TokenLocation::EndOfFile),
&self.before_map,
)
}
// Helper: return the recorded slice or an empty slice if none.
#[track_caller]
#[allow(dead_code)]
fn slice_for(&self, key: TriviaLocation, map: &TriviaMap) -> &[TriviaToken<'src>] {
if let Some(range) = map.get(&key) {
// Ranges are guaranteed to be valid by construction
&self.tokens[range.start..range.end]
} else {
&[]
}
}
}

View File

@ -37,7 +37,7 @@ impl tower_lsp::LanguageServer for RottLanguageServer {
        // Measure lexing performance to track parser responsiveness.
        let start_time = std::time::Instant::now();
        let has_errors =
-           rottlib::lexer::TokenizedFile::from_source(&params.text_document.text).had_errors();
+           rottlib::lexer::TokenizedFile::from_str(&params.text_document.text).has_errors();
        let elapsed_time = start_time.elapsed();
        self.client