rott/rottlib/src/parser/grammar/expression/pratt.rs

258 lines
10 KiB
Rust

//! Core of the expression parser for Fermented `UnrealScript`.
//!
//! This module implements a Pratt-style parser for the language's expression
//! grammar, supporting:
//!
//! * Primary expressions (see [`crate::parser::primary`] for details on what
//! we consider to be a primary expression);
//! * Prefix operators;
//! * Postfix operators;
//! * Infix operators with hard-coded precedence and associativity.
//!
//! Parsing is driven by [`PrecedenceRank`], which controls how tightly
//! operators bind. Infix parsing uses the pair of binding powers returned by
//! [`super::precedence::infix_precedence_ranks`] to encode associativity.
//! The parser infrastructure supports both left- and right-associative
//! operators, but Fermented `UnrealScript` currently defines only
//! left-associative ones.
//!
//! ## Postfix operators vs. "selectors"
//!
//! Everywhere here we distinguish *selectors* like field accessor `.`,
//! function call `()` or array indices `[]` from other *postfix operators*
//! as they:
//!
//! 1. Have significantly different semantic meaning;
//! 2. Are not considered operators from `UnrealScript`'s viewpoint
//! (e.g. cannot be overloaded).
//!
//! ## See also
//!
//! - [`parser::Parser::parse_expression`] - main entry point
//! - [`PrecedenceRank`] - operator binding strengths
//! - [`super::precedence`] - operator precedence definitions
use crate::ast::{self, Expression, ExpressionRef};
use crate::lexer::TokenPosition;
use crate::parser::{
self, ParseErrorKind, ParseExpressionResult, Parser, ResultRecoveryExt, diagnostic_labels,
};
pub use super::precedence::PrecedenceRank;
/// Tells whether `expression` is one of the forms after which postfix
/// operators (such as `++` and `--`) must not be parsed.
///
/// Only postfix operators are affected by this check. Selectors, i.e.
/// field access `.x`, indexing `[i]` and calls `(args)`, are not restricted
/// by it.
///
/// Returns `true` for conditional, loop, `switch` and block expressions and
/// `false` for every other expression kind.
fn forbids_postfix_operators(expression: &ExpressionRef<'_, '_>) -> bool {
    // Look through the reference wrapper at the underlying expression node.
    let node = &**expression;
    matches!(
        node,
        // Branching constructs.
        Expression::If { .. }
            | Expression::Switch { .. }
            // Loop constructs.
            | Expression::While { .. }
            | Expression::DoUntil { .. }
            | Expression::For { .. }
            | Expression::ForEach { .. }
            // Plain blocks.
            | Expression::Block { .. }
    )
}
impl<'src, 'arena> Parser<'src, 'arena> {
// TODO: success here guarantees progress
/// Parses a single expression, starting from the loosest precedence rank.
///
/// This never fails from the caller's point of view: an expression node is
/// always returned, and any syntax errors are reported through the parser's
/// diagnostics instead.
#[must_use]
pub fn parse_expression(&mut self) -> ExpressionRef<'src, 'arena> {
    // Accept every operator, however loosely it binds.
    let parsed = self.parse_expression_with_min_precedence_rank(PrecedenceRank::LOOSEST);
    // On failure, resynchronize at the `ExpressionStart` level...
    let recovered = parsed.sync_error_until(self, parser::SyncLevel::ExpressionStart);
    // ...and substitute a fallback node so that callers always get a value.
    recovered.unwrap_or_fallback(self)
}
/// Parses an expression in a grammar position where an expression is
/// required.
///
/// This is the checked variant of [`Parser::parse_expression`]. If the next
/// token is known not to be a valid expression starter, this reports
/// `bad_start_error_kind`, consumes the bad token, and starts panic-mode
/// recovery until [`crate::parser::SyncLevel::ExpressionStart`].
///
/// `required_by_position` identifies the token or construct that created
/// the requirement for an expression. It is attached to the diagnostic with
/// the [`diagnostic_labels::EXPRESSION_REQUIRED_BY`] label.
///
/// `expression_context_position` identifies the local syntactic anchor after
/// which the expression was expected. It is attached to the diagnostic with
/// the [`diagnostic_labels::EXPRESSION_EXPECTED_AFTER`] label.
pub(super) fn parse_expression_with_start_error(
&mut self,
bad_start_error_kind: ParseErrorKind,
required_by_position: crate::lexer::TokenPosition,
expression_context_position: crate::lexer::TokenPosition,
) -> ParseExpressionResult<'src, 'arena> {
if self.next_token_definitely_cannot_start_expression() {
let error_position = self.peek_position_or_eof();
//self.advance();
return Err(self
.make_error_at(bad_start_error_kind, error_position)
.sync_error_until(self, crate::parser::SyncLevel::ExpressionStart)
.blame_token(error_position)
.related_token(
diagnostic_labels::EXPRESSION_REQUIRED_BY,
required_by_position,
)
.related_token(
diagnostic_labels::EXPRESSION_EXPECTED_AFTER,
expression_context_position,
));
}
self.parse_expression_with_min_precedence_rank(PrecedenceRank::LOOSEST)
}
pub(super) fn make_error_expression_at(
&self,
position: TokenPosition,
) -> ExpressionRef<'src, 'arena> {
crate::arena::ArenaNode::new_in(
Expression::Error,
crate::lexer::TokenSpan::new(position),
self.arena,
)
}
/// Parses an expression whose infix operators all bind at least as tightly
/// as `min_precedence_rank`.
///
/// The operand is built in stages: a prefix or primary expression, then any
/// selectors, then (where permitted) postfix operators. Infix operators are
/// folded in last.
///
/// # Errors
///
/// Propagates any error produced while parsing the prefix/primary
/// expression, its selectors, or the infix tail.
fn parse_expression_with_min_precedence_rank(
    &mut self,
    min_precedence_rank: PrecedenceRank,
) -> parser::ParseExpressionResult<'src, 'arena> {
    let operand = self.parse_prefix_or_primary()?;
    let operand = self.parse_selectors_after(operand)?;
    // Postfix operators (but not selectors, which were already handled
    // above) are rejected after expression forms that represent
    // control-flow or block constructs. Otherwise code such as
    //
    // ```unrealscript
    // if test() { do_it(); }
    // ++ counter;
    // ```
    //
    // would be ambiguous.
    //
    // UnrealScript itself never had this problem, because it did not treat
    // such constructs as expressions. The restriction should not hurt us
    // either: neither `--` nor `++` (the only default postfix operators)
    // makes any sense after such expressions anyway.
    let operand = if forbids_postfix_operators(&operand) {
        operand
    } else {
        self.parse_postfix_after(operand)
    };
    self.parse_infix_after(operand, min_precedence_rank)
}
/// Parses a prefix or primary expression (Pratt parser's "nud" or
/// null denotation).
fn parse_prefix_or_primary(&mut self) -> parser::ParseExpressionResult<'src, 'arena> {
let (token, token_lexeme, token_position) =
self.require_token_lexeme_and_position(ParseErrorKind::ExpressionExpected)?;
// Avoid advancing over an obviously wrong token;
// this prevents error cases like `new(Outer, Name, 7 +) SomeClass`.
if token.is_definitely_not_expression_start() {
return Err(
self.make_error_at(ParseErrorKind::ExpressionExpected, token_position)
);
}
self.advance();
if let Ok(operator) = ast::PrefixOperator::try_from(token) {
// In UnrealScript, prefix and postfix operators bind tighter than
// any infix operators, so we can safely parse the right hand side
// at the tightest precedence.
let right_hand_side = self
.parse_expression_with_min_precedence_rank(PrecedenceRank::TIGHTEST)
.related_token("prefix_operator", token_position)?;
Ok(Expression::new_prefix(
self.arena,
token_position,
operator,
right_hand_side,
))
} else {
self.parse_primary_from_current_token(token, token_lexeme, token_position)
}
}
/// Consumes every postfix operator that directly follows `left_hand_side`.
///
/// Each operator wraps the expression built so far in a new postfix node,
/// so `left_hand_side` ends up as the innermost child of the returned tree.
/// When no postfix operator follows, `left_hand_side` is returned unchanged.
fn parse_postfix_after(
    &mut self,
    mut left_hand_side: ExpressionRef<'src, 'arena>,
) -> ExpressionRef<'src, 'arena> {
    loop {
        // Stop as soon as the upcoming token is not a postfix operator.
        let Some((operator, operator_position)) = self.peek_postfix_with_position() else {
            return left_hand_side;
        };
        self.advance();
        left_hand_side =
            Expression::new_postfix(self.arena, left_hand_side, operator, operator_position);
    }
}
/// Folds a run of infix operators into `left_hand_side`.
///
/// Only operators that bind at least as tightly as `min_precedence_rank`
/// are accepted; parsing stops at the first operator that is looser than
/// that, or at the first token that is not an infix operator at all.
///
/// The rank used for each right-hand side comes from
/// [`super::precedence::infix_precedence_ranks`], which is how
/// associativity is encoded.
///
/// # Errors
///
/// Propagates any error raised while parsing a right-hand side, with the
/// operator's position attached under the `"infix_operator"` label.
fn parse_infix_after(
    &mut self,
    mut left_hand_side: ExpressionRef<'src, 'arena>,
    min_precedence_rank: PrecedenceRank,
) -> parser::ParseExpressionResult<'src, 'arena> {
    loop {
        let Some((infix_operator, right_rank)) =
            self.peek_infix_with_min_precedence_rank(min_precedence_rank)
        else {
            return Ok(left_hand_side);
        };
        // Consume the operator first, then read its position back as the
        // most recently consumed token.
        self.advance();
        let operator_position = self.last_consumed_position_or_start();
        let right_hand_result = self.parse_expression_with_min_precedence_rank(right_rank);
        let right_hand_side =
            right_hand_result.related_token("infix_operator", operator_position)?;
        left_hand_side = Expression::new_binary(
            self.arena,
            left_hand_side,
            infix_operator,
            right_hand_side,
        );
    }
}
/// Peeks at the next token and, if it is a postfix operator, returns that
/// operator together with the token's position.
///
/// Returns [`None`] when no token can be peeked or when the token does not
/// convert into an [`ast::PostfixOperator`].
///
/// Doing the lookup and the conversion in one place lets the postfix loop
/// run without peeking twice or unwrapping.
fn peek_postfix_with_position(
    &mut self,
) -> Option<(ast::PostfixOperator, crate::lexer::TokenPosition)> {
    let (token, token_position) = self.peek_token_and_position()?;
    ast::PostfixOperator::try_from(token)
        .ok()
        .map(|operator| (operator, token_position))
}
/// Peeks at the next token and checks whether it is an infix operator that
/// may be parsed at the current precedence level.
///
/// An operator qualifies when its left binding power is at least
/// `min_precedence_rank`. For a qualifying operator this returns the
/// operator together with its right binding power, i.e. the minimum
/// precedence rank to use when parsing its right-hand side.
///
/// Returns [`None`] when there is no next token, when the token is not an
/// infix operator, or when the operator binds more loosely than
/// `min_precedence_rank`.
fn peek_infix_with_min_precedence_rank(
    &mut self,
    min_precedence_rank: PrecedenceRank,
) -> Option<(ast::InfixOperator, PrecedenceRank)> {
    let next_token = self.peek_token()?;
    let (left_rank, operator, right_rank) =
        super::precedence::infix_precedence_ranks(next_token)?;
    if left_rank.is_looser_than(min_precedence_rank) {
        None
    } else {
        Some((operator, right_rank))
    }
}
}