rott/rottlib/src/parser/grammar/expression/literals.rs

124 lines
5.5 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Literal decoding for Fermented `UnrealScript`.
//!
//! This module defines the semantic rules for interpreting literal tokens
//! produced by the lexer. It is responsible only for *decoding* the textual
//! representation of literals into their internal values.
//!
//! The rules implemented here intentionally mirror the quirks of
//! Unreal Engine 2's `UnrealScript`.
use crate::parser::{ParseErrorKind, ParseResult};
impl<'src, 'arena> crate::parser::Parser<'src, 'arena> {
/// Decodes an integer literal string into [`u128`].
///
/// Syntax:
/// - Optional base prefix: `0b` | `0o` | `0x` (case-insensitive).
///   No prefix -> decimal.
/// - Digits must match the base: `0-1` (binary), `0-7` (octal),
///   `0-9` (decimal) or `0-9`/`a-f`/`A-F` (hexadecimal).
/// - Underscores are allowed and ignored (e.g., `1_000`, `0xDE_AD`).
/// - No leading sign; parsed as a non-negative magnitude.
/// - Must fit within [`u128`].
///
/// Examples: `42`, `0b1010_0011`, `0o755`, `0xDEAD_BEEF`.
///
/// # Errors
///
/// Returns [`ParseErrorKind::InvalidNumericLiteral`] (built through
/// `make_error_at_last_consumed`) when, after removing underscores, the
/// literal has no digits, starts with a sign, contains a character that is
/// not a digit of the selected base, or does not fit into [`u128`].
pub(crate) fn decode_integer_literal(&self, literal: &str) -> ParseResult<'src, 'arena, u128> {
    // `split_at_checked` yields `None` for literals shorter than two bytes
    // (or when byte 2 is not a character boundary); those are decimal.
    let (base, content) = match literal.split_at_checked(2) {
        Some(("0b" | "0B", stripped)) => (2, stripped),
        Some(("0o" | "0O", stripped)) => (8, stripped),
        Some(("0x" | "0X", stripped)) => (16, stripped),
        _ => (10, literal),
    };
    let digits: String = content.chars().filter(|&character| character != '_').collect();
    // `from_str_radix` tolerates a single leading `+`, but the literal
    // syntax has no sign, so such input must be rejected explicitly.
    if digits.starts_with('+') {
        return Err(self.make_error_at_last_consumed(ParseErrorKind::InvalidNumericLiteral));
    }
    u128::from_str_radix(&digits, base)
        .map_err(|_| self.make_error_at_last_consumed(ParseErrorKind::InvalidNumericLiteral))
}
/// Decodes a float literal as `f64`, following the permissive and only
/// partially documented behavior of `UnrealScript`.
///
/// Unreal Engine 2 does not define a precise and consistent set of rules
/// for float literals and the original compiler contains several quirks.
/// Because of this, the text is first normalized using a small set of
/// UnrealScript-specific rules and then handed to Rust's `f64` parser.
///
/// Rules implemented here:
/// - Only decimal floats and the special literals understood by Rust's
///   `f64` parser (e.g. `NaN`, `inf`) are supported (no hex or binary
///   formats).
/// - A single trailing `f` or `F`, if present, is removed before parsing.
/// - The literal text is scanned for periods (`.`). If a second period
///   is found, everything from that second `.` onward is discarded.
///
///   Examples:
///   * `1.2.3e4` becomes `1.2`
///   * `1.2e3.4` becomes `1.2e3`
///
/// - After this truncation step, the remaining text is interpreted as a
///   normal Rust `f64` literal. This means it may contain digits, at
///   most one decimal point, and an optional exponent part (for example
///   `e3` or `E-2`), but it must otherwise follow Rust's `f64` syntax.
///   Underscores, spaces, and other unsupported characters cause a
///   parse error.
/// - If the normalized text does not parse, the unmodified literal is
///   tried as a last resort. This is what lets `inf` through: its final
///   `f` would otherwise be mistaken for the float suffix.
///
/// # Errors
///
/// Returns [`ParseErrorKind::InvalidNumericLiteral`] (built through
/// `make_error_at_last_consumed`) when neither the normalized text nor the
/// unmodified literal is accepted by the `f64` parser.
pub(crate) fn decode_float_literal(&self, literal: &str) -> ParseResult<'src, 'arena, f64> {
    let content = literal.strip_suffix(['f', 'F']).unwrap_or(literal);
    // Truncate at the second '.', matching UnrealScript behavior.
    let content = content
        .match_indices('.')
        .nth(1)
        .and_then(|(period_index, _)| content.get(..period_index))
        .unwrap_or(content);
    content
        .parse::<f64>()
        // Suffix stripping turns `inf` into `in`; retrying with the original
        // text recovers it. The retry cannot change any result that the
        // normalized text already produced, since it only runs on failure.
        .or_else(|_| literal.parse::<f64>())
        .map_err(|_| self.make_error_at_last_consumed(ParseErrorKind::InvalidNumericLiteral))
}
/// Resolves backslash escapes in a string token and stores the result in
/// the arena.
///
/// `raw_string` is expected to be the token text with the surrounding
/// quotes already removed.
///
/// Escape handling:
/// - `\n` becomes a line feed and `\t` becomes a tab.
/// - Any other escaped character (including `\"` and `\\`) is emitted as
///   that character with the backslash dropped, which matches
///   `UnrealScript` behavior.
/// - A lone `\` at the very end of the input (not expected from
///   well-formed tokens) is silently dropped.
pub(crate) fn unescape_string_literal(
    &self,
    raw_string: &str,
) -> crate::arena::ArenaString<'arena> {
    // The output is never longer than the input, so one allocation suffices.
    let mut unescaped = String::with_capacity(raw_string.len());
    let mut remaining = raw_string.chars();
    loop {
        let decoded = match remaining.next() {
            None => break,
            Some('\\') => match remaining.next() {
                Some('n') => '\n',
                Some('t') => '\t',
                // Covers `\"`, `\\` and every unrecognized escape: the
                // escaped character is kept as-is.
                Some(escaped) => escaped,
                // Dangling backslash at the end of input: nothing follows,
                // so it is ignored.
                None => break,
            },
            Some(plain) => plain,
        };
        unescaped.push(decoded);
    }
    self.arena.string(&unescaped)
}
}