124 lines
5.5 KiB
Rust
124 lines
5.5 KiB
Rust
//! Literal decoding for Fermented `UnrealScript`.
//!
//! This module defines the semantic rules for interpreting literal tokens
//! produced by the lexer. It is responsible only for *decoding* the textual
//! representation of literals into their internal values.
//!
//! The rules implemented here intentionally mirror the quirks of
//! Unreal Engine 2’s `UnrealScript`.
|
||
use crate::parser::{ParseErrorKind, ParseResult};
|
||
|
||
impl<'src, 'arena> crate::parser::Parser<'src, 'arena> {
|
||
/// Decodes an integer literal string into [`u128`].
|
||
///
|
||
/// Syntax:
|
||
/// - Optional base prefix: `0b` | `0o` | `0x` (case-insensitive).
|
||
/// No prefix -> decimal.
|
||
/// - Digits must match the base (`0-1`/`0-7`/`0-9A-F`).
|
||
/// - Underscores are allowed and ignored (e.g., `1_000`, `0xDE_AD`).
|
||
/// - No leading sign; parsed as a non-negative magnitude.
|
||
/// - Must fit within [`u128`].
|
||
///
|
||
/// Examples: `42`, `0b1010_0011`, `0o755`, `0xDEAD_BEEF`.
|
||
///
|
||
/// On failure, returns [`ParseErrorKind::InvalidNumericLiteral`] at
|
||
/// the parser's current cursor position.
|
||
pub(crate) fn decode_integer_literal(&self, literal: &str) -> ParseResult<'src, 'arena, u128> {
|
||
let (base, content) = match literal.split_at_checked(2) {
|
||
Some(("0b" | "0B", stripped)) => (2, stripped),
|
||
Some(("0o" | "0O", stripped)) => (8, stripped),
|
||
Some(("0x" | "0X", stripped)) => (16, stripped),
|
||
_ => (10, literal),
|
||
};
|
||
let digits_without_underscores = content.replace('_', "");
|
||
u128::from_str_radix(&digits_without_underscores, base)
|
||
.map_err(|_| self.make_error_at_last_consumed(ParseErrorKind::InvalidNumericLiteral))
|
||
}
|
||
|
||
/// Decodes a float literal as `f64`, following the permissive and only
|
||
/// partially documented behavior of `UnrealScript`.
|
||
///
|
||
/// Unreal Engine 2 does not define a precise and consistent set of rules
|
||
/// for float literals and the original compiler contains several quirks.
|
||
/// Because of this, we default to normalizing the text using a small set of
|
||
/// UnrealScript-specific rules and then parse the result using rust's
|
||
/// `f64` parser.
|
||
///
|
||
/// Rules implemented here:
|
||
/// - Only decimal floats and special literals (e.g. `NaN`, `inf`)
|
||
/// are supported (no hex or binary formats).
|
||
/// - A single trailing `f` or `F`, if present, is removed before parsing.
|
||
/// - The literal text is scanned for periods (`.`). If a second period
|
||
/// is found, everything from that second `.` onward is discarded.
|
||
///
|
||
/// Examples:
|
||
/// * `1.2.3e4` becomes `1.2`
|
||
/// * `1.2e3.4` becomes `1.2e3`
|
||
///
|
||
/// - After this truncation step, the remaining text is interpreted as a
|
||
/// normal rust `f64` literal. This means it may contain digits, at
|
||
/// most one decimal point, and an optional exponent part (for example
|
||
/// `e3` or `E-2`), but it must otherwise follow rust's `f64` syntax.
|
||
/// Underscores, spaces, and other unsupported characters cause a
|
||
/// parse error.
|
||
///
|
||
/// On failure, this function returns
|
||
/// [`ParseErrorKind::InvalidNumericLiteral`] at the current parser
|
||
/// position.
|
||
pub(crate) fn decode_float_literal(&self, literal: &str) -> ParseResult<'src, 'arena, f64> {
|
||
let content = literal
|
||
.strip_suffix('f')
|
||
.or_else(|| literal.strip_suffix('F'))
|
||
.unwrap_or(literal);
|
||
// Truncate after the second '.', matching UnrealScript behavior
|
||
let content = content
|
||
.match_indices('.')
|
||
.nth(1)
|
||
.and_then(|(period_index, _)| content.get(..period_index))
|
||
.unwrap_or(content);
|
||
content
|
||
.parse::<f64>()
|
||
.map_err(|_| self.make_error_at_last_consumed(ParseErrorKind::InvalidNumericLiteral))
|
||
}
|
||
|
||
/// Unescapes a tokenized string literal into an arena string.
|
||
///
|
||
/// Supported escapes: `\n`, `\t`, `\"`, `\\`.
|
||
/// Unknown escapes drop the backslash and emit the character unchanged
|
||
/// (`UnrealScript` behavior).
|
||
/// If `raw_string` ends with a trailing `\` (which should not happen for
|
||
/// well-formed tokens), that backslash is simply ignored.
|
||
///
|
||
/// This function assumes `raw_string` is the token text without surrounding
|
||
/// quotes.
|
||
pub(crate) fn unescape_string_literal(
|
||
&self,
|
||
raw_string: &str,
|
||
) -> crate::arena::ArenaString<'arena> {
|
||
let mut buffer = String::with_capacity(raw_string.len());
|
||
let mut characters = raw_string.chars();
|
||
while let Some(next_character) = characters.next() {
|
||
if next_character == '\\' {
|
||
// Under the lexer contract, string tokens do not end with a lone
|
||
// backslash, so there is always a following character. If this
|
||
// invariant is broken, the final '\' is simply ignored here.
|
||
if let Some(escaped_character) = characters.next() {
|
||
match escaped_character {
|
||
'n' => buffer.push('\n'),
|
||
't' => buffer.push('\t'),
|
||
'"' => buffer.push('"'),
|
||
'\\' => buffer.push('\\'),
|
||
// Simply leaving the escaped character matches
|
||
// UnrealScript behavior.
|
||
unrecognized_escape_char => buffer.push(unrecognized_escape_char),
|
||
}
|
||
}
|
||
} else {
|
||
buffer.push(next_character);
|
||
}
|
||
}
|
||
self.arena.string(&buffer)
|
||
}
|
||
}
|