rott/rottlib/src/lexer/raw_lexer.rs
dkanus 588790b9b4 Refactor everything
Huge dump of refactored code. Still in the middle of the changes that
are to be squashed later in a one huge monster commit, because there is
no value in anything atomic here.
2026-04-05 20:32:11 +07:00

633 lines
18 KiB
Rust

//! Lexer for `UnrealScript` that understands inline `cpptext { ... }` blocks.
//!
//! ## Notable details
//!
//! In `UnrealScript`, `cpptext` (and likewise `cppstruct`) lets authors embed
//! raw C++ between braces. Because whitespace, newlines, or comments may appear
//! between the keyword and the opening `{`, the lexer must remember that it has
//! just seen such a keyword - hence a state machine.
//!
//! ## Modes
//!
//! - **Normal** - ordinary `UnrealScript` `RawTokens`.
//! - **`AwaitingCppBlock`** - after `cpptext` or `cppstruct` whose next
//!   non-trivia character is `{`; waiting for that `{` to be lexed.
//!
//! When that brace arrives, the lexer consumes the entire C++ block as
//! one `RawToken` (`RawToken::Brace(BraceKind::CppBlock)`), tracking nested
//! braces, strings, and comments on the way. If the closing `}` is
//! missing, everything to EOF is treated as C++; downstream parsers must
//! handle that gracefully.
use logos::Lexer;
/// Current mode of the lexer's small state machine.
///
/// The module-level documentation explains why the mode is needed.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
enum LexerMode {
    /// Plain `UnrealScript` lexing; this is the initial mode.
    #[default]
    Normal,
    /// A `cpptext`/`cppstruct` keyword has been lexed and the opening `{` of
    /// its C++ block has not been reached yet.
    AwaitingCppBlock,
}
/// State carried alongside the lexer as its logos `extras` value.
///
/// At the moment the only thing stored is the active [`LexerMode`]. The type
/// exists purely to satisfy logos and is not meant to be inspected by callers.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct LexerState {
    /// Mode the lexer is currently in; starts out as the default mode.
    mode: LexerMode,
}
/// Tells apart the two meanings a `{` token can have.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BraceKind {
    /// A regular `UnrealScript` opening brace.
    Normal,
    /// The opening brace of an embedded C++ block; the token extends through
    /// the block's matching `}`.
    CppBlock,
}
/// Tokens produced by the `UnrealScript` lexer.
///
/// Includes both syntactic tokens and trivia such as whitespace, newlines,
/// and comments.
///
/// Keywords are declared with `ignore(case)` (or a `(?i)` regex), so they are
/// matched case-insensitively. Operators and punctuation are matched verbatim.
/// The lexer carries a [`LexerState`] as its logos `extras`, which the
/// `cpptext`/`cppstruct` and `{` callbacks use to recognize embedded C++ blocks.
#[derive(logos::Logos, Debug, PartialEq, Eq, Hash, Clone, Copy)]
#[logos(extras = LexerState)]
pub enum RawToken {
// # Compiler/directive keywords
/// An `#exec` directive (case-insensitive), matched through the end of its
/// line, including the line terminator when one is present.
#[regex(r"(?i)#exec[^\r\n]*(?:\r\n|\n|\r)?")]
ExecDirective,
/// The `cpptext` keyword (case-insensitive).
///
/// The callback switches the lexer to `AwaitingCppBlock` only when the next
/// non-trivia character is `{`; otherwise it resets the mode to `Normal`.
#[regex("(?i)cpptext", |lex| {
if is_next_nontrivia_left_brace(lex) {
lex.extras.mode = LexerMode::AwaitingCppBlock;
} else {
lex.extras.mode = LexerMode::Normal;
}
})]
CppText,
/// The `cppstruct` keyword (case-insensitive); its callback updates the
/// lexer mode exactly as the `cpptext` callback does.
#[regex("(?i)cppstruct", |lex| {
if is_next_nontrivia_left_brace(lex) {
lex.extras.mode = LexerMode::AwaitingCppBlock;
} else {
lex.extras.mode = LexerMode::Normal;
}
})]
CppStruct,
// # Declaration & structural keywords
//#[regex("(?i)class")]
#[token("class", ignore(case))]
Class,
#[token("struct", ignore(case))]
Struct,
#[token("enum", ignore(case))]
Enum,
#[token("state", ignore(case))]
State,
#[token("auto", ignore(case))]
Auto,
#[token("function", ignore(case))]
Function,
#[token("event", ignore(case))]
Event,
#[token("delegate", ignore(case))]
Delegate,
#[token("var", ignore(case))]
Var,
#[token("local", ignore(case))]
Local,
// # Inheritance, interface, dependencies
#[token("extends", ignore(case))]
Extends,
#[token("dependson", ignore(case))]
DependsOn,
// # Access modifiers & properties
#[token("private", ignore(case))]
Private,
#[token("protected", ignore(case))]
Protected,
#[token("public", ignore(case))]
Public,
#[token("const", ignore(case))]
Const,
#[token("static", ignore(case))]
Static,
#[token("native", ignore(case))]
Native,
#[token("abstract", ignore(case))]
Abstract,
#[token("deprecated", ignore(case))]
Deprecated,
#[token("safereplace", ignore(case))]
SafeReplace,
#[token("exportstructs", ignore(case))]
ExportStructs,
#[token("input", ignore(case))]
Input,
// # UnrealScript metadata/specifiers
#[token("final", ignore(case))]
Final,
#[token("default", ignore(case))]
Default,
#[token("defaultproperties", ignore(case))]
DefaultProperties,
#[token("object", ignore(case))]
Object,
#[token("begin", ignore(case))]
Begin,
#[token("end", ignore(case))]
End,
#[token("optional", ignore(case))]
Optional,
#[token("config", ignore(case))]
Config,
#[token("perobjectconfig", ignore(case))]
PerObjectConfig,
#[token("globalconfig", ignore(case))]
GlobalConfig,
#[token("collapsecategories", ignore(case))]
CollapseCategories,
#[token("dontcollapsecategories", ignore(case))]
DontCollapseCategories,
#[token("hidecategories", ignore(case))]
HideCategories,
#[token("showcategories", ignore(case))]
ShowCategories,
#[token("localized", ignore(case))]
Localized,
#[token("placeable", ignore(case))]
Placeable,
#[token("notplaceable", ignore(case))]
NotPlaceable,
#[token("instanced", ignore(case))]
Instanced,
#[token("editconst", ignore(case))]
EditConst,
#[token("editconstarray", ignore(case))]
EditConstArray,
#[token("editinline", ignore(case))]
EditInline,
#[token("editinlineuse", ignore(case))]
EditInlineUse,
#[token("editinlinenew", ignore(case))]
EditInlineNew,
#[token("noteditinlinenew", ignore(case))]
NotEditInlineNew,
#[token("edfindable", ignore(case))]
EdFindable,
#[token("editinlinenotify", ignore(case))]
EditInlineNotify,
#[token("parseconfig", ignore(case))]
ParseConfig,
#[token("automated", ignore(case))]
Automated,
#[token("dynamicrecompile", ignore(case))]
DynamicRecompile,
#[token("transient", ignore(case))]
Transient,
#[token("long", ignore(case))]
Long,
#[token("operator", ignore(case))]
Operator,
#[token("preoperator", ignore(case))]
PreOperator,
#[token("postoperator", ignore(case))]
PostOperator,
#[token("simulated", ignore(case))]
Simulated,
#[token("exec", ignore(case))]
Exec,
#[token("latent", ignore(case))]
Latent,
#[token("iterator", ignore(case))]
Iterator,
#[token("out", ignore(case))]
Out,
#[token("skip", ignore(case))]
Skip,
#[token("singular", ignore(case))]
Singular,
#[token("coerce", ignore(case))]
Coerce,
#[token("assert", ignore(case))]
Assert,
#[token("ignores", ignore(case))]
Ignores,
#[token("within", ignore(case))]
Within,
#[token("init", ignore(case))]
Init,
#[token("export", ignore(case))]
Export,
#[token("noexport", ignore(case))]
NoExport,
#[token("hidedropdown", ignore(case))]
HideDropdown,
#[token("travel", ignore(case))]
Travel,
#[token("cache", ignore(case))]
Cache,
#[token("cacheexempt", ignore(case))]
CacheExempt,
// # Replication-related
#[token("reliable", ignore(case))]
Reliable,
#[token("unreliable", ignore(case))]
Unreliable,
#[token("replication", ignore(case))]
Replication,
#[token("nativereplication", ignore(case))]
NativeReplication,
// # Control-flow keywords
#[token("goto", ignore(case))]
Goto,
#[token("if", ignore(case))]
If,
#[token("else", ignore(case))]
Else,
#[token("switch", ignore(case))]
Switch,
#[token("case", ignore(case))]
Case,
#[token("for", ignore(case))]
For,
#[token("foreach", ignore(case))]
ForEach,
#[token("while", ignore(case))]
While,
#[token("do", ignore(case))]
Do,
#[token("until", ignore(case))]
Until,
#[token("break", ignore(case))]
Break,
#[token("continue", ignore(case))]
Continue,
#[token("return", ignore(case))]
Return,
// # Built-in types
#[token("int", ignore(case))]
Int,
#[token("float", ignore(case))]
Float,
#[token("bool", ignore(case))]
Bool,
#[token("byte", ignore(case))]
Byte,
#[token("string", ignore(case))]
String,
#[token("array", ignore(case))]
Array,
#[token("name", ignore(case))]
Name,
// FloatLiteral must come before IntegerLiteral and '.'
// to have higher priority.
// It also recognizes things like: `1.foo`, `1.foo.bar`, `1.2.3`.
// It has to. Because UnrealScript is a pile of-... wonderful language,
// where everything is possible.
// NOTE(review): logos is documented to prefer the longest match and then a
// priority computed from the pattern, so declaration order may not be what
// gives this variant precedence - confirm before relying on the ordering.
/// Floating-point literal, with an optional `f`/`F` suffix.
///
/// Three shapes are accepted: digits followed by one or more `.segment`
/// parts (each segment being digits or an identifier), a decimal with digits
/// on at least one side of the `.` and an optional exponent, and digits with
/// a mandatory exponent.
#[regex(r"[0-9]+(?:\.(?:[0-9]+|[A-Za-z_][A-Za-z0-9_]*))+[fF]?")]
#[regex(r"(?:[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)(?:[eE][+-]?[0-9]+)?[fF]?")]
#[regex(r"[0-9]+[eE][+-]?[0-9]+[fF]?")]
FloatLiteral,
/// Integer literal: binary (`0b`), octal (`0o`), hexadecimal (`0x`) or
/// decimal. The prefixed forms allow a single `_` between digits; the
/// prefixes themselves are matched in lowercase only.
#[regex(r"0b[01](?:_?[01])*")]
#[regex(r"0o[0-7](?:_?[0-7])*")]
#[regex(r"0x[0-9A-Fa-f](?:_?[0-9A-Fa-f])*")]
#[regex(r"[0-9][0-9]*")]
IntegerLiteral,
/// Double-quoted string literal with backslash escapes; an unescaped `"`,
/// `\r` or `\n` cannot appear inside it.
#[regex(r#""([^"\\\r\n]|\\.)*""#)]
StringLiteral,
/// Single-quoted name literal made of ASCII letters, digits, `_`, `.`,
/// spaces and `-`; it may be empty.
#[regex(r"'[a-zA-Z0-9_\. \-]*'")]
NameLiteral,
#[token("true", ignore(case))]
True,
#[token("false", ignore(case))]
False,
#[token("none", ignore(case))]
None,
#[token("self", ignore(case))]
SelfValue,
#[token("new", ignore(case))]
New,
/// Identifier: an ASCII letter or `_` followed by ASCII letters, digits
/// or `_`.
#[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")]
Identifier,
// # Operations
// ## Exponentiation
#[token("**")]
Exponentiation,
// ## Unary
#[token("++")]
Increment,
#[token("--")]
Decrement,
#[token("!")]
Not,
#[token("~")]
BitwiseNot,
// ## Vector
#[token("dot", ignore(case))]
Dot,
#[token("cross", ignore(case))]
Cross,
// ## Multiplicative
#[token("*")]
Multiply,
#[token("/")]
Divide,
#[token("%")]
Modulo,
// ## Additive
#[token("+")]
Plus,
#[token("-")]
Minus,
// ## String manipulation
#[token("@")]
ConcatSpace,
#[token("$")]
Concat,
// ## Shifts
#[token("<<")]
LeftShift,
#[token(">>>")]
LogicalRightShift,
#[token(">>")]
RightShift,
// ## Relational
#[token("<")]
Less,
#[token("<=")]
LessEqual,
#[token(">")]
Greater,
#[token(">=")]
GreaterEqual,
#[token("==")]
Equal,
#[token("!=")]
NotEqual,
#[token("~=")]
ApproximatelyEqual,
#[token("clockwisefrom", ignore(case))]
ClockwiseFrom,
// ## Bitwise
#[token("&")]
BitwiseAnd,
#[token("|")]
BitwiseOr,
#[token("^")]
BitwiseXor,
// ## Logical
#[token("&&")]
LogicalAnd,
#[token("^^")]
LogicalXor,
#[token("||")]
LogicalOr,
// ## Assignments
#[token("=")]
Assign,
#[token("*=")]
MultiplyAssign,
#[token("/=")]
DivideAssign,
#[token("%=")]
ModuloAssign,
#[token("+=")]
PlusAssign,
#[token("-=")]
MinusAssign,
#[token("$=")]
ConcatAssign,
#[token("@=")]
ConcatSpaceAssign,
// # Punctuation & delimiters
#[token("(")]
LeftParenthesis,
#[token(")")]
RightParenthesis,
/// An opening `{`.
///
/// The `process_left_brace` callback supplies the [`BraceKind`]: an ordinary
/// brace in `Normal` mode, or a whole embedded C++ block (consumed through
/// its closing `}`) when the lexer was awaiting one.
#[token("{", process_left_brace)]
Brace(BraceKind),
#[token("}")]
RightBrace,
#[token("[")]
LeftBracket,
#[token("]")]
RightBracket,
#[token(";")]
Semicolon,
#[token(",")]
Comma,
#[token(".")]
Period,
#[token(":")]
Colon,
#[token("#")]
Hash,
#[token("?")]
Question,
// # Comments & whitespaces
/// A `//` comment up to, but not including, the line terminator.
#[regex(r"//[^\r\n]*")]
LineComment,
/// A `/* ... */` comment; the pattern matches only the opening `/*` and the
/// `handle_block_comment` callback consumes the rest, honoring nesting.
#[regex(r"/\*", handle_block_comment)]
BlockComment,
/// A single line terminator: `\r\n`, `\n` or `\r`.
#[regex(r"\r\n|\n|\r")]
Newline,
/// A run of spaces and/or tabs.
#[regex(r"[ \t]+")]
Whitespace,
// # Technical
/// Has no `#[token]`/`#[regex]` attribute, so no pattern ever produces it.
/// NOTE(review): presumably used by callers as the stand-in for input the
/// lexer could not match - confirm against the code that wraps this lexer.
Error,
}
/// Logos callback that finishes lexing an `UnrealScript` block comment.
///
/// The opening `/*` has already been matched by the token pattern. This
/// callback extends the token through the `*/` that closes it, counting nested
/// `/* ... */` pairs on the way. When the comment is never closed, the token
/// extends to the end of input.
fn handle_block_comment(lexer: &mut Lexer<RawToken>) {
    let rest = lexer.remainder();
    // One comment is already open: the `/*` matched by the token pattern.
    let mut open_comments = 1_usize;
    let mut consumed = 0_usize;
    while open_comments > 0 {
        let tail = &rest[consumed..];
        if tail.starts_with("/*") {
            open_comments += 1;
            consumed += 2;
        } else if tail.starts_with("*/") {
            open_comments -= 1;
            consumed += 2;
        } else if let Some(character) = tail.chars().next() {
            // Advance by whole characters so `consumed` stays on a boundary.
            consumed += character.len_utf8();
        } else {
            // Ran out of input before the comment was closed.
            break;
        }
    }
    lexer.bump(consumed);
}
/// Logos callback for `{`: classifies the brace based on the lexer mode.
///
/// In `Normal` mode the brace is an ordinary one and [`BraceKind::Normal`] is
/// returned. In `AwaitingCppBlock` mode (entered after `cpptext` or
/// `cppstruct`) the whole embedded C++ block is consumed into this token and
/// [`BraceKind::CppBlock`] is returned. Either way the mode is `Normal`
/// afterwards.
fn process_left_brace(lexer: &mut Lexer<RawToken>) -> BraceKind {
    // Writing `Normal` over an already-`Normal` mode changes nothing, so the
    // reset can be done unconditionally while capturing the previous mode.
    let previous_mode = std::mem::replace(&mut lexer.extras.mode, LexerMode::Normal);
    match previous_mode {
        LexerMode::Normal => BraceKind::Normal,
        LexerMode::AwaitingCppBlock => {
            consume_cpp_block(lexer);
            BraceKind::CppBlock
        }
    }
}
/// Extends the current token over an entire embedded C++ block.
///
/// The caller must already have consumed the opening `{`. Scanning stops right
/// after the `}` that balances it, or at end of input if the block is never
/// closed. Braces are not counted inside:
/// - string and character literals (`"..."`, `'...'`), escapes included
/// - line comments (`// ...`)
/// - block comments (`/* ... */`)
///
/// Only UE2-era C++ is targeted, so nothing fancier is recognized.
fn consume_cpp_block(lexer: &mut Lexer<RawToken>) {
    // The caller's `{` counts as the first open brace.
    let mut open_braces = 1_usize;
    while open_braces > 0 {
        let rest = lexer.remainder();
        let Some(current) = rest.chars().next() else {
            // Unterminated block: everything up to EOF belongs to it.
            return;
        };
        if rest.starts_with("/*") {
            lexer.bump(2);
            consume_c_style_block_comment(lexer);
        } else if rest.starts_with("//") {
            lexer.bump(2);
            // Skip the comment text plus the first line-break character.
            let comment = lexer.remainder();
            let length = comment
                .find(|character: char| character == '\n' || character == '\r')
                .map_or(comment.len(), |line_break| line_break + 1);
            lexer.bump(length);
        } else {
            lexer.bump(current.len_utf8());
            match current {
                '{' => open_braces += 1,
                '}' => open_braces -= 1,
                '"' | '\'' => consume_quoted_cpp_literal(lexer, current),
                _ => {}
            }
        }
    }
}
/// Skips the body of a C-style `/* ... */` comment, which does not nest.
///
/// The opening `/*` must already have been consumed. The lexer ends up just
/// past the first `*/`, or at end of input when the comment is unterminated.
fn consume_c_style_block_comment(lexer: &mut Lexer<RawToken>) {
    let rest = lexer.remainder();
    let length = rest
        .find("*/")
        .map_or(rest.len(), |terminator| terminator + 2);
    lexer.bump(length);
}
/// Skips the remainder of a quoted C++ string or character literal.
///
/// The opening quote must already have been consumed; `delimiter` is that
/// quote character. A backslash escapes the character after it, so an escaped
/// delimiter does not end the literal. The lexer ends up just past the closing
/// delimiter, or at end of input when the literal is unterminated.
fn consume_quoted_cpp_literal(lexer: &mut Lexer<RawToken>, delimiter: char) {
    let rest = lexer.remainder();
    // Assume the literal is unterminated until a closing delimiter is found.
    let mut consumed = rest.len();
    let mut characters = rest.char_indices();
    while let Some((position, character)) = characters.next() {
        if character == '\\' {
            // Drop whatever follows the backslash (if anything) unexamined.
            characters.next();
        } else if character == delimiter {
            consumed = position + character.len_utf8();
            break;
        }
    }
    lexer.bump(consumed);
}
/// Looks past any trivia after the current token and reports whether the
/// first significant character is `{`. The lexer itself is not advanced.
///
/// The following count as trivia:
/// - spaces and tabs
/// - line breaks (`\r`, `\n`, or `\r\n`)
/// - line comments (`// ...`)
/// - block comments (`/* ... */`), nested ones included
///
/// The `cpptext` and `cppstruct` callbacks use this because arbitrary trivia
/// may sit between the keyword and the `{` of its embedded C++ block.
///
/// Returns `false` when the input ends before a significant character is
/// found, including inside an unterminated comment.
fn is_next_nontrivia_left_brace(lexer: &Lexer<RawToken>) -> bool {
    let mut text = lexer.remainder();
    loop {
        text = text.trim_start_matches(|character: char| {
            matches!(character, ' ' | '\t' | '\r' | '\n')
        });
        if let Some(comment) = text.strip_prefix("//") {
            // Resume right after the first line-break character, if any.
            text = match comment.find(|character: char| character == '\n' || character == '\r') {
                Some(line_break) => &comment[line_break + 1..],
                None => "",
            };
        } else if let Some(comment) = text.strip_prefix("/*") {
            let mut body = comment;
            let mut open_comments = 1_usize;
            while open_comments > 0 {
                if let Some(after_open) = body.strip_prefix("/*") {
                    open_comments += 1;
                    body = after_open;
                } else if let Some(after_close) = body.strip_prefix("*/") {
                    open_comments -= 1;
                    body = after_close;
                } else {
                    let Some(skipped) = body.chars().next() else {
                        // Input ended inside the comment.
                        return false;
                    };
                    body = &body[skipped.len_utf8()..];
                }
            }
            text = body;
        } else {
            // Either a significant character or the end of input.
            return text.starts_with('{');
        }
    }
}