Huge dump of refactored code. Still in the middle of the changes, which are to be squashed later into one huge monster commit, because there is no value in anything atomic here.
633 lines
18 KiB
Rust
633 lines
18 KiB
Rust
//! Lexer for `UnrealScript` that understands inline `cpptext { ... }` blocks.
|
|
//!
|
|
//! ## Notable details
|
|
//!
|
|
//! The lexer recognizes inline `cpptext { ... }` blocks and emits each one as a single token.
|
|
//!
|
|
//! In `UnrealScript`, `cpptext` lets authors embed raw C++ between braces.\
|
|
//! Because whitespace, newlines, or comments may appear between the
|
|
//! `cpptext` keyword and the opening `{`, the lexer must remember that
|
|
//! it has just seen `cpptext` - hence a state machine.
|
|
//!
|
|
//! ## Modes
|
|
//!
|
|
//! - **Normal** - ordinary `UnrealScript` `RawTokens`.
|
|
//! - **`AwaitingCppBlock`** - after `cpptext` or `cppstruct`, waiting for the next `{`.
|
|
//!
|
|
//! When that brace arrives, the lexer consumes the entire C++ block as
|
|
//! one `RawToken` (`RawToken::Brace(BraceKind::CppBlock)`), tracking nested
|
|
//! braces, strings, and comments on the way. If the closing `}` is
|
|
//! missing, everything to EOF is treated as C++; downstream parsers must
|
|
//! handle that gracefully.
|
|
|
|
use logos::Lexer;
|
|
|
|
/// Internal mode of the lexer; it decides how the next `{` is treated.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
enum LexerMode {
    /// Ordinary `UnrealScript` lexing. This is the default mode.
    #[default]
    Normal,
    /// A `cpptext`-style keyword was just seen and the opening `{` of its
    /// C++ block has not been reached yet.
    AwaitingCppBlock,
}
|
|
|
|
/// Extra per-lexer state. Currently just holds the [`LexerMode`].
|
|
///
|
|
/// This is a logos-specific implementation detail.
|
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
|
|
pub struct LexerState {
|
|
mode: LexerMode,
|
|
}
|
|
|
|
/// Tells a plain `{` apart from a `{` that opens an embedded C++ block.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BraceKind {
    /// A regular `UnrealScript` `{`.
    Normal,
    /// The `{` of an embedded C++ block; the token extends through the
    /// matching `}`.
    CppBlock,
}
|
|
|
|
/// Tokens produced by the `UnrealScript` lexer.
|
|
///
|
|
/// Includes both syntactic tokens and trivia such as whitespace, newlines,
|
|
/// and comments.
|
|
#[derive(logos::Logos, Debug, PartialEq, Eq, Hash, Clone, Copy)]
|
|
#[logos(extras = LexerState)]
|
|
pub enum RawToken {
|
|
// # Compiler/directive keywords
|
|
#[regex(r"(?i)#exec[^\r\n]*(?:\r\n|\n|\r)?")]
|
|
ExecDirective,
|
|
#[regex("(?i)cpptext", |lex| {
|
|
if is_next_nontrivia_left_brace(lex) {
|
|
lex.extras.mode = LexerMode::AwaitingCppBlock;
|
|
} else {
|
|
lex.extras.mode = LexerMode::Normal;
|
|
}
|
|
})]
|
|
CppText,
|
|
|
|
#[regex("(?i)cppstruct", |lex| {
|
|
if is_next_nontrivia_left_brace(lex) {
|
|
lex.extras.mode = LexerMode::AwaitingCppBlock;
|
|
} else {
|
|
lex.extras.mode = LexerMode::Normal;
|
|
}
|
|
})]
|
|
CppStruct,
|
|
// # Declaration & structural keywords
|
|
//#[regex("(?i)class")]
|
|
#[token("class", ignore(case))]
|
|
Class,
|
|
#[token("struct", ignore(case))]
|
|
Struct,
|
|
#[token("enum", ignore(case))]
|
|
Enum,
|
|
#[token("state", ignore(case))]
|
|
State,
|
|
#[token("auto", ignore(case))]
|
|
Auto,
|
|
#[token("function", ignore(case))]
|
|
Function,
|
|
#[token("event", ignore(case))]
|
|
Event,
|
|
#[token("delegate", ignore(case))]
|
|
Delegate,
|
|
#[token("var", ignore(case))]
|
|
Var,
|
|
#[token("local", ignore(case))]
|
|
Local,
|
|
|
|
// # Inheritance, interface, dependencies
|
|
#[token("extends", ignore(case))]
|
|
Extends,
|
|
#[token("dependson", ignore(case))]
|
|
DependsOn,
|
|
|
|
// # Access modifiers & properties
|
|
#[token("private", ignore(case))]
|
|
Private,
|
|
#[token("protected", ignore(case))]
|
|
Protected,
|
|
#[token("public", ignore(case))]
|
|
Public,
|
|
#[token("const", ignore(case))]
|
|
Const,
|
|
#[token("static", ignore(case))]
|
|
Static,
|
|
#[token("native", ignore(case))]
|
|
Native,
|
|
#[token("abstract", ignore(case))]
|
|
Abstract,
|
|
#[token("deprecated", ignore(case))]
|
|
Deprecated,
|
|
#[token("safereplace", ignore(case))]
|
|
SafeReplace,
|
|
#[token("exportstructs", ignore(case))]
|
|
ExportStructs,
|
|
#[token("input", ignore(case))]
|
|
Input,
|
|
|
|
// # UnrealScript metadata/specifiers
|
|
#[token("final", ignore(case))]
|
|
Final,
|
|
#[token("default", ignore(case))]
|
|
Default,
|
|
#[token("defaultproperties", ignore(case))]
|
|
DefaultProperties,
|
|
#[token("object", ignore(case))]
|
|
Object,
|
|
#[token("begin", ignore(case))]
|
|
Begin,
|
|
#[token("end", ignore(case))]
|
|
End,
|
|
#[token("optional", ignore(case))]
|
|
Optional,
|
|
#[token("config", ignore(case))]
|
|
Config,
|
|
#[token("perobjectconfig", ignore(case))]
|
|
PerObjectConfig,
|
|
#[token("globalconfig", ignore(case))]
|
|
GlobalConfig,
|
|
#[token("collapsecategories", ignore(case))]
|
|
CollapseCategories,
|
|
#[token("dontcollapsecategories", ignore(case))]
|
|
DontCollapseCategories,
|
|
#[token("hidecategories", ignore(case))]
|
|
HideCategories,
|
|
#[token("showcategories", ignore(case))]
|
|
ShowCategories,
|
|
#[token("localized", ignore(case))]
|
|
Localized,
|
|
#[token("placeable", ignore(case))]
|
|
Placeable,
|
|
#[token("notplaceable", ignore(case))]
|
|
NotPlaceable,
|
|
#[token("instanced", ignore(case))]
|
|
Instanced,
|
|
#[token("editconst", ignore(case))]
|
|
EditConst,
|
|
#[token("editconstarray", ignore(case))]
|
|
EditConstArray,
|
|
#[token("editinline", ignore(case))]
|
|
EditInline,
|
|
#[token("editinlineuse", ignore(case))]
|
|
EditInlineUse,
|
|
#[token("editinlinenew", ignore(case))]
|
|
EditInlineNew,
|
|
#[token("noteditinlinenew", ignore(case))]
|
|
NotEditInlineNew,
|
|
#[token("edfindable", ignore(case))]
|
|
EdFindable,
|
|
#[token("editinlinenotify", ignore(case))]
|
|
EditInlineNotify,
|
|
#[token("parseconfig", ignore(case))]
|
|
ParseConfig,
|
|
#[token("automated", ignore(case))]
|
|
Automated,
|
|
#[token("dynamicrecompile", ignore(case))]
|
|
DynamicRecompile,
|
|
#[token("transient", ignore(case))]
|
|
Transient,
|
|
#[token("long", ignore(case))]
|
|
Long,
|
|
#[token("operator", ignore(case))]
|
|
Operator,
|
|
#[token("preoperator", ignore(case))]
|
|
PreOperator,
|
|
#[token("postoperator", ignore(case))]
|
|
PostOperator,
|
|
#[token("simulated", ignore(case))]
|
|
Simulated,
|
|
#[token("exec", ignore(case))]
|
|
Exec,
|
|
#[token("latent", ignore(case))]
|
|
Latent,
|
|
#[token("iterator", ignore(case))]
|
|
Iterator,
|
|
#[token("out", ignore(case))]
|
|
Out,
|
|
#[token("skip", ignore(case))]
|
|
Skip,
|
|
#[token("singular", ignore(case))]
|
|
Singular,
|
|
#[token("coerce", ignore(case))]
|
|
Coerce,
|
|
#[token("assert", ignore(case))]
|
|
Assert,
|
|
#[token("ignores", ignore(case))]
|
|
Ignores,
|
|
#[token("within", ignore(case))]
|
|
Within,
|
|
#[token("init", ignore(case))]
|
|
Init,
|
|
#[token("export", ignore(case))]
|
|
Export,
|
|
#[token("noexport", ignore(case))]
|
|
NoExport,
|
|
#[token("hidedropdown", ignore(case))]
|
|
HideDropdown,
|
|
#[token("travel", ignore(case))]
|
|
Travel,
|
|
#[token("cache", ignore(case))]
|
|
Cache,
|
|
#[token("cacheexempt", ignore(case))]
|
|
CacheExempt,
|
|
|
|
// # Replication-related
|
|
#[token("reliable", ignore(case))]
|
|
Reliable,
|
|
#[token("unreliable", ignore(case))]
|
|
Unreliable,
|
|
#[token("replication", ignore(case))]
|
|
Replication,
|
|
#[token("nativereplication", ignore(case))]
|
|
NativeReplication,
|
|
|
|
// # Control-flow keywords
|
|
#[token("goto", ignore(case))]
|
|
Goto,
|
|
#[token("if", ignore(case))]
|
|
If,
|
|
#[token("else", ignore(case))]
|
|
Else,
|
|
#[token("switch", ignore(case))]
|
|
Switch,
|
|
#[token("case", ignore(case))]
|
|
Case,
|
|
#[token("for", ignore(case))]
|
|
For,
|
|
#[token("foreach", ignore(case))]
|
|
ForEach,
|
|
#[token("while", ignore(case))]
|
|
While,
|
|
#[token("do", ignore(case))]
|
|
Do,
|
|
#[token("until", ignore(case))]
|
|
Until,
|
|
#[token("break", ignore(case))]
|
|
Break,
|
|
#[token("continue", ignore(case))]
|
|
Continue,
|
|
#[token("return", ignore(case))]
|
|
Return,
|
|
|
|
// # Built-in types
|
|
#[token("int", ignore(case))]
|
|
Int,
|
|
#[token("float", ignore(case))]
|
|
Float,
|
|
#[token("bool", ignore(case))]
|
|
Bool,
|
|
#[token("byte", ignore(case))]
|
|
Byte,
|
|
#[token("string", ignore(case))]
|
|
String,
|
|
#[token("array", ignore(case))]
|
|
Array,
|
|
#[token("name", ignore(case))]
|
|
Name,
|
|
|
|
// FloatLiteral must come before IntegerLiteral and '.'
|
|
// to have higher priority.
|
|
// It also recognizes things like: `1.foo``, `1.foo.bar`, `1.2.3`.
|
|
// It has to. Because UnrealScript is a pile of-... wonderful language,
|
|
// where everything is possible.
|
|
#[regex(r"[0-9]+(?:\.(?:[0-9]+|[A-Za-z_][A-Za-z0-9_]*))+[fF]?")]
|
|
#[regex(r"(?:[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)(?:[eE][+-]?[0-9]+)?[fF]?")]
|
|
#[regex(r"[0-9]+[eE][+-]?[0-9]+[fF]?")]
|
|
FloatLiteral,
|
|
|
|
#[regex(r"0b[01](?:_?[01])*")]
|
|
#[regex(r"0o[0-7](?:_?[0-7])*")]
|
|
#[regex(r"0x[0-9A-Fa-f](?:_?[0-9A-Fa-f])*")]
|
|
#[regex(r"[0-9][0-9]*")]
|
|
IntegerLiteral,
|
|
|
|
#[regex(r#""([^"\\\r\n]|\\.)*""#)]
|
|
StringLiteral,
|
|
#[regex(r"'[a-zA-Z0-9_\. \-]*'")]
|
|
NameLiteral,
|
|
#[token("true", ignore(case))]
|
|
True,
|
|
#[token("false", ignore(case))]
|
|
False,
|
|
#[token("none", ignore(case))]
|
|
None,
|
|
#[token("self", ignore(case))]
|
|
SelfValue,
|
|
#[token("new", ignore(case))]
|
|
New,
|
|
#[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")]
|
|
Identifier,
|
|
|
|
// # Operations
|
|
// ## Exponentiation
|
|
#[token("**")]
|
|
Exponentiation,
|
|
// ## Unary
|
|
#[token("++")]
|
|
Increment,
|
|
#[token("--")]
|
|
Decrement,
|
|
#[token("!")]
|
|
Not,
|
|
#[token("~")]
|
|
BitwiseNot,
|
|
// ## Vector
|
|
#[token("dot", ignore(case))]
|
|
Dot,
|
|
#[token("cross", ignore(case))]
|
|
Cross,
|
|
// ## Multiplicative
|
|
#[token("*")]
|
|
Multiply,
|
|
#[token("/")]
|
|
Divide,
|
|
#[token("%")]
|
|
Modulo,
|
|
// ## Additive
|
|
#[token("+")]
|
|
Plus,
|
|
#[token("-")]
|
|
Minus,
|
|
// ## String manipulation
|
|
#[token("@")]
|
|
ConcatSpace,
|
|
#[token("$")]
|
|
Concat,
|
|
// ## Shifts
|
|
#[token("<<")]
|
|
LeftShift,
|
|
#[token(">>>")]
|
|
LogicalRightShift,
|
|
#[token(">>")]
|
|
RightShift,
|
|
// ## Relational
|
|
#[token("<")]
|
|
Less,
|
|
#[token("<=")]
|
|
LessEqual,
|
|
#[token(">")]
|
|
Greater,
|
|
#[token(">=")]
|
|
GreaterEqual,
|
|
#[token("==")]
|
|
Equal,
|
|
#[token("!=")]
|
|
NotEqual,
|
|
#[token("~=")]
|
|
ApproximatelyEqual,
|
|
#[token("clockwisefrom", ignore(case))]
|
|
ClockwiseFrom,
|
|
// ## Bitwise
|
|
#[token("&")]
|
|
BitwiseAnd,
|
|
#[token("|")]
|
|
BitwiseOr,
|
|
#[token("^")]
|
|
BitwiseXor,
|
|
// ## Logical
|
|
#[token("&&")]
|
|
LogicalAnd,
|
|
#[token("^^")]
|
|
LogicalXor,
|
|
#[token("||")]
|
|
LogicalOr,
|
|
// ## Assignments
|
|
#[token("=")]
|
|
Assign,
|
|
#[token("*=")]
|
|
MultiplyAssign,
|
|
#[token("/=")]
|
|
DivideAssign,
|
|
#[token("%=")]
|
|
ModuloAssign,
|
|
#[token("+=")]
|
|
PlusAssign,
|
|
#[token("-=")]
|
|
MinusAssign,
|
|
#[token("$=")]
|
|
ConcatAssign,
|
|
#[token("@=")]
|
|
ConcatSpaceAssign,
|
|
|
|
// # Punctuation & delimiters
|
|
#[token("(")]
|
|
LeftParenthesis,
|
|
#[token(")")]
|
|
RightParenthesis,
|
|
#[token("{", process_left_brace)]
|
|
Brace(BraceKind),
|
|
#[token("}")]
|
|
RightBrace,
|
|
#[token("[")]
|
|
LeftBracket,
|
|
#[token("]")]
|
|
RightBracket,
|
|
#[token(";")]
|
|
Semicolon,
|
|
#[token(",")]
|
|
Comma,
|
|
#[token(".")]
|
|
Period,
|
|
#[token(":")]
|
|
Colon,
|
|
#[token("#")]
|
|
Hash,
|
|
#[token("?")]
|
|
Question,
|
|
|
|
// # Comments & whitespaces
|
|
#[regex(r"//[^\r\n]*")]
|
|
LineComment,
|
|
#[regex(r"/\*", handle_block_comment)]
|
|
BlockComment,
|
|
#[regex(r"\r\n|\n|\r")]
|
|
Newline,
|
|
#[regex(r"[ \t]+")]
|
|
Whitespace,
|
|
|
|
// # Technical
|
|
Error,
|
|
}
|
|
|
|
/// Consumes an `UnrealScript` `/* ... */` block comment, including nested comments.
|
|
///
|
|
/// Matches the entire comment, including its delimiters.
|
|
/// If the comment is unterminated, consumes to the end of input.
|
|
fn handle_block_comment(lexer: &mut Lexer<RawToken>) {
|
|
let mut comment_depth = 1;
|
|
while let Some(next_character) = lexer.remainder().chars().next() {
|
|
if lexer.remainder().starts_with("/*") {
|
|
comment_depth += 1;
|
|
lexer.bump(2);
|
|
continue;
|
|
}
|
|
if lexer.remainder().starts_with("*/") {
|
|
comment_depth -= 1;
|
|
lexer.bump(2);
|
|
if comment_depth == 0 {
|
|
break;
|
|
}
|
|
continue;
|
|
}
|
|
lexer.bump(next_character.len_utf8());
|
|
}
|
|
}
|
|
|
|
/// Processes `{` according to the current lexer mode.
|
|
///
|
|
/// Returns [`BraceKind::Normal`] for ordinary `UnrealScript` braces.
|
|
/// After `cpptext` or `cppstruct`, consumes the embedded C++ block and returns
|
|
/// [`BraceKind::CppBlock`].
|
|
fn process_left_brace(lexer: &mut Lexer<RawToken>) -> BraceKind {
|
|
match lexer.extras.mode {
|
|
LexerMode::Normal => BraceKind::Normal,
|
|
LexerMode::AwaitingCppBlock => {
|
|
lexer.extras.mode = LexerMode::Normal;
|
|
consume_cpp_block(lexer);
|
|
BraceKind::CppBlock
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Consumes a complete C++ block, handling:
|
|
/// - Nested `{...}` pairs
|
|
/// - String literals (`"..."` and `'...'`), including escaped quotes
|
|
/// - Line comments (`// ...\n`)
|
|
/// - Block comments (`/* ... */`)
|
|
///
|
|
/// Leaves the lexer positioned immediately after the closing `}` of the block.
|
|
/// The opening `{` must have already been consumed by the caller.
|
|
///
|
|
/// We target UE2-era cpp blocks, so no need for anything fancy.
|
|
fn consume_cpp_block(lexer: &mut Lexer<RawToken>) {
|
|
let mut brace_depth = 1;
|
|
while let Some(next_character) = lexer.remainder().chars().next() {
|
|
match next_character {
|
|
'{' => {
|
|
brace_depth += 1;
|
|
lexer.bump(1);
|
|
}
|
|
'}' => {
|
|
brace_depth -= 1;
|
|
lexer.bump(1);
|
|
if brace_depth == 0 {
|
|
break;
|
|
}
|
|
}
|
|
'/' if lexer.remainder().starts_with("/*") => {
|
|
lexer.bump(2); // consuming two-byte sequence `/*`
|
|
consume_c_style_block_comment(lexer);
|
|
}
|
|
'/' if lexer.remainder().starts_with("//") => {
|
|
lexer.bump(2); // consuming two-byte sequence `//`
|
|
while let Some(next_character) = lexer.remainder().chars().next() {
|
|
lexer.bump(next_character.len_utf8());
|
|
if next_character == '\n' || next_character == '\r' {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
'"' | '\'' => {
|
|
lexer.bump(1); // skip `'` or `"`
|
|
consume_quoted_cpp_literal(lexer, next_character);
|
|
}
|
|
_ => lexer.bump(next_character.len_utf8()),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Consumes a non-nesting C-style `/* ... */` comment.
|
|
///
|
|
/// Assumes that the opening `/*` has already been consumed.
|
|
fn consume_c_style_block_comment(lexer: &mut Lexer<RawToken>) {
|
|
while let Some(next_character) = lexer.remainder().chars().next() {
|
|
if lexer.remainder().starts_with("*/") {
|
|
lexer.bump(2);
|
|
break;
|
|
}
|
|
lexer.bump(next_character.len_utf8());
|
|
}
|
|
}
|
|
|
|
/// Consumes a quoted C++ string or character literal.
|
|
///
|
|
/// Assumes that the opening delimiter has already been consumed.
|
|
fn consume_quoted_cpp_literal(lexer: &mut Lexer<RawToken>, delimiter: char) {
|
|
while let Some(next_character) = lexer.remainder().chars().next() {
|
|
lexer.bump(next_character.len_utf8());
|
|
if next_character == '\\' {
|
|
// Skip the escaped character
|
|
if let Some(escaped_character) = lexer.remainder().chars().next() {
|
|
lexer.bump(escaped_character.len_utf8());
|
|
}
|
|
} else if next_character == delimiter {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Peek ahead from the current lexer position, skipping "trivia", and report
|
|
/// whether the next significant character is `{`.
|
|
///
|
|
/// Trivia here means:
|
|
/// - Spaces and tabs
|
|
/// - Newlines (`\r`, `\n`, or `\r\n`)
|
|
/// - Line comments (`// ...`)
|
|
/// - Block comments (`/* ... */`), including nested ones
|
|
///
|
|
/// This is used after lexing tokens like `cpptext` or `cppstruct`, where
|
|
/// `UnrealScript` allows arbitrary trivia between the keyword and the opening
|
|
/// brace of the embedded C++ block.
|
|
///
|
|
/// Returns `true` if the next non-trivia character is `{`, otherwise `false`.
|
|
/// If the input ends while skipping trivia, returns `false`.
|
|
fn is_next_nontrivia_left_brace(lexer: &Lexer<RawToken>) -> bool {
|
|
let mut remaining = lexer.remainder();
|
|
|
|
while let Some(next_character) = remaining.chars().next() {
|
|
match next_character {
|
|
' ' | '\t' | '\r' | '\n' => {
|
|
remaining = &remaining[next_character.len_utf8()..];
|
|
}
|
|
'/' if remaining.starts_with("//") => {
|
|
remaining = &remaining[2..];
|
|
while let Some(comment_character) = remaining.chars().next() {
|
|
remaining = &remaining[comment_character.len_utf8()..];
|
|
if comment_character == '\n' || comment_character == '\r' {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
'/' if remaining.starts_with("/*") => {
|
|
remaining = &remaining[2..];
|
|
let mut comment_depth = 1;
|
|
while comment_depth > 0 {
|
|
if remaining.starts_with("/*") {
|
|
comment_depth += 1;
|
|
remaining = &remaining[2..];
|
|
continue;
|
|
}
|
|
if remaining.starts_with("*/") {
|
|
comment_depth -= 1;
|
|
remaining = &remaining[2..];
|
|
continue;
|
|
}
|
|
let Some(comment_character) = remaining.chars().next() else {
|
|
return false;
|
|
};
|
|
remaining = &remaining[comment_character.len_utf8()..];
|
|
}
|
|
}
|
|
_ => return next_character == '{',
|
|
}
|
|
}
|
|
|
|
false
|
|
}
|