//! Lexer for `UnrealScript` that understands inline `cpptext { ... }` blocks. //! //! ## Notable details //! //! Lexer for `UnrealScript` that recognizes inline `cpptext { ... }` blocks. //! //! In `UnrealScript`, `cpptext` lets authors embed raw C++ between braces.\ //! Because whitespace, newlines, or comments may appear between the //! `cpptext` keyword and the opening `{`, the lexer must remember that //! it has just seen `cpptext` - hence a state machine. //! //! ## Modes //! //! - **Normal** - ordinary `UnrealScript` `RawTokens`. //! - **`AwaitingCppBlock`** - after `cpptext`, waiting for the next `{`. //! //! When that brace arrives, the lexer consumes the entire C++ block as //! one `RawToken` (`RawToken::Brace(BraceKind::CppBlock)`), tracking nested //! braces, strings, and comments on the way. If the closing `}` is //! missing, everything to EOF is treated as C++; downstream parsers must //! handle that gracefully. use logos::Lexer; /// Which lexer mode we're in. See the module docs for the full story. #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, Default)] enum LexerMode { /// Lexing regular `UnrealScript`. #[default] Normal, /// Saw `cpptext`; waiting for the opening `{` of a C++ block. AwaitingCppBlock, } /// Extra per-lexer state. Currently just holds the [`LexerMode`]. /// /// This is a logos-specific implementation detail. #[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] pub struct LexerState { mode: LexerMode, } /// Distinguishes an ordinary `{` token from one that starts /// an embedded C++ block. #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)] pub enum BraceKind { /// An ordinary `UnrealScript` `{`. Normal, /// A `{` that starts an embedded C++ block and consumes through its /// matching `}`. CppBlock, } /// Tokens produced by the `UnrealScript` lexer. /// /// Includes both syntactic tokens and trivia such as whitespace, newlines, /// and comments. #[derive(logos::Logos, Debug, PartialEq, Eq, Hash, Clone, Copy)] #[logos(extras = LexerState)] pub enum RawToken { // # Compiler/directive keywords #[regex(r"(?i)#exec[^\r\n]*(?:\r\n|\n|\r)?")] ExecDirective, #[regex("(?i)cpptext", |lex| { if is_next_nontrivia_left_brace(lex) { lex.extras.mode = LexerMode::AwaitingCppBlock; } else { lex.extras.mode = LexerMode::Normal; } })] CppText, #[regex("(?i)cppstruct", |lex| { if is_next_nontrivia_left_brace(lex) { lex.extras.mode = LexerMode::AwaitingCppBlock; } else { lex.extras.mode = LexerMode::Normal; } })] CppStruct, // # Declaration & structural keywords //#[regex("(?i)class")] #[token("class", ignore(case))] Class, #[token("struct", ignore(case))] Struct, #[token("enum", ignore(case))] Enum, #[token("state", ignore(case))] State, #[token("auto", ignore(case))] Auto, #[token("function", ignore(case))] Function, #[token("event", ignore(case))] Event, #[token("delegate", ignore(case))] Delegate, #[token("var", ignore(case))] Var, #[token("local", ignore(case))] Local, // # Inheritance, interface, dependencies #[token("extends", ignore(case))] Extends, #[token("dependson", ignore(case))] DependsOn, // # Access modifiers & properties #[token("private", ignore(case))] Private, #[token("protected", ignore(case))] Protected, #[token("public", ignore(case))] Public, #[token("const", ignore(case))] Const, #[token("static", ignore(case))] Static, #[token("native", ignore(case))] Native, #[token("abstract", ignore(case))] Abstract, #[token("deprecated", ignore(case))] Deprecated, #[token("safereplace", ignore(case))] SafeReplace, #[token("exportstructs", ignore(case))] ExportStructs, #[token("input", ignore(case))] Input, // # UnrealScript metadata/specifiers #[token("final", ignore(case))] Final, #[token("default", ignore(case))] Default, #[token("defaultproperties", ignore(case))] DefaultProperties, #[token("object", ignore(case))] Object, #[token("begin", ignore(case))] Begin, #[token("end", ignore(case))] End, #[token("optional", ignore(case))] Optional, #[token("config", ignore(case))] Config, #[token("perobjectconfig", ignore(case))] PerObjectConfig, #[token("globalconfig", ignore(case))] GlobalConfig, #[token("collapsecategories", ignore(case))] CollapseCategories, #[token("dontcollapsecategories", ignore(case))] DontCollapseCategories, #[token("hidecategories", ignore(case))] HideCategories, #[token("showcategories", ignore(case))] ShowCategories, #[token("localized", ignore(case))] Localized, #[token("placeable", ignore(case))] Placeable, #[token("notplaceable", ignore(case))] NotPlaceable, #[token("instanced", ignore(case))] Instanced, #[token("editconst", ignore(case))] EditConst, #[token("editconstarray", ignore(case))] EditConstArray, #[token("editinline", ignore(case))] EditInline, #[token("editinlineuse", ignore(case))] EditInlineUse, #[token("editinlinenew", ignore(case))] EditInlineNew, #[token("noteditinlinenew", ignore(case))] NotEditInlineNew, #[token("edfindable", ignore(case))] EdFindable, #[token("editinlinenotify", ignore(case))] EditInlineNotify, #[token("parseconfig", ignore(case))] ParseConfig, #[token("automated", ignore(case))] Automated, #[token("dynamicrecompile", ignore(case))] DynamicRecompile, #[token("transient", ignore(case))] Transient, #[token("long", ignore(case))] Long, #[token("operator", ignore(case))] Operator, #[token("preoperator", ignore(case))] PreOperator, #[token("postoperator", ignore(case))] PostOperator, #[token("simulated", ignore(case))] Simulated, #[token("exec", ignore(case))] Exec, #[token("latent", ignore(case))] Latent, #[token("iterator", ignore(case))] Iterator, #[token("out", ignore(case))] Out, #[token("skip", ignore(case))] Skip, #[token("singular", ignore(case))] Singular, #[token("coerce", ignore(case))] Coerce, #[token("assert", ignore(case))] Assert, #[token("ignores", ignore(case))] Ignores, #[token("within", ignore(case))] Within, #[token("init", ignore(case))] Init, #[token("export", ignore(case))] Export, #[token("noexport", ignore(case))] NoExport, #[token("hidedropdown", ignore(case))] HideDropdown, #[token("travel", ignore(case))] Travel, #[token("cache", ignore(case))] Cache, #[token("cacheexempt", ignore(case))] CacheExempt, // # Replication-related #[token("reliable", ignore(case))] Reliable, #[token("unreliable", ignore(case))] Unreliable, #[token("replication", ignore(case))] Replication, #[token("nativereplication", ignore(case))] NativeReplication, // # Control-flow keywords #[token("goto", ignore(case))] Goto, #[token("if", ignore(case))] If, #[token("else", ignore(case))] Else, #[token("switch", ignore(case))] Switch, #[token("case", ignore(case))] Case, #[token("for", ignore(case))] For, #[token("foreach", ignore(case))] ForEach, #[token("while", ignore(case))] While, #[token("do", ignore(case))] Do, #[token("until", ignore(case))] Until, #[token("break", ignore(case))] Break, #[token("continue", ignore(case))] Continue, #[token("return", ignore(case))] Return, // # Built-in types #[token("int", ignore(case))] Int, #[token("float", ignore(case))] Float, #[token("bool", ignore(case))] Bool, #[token("byte", ignore(case))] Byte, #[token("string", ignore(case))] String, #[token("array", ignore(case))] Array, #[token("name", ignore(case))] Name, // FloatLiteral must come before IntegerLiteral and '.' // to have higher priority. // It also recognizes things like: `1.foo``, `1.foo.bar`, `1.2.3`. // It has to. Because UnrealScript is a pile of-... wonderful language, // where everything is possible. #[regex(r"[0-9]+(?:\.(?:[0-9]+|[A-Za-z_][A-Za-z0-9_]*))+[fF]?")] #[regex(r"(?:[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)(?:[eE][+-]?[0-9]+)?[fF]?")] #[regex(r"[0-9]+[eE][+-]?[0-9]+[fF]?")] FloatLiteral, #[regex(r"0b[01](?:_?[01])*")] #[regex(r"0o[0-7](?:_?[0-7])*")] #[regex(r"0x[0-9A-Fa-f](?:_?[0-9A-Fa-f])*")] #[regex(r"[0-9][0-9]*")] IntegerLiteral, #[regex(r#""([^"\\\r\n]|\\.)*""#)] StringLiteral, #[regex(r"'[a-zA-Z0-9_\. \-]*'")] NameLiteral, #[token("true", ignore(case))] True, #[token("false", ignore(case))] False, #[token("none", ignore(case))] None, #[token("self", ignore(case))] SelfValue, #[token("new", ignore(case))] New, #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")] Identifier, // # Operations // ## Exponentiation #[token("**")] Exponentiation, // ## Unary #[token("++")] Increment, #[token("--")] Decrement, #[token("!")] Not, #[token("~")] BitwiseNot, // ## Vector #[token("dot", ignore(case))] Dot, #[token("cross", ignore(case))] Cross, // ## Multiplicative #[token("*")] Multiply, #[token("/")] Divide, #[token("%")] Modulo, // ## Additive #[token("+")] Plus, #[token("-")] Minus, // ## String manipulation #[token("@")] ConcatSpace, #[token("$")] Concat, // ## Shifts #[token("<<")] LeftShift, #[token(">>>")] LogicalRightShift, #[token(">>")] RightShift, // ## Relational #[token("<")] Less, #[token("<=")] LessEqual, #[token(">")] Greater, #[token(">=")] GreaterEqual, #[token("==")] Equal, #[token("!=")] NotEqual, #[token("~=")] ApproximatelyEqual, #[token("clockwisefrom", ignore(case))] ClockwiseFrom, // ## Bitwise #[token("&")] BitwiseAnd, #[token("|")] BitwiseOr, #[token("^")] BitwiseXor, // ## Logical #[token("&&")] LogicalAnd, #[token("^^")] LogicalXor, #[token("||")] LogicalOr, // ## Assignments #[token("=")] Assign, #[token("*=")] MultiplyAssign, #[token("/=")] DivideAssign, #[token("%=")] ModuloAssign, #[token("+=")] PlusAssign, #[token("-=")] MinusAssign, #[token("$=")] ConcatAssign, #[token("@=")] ConcatSpaceAssign, // # Punctuation & delimiters #[token("(")] LeftParenthesis, #[token(")")] RightParenthesis, #[token("{", process_left_brace)] Brace(BraceKind), #[token("}")] RightBrace, #[token("[")] LeftBracket, #[token("]")] RightBracket, #[token(";")] Semicolon, #[token(",")] Comma, #[token(".")] Period, #[token(":")] Colon, #[token("#")] Hash, #[token("?")] Question, // # Comments & whitespaces #[regex(r"//[^\r\n]*")] LineComment, #[regex(r"/\*", handle_block_comment)] BlockComment, #[regex(r"\r\n|\n|\r")] Newline, #[regex(r"[ \t]+")] Whitespace, // # Technical Error, } /// Consumes an `UnrealScript` `/* ... */` block comment, including nested comments. /// /// Matches the entire comment, including its delimiters. /// If the comment is unterminated, consumes to the end of input. fn handle_block_comment(lexer: &mut Lexer) { let mut comment_depth = 1; while let Some(next_character) = lexer.remainder().chars().next() { if lexer.remainder().starts_with("/*") { comment_depth += 1; lexer.bump(2); continue; } if lexer.remainder().starts_with("*/") { comment_depth -= 1; lexer.bump(2); if comment_depth == 0 { break; } continue; } lexer.bump(next_character.len_utf8()); } } /// Processes `{` according to the current lexer mode. /// /// Returns [`BraceKind::Normal`] for ordinary `UnrealScript` braces. /// After `cpptext` or `cppstruct`, consumes the embedded C++ block and returns /// [`BraceKind::CppBlock`]. fn process_left_brace(lexer: &mut Lexer) -> BraceKind { match lexer.extras.mode { LexerMode::Normal => BraceKind::Normal, LexerMode::AwaitingCppBlock => { lexer.extras.mode = LexerMode::Normal; consume_cpp_block(lexer); BraceKind::CppBlock } } } /// Consumes a complete C++ block, handling: /// - Nested `{...}` pairs /// - String literals (`"..."` and `'...'`), including escaped quotes /// - Line comments (`// ...\n`) /// - Block comments (`/* ... */`) /// /// Leaves the lexer positioned immediately after the closing `}` of the block. /// The opening `{` must have already been consumed by the caller. /// /// We target UE2-era cpp blocks, so no need for anything fancy. fn consume_cpp_block(lexer: &mut Lexer) { let mut brace_depth = 1; while let Some(next_character) = lexer.remainder().chars().next() { match next_character { '{' => { brace_depth += 1; lexer.bump(1); } '}' => { brace_depth -= 1; lexer.bump(1); if brace_depth == 0 { break; } } '/' if lexer.remainder().starts_with("/*") => { lexer.bump(2); // consuming two-byte sequence `/*` consume_c_style_block_comment(lexer); } '/' if lexer.remainder().starts_with("//") => { lexer.bump(2); // consuming two-byte sequence `//` while let Some(next_character) = lexer.remainder().chars().next() { lexer.bump(next_character.len_utf8()); if next_character == '\n' || next_character == '\r' { break; } } } '"' | '\'' => { lexer.bump(1); // skip `'` or `"` consume_quoted_cpp_literal(lexer, next_character); } _ => lexer.bump(next_character.len_utf8()), } } } /// Consumes a non-nesting C-style `/* ... */` comment. /// /// Assumes that the opening `/*` has already been consumed. fn consume_c_style_block_comment(lexer: &mut Lexer) { while let Some(next_character) = lexer.remainder().chars().next() { if lexer.remainder().starts_with("*/") { lexer.bump(2); break; } lexer.bump(next_character.len_utf8()); } } /// Consumes a quoted C++ string or character literal. /// /// Assumes that the opening delimiter has already been consumed. fn consume_quoted_cpp_literal(lexer: &mut Lexer, delimiter: char) { while let Some(next_character) = lexer.remainder().chars().next() { lexer.bump(next_character.len_utf8()); if next_character == '\\' { // Skip the escaped character if let Some(escaped_character) = lexer.remainder().chars().next() { lexer.bump(escaped_character.len_utf8()); } } else if next_character == delimiter { return; } } } /// Peek ahead from the current lexer position, skipping "trivia", and report /// whether the next significant character is `{`. /// /// Trivia here means: /// - Spaces and tabs /// - Newlines (`\r`, `\n`, or `\r\n`) /// - Line comments (`// ...`) /// - Block comments (`/* ... */`), including nested ones /// /// This is used after lexing tokens like `cpptext` or `cppstruct`, where /// `UnrealScript` allows arbitrary trivia between the keyword and the opening /// brace of the embedded C++ block. /// /// Returns `true` if the next non-trivia character is `{`, otherwise `false`. /// If the input ends while skipping trivia, returns `false`. fn is_next_nontrivia_left_brace(lexer: &Lexer) -> bool { let mut remaining = lexer.remainder(); while let Some(next_character) = remaining.chars().next() { match next_character { ' ' | '\t' | '\r' | '\n' => { remaining = &remaining[next_character.len_utf8()..]; } '/' if remaining.starts_with("//") => { remaining = &remaining[2..]; while let Some(comment_character) = remaining.chars().next() { remaining = &remaining[comment_character.len_utf8()..]; if comment_character == '\n' || comment_character == '\r' { break; } } } '/' if remaining.starts_with("/*") => { remaining = &remaining[2..]; let mut comment_depth = 1; while comment_depth > 0 { if remaining.starts_with("/*") { comment_depth += 1; remaining = &remaining[2..]; continue; } if remaining.starts_with("*/") { comment_depth -= 1; remaining = &remaining[2..]; continue; } let Some(comment_character) = remaining.chars().next() else { return false; }; remaining = &remaining[comment_character.len_utf8()..]; } } _ => return next_character == '{', } } false }