rott/rottlib/src/lexer/raw_lexer.rs

//! Lexer for `UnrealScript` that understands inline `cpptext { ... }` blocks.
//!
//! ## Notable details
//!
//! In `UnrealScript`, `cpptext` (and likewise `cppstruct`) lets authors embed
//! raw C++ between braces. Because whitespace, newlines, or comments may
//! appear between the keyword and the opening `{`, the lexer must remember
//! that it has just seen the keyword - hence a state machine.
//!
//! ## Modes
//!
//! - **Normal** - ordinary `UnrealScript` `RawTokens`.
//! - **`AwaitingCppBlock`** - after `cpptext` or `cppstruct` whose next
//!   non-trivia character is `{`; waiting for that `{`.
//!
//! When that brace arrives, the lexer consumes the entire C++ block as
//! one `RawToken` (`RawToken::Brace(BraceKind::CppBlock)`), tracking nested
//! braces, strings, and comments on the way. If the closing `}` is
//! missing, everything to EOF is treated as C++; downstream parsers must
//! handle that gracefully.

use logos::Lexer;

/// Operating mode of the lexer's small state machine.
///
/// The mode decides how the next `{` token is interpreted: as an ordinary
/// brace or as the start of an embedded C++ block.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
enum LexerMode {
    /// Ordinary `UnrealScript` lexing. This is the default (initial) mode.
    #[default]
    Normal,
    /// A `cpptext`/`cppstruct` keyword was lexed and the next significant
    /// character is `{`; that brace opens an embedded C++ block.
    AwaitingCppBlock,
}

/// State carried alongside the lexer between tokens.
///
/// It is plugged into logos through `#[logos(extras = LexerState)]` and, for
/// now, stores nothing but the current [`LexerMode`]. The default state starts
/// in the default mode.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct LexerState {
    /// Mode the lexer is currently in.
    mode: LexerMode,
}

/// Tells apart the two meanings a `{` token can have.
///
/// Carried as the payload of the left-brace token so that consumers can see
/// whether the token is a lone brace or a whole embedded C++ block.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BraceKind {
    /// A plain `UnrealScript` `{`; the token covers just that character.
    Normal,
    /// A `{` opening an embedded C++ block; the token extends through the
    /// matching `}`.
    CppBlock,
}

/// Tokens produced by the `UnrealScript` lexer.
///
/// Includes both syntactic tokens and trivia such as whitespace, newlines,
/// and comments.
///
/// Keywords declared with `ignore(case)` (and the `(?i)` regexes) match
/// regardless of letter case. The lexer carries a [`LexerState`] as its logos
/// "extras", which the `cpptext`/`cppstruct` and `{` callbacks use to decide
/// whether a `{` starts an embedded C++ block.
#[derive(logos::Logos, Debug, PartialEq, Eq, Hash, Clone, Copy)]
#[logos(extras = LexerState)]
pub enum RawToken {
    // # Compiler/directive keywords
    // `#exec` followed by the rest of the line, including the line terminator
    // when one is present.
    #[regex(r"(?i)#exec[^\r\n]*(?:\r\n|\n|\r)?")]
    ExecDirective,
    // After `cpptext`, arm the C++-block mode only when the next non-trivia
    // character is `{`; otherwise make sure the mode is reset to normal.
    #[regex("(?i)cpptext", |lex| {
        if is_next_nontrivia_left_brace(lex) {
            lex.extras.mode = LexerMode::AwaitingCppBlock;
        } else {
            lex.extras.mode = LexerMode::Normal;
        }
    })]
    CppText,

    // Same mode handling as `cpptext` above.
    #[regex("(?i)cppstruct", |lex| {
        if is_next_nontrivia_left_brace(lex) {
            lex.extras.mode = LexerMode::AwaitingCppBlock;
        } else {
            lex.extras.mode = LexerMode::Normal;
        }
    })]
    CppStruct,
    // # Declaration & structural keywords
    #[token("class", ignore(case))]
    Class,
    #[token("struct", ignore(case))]
    Struct,
    #[token("enum", ignore(case))]
    Enum,
    #[token("state", ignore(case))]
    State,
    #[token("auto", ignore(case))]
    Auto,
    #[token("function", ignore(case))]
    Function,
    #[token("event", ignore(case))]
    Event,
    #[token("delegate", ignore(case))]
    Delegate,
    #[token("var", ignore(case))]
    Var,
    #[token("local", ignore(case))]
    Local,

    // # Inheritance, interface, dependencies
    #[token("extends", ignore(case))]
    Extends,
    #[token("dependson", ignore(case))]
    DependsOn,

    // # Access modifiers & properties
    #[token("private", ignore(case))]
    Private,
    #[token("protected", ignore(case))]
    Protected,
    #[token("public", ignore(case))]
    Public,
    #[token("const", ignore(case))]
    Const,
    #[token("static", ignore(case))]
    Static,
    #[token("native", ignore(case))]
    Native,
    #[token("abstract", ignore(case))]
    Abstract,
    #[token("deprecated", ignore(case))]
    Deprecated,
    #[token("safereplace", ignore(case))]
    SafeReplace,
    #[token("exportstructs", ignore(case))]
    ExportStructs,
    #[token("input", ignore(case))]
    Input,

    // # UnrealScript metadata/specifiers
    #[token("final", ignore(case))]
    Final,
    #[token("default", ignore(case))]
    Default,
    #[token("defaultproperties", ignore(case))]
    DefaultProperties,
    #[token("object", ignore(case))]
    Object,
    #[token("begin", ignore(case))]
    Begin,
    #[token("end", ignore(case))]
    End,
    #[token("optional", ignore(case))]
    Optional,
    #[token("config", ignore(case))]
    Config,
    #[token("perobjectconfig", ignore(case))]
    PerObjectConfig,
    #[token("globalconfig", ignore(case))]
    GlobalConfig,
    #[token("collapsecategories", ignore(case))]
    CollapseCategories,
    #[token("dontcollapsecategories", ignore(case))]
    DontCollapseCategories,
    #[token("hidecategories", ignore(case))]
    HideCategories,
    #[token("showcategories", ignore(case))]
    ShowCategories,
    #[token("localized", ignore(case))]
    Localized,
    #[token("placeable", ignore(case))]
    Placeable,
    #[token("notplaceable", ignore(case))]
    NotPlaceable,
    #[token("instanced", ignore(case))]
    Instanced,
    #[token("editconst", ignore(case))]
    EditConst,
    #[token("editconstarray", ignore(case))]
    EditConstArray,
    #[token("editinline", ignore(case))]
    EditInline,
    #[token("editinlineuse", ignore(case))]
    EditInlineUse,
    #[token("editinlinenew", ignore(case))]
    EditInlineNew,
    #[token("noteditinlinenew", ignore(case))]
    NotEditInlineNew,
    #[token("edfindable", ignore(case))]
    EdFindable,
    #[token("editinlinenotify", ignore(case))]
    EditInlineNotify,
    #[token("parseconfig", ignore(case))]
    ParseConfig,
    #[token("automated", ignore(case))]
    Automated,
    #[token("dynamicrecompile", ignore(case))]
    DynamicRecompile,
    #[token("transient", ignore(case))]
    Transient,
    #[token("long", ignore(case))]
    Long,
    #[token("operator", ignore(case))]
    Operator,
    #[token("preoperator", ignore(case))]
    PreOperator,
    #[token("postoperator", ignore(case))]
    PostOperator,
    #[token("simulated", ignore(case))]
    Simulated,
    #[token("exec", ignore(case))]
    Exec,
    #[token("latent", ignore(case))]
    Latent,
    #[token("iterator", ignore(case))]
    Iterator,
    #[token("out", ignore(case))]
    Out,
    #[token("skip", ignore(case))]
    Skip,
    #[token("singular", ignore(case))]
    Singular,
    #[token("coerce", ignore(case))]
    Coerce,
    #[token("assert", ignore(case))]
    Assert,
    #[token("ignores", ignore(case))]
    Ignores,
    #[token("within", ignore(case))]
    Within,
    #[token("init", ignore(case))]
    Init,
    #[token("export", ignore(case))]
    Export,
    #[token("noexport", ignore(case))]
    NoExport,
    #[token("hidedropdown", ignore(case))]
    HideDropdown,
    #[token("travel", ignore(case))]
    Travel,
    #[token("cache", ignore(case))]
    Cache,
    #[token("cacheexempt", ignore(case))]
    CacheExempt,

    // # Replication-related
    #[token("reliable", ignore(case))]
    Reliable,
    #[token("unreliable", ignore(case))]
    Unreliable,
    #[token("replication", ignore(case))]
    Replication,
    #[token("nativereplication", ignore(case))]
    NativeReplication,

    // # Control-flow keywords
    #[token("goto", ignore(case))]
    Goto,
    #[token("if", ignore(case))]
    If,
    #[token("else", ignore(case))]
    Else,
    #[token("switch", ignore(case))]
    Switch,
    #[token("case", ignore(case))]
    Case,
    #[token("for", ignore(case))]
    For,
    #[token("foreach", ignore(case))]
    ForEach,
    #[token("while", ignore(case))]
    While,
    #[token("do", ignore(case))]
    Do,
    #[token("until", ignore(case))]
    Until,
    #[token("break", ignore(case))]
    Break,
    #[token("continue", ignore(case))]
    Continue,
    #[token("return", ignore(case))]
    Return,

    // # Built-in types
    #[token("int", ignore(case))]
    Int,
    #[token("float", ignore(case))]
    Float,
    #[token("bool", ignore(case))]
    Bool,
    #[token("byte", ignore(case))]
    Byte,
    #[token("string", ignore(case))]
    String,
    #[token("array", ignore(case))]
    Array,
    #[token("name", ignore(case))]
    Name,

    // # Literals
    // FloatLiteral is declared before IntegerLiteral and `.` with the intent
    // of giving it higher priority.
    // NOTE(review): logos is documented to choose between overlapping patterns
    // by match length and pattern priority rather than declaration order -
    // confirm whether this ordering actually matters.
    // The first pattern also recognizes things like: `1.foo`, `1.foo.bar`,
    // `1.2.3`. It has to, because UnrealScript source may contain such forms.
    // The other two patterns cover decimal floats with an optional exponent
    // and integers with a mandatory exponent; all accept an `f`/`F` suffix.
    #[regex(r"[0-9]+(?:\.(?:[0-9]+|[A-Za-z_][A-Za-z0-9_]*))+[fF]?")]
    #[regex(r"(?:[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)(?:[eE][+-]?[0-9]+)?[fF]?")]
    #[regex(r"[0-9]+[eE][+-]?[0-9]+[fF]?")]
    FloatLiteral,

    // Binary (`0b`), octal (`0o`) and hexadecimal (`0x`) literals allow a
    // single `_` between digits; the last pattern is a plain decimal integer.
    #[regex(r"0b[01](?:_?[01])*")]
    #[regex(r"0o[0-7](?:_?[0-7])*")]
    #[regex(r"0x[0-9A-Fa-f](?:_?[0-9A-Fa-f])*")]
    #[regex(r"[0-9][0-9]*")]
    IntegerLiteral,

    // Double-quoted string on a single line; a backslash escapes the
    // character that follows it.
    #[regex(r#""([^"\\\r\n]|\\.)*""#)]
    StringLiteral,
    // Single-quoted name made of letters, digits, `_`, `.`, spaces and `-`
    // (may be empty).
    #[regex(r"'[a-zA-Z0-9_\. \-]*'")]
    NameLiteral,
    #[token("true", ignore(case))]
    True,
    #[token("false", ignore(case))]
    False,
    #[token("none", ignore(case))]
    None,
    #[token("self", ignore(case))]
    SelfValue,
    #[token("new", ignore(case))]
    New,
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")]
    Identifier,

    // # Operations
    // ## Exponentiation
    #[token("**")]
    Exponentiation,
    // ## Unary
    #[token("++")]
    Increment,
    #[token("--")]
    Decrement,
    #[token("!")]
    Not,
    #[token("~")]
    BitwiseNot,
    // ## Vector
    #[token("dot", ignore(case))]
    Dot,
    #[token("cross", ignore(case))]
    Cross,
    // ## Multiplicative
    #[token("*")]
    Multiply,
    #[token("/")]
    Divide,
    #[token("%")]
    Modulo,
    // ## Additive
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    // ## String manipulation
    #[token("@")]
    ConcatSpace,
    #[token("$")]
    Concat,
    // ## Shifts
    #[token("<<")]
    LeftShift,
    #[token(">>>")]
    LogicalRightShift,
    #[token(">>")]
    RightShift,
    // ## Relational
    #[token("<")]
    Less,
    #[token("<=")]
    LessEqual,
    #[token(">")]
    Greater,
    #[token(">=")]
    GreaterEqual,
    #[token("==")]
    Equal,
    #[token("!=")]
    NotEqual,
    #[token("~=")]
    ApproximatelyEqual,
    #[token("clockwisefrom", ignore(case))]
    ClockwiseFrom,
    // ## Bitwise
    #[token("&")]
    BitwiseAnd,
    #[token("|")]
    BitwiseOr,
    #[token("^")]
    BitwiseXor,
    // ## Logical
    #[token("&&")]
    LogicalAnd,
    #[token("^^")]
    LogicalXor,
    #[token("||")]
    LogicalOr,
    // ## Assignments
    #[token("=")]
    Assign,
    #[token("*=")]
    MultiplyAssign,
    #[token("/=")]
    DivideAssign,
    #[token("%=")]
    ModuloAssign,
    #[token("+=")]
    PlusAssign,
    #[token("-=")]
    MinusAssign,
    #[token("$=")]
    ConcatAssign,
    #[token("@=")]
    ConcatSpaceAssign,

    // # Punctuation & delimiters
    #[token("(")]
    LeftParenthesis,
    #[token(")")]
    RightParenthesis,
    /// A `{`. The callback decides, based on the current lexer mode, whether
    /// this is an ordinary brace or an entire embedded C++ block.
    #[token("{", process_left_brace)]
    Brace(BraceKind),
    #[token("}")]
    RightBrace,
    #[token("[")]
    LeftBracket,
    #[token("]")]
    RightBracket,
    #[token(";")]
    Semicolon,
    #[token(",")]
    Comma,
    #[token(".")]
    Period,
    #[token(":")]
    Colon,
    #[token("#")]
    Hash,
    #[token("?")]
    Question,

    // # Comments & whitespaces
    // A line comment stops before the line terminator, which is lexed
    // separately as `Newline`.
    #[regex(r"//[^\r\n]*")]
    LineComment,
    // Only the opening `/*` is matched by the pattern; the callback consumes
    // the rest of the comment.
    #[regex(r"/\*", handle_block_comment)]
    BlockComment,
    #[regex(r"\r\n|\n|\r")]
    Newline,
    #[regex(r"[ \t]+")]
    Whitespace,

    // # Technical
    /// No logos pattern is attached to this variant, so the derived lexer
    /// never matches it from input text directly.
    // NOTE(review): presumably assigned by the code wrapping this lexer when
    // logos reports unrecognized input - confirm against the caller.
    Error,
}

/// Logos callback that swallows the remainder of an `UnrealScript`
/// `/* ... */` block comment once the opening `/*` has been matched.
///
/// `UnrealScript` block comments nest, so every further `/*` must be balanced
/// by its own `*/`. The token ends right after the `*/` that closes the
/// outermost comment; if that never appears, the token extends to the end of
/// the input.
fn handle_block_comment(lexer: &mut Lexer<RawToken>) {
    // The opening `/*` matched by the token pattern counts as level one.
    let mut nesting = 1;
    loop {
        let rest = lexer.remainder();
        if rest.starts_with("/*") {
            nesting += 1;
            lexer.bump(2);
        } else if rest.starts_with("*/") {
            nesting -= 1;
            lexer.bump(2);
            if nesting == 0 {
                return;
            }
        } else if let Some(character) = rest.chars().next() {
            // Ordinary comment content: advance by one whole character.
            lexer.bump(character.len_utf8());
        } else {
            // Unterminated comment: the input is exhausted.
            return;
        }
    }
}

/// Logos callback for `{`: classifies the brace using the current lexer mode.
///
/// In normal mode the brace is an ordinary `UnrealScript` brace and
/// [`BraceKind::Normal`] is returned. If the lexer was waiting for a C++ block
/// (set up by `cpptext` or `cppstruct`), the whole embedded block is consumed
/// as part of this token and [`BraceKind::CppBlock`] is returned.
///
/// In either case the lexer is left in normal mode.
fn process_left_brace(lexer: &mut Lexer<RawToken>) -> BraceKind {
    // Read the mode and reset it in one step; for `Normal` this is a no-op.
    match std::mem::replace(&mut lexer.extras.mode, LexerMode::Normal) {
        LexerMode::AwaitingCppBlock => {
            consume_cpp_block(lexer);
            BraceKind::CppBlock
        }
        LexerMode::Normal => BraceKind::Normal,
    }
}

/// Advances the lexer over an embedded C++ block whose opening `{` has
/// already been consumed by the caller.
///
/// While scanning, the following constructs are recognized so that braces
/// inside them do not affect nesting:
///   - Nested `{...}` pairs (counted)
///   - Quoted literals (`"..."` and `'...'`), including escaped quotes
///   - Line comments (`// ...` through the first `\n` or `\r`)
///   - Block comments (`/* ... */`, non-nesting)
///
/// On return the lexer sits just past the `}` matching the opening brace, or
/// at the end of input if the block is never closed.
///
/// This is deliberately simple: it targets UE2-era C++ blocks.
fn consume_cpp_block(lexer: &mut Lexer<RawToken>) {
    // The caller's `{` is the first open brace.
    let mut open_braces = 1;
    while open_braces > 0 {
        let rest = lexer.remainder();
        let Some(current) = rest.chars().next() else {
            // Ran out of input before the block was closed.
            return;
        };
        if rest.starts_with("/*") {
            lexer.bump(2); // the two-byte opener `/*`
            consume_c_style_block_comment(lexer);
        } else if rest.starts_with("//") {
            // Take the `//`, the comment text, and the first line-break
            // character (if any) in one step.
            let comment_length = rest[2..]
                .find(|character: char| character == '\n' || character == '\r')
                .map_or(rest.len(), |line_end| 2 + line_end + 1);
            lexer.bump(comment_length);
        } else {
            lexer.bump(current.len_utf8());
            match current {
                '{' => open_braces += 1,
                '}' => open_braces -= 1,
                // The opening quote was just consumed above.
                '"' | '\'' => consume_quoted_cpp_literal(lexer, current),
                _ => {}
            }
        }
    }
}

/// Skips the body of a C-style `/* ... */` comment, which does not nest.
///
/// The opening `/*` must already have been consumed. The lexer is advanced
/// past the first `*/`; when there is none, it is advanced to the end of
/// the input.
fn consume_c_style_block_comment(lexer: &mut Lexer<RawToken>) {
    let rest = lexer.remainder();
    let consumed = match rest.find("*/") {
        // Include the two-byte terminator itself.
        Some(terminator_start) => terminator_start + 2,
        None => rest.len(),
    };
    lexer.bump(consumed);
}

/// Skips the rest of a quoted C++ string or character literal.
///
/// The opening quote must already have been consumed; `delimiter` is the
/// quote character that closes the literal. A backslash escapes the character
/// after it, so an escaped delimiter does not end the literal. The lexer ends
/// up just past the closing delimiter, or at the end of input when the
/// literal is unterminated.
fn consume_quoted_cpp_literal(lexer: &mut Lexer<RawToken>, delimiter: char) {
    let rest = lexer.remainder();
    // Unless a closing delimiter is found, everything that is left is consumed.
    let mut consumed = rest.len();
    let mut characters = rest.char_indices();
    while let Some((offset, character)) = characters.next() {
        if character == '\\' {
            // Step over the escaped character, whatever it is.
            characters.next();
        } else if character == delimiter {
            consumed = offset + character.len_utf8();
            break;
        }
    }
    lexer.bump(consumed);
}

/// Looks ahead (without consuming anything) past any "trivia" and tells
/// whether the first significant character is `{`.
///
/// Trivia skipped by the lookahead:
/// - Spaces and tabs
/// - Newline characters (`\r`, `\n`)
/// - Line comments (`// ...`)
/// - Block comments (`/* ... */`), which may nest
///
/// Called right after `cpptext` or `cppstruct` has been lexed, since
/// arbitrary trivia may separate the keyword from the opening brace of the
/// embedded C++ block.
///
/// Returns `true` only when the next non-trivia character is `{`. Returns
/// `false` for any other character, and also when the input runs out while
/// trivia is being skipped (including inside an unterminated block comment).
fn is_next_nontrivia_left_brace(lexer: &Lexer<RawToken>) -> bool {
    let mut rest = lexer.remainder();

    loop {
        rest = rest.trim_start_matches(|character: char| {
            matches!(character, ' ' | '\t' | '\r' | '\n')
        });

        if let Some(comment) = rest.strip_prefix("//") {
            // Drop the comment text together with the first line-break
            // character; without one, the comment runs to the end of input.
            rest = match comment.find(|character: char| character == '\n' || character == '\r') {
                Some(line_end) => &comment[line_end + 1..],
                None => "",
            };
        } else if let Some(comment) = rest.strip_prefix("/*") {
            let mut body = comment;
            let mut open_comments = 1;
            while open_comments > 0 {
                if let Some(inner) = body.strip_prefix("/*") {
                    open_comments += 1;
                    body = inner;
                } else if let Some(inner) = body.strip_prefix("*/") {
                    open_comments -= 1;
                    body = inner;
                } else {
                    let mut characters = body.chars();
                    if characters.next().is_none() {
                        // Unterminated block comment.
                        return false;
                    }
                    body = characters.as_str();
                }
            }
            rest = body;
        } else {
            // Either a significant character or the end of input.
            return rest.starts_with('{');
        }
    }
}