Initial commit

dkanus 2025-07-30 19:46:37 +07:00
commit 4b9d6a6adb
13 changed files with 2308 additions and 0 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
/target
flamegraph.svg
perf.data

1104
Cargo.lock generated Normal file

File diff suppressed because it is too large

26
Cargo.toml Normal file

@@ -0,0 +1,26 @@
[workspace]
resolver = "2"
members = ["dev_tests", "rottlsp", "rottlib"]

[workspace.package]
edition = "2024"

[workspace.lints.clippy]
all = "warn"
nursery = "warn"
pedantic = "warn"

[profile.release]
opt-level = 3           # Optimize for speed
strip = true            # Strip symbols from the binary
lto = true              # Enable link-time optimization
panic = "abort"         # Abort on panic
overflow-checks = false # Disable integer overflow checks
codegen-units = 1       # Fewer codegen units allow more thorough optimization
debug = false           # Omit debug info

[profile.flamegraph]
inherits = "release"         # start from the release profile
strip = false                # keep symbols for profilers
debug = true                 # full DWARF info for unwinding
split-debuginfo = "unpacked" # keep debug info available rather than packing it separately

23
dev_tests/Cargo.toml Normal file

@@ -0,0 +1,23 @@
[package]
name = "dev_tests"
version = "0.1.0"
edition = "2024"

[[bin]]
name = "dump_tokens"
path = "src/dump_tokens.rs"

[[bin]]
name = "uc_lexer_verify"
path = "src/uc_lexer_verify.rs"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
rottlib = { version = "0", path = "../rottlib", features = ["debug"] }
walkdir = "2.5"
encoding_rs = "0.8"
chardet = "0.2"

[lints]
workspace = true

76
dev_tests/src/dump_tokens.rs Normal file

@@ -0,0 +1,76 @@
use std::{
fs,
path::{Path, PathBuf},
};
use encoding_rs::{Encoding, UTF_8};
use rottlib::lexer::{DebugTools, TokenizedFile};
/// Recursively search `root` for the first file whose *basename* matches
/// `needle` (case-sensitive).
///
/// Returns the canonicalized absolute path, or [`None`] if nothing matches.
fn find_file(root: &Path, needle: &str) -> Option<PathBuf> {
for entry in walkdir::WalkDir::new(root)
.into_iter()
.filter_map(Result::ok)
{
let path = entry.path();
if path.is_file() && (path.file_name().and_then(|name| name.to_str()) == Some(needle)) {
return fs::canonicalize(path).ok();
}
}
None
}
/// CLI: `dump_tokens <root_dir> <file_name>` - searches for `<file_name>`
/// recursively inside `<root_dir>`.
///
/// This utility takes a *root directory* and a *file name* instead of a full
/// path, sparing us from hunting down and typing out complete paths:
///
/// - We know where all the sources are;
/// - We usually just know the name of the file that is being problematic.
fn main() {
let mut args = std::env::args().skip(1);
let root_dir = args.next().unwrap_or_else(|| {
eprintln!("Usage: inspect_uc <root_dir> <file_name>");
std::process::exit(1);
});
let file_name = args.next().unwrap_or_else(|| {
eprintln!("Usage: inspect_uc <root_dir> <file_name>");
std::process::exit(1);
});
let root = PathBuf::from(&root_dir);
if !root.exists() {
eprintln!("Root directory '{root_dir}' does not exist.");
std::process::exit(1);
}
let found_path = find_file(&root, &file_name).map_or_else(
|| {
eprintln!("File '{file_name}' not found under '{root_dir}'.");
std::process::exit(1);
},
|path| path,
);
// Read & decode
let raw_bytes = match fs::read(&found_path) {
Ok(sources) => sources,
Err(error) => {
eprintln!("Could not read {}: {error}", found_path.display());
std::process::exit(1);
}
};
let (encoding_label, _, _) = chardet::detect(&raw_bytes);
let encoding = Encoding::for_label(encoding_label.as_bytes()).unwrap_or(UTF_8);
let (decoded_str, _, _) = encoding.decode(&raw_bytes);
let source_text = decoded_str.to_string();
let tokenized_file = TokenizedFile::from_source(&source_text);
tokenized_file.dump_debug_layout();
}

122
dev_tests/src/uc_lexer_verify.rs Normal file

@@ -0,0 +1,122 @@
use std::{collections::HashSet, fs, path::PathBuf};
use rottlib::lexer::{DebugTools, TokenizedFile};
/// Read `ignore.txt` (one path per line, `#` for comments) from the root
/// directory and turn it into a canonicalized [`HashSet<PathBuf>`].
fn load_ignore_set(root: &std::path::Path) -> HashSet<PathBuf> {
let ignore_file = root.join("ignore.txt");
if !ignore_file.exists() {
return HashSet::new();
}
let content = match fs::read_to_string(&ignore_file) {
Ok(content) => content,
Err(error) => {
eprintln!("Could not read {}: {error}", ignore_file.display());
return HashSet::new();
}
};
content
.lines()
.map(str::trim)
.filter(|line| !line.is_empty() && !line.starts_with('#'))
.filter_map(|line| {
let next_path = PathBuf::from(line);
let absolute_path = if next_path.is_absolute() {
next_path
} else {
root.join(next_path)
};
fs::canonicalize(absolute_path).ok()
})
.collect()
}
/// CLI: `uc_lexer_verify <root_dir>` - finds all `.uc` files in the provided
/// directory (except those listed in `ignore.txt` in the root) and tests them.
///
/// Reported execution time covers tokenization only; the time it takes to read
/// files from disk is not counted.
///
/// `ignore.txt` is for listing specific files, not directories.
fn main() {
    let root_dir = std::env::args().nth(1).unwrap(); // crashing is fine for a debug utility
let root = PathBuf::from(&root_dir);
if !root.exists() {
eprintln!("Root directory '{root_dir}' does not exist.");
std::process::exit(1);
}
// Load files
let ignored_paths = load_ignore_set(&root);
let mut uc_files: Vec<(PathBuf, String)> = Vec::new();
for entry in walkdir::WalkDir::new(&root)
.into_iter()
.filter_map(Result::ok) // for debug tool this is ok
.filter(|entry| {
let path = entry.path();
// Skip anything explicitly ignored
if let Ok(absolute_path) = fs::canonicalize(path) {
if ignored_paths.contains(&absolute_path) {
return false;
}
}
// Must be *.uc
path.is_file()
&& path
.extension()
.and_then(|extension| extension.to_str())
.is_some_and(|extension| extension.eq_ignore_ascii_case("uc"))
})
{
let path = entry.path();
match fs::read(path) {
Ok(raw_bytes) => {
// Autodetect encoding for old Unreal script sources
let (encoding_label, _, _) = chardet::detect(&raw_bytes);
let encoding = encoding_rs::Encoding::for_label(encoding_label.as_bytes())
.unwrap_or(encoding_rs::UTF_8);
let (decoded_text, _, _) = encoding.decode(&raw_bytes);
uc_files.push((path.to_path_buf(), decoded_text.into_owned()));
}
Err(error) => {
eprintln!("Failed to read `{}`: {error}", path.display());
std::process::exit(1);
}
}
}
println!("Loaded {} .uc files into memory.", uc_files.len());
// Tokenize and measure performance
let start_time = std::time::Instant::now();
let tokenized_files: Vec<(PathBuf, TokenizedFile)> = uc_files
.iter()
.map(|(path, source_code)| {
let tokenized_file = TokenizedFile::from_source(source_code);
if tokenized_file.had_errors() {
println!("TK: {}", path.display());
}
(path.clone(), tokenized_file)
})
.collect();
let elapsed_time = start_time.elapsed();
println!(
"Tokenized {} files in {:.2?}",
tokenized_files.len(),
elapsed_time
);
// Roundtrip check
for ((path, original), (_, tokenized_file)) in uc_files.iter().zip(tokenized_files.iter()) {
let reconstructed = tokenized_file.reconstruct_source();
if original != &reconstructed {
eprintln!("Reconstruction mismatch in `{}`!", path.display());
std::process::exit(1);
}
}
println!("All .uc files matched successfully.");
}
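The roundtrip comparison above is the invariant the whole verifier rests on: tokenization must be lossless. A minimal sketch of the same property as a unit test (hypothetical, assuming the `debug` feature or a debug build so that `DebugTools` is exported):

#[cfg(test)]
mod roundtrip_sketch {
    use rottlib::lexer::{DebugTools, TokenizedFile};

    #[test]
    fn reconstruction_is_lossless() {
        // Mixed line endings and a comment exercise the per-line bookkeeping.
        let source = "class Example extends Object;\r\n\r\n// note\nvar int Health;\n";
        let tokenized = TokenizedFile::from_source(source);
        assert!(!tokenized.had_errors());
        assert_eq!(tokenized.reconstruct_source(), source);
    }
}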

11
rottlib/Cargo.toml Normal file

@@ -0,0 +1,11 @@
[package]
name = "rottlib"
version = "0.1.0"
edition = "2024"

[features]
default = []
debug = []

[dependencies]
logos = "0.15"

92
rottlib/src/lexer/debug_tools.rs Normal file

@@ -0,0 +1,92 @@
//! Debug-only helpers for [`TokenizedFile`]
//!
//! This module is **compiled only if**
//!
//! * the current build profile has `debug_assertions` enabled, or
//! * the crate is built with the `debug` cargo feature.
//!
//! The `cfg` gating itself lives in the parent module.
use super::Line;
/// A technical trait that adds debug helpers to the lexer.
pub trait DebugTools {
/// Pretty-prints the internal layout of the tokenized file - useful when
/// writing new passes or hunting lexer bugs.
///
/// This method writes the layout directly to standard output.
///
/// The format is unspecified, may change, and is not intended for
/// external tools.
///
/// Each line in the printed layout starts with its 0-based number for
/// convenience.
fn dump_debug_layout(&self);
/// Reconstructs the exact, lossless source text that was fed to
/// [`super::TokenizedFile::from_source`] from internal representation -
/// useful for manually verifying that the lexer works.
fn reconstruct_source(&self) -> String;
}
impl<'src> DebugTools for super::TokenizedFile<'src> {
fn reconstruct_source(&self) -> String {
let mut result = String::new();
for line in &self.lines {
if let Line::Standalone(token_range) | Line::SpannedWithTokens(_, token_range) = line {
for span in &self.buffer[token_range.clone()] {
result.push_str(span.lexeme);
}
}
}
result
}
fn dump_debug_layout(&self) {
for (row_index, line) in self.lines.iter().enumerate() {
println!("Line {}", row_index + 1);
match line {
Line::Standalone(token_range) => {
println!("\t[Standalone]");
let mut column_utf16 = 0usize;
for next_token_span in &self.buffer[token_range.clone()] {
let token_beginning = column_utf16;
let token_end = column_utf16 + next_token_span.length_utf16;
println!(
"\t\t{:?} @ {}-{}: {:?}",
next_token_span.token,
token_beginning,
token_end,
next_token_span.lexeme
);
column_utf16 = token_end;
}
}
Line::Spanned(origin_row) => {
// `origin_row` is 0-based
println!(
"\t[Continued from line {} - no new tokens here]",
origin_row + 1
);
}
Line::SpannedWithTokens(origin_row, token_range) => {
// `origin_row` is 0-based
println!("\t[Continued from line {} + new tokens]", origin_row + 1);
let mut column_utf16 = 0usize;
for next_token_span in &self.buffer[token_range.clone()] {
let token_beginning = column_utf16;
let token_end = column_utf16 + next_token_span.length_utf16;
println!(
"\t\t{:?} @ {}-{}: {:?}",
next_token_span.token,
token_beginning,
token_end,
next_token_span.lexeme
);
column_utf16 = token_end;
}
}
}
}
}
}
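For reference, a minimal, hypothetical driver showing how these helpers are meant to be used (the exact output format of `dump_debug_layout` is unspecified, as noted above); it assumes a debug build or the `debug` feature so that `DebugTools` is exported:

use rottlib::lexer::{DebugTools, TokenizedFile};

fn main() {
    // Two physical lines; the block comment spans both of them, so line 2
    // should be reported as continued from line 1.
    let source = "var int Health; /* hit\npoints */ var bool bAlive;\n";
    let file = TokenizedFile::from_source(source);
    file.dump_debug_layout(); // prints the per-line token listing to stdout
    assert_eq!(file.reconstruct_source(), source); // lossless roundtrip
}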

476
rottlib/src/lexer/lexing.rs Normal file

@@ -0,0 +1,476 @@
//! Lexer for UnrealScript that understands inline `cpptext { ... }` blocks.
//!
//! ## Notable details
//!
//! In UnrealScript, `cpptext` lets authors embed raw C++ between braces.
//! Because whitespace, newlines, or comments may appear between the
//! `cpptext` keyword and the opening `{`, the lexer must remember that
//! it has just seen `cpptext` - hence a state machine.
//!
//! Modes
//! ------
//! - **Normal** - ordinary UnrealScript tokens.
//! - **AwaitingCppBlock** - after `cpptext`, waiting for the next `{`.
//!
//! When that brace arrives, the lexer consumes the entire C++ block as
//! one token (`Token::Brace(BraceKind::CppBlock)`), tracking nested
//! braces, strings, and comments on the way. If the closing `}` is
//! missing, everything to EOF is treated as C++; downstream parsers must
//! handle that gracefully.
use logos::Lexer;
/// Which lexer mode we're in. See the module docs for the full story.
#[derive(Default, Clone, Copy, PartialEq, Eq)]
enum LexerMode {
/// Lexing regular UnrealScript.
#[default]
Normal,
/// Saw `cpptext`; waiting for the opening `{` of a C++ block.
AwaitingCppBlock,
}
/// Extra per-lexer state. Currently just holds the [`LexerMode`].
///
/// This is a logos-specific implementation detail.
#[derive(Default)]
pub struct LexerState {
mode: LexerMode,
}
/// Are these braces "real" UnrealScript braces, or the start/end of a C++ block?
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum BraceKind {
Normal,
CppBlock,
}
/// All UnrealScript tokens that our compiler distinguishes.
#[derive(logos::Logos, Debug, PartialEq, Clone, Copy)]
#[logos(extras = LexerState)]
pub enum Token {
// # Compiler/directive keywords
#[regex(r"(?i)#exec[^\r\n]*(\r|\n|\r\n)")]
ExecDirective,
#[regex("(?i)cpptext", |lex| { lex.extras.mode = LexerMode::AwaitingCppBlock; })]
CppText,
// # Declaration & structural keywords
#[regex("(?i)class")]
Class,
#[regex("(?i)struct")]
Struct,
#[regex("(?i)enum")]
Enum,
#[regex("(?i)state")]
State,
#[regex("(?i)function")]
Function,
#[regex("(?i)event")]
Event,
#[regex("(?i)delegate")]
Delegate,
#[regex("(?i)var")]
Var,
#[regex("(?i)local")]
Local,
// # Inheritance, interface, dependencies
#[regex("(?i)extends")]
Extends,
#[regex("(?i)dependson")]
DependsOn,
// # Access modifiers & properties
#[regex("(?i)private")]
Private,
#[regex("(?i)protected")]
Protected,
#[regex("(?i)public")]
Public,
#[regex("(?i)const")]
Const,
#[regex("(?i)static")]
Static,
#[regex("(?i)native")]
Native,
#[regex("(?i)abstract")]
Abstract,
#[regex("(?i)deprecated")]
Deprecated,
// # UnrealScript metadata/specifiers
#[regex("(?i)default")]
Default,
#[regex("(?i)defaultproperties")]
DefaultProperties,
#[regex("(?i)optional")]
Optional,
#[regex("(?i)config")]
Config,
#[regex("(?i)perobjectconfig")]
PerObjectConfig,
#[regex("(?i)globalconfig")]
GlobalConfig,
#[regex("(?i)collapsecategories")]
CollapseCategories,
#[regex("(?i)dontcollapsecategories")]
DontCollapseCategories,
#[regex("(?i)hidecategories")]
HideCategories,
#[regex("(?i)localized")]
Localized,
#[regex("(?i)placeable")]
Placeable,
#[regex("(?i)notplaceable")]
NotPlaceable,
#[regex("(?i)editinlinenew")]
EditInlineNew,
#[regex("(?i)noteditinlinenew")]
NotEditInlineNew,
#[regex("(?i)dynamicrecompile")]
DynamicRecompile,
#[regex("(?i)transient")]
Transient,
#[regex("(?i)operator")]
Operator,
#[regex("(?i)simulated")]
Simulated,
#[regex("(?i)latent")]
Latent,
#[regex("(?i)iterator")]
Iterator,
#[regex("(?i)out")]
Out,
#[regex("(?i)skip")]
Skip,
#[regex("(?i)singular")]
Singular,
#[regex("(?i)coerce")]
Coerce,
#[regex("(?i)assert")]
Assert,
#[regex("(?i)ignores")]
Ignores,
#[regex("(?i)within")]
Within,
#[regex("(?i)noexport")]
NoExport,
// # Replication-related
#[regex("(?i)reliable")]
Reliable,
#[regex("(?i)unreliable")]
Unreliable,
#[regex("(?i)replication")]
Replication,
#[regex("(?i)nativereplication")]
NativeReplication,
// # Control-flow keywords
#[regex("(?i)if")]
If,
#[regex("(?i)else")]
Else,
#[regex("(?i)switch")]
Switch,
#[regex("(?i)case")]
Case,
#[regex("(?i)for")]
For,
#[regex("(?i)foreach")]
ForEach,
#[regex("(?i)while")]
While,
#[regex("(?i)do")]
Do,
#[regex("(?i)until")]
Until,
#[regex("(?i)break")]
Break,
#[regex("(?i)continue")]
Continue,
#[regex("(?i)return")]
Return,
// # Built-in types
#[regex("(?i)int")]
Int,
#[regex("(?i)float")]
Float,
#[regex("(?i)bool")]
Bool,
#[regex("(?i)byte")]
Byte,
#[regex("(?i)string")]
String,
#[regex("(?i)array")]
Array,
#[regex("(?i)name")]
Name,
// # Literals & identifiers
#[regex(r"0[xX][0-9A-Fa-f]+|[0-9]+")]
IntegerLiteral,
#[regex(r"[0-9]*\.[0-9]+([eE][+-]?[0-9]+)?")]
FloatLiteral,
#[regex(r#""([^"\\\r\n]|\\.)*""#)]
StringLiteral,
#[regex(r"'[a-zA-Z0-9_\. \-]*'")]
NameLiteral,
#[regex("(?i)true")]
True,
#[regex("(?i)false")]
False,
#[regex("(?i)none")]
None,
#[regex("(?i)self")]
SelfKeyword,
#[regex("(?i)new")]
New,
#[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")]
Identifier,
// # Operations
// ## Exponentiation
#[token("**")]
Exponentiation,
// ## Unary
#[token("++")]
Increment,
#[token("--")]
Decrement,
#[token("!")]
Not,
#[token("~")]
BitwiseNot,
// ## Vector
#[token("dot")]
Dot,
#[token("cross")]
Cross,
// ## Multiplicative
#[token("*")]
Multiply,
#[token("/")]
Divide,
#[token("%")]
Modulo,
// ## Additive
#[token("+")]
Plus,
#[token("-")]
Minus,
// ## String manipulation
#[token("@")]
AtChar,
#[token("$")]
DollarChar,
// ## Shifts
#[token("<<")]
LeftShift,
#[token(">>>")]
LogicalRightShift,
#[token(">>")]
RightShift,
// ## Relational
#[token("<")]
Less,
#[token("<=")]
LessEqual,
#[token(">")]
Greater,
#[token(">=")]
GreaterEqual,
#[token("==")]
Equal,
#[token("!=")]
NotEqual,
#[token("~=")]
ApproximatelyEqual,
// ## Bitwise
#[token("&")]
BitwiseAnd,
#[token("|")]
BitwiseOr,
#[token("^")]
BitwiseXor,
#[token("^^")]
BooleanXor,
// ## Logical
#[token("&&")]
And,
#[token("||")]
Or,
    // ## Assignments
#[token("=")]
Assign,
#[token("*=")]
MultiplyAssign,
#[token("/=")]
DivideAssign,
#[token("+=")]
PlusAssign,
#[token("-=")]
MinusAssign,
#[token("$=")]
ConcatAssign,
#[token("@=")]
ConcatSpaceAssign,
// # Punctuation & delimiters
#[token("(")]
LeftParen,
#[token(")")]
RightParen,
#[token("{", handle_brace)]
Brace(BraceKind),
#[token("}")]
RightBrace,
#[token("[")]
LeftBracket,
#[token("]")]
RightBracket,
#[token(";")]
Semicolon,
#[token(",")]
Comma,
#[token(".")]
Period,
#[token(":")]
Colon,
// # Comments & whitespaces
#[regex(r"//[^\r\n]*")]
LineComment,
#[regex(r"/\*", handle_block_comment)]
BlockComment,
#[regex(r"\r\n|\n|\r")]
NewLine,
#[regex(r"[ \t]+")]
Whitespace,
// # Technical
Error,
}
/// Consumes a `/* ... */` block comment with arbitrary nesting
/// (which UnrealScript allows).
///
/// Returns `Some(())` after consuming the whole comment (delimiters included),
/// or [`None`] if the file ends before every `/*` is closed.
fn handle_block_comment(lexer: &mut Lexer<Token>) -> Option<()> {
let mut comment_depth = 1;
while let Some(next_char) = lexer.remainder().chars().next() {
if lexer.remainder().starts_with("/*") {
comment_depth += 1;
lexer.bump(2);
continue;
}
if lexer.remainder().starts_with("*/") {
comment_depth -= 1;
lexer.bump(2);
if comment_depth == 0 {
return Some(());
}
continue;
}
lexer.bump(next_char.len_utf8());
}
// Unterminated comment
None
}
/// Called for every `{`.
///
/// Emits either a normal opening brace or a single token covering an entire
/// C++ block, depending on the lexer's current state.
fn handle_brace(lexer: &mut Lexer<Token>) -> Option<BraceKind> {
match lexer.extras.mode {
LexerMode::Normal => Some(BraceKind::Normal),
LexerMode::AwaitingCppBlock => {
lexer.extras.mode = LexerMode::Normal;
consume_cpp_block(lexer);
Some(BraceKind::CppBlock)
}
}
}
/// Consumes a complete C++ block, handling:
/// - Nested `{...}` pairs
/// - String literals (`"..."` and `'...'`), including escaped quotes
/// - Line comments (`// ...\n`)
/// - Block comments (`/* ... */`)
///
/// Leaves the lexer positioned immediately after the closing `}` of the block.
/// The opening `{` must have already been consumed by the caller.
fn consume_cpp_block(lexer: &mut Lexer<Token>) {
let mut depth = 1;
while let Some(ch) = lexer.remainder().chars().next() {
match ch {
'{' => {
depth += 1;
lexer.bump(1);
}
'}' => {
depth -= 1;
lexer.bump(1);
if depth == 0 {
break;
}
}
'/' if lexer.remainder().starts_with("/*") => {
lexer.bump(2); // consuming two-byte sequence `/*`
consume_c_comment(lexer)
}
'/' if lexer.remainder().starts_with("//") => {
lexer.bump(2); // consuming two-byte sequence `//`
while let Some(c) = lexer.remainder().chars().next() {
lexer.bump(c.len_utf8());
if c == '\n' {
break;
}
}
}
'"' | '\'' => {
lexer.bump(1); // skip `'` or `"`
consume_string_literal(lexer, ch);
}
_ => lexer.bump(ch.len_utf8()),
}
}
}
/// Consumes a C-style `/* ... */` comment (without nesting).
///
/// Assumes the opening `/*` has already been consumed.
fn consume_c_comment(lexer: &mut Lexer<Token>) {
while let Some(next_character) = lexer.remainder().chars().next() {
if lexer.remainder().starts_with("*/") {
lexer.bump(2);
break;
} else {
lexer.bump(next_character.len_utf8());
}
}
}
/// Consumes a string literal from C++ code.
///
/// Assumes the opening quote has already been consumed.
fn consume_string_literal(lexer: &mut Lexer<Token>, delimiter: char) {
while let Some(next_character) = lexer.remainder().chars().next() {
lexer.bump(next_character.len_utf8());
if next_character == '\\' {
// Skip the escaped character
if let Some(next) = lexer.remainder().chars().next() {
lexer.bump(next.len_utf8());
}
} else if next_character == delimiter {
return;
}
}
}
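A small sketch of the behaviour described in the module docs: everything between the `{` that follows `cpptext` and its matching `}` comes out as a single `Brace(CppBlock)` token, regardless of the braces, strings, or comments inside the C++ body. Since `BraceKind` is not re-exported outside the `lexer` module, this is sketched as a hypothetical test submodule inside `lexing.rs`:

#[cfg(test)]
mod cpptext_sketch {
    use logos::Logos;

    use super::{BraceKind, Token};

    #[test]
    fn cpp_block_is_one_token() {
        // Braces inside strings and comments must not confuse the counting.
        let source = "cpptext\n{\n    if (A) { B(\"}\"); } // }\n}\nvar int X;";
        let tokens: Vec<Token> = Token::lexer(source).map(Result::unwrap).collect();
        assert!(tokens.contains(&Token::Brace(BraceKind::CppBlock)));
        // The `}` closing the block is consumed as part of that token.
        assert_eq!(tokens.iter().filter(|t| **t == Token::RightBrace).count(), 0);
    }
}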

276
rottlib/src/lexer/mod.rs Normal file

@@ -0,0 +1,276 @@
//! # Tokenizer
//!
//! Converts raw source text into a lossless, position-aware stream of lexical
//! [`Token`]s, grouped *per physical line*, and returns it as
//! a [`TokenizedFile`].
//!
//! Design goals:
//!
//! 1. **Lossless**: preserving complete information for each token, enough to
//!    recreate the original bytes without loss.
//! 2. **LSP readiness**: the LSP protocol measures positions in UTF-16 code
//!    units, so we precompute the length of each token in that encoding,
//!    making interfacing easier.
//!
//! ## Opt-in debug helpers
//!
//! Extra diagnostics become available in **debug builds** or when the crate is
//! compiled with `debug` feature enabled. They live in the [`DebugTools`]
//! extension trait, implemented for [`TokenizedFile`].
//!
//! ```
//! // bring the trait and the type into scope
//! use rottlib::lexer::{DebugTools, TokenizedFile};
//!
//! let file = TokenizedFile::from_source("var int Health;\n");
//! file.dump_debug_layout();             // pretty-print token layout
//! let text = file.reconstruct_source(); // reconstruct original text
//! ```
mod debug_tools;
mod lexing;
use std::ops::Range;
use logos::Logos;
#[cfg(any(debug_assertions, feature = "debug"))]
pub use debug_tools::DebugTools;
pub use lexing::Token;
/// Empirically chosen starting size for token buffer (used during tokenization)
/// that provides good performance.
const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;
/// A slice tagged with its token kind plus two length counters.
///
/// *No absolute coordinates* are stored - they are recomputed per line.
#[derive(Debug, Clone, Copy)]
struct TokenSpan<'src> {
lexeme: &'src str,
token: Token,
length_utf16: usize,
}
/// Representation of a single physical line of the source file.
///
/// [`Range<usize>`] are used instead of slices to avoid creating
/// a self-referential struct (with [`TokenizedFile`]), which Rust forbids.
#[derive(Clone)]
enum Line {
/// A standalone line that owns a contiguous slice in
/// the [`TokenizedFile::buffer`] arena.
Standalone(Range<usize>),
    /// A line entirely covered by a multi-line token that started on another
    /// line; stores the 0-based number of that originating line.
    Spanned(usize),
    /// A line covered by a multi-line token started on another line *and*
    /// containing additional tokens of its own; stores the 0-based origin line
    /// and the range of those local tokens.
    SpannedWithTokens(usize, Range<usize>),
}
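// A worked example (sketch) of how the three variants combine: for the source
//
//     var int A; /* one
//     two
//     three */ var int B;
//
// the tokenizer produces a `lines` vector shaped like
//
//     [Standalone(0..8), Spanned(0), SpannedWithTokens(0, 8..15)]
//
// Line 0 owns its tokens up to and including the block comment, line 1 lies
// entirely inside that comment, and line 2 finishes the comment and then adds
// tokens of its own.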
/// A tokenized, lossless representation of an UnrealScript source file.
pub struct TokenizedFile<'src> {
/// Arena of every token span in this file.
buffer: Vec<TokenSpan<'src>>,
/// Mapping that provides an easy and efficient access to tokens by
/// line number.
lines: Vec<Line>,
/// Simple flag for marking erroneous state.
had_errors: bool,
}
/// Mutable state that encapsulates data needed during the tokenization loop.
struct Tokenizer<'src> {
/// Arena that owns every [`TokenSpan`] produced for the file.
buffer: Vec<TokenSpan<'src>>,
/// Mapping from physical line number to the tokens that belong to it.
lines: Vec<Line>,
/// The current 0-based physical line number.
line_number: usize,
/// Index in [`Tokenizer::buffer`] where the current line starts.
slice_start_index: usize,
/// When a multi-line token is being scanned, stores the 0-based line
/// on which it started; [`None`] otherwise.
multi_line_start: Option<usize>,
/// Set to [`true`] if the lexer reported any error tokens.
had_errors: bool,
}
impl<'src> TokenizedFile<'src> {
/// Tokenize `source` and return a fresh [`TokenizedFile`].
pub fn from_source(source: &'src str) -> TokenizedFile<'src> {
let mut tokenizer = TokenizedFile::<'src>::builder();
let mut lexer = Token::lexer(source);
        // Each token logos yields becomes a `TokenSpan` fed into the tokenizer.
while let Some(token_result) = lexer.next() {
let token = token_result.unwrap_or_else(|_| {
tokenizer.had_errors = true;
Token::Error
});
let token_span = build_span(token, lexer.slice());
tokenizer.process_token_span(token_span);
}
tokenizer.into_tokenized_file()
}
/// Returns [`true`] if any erroneous tokens were produced during building
/// of this [`TokenizedFile`].
pub fn had_errors(&self) -> bool {
self.had_errors
}
/// Create an empty tokenizer state with tuned buffer capacity.
fn builder() -> Tokenizer<'src> {
Tokenizer {
buffer: Vec::with_capacity(DEFAULT_TOKEN_BUFFER_CAPACITY),
lines: Vec::new(),
line_number: 0,
slice_start_index: 0,
multi_line_start: None,
had_errors: false,
}
}
}
impl<'src> Tokenizer<'src> {
/// Handles a token span and dispatches to the appropriate handler.
fn process_token_span(&mut self, token_span: TokenSpan<'src>) {
if token_can_span_lines(&token_span.token) {
self.process_multi_line_token(token_span);
} else {
self.process_single_line_token(token_span);
}
}
/// Handles tokens that never span multiple lines.
fn process_single_line_token(&mut self, token_span: TokenSpan<'src>) {
if token_is_newline(&token_span.token) {
self.line_number += 1;
self.buffer.push(token_span);
self.commit_current_line();
} else {
self.buffer.push(token_span);
}
}
/// Handles tokens that may contain one or more newline characters.
fn process_multi_line_token(&mut self, token_span: TokenSpan<'src>) {
let start_line = self.line_number;
let newline_count = count_newlines(token_span.lexeme);
// Did this token end in a newline?
// This can happen if this is an `Error` token that ends the file.
let ends_with_newline =
token_span.lexeme.ends_with('\n') || token_span.lexeme.ends_with('\r');
self.buffer.push(token_span);
// We only need to commit the line if this token actually ended the line
if newline_count > 0 {
self.commit_current_line();
// We only need to insert one `Line::Spanned(base)` per *interior*
// newline, so `newline_count - 1` such lines
// (e.g. 2 line breaks in block comment -> it has
// exactly `1` interior line)
let insert_count = newline_count - 1;
for _ in 0..insert_count {
self.lines.push(Line::Spanned(start_line));
}
// This is called *after* `commit_current_line()` cleared previous
// stored value
self.multi_line_start = if ends_with_newline {
None // we're done at this point
} else {
Some(start_line)
};
}
self.line_number = start_line + newline_count;
}
/// Commits the tokens of the current physical line into `self.lines`.
fn commit_current_line(&mut self) {
let slice_end = self.buffer.len();
if slice_end > self.slice_start_index {
let slice = self.slice_start_index..slice_end;
            // If we were in the middle of a multi-line token, we *always*
            // consume `multi_line_start` here, ensuring that each call to
            // `commit_current_line()` applies it only once.
            // This guarantees no "bleed" between adjacent multi-line tokens.
if let Some(from) = self.multi_line_start.take() {
self.lines.push(Line::SpannedWithTokens(from, slice));
} else {
self.lines.push(Line::Standalone(slice));
}
self.slice_start_index = slice_end;
}
}
/// Finishes tokenization, converting accumulated data into
/// [`TokenizedFile`].
fn into_tokenized_file(mut self) -> TokenizedFile<'src> {
// Commit any trailing tokens
self.commit_current_line();
        // If we still have a `multi_line_start` (i.e. a pure multi-line token
        // with no local tokens on its last line), push a bare `Spanned` entry.
if let Some(from) = self.multi_line_start.take() {
self.lines.push(Line::Spanned(from));
}
// Optimize for size
self.buffer.shrink_to_fit();
self.lines.shrink_to_fit();
TokenizedFile {
buffer: self.buffer,
lines: self.lines,
had_errors: self.had_errors,
}
}
}
fn build_span<'src>(token: Token, text: &'src str) -> TokenSpan<'src> {
let length_utf16 = text.encode_utf16().count();
TokenSpan {
lexeme: text,
token,
length_utf16,
}
}
fn token_is_newline(token: &Token) -> bool {
matches!(token, Token::NewLine)
}
fn token_can_span_lines(token: &Token) -> bool {
matches!(
token,
Token::BlockComment | Token::Brace(lexing::BraceKind::CppBlock) | Token::Error
)
}
/// Counts the number of newlines in the given text.
fn count_newlines(text: &str) -> usize {
let mut bytes_iterator = text.as_bytes().iter().peekable();
let mut newline_count = 0;
while let Some(&next_byte) = bytes_iterator.next() {
// Logos' regex rule is "\r\n|\n|\r", so we agree with it on new line
// character treatment
match next_byte {
b'\r' => {
newline_count += 1;
if let Some(&&b'\n') = bytes_iterator.peek() {
// skip the '\n' in a CRLF
bytes_iterator.next();
}
}
b'\n' => {
newline_count += 1;
}
_ => (),
}
}
newline_count
}
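A small illustration of why `length_utf16` is precomputed per token: LSP positions count UTF-16 code units, which differ from both the byte length and the `char` count as soon as non-ASCII text shows up in string literals or comments. Hypothetical standalone snippet:

fn main() {
    let lexeme = "\"héllo 🙂\""; // a string-literal lexeme with non-ASCII text
    assert_eq!(lexeme.len(), 13);                  // UTF-8 bytes
    assert_eq!(lexeme.chars().count(), 9);         // Unicode scalar values
    assert_eq!(lexeme.encode_utf16().count(), 10); // UTF-16 code units (what LSP expects)
}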

3
rottlib/src/lib.rs Normal file

@@ -0,0 +1,3 @@
#![allow(clippy::doc_overindented_list_items)]
pub mod lexer;

12
rottlsp/Cargo.toml Normal file

@@ -0,0 +1,12 @@
[package]
name = "rottlsp"
version = "0.1.0"
edition = "2024"

[dependencies]
rottlib = { version = "0", path = "../rottlib" }
tokio = { version = "1", features = ["full"] }
tower-lsp = "0.20"

[lints]
workspace = true

84
rottlsp/src/main.rs Normal file

@@ -0,0 +1,84 @@
use tower_lsp::lsp_types;
/// A Language Server implementation for Rott.
///
/// Implements the [`tower_lsp::LanguageServer`] trait to handle LSP requests
/// (e.g. initialization, text synchronization, open notifications)
/// asynchronously.
struct RottLanguageServer {
/// Client handle for sending notifications and requests to the editor.
client: tower_lsp::Client,
}
#[tower_lsp::async_trait]
impl tower_lsp::LanguageServer for RottLanguageServer {
// Inform the client of our server capabilities during initialization.
async fn initialize(
&self,
_: lsp_types::InitializeParams,
) -> tower_lsp::jsonrpc::Result<lsp_types::InitializeResult> {
Ok(lsp_types::InitializeResult {
capabilities: lsp_types::ServerCapabilities {
                // Request full text synchronization: the client sends the
                // complete document whenever a file is opened or changed
                // (`lsp_types::TextDocumentSyncKind::FULL`).
text_document_sync: Some(lsp_types::TextDocumentSyncCapability::Kind(
lsp_types::TextDocumentSyncKind::FULL,
)),
..Default::default()
},
..Default::default()
})
}
// On file open, tokenize the new document and log any lexing errors.
async fn did_open(&self, params: lsp_types::DidOpenTextDocumentParams) {
// Measure lexing performance to track parser responsiveness.
let start_time = std::time::Instant::now();
let has_errors =
rottlib::lexer::TokenizedFile::from_source(&params.text_document.text).had_errors();
let elapsed_time = start_time.elapsed();
self.client
.log_message(
lsp_types::MessageType::INFO,
format!(
"Tokenized {} in {:?}",
params.text_document.uri.path(),
elapsed_time
),
)
.await;
if has_errors {
self.client
.log_message(
lsp_types::MessageType::INFO,
format!(
"There was an error while tokenizing {}",
params.text_document.uri.path(),
),
)
.await;
}
}
// Handle shutdown signal.
async fn shutdown(&self) -> tower_lsp::jsonrpc::Result<()> {
// No cleanup required on shutdown; simply acknowledge the request.
Ok(())
}
}
#[tokio::main]
async fn main() {
// We are using standard input and output for communicating with an editor,
// so we need to avoid methods or macros that write or read using them,
// e.g. `println!`.
let (stdin, stdout) = (tokio::io::stdin(), tokio::io::stdout());
let (service, socket) = tower_lsp::LspService::new(|client| RottLanguageServer { client });
tower_lsp::Server::new(stdin, stdout, socket)
.serve(service)
.await;
}