Initial commit

dkanus 2025-07-30 19:46:37 +07:00
commit 4b9d6a6adb
13 changed files with 2308 additions and 0 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
/target
flamegraph.svg
perf.data

1104
Cargo.lock generated Normal file

File diff suppressed because it is too large

26
Cargo.toml Normal file

@@ -0,0 +1,26 @@
[workspace]
resolver = "2"
members = ["dev_tests", "rottlsp", "rottlib"]

[workspace.package]
edition = "2024"

[workspace.lints.clippy]
all = "warn"
nursery = "warn"
pedantic = "warn"

[profile.release]
opt-level = 3           # Optimize for speed
strip = true            # Strip symbols from the binary
lto = true              # Enable link-time optimization
panic = "abort"         # Abort on panic
overflow-checks = false # Disable integer overflow checks
codegen-units = 1       # Fewer codegen units allow more thorough optimization
debug = false           # Omit debug info

[profile.flamegraph]
inherits = "release"         # start from the release profile
strip = false                # keep symbols for profilers
debug = true                 # full DWARF info for unwinding
split-debuginfo = "unpacked" # keep debug info available rather than packing it separately

23
dev_tests/Cargo.toml Normal file

@@ -0,0 +1,23 @@
[package]
name = "dev_tests"
version = "0.1.0"
edition = "2024"

[[bin]]
name = "dump_tokens"
path = "src/dump_tokens.rs"

[[bin]]
name = "uc_lexer_verify"
path = "src/uc_lexer_verify.rs"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
rottlib = { version = "0", path = "../rottlib", features = ["debug"] }
walkdir = "2.5"
encoding_rs = "0.8"
chardet = "0.2"

[lints]
workspace = true

76
dev_tests/src/dump_tokens.rs Normal file

@@ -0,0 +1,76 @@
use std::{
fs,
path::{Path, PathBuf},
};
use encoding_rs::{Encoding, UTF_8};
use rottlib::lexer::{DebugTools, TokenizedFile};
/// Recursively search `root` for the first file whose *basename* matches
/// `needle` (case-sensitive).
///
/// Returns the canonicalized absolute path, or [`None`] if nothing matches.
fn find_file(root: &Path, needle: &str) -> Option<PathBuf> {
for entry in walkdir::WalkDir::new(root)
.into_iter()
.filter_map(Result::ok)
{
let path = entry.path();
if path.is_file() && (path.file_name().and_then(|name| name.to_str()) == Some(needle)) {
return fs::canonicalize(path).ok();
}
}
None
}
/// CLI: `dump_tokens <root_dir> <file_name>` - searches for `<file_name>`
/// recursively inside `<root_dir>`.
///
/// This utility takes a *root directory* and a *file name* instead of a full
/// path, sparing us from hunting down and typing out complete paths:
///
/// - We know where all the sources are;
/// - We usually just know the name of the file that is being problematic.
fn main() {
let mut args = std::env::args().skip(1);
let root_dir = args.next().unwrap_or_else(|| {
eprintln!("Usage: inspect_uc <root_dir> <file_name>");
std::process::exit(1);
});
let file_name = args.next().unwrap_or_else(|| {
eprintln!("Usage: inspect_uc <root_dir> <file_name>");
std::process::exit(1);
});
let root = PathBuf::from(&root_dir);
if !root.exists() {
eprintln!("Root directory '{root_dir}' does not exist.");
std::process::exit(1);
}
let found_path = find_file(&root, &file_name).map_or_else(
|| {
eprintln!("File '{file_name}' not found under '{root_dir}'.");
std::process::exit(1);
},
|path| path,
);
// Read & decode
let raw_bytes = match fs::read(&found_path) {
Ok(sources) => sources,
Err(error) => {
eprintln!("Could not read {}: {error}", found_path.display());
std::process::exit(1);
}
};
let (encoding_label, _, _) = chardet::detect(&raw_bytes);
let encoding = Encoding::for_label(encoding_label.as_bytes()).unwrap_or(UTF_8);
let (decoded_str, _, _) = encoding.decode(&raw_bytes);
let source_text = decoded_str.to_string();
let tokenized_file = TokenizedFile::from_source(&source_text);
tokenized_file.dump_debug_layout();
}

122
dev_tests/src/uc_lexer_verify.rs Normal file

@@ -0,0 +1,122 @@
use std::{collections::HashSet, fs, path::PathBuf};
use rottlib::lexer::{DebugTools, TokenizedFile};
/// Read `ignore.txt` (one path per line, `#` for comments) from the root
/// directory and turn it into a canonicalized [`HashSet<PathBuf>`].
fn load_ignore_set(root: &std::path::Path) -> HashSet<PathBuf> {
let ignore_file = root.join("ignore.txt");
if !ignore_file.exists() {
return HashSet::new();
}
let content = match fs::read_to_string(&ignore_file) {
Ok(content) => content,
Err(error) => {
eprintln!("Could not read {}: {error}", ignore_file.display());
return HashSet::new();
}
};
content
.lines()
.map(str::trim)
.filter(|line| !line.is_empty() && !line.starts_with('#'))
.filter_map(|line| {
let next_path = PathBuf::from(line);
let absolute_path = if next_path.is_absolute() {
next_path
} else {
root.join(next_path)
};
fs::canonicalize(absolute_path).ok()
})
.collect()
}
/// CLI: `uc_lexer_verify <root_dir>` - finds all `.uc` files in the provided
/// directory (except those listed in `ignore.txt` in the root) and tests them.
///
/// Reported execution time covers tokenization only; the time it takes to read
/// files from disk is not counted.
///
/// `ignore.txt` is for listing specific files, not directories.
fn main() {
    let root_dir = std::env::args().nth(1).unwrap(); // crashing is fine for a debug utility
let root = PathBuf::from(&root_dir);
if !root.exists() {
eprintln!("Root directory '{root_dir}' does not exist.");
std::process::exit(1);
}
// Load files
let ignored_paths = load_ignore_set(&root);
let mut uc_files: Vec<(PathBuf, String)> = Vec::new();
for entry in walkdir::WalkDir::new(&root)
.into_iter()
.filter_map(Result::ok) // for debug tool this is ok
.filter(|entry| {
let path = entry.path();
// Skip anything explicitly ignored
if let Ok(absolute_path) = fs::canonicalize(path) {
if ignored_paths.contains(&absolute_path) {
return false;
}
}
// Must be *.uc
path.is_file()
&& path
.extension()
.and_then(|extension| extension.to_str())
.is_some_and(|extension| extension.eq_ignore_ascii_case("uc"))
})
{
let path = entry.path();
match fs::read(path) {
Ok(raw_bytes) => {
// Autodetect encoding for old Unreal script sources
let (encoding_label, _, _) = chardet::detect(&raw_bytes);
let encoding = encoding_rs::Encoding::for_label(encoding_label.as_bytes())
.unwrap_or(encoding_rs::UTF_8);
let (decoded_text, _, _) = encoding.decode(&raw_bytes);
uc_files.push((path.to_path_buf(), decoded_text.into_owned()));
}
Err(error) => {
eprintln!("Failed to read `{}`: {error}", path.display());
std::process::exit(1);
}
}
}
println!("Loaded {} .uc files into memory.", uc_files.len());
// Tokenize and measure performance
let start_time = std::time::Instant::now();
let tokenized_files: Vec<(PathBuf, TokenizedFile)> = uc_files
.iter()
.map(|(path, source_code)| {
let tokenized_file = TokenizedFile::from_source(source_code);
if tokenized_file.had_errors() {
println!("TK: {}", path.display());
}
(path.clone(), tokenized_file)
})
.collect();
let elapsed_time = start_time.elapsed();
println!(
"Tokenized {} files in {:.2?}",
tokenized_files.len(),
elapsed_time
);
// Roundtrip check
for ((path, original), (_, tokenized_file)) in uc_files.iter().zip(tokenized_files.iter()) {
let reconstructed = tokenized_file.reconstruct_source();
if original != &reconstructed {
eprintln!("Reconstruction mismatch in `{}`!", path.display());
std::process::exit(1);
}
}
println!("All .uc files matched successfully.");
}
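The roundtrip comparison above is the invariant the whole verifier rests on: tokenization must be lossless. A minimal sketch of the same property as a unit test (hypothetical, assuming the `debug` feature or a debug build so that `DebugTools` is exported):

#[cfg(test)]
mod roundtrip_sketch {
    use rottlib::lexer::{DebugTools, TokenizedFile};

    #[test]
    fn reconstruction_is_lossless() {
        // Mixed line endings and a comment exercise the per-line bookkeeping.
        let source = "class Example extends Object;\r\n\r\n// note\nvar int Health;\n";
        let tokenized = TokenizedFile::from_source(source);
        assert!(!tokenized.had_errors());
        assert_eq!(tokenized.reconstruct_source(), source);
    }
}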

11
rottlib/Cargo.toml Normal file

@@ -0,0 +1,11 @@
[package]
name = "rottlib"
version = "0.1.0"
edition = "2024"

[features]
default = []
debug = []

[dependencies]
logos = "0.15"

92
rottlib/src/lexer/debug_tools.rs Normal file

@@ -0,0 +1,92 @@
//! Debug-only helpers for [`TokenizedFile`]
//!
//! This module is **compiled only if**
//!
//! * the current build profile has `debug_assertions` enabled, or
//! * the crate is built with the `debug` cargo feature.
//!
//! The `cfg` gating itself lives in the parent module.
use super::Line;
/// A technical trait that adds debug helpers to the lexer.
pub trait DebugTools {
/// Pretty-prints the internal layout of the tokenized file - useful when
/// writing new passes or hunting lexer bugs.
///
/// This method writes the layout directly to standard output.
///
/// The format is unspecified, may change, and is not intended for
/// external tools.
///
/// Each line in the printed layout starts with its 0-based number for
/// convenience.
fn dump_debug_layout(&self);
/// Reconstructs the exact, lossless source text that was fed to
/// [`super::TokenizedFile::from_source`] from internal representation -
/// useful for manually verifying that the lexer works.
fn reconstruct_source(&self) -> String;
}
impl<'src> DebugTools for super::TokenizedFile<'src> {
fn reconstruct_source(&self) -> String {
let mut result = String::new();
for line in &self.lines {
if let Line::Standalone(token_range) | Line::SpannedWithTokens(_, token_range) = line {
for span in &self.buffer[token_range.clone()] {
result.push_str(span.lexeme);
}
}
}
result
}
fn dump_debug_layout(&self) {
for (row_index, line) in self.lines.iter().enumerate() {
println!("Line {}", row_index + 1);
match line {
Line::Standalone(token_range) => {
println!("\t[Standalone]");
let mut column_utf16 = 0usize;
for next_token_span in &self.buffer[token_range.clone()] {
let token_beginning = column_utf16;
let token_end = column_utf16 + next_token_span.length_utf16;
println!(
"\t\t{:?} @ {}-{}: {:?}",
next_token_span.token,
token_beginning,
token_end,
next_token_span.lexeme
);
column_utf16 = token_end;
}
}
Line::Spanned(origin_row) => {
// `origin_row` is 0-based
println!(
"\t[Continued from line {} - no new tokens here]",
origin_row + 1
);
}
Line::SpannedWithTokens(origin_row, token_range) => {
// `origin_row` is 0-based
println!("\t[Continued from line {} + new tokens]", origin_row + 1);
let mut column_utf16 = 0usize;
for next_token_span in &self.buffer[token_range.clone()] {
let token_beginning = column_utf16;
let token_end = column_utf16 + next_token_span.length_utf16;
println!(
"\t\t{:?} @ {}-{}: {:?}",
next_token_span.token,
token_beginning,
token_end,
next_token_span.lexeme
);
column_utf16 = token_end;
}
}
}
}
}
}
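For reference, a minimal, hypothetical driver showing how these helpers are meant to be used (the exact output format of `dump_debug_layout` is unspecified, as noted above); it assumes a debug build or the `debug` feature so that `DebugTools` is exported:

use rottlib::lexer::{DebugTools, TokenizedFile};

fn main() {
    // Two physical lines; the block comment spans both of them, so line 2
    // should be reported as continued from line 1.
    let source = "var int Health; /* hit\npoints */ var bool bAlive;\n";
    let file = TokenizedFile::from_source(source);
    file.dump_debug_layout(); // prints the per-line token listing to stdout
    assert_eq!(file.reconstruct_source(), source); // lossless roundtrip
}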

476
rottlib/src/lexer/lexing.rs Normal file

@@ -0,0 +1,476 @@
//! Lexer for UnrealScript that understands inline `cpptext { ... }` blocks.
//!
//! ## Notable details
//!
//! In UnrealScript, `cpptext` lets authors embed raw C++ between braces.
//! Because whitespace, newlines, or comments may appear between the
//! `cpptext` keyword and the opening `{`, the lexer must remember that
//! it has just seen `cpptext` - hence a state machine.
//!
//! Modes
//! ------
//! - **Normal** - ordinary UnrealScript tokens.
//! - **AwaitingCppBlock** - after `cpptext`, waiting for the next `{`.
//!
//! When that brace arrives, the lexer consumes the entire C++ block as
//! one token (`Token::Brace(BraceKind::CppBlock)`), tracking nested
//! braces, strings, and comments on the way. If the closing `}` is
//! missing, everything to EOF is treated as C++; downstream parsers must
//! handle that gracefully.
use logos::Lexer;
/// Which lexer mode we're in. See the module docs for the full story.
#[derive(Default, Clone, Copy, PartialEq, Eq)]
enum LexerMode {
/// Lexing regular UnrealScript.
#[default]
Normal,
/// Saw `cpptext`; waiting for the opening `{` of a C++ block.
AwaitingCppBlock,
}
/// Extra per-lexer state. Currently just holds the [`LexerMode`].
///
/// This is a logos-specific implementation detail.
#[derive(Default)]
pub struct LexerState {
mode: LexerMode,
}
/// Are these braces "real" UnrealScript braces, or the start/end of a C++ block?
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum BraceKind {
Normal,
CppBlock,
}
/// All UnrealScript tokens that our compiler distinguishes.
#[derive(logos::Logos, Debug, PartialEq, Clone, Copy)]
#[logos(extras = LexerState)]
pub enum Token {
// # Compiler/directive keywords
#[regex(r"(?i)#exec[^\r\n]*(\r|\n|\r\n)")]
ExecDirective,
#[regex("(?i)cpptext", |lex| { lex.extras.mode = LexerMode::AwaitingCppBlock; })]
CppText,
// # Declaration & structural keywords
#[regex("(?i)class")]
Class,
#[regex("(?i)struct")]
Struct,
#[regex("(?i)enum")]
Enum,
#[regex("(?i)state")]
State,
#[regex("(?i)function")]
Function,
#[regex("(?i)event")]
Event,
#[regex("(?i)delegate")]
Delegate,
#[regex("(?i)var")]
Var,
#[regex("(?i)local")]
Local,
// # Inheritance, interface, dependencies
#[regex("(?i)extends")]
Extends,
#[regex("(?i)dependson")]
DependsOn,
// # Access modifiers & properties
#[regex("(?i)private")]
Private,
#[regex("(?i)protected")]
Protected,
#[regex("(?i)public")]
Public,
#[regex("(?i)const")]
Const,
#[regex("(?i)static")]
Static,
#[regex("(?i)native")]
Native,
#[regex("(?i)abstract")]
Abstract,
#[regex("(?i)deprecated")]
Deprecated,
// # UnrealScript metadata/specifiers
#[regex("(?i)default")]
Default,
#[regex("(?i)defaultproperties")]
DefaultProperties,
#[regex("(?i)optional")]
Optional,
#[regex("(?i)config")]
Config,
#[regex("(?i)perobjectconfig")]
PerObjectConfig,
#[regex("(?i)globalconfig")]
GlobalConfig,
#[regex("(?i)collapsecategories")]
CollapseCategories,
#[regex("(?i)dontcollapsecategories")]
DontCollapseCategories,
#[regex("(?i)hidecategories")]
HideCategories,
#[regex("(?i)localized")]
Localized,
#[regex("(?i)placeable")]
Placeable,
#[regex("(?i)notplaceable")]
NotPlaceable,
#[regex("(?i)editinlinenew")]
EditInlineNew,
#[regex("(?i)noteditinlinenew")]
NotEditInlineNew,
#[regex("(?i)dynamicrecompile")]
DynamicRecompile,
#[regex("(?i)transient")]
Transient,
#[regex("(?i)operator")]
Operator,
#[regex("(?i)simulated")]
Simulated,
#[regex("(?i)latent")]
Latent,
#[regex("(?i)iterator")]
Iterator,
#[regex("(?i)out")]
Out,
#[regex("(?i)skip")]
Skip,
#[regex("(?i)singular")]
Singular,
#[regex("(?i)coerce")]
Coerce,
#[regex("(?i)assert")]
Assert,
#[regex("(?i)ignores")]
Ignores,
#[regex("(?i)within")]
Within,
#[regex("(?i)noexport")]
NoExport,
// # Replication-related
#[regex("(?i)reliable")]
Reliable,
#[regex("(?i)unreliable")]
Unreliable,
#[regex("(?i)replication")]
Replication,
#[regex("(?i)nativereplication")]
NativeReplication,
// # Control-flow keywords
#[regex("(?i)if")]
If,
#[regex("(?i)else")]
Else,
#[regex("(?i)switch")]
Switch,
#[regex("(?i)case")]
Case,
#[regex("(?i)for")]
For,
#[regex("(?i)foreach")]
ForEach,
#[regex("(?i)while")]
While,
#[regex("(?i)do")]
Do,
#[regex("(?i)until")]
Until,
#[regex("(?i)break")]
Break,
#[regex("(?i)continue")]
Continue,
#[regex("(?i)return")]
Return,
// # Built-in types
#[regex("(?i)int")]
Int,
#[regex("(?i)float")]
Float,
#[regex("(?i)bool")]
Bool,
#[regex("(?i)byte")]
Byte,
#[regex("(?i)string")]
String,
#[regex("(?i)array")]
Array,
#[regex("(?i)name")]
Name,
// # Literals & identifiers
#[regex(r"0[xX][0-9A-Fa-f]+|[0-9]+")]
IntegerLiteral,
#[regex(r"[0-9]*\.[0-9]+([eE][+-]?[0-9]+)?")]
FloatLiteral,
#[regex(r#""([^"\\\r\n]|\\.)*""#)]
StringLiteral,
#[regex(r"'[a-zA-Z0-9_\. \-]*'")]
NameLiteral,
#[regex("(?i)true")]
True,
#[regex("(?i)false")]
False,
#[regex("(?i)none")]
None,
#[regex("(?i)self")]
SelfKeyword,
#[regex("(?i)new")]
New,
#[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")]
Identifier,
// # Operations
// ## Exponentiation
#[token("**")]
Exponentiation,
// ## Unary
#[token("++")]
Increment,
#[token("--")]
Decrement,
#[token("!")]
Not,
#[token("~")]
BitwiseNot,
// ## Vector
#[token("dot")]
Dot,
#[token("cross")]
Cross,
// ## Multiplicative
#[token("*")]
Multiply,
#[token("/")]
Divide,
#[token("%")]
Modulo,
// ## Additive
#[token("+")]
Plus,
#[token("-")]
Minus,
// ## String manipulation
#[token("@")]
AtChar,
#[token("$")]
DollarChar,
// ## Shifts
#[token("<<")]
LeftShift,
#[token(">>>")]
LogicalRightShift,
#[token(">>")]
RightShift,
// ## Relational
#[token("<")]
Less,
#[token("<=")]
LessEqual,
#[token(">")]
Greater,
#[token(">=")]
GreaterEqual,
#[token("==")]
Equal,
#[token("!=")]
NotEqual,
#[token("~=")]
ApproximatelyEqual,
// ## Bitwise
#[token("&")]
BitwiseAnd,
#[token("|")]
BitwiseOr,
#[token("^")]
BitwiseXor,
#[token("^^")]
BooleanXor,
// ## Logical
#[token("&&")]
And,
#[token("||")]
Or,
    // ## Assignments
#[token("=")]
Assign,
#[token("*=")]
MultiplyAssign,
#[token("/=")]
DivideAssign,
#[token("+=")]
PlusAssign,
#[token("-=")]
MinusAssign,
#[token("$=")]
ConcatAssign,
#[token("@=")]
ConcatSpaceAssign,
// # Punctuation & delimiters
#[token("(")]
LeftParen,
#[token(")")]
RightParen,
#[token("{", handle_brace)]
Brace(BraceKind),
#[token("}")]
RightBrace,
#[token("[")]
LeftBracket,
#[token("]")]
RightBracket,
#[token(";")]
Semicolon,
#[token(",")]
Comma,
#[token(".")]
Period,
#[token(":")]
Colon,
// # Comments & whitespaces
#[regex(r"//[^\r\n]*")]
LineComment,
#[regex(r"/\*", handle_block_comment)]
BlockComment,
#[regex(r"\r\n|\n|\r")]
NewLine,
#[regex(r"[ \t]+")]
Whitespace,
// # Technical
Error,
}
/// Consumes a `/* ... */` block comment with arbitrary nesting
/// (which UnrealScript allows).
///
/// Returns `Some(())` after consuming the whole comment (delimiters included),
/// or [`None`] if the file ends before every `/*` is closed.
fn handle_block_comment(lexer: &mut Lexer<Token>) -> Option<()> {
let mut comment_depth = 1;
while let Some(next_char) = lexer.remainder().chars().next() {
if lexer.remainder().starts_with("/*") {
comment_depth += 1;
lexer.bump(2);
continue;
}
if lexer.remainder().starts_with("*/") {
comment_depth -= 1;
lexer.bump(2);
if comment_depth == 0 {
return Some(());
}
continue;
}
lexer.bump(next_char.len_utf8());
}
// Unterminated comment
None
}
/// Called for every `{`.
///
/// Emits either a normal opening brace or a single token covering an entire
/// C++ block, depending on the lexer's current state.
fn handle_brace(lexer: &mut Lexer<Token>) -> Option<BraceKind> {
match lexer.extras.mode {
LexerMode::Normal => Some(BraceKind::Normal),
LexerMode::AwaitingCppBlock => {
lexer.extras.mode = LexerMode::Normal;
consume_cpp_block(lexer);
Some(BraceKind::CppBlock)
}
}
}
/// Consumes a complete C++ block, handling:
/// - Nested `{...}` pairs
/// - String literals (`"..."` and `'...'`), including escaped quotes
/// - Line comments (`// ...\n`)
/// - Block comments (`/* ... */`)
///
/// Leaves the lexer positioned immediately after the closing `}` of the block.
/// The opening `{` must have already been consumed by the caller.
fn consume_cpp_block(lexer: &mut Lexer<Token>) {
let mut depth = 1;
while let Some(ch) = lexer.remainder().chars().next() {
match ch {
'{' => {
depth += 1;
lexer.bump(1);
}
'}' => {
depth -= 1;
lexer.bump(1);
if depth == 0 {
break;
}
}
'/' if lexer.remainder().starts_with("/*") => {
lexer.bump(2); // consuming two-byte sequence `/*`
consume_c_comment(lexer)
}
'/' if lexer.remainder().starts_with("//") => {
lexer.bump(2); // consuming two-byte sequence `//`
while let Some(c) = lexer.remainder().chars().next() {
lexer.bump(c.len_utf8());
if c == '\n' {
break;
}
}
}
'"' | '\'' => {
lexer.bump(1); // skip `'` or `"`
consume_string_literal(lexer, ch);
}
_ => lexer.bump(ch.len_utf8()),
}
}
}
/// Consumes a C-style `/* ... */` comment (without nesting).
///
/// Assumes the opening `/*` has already been consumed.
fn consume_c_comment(lexer: &mut Lexer<Token>) {
while let Some(next_character) = lexer.remainder().chars().next() {
if lexer.remainder().starts_with("*/") {
lexer.bump(2);
break;
} else {
lexer.bump(next_character.len_utf8());
}
}
}
/// Consumes a string literal from C++ code.
///
/// Assumes the opening quote has already been consumed.
fn consume_string_literal(lexer: &mut Lexer<Token>, delimiter: char) {
while let Some(next_character) = lexer.remainder().chars().next() {
lexer.bump(next_character.len_utf8());
if next_character == '\\' {
// Skip the escaped character
if let Some(next) = lexer.remainder().chars().next() {
lexer.bump(next.len_utf8());
}
} else if next_character == delimiter {
return;
}
}
}
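A small sketch of the behaviour described in the module docs: everything between the `{` that follows `cpptext` and its matching `}` comes out as a single `Brace(CppBlock)` token, regardless of the braces, strings, or comments inside the C++ body. Since `BraceKind` is not re-exported outside the `lexer` module, this is sketched as a hypothetical test submodule inside `lexing.rs`:

#[cfg(test)]
mod cpptext_sketch {
    use logos::Logos;

    use super::{BraceKind, Token};

    #[test]
    fn cpp_block_is_one_token() {
        // Braces inside strings and comments must not confuse the counting.
        let source = "cpptext\n{\n    if (A) { B(\"}\"); } // }\n}\nvar int X;";
        let tokens: Vec<Token> = Token::lexer(source).map(Result::unwrap).collect();
        assert!(tokens.contains(&Token::Brace(BraceKind::CppBlock)));
        // The `}` closing the block is consumed as part of that token.
        assert_eq!(tokens.iter().filter(|t| **t == Token::RightBrace).count(), 0);
    }
}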

276
rottlib/src/lexer/mod.rs Normal file

@@ -0,0 +1,276 @@
//! # Tokenizer
//!
//! Converts raw source text into a lossless, position-aware stream of lexical
//! [`Token`]s, grouped *per physical line*, and returns it as
//! a [`TokenizedFile`].
//!
//! Design goals:
//!
//! 1. **Lossless**: preserving complete information for each token, enough to
//!    recreate the original bytes without loss.
//! 2. **LSP readiness**: the LSP protocol measures positions in UTF-16 code
//!    units, so we precompute the length of each token in that encoding,
//!    making interfacing easier.
//!
//! ## Opt-in debug helpers
//!
//! Extra diagnostics become available in **debug builds** or when the crate is
//! compiled with `debug` feature enabled. They live in the [`DebugTools`]
//! extension trait, implemented for [`TokenizedFile`].
//!
//! ```
//! // bring the trait and the type into scope
//! use rottlib::lexer::{DebugTools, TokenizedFile};
//!
//! let file = TokenizedFile::from_source("var int Health;\n");
//! file.dump_debug_layout();             // pretty-print token layout
//! let text = file.reconstruct_source(); // reconstruct original text
//! ```
mod debug_tools;
mod lexing;
use std::ops::Range;
use logos::Logos;
#[cfg(any(debug_assertions, feature = "debug"))]
pub use debug_tools::DebugTools;
pub use lexing::Token;
/// Empirically chosen starting size for token buffer (used during tokenization)
/// that provides good performance.
const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;
/// A slice tagged with its token kind plus two length counters.
///
/// *No absolute coordinates* are stored - they are recomputed per line.
#[derive(Debug, Clone, Copy)]
struct TokenSpan<'src> {
lexeme: &'src str,
token: Token,
length_utf16: usize,
}
/// Representation of a single physical line of the source file.
///
/// [`Range<usize>`] are used instead of slices to avoid creating
/// a self-referential struct (with [`TokenizedFile`]), which Rust forbids.
#[derive(Clone)]
enum Line {
/// A standalone line that owns a contiguous slice in
/// the [`TokenizedFile::buffer`] arena.
Standalone(Range<usize>),
    /// A line entirely covered by a multi-line token that started on another
    /// line; stores the 0-based number of that originating line.
    Spanned(usize),
    /// A line covered by a multi-line token started on another line *and*
    /// containing additional tokens of its own; stores the 0-based origin line
    /// and the range of those local tokens.
    SpannedWithTokens(usize, Range<usize>),
}
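// A worked example (sketch) of how the three variants combine: for the source
//
//     var int A; /* one
//     two
//     three */ var int B;
//
// the tokenizer produces a `lines` vector shaped like
//
//     [Standalone(0..8), Spanned(0), SpannedWithTokens(0, 8..15)]
//
// Line 0 owns its tokens up to and including the block comment, line 1 lies
// entirely inside that comment, and line 2 finishes the comment and then adds
// tokens of its own.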
/// A tokenized, lossless representation of an UnrealScript source file.
pub struct TokenizedFile<'src> {
/// Arena of every token span in this file.
buffer: Vec<TokenSpan<'src>>,
/// Mapping that provides an easy and efficient access to tokens by
/// line number.
lines: Vec<Line>,
/// Simple flag for marking erroneous state.
had_errors: bool,
}
/// Mutable state that encapsulates data needed during the tokenization loop.
struct Tokenizer<'src> {
/// Arena that owns every [`TokenSpan`] produced for the file.
buffer: Vec<TokenSpan<'src>>,
/// Mapping from physical line number to the tokens that belong to it.
lines: Vec<Line>,
/// The current 0-based physical line number.
line_number: usize,
/// Index in [`Tokenizer::buffer`] where the current line starts.
slice_start_index: usize,
/// When a multi-line token is being scanned, stores the 0-based line
/// on which it started; [`None`] otherwise.
multi_line_start: Option<usize>,
/// Set to [`true`] if the lexer reported any error tokens.
had_errors: bool,
}
impl<'src> TokenizedFile<'src> {
/// Tokenize `source` and return a fresh [`TokenizedFile`].
pub fn from_source(source: &'src str) -> TokenizedFile<'src> {
let mut tokenizer = TokenizedFile::<'src>::builder();
let mut lexer = Token::lexer(source);
        // Each token logos yields becomes a `TokenSpan` fed into the tokenizer.
while let Some(token_result) = lexer.next() {
let token = token_result.unwrap_or_else(|_| {
tokenizer.had_errors = true;
Token::Error
});
let token_span = build_span(token, lexer.slice());
tokenizer.process_token_span(token_span);
}
tokenizer.into_tokenized_file()
}
/// Returns [`true`] if any erroneous tokens were produced during building
/// of this [`TokenizedFile`].
pub fn had_errors(&self) -> bool {
self.had_errors
}
/// Create an empty tokenizer state with tuned buffer capacity.
fn builder() -> Tokenizer<'src> {
Tokenizer {
buffer: Vec::with_capacity(DEFAULT_TOKEN_BUFFER_CAPACITY),
lines: Vec::new(),
line_number: 0,
slice_start_index: 0,
multi_line_start: None,
had_errors: false,
}
}
}
impl<'src> Tokenizer<'src> {
/// Handles a token span and dispatches to the appropriate handler.
fn process_token_span(&mut self, token_span: TokenSpan<'src>) {
if token_can_span_lines(&token_span.token) {
self.process_multi_line_token(token_span);
} else {
self.process_single_line_token(token_span);
}
}
/// Handles tokens that never span multiple lines.
fn process_single_line_token(&mut self, token_span: TokenSpan<'src>) {
if token_is_newline(&token_span.token) {
self.line_number += 1;
self.buffer.push(token_span);
self.commit_current_line();
} else {
self.buffer.push(token_span);
}
}
/// Handles tokens that may contain one or more newline characters.
fn process_multi_line_token(&mut self, token_span: TokenSpan<'src>) {
let start_line = self.line_number;
let newline_count = count_newlines(token_span.lexeme);
// Did this token end in a newline?
// This can happen if this is an `Error` token that ends the file.
let ends_with_newline =
token_span.lexeme.ends_with('\n') || token_span.lexeme.ends_with('\r');
self.buffer.push(token_span);
// We only need to commit the line if this token actually ended the line
if newline_count > 0 {
self.commit_current_line();
// We only need to insert one `Line::Spanned(base)` per *interior*
// newline, so `newline_count - 1` such lines
// (e.g. 2 line breaks in block comment -> it has
// exactly `1` interior line)
let insert_count = newline_count - 1;
for _ in 0..insert_count {
self.lines.push(Line::Spanned(start_line));
}
// This is called *after* `commit_current_line()` cleared previous
// stored value
self.multi_line_start = if ends_with_newline {
None // we're done at this point
} else {
Some(start_line)
};
}
self.line_number = start_line + newline_count;
}
/// Commits the tokens of the current physical line into `self.lines`.
fn commit_current_line(&mut self) {
let slice_end = self.buffer.len();
if slice_end > self.slice_start_index {
let slice = self.slice_start_index..slice_end;
            // If we were in the middle of a multi-line token, we *always*
            // consume `multi_line_start` here, ensuring that each call to
            // `commit_current_line()` applies it only once.
            // This guarantees no "bleed" between adjacent multi-line tokens.
if let Some(from) = self.multi_line_start.take() {
self.lines.push(Line::SpannedWithTokens(from, slice));
} else {
self.lines.push(Line::Standalone(slice));
}
self.slice_start_index = slice_end;
}
}
/// Finishes tokenization, converting accumulated data into
/// [`TokenizedFile`].
fn into_tokenized_file(mut self) -> TokenizedFile<'src> {
// Commit any trailing tokens
self.commit_current_line();
        // If we still have a `multi_line_start` (i.e. a pure multi-line token
        // with no local tokens on its last line), push a bare `Spanned` entry.
if let Some(from) = self.multi_line_start.take() {
self.lines.push(Line::Spanned(from));
}
// Optimize for size
self.buffer.shrink_to_fit();
self.lines.shrink_to_fit();
TokenizedFile {
buffer: self.buffer,
lines: self.lines,
had_errors: self.had_errors,
}
}
}
fn build_span<'src>(token: Token, text: &'src str) -> TokenSpan<'src> {
let length_utf16 = text.encode_utf16().count();
TokenSpan {
lexeme: text,
token,
length_utf16,
}
}
fn token_is_newline(token: &Token) -> bool {
matches!(token, Token::NewLine)
}
fn token_can_span_lines(token: &Token) -> bool {
matches!(
token,
Token::BlockComment | Token::Brace(lexing::BraceKind::CppBlock) | Token::Error
)
}
/// Counts the number of newlines in the given text.
fn count_newlines(text: &str) -> usize {
let mut bytes_iterator = text.as_bytes().iter().peekable();
let mut newline_count = 0;
while let Some(&next_byte) = bytes_iterator.next() {
// Logos' regex rule is "\r\n|\n|\r", so we agree with it on new line
// character treatment
match next_byte {
b'\r' => {
newline_count += 1;
if let Some(&&b'\n') = bytes_iterator.peek() {
// skip the '\n' in a CRLF
bytes_iterator.next();
}
}
b'\n' => {
newline_count += 1;
}
_ => (),
}
}
newline_count
}
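A small illustration of why `length_utf16` is precomputed per token: LSP positions count UTF-16 code units, which differ from both the byte length and the `char` count as soon as non-ASCII text shows up in string literals or comments. Hypothetical standalone snippet:

fn main() {
    let lexeme = "\"héllo 🙂\""; // a string-literal lexeme with non-ASCII text
    assert_eq!(lexeme.len(), 13);                  // UTF-8 bytes
    assert_eq!(lexeme.chars().count(), 9);         // Unicode scalar values
    assert_eq!(lexeme.encode_utf16().count(), 10); // UTF-16 code units (what LSP expects)
}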

3
rottlib/src/lib.rs Normal file

@@ -0,0 +1,3 @@
#![allow(clippy::doc_overindented_list_items)]
pub mod lexer;

12
rottlsp/Cargo.toml Normal file

@@ -0,0 +1,12 @@
[package]
name = "rottlsp"
version = "0.1.0"
edition = "2024"

[dependencies]
rottlib = { version = "0", path = "../rottlib" }
tokio = { version = "1", features = ["full"] }
tower-lsp = "0.20"

[lints]
workspace = true

84
rottlsp/src/main.rs Normal file

@@ -0,0 +1,84 @@
use tower_lsp::lsp_types;
/// A Language Server implementation for Rott.
///
/// Implements the [`tower_lsp::LanguageServer`] trait to handle LSP requests
/// (e.g. initialization, text synchronization, open notifications)
/// asynchronously.
struct RottLanguageServer {
/// Client handle for sending notifications and requests to the editor.
client: tower_lsp::Client,
}
#[tower_lsp::async_trait]
impl tower_lsp::LanguageServer for RottLanguageServer {
// Inform the client of our server capabilities during initialization.
async fn initialize(
&self,
_: lsp_types::InitializeParams,
) -> tower_lsp::jsonrpc::Result<lsp_types::InitializeResult> {
Ok(lsp_types::InitializeResult {
capabilities: lsp_types::ServerCapabilities {
                // Request full text synchronization: the client sends the
                // complete document whenever a file is opened or changed
                // (`lsp_types::TextDocumentSyncKind::FULL`).
text_document_sync: Some(lsp_types::TextDocumentSyncCapability::Kind(
lsp_types::TextDocumentSyncKind::FULL,
)),
..Default::default()
},
..Default::default()
})
}
// On file open, tokenize the new document and log any lexing errors.
async fn did_open(&self, params: lsp_types::DidOpenTextDocumentParams) {
// Measure lexing performance to track parser responsiveness.
let start_time = std::time::Instant::now();
let has_errors =
rottlib::lexer::TokenizedFile::from_source(&params.text_document.text).had_errors();
let elapsed_time = start_time.elapsed();
self.client
.log_message(
lsp_types::MessageType::INFO,
format!(
"Tokenized {} in {:?}",
params.text_document.uri.path(),
elapsed_time
),
)
.await;
if has_errors {
self.client
.log_message(
lsp_types::MessageType::INFO,
format!(
"There was an error while tokenizing {}",
params.text_document.uri.path(),
),
)
.await;
}
}
// Handle shutdown signal.
async fn shutdown(&self) -> tower_lsp::jsonrpc::Result<()> {
// No cleanup required on shutdown; simply acknowledge the request.
Ok(())
}
}
#[tokio::main]
async fn main() {
// We are using standard input and output for communicating with an editor,
// so we need to avoid methods or macros that write or read using them,
// e.g. `println!`.
let (stdin, stdout) = (tokio::io::stdin(), tokio::io::stdout());
let (service, socket) = tower_lsp::LspService::new(|client| RottLanguageServer { client });
tower_lsp::Server::new(stdin, stdout, socket)
.serve(service)
.await;
}