From 579c2a4d3df98eba075b7be59edabca4c3f2db72 Mon Sep 17 00:00:00 2001
From: dkanus
Date: Wed, 6 Aug 2025 23:07:44 +0700
Subject: [PATCH] Refactor `Line`

The previous definition of the `Line` type was unwieldy and too difficult
to work with. The new one has a clearer structure on its own and should
make the iterator implementation much easier.
---
 rottlib/src/lexer/debug_tools.rs |  83 ++++++++++------------
 rottlib/src/lexer/mod.rs         | 115 +++++++++++++++++++++++++------
 2 files changed, 132 insertions(+), 66 deletions(-)

diff --git a/rottlib/src/lexer/debug_tools.rs b/rottlib/src/lexer/debug_tools.rs
index d8c84ec..a05a8a2 100644
--- a/rottlib/src/lexer/debug_tools.rs
+++ b/rottlib/src/lexer/debug_tools.rs
@@ -7,8 +7,6 @@
 //!
 //! These checks have been moved to the parent module.
 
-use super::Line;
-
 /// A technical trait that adds debug helpers to the lexer.
 pub trait DebugTools {
     /// Pretty-prints the internal layout of the tokenised file - useful when
@@ -31,62 +29,55 @@ pub trait DebugTools {
 
 impl<'src> DebugTools for super::TokenizedFile<'src> {
     fn reconstruct_source(&self) -> String {
-        let mut result = String::new();
-        for line in &self.lines {
-            if let Line::Standalone(token_range) | Line::SpannedWithTokens(_, token_range) = line {
-                for span in &self.buffer[token_range.clone()] {
-                    result.push_str(span.lexeme);
-                }
-            }
-        }
-        result
+        self.buffer.iter().map(|span| span.lexeme).collect()
     }
 
     fn dump_debug_layout(&self) {
-        for (row_index, line) in self.lines.iter().enumerate() {
-            println!("Line {}", row_index + 1);
-            match line {
-                Line::Standalone(token_range) => {
+        for (row_idx, line) in self.lines.iter().enumerate() {
+            println!("Line {}", row_idx + 1);
+
+            match (line.continued_from, line.local_range()) {
+                // Stand-alone line (all tokens start here).
+                (None, Some(range)) => {
                     println!("\t[Standalone]");
-                    let mut column_utf16 = 0usize;
-                    for next_token_span in &self.buffer[token_range.clone()] {
-                        let token_beginning = column_utf16;
-                        let token_end = column_utf16 + next_token_span.length_utf16;
-                        println!(
-                            "\t\t{:?} @ {}-{}: {:?}",
-                            next_token_span.token,
-                            token_beginning,
-                            token_end,
-                            next_token_span.lexeme
-                        );
-                        column_utf16 = token_end;
-                    }
+                    dump_spans(&self.buffer[range.clone()]);
                 }
-                Line::Spanned(origin_row) => {
-                    // `origin_row` is 0-based
+
+                // Pure continuation - the only thing on this line is
+                // the remainder of a multi-line token that started earlier.
+                (Some(origin_row), None) => {
                     println!(
                         "\t[Continued from line {} - no new tokens here]",
                         origin_row + 1
                     );
                 }
-                Line::SpannedWithTokens(origin_row, token_range) => {
-                    // `origin_row` is 0-based
+
+                // Continuation **plus** some fresh tokens that begin here.
+                (Some(origin_row), Some(range)) => {
                     println!("\t[Continued from line {} + new tokens]", origin_row + 1);
-                    let mut column_utf16 = 0usize;
-                    for next_token_span in &self.buffer[token_range.clone()] {
-                        let token_beginning = column_utf16;
-                        let token_end = column_utf16 + next_token_span.length_utf16;
-                        println!(
-                            "\t\t{:?} @ {}-{}: {:?}",
-                            next_token_span.token,
-                            token_beginning,
-                            token_end,
-                            next_token_span.lexeme
-                        );
-                        column_utf16 = token_end;
-                    }
+                    dump_spans(&self.buffer[range.clone()]);
+                }
+
+                // An empty physical line (should be rare, but let's be safe).
+                (None, None) => {
+                    println!("\t[Empty line]");
                 }
             }
         }
     }
 }
+
+/// Helper that prints every span in `spans` together with its UTF-16
+/// column boundaries.
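+///
+/// Example output for a hypothetical two-token line (`\t\t` shown here as
+/// indentation; the token names are illustrative, not actual `Token`
+/// variants):
+///
+/// ```text
+///     Var @ 0-3: "var"
+///     Whitespace @ 3-4: " "
+/// ```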
+fn dump_spans(spans: &[super::TokenSpan<'_>]) {
+    let mut col_utf16 = 0usize;
+    for span in spans {
+        let start = col_utf16;
+        let end = start + span.length_utf16;
+        println!(
+            "\t\t{:?} @ {}-{}: {:?}",
+            span.token, start, end, span.lexeme
+        );
+        col_utf16 = end;
+    }
+}
diff --git a/rottlib/src/lexer/mod.rs b/rottlib/src/lexer/mod.rs
index 8fd40ef..8356280 100644
--- a/rottlib/src/lexer/mod.rs
+++ b/rottlib/src/lexer/mod.rs
@@ -30,7 +30,7 @@
 mod debug_tools;
 mod lexing;
 
-use std::ops::Range;
+use std::{cmp::Ordering, ops::Range};
 
 use logos::Logos;
@@ -46,27 +46,36 @@ const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;
 ///
 /// *No absolute coordinates* are stored - they are recomputed per line.
 #[derive(Debug, Clone, Copy)]
-struct TokenSpan<'src> {
-    lexeme: &'src str,
-    token: Token,
-    length_utf16: usize,
+pub struct TokenSpan<'src> {
+    pub lexeme: &'src str,
+    pub token: Token,
+    pub length_utf16: usize,
 }
 
+/// Defines the location of a token inside a [`TokenizedFile`] in a way
+/// convenient for communicating through LSP.
+#[derive(Eq, Clone, Copy)]
+pub struct TokenLocation {
+    line_number: usize,
+    column: usize,
+}
+
+/// Type for indexing lines in a [`TokenizedFile`].
+type LineNumber = usize;
+
+/// Type for indexing specific tokens inside each [`Line`].
+type TokenIndex = usize;
+
 /// Representation of a single physical line of the source file.
 ///
-/// [`Range<usize>`] is used instead of slices to avoid creating
+/// [`Range<TokenIndex>`] is used instead of slices to avoid creating
 /// a self-referential struct (with [`TokenizedFile`]), which Rust forbids.
 #[derive(Clone)]
-enum Line {
-    /// A standalone line that owns a contiguous slice in
-    /// the [`TokenizedFile::buffer`] arena.
-    Standalone(Range<usize>),
-    /// A 0-based line that is part of a multi-line token started on
-    /// another line.
-    Spanned(usize),
-    /// A 0-based line that is part of a multi-line token started on
-    /// another line *and* contains additional tokens local to itself.
-    SpannedWithTokens(usize, Range<usize>),
+struct Line {
+    /// Token that began on an earlier line (`None` for standalone lines).
+    continued_from: Option<LineNumber>,
+    /// Contiguous tokens that started on this line (`start >= end` iff empty).
+    local_range: Range<TokenIndex>,
 }
 
 /// A tokenized, lossless representation of an UnrealScript source file.
@@ -175,7 +184,7 @@ impl<'src> Tokenizer<'src> {
         // exactly `1` interior line)
         let insert_count = newline_count - 1;
         for _ in 0..insert_count {
-            self.lines.push(Line::Spanned(start_line));
+            self.lines.push(Line::spanned(start_line));
         }
         // This is called *after* `commit_current_line()` cleared the
         // previously stored value
@@ -200,9 +209,9 @@ impl<'src> Tokenizer<'src> {
         // `commit_current_line()` only applies it once.
         // This guarantees no "bleed" between adjacent multi-line tokens.
         if let Some(from) = self.multi_line_start.take() {
-            self.lines.push(Line::SpannedWithTokens(from, slice));
+            self.lines.push(Line::spanned_with_tokens(from, slice));
         } else {
-            self.lines.push(Line::Standalone(slice));
+            self.lines.push(Line::standalone(slice));
         }
         self.slice_start_index = slice_end;
     }
@@ -216,7 +225,7 @@ impl<'src> Tokenizer<'src> {
         // If we still have a `multi_line_start` (i.e. a pure multi-line token
        // with no local tokens on its last line), push a bare `Line::spanned`
        // entry.
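        // Illustrative example: a block comment that opens on line 3
        // (0-based index 2) and runs to end-of-file leaves its final line
        // with no local tokens, so that line is recorded as `Line::spanned(2)`.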
        if let Some(from) = self.multi_line_start.take() {
-            self.lines.push(Line::Spanned(from));
+            self.lines.push(Line::spanned(from));
         }
 
         // Optimize for size
@@ -251,6 +260,72 @@ fn token_can_span_lines(token: &Token) -> bool {
     )
 }
 
+impl Line {
+    /// Creates a standalone line that owns a contiguous slice in
+    /// the [`TokenizedFile::buffer`] arena.
+    fn standalone(locals: Range<TokenIndex>) -> Line {
+        Line {
+            continued_from: None,
+            local_range: locals,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line, referencing the 0-based index of its origin.
+    fn spanned(carried: LineNumber) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: 0..0,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line and also contains additional tokens local to itself.
+    fn spanned_with_tokens(carried: LineNumber, locals: Range<TokenIndex>) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: locals,
+        }
+    }
+
+    /// Returns the range of tokens inside [`TokenizedFile::buffer`] that
+    /// start on this line.
+    ///
+    /// [`None`] means there are no such tokens. Otherwise the range is
+    /// guaranteed to be non-empty.
+    fn local_range(&self) -> Option<Range<TokenIndex>> {
+        if self.local_range.is_empty() {
+            None
+        } else {
+            Some(self.local_range.clone())
+        }
+    }
+
+    /// Returns the number of tokens on this line.
+    ///
+    /// Counts both the tokens that started on this line and a token that
+    /// continued from a previous one.
+    fn len(&self) -> usize {
+        (self.continued_from.is_some() as usize) + (self.local_range.end - self.local_range.start)
+    }
+}
+
+impl PartialEq for TokenLocation {
+    fn eq(&self, other: &TokenLocation) -> bool {
+        self.line_number == other.line_number && self.column == other.column
+    }
+}
+
+impl PartialOrd for TokenLocation {
+    fn partial_cmp(&self, other: &TokenLocation) -> Option<Ordering> {
+        if self.line_number == other.line_number {
+            self.column.partial_cmp(&other.column)
+        } else {
+            self.line_number.partial_cmp(&other.line_number)
+        }
+    }
+}
+
 /// Counts the number of newlines in the given text.
 fn count_newlines(text: &str) -> usize {
     let mut bytes_iterator = text.as_bytes().iter().peekable();
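
The line-major, column-minor ordering implemented by `PartialOrd for
TokenLocation` above can be sanity-checked in isolation. A minimal standalone
sketch, with the `TokenLocation` definition copied from this patch and an
illustrative `main` whose sample values are made up:

    use std::cmp::Ordering;

    #[derive(Eq, Clone, Copy)]
    struct TokenLocation {
        line_number: usize,
        column: usize,
    }

    impl PartialEq for TokenLocation {
        fn eq(&self, other: &TokenLocation) -> bool {
            self.line_number == other.line_number && self.column == other.column
        }
    }

    impl PartialOrd for TokenLocation {
        fn partial_cmp(&self, other: &TokenLocation) -> Option<Ordering> {
            if self.line_number == other.line_number {
                self.column.partial_cmp(&other.column)
            } else {
                self.line_number.partial_cmp(&other.line_number)
            }
        }
    }

    fn main() {
        let early = TokenLocation { line_number: 2, column: 10 };
        let later_line = TokenLocation { line_number: 3, column: 0 };
        let later_col = TokenLocation { line_number: 3, column: 4 };

        // An earlier line compares as smaller, regardless of column.
        assert!(early < later_line);
        // On the same line, the column breaks the tie.
        assert!(later_line < later_col);
        println!("TokenLocation ordering behaves as expected");
    }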