From 9ff20c7a6049af9926f5a67fbbbb91b4e446ec4b Mon Sep 17 00:00:00 2001
From: dkanus
Date: Wed, 6 Aug 2025 23:18:08 +0700
Subject: [PATCH] Rename TokenSpan to TokenPiece and tidy code layout

---
 rottlib/src/lexer/mod.rs | 174 +++++++++++++++++++--------------------
 1 file changed, 87 insertions(+), 87 deletions(-)

diff --git a/rottlib/src/lexer/mod.rs b/rottlib/src/lexer/mod.rs
index 8356280..f696f7f 100644
--- a/rottlib/src/lexer/mod.rs
+++ b/rottlib/src/lexer/mod.rs
@@ -46,7 +46,7 @@ const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;
 ///
 /// *No absolute coordinates* are stored - they are recomputed per line.
 #[derive(Debug, Clone, Copy)]
-pub struct TokenSpan<'src> {
+pub struct TokenPiece<'src> {
     pub lexeme: &'src str,
     pub token: Token,
     pub length_utf16: usize,
@@ -60,28 +60,32 @@ pub struct TokenLocation {
     column: usize,
 }
+impl PartialEq for TokenLocation {
+    fn eq(&self, other: &TokenLocation) -> bool {
+        self.line_number == other.line_number && self.column == other.column
+    }
+}
+
+impl PartialOrd for TokenLocation {
+    fn partial_cmp(&self, other: &TokenLocation) -> Option<std::cmp::Ordering> {
+        if self.line_number == other.line_number {
+            self.column.partial_cmp(&other.column)
+        } else {
+            self.line_number.partial_cmp(&other.line_number)
+        }
+    }
+}
+
 /// Type for indexing lines in a [`TokenizedFile`].
 type LineNumber = usize;
 
 /// Type for specific tokens inside each [`Line`].
 type TokenIndex = usize;
 
-/// Representation of a single physical line of the source file.
-///
-/// [`Range`] are used instead of slices to avoid creating
-/// a self-referential struct (with [`TokenizedFile`]), which rust forbids.
-#[derive(Clone)]
-struct Line {
-    /// Token that began on an earlier line (`None` for standalone lines).
-    continued_from: Option<LineNumber>,
-    /// Contiguous tokens that started on this line (`start >= end` iff empty).
-    local_range: Range<TokenIndex>,
-}
-
 /// A tokenized, lossless representation of an UnrealScript source file.
 pub struct TokenizedFile<'src> {
     /// Arena of every token span in this file.
-    buffer: Vec<TokenSpan<'src>>,
+    buffer: Vec<TokenPiece<'src>>,
     /// Mapping that provides an easy and efficient access to tokens by
     /// line number.
     lines: Vec<Line>,
 }
@@ -91,8 +95,8 @@
 
 /// Mutable state that encapsulates data needed during the tokenization loop.
 struct Tokenizer<'src> {
-    /// Arena that owns every [`TokenSpan`] produced for the file.
-    buffer: Vec<TokenSpan<'src>>,
+    /// Arena that owns every [`TokenPiece`] produced for the file.
+    buffer: Vec<TokenPiece<'src>>,
     /// Mapping from physical line number to the tokens that belong to it.
     lines: Vec<Line>,
     /// The current 0-based physical line number.
@@ -143,9 +147,71 @@ impl<'src> TokenizedFile<'src> {
     }
 }
 
+/// Representation of a single physical line of the source file.
+///
+/// [`Range`]s are used instead of slices to avoid creating
+/// a self-referential struct (with [`TokenizedFile`]), which Rust forbids.
+#[derive(Clone)]
+struct Line {
+    /// Token that began on an earlier line (`None` for standalone lines).
+    continued_from: Option<LineNumber>,
+    /// Contiguous tokens that started on this line (`start >= end` iff empty).
+    local_range: Range<TokenIndex>,
+}
+
+impl Line {
+    /// Creates a standalone line that owns a contiguous slice in
+    /// the [`TokenizedFile::buffer`] arena.
+    fn standalone(locals: Range<TokenIndex>) -> Line {
+        Line {
+            continued_from: None,
+            local_range: locals,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line, referencing the 0-based index of its origin.
+    fn spanned(carried: LineNumber) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: 0..0,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line and also contains additional tokens local to itself.
+    fn spanned_with_tokens(carried: LineNumber, locals: Range<TokenIndex>) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: locals,
+        }
+    }
+
+    /// Returns a range of tokens inside [`TokenizedFile::buffer`] that start
+    /// on this line.
+    ///
+    /// [`None`] means there are no such tokens; otherwise, the range is
+    /// guaranteed to be non-empty.
+    fn local_range(&self) -> Option<Range<TokenIndex>> {
+        if self.local_range.is_empty() {
+            None
+        } else {
+            Some(self.local_range.clone())
+        }
+    }
+
+    /// Returns the number of tokens on this line.
+    ///
+    /// Counts both tokens that started on this line and tokens that continued
+    /// from the previous one.
+    fn len(&self) -> usize {
+        (self.continued_from.is_some() as usize) + (self.local_range.end - self.local_range.start)
+    }
+}
+
 impl<'src> Tokenizer<'src> {
     /// Handles a token span and dispatches to the appropriate handler.
-    fn process_token_span(&mut self, token_span: TokenSpan<'src>) {
+    fn process_token_span(&mut self, token_span: TokenPiece<'src>) {
         if token_can_span_lines(&token_span.token) {
             self.process_multi_line_token(token_span);
         } else {
@@ -154,7 +220,7 @@
     }
 
     /// Handles tokens that never span multiple lines.
-    fn process_single_line_token(&mut self, token_span: TokenSpan<'src>) {
+    fn process_single_line_token(&mut self, token_span: TokenPiece<'src>) {
         if token_is_newline(&token_span.token) {
             self.line_number += 1;
             self.buffer.push(token_span);
@@ -165,7 +231,7 @@
     }
 
     /// Handles tokens that may contain one or more newline characters.
-    fn process_multi_line_token(&mut self, token_span: TokenSpan<'src>) {
+    fn process_multi_line_token(&mut self, token_span: TokenPiece<'src>) {
         let start_line = self.line_number;
         let newline_count = count_newlines(token_span.lexeme);
 
@@ -240,9 +306,9 @@
     }
 }
 
-fn build_span<'src>(token: Token, text: &'src str) -> TokenSpan<'src> {
+fn build_span<'src>(token: Token, text: &'src str) -> TokenPiece<'src> {
     let length_utf16 = text.encode_utf16().count();
-    TokenSpan {
+    TokenPiece {
         lexeme: text,
         token,
         length_utf16,
@@ -260,72 +326,6 @@ fn token_can_span_lines(token: &Token) -> bool {
     )
 }
 
-impl Line {
-    /// Creates a standalone line that owns a contiguous slice in
-    /// the [`TokenizedFile::buffer`] arena.
-    fn standalone(locals: Range<TokenIndex>) -> Line {
-        Line {
-            continued_from: None,
-            local_range: locals,
-        }
-    }
-
-    /// Creates a line that is part of a multi-line token started on
-    /// another line, referencing the 0-based index of its origin.
-    fn spanned(carried: LineNumber) -> Line {
-        Line {
-            continued_from: Some(carried),
-            local_range: 0..0,
-        }
-    }
-
-    /// Creates a line that is part of a multi-line token started on
-    /// another line and also contains additional tokens local to itself.
-    fn spanned_with_tokens(carried: LineNumber, locals: Range<TokenIndex>) -> Line {
-        Line {
-            continued_from: Some(carried),
-            local_range: locals,
-        }
-    }
-
-    /// Returns a range of tokens inside [`TokenizedFile::buffer`] that start
-    /// on this line.
-    ///
-    /// [`None`] means there is no such tokens. Otherwise range is guaranteed
-    /// to not be empty.
-    fn local_range(&self) -> Option<Range<TokenIndex>> {
-        if self.local_range.is_empty() {
-            None
-        } else {
-            Some(self.local_range.clone())
-        }
-    }
-
-    /// Returns amount of tokens of the line.
-    ///
-    /// Counts both tokens that started on this line and tokens that continued
-    /// from previous one.
-    fn len(&self) -> usize {
-        (self.continued_from.is_some() as usize) + (self.local_range.end - self.local_range.start)
-    }
-}
-
-impl PartialEq for TokenLocation {
-    fn eq(&self, other: &TokenLocation) -> bool {
-        self.line_number == other.line_number && self.column == other.column
-    }
-}
-
-impl PartialOrd for TokenLocation {
-    fn partial_cmp(&self, other: &TokenLocation) -> Option<std::cmp::Ordering> {
-        if self.line_number == other.line_number {
-            self.column.partial_cmp(&other.column)
-        } else {
-            self.line_number.partial_cmp(&other.line_number)
-        }
-    }
-}
-
 /// Counts the number of new lines in given text.
 fn count_newlines(text: &str) -> usize {
     let mut bytes_iterator = text.as_bytes().iter().peekable();
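
Reviewer note, not part of the patch: the hand-written `PartialEq`/`PartialOrd` impls moved up in this change amount to plain lexicographic ordering over (line_number, column). The standalone sketch below demonstrates the intended semantics; it re-declares a simplified `TokenLocation` with derives purely for illustration, assuming the real struct declares `line_number` before `column` (derived comparison is lexicographic in field-declaration order, so with that order it matches the manual impls):

    // Standalone sketch: derived ordering compares field by field in
    // declaration order, i.e. line first, then column.
    #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
    struct TokenLocation {
        line_number: usize,
        column: usize,
    }

    fn main() {
        let early = TokenLocation { line_number: 1, column: 9 };
        let late = TokenLocation { line_number: 2, column: 0 };
        // An earlier line compares as smaller regardless of column...
        assert!(early < late);
        // ...and on the same line the comparison falls back to the column.
        let left = TokenLocation { line_number: 2, column: 3 };
        let right = TokenLocation { line_number: 2, column: 7 };
        assert!(left < right);
    }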
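
A second self-contained sketch shows the bookkeeping the relocated `Line` type performs: a line stores a `Range` of indices into the shared token arena rather than a slice (avoiding the self-referential struct its doc comment mentions), and its token count adds one when a multi-line token carries over from an earlier line, mirroring the arithmetic in `Line::len`. The concrete line and token indices here are hypothetical:

    use std::ops::Range;

    // Mirrors the patch's `Line`: indices into the token arena, not slices.
    struct Line {
        continued_from: Option<usize>, // line where a spanning token began
        local_range: Range<usize>,     // tokens that start on this line
    }

    fn main() {
        // A line that continues a token from line 3 and owns tokens 10..13.
        let line = Line { continued_from: Some(3), local_range: 10..13 };
        // Same arithmetic as `Line::len`: one carried token plus three local.
        let token_count = (line.continued_from.is_some() as usize)
            + (line.local_range.end - line.local_range.start);
        assert_eq!(token_count, 4);
    }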