Add iterator over tokens to TokenizedFile

Add missing tokens to lexer
Rename TokenSpan to TokenPiece and tidy code layout
2025-08-07 17:24:35 +07:00 · 2025-08-07 13:49:19 +07:00 · 2025-08-06 23:18:08 +07:00 · 2025-08-06 23:17:55 +07:00
4 changed files with 425 additions and 119 deletions
--- a/rottlib/src/lexer/debug_tools.rs
+++ b/rottlib/src/lexer/debug_tools.rs
@ -7,8 +7,6 @@
 //!
 //! These checks have been moved to the parent module.

-use super::Line;
-
 /// A technical trait that adds debug helpers to the lexer.
 pub trait DebugTools {
    /// Pretty-prints the internal layout of the tokenised file - useful when
@ -31,62 +29,55 @@ pub trait DebugTools {

 impl<'src> DebugTools for super::TokenizedFile<'src> {
    fn reconstruct_source(&self) -> String {
-        let mut result = String::new();
-        for line in &self.lines {
-            if let Line::Standalone(token_range) | Line::SpannedWithTokens(_, token_range) = line {
-                for span in &self.buffer[token_range.clone()] {
-                    result.push_str(span.lexeme);
-                }
-            }
-        }
-        result
+        self.buffer.iter().map(|span| span.lexeme).collect()
    }

    fn dump_debug_layout(&self) {
-        for (row_index, line) in self.lines.iter().enumerate() {
-            println!("Line {}", row_index + 1);
-            match line {
-                Line::Standalone(token_range) => {
+        for (row_idx, line) in self.lines.iter().enumerate() {
+            println!("Line {}", row_idx + 1);
+
+            match (line.continued_from, line.local_range()) {
+                // Stand-alone line (all tokens start here)
+                (None, Some(range)) => {
                    println!("\t[Standalone]");
-                    let mut column_utf16 = 0usize;
-                    for next_token_span in &self.buffer[token_range.clone()] {
-                        let token_beginning = column_utf16;
-                        let token_end = column_utf16 + next_token_span.length_utf16;
-                        println!(
-                            "\t\t{:?} @ {}-{}: {:?}",
-                            next_token_span.token,
-                            token_beginning,
-                            token_end,
-                            next_token_span.lexeme
-                        );
-                        column_utf16 = token_end;
+                    dump_spans(&self.buffer[range.clone()]);
                }
-                }
-                Line::Spanned(origin_row) => {
-                    // `origin_row` is 0-based
+
+                // Pure continuation - the only thing on this line is
+                // the remainder of a multi-line token that started earlier.
+                (Some(origin_row), None) => {
                    println!(
-                        "\t[Continued from line {} - no new tokens here]",
+                        "\t[Continued from line {} – no new tokens here]",
                        origin_row + 1
                    );
                }
-                Line::SpannedWithTokens(origin_row, token_range) => {
-                    // `origin_row` is 0-based
+
+                // Continuation **plus** some fresh tokens that begin here.
+                (Some(origin_row), Some(range)) => {
                    println!("\t[Continued from line {} + new tokens]", origin_row + 1);
-                    let mut column_utf16 = 0usize;
-                    for next_token_span in &self.buffer[token_range.clone()] {
-                        let token_beginning = column_utf16;
-                        let token_end = column_utf16 + next_token_span.length_utf16;
-                        println!(
-                            "\t\t{:?} @ {}-{}: {:?}",
-                            next_token_span.token,
-                            token_beginning,
-                            token_end,
-                            next_token_span.lexeme
-                        );
-                        column_utf16 = token_end;
+                    dump_spans(&self.buffer[range.clone()]);
                }
+
+                // An empty physical line (should be rare, but let's be safe).
+                (None, None) => {
+                    println!("\t[Empty line]");
                }
            }
        }
    }
 }
+
+/// Helper that prints every span in `spans` together with its UTF-16
+/// column boundaries.
+fn dump_spans<'a>(spans: &[super::TokenPiece<'a>]) {
+    let mut col_utf16 = 0usize;
+    for span in spans {
+        let start = col_utf16;
+        let end = start + span.length_utf16;
+        println!(
+            "\t\t{:?} @ {}–{}: {:?}",
+            span.token, start, end, span.lexeme
+        );
+        col_utf16 = end;
+    }
+}
--- a/rottlib/src/lexer/iterator.rs
+++ b/rottlib/src/lexer/iterator.rs
@ -0,0 +1,191 @@
+//! Sub-module that adds an iterator to [`TokenizedFile`] which yields tokens in
+//! the order they appear in the source code.
+//!
+//! ## Examples
+//!
+//! ```rust
+//! let iter = TokenizedFile::from_str("0 / 0").tokens().without_whitespace();
+//! ```
+//!
+//! ## Terminology: continued tokens
+//!
+//! Some [`super::Token`]s (e.g. [`super::Token::CppText`] or
+//! [`super::Token::BlockComment`]) can span multiple lines and are recorded on
+//! every line on which they appear (usually as the first, and sometimes
+//! the only, token).
+//! In this module these are referred to as "continued" or
+//! "carried-over" tokens.
+//! Since our iterator needs to return each token only once, we take special
+//! care to skip such continued tokens during iteration.
+
+use super::{TokenLocation, TokenPiece, TokenizedFile};
+
+/// An immutable iterator over all tokens in a [`TokenizedFile`], preserving
+/// their order of appearance in the original source file.
+///
+/// After exhaustion it keeps returning [`None`].
+#[must_use]
+#[derive(Clone, Debug)]
+pub struct Tokens<'src> {
+    /// [`TokenLocation`] of the next token to be returned.
+    ///
+    /// [`None`] means the iterator has been exhausted.
+    cursor: Option<TokenLocation>,
+    /// [`TokenizedFile`] whose tokens we’re iterating over.
+    source_file: &'src TokenizedFile<'src>,
+    /// When `true`, whitespace tokens are skipped.
+    skip_whitespace: bool,
+}
+
+// Once `next` returns [`None`] the cursor never changes, so later calls do too.
+impl<'src> std::iter::FusedIterator for Tokens<'src> {}
+
+impl<'src> Tokens<'src> {
+    /// Makes the iterator skip all whitespace tokens.
+    #[must_use]
+    #[inline]
+    pub fn without_whitespace(mut self) -> Self {
+        self.skip_whitespace = true;
+        self
+    }
+
+    // Returns the position of the next new token, skipping carried-over pieces
+    // and blank lines.
+    fn advance_position(&self, mut position: TokenLocation) -> Option<TokenLocation> {
+        if let Some(current_line) = self.source_file.lines.get(position.line) {
+            // `Line::len()` also counts a possible token that continued from
+            // the previous line.
+            if position.column + 1 < current_line.len() {
+                position.column += 1;
+                return Some(position);
+            }
+        }
+        // Current line is exhausted: walk downward until we find the first line
+        // that **owns local tokens**, because we only want *new* tokens,
+        // not continued from previous lines (they were already iterated over).
+        position.line += 1;
+        while let Some(next_line) = self.source_file.lines.get(position.line) {
+            if next_line.local_range().is_some() {
+                // Start at the first *local* token,
+                // skipping any carried-over one
+                position.column = if next_line.continued_from.is_some() {
+                    1
+                } else {
+                    0
+                };
+                return Some(position);
+            }
+            position.line += 1; // keep skipping empty / pure-carried lines
+        }
+        // No more tokens.
+        None
+    }
+
+    // Creates a new iterator.
+    fn new(source_file: &'src TokenizedFile) -> Tokens<'src> {
+        let mut new_iterator = Tokens {
+            source_file,
+            cursor: Some(TokenLocation { line: 0, column: 0 }),
+            skip_whitespace: false,
+        };
+        // We need to land on the first existing token so [`Iterator::next`]
+        // can assume cursor is valid.
+        while let Some(token_position) = new_iterator.cursor {
+            if new_iterator.source_file.get(token_position).is_some() {
+                break;
+            }
+            new_iterator.cursor = new_iterator.advance_position(token_position);
+        }
+        new_iterator
+    }
+}
+
+impl<'src> Iterator for Tokens<'src> {
+    type Item = (TokenLocation, TokenPiece<'src>);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // We only ever loop to discard whitespaces when the flag is on
+        loop {
+            let current_cursor = self.cursor?;
+            let token_piece = *self.source_file.get(current_cursor)?;
+            self.cursor = self.advance_position(current_cursor);
+
+            // Optional whitespace-skip
+            if !self.skip_whitespace || !token_piece.token.is_whitespace() {
+                return Some((current_cursor, token_piece));
+            }
+        }
+    }
+}
+
+impl<'src> TokenizedFile<'src> {
+    // Returns the final local token in `line_number`
+    // (used to resolve column 0 of a continued line).
+    fn last_piece_in_line(&self, line_number: usize) -> Option<&TokenPiece> {
+        self.lines
+            .get(line_number)
+            .and_then(|line| line.local_range())
+            // `Line::local_range()` is guaranteed to return non-empty `Range`.
+            .and_then(|range| self.buffer.get(range.end - 1))
+    }
+
+    /// Returns [`TokenPiece`] at a given location if it exists.
+    ///
+    /// If the line specified by [`TokenLocation`] starts with a token that
+    /// continues from the previous line - column `0` refers to that token.
+    ///
+    /// Never panics, invalid position returns [`None`].
+    ///
+    /// ## Examples
+    ///
+    /// ```rust
+    /// use mycrate::{TokenizedFile, TokenLocation, Token};
+    /// let file = TokenizedFile::from_str("0 / 0");
+    /// assert_eq!(
+    ///     file.get(TokenLocation { line: 0, column: 2 }).map(|p| p.token),
+    ///     Some(Token::Divide),
+    /// );
+    /// ```
+    #[track_caller]
+    pub fn get(&self, position: TokenLocation) -> Option<&TokenPiece> {
+        let line = self.lines.get(position.line)?;
+        let column = position.column;
+        if column >= line.len() {
+            return None;
+        }
+        if let Some(spanned_line_number) = line.continued_from
+            && column == 0
+        {
+            self.last_piece_in_line(spanned_line_number)
+        } else {
+            // If we have a token that continued from the previous line,
+            // then, relative to `self.buffer`, our `column` is actually 1-based
+            // and we need to shift it back to being 0-based.
+            let token_position =
+                line.local_range.start + column - if line.continued_from.is_some() { 1 } else { 0 };
+            self.buffer.get(token_position)
+        }
+    }
+
+    /// Returns an iterator over all contained tokens in the order they appear
+    /// in the original source file.
+    ///
+    /// By default includes all tokens, including whitespace and comments.
+    ///
+    /// Returns the same iterator as [`TokenizedFile::into_iter`].
+    #[must_use]
+    #[inline]
+    pub fn tokens(&'src self) -> Tokens<'src> {
+        Tokens::new(self)
+    }
+}
+
+impl<'src> IntoIterator for &'src TokenizedFile<'src> {
+    type Item = (TokenLocation, TokenPiece<'src>);
+    type IntoIter = Tokens<'src>;
+
+    #[inline]
+    fn into_iter(self) -> Self::IntoIter {
+        self.tokens()
+    }
+}
--- a/rottlib/src/lexer/lexing.rs
+++ b/rottlib/src/lexer/lexing.rs
@ -2,7 +2,7 @@
 //!
 //! ## Notable details
 //!
-//! Lexer for UnrealScript that recognises inline `cpptext { … }` blocks.
+//! Lexer for UnrealScript that recognizes inline `cpptext { … }` blocks.
 //!
 //! In UnrealScript, `cpptext` lets authors embed raw C++ between braces.  
 //! Because whitespace, newlines, or comments may appear between the
@ -41,14 +41,14 @@ pub struct LexerState {
 }

 /// Are these braces "real" UnrealScript braces, or the start/end of a C++ block?
-#[derive(Debug, PartialEq, Clone, Copy)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
 pub enum BraceKind {
    Normal,
    CppBlock,
 }

 /// All UnrealScript tokens that our compiler distinguishes.
-#[derive(logos::Logos, Debug, PartialEq, Clone, Copy)]
+#[derive(logos::Logos, Debug, PartialEq, Eq, Hash, Clone, Copy)]
 #[logos(extras = LexerState)]
 pub enum Token {
    // # Compiler/directive keywords
@ -247,9 +247,9 @@ pub enum Token {
    #[token("~")]
    BitwiseNot,
    // ## Vector
-    #[token("dot")]
+    #[regex("(?i)dot")]
    Dot,
-    #[token("cross")]
+    #[regex("(?i)cross")]
    Cross,
    // ## Multiplicative
    #[token("*")]
@ -290,6 +290,8 @@ pub enum Token {
    NotEqual,
    #[token("~=")]
    ApproximatelyEqual,
+    #[regex("(?i)clockwisefrom")]
+    ClockwiseFrom,
    // ## Bitwise
    #[token("&")]
    BitwiseAnd,
@ -297,11 +299,11 @@ pub enum Token {
    BitwiseOr,
    #[token("^")]
    BitwiseXor,
-    #[token("^^")]
-    BooleanXor,
    // ## Logical
    #[token("&&")]
    And,
+    #[token("^^")]
+    Xor,
    #[token("||")]
    Or,
    // ## Assigments
@ -311,6 +313,8 @@ pub enum Token {
    MultiplyAssign,
    #[token("/=")]
    DivideAssign,
+    #[token("%=")]
+    ModuloAssign,
    #[token("+=")]
    PlusAssign,
    #[token("-=")]
@ -341,6 +345,10 @@ pub enum Token {
    Period,
    #[token(":")]
    Colon,
+    #[token("#")]
+    Hash,
+    #[token("?")]
+    Question,

    // # Comments & whitespaces
    #[regex(r"//[^\r\n]*")]
@ -356,6 +364,30 @@ pub enum Token {
    Error,
 }

+impl Token {
+    /// Returns `true` if this token is a newline (`Token::NewLine`).
+    pub fn is_newline(&self) -> bool {
+        matches!(self, Token::NewLine)
+    }
+
+    /// Returns `true` if this token is trivia whitespace
+    /// (`Token::Whitespace` or `Token::NewLine`).
+    ///
+    /// Note: comments are **not** considered whitespace.
+    pub fn is_whitespace(&self) -> bool {
+        matches!(&self, Token::Whitespace | Token::NewLine)
+    }
+
+    /// Returns `true` if this token may span multiple physical lines
+    /// (i.e. can contain newline characters).
+    pub fn can_span_lines(&self) -> bool {
+        matches!(
+            self,
+            Token::BlockComment | Token::Brace(BraceKind::CppBlock) | Token::Error
+        )
+    }
+}
+
 /// Consume a /* ... */ block comment with arbitrary nesting
 /// (like UnrealScript allows).
 ///
--- a/rottlib/src/lexer/mod.rs
+++ b/rottlib/src/lexer/mod.rs
@ -12,22 +12,28 @@
 //!     precompute lengths of each token in that encoding, making interfacing
 //!     easier.
 //!
+//! ## Iteration over tokens
+//!
+//! For simplicity, the code for iterating over the tokens of a
+//! [`TokenizedFile`] lives in a separate submodule, [`iterator`].
+//!
 //! ## Opt-in debug helpers
 //!
 //! Extra diagnostics become available in **debug builds** or when the crate is
-//! compiled with `debug` feature enabled. They live in the [`DebugTools`]
+//! compiled with `debug` feature enabled. They live in the [`debug_tools`]
 //! extension trait, implemented for [`TokenizedFile`].
 //!
 //! ```
 //! // bring the trait into scope
 //! use lexer::DebugTools;
 //!
-//! let file = TokenizedFile::from_source(src);
+//! let file = TokenizedFile::from_str(src);
 //! file.debug_dump();              // pretty-print token layout
 //! let text = file.to_source();    // reconstruct original text
 //! ```

 mod debug_tools;
+mod iterator;
 mod lexing;

 use std::ops::Range;
@ -36,6 +42,7 @@ use logos::Logos;

 #[cfg(any(debug_assertions, feature = "debug"))]
 pub use debug_tools::DebugTools;
+pub use iterator::Tokens;
 pub use lexing::Token;

 /// Empirically chosen starting size for token buffer (used during tokenization)
@ -45,34 +52,34 @@ const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;
 /// A slice tagged with its token kind plus two length counters.
 ///
 /// *No absolute coordinates* are stored - they are recomputed per line.
-#[derive(Debug, Clone, Copy)]
-struct TokenSpan<'src> {
-    lexeme: &'src str,
-    token: Token,
-    length_utf16: usize,
+#[derive(Debug, Hash, Clone, Copy, PartialEq, Eq)]
+pub struct TokenPiece<'src> {
+    /// Token, represented by this [`TokenPiece`].
+    pub token: Token,
+    /// Underlying text that was lexed as the corresponding token.
+    pub lexeme: &'src str,
+    /// Length of the token in UTF-16 code units for the needs of easy seeking
+    /// using given LSP cursor coordinates (line + UTF-16 offset).
+    /// Precomputed for convenience.
+    pub length_utf16: usize,
 }

-/// Representation of a single physical line of the source file.
-///
-/// [`Range<usize>`] are used instead of slices to avoid creating
-/// a self-referential struct (with [`TokenizedFile`]), which rust forbids.
-#[derive(Clone)]
-enum Line {
-    /// A standalone line that owns a contiguous slice in
-    /// the [`TokenizedFile::buffer`] arena.
-    Standalone(Range<usize>),
-    /// A 0-based line that is part of a multi-line token started on
-    /// another line.
-    Spanned(usize),
-    /// A 0-based line that is part of a multi-line token started on
-    /// another line *and* contains additional tokens local to itself.
-    SpannedWithTokens(usize, Range<usize>),
+/// Defines location of a token inside [`TokenizedFile`] in a way, convenient
+/// for communicating through LSP.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub struct TokenLocation {
+    /// 0-based line number.
+    pub line: usize,
+    /// 0-based index of a token in the line, possibly including the token that
+    /// has continued from the previous line.
+    pub column: usize,
 }

 /// A tokenized, lossless representation of an UnrealScript source file.
+#[derive(Debug)]
 pub struct TokenizedFile<'src> {
    /// Arena of every token span in this file.
-    buffer: Vec<TokenSpan<'src>>,
+    buffer: Vec<TokenPiece<'src>>,
    /// Mapping that provides an easy and efficient access to tokens by
    /// line number.
    lines: Vec<Line>,
@ -81,14 +88,17 @@ pub struct TokenizedFile<'src> {
 }

 /// Mutable state that encapsulates data needed during the tokenization loop.
+///
+/// Access to stored tokens is provided through the [`iterator::Tokens`]
+/// iterator.
 struct Tokenizer<'src> {
-    /// Arena that owns every [`TokenSpan`] produced for the file.
-    buffer: Vec<TokenSpan<'src>>,
+    /// Arena that owns every [`TokenPiece`] produced for the file.
+    buffer: Vec<TokenPiece<'src>>,
    /// Mapping from physical line number to the tokens that belong to it.
    lines: Vec<Line>,
    /// The current 0-based physical line number.
    line_number: usize,
-    /// Index in [`Tokenizer::buffer`] where the current line starts.
+    /// Index in [`Tokenizer::buffer`] where the current *line* starts.
    slice_start_index: usize,
    /// When a multi-line token is being scanned, stores the 0-based line
    /// on which it started; [`None`] otherwise.
@ -99,25 +109,43 @@ struct Tokenizer<'src> {

 impl<'src> TokenizedFile<'src> {
    /// Tokenize `source` and return a fresh [`TokenizedFile`].
-    pub fn from_source(source: &'src str) -> TokenizedFile<'src> {
-        let mut tokenizer = TokenizedFile::<'src>::builder();
+    ///
+    /// ## Examples
+    ///
+    /// ```rust
+    /// let source_text = "2 + 2 * 2".to_string();
+    /// let tokenized_file = TokenizedFile::from_str(&source_text);
+    /// ```
+    #[must_use]
+    pub fn from_str(source: &'src str) -> TokenizedFile<'src> {
+        let mut tokenizer = Self::builder();
        let mut lexer = Token::lexer(source);

-        // Logos > Ok() > token > token span <- plugged into tokenizer
        while let Some(token_result) = lexer.next() {
+            // Add `Token::Error` manually, since Logos won't do it for us.
            let token = token_result.unwrap_or_else(|_| {
                tokenizer.had_errors = true;
                Token::Error
            });
-            let token_span = build_span(token, lexer.slice());
-            tokenizer.process_token_span(token_span);
+            let token_piece = make_token_piece(token, lexer.slice());
+            tokenizer.process_token_piece(token_piece);
        }
        tokenizer.into_tokenized_file()
    }

    /// Returns [`true`] if any erroneous tokens were produced during building
    /// of this [`TokenizedFile`].
-    pub fn had_errors(&self) -> bool {
+    ///
+    /// ## Examples
+    ///
+    /// ```rust
+    /// let tokenized_file = TokenizedFile::from_str("function test() {}");
+    /// if tokenized_file.has_errors() {
+    ///     println!("Error while parsing file: {}", path.display());
+    /// }
+    /// ```
+    #[inline]
+    pub fn has_errors(&self) -> bool {
        self.had_errors
    }

@ -134,38 +162,112 @@ impl<'src> TokenizedFile<'src> {
    }
 }

+/// Type for indexing lines in a [`TokenizedFile`].
+type LineIdx = usize;
+
+/// Type for specific tokens inside each [`Line`].
+type TokenIdx = usize;
+
+/// Representation of a single physical line of the source file.
+///
+/// [`Range<TokenIdx>`] are used instead of slices to avoid creating
+/// a self-referential struct (with [`TokenizedFile`]), which rust forbids.
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+struct Line {
+    /// Token that began on an earlier line (`None` for standalone lines).
+    continued_from: Option<LineIdx>,
+    /// Contiguous tokens that started on this line (`start >= end` iff empty).
+    local_range: Range<TokenIdx>,
+}
+
+impl Line {
+    /// Creates a standalone line that owns a contiguous slice in
+    /// the [`TokenizedFile::buffer`] arena.
+    #[inline]
+    fn standalone(locals: Range<TokenIdx>) -> Line {
+        Line {
+            continued_from: None,
+            local_range: locals,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line, referencing the 0-based index of its origin.
+    #[inline]
+    fn spanned(carried: LineIdx) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: 0..0,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line and also contains additional tokens local to itself.
+    #[inline]
+    fn spanned_with_tokens(carried: LineIdx, locals: Range<TokenIdx>) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: locals,
+        }
+    }
+
+    /// Returns a range of tokens inside [`TokenizedFile::buffer`] that start
+    /// on this line.
+    ///
+    /// [`None`] means there are no such tokens. Otherwise range is guaranteed
+    /// to not be empty.
+    #[inline]
+    fn local_range(&self) -> Option<Range<TokenIdx>> {
+        if self.local_range.is_empty() {
+            None
+        } else {
+            Some(self.local_range.clone())
+        }
+    }
+
+    /// Returns the number of tokens on the line.
+    ///
+    /// Counts both tokens that started on this line and tokens that continued
+    /// from previous one.
+    #[inline]
+    fn len(&self) -> usize {
+        (if self.continued_from.is_some() { 1 } else { 0 })
+            + (self.local_range.end - self.local_range.start)
+    }
+}
+
 impl<'src> Tokenizer<'src> {
    /// Handles a token span and dispatches to the appropriate handler.
-    fn process_token_span(&mut self, token_span: TokenSpan<'src>) {
-        if token_can_span_lines(&token_span.token) {
-            self.process_multi_line_token(token_span);
+    fn process_token_piece(&mut self, token_piece: TokenPiece<'src>) {
+        if token_piece.token.can_span_lines() {
+            self.process_multi_line_token(token_piece);
        } else {
-            self.process_single_line_token(token_span);
+            self.process_single_line_token(token_piece);
        }
    }

    /// Handles tokens that never span multiple lines.
-    fn process_single_line_token(&mut self, token_span: TokenSpan<'src>) {
-        if token_is_newline(&token_span.token) {
+    fn process_single_line_token(&mut self, token_piece: TokenPiece<'src>) {
+        if token_piece.token.is_newline() {
            self.line_number += 1;
-            self.buffer.push(token_span);
+            self.buffer.push(token_piece);
            self.commit_current_line();
        } else {
-            self.buffer.push(token_span);
+            self.buffer.push(token_piece);
        }
    }

    /// Handles tokens that may contain one or more newline characters.
-    fn process_multi_line_token(&mut self, token_span: TokenSpan<'src>) {
+    fn process_multi_line_token(&mut self, token_piece: TokenPiece<'src>) {
        let start_line = self.line_number;
-        let newline_count = count_newlines(token_span.lexeme);
+        let newline_count = count_line_breaks(token_piece.lexeme);

        // Did this token end in a newline?
        // This can happen if this is an `Error` token that ends the file.
        let ends_with_newline =
-            token_span.lexeme.ends_with('\n') || token_span.lexeme.ends_with('\r');
+            token_piece.lexeme.ends_with('\n') || token_piece.lexeme.ends_with('\r');

-        self.buffer.push(token_span);
+        self.buffer.push(token_piece);
        // We only need to commit the line if this token actually ended the line
        if newline_count > 0 {
            self.commit_current_line();
@ -175,7 +277,7 @@ impl<'src> Tokenizer<'src> {
            // exactly `1` interior line)
            let insert_count = newline_count - 1;
            for _ in 0..insert_count {
-                self.lines.push(Line::Spanned(start_line));
+                self.lines.push(Line::spanned(start_line));
            }
            // This is called *after* `commit_current_line()` cleared previous
            // stored value
@ -196,13 +298,13 @@ impl<'src> Tokenizer<'src> {
            let slice = self.slice_start_index..slice_end;

            // If we were in the middle of a multi-line token, we
-            // *always* consume `spanned_from` here, ensuring that each call to
-            // `commit_current_line()` only applies it once.
+            // *always* consume `multi_line_start` here, ensuring that each call
+            // to `commit_current_line()` only applies it once.
            // This guarantees no "bleed" between adjacent multi-line tokens.
            if let Some(from) = self.multi_line_start.take() {
-                self.lines.push(Line::SpannedWithTokens(from, slice));
+                self.lines.push(Line::spanned_with_tokens(from, slice));
            } else {
-                self.lines.push(Line::Standalone(slice));
+                self.lines.push(Line::standalone(slice));
            }
            self.slice_start_index = slice_end;
        }
@ -213,10 +315,11 @@ impl<'src> Tokenizer<'src> {
    fn into_tokenized_file(mut self) -> TokenizedFile<'src> {
        // Commit any trailing tokens
        self.commit_current_line();
-        // If we still have a `spanned_from` (i.e. a pure multi-line token with
-        // no local tokens on its last line), push a bare `Spanned` entry.
+        // If we still have a `multi_line_start`
+        // (i.e. a pure multi-line token with no local tokens on its last line),
+        // push a bare `Line::spanned` entry.
        if let Some(from) = self.multi_line_start.take() {
-            self.lines.push(Line::Spanned(from));
+            self.lines.push(Line::spanned(from));
        }

        // Optimize for size
@ -231,28 +334,17 @@ impl<'src> Tokenizer<'src> {
    }
 }

-fn build_span<'src>(token: Token, text: &'src str) -> TokenSpan<'src> {
+fn make_token_piece<'src>(token: Token, text: &'src str) -> TokenPiece<'src> {
    let length_utf16 = text.encode_utf16().count();
-    TokenSpan {
+    TokenPiece {
        lexeme: text,
        token,
        length_utf16,
    }
 }

-fn token_is_newline(token: &Token) -> bool {
-    matches!(token, Token::NewLine)
-}
-
-fn token_can_span_lines(token: &Token) -> bool {
-    matches!(
-        token,
-        Token::BlockComment | Token::Brace(lexing::BraceKind::CppBlock) | Token::Error
-    )
-}
-
 /// Counts the number of new lines in given text.
-fn count_newlines(text: &str) -> usize {
+fn count_line_breaks(text: &str) -> usize {
    let mut bytes_iterator = text.as_bytes().iter().peekable();
    let mut newline_count = 0;
    while let Some(&next_byte) = bytes_iterator.next() {
Author	SHA1	Message	Date
dkanus	e2d17f2e8a	Add iterator over tokens to TokenizedFile	2025-08-07 17:24:35 +07:00
dkanus	9ab65b0b02	Add missing tokens to lexer	2025-08-07 13:49:19 +07:00
dkanus	9ff20c7a60	Rename TokenSpan to TokenPiece and tidy code layout	2025-08-06 23:18:08 +07:00
dkanus	579c2a4d3d	Refactor `Line` Previous definition of `Line` type was obnoxious and too difficult to work with. This one should make iterator implementation much easier and has clearer structure on its own.	2025-08-06 23:17:55 +07:00