Add iterator over tokens to TokenizedFile

dkanus 2025-08-07 17:24:35 +07:00
parent 9ab65b0b02
commit 933722bd42
3 changed files with 306 additions and 74 deletions


@ -0,0 +1,191 @@
//! Sub-module that adds an iterator to [`TokenizedFile`] which yields tokens in
//! the order they appear in the source code.
//!
//! ## Examples
//!
//! ```rust
//! let file = TokenizedFile::from_str("0 / 0");
//! let iter = file.tokens().without_whitespace();
//! ```
//!
//! ## Terminology: continued tokens
//!
//! Some [`super::Token`]s (e.g. [`super::Token::CppText`] or
//! [`super::Token::BlockComment`]) can span multiple lines and are recorded on
//! every line on which they appear (usually as the first, and sometimes
//! the only, token).
//! In this module these are referred to as "continued" or
//! "carried-over" tokens.
//! Since our iterator needs to return each token only once, we take special
//! care to skip such continued tokens during iteration.
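//!
//! For example, a block comment that spans three physical lines is recorded on
//! each of those lines internally, yet the iterator yields it exactly once
//! (a sketch; `Token::BlockComment` as defined by the lexer):
//!
//! ```rust
//! let file = TokenizedFile::from_str("/* line 1\nline 2\n*/ var int a;");
//! let comment_count = file
//!     .tokens()
//!     .filter(|(_, piece)| matches!(piece.token, Token::BlockComment))
//!     .count();
//! assert_eq!(comment_count, 1);
//! ```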
use super::{TokenLocation, TokenPiece, TokenizedFile};
/// An immutable iterator over all tokens in a [`TokenizedFile`], preserving
/// their order of appearance in the original source file.
///
/// After exhaustion it keeps returning [`None`].
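///
/// ```rust
/// // A sketch; the count assumes whitespace is preserved as tokens,
/// // as described in the module docs.
/// let file = TokenizedFile::from_str("0 / 0");
/// let mut tokens = file.tokens();
/// assert_eq!(tokens.by_ref().count(), 5);
/// // Once exhausted, the iterator stays exhausted.
/// assert!(tokens.next().is_none());
/// ```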
#[must_use]
#[derive(Clone, Debug)]
pub struct Tokens<'src> {
/// [`TokenLocation`] of the next token to be returned.
///
/// [`None`] means the iterator has been exhausted.
cursor: Option<TokenLocation>,
/// [`TokenizedFile`] whose tokens we're iterating over.
source_file: &'src TokenizedFile<'src>,
/// When `true`, whitespace tokens are skipped.
skip_whitespace: bool,
}
// Valid because once this iterator returns [`None`], it keeps returning [`None`].
impl<'src> std::iter::FusedIterator for Tokens<'src> {}
impl<'src> Tokens<'src> {
/// Makes the iterator skip all whitespace tokens.
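///
/// ```rust
/// // A sketch: skipping whitespace in "0 / 0" leaves `0`, `/` and `0`.
/// let file = TokenizedFile::from_str("0 / 0");
/// assert_eq!(file.tokens().without_whitespace().count(), 3);
/// ```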
#[must_use]
#[inline]
pub fn without_whitespace(mut self) -> Self {
self.skip_whitespace = true;
self
}
// Returns the position of the next new token, skipping carried-over pieces
// and blank lines.
fn advance_position(&self, mut position: TokenLocation) -> Option<TokenLocation> {
if let Some(current_line) = self.source_file.lines.get(position.line) {
// `Line::len()` also counts a possible token that continued from
// the previous line.
if position.column + 1 < current_line.len() {
position.column += 1;
return Some(position);
}
}
// Current line is exhausted: walk downward until we find the first line
// that **owns local tokens**, because we only want *new* tokens,
// not ones continued from previous lines (those were already yielded).
position.line += 1;
while let Some(next_line) = self.source_file.lines.get(position.line) {
if next_line.local_range().is_some() {
// Start at the first *local* token,
// skipping any carried-over one
position.column = if next_line.continued_from.is_some() {
1
} else {
0
};
return Some(position);
}
position.line += 1; // keep skipping empty / pure-carried lines
}
// No more tokens.
None
}
// Creates a new iterator.
fn new(source_file: &'src TokenizedFile) -> Tokens<'src> {
let mut new_iterator = Tokens {
source_file,
cursor: Some(TokenLocation { line: 0, column: 0 }),
skip_whitespace: false,
};
// We need to land on the first existing token so [`Iterator::next`]
// can assume cursor is valid.
while let Some(token_position) = new_iterator.cursor {
if new_iterator.source_file.get(token_position).is_some() {
break;
}
new_iterator.cursor = new_iterator.advance_position(token_position);
}
new_iterator
}
}
impl<'src> Iterator for Tokens<'src> {
type Item = (TokenLocation, TokenPiece<'src>);
fn next(&mut self) -> Option<Self::Item> {
// We only ever loop to discard whitespace tokens when the flag is on.
loop {
let current_cursor = self.cursor?;
let token_piece = *self.source_file.get(current_cursor)?;
self.cursor = self.advance_position(current_cursor);
// Optional whitespace-skip
if !self.skip_whitespace || !token_piece.token.is_whitespace() {
return Some((current_cursor, token_piece));
}
}
}
}
impl<'src> TokenizedFile<'src> {
// Returns the final local token in `line_number`
// (used to resolve column 0 of a continued line).
fn last_piece_in_line(&self, line_number: usize) -> Option<&TokenPiece> {
self.lines
.get(line_number)
.and_then(|line| line.local_range())
// `Line::local_range()` is guaranteed to return a non-empty `Range`.
.and_then(|range| self.buffer.get(range.end - 1))
}
/// Returns [`TokenPiece`] at a given location if it exists.
///
/// If the line specified by [`TokenLocation`] starts with a token that
/// continued from the previous line, then column `0` refers to that token.
///
/// Never panics; an invalid position simply returns [`None`].
///
/// ## Examples
///
/// ```rust
/// use lexer::{TokenizedFile, TokenLocation, Token};
/// let file = TokenizedFile::from_str("0 / 0");
/// assert_eq!(
/// file.get(TokenLocation { line: 0, column: 2 }).map(|p| p.token),
/// Some(Token::Divide),
/// );
/// ```
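///
/// A line that starts inside a multi-line token resolves column `0` to that
/// carried-over token (a sketch; `Token::BlockComment` as defined by the lexer):
///
/// ```rust
/// use lexer::{TokenizedFile, TokenLocation, Token};
/// let file = TokenizedFile::from_str("/* first\nsecond */ 0");
/// // Column 0 of line 1 is the comment that began on line 0.
/// assert_eq!(
///     file.get(TokenLocation { line: 1, column: 0 }).map(|p| p.token),
///     Some(Token::BlockComment),
/// );
/// ```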
#[track_caller]
pub fn get(&self, position: TokenLocation) -> Option<&TokenPiece> {
let line = self.lines.get(position.line)?;
let column = position.column;
if column >= line.len() {
return None;
}
if let Some(spanned_line_number) = line.continued_from
&& column == 0
{
self.last_piece_in_line(spanned_line_number)
} else {
// If we have a token that continued from the previous line,
// then, relative to `self.buffer`, our `column` is actually 1-based
// and we need to shift it back to being 0-based.
let token_position =
line.local_range.start + column - if line.continued_from.is_some() { 1 } else { 0 };
self.buffer.get(token_position)
}
}
/// Returns an iterator over all contained tokens in the order they appear
/// in the original source file.
///
/// By default the iterator yields every token, including whitespace
/// and comments.
///
/// Returns the same iterator as [`TokenizedFile::into_iter`].
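///
/// ## Examples
///
/// A sketch of typical usage:
///
/// ```rust
/// let file = TokenizedFile::from_str("local int a;");
/// for (location, piece) in file.tokens().without_whitespace() {
///     println!("{}:{} {}", location.line, location.column, piece.lexeme);
/// }
/// ```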
#[must_use]
#[inline]
pub fn tokens(&'src self) -> Tokens<'src> {
Tokens::new(self)
}
}
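/// Allows iterating over a [`TokenizedFile`] directly with a `for` loop;
/// equivalent to calling [`TokenizedFile::tokens`]. A sketch:
///
/// ```rust
/// let file = TokenizedFile::from_str("0 / 0");
/// for (location, piece) in &file {
///     assert_eq!(file.get(location).map(|p| p.lexeme), Some(piece.lexeme));
/// }
/// ```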
impl<'src> IntoIterator for &'src TokenizedFile<'src> {
type Item = (TokenLocation, TokenPiece<'src>);
type IntoIter = Tokens<'src>;
#[inline]
fn into_iter(self) -> Self::IntoIter {
self.tokens()
}
}


@ -364,6 +364,30 @@ pub enum Token {
Error,
}
+ impl Token {
+ /// Returns `true` if this token is a newline (`Token::NewLine`).
+ pub fn is_newline(&self) -> bool {
+ matches!(self, Token::NewLine)
+ }
+ /// Returns `true` if this token is trivia whitespace
+ /// (`Token::Whitespace` or `Token::NewLine`).
+ ///
+ /// Note: comments are **not** considered whitespace.
+ pub fn is_whitespace(&self) -> bool {
+ matches!(&self, Token::Whitespace | Token::NewLine)
+ }
+ /// Returns `true` if this token may span multiple physical lines
+ /// (i.e. can contain newline characters).
+ pub fn can_span_lines(&self) -> bool {
+ matches!(
+ self,
+ Token::BlockComment | Token::Brace(BraceKind::CppBlock) | Token::Error
+ )
+ }
+ }
/// Consume a /* ... */ block comment with arbitrary nesting
/// (like UnrealScript allows).
///
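A quick sketch of how the new `Token` helpers behave (crate name `lexer` assumed from the module docs shown below):

```rust
use lexer::Token; // crate name assumed from the module docs

fn main() {
    // Newlines count as trivia whitespace...
    assert!(Token::NewLine.is_newline());
    assert!(Token::NewLine.is_whitespace());
    // ...but comments do not, even though block comments may span lines.
    assert!(!Token::BlockComment.is_whitespace());
    assert!(Token::BlockComment.can_span_lines());
}
```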


@ -12,30 +12,37 @@
//! precompute lengths of each token in that encoding, making interfacing
//! easier.
//!
+ //! ## Iteration over tokens
+ //!
+ //! For simplicity, the code for iterating over tokens of
+ //! [`TokenizedFile`] has been moved into a separate submodule, [`iterator`].
+ //!
//! ## Opt-in debug helpers
//!
//! Extra diagnostics become available in **debug builds** or when the crate is
- //! compiled with `debug` feature enabled. They live in the [`DebugTools`]
+ //! compiled with `debug` feature enabled. They live in the [`debug_tools`]
//! extension trait, implemented for [`TokenizedFile`].
//!
//! ```
//! // bring the trait into scope
//! use lexer::DebugTools;
//!
- //! let file = TokenizedFile::from_source(src);
+ //! let file = TokenizedFile::from_str(src);
//! file.debug_dump(); // pretty-print token layout
//! let text = file.to_source(); // reconstruct original text
//! ```
mod debug_tools;
+ mod iterator;
mod lexing;
- use std::{cmp::Ordering, ops::Range};
+ use std::ops::Range;
use logos::Logos;
#[cfg(any(debug_assertions, feature = "debug"))]
pub use debug_tools::DebugTools;
+ pub use iterator::Tokens;
pub use lexing::Token;
/// Empirically chosen starting size for token buffer (used during tokenization)
@ -45,44 +52,31 @@ const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;
/// A slice tagged with its token kind plus two length counters.
///
/// *No absolute coordinates* are stored - they are recomputed per line.
- #[derive(Debug, Clone, Copy)]
+ #[derive(Debug, Hash, Clone, Copy, PartialEq, Eq)]
pub struct TokenPiece<'src> {
- pub lexeme: &'src str,
- pub token: Token,
+ /// The token kind represented by this [`TokenPiece`].
+ pub token: Token,
+ /// Underlying text that was lexed as the corresponding token.
+ pub lexeme: &'src str,
+ /// Length of the token in UTF-16 code units, precomputed to make seeking by
+ /// LSP cursor coordinates (line + UTF-16 offset) easy.
pub length_utf16: usize,
}
/// Defines location of a token inside [`TokenizedFile`] in a way, convenient
/// for communicating through LSP.
- #[derive(Eq, Clone, Copy)]
+ #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct TokenLocation {
- line_number: usize,
- column: usize,
+ /// 0-based line number.
+ pub line: usize,
+ /// 0-based index of a token within the line; index `0` may refer to a token
+ /// that continued from the previous line.
+ pub column: usize,
}
- impl PartialEq for TokenLocation {
- fn eq(&self, other: &TokenLocation) -> bool {
- self.line_number == other.line_number && self.column == other.column
- }
- }
- impl PartialOrd for TokenLocation {
- fn partial_cmp(&self, other: &TokenLocation) -> Option<Ordering> {
- if self.line_number == other.line_number {
- self.column.partial_cmp(&other.column)
- } else {
- self.line_number.partial_cmp(&other.line_number)
- }
- }
- }
- /// Type for indexing lines in a [`TokenizedFile`].
- type LineNumber = usize;
- /// Type for specific tokens inside each [`Line`].
- type TokenIndex = usize;
/// A tokenized, lossless representation of an UnrealScript source file.
+ #[derive(Debug)]
pub struct TokenizedFile<'src> {
/// Arena of every token span in this file.
buffer: Vec<TokenPiece<'src>>,
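An aside on the `PartialOrd`/`Ord` derive above: because `line` is declared before `column`, the derived ordering is lexicographic and matches the removed hand-written `partial_cmp`. A minimal sketch (crate name `lexer` assumed from the module docs):

```rust
use lexer::TokenLocation; // crate name assumed from the module docs

fn main() {
    let earlier = TokenLocation { line: 1, column: 9 };
    let later = TokenLocation { line: 2, column: 0 };
    // Derived ordering compares `line` first; `column` only breaks ties.
    assert!(earlier < later);
    assert!(TokenLocation { line: 2, column: 1 } > later);
}
```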
@ -94,6 +88,9 @@ pub struct TokenizedFile<'src> {
}
/// Mutable state that encapsulates data needed during the tokenization loop.
+ ///
+ /// Access to stored tokens is provided through the [`iterator::Tokens`]
+ /// iterator.
struct Tokenizer<'src> {
/// Arena that owns every [`TokenPiece`] produced for the file.
buffer: Vec<TokenPiece<'src>>,
@ -101,7 +98,7 @@ struct Tokenizer<'src> {
lines: Vec<Line>,
/// The current 0-based physical line number.
line_number: usize,
- /// Index in [`Tokenizer::buffer`] where the current line starts.
+ /// Index in [`Tokenizer::buffer`] where the current *line* starts.
slice_start_index: usize,
/// When a multi-line token is being scanned, stores the 0-based line
/// on which it started; [`None`] otherwise.
@ -112,25 +109,43 @@ struct Tokenizer<'src> {
impl<'src> TokenizedFile<'src> {
/// Tokenize `source` and return a fresh [`TokenizedFile`].
- pub fn from_source(source: &'src str) -> TokenizedFile<'src> {
- let mut tokenizer = TokenizedFile::<'src>::builder();
+ ///
+ /// ## Examples
+ ///
+ /// ```rust
+ /// let source_text = "2 + 2 * 2".to_string();
+ /// let tokenized_file = TokenizedFile::from_str(&source_text);
+ /// ```
+ #[must_use]
+ pub fn from_str(source: &'src str) -> TokenizedFile<'src> {
+ let mut tokenizer = Self::builder();
let mut lexer = Token::lexer(source);
- // Logos > Ok() > token > token span <- plugged into tokenizer
while let Some(token_result) = lexer.next() {
+ // Add `Token::Error` manually, since Logos won't do it for us.
let token = token_result.unwrap_or_else(|_| {
tokenizer.had_errors = true;
Token::Error
});
- let token_span = build_span(token, lexer.slice());
- tokenizer.process_token_span(token_span);
+ let token_piece = make_token_piece(token, lexer.slice());
+ tokenizer.process_token_piece(token_piece);
}
tokenizer.into_tokenized_file()
}
/// Returns [`true`] if any erroneous tokens were produced during building
/// of this [`TokenizedFile`].
- pub fn had_errors(&self) -> bool {
+ ///
+ /// ## Examples
+ ///
+ /// ```rust
+ /// let tokenized_file = TokenizedFile::from_str("function test() {}");
+ /// if tokenized_file.has_errors() {
+ ///     eprintln!("Lexing produced error tokens");
+ /// }
+ /// ```
+ #[inline]
+ pub fn has_errors(&self) -> bool {
self.had_errors
}
@ -147,22 +162,29 @@ impl<'src> TokenizedFile<'src> {
}
}
+ /// Type for indexing lines in a [`TokenizedFile`].
+ type LineIdx = usize;
+ /// Type for specific tokens inside each [`Line`].
+ type TokenIdx = usize;
/// Representation of a single physical line of the source file.
///
/// [`Range<TokenIndex>`] are used instead of slices to avoid creating
/// a self-referential struct (with [`TokenizedFile`]), which rust forbids.
- #[derive(Clone)]
+ #[derive(Clone, Debug, Hash, PartialEq, Eq)]
struct Line {
/// Token that began on an earlier line (`None` for standalone lines).
- continued_from: Option<LineNumber>,
+ continued_from: Option<LineIdx>,
/// Contiguous tokens that started on this line (`start >= end` iff empty).
- local_range: Range<TokenIndex>,
+ local_range: Range<TokenIdx>,
}
impl Line {
/// Creates a standalone line that owns a contiguous slice in
/// the [`TokenizedFile::buffer`] arena.
- fn standalone(locals: Range<TokenIndex>) -> Line {
+ #[inline]
+ fn standalone(locals: Range<TokenIdx>) -> Line {
Line {
continued_from: None,
local_range: locals,
@ -171,7 +193,8 @@ impl Line {
/// Creates a line that is part of a multi-line token started on
/// another line, referencing the 0-based index of its origin.
- fn spanned(carried: LineNumber) -> Line {
+ #[inline]
+ fn spanned(carried: LineIdx) -> Line {
Line {
continued_from: Some(carried),
local_range: 0..0,
@ -180,7 +203,8 @@ impl Line {
/// Creates a line that is part of a multi-line token started on
/// another line and also contains additional tokens local to itself.
- fn spanned_with_tokens(carried: LineNumber, locals: Range<TokenIndex>) -> Line {
+ #[inline]
+ fn spanned_with_tokens(carried: LineIdx, locals: Range<TokenIdx>) -> Line {
Line {
continued_from: Some(carried),
local_range: locals,
@ -192,7 +216,8 @@ impl Line {
///
/// [`None`] means there is no such tokens. Otherwise range is guaranteed
/// to not be empty.
- fn local_range(&self) -> Option<Range<TokenIndex>> {
+ #[inline]
+ fn local_range(&self) -> Option<Range<TokenIdx>> {
if self.local_range.is_empty() {
None
} else {
@ -204,43 +229,45 @@ impl Line {
///
/// Counts both tokens that started on this line and tokens that continued
/// from previous one.
+ #[inline]
fn len(&self) -> usize {
- (self.continued_from.is_some() as usize) + (self.local_range.end - self.local_range.start)
+ (if self.continued_from.is_some() { 1 } else { 0 })
+ + (self.local_range.end - self.local_range.start)
}
}
impl<'src> Tokenizer<'src> {
/// Handles a token span and dispatches to the appropriate handler.
- fn process_token_span(&mut self, token_span: TokenPiece<'src>) {
- if token_can_span_lines(&token_span.token) {
- self.process_multi_line_token(token_span);
+ fn process_token_piece(&mut self, token_piece: TokenPiece<'src>) {
+ if token_piece.token.can_span_lines() {
+ self.process_multi_line_token(token_piece);
} else {
- self.process_single_line_token(token_span);
+ self.process_single_line_token(token_piece);
}
}
/// Handles tokens that never span multiple lines.
- fn process_single_line_token(&mut self, token_span: TokenPiece<'src>) {
- if token_is_newline(&token_span.token) {
+ fn process_single_line_token(&mut self, token_piece: TokenPiece<'src>) {
+ if token_piece.token.is_newline() {
self.line_number += 1;
- self.buffer.push(token_span);
+ self.buffer.push(token_piece);
self.commit_current_line();
} else {
- self.buffer.push(token_span);
+ self.buffer.push(token_piece);
}
}
/// Handles tokens that may contain one or more newline characters.
- fn process_multi_line_token(&mut self, token_span: TokenPiece<'src>) {
+ fn process_multi_line_token(&mut self, token_piece: TokenPiece<'src>) {
let start_line = self.line_number;
- let newline_count = count_newlines(token_span.lexeme);
+ let newline_count = count_line_breaks(token_piece.lexeme);
// Did this token end in a newline?
// This can happen if this is an `Error` token that ends the file.
let ends_with_newline =
- token_span.lexeme.ends_with('\n') || token_span.lexeme.ends_with('\r');
+ token_piece.lexeme.ends_with('\n') || token_piece.lexeme.ends_with('\r');
- self.buffer.push(token_span);
+ self.buffer.push(token_piece);
// We only need to commit the line if this token actually ended the line
if newline_count > 0 {
self.commit_current_line();
@ -271,8 +298,8 @@ impl<'src> Tokenizer<'src> {
let slice = self.slice_start_index..slice_end;
// If we were in the middle of a multi-line token, we
- // *always* consume `spanned_from` here, ensuring that each call to
- // `commit_current_line()` only applies it once.
+ // *always* consume `multi_line_start` here, ensuring that each call
+ // to `commit_current_line()` only applies it once.
// This guarantees no "bleed" between adjacent multi-line tokens.
if let Some(from) = self.multi_line_start.take() {
self.lines.push(Line::spanned_with_tokens(from, slice));
@ -288,8 +315,9 @@ impl<'src> Tokenizer<'src> {
fn into_tokenized_file(mut self) -> TokenizedFile<'src> {
// Commit any trailing tokens
self.commit_current_line();
- // If we still have a `spanned_from` (i.e. a pure multi-line token with
- // no local tokens on its last line), push a bare `Spanned` entry.
+ // If we still have a `multi_line_start`
+ // (i.e. a pure multi-line token with no local tokens on its last line),
+ // push a bare `Line::spanned` entry.
if let Some(from) = self.multi_line_start.take() {
self.lines.push(Line::spanned(from));
}
@ -306,7 +334,7 @@ impl<'src> Tokenizer<'src> {
}
}
- fn build_span<'src>(token: Token, text: &'src str) -> TokenPiece<'src> {
+ fn make_token_piece<'src>(token: Token, text: &'src str) -> TokenPiece<'src> {
let length_utf16 = text.encode_utf16().count();
TokenPiece {
lexeme: text,
@ -315,19 +343,8 @@ fn make_token_piece<'src>(token: Token, text: &'src str) -> TokenPiece<'src> {
}
}
- fn token_is_newline(token: &Token) -> bool {
- matches!(token, Token::NewLine)
- }
- fn token_can_span_lines(token: &Token) -> bool {
- matches!(
- token,
- Token::BlockComment | Token::Brace(lexing::BraceKind::CppBlock) | Token::Error
- )
- }
/// Counts the number of new lines in given text.
- fn count_newlines(text: &str) -> usize {
+ fn count_line_breaks(text: &str) -> usize {
let mut bytes_iterator = text.as_bytes().iter().peekable();
let mut newline_count = 0;
while let Some(&next_byte) = bytes_iterator.next() {