// rott/dev_tests/src/uc_lexer_verify.rs

use std::{collections::HashSet, fs, path::PathBuf};
use rottlib::lexer::{DebugTools, TokenizedFile};

/// Read `ignore.txt` (one path per line, `#` starts a comment) from the root
/// directory and turn it into a canonicalized [`HashSet<PathBuf>`].
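/// Example `ignore.txt` (file names are illustrative; relative paths are
/// resolved against the root directory, as the parsing below shows):
///
/// ```text
/// # Vendored sources we never lex
/// ThirdParty/Legacy.uc
/// /absolute/path/to/Broken.uc
/// ```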
fn load_ignore_set(root: &std::path::Path) -> HashSet<PathBuf> {
    let ignore_file = root.join("ignore.txt");
    if !ignore_file.exists() {
        return HashSet::new();
    }
    let content = match fs::read_to_string(&ignore_file) {
        Ok(content) => content,
        Err(error) => {
            eprintln!("Could not read {}: {error}", ignore_file.display());
            return HashSet::new();
        }
    };
    content
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty() && !line.starts_with('#'))
        .filter_map(|line| {
            let next_path = PathBuf::from(line);
            let absolute_path = if next_path.is_absolute() {
                next_path
            } else {
                root.join(next_path)
            };
            fs::canonicalize(absolute_path).ok()
        })
        .collect()
}
/// CLI: `verify_uc <root_dir>` - find all `.uc` files under the provided
/// directory (except those listed in `ignore.txt` at the root), tokenize
/// them all, and verify that each source can be reconstructed exactly from
/// its tokens.
///
/// Reported execution time covers tokenization only; time spent reading
/// files from disk is excluded.
///
/// `ignore.txt` lists specific files, not directories.
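///
/// Example session, using the `verify_uc` name above (file count and timing
/// are illustrative):
///
/// ```text
/// $ verify_uc path/to/sources
/// Loaded 1203 .uc files into memory.
/// Tokenized 1203 files in 1.87s
/// All .uc files matched successfully.
/// ```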
fn main() {
    // A debug utility may simply crash on a missing argument.
    let root_dir = std::env::args().nth(1).unwrap();
    let root = PathBuf::from(&root_dir);
    if !root.exists() {
        eprintln!("Root directory '{root_dir}' does not exist.");
        std::process::exit(1);
    }
    // Collect and decode every candidate `.uc` file
    let ignored_paths = load_ignore_set(&root);
    let mut uc_files: Vec<(PathBuf, String)> = Vec::new();
    for entry in walkdir::WalkDir::new(&root)
        .into_iter()
        .filter_map(Result::ok) // swallowing walk errors is acceptable in a debug tool
        .filter(|entry| {
            let path = entry.path();
            // Skip anything explicitly ignored
            if let Ok(absolute_path) = fs::canonicalize(path) {
                if ignored_paths.contains(&absolute_path) {
                    return false;
                }
            }
            // Must be *.uc
            path.is_file()
                && path
                    .extension()
                    .and_then(|extension| extension.to_str())
                    .is_some_and(|extension| extension.eq_ignore_ascii_case("uc"))
        })
    {
        let path = entry.path();
        match fs::read(path) {
            Ok(raw_bytes) => {
                // Autodetect the encoding: old UnrealScript sources are not
                // always UTF-8, so fall back to UTF-8 only when detection fails
                let (encoding_label, _, _) = chardet::detect(&raw_bytes);
                let encoding = encoding_rs::Encoding::for_label(encoding_label.as_bytes())
                    .unwrap_or(encoding_rs::UTF_8);
                let (decoded_text, _, _) = encoding.decode(&raw_bytes);
                uc_files.push((path.to_path_buf(), decoded_text.into_owned()));
            }
            Err(error) => {
                eprintln!("Failed to read `{}`: {error}", path.display());
                std::process::exit(1);
            }
        }
    }
    println!("Loaded {} .uc files into memory.", uc_files.len());
    // Tokenize and measure performance
    let start_time = std::time::Instant::now();
    let tokenized_files: Vec<(PathBuf, TokenizedFile)> = uc_files
        .iter()
        .map(|(path, source_code)| {
            let tokenized_file = TokenizedFile::from_source(source_code);
            if tokenized_file.had_errors() {
                // `TK:` marks files whose tokenization produced errors
                println!("TK: {}", path.display());
            }
            (path.clone(), tokenized_file)
        })
        .collect();
    let elapsed_time = start_time.elapsed();
    println!(
        "Tokenized {} files in {:.2?}",
        tokenized_files.len(),
        elapsed_time
    );
    // Roundtrip check
    for ((path, original), (_, tokenized_file)) in uc_files.iter().zip(tokenized_files.iter()) {
        let reconstructed = tokenized_file.reconstruct_source();
        if original != &reconstructed {
            eprintln!("Reconstruction mismatch in `{}`!", path.display());
            std::process::exit(1);
        }
    }
    println!("All .uc files matched successfully.");
}
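
// A minimal sketch of the two properties this tool checks, written as unit
// tests so they can run without a real source tree. The tests rely only on
// the `load_ignore_set` helper and the `TokenizedFile` API already used
// above; the UnrealScript snippet and file names are illustrative.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenization_roundtrip_preserves_source() {
        // Illustrative UnrealScript snippet; any source text should roundtrip.
        let source =
            "class Example extends Actor;\n\nfunction PostBeginPlay()\n{\n    Super.PostBeginPlay();\n}\n";
        let tokenized = TokenizedFile::from_source(source);
        // Lossless lexing: concatenating the tokens must reproduce the input.
        assert_eq!(tokenized.reconstruct_source(), source);
    }

    #[test]
    fn load_ignore_set_skips_comments_and_blanks() {
        // Hypothetical scratch layout under the OS temp directory.
        let root = std::env::temp_dir().join("uc_lexer_verify_ignore_test");
        fs::create_dir_all(&root).unwrap();
        // The entry must exist on disk, since `load_ignore_set` canonicalizes
        // paths and drops those that fail to resolve.
        fs::write(root.join("Skipped.uc"), "class Skipped;").unwrap();
        fs::write(root.join("ignore.txt"), "# a comment\n\nSkipped.uc\n").unwrap();
        let ignored = load_ignore_set(&root);
        // Only the real entry survives, canonicalized; comment and blank
        // lines are filtered out.
        assert_eq!(ignored.len(), 1);
        assert!(ignored.contains(&fs::canonicalize(root.join("Skipped.uc")).unwrap()));
    }
}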