// rott/dev_tests/src/uc_lexer_verify.rs

use std::{collections::HashSet, fs, path::PathBuf};
use rottlib::lexer::{DebugTools, TokenizedFile};

/// Read `ignore.txt` (one path per line, `#` starts a comment) from the root
/// directory and turn it into a canonicalized [`HashSet<PathBuf>`].
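/// Example `ignore.txt` (file names are illustrative; relative paths are
/// resolved against the root directory, as the parsing below shows):
///
/// ```text
/// # Vendored sources we never lex
/// ThirdParty/Legacy.uc
/// /absolute/path/to/Broken.uc
/// ```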
fn load_ignore_set(root: &std::path::Path) -> HashSet<PathBuf> {
    let ignore_file = root.join("ignore.txt");
    if !ignore_file.exists() {
        return HashSet::new();
    }
    let content = match fs::read_to_string(&ignore_file) {
        Ok(content) => content,
        Err(error) => {
            eprintln!("Could not read {}: {error}", ignore_file.display());
            return HashSet::new();
        }
    };
    content
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty() && !line.starts_with('#'))
        .filter_map(|line| {
            let next_path = PathBuf::from(line);
            let absolute_path = if next_path.is_absolute() {
                next_path
            } else {
                root.join(next_path)
            };
            fs::canonicalize(absolute_path).ok()
        })
        .collect()
}
/// CLI: `verify_uc <root_dir>` - find all `.uc` files under the provided
/// directory (except those listed in `ignore.txt` at the root), tokenize
/// them all, and verify that each source can be reconstructed exactly from
/// its tokens.
///
/// Reported execution time covers tokenization only; time spent reading
/// files from disk is excluded.
///
/// `ignore.txt` lists specific files, not directories.
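///
/// Example session, using the `verify_uc` name above (file count and timing
/// are illustrative):
///
/// ```text
/// $ verify_uc path/to/sources
/// Loaded 1203 .uc files into memory.
/// Tokenized 1203 files in 1.87s
/// All .uc files matched successfully.
/// ```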
fn main() {
    // A debug utility may simply crash on a missing argument.
    let root_dir = std::env::args().nth(1).unwrap();
    let root = PathBuf::from(&root_dir);
    if !root.exists() {
        eprintln!("Root directory '{root_dir}' does not exist.");
        std::process::exit(1);
    }
    // Collect and decode every candidate `.uc` file
    let ignored_paths = load_ignore_set(&root);
    let mut uc_files: Vec<(PathBuf, String)> = Vec::new();
    for entry in walkdir::WalkDir::new(&root)
        .into_iter()
        .filter_map(Result::ok) // swallowing walk errors is acceptable in a debug tool
        .filter(|entry| {
            let path = entry.path();
            // Skip anything explicitly ignored
            if let Ok(absolute_path) = fs::canonicalize(path) {
                if ignored_paths.contains(&absolute_path) {
                    return false;
                }
            }
            // Must be *.uc
            path.is_file()
                && path
                    .extension()
                    .and_then(|extension| extension.to_str())
                    .is_some_and(|extension| extension.eq_ignore_ascii_case("uc"))
        })
    {
        let path = entry.path();
        match fs::read(path) {
            Ok(raw_bytes) => {
                // Autodetect the encoding: old UnrealScript sources are not
                // always UTF-8, so fall back to UTF-8 only when detection fails
                let (encoding_label, _, _) = chardet::detect(&raw_bytes);
                let encoding = encoding_rs::Encoding::for_label(encoding_label.as_bytes())
                    .unwrap_or(encoding_rs::UTF_8);
                let (decoded_text, _, _) = encoding.decode(&raw_bytes);
                uc_files.push((path.to_path_buf(), decoded_text.into_owned()));
            }
            Err(error) => {
                eprintln!("Failed to read `{}`: {error}", path.display());
                std::process::exit(1);
            }
        }
    }
    println!("Loaded {} .uc files into memory.", uc_files.len());
    // Tokenize and measure performance
    let start_time = std::time::Instant::now();
    let tokenized_files: Vec<(PathBuf, TokenizedFile)> = uc_files
        .iter()
        .map(|(path, source_code)| {
            let tokenized_file = TokenizedFile::from_source(source_code);
            if tokenized_file.had_errors() {
                // `TK:` marks files whose tokenization produced errors
                println!("TK: {}", path.display());
            }
            (path.clone(), tokenized_file)
        })
        .collect();
    let elapsed_time = start_time.elapsed();
    println!(
        "Tokenized {} files in {:.2?}",
        tokenized_files.len(),
        elapsed_time
    );
    // Roundtrip check
    for ((path, original), (_, tokenized_file)) in uc_files.iter().zip(tokenized_files.iter()) {
        let reconstructed = tokenized_file.reconstruct_source();
        if original != &reconstructed {
            eprintln!("Reconstruction mismatch in `{}`!", path.display());
            std::process::exit(1);
        }
    }
    println!("All .uc files matched successfully.");
}
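
// A minimal sketch of the two properties this tool checks, written as unit
// tests so they can run without a real source tree. The tests rely only on
// the `load_ignore_set` helper and the `TokenizedFile` API already used
// above; the UnrealScript snippet and file names are illustrative.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenization_roundtrip_preserves_source() {
        // Illustrative UnrealScript snippet; any source text should roundtrip.
        let source =
            "class Example extends Actor;\n\nfunction PostBeginPlay()\n{\n    Super.PostBeginPlay();\n}\n";
        let tokenized = TokenizedFile::from_source(source);
        // Lossless lexing: concatenating the tokens must reproduce the input.
        assert_eq!(tokenized.reconstruct_source(), source);
    }

    #[test]
    fn load_ignore_set_skips_comments_and_blanks() {
        // Hypothetical scratch layout under the OS temp directory.
        let root = std::env::temp_dir().join("uc_lexer_verify_ignore_test");
        fs::create_dir_all(&root).unwrap();
        // The entry must exist on disk, since `load_ignore_set` canonicalizes
        // paths and drops those that fail to resolve.
        fs::write(root.join("Skipped.uc"), "class Skipped;").unwrap();
        fs::write(root.join("ignore.txt"), "# a comment\n\nSkipped.uc\n").unwrap();
        let ignored = load_ignore_set(&root);
        // Only the real entry survives, canonicalized; comment and blank
        // lines are filtered out.
        assert_eq!(ignored.len(), 1);
        assert!(ignored.contains(&fs::canonicalize(root.join("Skipped.uc")).unwrap()));
    }
}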