mstykow · mstykow · May 5, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md
@@ -12,7 +12,7 @@ The chart below uses a log-log scatter plot: file count on the x-axis, wall-cloc
 
 ![Scan duration vs. file count for Provenant and ScanCode](scan-duration-vs-files.svg)
 
-> Provenant is faster on 190 of 190 recorded runs, with a **12.0× median speedup** and **11.2× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.7×** on 10k+ file targets.
+> Provenant is faster on 191 of 191 recorded runs, with a **12.1× median speedup** and **11.2× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.7×** on 10k+ file targets.
 > Generated from the benchmark timing rows in this document via `cargo run --manifest-path xtask/Cargo.toml --bin generate-benchmark-chart`.
 
 ## Current benchmark examples
@@ -819,6 +819,13 @@ The quick index below links to benchmark sections. Each benchmark entry then rec
 - Timing: Provenant `290.44s`; ScanCode `5927.08s`
 - Broader Bazel and mixed-tree dependency extraction (`8202` vs `8056` packages, `1465` vs `700` dependencies) from root and vendored `MODULE.bazel`, many committed `BUILD` files, Python lockfiles, Dockerfiles, and Debian control metadata, plus direct `CITATION.cff` package visibility
 
+##### [PX4/eigen @ 7cf1c01](https://github.com/PX4/eigen/tree/7cf1c0179eb0f5499dfc1bffbd229783a7865fe1) — **19.96× faster**
+
+- Files: 1,672
+- Run context: 2026-05-04 · eigen-62479 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc
+- Timing: Provenant `16.12s`; ScanCode `321.68s`
+- Cleaner copyright, holder, and author recovery on this manifest-free native source tree, with structured `Copyright Notice (...)` extraction, normalized `Author / Project / Copyright` header splitting, rejection of `.krazy` control-file and disclaimer-list junk, and Unicode-preserving party normalization
+
 ##### [ValveSoftware/eigen @ e9c4315](https://github.com/ValveSoftware/eigen/tree/e9c43151265207fd3366bba21cddd61141ff402c) — **19.84× faster**
 
 - Files: 1,784

diff --git a/docs/scan-duration-vs-files.svg b/docs/scan-duration-vs-files.svg
diff --git a/src/copyright/detector/author_heuristics/extraction.rs b/src/copyright/detector/author_heuristics/extraction.rs
@@ -1282,7 +1282,9 @@ pub(in super::super) fn extract_author_colon_blocks(
             }
         }
         let combined_raw = segments.join(" ");
-        let Some(combined) = refine_author_with_optional_handle_suffix(&combined_raw) else {
+        let Some(combined) = refine_author_with_optional_handle_suffix(&combined_raw)
+            .or_else(|| refine_explicit_author_label_roster(&combined_raw))
+        else {
             line_number = line_number.next();
             continue;
         };
@@ -1363,6 +1365,50 @@ fn sanitize_author_colon_tail(tail: &str) -> Option<String> {
     Some(trimmed.to_string())
 }
 
+/// Accepts a comma-separated author roster taken from an explicit author label.
+///
+/// The candidate is trimmed, passed through `normalize_whitespace`, and split on
+/// commas (blank entries are ignored). The normalized string is returned only when:
+/// - at least two entries remain,
+/// - no entry is the placeholder `package author` / `package authors`
+///   (ASCII case-insensitive), and
+/// - each of the first two entries is either a multi-word entry in which every
+///   word contains a letter, or a single word whose letters are all ASCII
+///   uppercase (for example an acronym such as `EDF`).
+///
+/// Returns `None` when any of these checks fails.
+fn refine_explicit_author_label_roster(candidate: &str) -> Option<String> {
+    const PLACEHOLDERS: [&str; 2] = ["package author", "package authors"];
+
+    /// Shape check applied to the leading roster entries.
+    fn looks_like_roster_entry(entry: &str) -> bool {
+        let mut words = entry.split_whitespace();
+        match (words.next(), words.next()) {
+            (None, _) => false,
+            // Single word: reject as soon as a letter is not ASCII uppercase.
+            (Some(_), None) => !entry
+                .chars()
+                .any(|ch| ch.is_alphabetic() && !ch.is_ascii_uppercase()),
+            // Several words: each one needs at least one letter.
+            (Some(first), Some(second)) => [first, second]
+                .into_iter()
+                .chain(words)
+                .all(|word| word.chars().any(char::is_alphabetic)),
+        }
+    }
+
+    let normalized = normalize_whitespace(candidate.trim());
+
+    // A candidate without any comma yields at most one entry and is rejected here.
+    let entries: Vec<&str> = normalized
+        .split(',')
+        .map(str::trim)
+        .filter(|entry| !entry.is_empty())
+        .collect();
+    if entries.len() < 2 {
+        return None;
+    }
+
+    let mentions_placeholder = entries.iter().any(|entry| {
+        PLACEHOLDERS
+            .iter()
+            .any(|placeholder| entry.eq_ignore_ascii_case(placeholder))
+    });
+    if mentions_placeholder {
+        return None;
+    }
+
+    let leading_entries_ok = entries
+        .iter()
+        .take(2)
+        .all(|entry| looks_like_roster_entry(entry));
+
+    if leading_entries_ok {
+        Some(normalized)
+    } else {
+        None
+    }
+}
+
 fn is_author_metadata_line(line: &str) -> bool {
     let lower = line.trim().to_ascii_lowercase();
     lower.starts_with("url:")

diff --git a/src/copyright/detector/author_heuristics_test.rs b/src/copyright/detector/author_heuristics_test.rs
@@ -158,6 +158,46 @@ fn test_detect_multiline_comment_authors_block_after_year_only_copyright() {
     );
 }
 
+/// A single `Author :` line whose roster ends with a bare organization name must
+/// be reported as one author entry.
+#[test]
+fn test_detect_explicit_author_label_roster_with_company_suffix() {
+    let expected_author = "Antoine YESSAYAN, Paul RASCLE, EDF";
+    let source = "// Author    : Antoine YESSAYAN, Paul RASCLE, EDF\n";
+
+    let (_, _, authors) = super::super::detect_copyrights_from_text(source);
+    let roster_detected = authors
+        .iter()
+        .any(|detected| detected.author == expected_author);
+
+    assert!(roster_detected, "authors: {authors:?}");
+}
+
+/// An `Author / Project / Copyright` comment header must yield the author roster,
+/// a `Copyright EDF 2001` statement, and `EDF` as the holder.
+#[test]
+fn test_split_author_project_copyright_metadata_block() {
+    let source = concat!(
+        "// Author    : Antoine YESSAYAN, Paul RASCLE, EDF\n",
+        "// Project   : SALOME\n",
+        "// Copyright : EDF 2001\n",
+    );
+
+    let (copyrights, holders, authors) = super::super::detect_copyrights_from_text(source);
+
+    let has_author = authors
+        .iter()
+        .any(|detected| detected.author == "Antoine YESSAYAN, Paul RASCLE, EDF");
+    let has_copyright = copyrights
+        .iter()
+        .any(|detected| detected.copyright == "Copyright EDF 2001");
+    let has_holder = holders.iter().any(|detected| detected.holder == "EDF");
+
+    assert!(has_author, "authors: {authors:?}");
+    assert!(has_copyright, "copyrights: {copyrights:?}");
+    assert!(has_holder, "holders: {holders:?}");
+}
+
 #[test]
 fn test_extract_collective_author_with_contributors_before_email() {
     let input = "authors = [\"Tokio Contributors <team@tokio.rs>\"]\n";

diff --git a/src/copyright/detector/mod.rs b/src/copyright/detector/mod.rs
@@ -357,6 +357,11 @@ pub fn detect_copyrights_from_text_with_deadline(
     postprocess_transforms::drop_shadowed_bare_c_from_year_fragments(&mut copyrights, &mut holders);
     drop_path_fragment_holders_from_bare_c_code_lines(&raw_lines, &copyrights, &mut holders);
     drop_scan_only_holders_from_copyright_scan_lines(&raw_lines, &copyrights, &mut holders);
+    drop_test_label_false_positive_copyrights_and_holders(
+        &raw_lines,
+        &mut copyrights,
+        &mut holders,
+    );
 
     for group in &groups {
         extend_dash_obfuscated_email_suffixes(&raw_lines, group, &mut copyrights[..], &holders[..]);
@@ -407,6 +412,7 @@ pub(super) use token_utils::collect_all_leaves;
 use token_utils::{
     apply_written_by_for_markers, drop_path_fragment_holders_from_bare_c_code_lines,
     drop_scan_only_holders_from_copyright_scan_lines,
+    drop_test_label_false_positive_copyrights_and_holders,
     extract_original_author_additional_contributors,
 };
 use tree_walk::{

diff --git a/src/copyright/detector/pattern_extract/extraction/content.rs b/src/copyright/detector/pattern_extract/extraction/content.rs
@@ -77,6 +77,78 @@ pub fn extract_spdx_filecopyrighttext_c_without_year(
     (copyrights, holders)
 }
 
+/// Detects year-less `Copyright (c) ...` notices stored in byte-string literals.
+///
+/// Every line of `content` is checked with `extract_bytestring_copyright_literal`.
+/// Empty literals and literals containing a four-digit 19xx/20xx year are skipped.
+/// For the remaining literals the text goes through `prepare_text_line`, then:
+/// - `refine_copyright` decides whether a copyright detection is emitted, and
+/// - `refine_holder` is applied to the text left after the leading `Copyright`
+///   and `(c)` markers to decide whether a holder detection is emitted.
+///
+/// A holder is emitted at most once per `(holder, line)` pair, counting the pairs
+/// already present in `existing_holders`. All detections are single-line and use
+/// 1-based line numbers.
+pub fn extract_bytestring_copyright_c_without_year(
+    content: &str,
+    existing_holders: &[HolderDetection],
+) -> (Vec<CopyrightDetection>, Vec<HolderDetection>) {
+    static YEAR_RE: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r"\b(?:19\d{2}|20\d{2})\b").unwrap());
+
+    /// Strips `prefix` from the start of `text`, ignoring ASCII case.
+    fn strip_prefix_ignore_ascii_case<'a>(text: &'a str, prefix: &str) -> Option<&'a str> {
+        let (head, tail) = text.split_at_checked(prefix.len())?;
+        head.eq_ignore_ascii_case(prefix).then_some(tail)
+    }
+
+    let mut copyrights = Vec::new();
+    let mut holders = Vec::new();
+
+    let mut seen_h: HashSet<(String, usize)> = existing_holders
+        .iter()
+        .map(|h| (h.holder.clone(), h.start_line.get()))
+        .collect();
+
+    for (idx, line) in content.lines().enumerate() {
+        let ln = idx + 1;
+        let Some(raw) = extract_bytestring_copyright_literal(line) else {
+            continue;
+        };
+        if raw.is_empty() || YEAR_RE.is_match(&raw) {
+            continue;
+        }
+
+        // `ln` is `idx + 1`, so it is never zero.
+        let line_number = LineNumber::new(ln).expect("1-based line number is never zero");
+
+        let prepared = crate::copyright::prepare_text_line(&raw);
+        if let Some(refined) = refine_copyright(&prepared) {
+            copyrights.push(CopyrightDetection {
+                copyright: refined,
+                start_line: line_number,
+                end_line: line_number,
+            });
+        }
+
+        // Peel the markers one after the other. Each step falls back to what the
+        // previous step produced, so a missing or differently-cased `(c)` no longer
+        // brings the `Copyright` label back into the holder text. The literal was
+        // accepted case-insensitively, so the markers are stripped the same way.
+        let after_label = strip_prefix_ignore_ascii_case(&prepared, "Copyright")
+            .unwrap_or(prepared.as_str())
+            .trim();
+        let tail = strip_prefix_ignore_ascii_case(after_label, "(c)")
+            .unwrap_or(after_label)
+            .trim();
+        if let Some(holder) = refine_holder(tail)
+            && seen_h.insert((holder.clone(), ln))
+        {
+            holders.push(HolderDetection {
+                holder,
+                start_line: line_number,
+                end_line: line_number,
+            });
+        }
+    }
+
+    (copyrights, holders)
+}
+
+/// Returns the text of a byte-string literal on `line` that starts with
+/// `Copyright (c)` (ASCII case-insensitive), trimmed of surrounding whitespace.
+///
+/// The literal prefixes are tried in the order `br'`, `rb'`, `b'`, `br"`, `rb"`,
+/// `b"`. For each prefix every occurrence on the line is inspected, so an
+/// unrelated byte literal earlier on the line does not hide a later notice.
+/// The literal ends at the next matching quote; escaped quotes are not
+/// interpreted. Returns `None` when no such literal is found.
+fn extract_bytestring_copyright_literal(line: &str) -> Option<String> {
+    const MARKER: &str = "copyright (c)";
+
+    for prefix in ["br'", "rb'", "b'", "br\"", "rb\"", "b\""] {
+        let quote = prefix.chars().last()?;
+        for (start, _) in line.match_indices(prefix) {
+            let rest = line.get(start + prefix.len()..)?;
+            let Some(end) = rest.find(quote) else {
+                // No closing quote after this occurrence means there is none
+                // after any later occurrence of the same prefix either.
+                break;
+            };
+            let candidate = rest[..end].trim();
+            // `get` returns `None` for short text or a non-boundary cut, both of
+            // which mean the candidate cannot start with the ASCII marker.
+            let is_notice = candidate
+                .get(..MARKER.len())
+                .is_some_and(|head| head.eq_ignore_ascii_case(MARKER));
+            if is_notice {
+                return Some(candidate.to_string());
+            }
+        }
+    }
+
+    None
+}
+
 pub fn extract_html_meta_name_copyright_content(
     content: &str,
     existing_holders: &[HolderDetection],

diff --git a/src/copyright/detector/phases/postprocess.rs b/src/copyright/detector/phases/postprocess.rs
@@ -378,6 +378,16 @@ fn run_mid_pipeline_repairs(
     seen.dedup_new_holders(holders, h_before);
     seen.dedup_new_authors(authors, a_before);
 
+    let c_before = copyrights.len();
+    let h_before = holders.len();
+    let a_before = authors.len();
+    super::postprocess_transforms::split_author_project_copyright_metadata_blocks(
+        copyrights, holders, authors,
+    );
+    seen.dedup_new_copyrights(copyrights, c_before);
+    seen.dedup_new_holders(holders, h_before);
+    seen.dedup_new_authors(authors, a_before);
+
     super::postprocess_transforms::drop_static_char_string_copyrights(content, copyrights, holders);
     super::postprocess_transforms::drop_combined_period_holders(holders);
     super::pattern_extract::drop_shadowed_prefix_holders(holders);

diff --git a/src/copyright/detector/phases/primary.rs b/src/copyright/detector/phases/primary.rs
@@ -421,6 +421,15 @@ fn run_content_and_markup_extractions(
     copyrights.extend(new_c);
     holders.extend(new_h);
 
+    let (mut new_c, new_h) =
+        super::super::pattern_extract::extract_bytestring_copyright_c_without_year(
+            content, holders,
+        );
+    seen.dedup_new_copyrights(&mut new_c, 0);
+    seen.register_holders(&new_h);
+    copyrights.extend(new_c);
+    holders.extend(new_h);
+
     let (mut new_c, new_h) =
         super::super::pattern_extract::extract_html_meta_name_copyright_content(content, holders);
     seen.dedup_new_copyrights(&mut new_c, 0);

diff --git a/src/copyright/detector/postprocess_transforms/author_repairs.rs b/src/copyright/detector/postprocess_transforms/author_repairs.rs
@@ -330,3 +330,74 @@ pub fn split_written_by_copyrights_into_holder_prefixed_clauses(
     holders.retain(|h| h.holder != "Julian Cowley");
     authors.retain(|a| a.author != "Linus Torvalds" && a.author != "Theodore Ts'o");
 }
+
+/// Splits copyright texts that swallowed a whole `Author / Project / Copyright`
+/// metadata header back into separate author, copyright, and holder detections.
+///
+/// A copyright entry is handled when its entire text matches (case-insensitively)
+/// `Author <author> [Project <project>] Copyright <holder> <4-digit year>`.
+/// For each such entry:
+/// - an author detection is added on the entry's start line, unless one with the
+///   same text and start line already exists;
+/// - the copyright text is replaced by the refined `Copyright <holder> <year>`,
+///   when `refine_copyright` accepts it;
+/// - if the holder text can be refined, holders spanning the same lines whose
+///   text contains the raw author text are removed, and the refined holder is
+///   added unless an identical one already exists.
+///
+/// `copyrights` is a slice: entries are rewritten in place, never added or
+/// removed. The optional `project` capture is matched but not used.
+pub fn split_author_project_copyright_metadata_blocks(
+    copyrights: &mut [CopyrightDetection],
+    holders: &mut Vec<HolderDetection>,
+    authors: &mut Vec<AuthorDetection>,
+) {
+    static AUTHOR_PROJECT_COPY_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r"(?ix)
+            ^Author\s+(?P<author>.+?)
+            (?:\s+Project\s+(?P<project>.+?))?
+            \s+Copyright\s+(?P<holder>.+?)\s+(?P<year>\d{4})
+            $",
+        )
+        .unwrap()
+    });
+
+    for copyright in copyrights.iter_mut() {
+        // Work on a copy so the captures do not borrow the field rewritten below.
+        let current = copyright.copyright.clone();
+        let Some(cap) = AUTHOR_PROJECT_COPY_RE.captures(current.as_str()) else {
+            continue;
+        };
+
+        let author_raw = cap.name("author").map(|m| m.as_str()).unwrap_or("").trim();
+        let holder_raw = cap.name("holder").map(|m| m.as_str()).unwrap_or("").trim();
+        let year = cap.name("year").map(|m| m.as_str()).unwrap_or("").trim();
+        if author_raw.is_empty() || holder_raw.is_empty() || year.is_empty() {
+            continue;
+        }
+
+        // Fall back to the whitespace-normalized raw text when the refiner rejects it.
+        let author = crate::copyright::refiner::refine_author(author_raw)
+            .unwrap_or_else(|| normalize_whitespace(author_raw));
+        if !author.is_empty()
+            && !authors.iter().any(|existing| {
+                existing.author == author && existing.start_line == copyright.start_line
+            })
+        {
+            // The author detection is recorded as a single line at the entry's start line.
+            authors.push(AuthorDetection {
+                author,
+                start_line: copyright.start_line,
+                end_line: copyright.start_line,
+            });
+        }
+
+        // Keep the original text when the rebuilt statement is not accepted.
+        if let Some(refined) = refine_copyright(&format!("Copyright {holder_raw} {year}")) {
+            copyright.copyright = refined;
+        }
+
+        // Without a usable holder the existing holder list is left untouched.
+        let Some(refined_holder) = refine_holder_in_copyright_context(holder_raw) else {
+            continue;
+        };
+
+        // Drop holders on the same line span whose text still contains the raw author text.
+        holders.retain(|holder| {
+            !(holder.start_line == copyright.start_line
+                && holder.end_line == copyright.end_line
+                && holder.holder.contains(author_raw))
+        });
+
+        if !holders.iter().any(|holder| {
+            holder.holder == refined_holder
+                && holder.start_line == copyright.start_line
+                && holder.end_line == copyright.end_line
+        }) {
+            holders.push(HolderDetection {
+                holder: refined_holder,
+                start_line: copyright.start_line,
+                end_line: copyright.end_line,
+            });
+        }
+    }
+}