diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md
index 452c11401..ab3bd93a2 100644
--- a/docs/BENCHMARKS.md
+++ b/docs/BENCHMARKS.md
@@ -12,7 +12,7 @@ The chart below uses a log-log scatter plot: file count on the x-axis, wall-cloc

-> Provenant is faster on 190 of 190 recorded runs, with a **12.0× median speedup** and **11.2× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.7×** on 10k+ file targets.
+> Provenant is faster on 191 of 191 recorded runs, with a **12.1× median speedup** and **11.2× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.7×** on 10k+ file targets.
> Generated from the benchmark timing rows in this document via `cargo run --manifest-path xtask/Cargo.toml --bin generate-benchmark-chart`.
## Current benchmark examples
@@ -819,6 +819,13 @@ The quick index below links to benchmark sections. Each benchmark entry then rec
- Timing: Provenant `290.44s`; ScanCode `5927.08s`
- Broader Bazel and mixed-tree dependency extraction (`8202` vs `8056` packages, `1465` vs `700` dependencies) from root and vendored `MODULE.bazel`, many committed `BUILD` files, Python lockfiles, Dockerfiles, and Debian control metadata, plus direct `CITATION.cff` package visibility
+##### [PX4/eigen @ 7cf1c01](https://github.com/PX4/eigen/tree/7cf1c0179eb0f5499dfc1bffbd229783a7865fe1) — **19.96× faster**
+
+- Files: 1,672
+- Run context: 2026-05-04 · eigen-62479 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc
+- Timing: Provenant `16.12s`; ScanCode `321.68s`
+- Cleaner copyright, holder, and author recovery on this manifest-free native source tree, with structured `Copyright Notice (...)` extraction, normalized `Author / Project / Copyright` header splitting, rejection of `.krazy` control-file and disclaimer-list junk, and Unicode-preserving party normalization
+
##### [ValveSoftware/eigen @ e9c4315](https://github.com/ValveSoftware/eigen/tree/e9c43151265207fd3366bba21cddd61141ff402c) — **19.84× faster**
- Files: 1,784
diff --git a/docs/scan-duration-vs-files.svg b/docs/scan-duration-vs-files.svg
index fda55a43f..a66d00bf1 100644
--- a/docs/scan-duration-vs-files.svg
+++ b/docs/scan-duration-vs-files.svg
@@ -371,6 +371,9 @@ ScanCode: 272.67s
aosp-mirror/platform_build @ 045a3d6
Files: 1515
ScanCode: 240.24s
+ PX4/eigen @ 7cf1c01
+Files: 1672
+ScanCode: 321.68s
guillemj/dpkg @ 0061122
Files: 1766
ScanCode: 563.43s
@@ -943,6 +946,9 @@ Provenant: 24.48s
aosp-mirror/platform_build @ 045a3d6
Files: 1515
Provenant: 25.23s
+ PX4/eigen @ 7cf1c01
+Files: 1672
+Provenant: 16.12s
guillemj/dpkg @ 0061122
Files: 1766
Provenant: 27.87s
diff --git a/src/copyright/detector/author_heuristics/extraction.rs b/src/copyright/detector/author_heuristics/extraction.rs
index 756222a34..18308f4b3 100644
--- a/src/copyright/detector/author_heuristics/extraction.rs
+++ b/src/copyright/detector/author_heuristics/extraction.rs
@@ -1282,7 +1282,9 @@ pub(in super::super) fn extract_author_colon_blocks(
}
}
let combined_raw = segments.join(" ");
- let Some(combined) = refine_author_with_optional_handle_suffix(&combined_raw) else {
+ let Some(combined) = refine_author_with_optional_handle_suffix(&combined_raw)
+ .or_else(|| refine_explicit_author_label_roster(&combined_raw))
+ else {
line_number = line_number.next();
continue;
};
@@ -1363,6 +1365,50 @@ fn sanitize_author_colon_tail(tail: &str) -> Option {
Some(trimmed.to_string())
}
+/// Accepts a comma-separated author roster taken from an explicit author label.
+///
+/// Returns the candidate (after `normalize_whitespace`) unchanged when it splits on `,`
+/// into at least two non-empty parts, no part equals `package author` /
+/// `package authors` (ASCII case-insensitive), and each of the first two parts passes
+/// the roster check below. Returns `None` in every other case.
+fn refine_explicit_author_label_roster(candidate: &str) -> Option {
+ let trimmed = normalize_whitespace(candidate.trim());
+ if !trimmed.contains(',') {
+ return None;
+ }
+
+ let parts: Vec<&str> = trimmed
+ .split(',')
+ .map(str::trim)
+ .filter(|part| !part.is_empty())
+ .collect();
+ if parts.len() < 2 {
+ return None;
+ }
+
+ // Reject rosters that contain one of the literal placeholder labels.
+ let has_placeholder = parts.iter().any(|part| {
+ part.eq_ignore_ascii_case("package author") || part.eq_ignore_ascii_case("package authors")
+ });
+ if has_placeholder {
+ return None;
+ }
+
+ // Only the first two parts are inspected; later parts are accepted as-is.
+ let first_two_rosterish = parts.iter().take(2).all(|part| {
+ let words: Vec<&str> = part.split_whitespace().collect();
+ if words.is_empty() {
+ return false;
+ }
+
+ // Multi-word part: every word must contain at least one alphabetic character.
+ if words.len() >= 2 {
+ return words
+ .iter()
+ .all(|word| word.chars().any(|ch| ch.is_alphabetic()));
+ }
+
+ // Single-word part: every alphabetic character must be ASCII uppercase.
+ part.chars()
+ .all(|ch| !ch.is_alphabetic() || ch.is_ascii_uppercase())
+ });
+ if !first_two_rosterish {
+ return None;
+ }
+
+ Some(trimmed)
+}
+
fn is_author_metadata_line(line: &str) -> bool {
let lower = line.trim().to_ascii_lowercase();
lower.starts_with("url:")
diff --git a/src/copyright/detector/author_heuristics_test.rs b/src/copyright/detector/author_heuristics_test.rs
index a9a510d48..807b1e803 100644
--- a/src/copyright/detector/author_heuristics_test.rs
+++ b/src/copyright/detector/author_heuristics_test.rs
@@ -158,6 +158,46 @@ fn test_detect_multiline_comment_authors_block_after_year_only_copyright() {
);
}
+// A single `// Author :` line whose value is a comma-separated roster ending in a
+// short all-caps token must be reported as one author string.
+#[test]
+fn test_detect_explicit_author_label_roster_with_company_suffix() {
+ let input = "// Author : Antoine YESSAYAN, Paul RASCLE, EDF\n";
+ let (_copyrights, _holders, authors) = super::super::detect_copyrights_from_text(input);
+
+ assert!(
+ authors
+ .iter()
+ .any(|author| author.author == "Antoine YESSAYAN, Paul RASCLE, EDF"),
+ "authors: {authors:?}"
+ );
+}
+
+// An `Author` / `Project` / `Copyright` comment header must yield the roster as the
+// author, `Copyright EDF 2001` as the copyright, and `EDF` as the holder.
+#[test]
+fn test_split_author_project_copyright_metadata_block() {
+ let input = concat!(
+ "// Author : Antoine YESSAYAN, Paul RASCLE, EDF\n",
+ "// Project : SALOME\n",
+ "// Copyright : EDF 2001\n",
+ );
+ let (copyrights, holders, authors) = super::super::detect_copyrights_from_text(input);
+
+ assert!(
+ authors
+ .iter()
+ .any(|author| author.author == "Antoine YESSAYAN, Paul RASCLE, EDF"),
+ "authors: {authors:?}"
+ );
+ assert!(
+ copyrights
+ .iter()
+ .any(|copyright| copyright.copyright == "Copyright EDF 2001"),
+ "copyrights: {copyrights:?}"
+ );
+ assert!(
+ holders.iter().any(|holder| holder.holder == "EDF"),
+ "holders: {holders:?}"
+ );
+}
+
#[test]
fn test_extract_collective_author_with_contributors_before_email() {
let input = "authors = [\"Tokio Contributors \"]\n";
diff --git a/src/copyright/detector/mod.rs b/src/copyright/detector/mod.rs
index d2bf1f9f0..a11048da7 100644
--- a/src/copyright/detector/mod.rs
+++ b/src/copyright/detector/mod.rs
@@ -357,6 +357,11 @@ pub fn detect_copyrights_from_text_with_deadline(
postprocess_transforms::drop_shadowed_bare_c_from_year_fragments(&mut copyrights, &mut holders);
drop_path_fragment_holders_from_bare_c_code_lines(&raw_lines, ©rights, &mut holders);
drop_scan_only_holders_from_copyright_scan_lines(&raw_lines, ©rights, &mut holders);
+ drop_test_label_false_positive_copyrights_and_holders(
+ &raw_lines,
+ &mut copyrights,
+ &mut holders,
+ );
for group in &groups {
extend_dash_obfuscated_email_suffixes(&raw_lines, group, &mut copyrights[..], &holders[..]);
@@ -407,6 +412,7 @@ pub(super) use token_utils::collect_all_leaves;
use token_utils::{
apply_written_by_for_markers, drop_path_fragment_holders_from_bare_c_code_lines,
drop_scan_only_holders_from_copyright_scan_lines,
+ drop_test_label_false_positive_copyrights_and_holders,
extract_original_author_additional_contributors,
};
use tree_walk::{
diff --git a/src/copyright/detector/pattern_extract/extraction/content.rs b/src/copyright/detector/pattern_extract/extraction/content.rs
index a056afca5..0621a2cef 100644
--- a/src/copyright/detector/pattern_extract/extraction/content.rs
+++ b/src/copyright/detector/pattern_extract/extraction/content.rs
@@ -77,6 +77,78 @@ pub fn extract_spdx_filecopyrighttext_c_without_year(
(copyrights, holders)
}
+/// Extracts year-less `Copyright (c) <holder>` notices embedded in bytes-string literals.
+///
+/// Every line of `content` is passed to `extract_bytestring_copyright_literal`; literals
+/// that contain a 19xx/20xx year are skipped. For each remaining literal the prepared
+/// text is refined into a copyright detection, and the text after the `Copyright` /
+/// `(c)` markers is refined into a holder detection.
+///
+/// `existing_holders` seeds the `(holder, line)` de-duplication set, so a holder that
+/// was already reported on the same line is not returned again.
+///
+/// Returns the newly found `(copyrights, holders)`; line numbers are 1-based.
+pub fn extract_bytestring_copyright_c_without_year(
+    content: &str,
+    existing_holders: &[HolderDetection],
+) -> (Vec<CopyrightDetection>, Vec<HolderDetection>) {
+    static YEAR_RE: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r"\b(?:19\d{2}|20\d{2})\b").unwrap());
+
+    let mut copyrights = Vec::new();
+    let mut holders = Vec::new();
+
+    let mut seen_h: HashSet<(String, usize)> = existing_holders
+        .iter()
+        .map(|h| (h.holder.clone(), h.start_line.get()))
+        .collect();
+
+    for (idx, line) in content.lines().enumerate() {
+        let ln = idx + 1;
+        let Some(raw) = extract_bytestring_copyright_literal(line) else {
+            continue;
+        };
+        if raw.is_empty() || YEAR_RE.is_match(&raw) {
+            continue;
+        }
+
+        let prepared = crate::copyright::prepare_text_line(&raw);
+        if let Some(refined) = refine_copyright(&prepared) {
+            copyrights.push(CopyrightDetection {
+                copyright: refined,
+                start_line: LineNumber::new(ln).unwrap(),
+                end_line: LineNumber::new(ln).unwrap(),
+            });
+        }
+
+        // Strip the two markers independently. When `(c)` is absent the fallback must be
+        // the text that already had `Copyright` removed, not the whole prepared line.
+        let after_label = prepared
+            .strip_prefix("Copyright")
+            .unwrap_or(prepared.as_str())
+            .trim();
+        let tail = after_label
+            .strip_prefix("(c)")
+            .unwrap_or(after_label)
+            .trim();
+        if let Some(holder) = refine_holder(tail)
+            && seen_h.insert((holder.clone(), ln))
+        {
+            holders.push(HolderDetection {
+                holder,
+                start_line: LineNumber::new(ln).unwrap(),
+                end_line: LineNumber::new(ln).unwrap(),
+            });
+        }
+    }
+
+    (copyrights, holders)
+}
+
+/// Returns the trimmed contents of the first bytes-string literal on `line` whose text
+/// starts with `copyright (c)` (ASCII case-insensitive).
+///
+/// Recognized literal openers are `br'`, `rb'`, `b'`, `br"`, `rb"` and `b"`, tried in
+/// that order. Every occurrence of an opener is inspected, so an unrelated literal
+/// earlier on the line does not hide a later copyright literal. A literal without a
+/// closing quote is ignored. Returns `None` when no literal qualifies.
+fn extract_bytestring_copyright_literal(line: &str) -> Option<String> {
+    const MARKER: &str = "copyright (c)";
+
+    for prefix in ["br'", "rb'", "b'", "br\"", "rb\"", "b\""] {
+        let quote = prefix.chars().last()?;
+        for (start, _) in line.match_indices(prefix) {
+            let rest = &line[start + prefix.len()..];
+            // No closing quote here means no later opener either (openers contain the quote).
+            let Some(end) = rest.find(quote) else {
+                break;
+            };
+            let candidate = rest[..end].trim();
+            // `get` also rejects a cut inside a multi-byte character.
+            let is_notice = candidate
+                .get(..MARKER.len())
+                .is_some_and(|head| head.eq_ignore_ascii_case(MARKER));
+            if is_notice {
+                return Some(candidate.to_string());
+            }
+        }
+    }
+
+    None
+}
pub fn extract_html_meta_name_copyright_content(
content: &str,
existing_holders: &[HolderDetection],
diff --git a/src/copyright/detector/phases/postprocess.rs b/src/copyright/detector/phases/postprocess.rs
index a07785d5b..c8810b4e6 100644
--- a/src/copyright/detector/phases/postprocess.rs
+++ b/src/copyright/detector/phases/postprocess.rs
@@ -378,6 +378,16 @@ fn run_mid_pipeline_repairs(
seen.dedup_new_holders(holders, h_before);
seen.dedup_new_authors(authors, a_before);
+ let c_before = copyrights.len();
+ let h_before = holders.len();
+ let a_before = authors.len();
+ super::postprocess_transforms::split_author_project_copyright_metadata_blocks(
+ copyrights, holders, authors,
+ );
+ seen.dedup_new_copyrights(copyrights, c_before);
+ seen.dedup_new_holders(holders, h_before);
+ seen.dedup_new_authors(authors, a_before);
+
super::postprocess_transforms::drop_static_char_string_copyrights(content, copyrights, holders);
super::postprocess_transforms::drop_combined_period_holders(holders);
super::pattern_extract::drop_shadowed_prefix_holders(holders);
diff --git a/src/copyright/detector/phases/primary.rs b/src/copyright/detector/phases/primary.rs
index 09e189120..9df41f7d7 100644
--- a/src/copyright/detector/phases/primary.rs
+++ b/src/copyright/detector/phases/primary.rs
@@ -421,6 +421,15 @@ fn run_content_and_markup_extractions(
copyrights.extend(new_c);
holders.extend(new_h);
+ let (mut new_c, new_h) =
+ super::super::pattern_extract::extract_bytestring_copyright_c_without_year(
+ content, holders,
+ );
+ seen.dedup_new_copyrights(&mut new_c, 0);
+ seen.register_holders(&new_h);
+ copyrights.extend(new_c);
+ holders.extend(new_h);
+
let (mut new_c, new_h) =
super::super::pattern_extract::extract_html_meta_name_copyright_content(content, holders);
seen.dedup_new_copyrights(&mut new_c, 0);
diff --git a/src/copyright/detector/postprocess_transforms/author_repairs.rs b/src/copyright/detector/postprocess_transforms/author_repairs.rs
index 8e6f0967f..520597ffa 100644
--- a/src/copyright/detector/postprocess_transforms/author_repairs.rs
+++ b/src/copyright/detector/postprocess_transforms/author_repairs.rs
@@ -330,3 +330,74 @@ pub fn split_written_by_copyrights_into_holder_prefixed_clauses(
holders.retain(|h| h.holder != "Julian Cowley");
authors.retain(|a| a.author != "Linus Torvalds" && a.author != "Theodore Ts'o");
}
+
+/// Splits copyright strings that swallowed an `Author … [Project …] Copyright … <year>` run.
+///
+/// For every copyright whose text matches the pattern below, with non-empty `author`,
+/// `holder` and `year` captures, this:
+/// - pushes the refined author (or the whitespace-normalized raw capture when refinement
+///   returns `None`) onto `authors`, unless an equal author already starts on that line;
+/// - replaces the copyright text with the refined form of `Copyright <holder> <year>`
+///   when that refinement succeeds;
+/// - when the holder can be refined, removes holders on the same line span whose text
+///   contains the raw author capture, then adds the refined holder if it is missing.
+pub fn split_author_project_copyright_metadata_blocks(
+ copyrights: &mut [CopyrightDetection],
+ holders: &mut Vec,
+ authors: &mut Vec,
+) {
+ static AUTHOR_PROJECT_COPY_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(
+ r"(?ix)
+ ^Author\s+(?P.+?)
+ (?:\s+Project\s+(?P.+?))?
+ \s+Copyright\s+(?P.+?)\s+(?P\d{4})
+ $",
+ )
+ .unwrap()
+ });
+
+ for copyright in copyrights.iter_mut() {
+ // Work on a copy so the captures do not borrow the field that is rewritten below.
+ let current = copyright.copyright.clone();
+ let Some(cap) = AUTHOR_PROJECT_COPY_RE.captures(current.as_str()) else {
+ continue;
+ };
+
+ let author_raw = cap.name("author").map(|m| m.as_str()).unwrap_or("").trim();
+ let holder_raw = cap.name("holder").map(|m| m.as_str()).unwrap_or("").trim();
+ let year = cap.name("year").map(|m| m.as_str()).unwrap_or("").trim();
+ if author_raw.is_empty() || holder_raw.is_empty() || year.is_empty() {
+ continue;
+ }
+
+ let author = crate::copyright::refiner::refine_author(author_raw)
+ .unwrap_or_else(|| normalize_whitespace(author_raw));
+ if !author.is_empty()
+ && !authors.iter().any(|existing| {
+ existing.author == author && existing.start_line == copyright.start_line
+ })
+ {
+ // The new author detection is a single line at the copyright's start line.
+ authors.push(AuthorDetection {
+ author,
+ start_line: copyright.start_line,
+ end_line: copyright.start_line,
+ });
+ }
+
+ if let Some(refined) = refine_copyright(&format!("Copyright {holder_raw} {year}")) {
+ copyright.copyright = refined;
+ }
+
+ // Holder cleanup below only runs when the holder capture can be refined.
+ let Some(refined_holder) = refine_holder_in_copyright_context(holder_raw) else {
+ continue;
+ };
+
+ // Drop holders on this span that still embed the author roster.
+ holders.retain(|holder| {
+ !(holder.start_line == copyright.start_line
+ && holder.end_line == copyright.end_line
+ && holder.holder.contains(author_raw))
+ });
+
+ if !holders.iter().any(|holder| {
+ holder.holder == refined_holder
+ && holder.start_line == copyright.start_line
+ && holder.end_line == copyright.end_line
+ }) {
+ holders.push(HolderDetection {
+ holder: refined_holder,
+ start_line: copyright.start_line,
+ end_line: copyright.end_line,
+ });
+ }
+ }
+}
diff --git a/src/copyright/detector/tests.rs b/src/copyright/detector/tests.rs
index c97d07177..141cdb70b 100644
--- a/src/copyright/detector/tests.rs
+++ b/src/copyright/detector/tests.rs
@@ -180,6 +180,25 @@ fn test_added_copyright_year_for_line_is_extracted() {
);
}
+// `Copyright Notice (<year>) <holder>` must be kept as a copyright, with the trailing
+// `All rights reserved` removed and the holder extracted.
+#[test]
+fn test_structured_copyright_notice_with_year_is_extracted() {
+ let input = "Minpack Copyright Notice (1999) University of Chicago. All rights reserved\n";
+
+ let (copyrights, holders, _authors) = detect_copyrights_from_text(input);
+ assert!(
+ copyrights
+ .iter()
+ .any(|c| c.copyright == "Copyright Notice (1999) University of Chicago"),
+ "copyrights: {:?}",
+ copyrights.iter().map(|c| &c.copyright).collect::>()
+ );
+ assert!(
+ holders.iter().any(|h| h.holder == "University of Chicago"),
+ "holders: {:?}",
+ holders.iter().map(|h| &h.holder).collect::>()
+ );
+}
+
#[test]
fn test_author_prefix_dedup_keeps_short_email_list() {
let input = "Author(s): gthomas, sorin@netappi.com\nContributors: gthomas, sorin@netappi.com, andrew.lunn@ascom.ch\n";
@@ -1208,6 +1227,134 @@ fn test_detect_storyboard_text_attribute_copyright_holder() {
);
}
+// The `applicationLegalese: '…'` wrapper must not leak into the copyright or holder;
+// the `©` sign is expected as `(c)` in the output.
+#[test]
+fn test_detect_flutter_application_legalese_assignment_strips_wrapper() {
+ let input = "applicationLegalese: '© 2014 The Flutter Authors',";
+ let (c, h, _a) = detect_copyrights_from_text(input);
+ assert!(
+ c.iter()
+ .any(|cr| cr.copyright == "(c) 2014 The Flutter Authors"),
+ "copyrights: {:?}",
+ c.iter().map(|cr| &cr.copyright).collect::>()
+ );
+ assert!(
+ h.iter().any(|ho| ho.holder == "The Flutter Authors"),
+ "holders: {:?}",
+ h.iter().map(|ho| &ho.holder).collect::>()
+ );
+}
+
+// A `PRODUCT_COPYRIGHT = …` assignment must yield the notice without the variable
+// name and without the trailing `All rights reserved.` clause.
+#[test]
+fn test_detect_flutter_product_copyright_assignment_strips_wrapper() {
+ let input = "PRODUCT_COPYRIGHT = Copyright © 2014 The Flutter Authors. All rights reserved.";
+ let (c, h, _a) = detect_copyrights_from_text(input);
+ assert!(
+ c.iter()
+ .any(|cr| cr.copyright == "Copyright (c) 2014 The Flutter Authors"),
+ "copyrights: {:?}",
+ c.iter().map(|cr| &cr.copyright).collect::>()
+ );
+ assert!(
+ h.iter().any(|ho| ho.holder == "The Flutter Authors"),
+ "holders: {:?}",
+ h.iter().map(|ho| &ho.holder).collect::>()
+ );
+}
+
+// A `LegalCopyright` resource line with `{{year}}` / `{{organization}}` template
+// placeholders must produce no copyright and no holder.
+#[test]
+fn test_detect_flutter_windows_legalcopyright_template_dropped() {
+ let input = r#"VALUE "LegalCopyright", "Copyright (C) {{year}} {{organization}}. All rights reserved." "\0""#;
+ let (c, h, _a) = detect_copyrights_from_text(input);
+ assert!(c.is_empty(), "copyrights: {c:?}");
+ assert!(h.is_empty(), "holders: {h:?}");
+}
+
+// Icon documentation that merely mentions an icon named "copyright" must not be
+// detected as a copyright notice.
+#[test]
+fn test_detect_flutter_material_icon_doc_false_positive_dropped() {
+ let input = r#"copyright — material icon named "copyright" (sharp)."#;
+ let (c, h, _a) = detect_copyrights_from_text(input);
+ assert!(c.is_empty(), "copyrights: {c:?}");
+ assert!(h.is_empty(), "holders: {h:?}");
+}
+
+// A code line that contains `©` only inside a raw string literal list must produce
+// no copyright and no holder.
+#[test]
+fn test_detect_flutter_verify_entry_false_positive_dropped() {
+ let input = "verifyEntry(mapping, 'KeyC', [r'c', r'C', r'©', r'¢'], 'c');";
+ let (c, h, _a) = detect_copyrights_from_text(input);
+ assert!(c.is_empty(), "copyrights: {c:?}");
+ assert!(h.is_empty(), "holders: {h:?}");
+}
+
+// A year-less notice inside a `b'…'` literal must be detected, with `(C)` reported as
+// `(c)` and the surrounding code excluded from both copyright and holder.
+#[test]
+fn test_detect_python_bytestring_copyright_without_year() {
+ let input = "not line.startswith(b'Copyright (C) Microsoft Corporation') and line):";
+ let (c, h, _a) = detect_copyrights_from_text(input);
+ assert!(
+ c.iter()
+ .any(|cr| cr.copyright == "Copyright (c) Microsoft Corporation"),
+ "copyrights: {:?}",
+ c.iter().map(|cr| &cr.copyright).collect::>()
+ );
+ assert!(
+ h.iter().any(|ho| ho.holder == "Microsoft Corporation"),
+ "holders: {:?}",
+ h.iter().map(|ho| &ho.holder).collect::>()
+ );
+}
+
+// ASCII-art table labels such as `(C)` next to `Row Header` / `Column Header` must not
+// be detected as copyright signs.
+#[test]
+fn test_detect_table_header_labels_with_c_sign_are_dropped() {
+ let input = concat!(
+ " ---------------------- (A) Column Header\n",
+ " | (C) | 1 | 2 | (D) Row Header\n",
+ );
+ let (c, h, _a) = detect_copyrights_from_text(input);
+ assert!(c.is_empty(), "copyrights: {c:?}");
+ assert!(h.is_empty(), "holders: {h:?}");
+}
+
+// The prose `Tests using the examples provided by (c)minpack` must produce no
+// copyright and no holder.
+#[test]
+fn test_detect_minpack_example_prose_false_positive_dropped() {
+ let input = "// Tests using the examples provided by (c)minpack\n";
+ let (c, h, _a) = detect_copyrights_from_text(input);
+ assert!(c.is_empty(), "copyrights: {c:?}");
+ assert!(h.is_empty(), "holders: {h:?}");
+}
+
+// Multi-line snippet: a header comment plus an `applicationLegalese` argument must
+// yield both clean copyright strings, no `applicationLegalese` / `All rights reserved`
+// residue, and `The Flutter Authors` as the only holder value.
+#[test]
+fn test_detect_flutter_about_dialog_snippet_keeps_clean_values() {
+ let input = concat!(
+ "// Copyright 2014 The Flutter Authors. All rights reserved.\n",
+ "showAboutDialog(\n",
+ " context: context,\n",
+ " applicationLegalese: '© 2014 The Flutter Authors',\n",
+ ");\n",
+ );
+ let (c, h, _a) = detect_copyrights_from_text(input);
+
+ let copyrights: Vec<&str> = c.iter().map(|cr| cr.copyright.as_str()).collect();
+ let holders: Vec<&str> = h.iter().map(|ho| ho.holder.as_str()).collect();
+
+ assert!(
+ copyrights.contains(&"Copyright 2014 The Flutter Authors"),
+ "copyrights: {copyrights:?}"
+ );
+ assert!(
+ copyrights.contains(&"(c) 2014 The Flutter Authors"),
+ "copyrights: {copyrights:?}"
+ );
+ assert!(
+ copyrights
+ .iter()
+ .all(|cr| !cr.contains("applicationLegalese") && !cr.contains("All rights reserved")),
+ "copyrights: {copyrights:?}"
+ );
+ assert!(
+ holders.iter().all(|ho| *ho == "The Flutter Authors"),
+ "holders: {holders:?}"
+ );
+}
+
#[test]
fn test_detect_iso_date_holder_regression() {
let input = "Copyright (c) 2006-07-24 John Boolage";
diff --git a/src/copyright/detector/token_utils/filters.rs b/src/copyright/detector/token_utils/filters.rs
index a1bedb6ee..194f656a3 100644
--- a/src/copyright/detector/token_utils/filters.rs
+++ b/src/copyright/detector/token_utils/filters.rs
@@ -367,6 +367,45 @@ pub fn drop_path_fragment_holders_from_bare_c_code_lines(
});
}
+/// Removes single-line copyright and holder detections that sit on lines recognized as
+/// table header labels or as the `tests using the examples provided by (c)` prose.
+///
+/// `raw_lines` is scanned once to collect the 1-based numbers of matching lines; a
+/// detection is dropped only when it starts and ends on the same line and that line is
+/// in the set. Detections spanning several lines are left untouched.
+pub fn drop_test_label_false_positive_copyrights_and_holders(
+ raw_lines: &[&str],
+ copyrights: &mut Vec,
+ holders: &mut Vec,
+) {
+ // Nothing to scan or nothing to drop.
+ if raw_lines.is_empty() || (copyrights.is_empty() && holders.is_empty()) {
+ return;
+ }
+
+ // `(X) row header` / `(X) column header`, or a `| (X) | n | n | (Y) row header` table row.
+ static ROW_HEADER_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(
+ r"(?ix)
+ \([A-Z]\)\s+(?:row|column)\s+header
+ | \|\s*\([A-Z]\)\s*\|\s*\d+\s*\|\s*\d+\s*\|\s*\([A-Z]\)\s+row\s+header
+ ",
+ )
+ .unwrap()
+ });
+ static TEST_EXAMPLES_COPY_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(r"(?i)tests\s+using\s+the\s+examples\s+provided\s+by\s+\(c\)").unwrap()
+ });
+
+ // `idx + 1` converts the enumerate index to the 1-based line number used by detections.
+ let bad_lines: HashSet = raw_lines
+ .iter()
+ .enumerate()
+ .filter_map(|(idx, line)| {
+ (ROW_HEADER_RE.is_match(line) || TEST_EXAMPLES_COPY_RE.is_match(line))
+ .then_some(idx + 1)
+ })
+ .collect();
+
+ if bad_lines.is_empty() {
+ return;
+ }
+
+ copyrights.retain(|c| !(c.start_line == c.end_line && bad_lines.contains(&c.start_line.get())));
+ holders.retain(|h| !(h.start_line == h.end_line && bad_lines.contains(&h.start_line.get())));
+}
+
/// Tags whose filtering should cause adjacent commas to be considered orphaned.
/// Only year-related tags: commas between years (e.g. "2006, 2007") become
/// orphaned when the years are removed. Email/URL commas are intentionally
diff --git a/src/copyright/mod.rs b/src/copyright/mod.rs
index 9b29a0a9a..bc45ac8d3 100644
--- a/src/copyright/mod.rs
+++ b/src/copyright/mod.rs
@@ -28,6 +28,7 @@ mod refiner;
mod types;
pub use credits::{detect_credits_authors, is_credits_file};
+pub(crate) use prepare::prepare_text_line;
pub(crate) use refiner::refine_author;
pub use refiner::refine_copyright;
pub use types::{AuthorDetection, CopyrightDetection, HolderDetection};
diff --git a/src/copyright/refiner/author.rs b/src/copyright/refiner/author.rs
index 96477ac25..f90780ab6 100644
--- a/src/copyright/refiner/author.rs
+++ b/src/copyright/refiner/author.rs
@@ -27,12 +27,14 @@ pub fn refine_author(s: &str) -> Option {
a = strip_trailing_comma_year(&a);
a = strip_trailing_comma_month_year(&a);
a = strip_trailing_comma_email_matching_name(&a);
+ a = truncate_trailing_from_clause_after_angle_contact(&a);
a = truncate_trailing_clause_after_contact(&a);
a = strip_trailing_comma_and(&a);
a = truncate_bug_reports_clause(&a);
a = truncate_caller_specificaly_clause(&a);
a = truncate_json_metadata_tail(&a);
a = truncate_distribution_metadata_tail(&a);
+ a = truncate_generated_month_year_clause(&a);
a = truncate_better_known_as_clause(&a);
a = normalize_slash_spacing(&a);
a = normalize_slash_author_pairs(&a);
@@ -644,6 +646,31 @@ fn truncate_distribution_metadata_tail(s: &str) -> String {
prefix.to_string()
}
+/// Cuts a trailing `Generated <month>[,] [<year>]` clause off an author string.
+///
+/// The month may be a full English name or its abbreviation, and the optional year is
+/// 19xx or 20xx; matching is case-insensitive. Returns the trimmed text before the
+/// clause, or `s` unchanged when the pattern does not match or that prefix is empty.
+fn truncate_generated_month_year_clause(s: &str) -> String {
+ static GENERATED_MONTH_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(
+ r"(?ix)
+ ^(?P.+?)
+ \s+Generated\s+
+ (?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)
+ (?:\s*,?\s*(?:19\d{2}|20\d{2}))?
+ \s*$",
+ )
+ .unwrap()
+ });
+
+ let trimmed = s.trim();
+ let Some(cap) = GENERATED_MONTH_RE.captures(trimmed) else {
+ return s.to_string();
+ };
+ let prefix = cap.name("prefix").map(|m| m.as_str()).unwrap_or("").trim();
+ if prefix.is_empty() {
+ return s.to_string();
+ }
+
+ prefix.to_string()
+}
+
fn looks_like_generated_resource_identifier(s: &str) -> bool {
let trimmed = s.trim();
if trimmed.is_empty() || trimmed.contains(' ') {
@@ -1052,7 +1079,7 @@ fn truncate_trailing_clause_after_contact(s: &str) -> String {
let tail_lower = tail.to_ascii_lowercase();
let prose_like_tail = [
"the ", "a ", "an ", "i ", "since ", "this ", "these ", "those ", "is ", "was ", "visit ",
- "for ",
+ "for ", "from ",
]
.iter()
.any(|prefix_text| tail_lower.starts_with(prefix_text));
@@ -1064,6 +1091,22 @@ fn truncate_trailing_clause_after_contact(s: &str) -> String {
s.to_string()
}
+/// Drops a trailing `from …` clause that follows an angle-bracketed contact.
+///
+/// The kept prefix must end in `<…@…>`; everything from the following ` from` word to
+/// the end of the string is removed (case-insensitive). Returns `s` unchanged when the
+/// pattern does not match or the prefix capture is empty.
+fn truncate_trailing_from_clause_after_angle_contact(s: &str) -> String {
+ static FROM_CLAUSE_RE: LazyLock =
+ LazyLock::new(|| Regex::new(r"(?i)^(?P.+?<[^>]*@[^>]*>)\s+from\b.*$").unwrap());
+
+ let trimmed = s.trim();
+ let Some(cap) = FROM_CLAUSE_RE.captures(trimmed) else {
+ return s.to_string();
+ };
+ let prefix = cap.name("prefix").map(|m| m.as_str()).unwrap_or("").trim();
+ if prefix.is_empty() {
+ return s.to_string();
+ }
+
+ prefix.to_string()
+}
+
fn strip_trailing_status_works(s: &str) -> String {
static STATUS_WORKS_RE: LazyLock =
LazyLock::new(|| Regex::new(r"(?i)^(?P.+\bStatus)\s+works\s*$").unwrap());
diff --git a/src/copyright/refiner/authors_junk_patterns.rs b/src/copyright/refiner/authors_junk_patterns.rs
index d0463aa57..2bbde57ce 100644
--- a/src/copyright/refiner/authors_junk_patterns.rs
+++ b/src/copyright/refiner/authors_junk_patterns.rs
@@ -91,6 +91,7 @@ pub(super) static AUTHORS_JUNK_PATTERNS: LazyLock> = LazyLock::new(||
r"(?i)^gives unlimited$",
r"(?i)^word assigns past and future changes\b",
r"(?i)^maintainers\s*<[^>]+>\s+from\s+https?://\S+$",
+ r"(?i)^maintainers\s*<[^>]+>$",
r"(?i)^versions,\s+and$",
r"(?i)^versions$",
r"(?i)^makes$",
@@ -104,6 +105,8 @@ pub(super) static AUTHORS_JUNK_PATTERNS: LazyLock> = LazyLock::new(||
r"(?i)^the task'?s numa mempolicy\b.*$",
r"(?i)^the authors laboriously took the trouble\b.*$",
r"(?i)^laboriously took the trouble\b.*$",
+ r"(?i)^the code itself\b.*$",
+ r"(?i)^.+\bis a software package provided by\b.*$",
r"(?i)^support\s+for\b.*$",
r"(?i)^addresses\s+\.+.*$",
];
diff --git a/src/copyright/refiner/copyrights_junk_patterns.rs b/src/copyright/refiner/copyrights_junk_patterns.rs
index 8dfe13e48..bd81194e4 100644
--- a/src/copyright/refiner/copyrights_junk_patterns.rs
+++ b/src/copyright/refiner/copyrights_junk_patterns.rs
@@ -9,6 +9,8 @@ use regex::Regex;
pub(super) static COPYRIGHTS_JUNK_PATTERNS: LazyLock> = LazyLock::new(|| {
let patterns = [
r"(?i)^copyright \(c\)$",
+ r"(?i)^copyright exclude$",
+ r"(?i)^copyright doctrines of fair use, fair dealing, or other equivalents\.?$",
r"(?i)^\(c\) by$",
r"(?i)\(c\) [a-zA-Z][a-z] \(c\)",
r"(?i)^copyright holder or simply",
diff --git a/src/copyright/refiner/holders_junk_patterns.rs b/src/copyright/refiner/holders_junk_patterns.rs
index ac59837a0..3b2b8ce09 100644
--- a/src/copyright/refiner/holders_junk_patterns.rs
+++ b/src/copyright/refiner/holders_junk_patterns.rs
@@ -9,6 +9,9 @@ use regex::Regex;
pub(super) static HOLDERS_JUNK_PATTERNS: LazyLock> = LazyLock::new(|| {
let patterns = [
r"(?i)^licenses?,\s+and/or\b",
+ r"(?i)^exclude$",
+ r"(?i)^with the$",
+ r"(?i)^.+,\s+.+\band\s+their\s+employees$",
r"(?i)^holders?,\s*authors\b",
r"(?i)^notice,\s+and\b",
r"(?i)^notice,\s+in\b",
diff --git a/src/copyright/refiner/mod.rs b/src/copyright/refiner/mod.rs
index ade6d4b09..628775633 100644
--- a/src/copyright/refiner/mod.rs
+++ b/src/copyright/refiner/mod.rs
@@ -14,6 +14,7 @@ use std::sync::LazyLock;
use regex::Regex;
use super::candidates::strip_balanced_edge_parens;
+use super::prepare::prepare_text_line;
mod authors_junk_patterns;
mod copyrights_junk_patterns;
mod holders_junk_patterns;
@@ -441,6 +442,10 @@ static HOLDERS_JUNK: LazyLock> = LazyLock::new(|| {
/// Return true if `s` matches any known junk copyright pattern.
pub fn is_junk_copyright(s: &str) -> bool {
+ if looks_like_structured_copyright_notice_with_year(s) {
+ return false;
+ }
+
COPYRIGHTS_JUNK_PATTERNS.iter().any(|re| re.is_match(s))
|| is_junk_copyright_scan_phrase(s)
|| is_junk_copyright_code_fragment(s)
@@ -448,6 +453,14 @@ pub fn is_junk_copyright(s: &str) -> bool {
|| is_junk_c_sign_path_fragment(s)
}
+/// Returns true when the trimmed input starts with `Copyright Notice (<year>)` followed
+/// by whitespace and at least one more character.
+///
+/// Matching is case-insensitive and the year must be 19xx or 20xx.
+fn looks_like_structured_copyright_notice_with_year(s: &str) -> bool {
+ static STRUCTURED_NOTICE_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(r"(?i)^copyright\s+notice\s*\(\s*(?:19\d{2}|20\d{2})\s*\)\s+.+$").unwrap()
+ });
+
+ STRUCTURED_NOTICE_RE.is_match(s.trim())
+}
+
fn has_copyright_year(s: &str) -> bool {
static COPYRIGHT_YEAR_RE: LazyLock = LazyLock::new(|| {
Regex::new(r"(?i)\b(?:19\d{2}|20\d{2})(?:\s*[-–/]\s*(?:19\d{2}|20\d{2}|\d{2}))?\b").unwrap()
@@ -507,8 +520,10 @@ fn is_junk_copyright_code_fragment(s: &str) -> bool {
|| lower == "copyright void"
|| trimmed.contains("??")
|| contains_member_access_code_token(trimmed)
+ || contains_code_string_literal_fragment(trimmed)
|| contains_unicode_escape_token_run(trimmed)
|| contains_html_entity_decoder_artifact(trimmed)
+ || contains_markup_tag_fragment(trimmed)
|| contains_xml_markup_declaration_token(trimmed)
|| contains_regex_or_template_marker(trimmed)
|| has_windows_versioninfo_markers
@@ -568,10 +583,14 @@ fn is_junk_holder_code_fragment(s: &str) -> bool {
|| contains_embedded_file_reference_prose(trimmed)
|| lower.contains("icondata")
|| lower.contains("authors.append")
+ || lower == "void"
+ || looks_like_parenthesized_ui_descriptor(trimmed)
|| contains_member_access_code_token(trimmed)
|| contains_code_call_fragment(trimmed)
+ || contains_code_string_literal_fragment(trimmed)
|| contains_unicode_escape_token_run(trimmed)
|| contains_html_entity_decoder_artifact(trimmed)
+ || contains_markup_tag_fragment(trimmed)
|| contains_xml_markup_declaration_token(trimmed)
|| contains_regex_or_template_marker(trimmed)
|| has_windows_versioninfo_markers
@@ -655,6 +674,7 @@ fn contains_html_entity_decoder_artifact(s: &str) -> bool {
let lower = s.to_ascii_lowercase();
lower.contains("u00a0")
|| lower.contains("hellip")
+ || lower.contains("x2014")
|| lower.contains("x2f")
|| lower.contains("reg 174")
|| lower.contains("copy 169")
@@ -680,6 +700,29 @@ fn contains_generated_resource_token(s: &str) -> bool {
ASSET_RE.is_match(trimmed)
}
+/// Reports whether the trimmed input contains a markup-tag-like fragment.
+///
+/// Inputs containing `@`, `www.`, or one of the listed domain suffixes are never
+/// reported, so the check returns false for them before the tag pattern is consulted.
+///
+/// NOTE(review): as shown here the pattern begins with a dangling `?` and the final
+/// `contains` receives an empty string, which is true for every input. Both look like
+/// `<…>` text lost when this diff was rendered — confirm against the real source file.
+fn contains_markup_tag_fragment(s: &str) -> bool {
+ static MARKUP_TAG_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(r"(?i)?[a-z][^>]*>|<[!?][^>]*>").expect("valid markup tag fragment regex")
+ });
+
+ let trimmed = s.trim();
+ let lower = trimmed.to_ascii_lowercase();
+ // Early exit for text that carries an `@` or a URL/domain marker.
+ if trimmed.contains('@')
+ || lower.contains("www.")
+ || lower.contains(".com")
+ || lower.contains(".org")
+ || lower.contains(".net")
+ || lower.contains(".edu")
+ || lower.contains(".gov")
+ || lower.contains(".io")
+ || lower.contains(".dev")
+ {
+ return false;
+ }
+
+ MARKUP_TAG_RE.is_match(trimmed) || trimmed.contains("")
+}
+
fn contains_member_access_code_token(s: &str) -> bool {
let trimmed = s.trim();
let lower = trimmed.to_ascii_lowercase();
@@ -704,6 +747,25 @@ fn contains_member_access_code_token(s: &str) -> bool {
MEMBER_ACCESS_RE.is_match(trimmed)
}
+/// Reports whether `s` contains a fragment of a source-code string literal.
+///
+/// Two shapes are recognized, ASCII case-insensitively:
+/// - a quoted NUL escape, `"\0"` or `'\0'`;
+/// - a raw-string opener `r'` / `r"`, optionally written `br'` / `br"`, that starts a
+///   token, i.e. is at the start of the text or follows a character that is not an
+///   ASCII letter, digit, underscore, or non-ASCII character.
+///
+/// The token-start requirement keeps ordinary possessives such as `Peter's` or
+/// `Miller's` from being classified as code.
+fn contains_code_string_literal_fragment(s: &str) -> bool {
+    let lower = s.trim().to_ascii_lowercase();
+    if lower.contains("\"\\0\"") || lower.contains("'\\0'") {
+        return true;
+    }
+
+    let bytes = lower.as_bytes();
+    bytes.windows(2).enumerate().any(|(idx, pair)| {
+        if pair[0] != b'r' || !matches!(pair[1], b'\'' | b'"') {
+            return false;
+        }
+        // Look at what precedes the `r`, skipping one optional `b` (the `br'…'` form).
+        let mut before = bytes[..idx].iter().rev().copied();
+        let preceding = match before.next() {
+            Some(b'b') => before.next(),
+            other => other,
+        };
+        // Non-ASCII bytes belong to a multi-byte letter, so they count as word content.
+        !preceding.is_some_and(|byte| {
+            byte.is_ascii_alphanumeric() || byte == b'_' || !byte.is_ascii()
+        })
+    })
+}
+
+/// Returns true when the trimmed input is exactly one of `(sharp)`, `(round)`,
+/// `(rounded)`, `(outline)`, `(outlined)` or `(filled)`, compared case-insensitively.
+fn looks_like_parenthesized_ui_descriptor(s: &str) -> bool {
+ static UI_DESCRIPTOR_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(r"(?i)^\((?:sharp|round|rounded|outline|outlined|filled)\)$")
+ .expect("valid UI descriptor regex")
+ });
+
+ UI_DESCRIPTOR_RE.is_match(s.trim())
+}
+
fn is_post_refine_copyright_code_fragment(s: &str) -> bool {
let trimmed = s.trim();
let lower = trimmed.to_ascii_lowercase();
@@ -711,7 +773,9 @@ fn is_post_refine_copyright_code_fragment(s: &str) -> bool {
contains_windows_versioninfo_token(trimmed)
|| contains_member_access_code_token(trimmed)
|| contains_code_call_fragment(trimmed)
+ || contains_code_string_literal_fragment(trimmed)
|| contains_unicode_escape_token_run(trimmed)
+ || contains_markup_tag_fragment(trimmed)
|| lower.contains("public void")
|| lower.contains("get set")
|| lower.contains("assert.equal")
@@ -777,6 +841,7 @@ fn is_explicit_generic_field_label_token(s: &str) -> bool {
"description",
"direction",
"options",
+ "organization",
"owner_name",
"params",
"placeholder",
@@ -784,6 +849,7 @@ fn is_explicit_generic_field_label_token(s: &str) -> bool {
"ref",
"reviewers",
"schema",
+ "sharp",
"source",
"text",
"timeago",
@@ -791,6 +857,11 @@ fn is_explicit_generic_field_label_token(s: &str) -> bool {
"tooltip",
"unique",
"username",
+ "round",
+ "rounded",
+ "outline",
+ "outlined",
+ "filled",
])
});
@@ -1015,7 +1086,11 @@ fn is_obvious_prose_fragment(s: &str) -> bool {
if lower.starts_with("not by ") {
return false;
}
- if lower.contains("code sample for") {
+ if lower.contains("code sample for")
+ || lower.contains("tests using the examples provided by")
+ || lower.ends_with("row header")
+ || lower.ends_with("column header")
+ {
return true;
}
@@ -1080,6 +1155,7 @@ pub fn refine_copyright(s: &str) -> Option {
return None;
}
let mut c = original.clone();
+ c = strip_known_copyright_wrappers(&c);
c = strip_trailing_quote_before_email(&c);
c = normalize_b_dot_angle_emails(&c);
c = strip_nickname_quotes(&c);
@@ -1091,6 +1167,7 @@ pub fn refine_copyright(s: &str) -> Option {
c = strip_trailing_paren_email_after_c_by(&c);
c = strip_trailing_for_clause_after_email(&c);
c = strip_trailing_at_affiliation(&c);
+ c = strip_trailing_single_letter_obfuscated_email_phrase(&c);
c = strip_trailing_obfuscated_email_after_dash(&c);
c = strip_url_token_between_years_and_holder(&c);
c = strip_obfuscated_angle_emails(&c);
@@ -1120,14 +1197,17 @@ pub fn refine_copyright(s: &str) -> Option {
c = strip_trailing_paren_identifier(&c);
c = strip_trailing_company_name_placeholder(&c);
c = strip_trailing_company_co_ltd(&c);
+ c = strip_trailing_heavily_based_clause(&c);
c = strip_trailing_obfuscated_email_in_angle_brackets_after_copyright(&c);
c = strip_trailing_linux_ag_location_in_copyright(&c);
+ c = strip_trailing_locale_timestamp_before_terminal_year_in_copyright(&c);
c = strip_trailing_by_person_clause_after_company(&c);
c = strip_trailing_division_of_company_suffix(&c);
c = strip_trailing_paren_at_without_domain(&c);
c = strip_trailing_inc_after_today_year_placeholder(&c);
c = truncate_trailing_boilerplate(&c);
c = strip_trailing_everyone_is_permitted_to_copy_clause(&c);
+ c = strip_trailing_all_rights_reserved_clause(&c);
c = strip_trailing_author_label(&c);
c = strip_trailing_credit_file_reference_clause(&c);
c = strip_trailing_isc_after_inc(&c);
@@ -1203,7 +1283,7 @@ pub fn refine_copyright(s: &str) -> Option {
let result_upper = result.to_ascii_uppercase();
if result_upper.contains("COPYRIGHT")
&& result_upper.contains("YEAR")
- && result_upper.contains("YOUR NAME")
+ && (result_upper.contains("YOUR NAME") || result_upper.contains("ORGANIZATION"))
{
return None;
}
@@ -1211,6 +1291,7 @@ pub fn refine_copyright(s: &str) -> Option {
return None;
}
if is_post_refine_copyright_code_fragment(&result)
+ || is_explicit_junk_copyright_phrase(&result)
|| is_junk_copyright_of_header(&result)
|| is_junk_copyrighted_works_header(&result)
|| is_junk_copyrighted_software_phrase(&result)
@@ -1224,6 +1305,116 @@ pub fn refine_copyright(s: &str) -> Option {
}
}
+/// Reports whether `s` is one of a fixed set of phrases that contain the word
+/// "copyright" but are treated as junk rather than as a notice.
+///
+/// Surrounding whitespace and ASCII case are ignored; the whole string has to
+/// equal one of the listed phrases.
+fn is_explicit_junk_copyright_phrase(s: &str) -> bool {
+    const JUNK_PHRASES: [&str; 3] = [
+        "copyright exclude",
+        "copyright doctrines of fair use, fair dealing, or other equivalents",
+        "copyright doctrines of fair use, fair dealing, or other equivalents.",
+    ];
+
+    let candidate = s.trim().to_ascii_lowercase();
+    JUNK_PHRASES.contains(&candidate.as_str())
+}
+
+/// Unwraps a copyright statement that is embedded in a known wrapper syntax.
+///
+/// Wrappers are tried in this order against the trimmed input:
+/// 1. `VALUE "LegalCopyright", "TEXT"` with an optional trailing `"\0"` token;
+/// 2. `PRODUCT_COPYRIGHT = TEXT` or `INFOPLIST_KEY_NSHumanReadableCopyright = TEXT`
+///    (optional trailing `;`);
+/// 3. `applicationLegalese: TEXT` (optional trailing `,`);
+/// 4. a `text="TEXT"` / `text='TEXT'` attribute anywhere in the line.
+///
+/// For (1) any non-blank value is accepted. For (2)-(4) the value, after
+/// removing surrounding quotes where applicable, must start with `Copyright`
+/// or `©`. An accepted value is passed through `prepare_text_line` and trimmed.
+/// When nothing is accepted, `s` is returned unchanged.
+fn strip_known_copyright_wrappers(s: &str) -> String {
+    static VALUE_LEGALCOPYRIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r#"(?ix)
+            ^VALUE\s+"LegalCopyright"\s*,\s*"(?P<value>[^"]+)"
+            (?:\s+"\\0")?\s*$
+            "#,
+        )
+        .expect("valid LegalCopyright wrapper regex")
+    });
+    static ASSIGNMENT_COPYRIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r#"(?ix)
+            ^(?:PRODUCT_COPYRIGHT|INFOPLIST_KEY_NSHumanReadableCopyright)
+            \s*=\s*(?P<value>.+?)\s*;?\s*$
+            "#,
+        )
+        .expect("valid assignment copyright wrapper regex")
+    });
+    static APPLICATION_LEGALESE_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(r#"(?ix)^applicationLegalese\s*:\s*(?P<value>.+?)\s*,?\s*$"#)
+            .expect("valid applicationLegalese wrapper regex")
+    });
+    static MARKUP_TEXT_COPYRIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r#"(?ix)
+            \btext\s*=\s*(?:"(?P<dq>[^"]+)"|'(?P<sq>[^']+)')
+            "#,
+        )
+        .expect("valid markup text copyright wrapper regex")
+    });
+
+    // Shared acceptance test for the wrappers that can hold arbitrary text.
+    let looks_like_notice =
+        |value: &str| value.starts_with("Copyright") || value.starts_with('©');
+
+    let trimmed = s.trim();
+    if let Some(captures) = VALUE_LEGALCOPYRIGHT_RE.captures(trimmed) {
+        let value = captures.name("value").map_or("", |m| m.as_str()).trim();
+        if !value.is_empty() {
+            return prepare_text_line(value).trim().to_string();
+        }
+    }
+
+    for regex in [&*ASSIGNMENT_COPYRIGHT_RE, &*APPLICATION_LEGALESE_RE] {
+        if let Some(captures) = regex.captures(trimmed) {
+            let value = captures
+                .name("value")
+                .map_or("", |m| m.as_str())
+                .trim()
+                .trim_matches(&['\'', '"'][..]);
+            if looks_like_notice(value) {
+                return prepare_text_line(value).trim().to_string();
+            }
+        }
+    }
+
+    if let Some(captures) = MARKUP_TEXT_COPYRIGHT_RE.captures(trimmed) {
+        // Exactly one of the two alternatives (double- or single-quoted) matched.
+        let value = captures
+            .name("dq")
+            .or_else(|| captures.name("sq"))
+            .map_or("", |m| m.as_str())
+            .trim();
+        if looks_like_notice(value) {
+            return prepare_text_line(value).trim().to_string();
+        }
+    }
+
+    s.to_string()
+}
+
+/// Removes a trailing "All rights reserved" clause from a copyright statement.
+///
+/// The clause is matched case-insensitively at the end of the trimmed input,
+/// with an optional period before and after it. It is only removed when the
+/// text in front of it starts with `copyright`, `(c)` or `©`; that text is
+/// returned with trailing spaces and `.`, `,`, `;`, `:` stripped. In every
+/// other case `s` is returned unchanged.
+fn strip_trailing_all_rights_reserved_clause(s: &str) -> String {
+    static ALL_RIGHTS_RESERVED_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(r"(?ix)^(?P<prefix>.+?)\.?\s+all\s+rights\s+reserved\.?$")
+            .expect("valid all rights reserved regex")
+    });
+
+    let trimmed = s.trim();
+    let Some(captures) = ALL_RIGHTS_RESERVED_RE.captures(trimmed) else {
+        return s.to_string();
+    };
+
+    let prefix = captures.name("prefix").map_or("", |m| m.as_str()).trim();
+    let lower = prefix.to_ascii_lowercase();
+    let starts_like_notice =
+        lower.starts_with("copyright") || lower.starts_with("(c)") || lower.starts_with('©');
+    if prefix.is_empty() || !starts_like_notice {
+        return s.to_string();
+    }
+
+    prefix
+        .trim_end_matches(&[' ', '.', ',', ';', ':'][..])
+        .to_string()
+}
+
fn strip_trailing_obfuscated_email_after_dash(s: &str) -> String {
static TRAILING_DASH_OBF_EMAIL_RE: LazyLock = LazyLock::new(|| {
Regex::new(
@@ -1263,6 +1454,76 @@ fn strip_trailing_obfuscated_email_after_dash(s: &str) -> String {
prefix.to_string()
}
+/// Strips a trailing obfuscated e-mail written as `USER a DOMAIN TLD`
+/// (for example `mark a borgerding net`) from the end of `s`.
+///
+/// The tail is only removed when both `USER` and `DOMAIN` already occur as
+/// words in the text before it. Words are compared case-insensitively after
+/// trimming leading/trailing characters other than alphanumerics, `-` and `_`,
+/// and words shorter than two characters are ignored. When the tail is
+/// removed, the trimmed prefix is returned; otherwise `s` is returned
+/// unchanged.
+fn strip_trailing_single_letter_obfuscated_email_phrase(s: &str) -> String {
+    static SINGLE_LETTER_OBF_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r"(?ix)
+            ^(?P<prefix>.+?)
+            \s+
+            (?P<user>[a-z0-9][a-z0-9._-]{0,63})
+            \s+a\s+
+            (?P<domain>[a-z0-9][a-z0-9._-]{0,63})
+            \s+
+            (?P<tld>com|org|net|edu|gov|mil|io|co|us|uk|de|fr|jp|cn|in|info|biz|me|tv|ca|au)
+            \s*$",
+        )
+        .expect("valid single-letter obfuscated email regex")
+    });
+
+    let trimmed = s.trim();
+    let Some(cap) = SINGLE_LETTER_OBF_RE.captures(trimmed) else {
+        return s.to_string();
+    };
+
+    let prefix = cap.name("prefix").map_or("", |m| m.as_str()).trim();
+    let user = cap.name("user").map_or("", |m| m.as_str()).trim();
+    let domain = cap.name("domain").map_or("", |m| m.as_str()).trim();
+    if prefix.is_empty() || user.is_empty() || domain.is_empty() {
+        return s.to_string();
+    }
+
+    // Normalized words of the prefix; the e-mail parts must repeat two of them.
+    let prefix_tokens: HashSet<String> = prefix
+        .split_whitespace()
+        .map(|token| {
+            token
+                .trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_')
+                .to_ascii_lowercase()
+        })
+        .filter(|token| token.len() >= 2)
+        .collect();
+
+    if prefix_tokens.contains(&user.to_ascii_lowercase())
+        && prefix_tokens.contains(&domain.to_ascii_lowercase())
+    {
+        return prefix.to_string();
+    }
+
+    s.to_string()
+}
+
+/// Cuts a trailing `Heavily` or `Heavily based ...` clause off `s`.
+///
+/// The word is matched case-insensitively and must be preceded by whitespace.
+/// The clause is only dropped when the remaining prefix starts with
+/// `copyright` or `(c)`, or when `prefix_has_holder_words` accepts it; the
+/// trimmed prefix is then returned. Otherwise `s` is returned unchanged.
+fn strip_trailing_heavily_based_clause(s: &str) -> String {
+    static HEAVILY_BASED_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(r"(?i)^(?P<prefix>.+?)\s+Heavily(?:\s+based\b.*)?$")
+            .expect("valid heavily-based clause regex")
+    });
+
+    let trimmed = s.trim();
+    let Some(cap) = HEAVILY_BASED_RE.captures(trimmed) else {
+        return s.to_string();
+    };
+
+    let prefix = cap.name("prefix").map_or("", |m| m.as_str()).trim();
+    if prefix.is_empty() {
+        return s.to_string();
+    }
+
+    let lower = prefix.to_ascii_lowercase();
+    if lower.starts_with("copyright") || lower.starts_with("(c)") || prefix_has_holder_words(prefix)
+    {
+        return prefix.to_string();
+    }
+
+    s.to_string()
+}
+
fn strip_trailing_credit_file_reference_clause(s: &str) -> String {
let trimmed = s.trim();
let lower = trimmed.to_ascii_lowercase();
@@ -1432,6 +1693,40 @@ fn strip_trailing_linux_ag_location_in_copyright(s: &str) -> String {
s.to_string()
}
+/// Removes a trailing timestamp of the shape
+/// `, AAA BBB D HH:MM:SS ZONE [YYYY]` from a copyright statement, keeping the
+/// optional final four-digit year.
+///
+/// `AAA` and `BBB` are any three letters, `D` is one or two digits and `ZONE`
+/// is two to five letters (all case-insensitive). The timestamp is only
+/// removed when the text before the comma starts with `copyright`, `(c)` or
+/// `©`. The result is that prefix (trailing commas/spaces stripped), followed
+/// by a space and the year when one was present. Otherwise `s` is returned
+/// unchanged.
+fn strip_trailing_locale_timestamp_before_terminal_year_in_copyright(s: &str) -> String {
+    static LOCALE_TIMESTAMP_COPY_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r"(?ix)
+            ^(?P<prefix>.+?),\s*
+            [a-z]{3}\s+[a-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+[A-Z]{2,5}
+            (?:\s+(?P<year>\d{4}))?\s*$
+            ",
+        )
+        .expect("valid locale timestamp copyright regex")
+    });
+
+    let trimmed = s.trim();
+    let Some(cap) = LOCALE_TIMESTAMP_COPY_RE.captures(trimmed) else {
+        return s.to_string();
+    };
+    let prefix = cap.name("prefix").map_or("", |m| m.as_str()).trim();
+    if prefix.is_empty() {
+        return s.to_string();
+    }
+    let lower = prefix.to_ascii_lowercase();
+    if !(lower.starts_with("copyright") || lower.starts_with("(c)") || lower.starts_with('©')) {
+        return s.to_string();
+    }
+
+    let holder_part = prefix.trim_end_matches(&[',', ' '][..]);
+    match cap.name("year").map(|m| m.as_str()).filter(|y| !y.is_empty()) {
+        Some(year) => format!("{holder_part} {year}"),
+        None => holder_part.to_string(),
+    }
+}
+
fn strip_trailing_quote_before_email(s: &str) -> String {
static TRAILING_QUOTE_BEFORE_EMAIL_RE: LazyLock = LazyLock::new(|| {
Regex::new(
@@ -1824,6 +2119,30 @@ fn strip_trailing_component_descriptor_from_holder(s: &str) -> String {
s.to_string()
}
+/// Truncates a holder string at the first usable prose marker.
+///
+/// Markers are tried in the listed order against an ASCII-lowercased copy of
+/// the trimmed input. For a marker that occurs, the text before it (minus
+/// trailing `,`, `;`, `:` and spaces) is returned if it is non-empty and
+/// `prefix_has_holder_words` accepts it; otherwise the next marker is tried.
+/// When no marker yields a usable prefix, `s` is returned unchanged.
+fn strip_trailing_holder_prose_clause(s: &str) -> String {
+    const PROSE_MARKERS: [&str; 6] = [
+        " and it is hereby released to the",
+        " it is hereby released to the",
+        ", are derived from ",
+        " are derived from ",
+        " and is licensed under ",
+        " and labeled as such",
+    ];
+
+    let trimmed = s.trim();
+    // ASCII lowercasing keeps byte offsets aligned with `trimmed`.
+    let lower = trimmed.to_ascii_lowercase();
+
+    PROSE_MARKERS
+        .iter()
+        .find_map(|marker| {
+            let cut = lower.find(marker)?;
+            let head = trimmed[..cut]
+                .trim_end_matches(&[',', ';', ':', ' '][..])
+                .trim();
+            (!head.is_empty() && prefix_has_holder_words(head)).then(|| head.to_string())
+        })
+        .unwrap_or_else(|| s.to_string())
+}
+
fn strip_trailing_or_suffix(s: &str) -> String {
static TRAILING_OR_RE: LazyLock =
LazyLock::new(|| Regex::new(r"(?i)^(?Pcopyright\b.+?)\s+or\s*$").unwrap());
@@ -2319,12 +2638,14 @@ fn refine_holder_impl(s: &str, in_copyright_context: bool) -> Option {
h = strip_trailing_email_token(&h);
h = strip_trailing_obfuscated_email_phrase_in_holder(&h);
}
+ h = strip_trailing_single_letter_obfuscated_email_phrase(&h);
h = strip_parenthesized_emails(&h);
h = strip_trailing_parenthesized_url_or_domain(&h);
h = strip_contributor_parens_after_org(&h);
h = normalize_comma_spacing(&h);
h = normalize_angle_bracket_comma_spacing(&h);
h = strip_trailing_linux_ag_location(&h);
+ h = strip_trailing_locale_timestamp_in_holder(&h);
h = strip_trailing_but_suffix(&h);
if had_paren_email {
h = remove_comma_between_person_and_company_suffix(&h);
@@ -2349,6 +2670,7 @@ fn refine_holder_impl(s: &str, in_copyright_context: bool) -> Option {
h = strip_trailing_paren_identifier(&h);
h = strip_trailing_company_name_placeholder(&h);
h = strip_trailing_confidentiality_qualifier(&h);
+ h = strip_trailing_heavily_based_clause(&h);
if in_copyright_context {
h = strip_trailing_short_surname_paren_list_in_holder(&h);
@@ -2366,6 +2688,7 @@ fn refine_holder_impl(s: &str, in_copyright_context: bool) -> Option {
h = remove_some_extra_words_and_punct(&h);
h = strip_trailing_incomplete_as_represented_by(&h);
+ h = strip_trailing_holder_prose_clause(&h);
h = h.trim_matches(&['/', ' ', '~'][..]).to_string();
h = refine_names(&h, prefixes);
h = strip_trailing_company_co_ltd(&h);
@@ -2510,6 +2833,29 @@ fn strip_trailing_linux_ag_location(s: &str) -> String {
s.to_string()
}
+/// Removes a trailing timestamp of the shape
+/// `, AAA BBB D HH:MM:SS ZONE [YYYY]` from a holder string.
+///
+/// `AAA` and `BBB` are any three letters, `D` is one or two digits and `ZONE`
+/// is two to five letters (all case-insensitive). The text before the comma is
+/// returned (trailing commas/spaces stripped) when it is non-empty and
+/// `prefix_has_holder_words` accepts it; otherwise `s` is returned unchanged.
+fn strip_trailing_locale_timestamp_in_holder(s: &str) -> String {
+    static LOCALE_TIMESTAMP_HOLDER_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r"(?ix)
+            ^(?P<prefix>.+?),\s*
+            [a-z]{3}\s+[a-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+[A-Z]{2,5}
+            (?:\s+\d{4})?\s*$
+            ",
+        )
+        .expect("valid locale timestamp holder regex")
+    });
+
+    let trimmed = s.trim();
+    let Some(cap) = LOCALE_TIMESTAMP_HOLDER_RE.captures(trimmed) else {
+        return s.to_string();
+    };
+    let prefix = cap.name("prefix").map_or("", |m| m.as_str()).trim();
+    if prefix.is_empty() || !prefix_has_holder_words(prefix) {
+        return s.to_string();
+    }
+    prefix.trim_end_matches(&[',', ' '][..]).to_string()
+}
+
fn remove_comma_between_person_and_company_suffix(s: &str) -> String {
static COMMA_CORP_RE: LazyLock = LazyLock::new(|| {
Regex::new(r"^(?P[\p{Lu}][^,]{2,64}(?:\s+[\p{Lu}][^,]{2,64})+)\s*,\s*(?P[^,]{2,64}\b(?:Corp\.?|Corporation|Inc\.?|Ltd\.?))\s*$")
diff --git a/src/copyright/refiner/tests.rs b/src/copyright/refiner/tests.rs
index 0a6a6dd13..ea5c30f3b 100644
--- a/src/copyright/refiner/tests.rs
+++ b/src/copyright/refiner/tests.rs
@@ -1548,6 +1548,55 @@ fn test_refine_copyright_drops_versioninfo_and_dtd_junk() {
);
}
+// Checks how `refine_copyright` handles notices wrapped in config/markup
+// syntaxes: the wrapper is expected to be removed, and a templated
+// `{{year}} {{organization}}` notice is expected to be rejected.
+#[test]
+fn test_refine_copyright_strips_flutter_wrapper_context() {
+ assert_eq!(
+ refine_copyright("applicationLegalese: '© 2014 The Flutter Authors',"),
+ Some("(c) 2014 The Flutter Authors".to_string())
+ );
+ assert_eq!(
+ refine_copyright(
+ "PRODUCT_COPYRIGHT = Copyright © 2014 The Flutter Authors. All rights reserved."
+ ),
+ Some("Copyright (c) 2014 The Flutter Authors".to_string())
+ );
+ assert_eq!(
+ refine_copyright(
+ // NOTE(review): this raw string is empty, yet a 2018 notice is expected
+ // below; the markup fixture appears to be missing here - confirm
+ // against the repository before relying on this case.
+ r#""#
+ ),
+ Some("(c) 2018 The Flutter Authors".to_string())
+ );
+ assert_eq!(
+ refine_copyright(
+ r#"VALUE "LegalCopyright", "Copyright (C) {{year}} {{organization}}. All rights reserved." "\0""#
+ ),
+ None
+ );
+}
+
+/// Code and generated-doc snippets that only mention "copyright" or contain a
+/// `©` glyph must not be refined into a copyright.
+#[test]
+fn test_refine_copyright_drops_flutter_generated_code_fragments() {
+    let junk_inputs = [
+        r#"copyright — material icon named "copyright" (sharp)."#,
+        "verifyEntry(mapping, 'KeyC', [r'c', r'C', r'©', r'¢'], 'c');",
+        "r'u3 u©g˝g' r'v2˚kk' r'w2ÂzÅz'",
+    ];
+
+    for input in junk_inputs {
+        assert_eq!(refine_copyright(input), None);
+    }
+}
+
+/// A trailing "All rights reserved." clause is removed while the holder's own
+/// final period is kept.
+#[test]
+fn test_refine_copyright_strips_all_rights_reserved_clause() {
+    let refined = refine_copyright("Copyright 2024 Apple Inc. All rights reserved.");
+
+    assert_eq!(refined, Some(String::from("Copyright 2024 Apple Inc.")));
+}
+
#[test]
fn test_refine_holder_drops_versioninfo_and_dtd_junk() {
assert_eq!(
@@ -1559,6 +1608,14 @@ fn test_refine_holder_drops_versioninfo_and_dtd_junk() {
assert_eq!(refine_holder("HeaderType.Content u00AD u00AE"), None);
}
+/// Holder candidates that are code or label fragments must be rejected.
+#[test]
+fn test_refine_holder_drops_flutter_generated_code_fragments() {
+    let junk_holders = ["x2014 material icon named", "r'¢", "void", "organization"];
+
+    for candidate in junk_holders {
+        assert_eq!(refine_holder(candidate), None);
+    }
+}
+
#[test]
fn test_refine_author_drops_template_token_runs_and_numeric_fragments() {
assert_eq!(refine_author("AUTH CONTRIBUTORS AUTHS+ + 2660"), None);
@@ -2055,3 +2112,112 @@ fn test_refine_holder_drops_css_selector_noise() {
assert_eq!(refine_holder("Legal Notice"), None);
assert_eq!(refine_holder("color 666666"), None);
}
+
+// Checks that `refine_author` drops a trailing "Generated <month>" fragment
+// and a trailing "from boost lib" fragment.
+#[test]
+fn test_refine_author_strips_generated_month_year_and_from_lib_tail() {
+ assert_eq!(
+ refine_author("Intel Corporation Generated November"),
+ Some("Intel Corporation".to_string())
+ );
+ assert_eq!(
+ refine_author("L. Plagne from boost lib"),
+ // NOTE(review): the expected value ends with a space, which suggests a
+ // token after the name is missing from both literals here - confirm.
+ Some("L. Plagne ".to_string())
+ );
+}
+
+/// Long prose sentences that merely name people or institutions must not be
+/// accepted as authors.
+#[test]
+fn test_refine_author_drops_code_itself_and_lapack_package_prose() {
+    let prose_inputs = [
+        "the code itself Stefan I. Larimore and Timothy A. Davis (davis@cise.ufl.edu), University of Florida. The algorithm was in collaboration with John Gilbert, Xerox PARC, and Esmond Ng, Oak Ridge National Laboratory",
+        "LAPACK is a software package provided by Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd",
+    ];
+
+    for prose in prose_inputs {
+        assert_eq!(refine_author(prose), None);
+    }
+}
+
+/// Junk holder candidates are rejected, and a dangling trailing "Heavily" is
+/// cut off a real holder name.
+#[test]
+fn test_refine_holder_drops_exclude_disclaimer_and_trailing_heavily() {
+    let rejected = [
+        "EXCLUDE",
+        "with the",
+        "THE UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND THEIR EMPLOYEES",
+    ];
+    for candidate in rejected {
+        assert_eq!(refine_holder(candidate), None);
+    }
+
+    let refined = refine_holder("Konstantinos Margaritis Heavily");
+    assert_eq!(refined, Some(String::from("Konstantinos Margaritis")));
+}
+
+/// A trailing `user a domain tld` e-mail obfuscation is removed from both the
+/// holder and the copyright statement.
+#[test]
+fn test_refine_holder_and_copyright_strip_single_letter_obfuscated_email_tail() {
+    let holder = refine_holder("Mark Borgerding mark a borgerding net");
+    let copyright = refine_copyright("Copyright (c) 2009 Mark Borgerding mark a borgerding net");
+
+    assert_eq!(holder, Some(String::from("Mark Borgerding")));
+    assert_eq!(
+        copyright,
+        Some(String::from("Copyright (c) 2009 Mark Borgerding"))
+    );
+}
+
+/// Known junk phrases that start with "copyright" are rejected outright.
+#[test]
+fn test_refine_copyright_drops_exclude_and_mpl_fair_use_noise() {
+    let junk_phrases = [
+        "copyright EXCLUDE",
+        "copyright doctrines of fair use, fair dealing, or other equivalents",
+    ];
+
+    for phrase in junk_phrases {
+        assert_eq!(refine_copyright(phrase), None);
+    }
+}
+
+// Checks that `refine_copyright` removes a dangling trailing "Heavily".
+#[test]
+fn test_refine_copyright_strips_trailing_heavily_based_clause() {
+ assert_eq!(
+ refine_copyright("Copyright (c) 2010 Konstantinos Margaritis Heavily"),
+ // NOTE(review): the expected value ends with a space, which suggests a
+ // token after the name is missing from both literals here - confirm.
+ Some("Copyright (c) 2010 Konstantinos Margaritis ".to_string())
+ );
+}
+
+/// A `Copyright Notice (YEAR) HOLDER` statement passes through unchanged.
+#[test]
+fn test_refine_copyright_keeps_structured_copyright_notice_with_year() {
+    let notice = "Copyright Notice (1999) University of Chicago";
+
+    assert_eq!(refine_copyright(notice), Some(notice.to_string()));
+}
+
+/// The timestamp between holder and year is dropped; the year is kept.
+#[test]
+fn test_refine_copyright_strips_locale_timestamp_before_year() {
+    let refined = refine_copyright("Copyright (C) EDF R&D, lun sep 30 14:23:19 CEST 2002");
+
+    assert_eq!(refined, Some(String::from("Copyright (C) EDF R&D 2002")));
+}
+
+/// A trailing timestamp after the holder name is removed.
+#[test]
+fn test_refine_holder_strips_locale_timestamp_suffix() {
+    let refined = refine_holder("EDF R&D, lun sep 30 14:23:19 CEST");
+
+    assert_eq!(refined, Some(String::from("EDF R&D")));
+}
+
+/// Prose that bleeds into the holder after the name is cut off.
+#[test]
+fn test_refine_holder_strips_trailing_prose_clauses() {
+    let cases = [
+        (
+            "Alexander Peslyak and it is hereby released to the",
+            "Alexander Peslyak",
+        ),
+        (
+            "Andreas Dilger, are derived from libpng-0.88",
+            "Andreas Dilger",
+        ),
+    ];
+
+    for (input, expected) in cases {
+        assert_eq!(refine_holder(input), Some(expected.to_string()));
+    }
+}
diff --git a/src/scanner/process/copyright.rs b/src/scanner/process/copyright.rs
index db2d3f057..816a9fe13 100644
--- a/src/scanner/process/copyright.rs
+++ b/src/scanner/process/copyright.rs
@@ -5,7 +5,10 @@ use super::binary_text::{
extract_named_author_from_binary_line, has_binary_name_like_shape, has_excessive_at_noise,
has_sufficient_alphabetic_content, is_binary_string_author_candidate, is_company_like_suffix,
};
-use crate::copyright::{self, AuthorDetection, CopyrightDetection, HolderDetection, refine_author};
+use crate::copyright::{
+ self, AuthorDetection, CopyrightDetection, HolderDetection, prepare_text_line, refine_author,
+ refine_copyright,
+};
use crate::models::{Author, Copyright, FileInfoBuilder, Holder, LineNumber};
use regex::Regex;
use std::collections::HashSet;
@@ -119,11 +122,75 @@ fn render_raw_copyright_from_text(
if rendered.is_empty() {
fallback.to_string()
+ } else if let Some(projected) = project_wrapped_copyright_value(&rendered, fallback) {
+ projected
+ } else if let Some(projected) = project_suspicious_native_copyright_value(&rendered) {
+ projected
} else {
project_native_copyright_value(&rendered, fallback)
}
}
+/// Projects the raw copyright text out of a rendered line that wraps it in a
+/// known config/markup syntax.
+///
+/// Wrappers are tried in this order against `rendered`:
+/// 1. `VALUE "LegalCopyright", "TEXT"` with an optional trailing `"\0"` token;
+/// 2. `PRODUCT_COPYRIGHT = TEXT` or `INFOPLIST_KEY_NSHumanReadableCopyright = TEXT`;
+/// 3. `applicationLegalese: TEXT`;
+/// 4. a `text="TEXT"` / `text='TEXT'` attribute anywhere in the line.
+///
+/// For (2)-(4) the value must start like a notice (`copyright`, `(c)` or `©`,
+/// ASCII case-insensitive). Without that check the unanchored `text=` pattern
+/// would replace the copyright with an unrelated attribute value whenever the
+/// line happens to contain one.
+///
+/// Returns the value passed through `prepare_text_line` with runs of
+/// whitespace collapsed to single spaces. Returns `None` when no wrapper
+/// yields a usable value or nothing is left after normalization, so the caller
+/// can fall back to its other projections. `_fallback` is not consulted.
+fn project_wrapped_copyright_value(rendered: &str, _fallback: &str) -> Option<String> {
+    static VALUE_LEGALCOPYRIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r#"(?ix)
+            ^VALUE\s+"LegalCopyright"\s*,\s*"(?P<value>[^"]+)"
+            (?:\s+"\\0")?\s*$
+            "#,
+        )
+        .expect("valid LegalCopyright wrapper regex")
+    });
+    static ASSIGNMENT_COPYRIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r#"(?ix)
+            ^(?:PRODUCT_COPYRIGHT|INFOPLIST_KEY_NSHumanReadableCopyright)
+            \s*=\s*(?P<value>.+?)\s*;?\s*$
+            "#,
+        )
+        .expect("valid assignment copyright wrapper regex")
+    });
+    static APPLICATION_LEGALESE_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(r#"(?ix)^applicationLegalese\s*:\s*(?P<value>.+?)\s*,?\s*$"#)
+            .expect("valid applicationLegalese wrapper regex")
+    });
+    static MARKUP_TEXT_COPYRIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r#"(?ix)
+            \btext\s*=\s*(?:"(?P<dq>[^"]+)"|'(?P<sq>[^']+)')
+            "#,
+        )
+        .expect("valid markup text copyright wrapper regex")
+    });
+
+    let looks_like_notice = |value: &str| {
+        let lower = value.to_ascii_lowercase();
+        lower.starts_with("copyright") || lower.starts_with("(c)") || lower.starts_with('©')
+    };
+
+    // (1) An explicit LegalCopyright statement is trusted as-is.
+    let from_value_statement = || {
+        VALUE_LEGALCOPYRIGHT_RE
+            .captures(rendered)
+            .and_then(|captures| captures.name("value"))
+            .map(|m| m.as_str().trim().to_string())
+    };
+    // (2) and (3): `KEY = TEXT` / `key: TEXT`, with optional surrounding quotes.
+    let from_assignment = || {
+        [&*ASSIGNMENT_COPYRIGHT_RE, &*APPLICATION_LEGALESE_RE]
+            .into_iter()
+            .filter_map(|regex| regex.captures(rendered))
+            .filter_map(|captures| captures.name("value"))
+            .map(|m| m.as_str().trim().trim_matches(&['\'', '"'][..]).to_string())
+            .find(|value| looks_like_notice(value))
+    };
+    // (4) A quoted `text=` attribute anywhere in the line.
+    let from_markup_attribute = || {
+        MARKUP_TEXT_COPYRIGHT_RE
+            .captures(rendered)
+            .and_then(|captures| captures.name("dq").or_else(|| captures.name("sq")))
+            .map(|m| m.as_str().trim().to_string())
+            .filter(|value| looks_like_notice(value))
+    };
+
+    let extracted = from_value_statement()
+        .or_else(from_assignment)
+        .or_else(from_markup_attribute)?;
+
+    let projected = prepare_text_line(&extracted)
+        .split_whitespace()
+        .collect::<Vec<_>>()
+        .join(" ");
+    // An empty projection would otherwise hide the caller's fallback value.
+    (!projected.is_empty()).then_some(projected)
+}
+
fn project_native_copyright_value(rendered: &str, fallback: &str) -> String {
let rendered = rendered.trim();
let fallback = fallback.trim();
@@ -134,6 +201,9 @@ fn project_native_copyright_value(rendered: &str, fallback: &str) -> String {
let rendered_lower = rendered.to_ascii_lowercase();
let fallback_lower = fallback.to_ascii_lowercase();
let Some(start) = rendered_lower.find(&fallback_lower) else {
+ if refine_copyright(rendered).as_deref() == Some(fallback) {
+ return preserve_native_suffix_for_semantic_match(rendered, fallback);
+ }
return rendered.to_string();
};
let end = start + fallback.len();
@@ -155,6 +225,40 @@ fn project_native_copyright_value(rendered: &str, fallback: &str) -> String {
}
}
+/// Builds the projected copyright text from `fallback`, re-attaching an
+/// "All rights reserved." suffix when `rendered` contains that phrase
+/// (ASCII case-insensitive).
+///
+/// A period is inserted between `fallback` and the suffix unless `fallback`
+/// already ends with `.`, `,`, `;` or `:`. Without the phrase, `fallback` is
+/// returned as-is.
+fn preserve_native_suffix_for_semantic_match(rendered: &str, fallback: &str) -> String {
+    if !rendered.to_ascii_lowercase().contains("all rights reserved") {
+        return fallback.to_string();
+    }
+
+    let already_punctuated = matches!(fallback.chars().last(), Some('.' | ',' | ';' | ':'));
+
+    let mut projected = String::from(fallback);
+    if !already_punctuated {
+        projected.push('.');
+    }
+    projected.push_str(" All rights reserved.");
+    projected
+}
+
+/// Re-refines a rendered copyright line that shows signs of trailing noise.
+///
+/// A line counts as suspicious when its ASCII-lowercased form ends with
+/// ` or` or contains one of the prose/timestamp markers checked below. Such a
+/// line is passed through `prepare_text_line` and `refine_copyright`.
+///
+/// Returns the refined text only when it differs from the trimmed input.
+/// Returns `None` when the line is not suspicious, when refinement rejects it,
+/// or when refinement changes nothing.
+fn project_suspicious_native_copyright_value(rendered: &str) -> Option<String> {
+    let lower = rendered.to_ascii_lowercase();
+    let looks_suspicious = lower.ends_with(" or")
+        || lower.contains(" and is licensed under ")
+        || lower.contains(" are derived from ")
+        || lower.contains(" it is hereby released to the")
+        || lower.contains(", et al")
+        || lower.contains(" cest ")
+        // NOTE(review): `contains` is a literal search, so this only matches the
+        // text " ce?st " verbatim; if a regex-style optional `e` was intended,
+        // this check needs to be rewritten - confirm intent.
+        || lower.contains(" ce?st ")
+        || (lower.contains(':') && lower.contains(" edt "))
+        || (lower.contains(':') && lower.contains(" cest"));
+    if !looks_suspicious {
+        return None;
+    }
+
+    let prepared = prepare_text_line(rendered);
+    let refined = refine_copyright(&prepared)?;
+    (refined != rendered.trim()).then_some(refined)
+}
+
fn normalize_native_suffix(suffix: &str) -> Option {
if suffix.is_empty() {
return Some(String::new());
diff --git a/src/scanner/process/copyright_test.rs b/src/scanner/process/copyright_test.rs
index da5b182e4..a79f336cf 100644
--- a/src/scanner/process/copyright_test.rs
+++ b/src/scanner/process/copyright_test.rs
@@ -186,6 +186,202 @@ fn test_extract_copyright_information_xml_comment_projection_preserves_native_sy
);
}
+/// A `PRODUCT_COPYRIGHT = ...` assignment yields one copyright: the raw value
+/// loses the assignment wrapper, and the normalized value also loses the
+/// "All rights reserved." clause.
+#[test]
+fn test_extract_copyright_information_strips_flutter_wrapper_assignments() {
+    let file_name = "AppInfo.xcconfig";
+    let text = "PRODUCT_COPYRIGHT = Copyright © 2014 The Flutter Authors. All rights reserved.\n";
+
+    let mut builder = FileInfoBuilder::default();
+    extract_copyright_information(&mut builder, Path::new(file_name), text, 120.0, false);
+
+    let file = builder
+        .name(file_name.to_string())
+        .base_name("AppInfo".to_string())
+        .extension(".xcconfig".to_string())
+        .path(file_name.to_string())
+        .file_type(FileType::File)
+        .size(text.len() as u64)
+        .build()
+        .expect("builder should produce file info");
+
+    assert_eq!(
+        file.copyrights.len(),
+        1,
+        "copyrights: {:?}",
+        file.copyrights
+    );
+    let detected = &file.copyrights[0];
+    assert_eq!(
+        detected.copyright,
+        "Copyright (c) 2014 The Flutter Authors. All rights reserved."
+    );
+    assert_eq!(
+        detected.normalized_copyright.as_deref(),
+        Some("Copyright (c) 2014 The Flutter Authors")
+    );
+}
+
+/// An `applicationLegalese: '...'` line yields one copyright with the key,
+/// quotes and trailing comma removed from both raw and normalized values.
+#[test]
+fn test_extract_copyright_information_strips_flutter_application_legalese_wrapper() {
+    let file_name = "about.dart";
+    let text = "applicationLegalese: '© 2014 The Flutter Authors',\n";
+
+    let mut builder = FileInfoBuilder::default();
+    extract_copyright_information(&mut builder, Path::new(file_name), text, 120.0, false);
+
+    let file = builder
+        .name(file_name.to_string())
+        .base_name("about".to_string())
+        .extension(".dart".to_string())
+        .path(file_name.to_string())
+        .file_type(FileType::File)
+        .size(text.len() as u64)
+        .build()
+        .expect("builder should produce file info");
+
+    assert_eq!(
+        file.copyrights.len(),
+        1,
+        "copyrights: {:?}",
+        file.copyrights
+    );
+    let detected = &file.copyrights[0];
+    assert_eq!(detected.copyright, "(c) 2014 The Flutter Authors");
+    assert_eq!(
+        detected.normalized_copyright.as_deref(),
+        Some("(c) 2014 The Flutter Authors")
+    );
+}
+
+// Checks that a copyright carried in storyboard markup is extracted with the
+// surrounding markup removed.
+#[test]
+fn test_extract_copyright_information_strips_flutter_storyboard_text_wrapper() {
+ // NOTE(review): this raw string holds only the two characters `\` and `n`,
+ // yet a 2018 notice is expected below; the markup fixture appears to be
+ // missing here - confirm against the repository.
+ let text = r#"\n"#;
+ let mut builder = FileInfoBuilder::default();
+
+ extract_copyright_information(
+ &mut builder,
+ Path::new("LaunchScreen.storyboard"),
+ text,
+ 120.0,
+ false,
+ );
+
+ let file = builder
+ .name("LaunchScreen.storyboard".to_string())
+ .base_name("LaunchScreen".to_string())
+ .extension(".storyboard".to_string())
+ .path("LaunchScreen.storyboard".to_string())
+ .file_type(FileType::File)
+ .size(text.len() as u64)
+ .build()
+ .expect("builder should produce file info");
+
+ assert_eq!(
+ file.copyrights.len(),
+ 1,
+ "copyrights: {:?}",
+ file.copyrights
+ );
+ assert_eq!(
+ file.copyrights[0].copyright,
+ "(c) 2018 The Flutter Authors. All rights reserved."
+ );
+ assert_eq!(
+ file.copyrights[0].normalized_copyright.as_deref(),
+ Some("(c) 2018 The Flutter Authors")
+ );
+}
+
+/// A generated icon doc line that merely mentions "copyright" must produce
+/// neither copyrights nor holders.
+#[test]
+fn test_extract_copyright_information_drops_flutter_generated_doc_false_positive() {
+    let file_name = "icons.dart";
+    // Raw string: the trailing `\n` is a literal backslash followed by `n`.
+    let text = r#"copyright — material icon named "copyright" (sharp).\n"#;
+
+    let mut builder = FileInfoBuilder::default();
+    extract_copyright_information(&mut builder, Path::new(file_name), text, 120.0, false);
+
+    let file = builder
+        .name(file_name.to_string())
+        .base_name("icons".to_string())
+        .extension(".dart".to_string())
+        .path(file_name.to_string())
+        .file_type(FileType::File)
+        .size(text.len() as u64)
+        .build()
+        .expect("builder should produce file info");
+
+    assert!(
+        file.copyrights.is_empty(),
+        "copyrights: {:?}",
+        file.copyrights
+    );
+    assert!(file.holders.is_empty(), "holders: {:?}", file.holders);
+}
+
+/// A dangling trailing "or" after the holder is removed from the raw
+/// copyright value.
+#[test]
+fn test_extract_copyright_information_strips_trailing_or_notice_bleed() {
+    let file_name = "NOTICE";
+    let text = "Copyright © 1993,2004 Sun Microsystems or\n";
+
+    let mut builder = FileInfoBuilder::default();
+    extract_copyright_information(&mut builder, Path::new(file_name), text, 120.0, false);
+
+    let file = builder
+        .name(file_name.to_string())
+        .base_name(file_name.to_string())
+        .extension("".to_string())
+        .path(file_name.to_string())
+        .file_type(FileType::File)
+        .size(text.len() as u64)
+        .build()
+        .expect("builder should produce file info");
+
+    assert_eq!(
+        file.copyrights.len(),
+        1,
+        "copyrights: {:?}",
+        file.copyrights
+    );
+    assert_eq!(
+        file.copyrights[0].copyright,
+        "Copyright (c) 1993,2004 Sun Microsystems"
+    );
+}
+
+/// A timestamp between the holder and the year is removed from the raw
+/// copyright value, and the holder is reported without it.
+#[test]
+fn test_extract_copyright_information_strips_locale_timestamp_from_raw_projection() {
+    let file_name = "action_aat_product.hh";
+    let text = "// Copyright (C) EDF R&D, lun sep 30 14:23:19 CEST 2002\n";
+
+    let mut builder = FileInfoBuilder::default();
+    extract_copyright_information(&mut builder, Path::new(file_name), text, 120.0, false);
+
+    let file = builder
+        .name(file_name.to_string())
+        .base_name("action_aat_product".to_string())
+        .extension(".hh".to_string())
+        .path(file_name.to_string())
+        .file_type(FileType::File)
+        .size(text.len() as u64)
+        .build()
+        .expect("builder should produce file info");
+
+    assert_eq!(
+        file.copyrights.len(),
+        1,
+        "copyrights: {:?}",
+        file.copyrights
+    );
+    assert_eq!(file.copyrights[0].copyright, "Copyright (c) EDF R&D 2002");
+
+    assert_eq!(file.holders.len(), 1, "holders: {:?}", file.holders);
+    assert_eq!(file.holders[0].holder, "EDF R&D");
+}
+
#[test]
fn test_binary_string_copyright_candidate_keeps_real_notice() {
let notice = "Copyright nexB and others (c) 2012";
diff --git a/testdata/copyright-golden/copyrights/misco3/correct-copyright-minpack.txt.yml b/testdata/copyright-golden/copyrights/misco3/correct-copyright-minpack.txt.yml
index 9840f3c2a..a2276f8f2 100644
--- a/testdata/copyright-golden/copyrights/misco3/correct-copyright-minpack.txt.yml
+++ b/testdata/copyright-golden/copyrights/misco3/correct-copyright-minpack.txt.yml
@@ -10,4 +10,3 @@ holders:
holders_summary:
- value: University of Chicago
count: 1
-authors: []
diff --git a/xtask/src/bin/compare_outputs.rs b/xtask/src/bin/compare_outputs.rs
index 90a15970e..00081b913 100644
--- a/xtask/src/bin/compare_outputs.rs
+++ b/xtask/src/bin/compare_outputs.rs
@@ -2060,6 +2060,35 @@ fn normalize_compare_copyright_value(value: &str) -> String {
out
}
+/// Normalizes a URL for comparison so that values differing only by trailing
+/// slashes on the part before the query/fragment compare equal.
+///
+/// The value is first passed through `normalize_text`. If the result is
+/// empty, does not parse as a URL, or is a cannot-be-a-base URL, it is
+/// returned as-is. Otherwise every trailing `/` directly before the first `?`
+/// or `#` (or before the end of the string) is removed; the query/fragment
+/// suffix is kept verbatim.
+fn normalize_compare_url_value(value: &str) -> String {
+    let normalized = normalize_text(value);
+    if normalized.is_empty() {
+        return normalized;
+    }
+
+    let Ok(parsed) = url::Url::parse(&normalized) else {
+        return normalized;
+    };
+
+    if parsed.cannot_be_a_base() {
+        return normalized;
+    }
+
+    // Split at whichever of `?` / `#` comes first. Preferring `?` would split
+    // inside the fragment of a URL such as `.../foo/#a?b` and leave the slash
+    // before `#` untrimmed.
+    let prefix_end = normalized.find(['?', '#']).unwrap_or(normalized.len());
+    let (prefix, suffix) = normalized.split_at(prefix_end);
+
+    let trimmed_prefix = prefix.trim_end_matches('/');
+    if trimmed_prefix.is_empty() {
+        normalized
+    } else {
+        format!("{trimmed_prefix}{suffix}")
+    }
+}
+
fn strip_compare_all_rights_reserved(value: &str) -> String {
let lower = value.to_ascii_lowercase();
let marker = "all rights reserved";
@@ -2107,7 +2136,10 @@ fn metric_values(entry: &Value, metric: &str) -> Vec {
.get("email")
.and_then(Value::as_str)
.map(str::to_string),
- "urls" => item.get("url").and_then(Value::as_str).map(str::to_string),
+ "urls" => item
+ .get("url")
+ .and_then(Value::as_str)
+ .map(normalize_compare_url_value),
"scan_errors" => scan_error_identity(item).map(str::to_string),
_ => None,
}?;
@@ -4943,6 +4975,24 @@ mod tests {
);
}
+    /// URL metric values have trailing slashes removed from the part before
+    /// the query string, so slash-only differences do not count.
+    #[test]
+    fn metric_values_ignore_trailing_slash_only_url_differences() {
+        let entry = json!({
+            "urls": [
+                {"url": "http://mozilla.org/MPL/2.0/"},
+                {"url": "https://example.com/foo/?a=1"}
+            ]
+        });
+
+        let expected = vec![
+            "http://mozilla.org/MPL/2.0".to_string(),
+            "https://example.com/foo?a=1".to_string(),
+        ];
+
+        assert_eq!(metric_values(&entry, "urls"), expected);
+    }
+
#[test]
fn top_level_counts_deduplicate_license_detection_reference_match_identities() {
let value = json!({