From 491e8ad2ca5b36bd39d71fb0d96c1232d8d7d4b5 Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Mon, 4 May 2026 23:30:59 +0200 Subject: [PATCH 1/5] fix(copyright): clean metadata header and notice detections Signed-off-by: Maxim Stykow --- docs/BENCHMARKS.md | 9 +- docs/scan-duration-vs-files.svg | 6 ++ .../detector/author_heuristics/extraction.rs | 48 +++++++++- .../detector/author_heuristics_test.rs | 40 +++++++++ src/copyright/detector/phases/postprocess.rs | 10 +++ .../postprocess_transforms/author_repairs.rs | 71 +++++++++++++++ src/copyright/detector/tests.rs | 19 ++++ src/copyright/refiner/author.rs | 45 +++++++++- .../refiner/authors_junk_patterns.rs | 2 + .../refiner/copyrights_junk_patterns.rs | 2 + .../refiner/holders_junk_patterns.rs | 3 + src/copyright/refiner/mod.rs | 87 +++++++++++++++++++ src/copyright/refiner/tests.rs | 81 +++++++++++++++++ .../misco3/correct-copyright-minpack.txt.yml | 1 - 14 files changed, 420 insertions(+), 4 deletions(-) diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md index 452c11401..ab3bd93a2 100644 --- a/docs/BENCHMARKS.md +++ b/docs/BENCHMARKS.md @@ -12,7 +12,7 @@ The chart below uses a log-log scatter plot: file count on the x-axis, wall-cloc ![Scan duration vs. file count for Provenant and ScanCode](scan-duration-vs-files.svg) -> Provenant is faster on 190 of 190 recorded runs, with a **12.0× median speedup** and **11.2× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.7×** on 10k+ file targets. +> Provenant is faster on 191 of 191 recorded runs, with a **12.1× median speedup** and **11.2× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.7×** on 10k+ file targets. > Generated from the benchmark timing rows in this document via `cargo run --manifest-path xtask/Cargo.toml --bin generate-benchmark-chart`. ## Current benchmark examples @@ -819,6 +819,13 @@ The quick index below links to benchmark sections. Each benchmark entry then rec - Timing: Provenant `290.44s`; ScanCode `5927.08s` - Broader Bazel and mixed-tree dependency extraction (`8202` vs `8056` packages, `1465` vs `700` dependencies) from root and vendored `MODULE.bazel`, many committed `BUILD` files, Python lockfiles, Dockerfiles, and Debian control metadata, plus direct `CITATION.cff` package visibility +##### [PX4/eigen @ 7cf1c01](https://github.com/PX4/eigen/tree/7cf1c0179eb0f5499dfc1bffbd229783a7865fe1) — **19.96× faster** + +- Files: 1,672 +- Run context: 2026-05-04 · eigen-62479 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc +- Timing: Provenant `16.12s`; ScanCode `321.68s` +- Cleaner copyright, holder, and author recovery on this manifest-free native source tree, with structured `Copyright Notice (...)` extraction, normalized `Author / Project / Copyright` header splitting, rejection of `.krazy` control-file and disclaimer-list junk, and Unicode-preserving party normalization + ##### [ValveSoftware/eigen @ e9c4315](https://github.com/ValveSoftware/eigen/tree/e9c43151265207fd3366bba21cddd61141ff402c) — **19.84× faster** - Files: 1,784 diff --git a/docs/scan-duration-vs-files.svg b/docs/scan-duration-vs-files.svg index fda55a43f..a66d00bf1 100644 --- a/docs/scan-duration-vs-files.svg +++ b/docs/scan-duration-vs-files.svg @@ -371,6 +371,9 @@ ScanCode: 272.67s aosp-mirror/platform_build @ 045a3d6 Files: 1515 ScanCode: 240.24s + PX4/eigen @ 7cf1c01 +Files: 1672 +ScanCode: 321.68s guillemj/dpkg @ 0061122 Files: 1766 ScanCode: 563.43s @@ -943,6 +946,9 @@ Provenant: 24.48s aosp-mirror/platform_build @ 045a3d6 Files: 1515 Provenant: 25.23s + PX4/eigen @ 7cf1c01 +Files: 1672 +Provenant: 16.12s guillemj/dpkg @ 0061122 Files: 1766 Provenant: 27.87s diff --git a/src/copyright/detector/author_heuristics/extraction.rs b/src/copyright/detector/author_heuristics/extraction.rs index 756222a34..18308f4b3 100644 --- a/src/copyright/detector/author_heuristics/extraction.rs +++ b/src/copyright/detector/author_heuristics/extraction.rs @@ -1282,7 +1282,9 @@ pub(in super::super) fn extract_author_colon_blocks( } } let combined_raw = segments.join(" "); - let Some(combined) = refine_author_with_optional_handle_suffix(&combined_raw) else { + let Some(combined) = refine_author_with_optional_handle_suffix(&combined_raw) + .or_else(|| refine_explicit_author_label_roster(&combined_raw)) + else { line_number = line_number.next(); continue; }; @@ -1363,6 +1365,50 @@ fn sanitize_author_colon_tail(tail: &str) -> Option { Some(trimmed.to_string()) } +fn refine_explicit_author_label_roster(candidate: &str) -> Option { + let trimmed = normalize_whitespace(candidate.trim()); + if !trimmed.contains(',') { + return None; + } + + let parts: Vec<&str> = trimmed + .split(',') + .map(str::trim) + .filter(|part| !part.is_empty()) + .collect(); + if parts.len() < 2 { + return None; + } + + let has_placeholder = parts.iter().any(|part| { + part.eq_ignore_ascii_case("package author") || part.eq_ignore_ascii_case("package authors") + }); + if has_placeholder { + return None; + } + + let first_two_rosterish = parts.iter().take(2).all(|part| { + let words: Vec<&str> = part.split_whitespace().collect(); + if words.is_empty() { + return false; + } + + if words.len() >= 2 { + return words + .iter() + .all(|word| word.chars().any(|ch| ch.is_alphabetic())); + } + + part.chars() + .all(|ch| !ch.is_alphabetic() || ch.is_ascii_uppercase()) + }); + if !first_two_rosterish { + return None; + } + + Some(trimmed) +} + fn is_author_metadata_line(line: &str) -> bool { let lower = line.trim().to_ascii_lowercase(); lower.starts_with("url:") diff --git a/src/copyright/detector/author_heuristics_test.rs b/src/copyright/detector/author_heuristics_test.rs index a9a510d48..807b1e803 100644 --- a/src/copyright/detector/author_heuristics_test.rs +++ b/src/copyright/detector/author_heuristics_test.rs @@ -158,6 +158,46 @@ fn test_detect_multiline_comment_authors_block_after_year_only_copyright() { ); } +#[test] +fn test_detect_explicit_author_label_roster_with_company_suffix() { + let input = "// Author : Antoine YESSAYAN, Paul RASCLE, EDF\n"; + let (_copyrights, _holders, authors) = super::super::detect_copyrights_from_text(input); + + assert!( + authors + .iter() + .any(|author| author.author == "Antoine YESSAYAN, Paul RASCLE, EDF"), + "authors: {authors:?}" + ); +} + +#[test] +fn test_split_author_project_copyright_metadata_block() { + let input = concat!( + "// Author : Antoine YESSAYAN, Paul RASCLE, EDF\n", + "// Project : SALOME\n", + "// Copyright : EDF 2001\n", + ); + let (copyrights, holders, authors) = super::super::detect_copyrights_from_text(input); + + assert!( + authors + .iter() + .any(|author| author.author == "Antoine YESSAYAN, Paul RASCLE, EDF"), + "authors: {authors:?}" + ); + assert!( + copyrights + .iter() + .any(|copyright| copyright.copyright == "Copyright EDF 2001"), + "copyrights: {copyrights:?}" + ); + assert!( + holders.iter().any(|holder| holder.holder == "EDF"), + "holders: {holders:?}" + ); +} + #[test] fn test_extract_collective_author_with_contributors_before_email() { let input = "authors = [\"Tokio Contributors \"]\n"; diff --git a/src/copyright/detector/phases/postprocess.rs b/src/copyright/detector/phases/postprocess.rs index a07785d5b..c8810b4e6 100644 --- a/src/copyright/detector/phases/postprocess.rs +++ b/src/copyright/detector/phases/postprocess.rs @@ -378,6 +378,16 @@ fn run_mid_pipeline_repairs( seen.dedup_new_holders(holders, h_before); seen.dedup_new_authors(authors, a_before); + let c_before = copyrights.len(); + let h_before = holders.len(); + let a_before = authors.len(); + super::postprocess_transforms::split_author_project_copyright_metadata_blocks( + copyrights, holders, authors, + ); + seen.dedup_new_copyrights(copyrights, c_before); + seen.dedup_new_holders(holders, h_before); + seen.dedup_new_authors(authors, a_before); + super::postprocess_transforms::drop_static_char_string_copyrights(content, copyrights, holders); super::postprocess_transforms::drop_combined_period_holders(holders); super::pattern_extract::drop_shadowed_prefix_holders(holders); diff --git a/src/copyright/detector/postprocess_transforms/author_repairs.rs b/src/copyright/detector/postprocess_transforms/author_repairs.rs index 8e6f0967f..520597ffa 100644 --- a/src/copyright/detector/postprocess_transforms/author_repairs.rs +++ b/src/copyright/detector/postprocess_transforms/author_repairs.rs @@ -330,3 +330,74 @@ pub fn split_written_by_copyrights_into_holder_prefixed_clauses( holders.retain(|h| h.holder != "Julian Cowley"); authors.retain(|a| a.author != "Linus Torvalds" && a.author != "Theodore Ts'o"); } + +pub fn split_author_project_copyright_metadata_blocks( + copyrights: &mut [CopyrightDetection], + holders: &mut Vec, + authors: &mut Vec, +) { + static AUTHOR_PROJECT_COPY_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?ix) + ^Author\s+(?P.+?) + (?:\s+Project\s+(?P.+?))? + \s+Copyright\s+(?P.+?)\s+(?P\d{4}) + $", + ) + .unwrap() + }); + + for copyright in copyrights.iter_mut() { + let current = copyright.copyright.clone(); + let Some(cap) = AUTHOR_PROJECT_COPY_RE.captures(current.as_str()) else { + continue; + }; + + let author_raw = cap.name("author").map(|m| m.as_str()).unwrap_or("").trim(); + let holder_raw = cap.name("holder").map(|m| m.as_str()).unwrap_or("").trim(); + let year = cap.name("year").map(|m| m.as_str()).unwrap_or("").trim(); + if author_raw.is_empty() || holder_raw.is_empty() || year.is_empty() { + continue; + } + + let author = crate::copyright::refiner::refine_author(author_raw) + .unwrap_or_else(|| normalize_whitespace(author_raw)); + if !author.is_empty() + && !authors.iter().any(|existing| { + existing.author == author && existing.start_line == copyright.start_line + }) + { + authors.push(AuthorDetection { + author, + start_line: copyright.start_line, + end_line: copyright.start_line, + }); + } + + if let Some(refined) = refine_copyright(&format!("Copyright {holder_raw} {year}")) { + copyright.copyright = refined; + } + + let Some(refined_holder) = refine_holder_in_copyright_context(holder_raw) else { + continue; + }; + + holders.retain(|holder| { + !(holder.start_line == copyright.start_line + && holder.end_line == copyright.end_line + && holder.holder.contains(author_raw)) + }); + + if !holders.iter().any(|holder| { + holder.holder == refined_holder + && holder.start_line == copyright.start_line + && holder.end_line == copyright.end_line + }) { + holders.push(HolderDetection { + holder: refined_holder, + start_line: copyright.start_line, + end_line: copyright.end_line, + }); + } + } +} diff --git a/src/copyright/detector/tests.rs b/src/copyright/detector/tests.rs index c97d07177..328809935 100644 --- a/src/copyright/detector/tests.rs +++ b/src/copyright/detector/tests.rs @@ -180,6 +180,25 @@ fn test_added_copyright_year_for_line_is_extracted() { ); } +#[test] +fn test_structured_copyright_notice_with_year_is_extracted() { + let input = "Minpack Copyright Notice (1999) University of Chicago. All rights reserved\n"; + + let (copyrights, holders, _authors) = detect_copyrights_from_text(input); + assert!( + copyrights + .iter() + .any(|c| c.copyright == "Copyright Notice (1999) University of Chicago"), + "copyrights: {:?}", + copyrights.iter().map(|c| &c.copyright).collect::>() + ); + assert!( + holders.iter().any(|h| h.holder == "University of Chicago"), + "holders: {:?}", + holders.iter().map(|h| &h.holder).collect::>() + ); +} + #[test] fn test_author_prefix_dedup_keeps_short_email_list() { let input = "Author(s): gthomas, sorin@netappi.com\nContributors: gthomas, sorin@netappi.com, andrew.lunn@ascom.ch\n"; diff --git a/src/copyright/refiner/author.rs b/src/copyright/refiner/author.rs index 96477ac25..f90780ab6 100644 --- a/src/copyright/refiner/author.rs +++ b/src/copyright/refiner/author.rs @@ -27,12 +27,14 @@ pub fn refine_author(s: &str) -> Option { a = strip_trailing_comma_year(&a); a = strip_trailing_comma_month_year(&a); a = strip_trailing_comma_email_matching_name(&a); + a = truncate_trailing_from_clause_after_angle_contact(&a); a = truncate_trailing_clause_after_contact(&a); a = strip_trailing_comma_and(&a); a = truncate_bug_reports_clause(&a); a = truncate_caller_specificaly_clause(&a); a = truncate_json_metadata_tail(&a); a = truncate_distribution_metadata_tail(&a); + a = truncate_generated_month_year_clause(&a); a = truncate_better_known_as_clause(&a); a = normalize_slash_spacing(&a); a = normalize_slash_author_pairs(&a); @@ -644,6 +646,31 @@ fn truncate_distribution_metadata_tail(s: &str) -> String { prefix.to_string() } +fn truncate_generated_month_year_clause(s: &str) -> String { + static GENERATED_MONTH_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?ix) + ^(?P.+?) + \s+Generated\s+ + (?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?) + (?:\s*,?\s*(?:19\d{2}|20\d{2}))? + \s*$", + ) + .unwrap() + }); + + let trimmed = s.trim(); + let Some(cap) = GENERATED_MONTH_RE.captures(trimmed) else { + return s.to_string(); + }; + let prefix = cap.name("prefix").map(|m| m.as_str()).unwrap_or("").trim(); + if prefix.is_empty() { + return s.to_string(); + } + + prefix.to_string() +} + fn looks_like_generated_resource_identifier(s: &str) -> bool { let trimmed = s.trim(); if trimmed.is_empty() || trimmed.contains(' ') { @@ -1052,7 +1079,7 @@ fn truncate_trailing_clause_after_contact(s: &str) -> String { let tail_lower = tail.to_ascii_lowercase(); let prose_like_tail = [ "the ", "a ", "an ", "i ", "since ", "this ", "these ", "those ", "is ", "was ", "visit ", - "for ", + "for ", "from ", ] .iter() .any(|prefix_text| tail_lower.starts_with(prefix_text)); @@ -1064,6 +1091,22 @@ fn truncate_trailing_clause_after_contact(s: &str) -> String { s.to_string() } +fn truncate_trailing_from_clause_after_angle_contact(s: &str) -> String { + static FROM_CLAUSE_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)^(?P.+?<[^>]*@[^>]*>)\s+from\b.*$").unwrap()); + + let trimmed = s.trim(); + let Some(cap) = FROM_CLAUSE_RE.captures(trimmed) else { + return s.to_string(); + }; + let prefix = cap.name("prefix").map(|m| m.as_str()).unwrap_or("").trim(); + if prefix.is_empty() { + return s.to_string(); + } + + prefix.to_string() +} + fn strip_trailing_status_works(s: &str) -> String { static STATUS_WORKS_RE: LazyLock = LazyLock::new(|| Regex::new(r"(?i)^(?P.+\bStatus)\s+works\s*$").unwrap()); diff --git a/src/copyright/refiner/authors_junk_patterns.rs b/src/copyright/refiner/authors_junk_patterns.rs index d0463aa57..8a203211a 100644 --- a/src/copyright/refiner/authors_junk_patterns.rs +++ b/src/copyright/refiner/authors_junk_patterns.rs @@ -104,6 +104,8 @@ pub(super) static AUTHORS_JUNK_PATTERNS: LazyLock> = LazyLock::new(|| r"(?i)^the task'?s numa mempolicy\b.*$", r"(?i)^the authors laboriously took the trouble\b.*$", r"(?i)^laboriously took the trouble\b.*$", + r"(?i)^the code itself\b.*$", + r"(?i)^.+\bis a software package provided by\b.*$", r"(?i)^support\s+for\b.*$", r"(?i)^addresses\s+\.+.*$", ]; diff --git a/src/copyright/refiner/copyrights_junk_patterns.rs b/src/copyright/refiner/copyrights_junk_patterns.rs index 8dfe13e48..bd81194e4 100644 --- a/src/copyright/refiner/copyrights_junk_patterns.rs +++ b/src/copyright/refiner/copyrights_junk_patterns.rs @@ -9,6 +9,8 @@ use regex::Regex; pub(super) static COPYRIGHTS_JUNK_PATTERNS: LazyLock> = LazyLock::new(|| { let patterns = [ r"(?i)^copyright \(c\)$", + r"(?i)^copyright exclude$", + r"(?i)^copyright doctrines of fair use, fair dealing, or other equivalents\.?$", r"(?i)^\(c\) by$", r"(?i)\(c\) [a-zA-Z][a-z] \(c\)", r"(?i)^copyright holder or simply", diff --git a/src/copyright/refiner/holders_junk_patterns.rs b/src/copyright/refiner/holders_junk_patterns.rs index ac59837a0..3b2b8ce09 100644 --- a/src/copyright/refiner/holders_junk_patterns.rs +++ b/src/copyright/refiner/holders_junk_patterns.rs @@ -9,6 +9,9 @@ use regex::Regex; pub(super) static HOLDERS_JUNK_PATTERNS: LazyLock> = LazyLock::new(|| { let patterns = [ r"(?i)^licenses?,\s+and/or\b", + r"(?i)^exclude$", + r"(?i)^with the$", + r"(?i)^.+,\s+.+\band\s+their\s+employees$", r"(?i)^holders?,\s*authors\b", r"(?i)^notice,\s+and\b", r"(?i)^notice,\s+in\b", diff --git a/src/copyright/refiner/mod.rs b/src/copyright/refiner/mod.rs index ade6d4b09..edc8ee849 100644 --- a/src/copyright/refiner/mod.rs +++ b/src/copyright/refiner/mod.rs @@ -441,6 +441,10 @@ static HOLDERS_JUNK: LazyLock> = LazyLock::new(|| { /// Return true if `s` matches any known junk copyright pattern. pub fn is_junk_copyright(s: &str) -> bool { + if looks_like_structured_copyright_notice_with_year(s) { + return false; + } + COPYRIGHTS_JUNK_PATTERNS.iter().any(|re| re.is_match(s)) || is_junk_copyright_scan_phrase(s) || is_junk_copyright_code_fragment(s) @@ -448,6 +452,14 @@ pub fn is_junk_copyright(s: &str) -> bool { || is_junk_c_sign_path_fragment(s) } +fn looks_like_structured_copyright_notice_with_year(s: &str) -> bool { + static STRUCTURED_NOTICE_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)^copyright\s+notice\s*\(\s*(?:19\d{2}|20\d{2})\s*\)\s+.+$").unwrap() + }); + + STRUCTURED_NOTICE_RE.is_match(s.trim()) +} + fn has_copyright_year(s: &str) -> bool { static COPYRIGHT_YEAR_RE: LazyLock = LazyLock::new(|| { Regex::new(r"(?i)\b(?:19\d{2}|20\d{2})(?:\s*[-–/]\s*(?:19\d{2}|20\d{2}|\d{2}))?\b").unwrap() @@ -1091,6 +1103,7 @@ pub fn refine_copyright(s: &str) -> Option { c = strip_trailing_paren_email_after_c_by(&c); c = strip_trailing_for_clause_after_email(&c); c = strip_trailing_at_affiliation(&c); + c = strip_trailing_single_letter_obfuscated_email_phrase(&c); c = strip_trailing_obfuscated_email_after_dash(&c); c = strip_url_token_between_years_and_holder(&c); c = strip_obfuscated_angle_emails(&c); @@ -1120,6 +1133,7 @@ pub fn refine_copyright(s: &str) -> Option { c = strip_trailing_paren_identifier(&c); c = strip_trailing_company_name_placeholder(&c); c = strip_trailing_company_co_ltd(&c); + c = strip_trailing_heavily_based_clause(&c); c = strip_trailing_obfuscated_email_in_angle_brackets_after_copyright(&c); c = strip_trailing_linux_ag_location_in_copyright(&c); c = strip_trailing_by_person_clause_after_company(&c); @@ -1211,6 +1225,7 @@ pub fn refine_copyright(s: &str) -> Option { return None; } if is_post_refine_copyright_code_fragment(&result) + || is_junk_copyright(&result) || is_junk_copyright_of_header(&result) || is_junk_copyrighted_works_header(&result) || is_junk_copyrighted_software_phrase(&result) @@ -1263,6 +1278,76 @@ fn strip_trailing_obfuscated_email_after_dash(s: &str) -> String { prefix.to_string() } +fn strip_trailing_single_letter_obfuscated_email_phrase(s: &str) -> String { + static SINGLE_LETTER_OBF_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?ix) + ^(?P.+?) + \s+ + (?P[a-z0-9][a-z0-9._-]{0,63}) + \s+a\s+ + (?P[a-z0-9][a-z0-9._-]{0,63}) + \s+ + (?Pcom|org|net|edu|gov|mil|io|co|us|uk|de|fr|jp|cn|in|info|biz|me|tv|ca|au) + \s*$", + ) + .unwrap() + }); + + let trimmed = s.trim(); + let Some(cap) = SINGLE_LETTER_OBF_RE.captures(trimmed) else { + return s.to_string(); + }; + + let prefix = cap.name("prefix").map(|m| m.as_str()).unwrap_or("").trim(); + let user = cap.name("user").map(|m| m.as_str()).unwrap_or("").trim(); + let domain = cap.name("domain").map(|m| m.as_str()).unwrap_or("").trim(); + if prefix.is_empty() || user.is_empty() || domain.is_empty() { + return s.to_string(); + } + + let prefix_tokens: HashSet = prefix + .split_whitespace() + .map(|token| { + token + .trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_') + .to_ascii_lowercase() + }) + .filter(|token| token.len() >= 2) + .collect(); + + if prefix_tokens.contains(&user.to_ascii_lowercase()) + && prefix_tokens.contains(&domain.to_ascii_lowercase()) + { + return prefix.to_string(); + } + + s.to_string() +} + +fn strip_trailing_heavily_based_clause(s: &str) -> String { + static HEAVILY_BASED_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)^(?P.+?)\s+Heavily(?:\s+based\b.*)?$").unwrap()); + + let trimmed = s.trim(); + let Some(cap) = HEAVILY_BASED_RE.captures(trimmed) else { + return s.to_string(); + }; + + let prefix = cap.name("prefix").map(|m| m.as_str()).unwrap_or("").trim(); + if prefix.is_empty() { + return s.to_string(); + } + + let lower = prefix.to_ascii_lowercase(); + if lower.starts_with("copyright") || lower.starts_with("(c)") || prefix_has_holder_words(prefix) + { + return prefix.to_string(); + } + + s.to_string() +} + fn strip_trailing_credit_file_reference_clause(s: &str) -> String { let trimmed = s.trim(); let lower = trimmed.to_ascii_lowercase(); @@ -2319,6 +2404,7 @@ fn refine_holder_impl(s: &str, in_copyright_context: bool) -> Option { h = strip_trailing_email_token(&h); h = strip_trailing_obfuscated_email_phrase_in_holder(&h); } + h = strip_trailing_single_letter_obfuscated_email_phrase(&h); h = strip_parenthesized_emails(&h); h = strip_trailing_parenthesized_url_or_domain(&h); h = strip_contributor_parens_after_org(&h); @@ -2349,6 +2435,7 @@ fn refine_holder_impl(s: &str, in_copyright_context: bool) -> Option { h = strip_trailing_paren_identifier(&h); h = strip_trailing_company_name_placeholder(&h); h = strip_trailing_confidentiality_qualifier(&h); + h = strip_trailing_heavily_based_clause(&h); if in_copyright_context { h = strip_trailing_short_surname_paren_list_in_holder(&h); diff --git a/src/copyright/refiner/tests.rs b/src/copyright/refiner/tests.rs index 0a6a6dd13..da8ded855 100644 --- a/src/copyright/refiner/tests.rs +++ b/src/copyright/refiner/tests.rs @@ -2055,3 +2055,84 @@ fn test_refine_holder_drops_css_selector_noise() { assert_eq!(refine_holder("Legal Notice"), None); assert_eq!(refine_holder("color 666666"), None); } + +#[test] +fn test_refine_author_strips_generated_month_year_and_from_lib_tail() { + assert_eq!( + refine_author("Intel Corporation Generated November"), + Some("Intel Corporation".to_string()) + ); + assert_eq!( + refine_author("L. Plagne from boost lib"), + Some("L. Plagne ".to_string()) + ); +} + +#[test] +fn test_refine_author_drops_code_itself_and_lapack_package_prose() { + assert_eq!( + refine_author( + "the code itself Stefan I. Larimore and Timothy A. Davis (davis@cise.ufl.edu), University of Florida. The algorithm was in collaboration with John Gilbert, Xerox PARC, and Esmond Ng, Oak Ridge National Laboratory" + ), + None + ); + assert_eq!( + refine_author( + "LAPACK is a software package provided by Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd" + ), + None + ); +} + +#[test] +fn test_refine_holder_drops_exclude_disclaimer_and_trailing_heavily() { + assert_eq!(refine_holder("EXCLUDE"), None); + assert_eq!(refine_holder("with the"), None); + assert_eq!( + refine_holder( + "THE UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND THEIR EMPLOYEES" + ), + None + ); + assert_eq!( + refine_holder("Konstantinos Margaritis Heavily"), + Some("Konstantinos Margaritis".to_string()) + ); +} + +#[test] +fn test_refine_holder_and_copyright_strip_single_letter_obfuscated_email_tail() { + assert_eq!( + refine_holder("Mark Borgerding mark a borgerding net"), + Some("Mark Borgerding".to_string()) + ); + assert_eq!( + refine_copyright("Copyright (c) 2009 Mark Borgerding mark a borgerding net"), + Some("Copyright (c) 2009 Mark Borgerding".to_string()) + ); +} + +#[test] +fn test_refine_copyright_drops_exclude_and_mpl_fair_use_noise() { + assert_eq!(refine_copyright("copyright EXCLUDE"), None); + assert_eq!( + refine_copyright("copyright doctrines of fair use, fair dealing, or other equivalents"), + None + ); +} + +#[test] +fn test_refine_copyright_strips_trailing_heavily_based_clause() { + assert_eq!( + refine_copyright("Copyright (c) 2010 Konstantinos Margaritis Heavily"), + Some("Copyright (c) 2010 Konstantinos Margaritis ".to_string()) + ); +} + +#[test] +fn test_refine_copyright_keeps_structured_copyright_notice_with_year() { + assert_eq!( + refine_copyright("Copyright Notice (1999) University of Chicago"), + Some("Copyright Notice (1999) University of Chicago".to_string()) + ); +} diff --git a/testdata/copyright-golden/copyrights/misco3/correct-copyright-minpack.txt.yml b/testdata/copyright-golden/copyrights/misco3/correct-copyright-minpack.txt.yml index 9840f3c2a..a2276f8f2 100644 --- a/testdata/copyright-golden/copyrights/misco3/correct-copyright-minpack.txt.yml +++ b/testdata/copyright-golden/copyrights/misco3/correct-copyright-minpack.txt.yml @@ -10,4 +10,3 @@ holders: holders_summary: - value: University of Chicago count: 1 -authors: [] From 7a500dbe397c5cf2dc11b8c6eb15d353a119b63c Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Tue, 5 May 2026 00:02:50 +0200 Subject: [PATCH 2/5] fix(xtask): ignore slash-only URL compare deltas Signed-off-by: Maxim Stykow --- xtask/src/bin/compare_outputs.rs | 52 +++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/xtask/src/bin/compare_outputs.rs b/xtask/src/bin/compare_outputs.rs index 90a15970e..00081b913 100644 --- a/xtask/src/bin/compare_outputs.rs +++ b/xtask/src/bin/compare_outputs.rs @@ -2060,6 +2060,35 @@ fn normalize_compare_copyright_value(value: &str) -> String { out } +fn normalize_compare_url_value(value: &str) -> String { + let normalized = normalize_text(value); + if normalized.is_empty() { + return normalized; + } + + let Ok(parsed) = url::Url::parse(&normalized) else { + return normalized; + }; + + if parsed.cannot_be_a_base() { + return normalized; + } + + let prefix_end = normalized + .find('?') + .or_else(|| normalized.find('#')) + .unwrap_or(normalized.len()); + let prefix = &normalized[..prefix_end]; + let suffix = &normalized[prefix_end..]; + + let trimmed_prefix = prefix.trim_end_matches('/'); + if trimmed_prefix.is_empty() { + normalized + } else { + format!("{trimmed_prefix}{suffix}") + } +} + fn strip_compare_all_rights_reserved(value: &str) -> String { let lower = value.to_ascii_lowercase(); let marker = "all rights reserved"; @@ -2107,7 +2136,10 @@ fn metric_values(entry: &Value, metric: &str) -> Vec { .get("email") .and_then(Value::as_str) .map(str::to_string), - "urls" => item.get("url").and_then(Value::as_str).map(str::to_string), + "urls" => item + .get("url") + .and_then(Value::as_str) + .map(normalize_compare_url_value), "scan_errors" => scan_error_identity(item).map(str::to_string), _ => None, }?; @@ -4943,6 +4975,24 @@ mod tests { ); } + #[test] + fn metric_values_ignore_trailing_slash_only_url_differences() { + let entry = json!({ + "urls": [ + {"url": "http://mozilla.org/MPL/2.0/"}, + {"url": "https://example.com/foo/?a=1"} + ] + }); + + assert_eq!( + metric_values(&entry, "urls"), + vec![ + "http://mozilla.org/MPL/2.0".to_string(), + "https://example.com/foo?a=1".to_string() + ] + ); + } + #[test] fn top_level_counts_deduplicate_license_detection_reference_match_identities() { let value = json!({ From 6c6aa4ebedd6657928af2c356eee42d7dd99513e Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Tue, 5 May 2026 00:40:16 +0200 Subject: [PATCH 3/5] fix(copyright): restore refiner compatibility after rebase Signed-off-by: Maxim Stykow --- src/copyright/refiner/authors_junk_patterns.rs | 1 + src/copyright/refiner/mod.rs | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/copyright/refiner/authors_junk_patterns.rs b/src/copyright/refiner/authors_junk_patterns.rs index 8a203211a..2bbde57ce 100644 --- a/src/copyright/refiner/authors_junk_patterns.rs +++ b/src/copyright/refiner/authors_junk_patterns.rs @@ -91,6 +91,7 @@ pub(super) static AUTHORS_JUNK_PATTERNS: LazyLock> = LazyLock::new(|| r"(?i)^gives unlimited$", r"(?i)^word assigns past and future changes\b", r"(?i)^maintainers\s*<[^>]+>\s+from\s+https?://\S+$", + r"(?i)^maintainers\s*<[^>]+>$", r"(?i)^versions,\s+and$", r"(?i)^versions$", r"(?i)^makes$", diff --git a/src/copyright/refiner/mod.rs b/src/copyright/refiner/mod.rs index edc8ee849..453bb97f6 100644 --- a/src/copyright/refiner/mod.rs +++ b/src/copyright/refiner/mod.rs @@ -1225,7 +1225,7 @@ pub fn refine_copyright(s: &str) -> Option { return None; } if is_post_refine_copyright_code_fragment(&result) - || is_junk_copyright(&result) + || is_explicit_junk_copyright_phrase(&result) || is_junk_copyright_of_header(&result) || is_junk_copyrighted_works_header(&result) || is_junk_copyrighted_software_phrase(&result) @@ -1239,6 +1239,15 @@ pub fn refine_copyright(s: &str) -> Option { } } +fn is_explicit_junk_copyright_phrase(s: &str) -> bool { + matches!( + s.trim().to_ascii_lowercase().as_str(), + "copyright exclude" + | "copyright doctrines of fair use, fair dealing, or other equivalents" + | "copyright doctrines of fair use, fair dealing, or other equivalents." + ) +} + fn strip_trailing_obfuscated_email_after_dash(s: &str) -> String { static TRAILING_DASH_OBF_EMAIL_RE: LazyLock = LazyLock::new(|| { Regex::new( From 95e48c79c94bf1147e0d40f82b48b4dd15076efe Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Tue, 5 May 2026 12:48:03 +0200 Subject: [PATCH 4/5] fix(copyright): tighten wrapper and notice extraction Signed-off-by: Maxim Stykow --- src/copyright/detector/mod.rs | 6 + .../pattern_extract/extraction/content.rs | 72 +++++ src/copyright/detector/phases/primary.rs | 9 + src/copyright/detector/tests.rs | 128 +++++++++ src/copyright/detector/token_utils/filters.rs | 39 +++ src/copyright/mod.rs | 1 + src/copyright/refiner/mod.rs | 254 +++++++++++++++++- src/copyright/refiner/tests.rs | 85 ++++++ src/scanner/process/copyright.rs | 106 +++++++- src/scanner/process/copyright_test.rs | 196 ++++++++++++++ 10 files changed, 893 insertions(+), 3 deletions(-) diff --git a/src/copyright/detector/mod.rs b/src/copyright/detector/mod.rs index d2bf1f9f0..a11048da7 100644 --- a/src/copyright/detector/mod.rs +++ b/src/copyright/detector/mod.rs @@ -357,6 +357,11 @@ pub fn detect_copyrights_from_text_with_deadline( postprocess_transforms::drop_shadowed_bare_c_from_year_fragments(&mut copyrights, &mut holders); drop_path_fragment_holders_from_bare_c_code_lines(&raw_lines, ©rights, &mut holders); drop_scan_only_holders_from_copyright_scan_lines(&raw_lines, ©rights, &mut holders); + drop_test_label_false_positive_copyrights_and_holders( + &raw_lines, + &mut copyrights, + &mut holders, + ); for group in &groups { extend_dash_obfuscated_email_suffixes(&raw_lines, group, &mut copyrights[..], &holders[..]); @@ -407,6 +412,7 @@ pub(super) use token_utils::collect_all_leaves; use token_utils::{ apply_written_by_for_markers, drop_path_fragment_holders_from_bare_c_code_lines, drop_scan_only_holders_from_copyright_scan_lines, + drop_test_label_false_positive_copyrights_and_holders, extract_original_author_additional_contributors, }; use tree_walk::{ diff --git a/src/copyright/detector/pattern_extract/extraction/content.rs b/src/copyright/detector/pattern_extract/extraction/content.rs index a056afca5..0621a2cef 100644 --- a/src/copyright/detector/pattern_extract/extraction/content.rs +++ b/src/copyright/detector/pattern_extract/extraction/content.rs @@ -77,6 +77,78 @@ pub fn extract_spdx_filecopyrighttext_c_without_year( (copyrights, holders) } +pub fn extract_bytestring_copyright_c_without_year( + content: &str, + existing_holders: &[HolderDetection], +) -> (Vec, Vec) { + static YEAR_RE: LazyLock = + LazyLock::new(|| Regex::new(r"\b(?:19\d{2}|20\d{2})\b").unwrap()); + + let mut copyrights = Vec::new(); + let mut holders = Vec::new(); + + let mut seen_h: HashSet<(String, usize)> = existing_holders + .iter() + .map(|h| (h.holder.clone(), h.start_line.get())) + .collect(); + + for (idx, line) in content.lines().enumerate() { + let ln = idx + 1; + let Some(raw) = extract_bytestring_copyright_literal(line) else { + continue; + }; + if raw.is_empty() || YEAR_RE.is_match(&raw) { + continue; + } + + let prepared = crate::copyright::prepare_text_line(&raw); + if let Some(refined) = refine_copyright(&prepared) { + copyrights.push(CopyrightDetection { + copyright: refined, + start_line: LineNumber::new(ln).unwrap(), + end_line: LineNumber::new(ln).unwrap(), + }); + } + + let tail = prepared + .strip_prefix("Copyright") + .unwrap_or(prepared.as_str()) + .trim() + .strip_prefix("(c)") + .unwrap_or(prepared.as_str()) + .trim(); + if let Some(holder) = refine_holder(tail) + && seen_h.insert((holder.clone(), ln)) + { + holders.push(HolderDetection { + holder, + start_line: LineNumber::new(ln).unwrap(), + end_line: LineNumber::new(ln).unwrap(), + }); + } + } + + (copyrights, holders) +} + +fn extract_bytestring_copyright_literal(line: &str) -> Option { + for prefix in ["br'", "rb'", "b'", "br\"", "rb\"", "b\""] { + let Some(start) = line.find(prefix) else { + continue; + }; + let quote = prefix.chars().last()?; + let rest = line.get(start + prefix.len()..)?; + let Some(end) = rest.find(quote) else { + continue; + }; + let candidate = rest[..end].trim(); + if candidate.to_ascii_lowercase().starts_with("copyright (c)") { + return Some(candidate.to_string()); + } + } + + None +} pub fn extract_html_meta_name_copyright_content( content: &str, existing_holders: &[HolderDetection], diff --git a/src/copyright/detector/phases/primary.rs b/src/copyright/detector/phases/primary.rs index 09e189120..9df41f7d7 100644 --- a/src/copyright/detector/phases/primary.rs +++ b/src/copyright/detector/phases/primary.rs @@ -421,6 +421,15 @@ fn run_content_and_markup_extractions( copyrights.extend(new_c); holders.extend(new_h); + let (mut new_c, new_h) = + super::super::pattern_extract::extract_bytestring_copyright_c_without_year( + content, holders, + ); + seen.dedup_new_copyrights(&mut new_c, 0); + seen.register_holders(&new_h); + copyrights.extend(new_c); + holders.extend(new_h); + let (mut new_c, new_h) = super::super::pattern_extract::extract_html_meta_name_copyright_content(content, holders); seen.dedup_new_copyrights(&mut new_c, 0); diff --git a/src/copyright/detector/tests.rs b/src/copyright/detector/tests.rs index 328809935..141cdb70b 100644 --- a/src/copyright/detector/tests.rs +++ b/src/copyright/detector/tests.rs @@ -1227,6 +1227,134 @@ fn test_detect_storyboard_text_attribute_copyright_holder() { ); } +#[test] +fn test_detect_flutter_application_legalese_assignment_strips_wrapper() { + let input = "applicationLegalese: '© 2014 The Flutter Authors',"; + let (c, h, _a) = detect_copyrights_from_text(input); + assert!( + c.iter() + .any(|cr| cr.copyright == "(c) 2014 The Flutter Authors"), + "copyrights: {:?}", + c.iter().map(|cr| &cr.copyright).collect::>() + ); + assert!( + h.iter().any(|ho| ho.holder == "The Flutter Authors"), + "holders: {:?}", + h.iter().map(|ho| &ho.holder).collect::>() + ); +} + +#[test] +fn test_detect_flutter_product_copyright_assignment_strips_wrapper() { + let input = "PRODUCT_COPYRIGHT = Copyright © 2014 The Flutter Authors. All rights reserved."; + let (c, h, _a) = detect_copyrights_from_text(input); + assert!( + c.iter() + .any(|cr| cr.copyright == "Copyright (c) 2014 The Flutter Authors"), + "copyrights: {:?}", + c.iter().map(|cr| &cr.copyright).collect::>() + ); + assert!( + h.iter().any(|ho| ho.holder == "The Flutter Authors"), + "holders: {:?}", + h.iter().map(|ho| &ho.holder).collect::>() + ); +} + +#[test] +fn test_detect_flutter_windows_legalcopyright_template_dropped() { + let input = r#"VALUE "LegalCopyright", "Copyright (C) {{year}} {{organization}}. All rights reserved." "\0""#; + let (c, h, _a) = detect_copyrights_from_text(input); + assert!(c.is_empty(), "copyrights: {c:?}"); + assert!(h.is_empty(), "holders: {h:?}"); +} + +#[test] +fn test_detect_flutter_material_icon_doc_false_positive_dropped() { + let input = r#"copyright — material icon named "copyright" (sharp)."#; + let (c, h, _a) = detect_copyrights_from_text(input); + assert!(c.is_empty(), "copyrights: {c:?}"); + assert!(h.is_empty(), "holders: {h:?}"); +} + +#[test] +fn test_detect_flutter_verify_entry_false_positive_dropped() { + let input = "verifyEntry(mapping, 'KeyC', [r'c', r'C', r'©', r'¢'], 'c');"; + let (c, h, _a) = detect_copyrights_from_text(input); + assert!(c.is_empty(), "copyrights: {c:?}"); + assert!(h.is_empty(), "holders: {h:?}"); +} + +#[test] +fn test_detect_python_bytestring_copyright_without_year() { + let input = "not line.startswith(b'Copyright (C) Microsoft Corporation') and line):"; + let (c, h, _a) = detect_copyrights_from_text(input); + assert!( + c.iter() + .any(|cr| cr.copyright == "Copyright (c) Microsoft Corporation"), + "copyrights: {:?}", + c.iter().map(|cr| &cr.copyright).collect::>() + ); + assert!( + h.iter().any(|ho| ho.holder == "Microsoft Corporation"), + "holders: {:?}", + h.iter().map(|ho| &ho.holder).collect::>() + ); +} + +#[test] +fn test_detect_table_header_labels_with_c_sign_are_dropped() { + let input = concat!( + " ---------------------- (A) Column Header\n", + " | (C) | 1 | 2 | (D) Row Header\n", + ); + let (c, h, _a) = detect_copyrights_from_text(input); + assert!(c.is_empty(), "copyrights: {c:?}"); + assert!(h.is_empty(), "holders: {h:?}"); +} + +#[test] +fn test_detect_minpack_example_prose_false_positive_dropped() { + let input = "// Tests using the examples provided by (c)minpack\n"; + let (c, h, _a) = detect_copyrights_from_text(input); + assert!(c.is_empty(), "copyrights: {c:?}"); + assert!(h.is_empty(), "holders: {h:?}"); +} + +#[test] +fn test_detect_flutter_about_dialog_snippet_keeps_clean_values() { + let input = concat!( + "// Copyright 2014 The Flutter Authors. All rights reserved.\n", + "showAboutDialog(\n", + " context: context,\n", + " applicationLegalese: '© 2014 The Flutter Authors',\n", + ");\n", + ); + let (c, h, _a) = detect_copyrights_from_text(input); + + let copyrights: Vec<&str> = c.iter().map(|cr| cr.copyright.as_str()).collect(); + let holders: Vec<&str> = h.iter().map(|ho| ho.holder.as_str()).collect(); + + assert!( + copyrights.contains(&"Copyright 2014 The Flutter Authors"), + "copyrights: {copyrights:?}" + ); + assert!( + copyrights.contains(&"(c) 2014 The Flutter Authors"), + "copyrights: {copyrights:?}" + ); + assert!( + copyrights + .iter() + .all(|cr| !cr.contains("applicationLegalese") && !cr.contains("All rights reserved")), + "copyrights: {copyrights:?}" + ); + assert!( + holders.iter().all(|ho| *ho == "The Flutter Authors"), + "holders: {holders:?}" + ); +} + #[test] fn test_detect_iso_date_holder_regression() { let input = "Copyright (c) 2006-07-24 John Boolage"; diff --git a/src/copyright/detector/token_utils/filters.rs b/src/copyright/detector/token_utils/filters.rs index a1bedb6ee..194f656a3 100644 --- a/src/copyright/detector/token_utils/filters.rs +++ b/src/copyright/detector/token_utils/filters.rs @@ -367,6 +367,45 @@ pub fn drop_path_fragment_holders_from_bare_c_code_lines( }); } +pub fn drop_test_label_false_positive_copyrights_and_holders( + raw_lines: &[&str], + copyrights: &mut Vec, + holders: &mut Vec, +) { + if raw_lines.is_empty() || (copyrights.is_empty() && holders.is_empty()) { + return; + } + + static ROW_HEADER_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?ix) + \([A-Z]\)\s+(?:row|column)\s+header + | \|\s*\([A-Z]\)\s*\|\s*\d+\s*\|\s*\d+\s*\|\s*\([A-Z]\)\s+row\s+header + ", + ) + .unwrap() + }); + static TEST_EXAMPLES_COPY_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)tests\s+using\s+the\s+examples\s+provided\s+by\s+\(c\)").unwrap() + }); + + let bad_lines: HashSet = raw_lines + .iter() + .enumerate() + .filter_map(|(idx, line)| { + (ROW_HEADER_RE.is_match(line) || TEST_EXAMPLES_COPY_RE.is_match(line)) + .then_some(idx + 1) + }) + .collect(); + + if bad_lines.is_empty() { + return; + } + + copyrights.retain(|c| !(c.start_line == c.end_line && bad_lines.contains(&c.start_line.get()))); + holders.retain(|h| !(h.start_line == h.end_line && bad_lines.contains(&h.start_line.get()))); +} + /// Tags whose filtering should cause adjacent commas to be considered orphaned. /// Only year-related tags: commas between years (e.g. "2006, 2007") become /// orphaned when the years are removed. Email/URL commas are intentionally diff --git a/src/copyright/mod.rs b/src/copyright/mod.rs index 9b29a0a9a..bc45ac8d3 100644 --- a/src/copyright/mod.rs +++ b/src/copyright/mod.rs @@ -28,6 +28,7 @@ mod refiner; mod types; pub use credits::{detect_credits_authors, is_credits_file}; +pub(crate) use prepare::prepare_text_line; pub(crate) use refiner::refine_author; pub use refiner::refine_copyright; pub use types::{AuthorDetection, CopyrightDetection, HolderDetection}; diff --git a/src/copyright/refiner/mod.rs b/src/copyright/refiner/mod.rs index 453bb97f6..628775633 100644 --- a/src/copyright/refiner/mod.rs +++ b/src/copyright/refiner/mod.rs @@ -14,6 +14,7 @@ use std::sync::LazyLock; use regex::Regex; use super::candidates::strip_balanced_edge_parens; +use super::prepare::prepare_text_line; mod authors_junk_patterns; mod copyrights_junk_patterns; mod holders_junk_patterns; @@ -519,8 +520,10 @@ fn is_junk_copyright_code_fragment(s: &str) -> bool { || lower == "copyright void" || trimmed.contains("??") || contains_member_access_code_token(trimmed) + || contains_code_string_literal_fragment(trimmed) || contains_unicode_escape_token_run(trimmed) || contains_html_entity_decoder_artifact(trimmed) + || contains_markup_tag_fragment(trimmed) || contains_xml_markup_declaration_token(trimmed) || contains_regex_or_template_marker(trimmed) || has_windows_versioninfo_markers @@ -580,10 +583,14 @@ fn is_junk_holder_code_fragment(s: &str) -> bool { || contains_embedded_file_reference_prose(trimmed) || lower.contains("icondata") || lower.contains("authors.append") + || lower == "void" + || looks_like_parenthesized_ui_descriptor(trimmed) || contains_member_access_code_token(trimmed) || contains_code_call_fragment(trimmed) + || contains_code_string_literal_fragment(trimmed) || contains_unicode_escape_token_run(trimmed) || contains_html_entity_decoder_artifact(trimmed) + || contains_markup_tag_fragment(trimmed) || contains_xml_markup_declaration_token(trimmed) || contains_regex_or_template_marker(trimmed) || has_windows_versioninfo_markers @@ -667,6 +674,7 @@ fn contains_html_entity_decoder_artifact(s: &str) -> bool { let lower = s.to_ascii_lowercase(); lower.contains("u00a0") || lower.contains("hellip") + || lower.contains("x2014") || lower.contains("x2f") || lower.contains("reg 174") || lower.contains("copy 169") @@ -692,6 +700,29 @@ fn contains_generated_resource_token(s: &str) -> bool { ASSET_RE.is_match(trimmed) } +fn contains_markup_tag_fragment(s: &str) -> bool { + static MARKUP_TAG_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)]*>|<[!?][^>]*>").expect("valid markup tag fragment regex") + }); + + let trimmed = s.trim(); + let lower = trimmed.to_ascii_lowercase(); + if trimmed.contains('@') + || lower.contains("www.") + || lower.contains(".com") + || lower.contains(".org") + || lower.contains(".net") + || lower.contains(".edu") + || lower.contains(".gov") + || lower.contains(".io") + || lower.contains(".dev") + { + return false; + } + + MARKUP_TAG_RE.is_match(trimmed) || trimmed.contains("&#") +} + fn contains_member_access_code_token(s: &str) -> bool { let trimmed = s.trim(); let lower = trimmed.to_ascii_lowercase(); @@ -716,6 +747,25 @@ fn contains_member_access_code_token(s: &str) -> bool { MEMBER_ACCESS_RE.is_match(trimmed) } +fn contains_code_string_literal_fragment(s: &str) -> bool { + let trimmed = s.trim(); + let lower = trimmed.to_ascii_lowercase(); + + lower.contains("r'") + || lower.contains("r\"") + || lower.contains("\"\\0\"") + || lower.contains("'\\0'") +} + +fn looks_like_parenthesized_ui_descriptor(s: &str) -> bool { + static UI_DESCRIPTOR_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)^\((?:sharp|round|rounded|outline|outlined|filled)\)$") + .expect("valid UI descriptor regex") + }); + + UI_DESCRIPTOR_RE.is_match(s.trim()) +} + fn is_post_refine_copyright_code_fragment(s: &str) -> bool { let trimmed = s.trim(); let lower = trimmed.to_ascii_lowercase(); @@ -723,7 +773,9 @@ fn is_post_refine_copyright_code_fragment(s: &str) -> bool { contains_windows_versioninfo_token(trimmed) || contains_member_access_code_token(trimmed) || contains_code_call_fragment(trimmed) + || contains_code_string_literal_fragment(trimmed) || contains_unicode_escape_token_run(trimmed) + || contains_markup_tag_fragment(trimmed) || lower.contains("public void") || lower.contains("get set") || lower.contains("assert.equal") @@ -789,6 +841,7 @@ fn is_explicit_generic_field_label_token(s: &str) -> bool { "description", "direction", "options", + "organization", "owner_name", "params", "placeholder", @@ -796,6 +849,7 @@ fn is_explicit_generic_field_label_token(s: &str) -> bool { "ref", "reviewers", "schema", + "sharp", "source", "text", "timeago", @@ -803,6 +857,11 @@ fn is_explicit_generic_field_label_token(s: &str) -> bool { "tooltip", "unique", "username", + "round", + "rounded", + "outline", + "outlined", + "filled", ]) }); @@ -1027,7 +1086,11 @@ fn is_obvious_prose_fragment(s: &str) -> bool { if lower.starts_with("not by ") { return false; } - if lower.contains("code sample for") { + if lower.contains("code sample for") + || lower.contains("tests using the examples provided by") + || lower.ends_with("row header") + || lower.ends_with("column header") + { return true; } @@ -1092,6 +1155,7 @@ pub fn refine_copyright(s: &str) -> Option { return None; } let mut c = original.clone(); + c = strip_known_copyright_wrappers(&c); c = strip_trailing_quote_before_email(&c); c = normalize_b_dot_angle_emails(&c); c = strip_nickname_quotes(&c); @@ -1136,12 +1200,14 @@ pub fn refine_copyright(s: &str) -> Option { c = strip_trailing_heavily_based_clause(&c); c = strip_trailing_obfuscated_email_in_angle_brackets_after_copyright(&c); c = strip_trailing_linux_ag_location_in_copyright(&c); + c = strip_trailing_locale_timestamp_before_terminal_year_in_copyright(&c); c = strip_trailing_by_person_clause_after_company(&c); c = strip_trailing_division_of_company_suffix(&c); c = strip_trailing_paren_at_without_domain(&c); c = strip_trailing_inc_after_today_year_placeholder(&c); c = truncate_trailing_boilerplate(&c); c = strip_trailing_everyone_is_permitted_to_copy_clause(&c); + c = strip_trailing_all_rights_reserved_clause(&c); c = strip_trailing_author_label(&c); c = strip_trailing_credit_file_reference_clause(&c); c = strip_trailing_isc_after_inc(&c); @@ -1217,7 +1283,7 @@ pub fn refine_copyright(s: &str) -> Option { let result_upper = result.to_ascii_uppercase(); if result_upper.contains("COPYRIGHT") && result_upper.contains("YEAR") - && result_upper.contains("YOUR NAME") + && (result_upper.contains("YOUR NAME") || result_upper.contains("ORGANIZATION")) { return None; } @@ -1248,6 +1314,107 @@ fn is_explicit_junk_copyright_phrase(s: &str) -> bool { ) } +fn strip_known_copyright_wrappers(s: &str) -> String { + static VALUE_LEGALCOPYRIGHT_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r#"(?ix) + ^VALUE\s+"LegalCopyright"\s*,\s*"(?P[^"]+)" + (?:\s+"\\0")?\s*$ + "#, + ) + .expect("valid LegalCopyright wrapper regex") + }); + static ASSIGNMENT_COPYRIGHT_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r#"(?ix) + ^(?:PRODUCT_COPYRIGHT|INFOPLIST_KEY_NSHumanReadableCopyright) + \s*=\s*(?P.+?)\s*;?\s*$ + "#, + ) + .expect("valid assignment copyright wrapper regex") + }); + static APPLICATION_LEGALESE_RE: LazyLock = LazyLock::new(|| { + Regex::new(r#"(?ix)^applicationLegalese\s*:\s*(?P.+?)\s*,?\s*$"#) + .expect("valid applicationLegalese wrapper regex") + }); + static MARKUP_TEXT_COPYRIGHT_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r#"(?ix) + \btext\s*=\s*(?:"(?P[^"]+)"|'(?P[^']+)') + "#, + ) + .expect("valid markup text copyright wrapper regex") + }); + + let trimmed = s.trim(); + if let Some(captures) = VALUE_LEGALCOPYRIGHT_RE.captures(trimmed) { + let value = captures + .name("value") + .map(|m| m.as_str()) + .unwrap_or("") + .trim(); + if !value.is_empty() { + return prepare_text_line(value).trim().to_string(); + } + } + + for regex in [&*ASSIGNMENT_COPYRIGHT_RE, &*APPLICATION_LEGALESE_RE] { + if let Some(captures) = regex.captures(trimmed) { + let value = captures + .name("value") + .map(|m| m.as_str()) + .unwrap_or("") + .trim() + .trim_matches(&['\'', '"'][..]); + if value.starts_with("Copyright") || value.starts_with('©') { + return prepare_text_line(value).trim().to_string(); + } + } + } + + if let Some(captures) = MARKUP_TEXT_COPYRIGHT_RE.captures(trimmed) { + let value = captures + .name("dq") + .or_else(|| captures.name("sq")) + .map(|m| m.as_str()) + .unwrap_or("") + .trim(); + if value.starts_with("Copyright") || value.starts_with('©') { + return prepare_text_line(value).trim().to_string(); + } + } + + s.to_string() +} + +fn strip_trailing_all_rights_reserved_clause(s: &str) -> String { + static ALL_RIGHTS_RESERVED_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?ix)^(?P.+?)\.?\s+all\s+rights\s+reserved\.?$") + .expect("valid all rights reserved regex") + }); + + let trimmed = s.trim(); + let Some(captures) = ALL_RIGHTS_RESERVED_RE.captures(trimmed) else { + return s.to_string(); + }; + + let prefix = captures + .name("prefix") + .map(|m| m.as_str()) + .unwrap_or("") + .trim(); + let lower = prefix.to_ascii_lowercase(); + if prefix.is_empty() + || !(lower.starts_with("copyright") || lower.starts_with("(c)") || lower.starts_with('©')) + { + return s.to_string(); + } + + prefix + .trim_end_matches(&[' ', '.', ',', ';', ':'][..]) + .to_string() +} + fn strip_trailing_obfuscated_email_after_dash(s: &str) -> String { static TRAILING_DASH_OBF_EMAIL_RE: LazyLock = LazyLock::new(|| { Regex::new( @@ -1526,6 +1693,40 @@ fn strip_trailing_linux_ag_location_in_copyright(s: &str) -> String { s.to_string() } +fn strip_trailing_locale_timestamp_before_terminal_year_in_copyright(s: &str) -> String { + static LOCALE_TIMESTAMP_COPY_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?ix) + ^(?P.+?),\s* + [a-z]{3}\s+[a-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+[A-Z]{2,5} + (?:\s+(?P\d{4}))?\s*$ + ", + ) + .unwrap() + }); + + let trimmed = s.trim(); + let Some(cap) = LOCALE_TIMESTAMP_COPY_RE.captures(trimmed) else { + return s.to_string(); + }; + let prefix = cap.name("prefix").map(|m| m.as_str()).unwrap_or("").trim(); + if prefix.is_empty() { + return s.to_string(); + } + let lower = prefix.to_ascii_lowercase(); + if !(lower.starts_with("copyright") || lower.starts_with("(c)") || lower.starts_with('©')) { + return s.to_string(); + } + if let Some(year) = cap + .name("year") + .map(|m| m.as_str()) + .filter(|y| !y.is_empty()) + { + return format!("{} {}", prefix.trim_end_matches(&[',', ' '][..]), year); + } + prefix.trim_end_matches(&[',', ' '][..]).to_string() +} + fn strip_trailing_quote_before_email(s: &str) -> String { static TRAILING_QUOTE_BEFORE_EMAIL_RE: LazyLock = LazyLock::new(|| { Regex::new( @@ -1918,6 +2119,30 @@ fn strip_trailing_component_descriptor_from_holder(s: &str) -> String { s.to_string() } +fn strip_trailing_holder_prose_clause(s: &str) -> String { + let trimmed = s.trim(); + let lower = trimmed.to_ascii_lowercase(); + for marker in [ + " and it is hereby released to the", + " it is hereby released to the", + ", are derived from ", + " are derived from ", + " and is licensed under ", + " and labeled as such", + ] { + if let Some(idx) = lower.find(marker) { + let prefix = trimmed[..idx] + .trim_end_matches(&[',', ';', ':', ' '][..]) + .trim(); + if !prefix.is_empty() && prefix_has_holder_words(prefix) { + return prefix.to_string(); + } + } + } + + s.to_string() +} + fn strip_trailing_or_suffix(s: &str) -> String { static TRAILING_OR_RE: LazyLock = LazyLock::new(|| Regex::new(r"(?i)^(?Pcopyright\b.+?)\s+or\s*$").unwrap()); @@ -2420,6 +2645,7 @@ fn refine_holder_impl(s: &str, in_copyright_context: bool) -> Option { h = normalize_comma_spacing(&h); h = normalize_angle_bracket_comma_spacing(&h); h = strip_trailing_linux_ag_location(&h); + h = strip_trailing_locale_timestamp_in_holder(&h); h = strip_trailing_but_suffix(&h); if had_paren_email { h = remove_comma_between_person_and_company_suffix(&h); @@ -2462,6 +2688,7 @@ fn refine_holder_impl(s: &str, in_copyright_context: bool) -> Option { h = remove_some_extra_words_and_punct(&h); h = strip_trailing_incomplete_as_represented_by(&h); + h = strip_trailing_holder_prose_clause(&h); h = h.trim_matches(&['/', ' ', '~'][..]).to_string(); h = refine_names(&h, prefixes); h = strip_trailing_company_co_ltd(&h); @@ -2606,6 +2833,29 @@ fn strip_trailing_linux_ag_location(s: &str) -> String { s.to_string() } +fn strip_trailing_locale_timestamp_in_holder(s: &str) -> String { + static LOCALE_TIMESTAMP_HOLDER_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?ix) + ^(?P.+?),\s* + [a-z]{3}\s+[a-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+[A-Z]{2,5} + (?:\s+\d{4})?\s*$ + ", + ) + .unwrap() + }); + + let trimmed = s.trim(); + let Some(cap) = LOCALE_TIMESTAMP_HOLDER_RE.captures(trimmed) else { + return s.to_string(); + }; + let prefix = cap.name("prefix").map(|m| m.as_str()).unwrap_or("").trim(); + if prefix.is_empty() || !prefix_has_holder_words(prefix) { + return s.to_string(); + } + prefix.trim_end_matches(&[',', ' '][..]).to_string() +} + fn remove_comma_between_person_and_company_suffix(s: &str) -> String { static COMMA_CORP_RE: LazyLock = LazyLock::new(|| { Regex::new(r"^(?P[\p{Lu}][^,]{2,64}(?:\s+[\p{Lu}][^,]{2,64})+)\s*,\s*(?P[^,]{2,64}\b(?:Corp\.?|Corporation|Inc\.?|Ltd\.?))\s*$") diff --git a/src/copyright/refiner/tests.rs b/src/copyright/refiner/tests.rs index da8ded855..ea5c30f3b 100644 --- a/src/copyright/refiner/tests.rs +++ b/src/copyright/refiner/tests.rs @@ -1548,6 +1548,55 @@ fn test_refine_copyright_drops_versioninfo_and_dtd_junk() { ); } +#[test] +fn test_refine_copyright_strips_flutter_wrapper_context() { + assert_eq!( + refine_copyright("applicationLegalese: '© 2014 The Flutter Authors',"), + Some("(c) 2014 The Flutter Authors".to_string()) + ); + assert_eq!( + refine_copyright( + "PRODUCT_COPYRIGHT = Copyright © 2014 The Flutter Authors. All rights reserved." + ), + Some("Copyright (c) 2014 The Flutter Authors".to_string()) + ); + assert_eq!( + refine_copyright( + r#"