From bc621d706ca95d790abfa83a850f4d4c0d84d5c8 Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Wed, 29 Apr 2026 12:02:31 +0200 Subject: [PATCH 1/2] fix(copyright): recognize malformed year ranges in token utils Signed-off-by: Maxim Stykow --- .../detector/token_utils/builders.rs | 10 ++++---- src/copyright/detector/token_utils/filters.rs | 23 +++++++++++++++++-- src/copyright/detector/token_utils_test.rs | 23 +++++++++++++++++++ 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/src/copyright/detector/token_utils/builders.rs b/src/copyright/detector/token_utils/builders.rs index 42ed0d377..a9383b963 100644 --- a/src/copyright/detector/token_utils/builders.rs +++ b/src/copyright/detector/token_utils/builders.rs @@ -13,8 +13,8 @@ use crate::copyright::types::{ use crate::models::LineNumber; use super::{ - collect_filtered_leaves, collect_holder_filtered_leaves, normalized_tokens_to_string, - strip_all_rights_reserved, + collect_filtered_leaves, collect_holder_filtered_leaves, is_year_like_token, + normalized_tokens_to_string, strip_all_rights_reserved, }; pub fn extract_original_author_additional_contributors( @@ -96,7 +96,7 @@ pub fn build_holder_from_node( let filtered = strip_all_rights_reserved(leaves); let allow_single_word_contributors = collect_all_leaves(node) .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| is_year_like_token(t)); build_holder_from_tokens(&filtered, allow_single_word_contributors) } @@ -133,9 +133,7 @@ pub fn build_holder_from_copyright_node( }); } - let allow_single_word_contributors = all_leaves - .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + let allow_single_word_contributors = all_leaves.iter().any(|t| is_year_like_token(t)); build_holder_from_tokens(&filtered, allow_single_word_contributors) } diff --git a/src/copyright/detector/token_utils/filters.rs b/src/copyright/detector/token_utils/filters.rs index 7dbb733b0..9c877d5e0 100644 --- a/src/copyright/detector/token_utils/filters.rs +++ b/src/copyright/detector/token_utils/filters.rs @@ -374,6 +374,25 @@ pub fn drop_path_fragment_holders_from_bare_c_code_lines( /// (e.g. "Name , Name2"). pub const YEAR_LIKE_POS_TAGS: &[PosTag] = &[PosTag::Yr, PosTag::YrPlus, PosTag::BareYr]; +static MALFORMED_COPYRIGHT_YEAR_RANGE_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?ix)^ + (?:19\d{2}|20\d{2}|19\d{3}|20\d{3}) + (?: + [\.,/\-~] + (?:19\d{2}|20\d{2}|19\d{3}|20\d{3}|\d{2}|present|current_year|today\.year) + )+ + $", + ) + .unwrap() +}); + +pub fn is_year_like_token(token: &Token) -> bool { + YEAR_LIKE_POS_TAGS.contains(&token.tag) + || (token.tag == PosTag::Cd + && MALFORMED_COPYRIGHT_YEAR_RANGE_RE.is_match(token.value.as_str())) +} + /// Year-related tree labels whose filtering orphans adjacent commas. pub const YEAR_LIKE_LABELS: &[TreeLabel] = &[TreeLabel::YrRange, TreeLabel::YrAnd]; @@ -427,7 +446,7 @@ fn collect_holder_filtered_leaves_inner<'a>( match node { ParseNode::Leaf(token) => { if ignored_pos_tags.contains(&token.tag) { - if YEAR_LIKE_POS_TAGS.contains(&token.tag) { + if is_year_like_token(token) { state.last_was_year_filtered = true; } if matches!(token.tag, PosTag::Email | PosTag::Url | PosTag::Url2) { @@ -513,7 +532,7 @@ pub fn filter_holder_tokens_with_state<'a>( for (i, &token) in tokens.iter().enumerate() { if non_holder_tags.contains(&token.tag) { - if YEAR_LIKE_POS_TAGS.contains(&token.tag) { + if is_year_like_token(token) { last_was_year_filtered = true; } if matches!(token.tag, PosTag::Email | PosTag::Url | PosTag::Url2) { diff --git a/src/copyright/detector/token_utils_test.rs b/src/copyright/detector/token_utils_test.rs index 4ba654d5b..1287292d2 100644 --- a/src/copyright/detector/token_utils_test.rs +++ b/src/copyright/detector/token_utils_test.rs @@ -104,3 +104,26 @@ fn test_collect_filtered_leaves_filters_tree_labels() { assert_eq!(leaves[0].value, "Copyright"); assert_eq!(leaves[1].value, "Acme"); } + +#[test] +fn test_is_year_like_token_accepts_malformed_year_typos_in_ranges() { + let malformed_first = Token { + value: "20010-2011".to_string(), + tag: PosTag::Cd, + start_line: LineNumber::ONE, + }; + let malformed_second = Token { + value: "2010-20224".to_string(), + tag: PosTag::Cd, + start_line: LineNumber::ONE, + }; + let unrelated_number = Token { + value: "12345-67890".to_string(), + tag: PosTag::Cd, + start_line: LineNumber::ONE, + }; + + assert!(is_year_like_token(&malformed_first)); + assert!(is_year_like_token(&malformed_second)); + assert!(!is_year_like_token(&unrelated_number)); +} From 0393cdbf07cd3997a2f70e95ab811cdf60f98aa5 Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Wed, 29 Apr 2026 12:03:52 +0200 Subject: [PATCH 2/2] fix(copyright): recover malformed first-year range detections Signed-off-by: Maxim Stykow --- src/copyright/detector/mod.rs | 3 +- src/copyright/detector/tests.rs | 19 ++++++++ src/copyright/detector/tree_walk/copyright.rs | 44 +++++++++---------- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/copyright/detector/mod.rs b/src/copyright/detector/mod.rs index 1e5ab7995..e1e51f9ac 100644 --- a/src/copyright/detector/mod.rs +++ b/src/copyright/detector/mod.rs @@ -98,8 +98,7 @@ fn analyze_tree(nodes: &[ParseNode]) -> TreeAnalysis { token.tag, PosTag::Auths | PosTag::AuthDot | PosTag::Contributors | PosTag::Commit ); - analysis.has_year_token |= - matches!(token.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr); + analysis.has_year_token |= token_utils::is_year_like_token(token); let line = token.start_line.get(); match analysis.single_line { diff --git a/src/copyright/detector/tests.rs b/src/copyright/detector/tests.rs index f3cc6a314..a1cfe497b 100644 --- a/src/copyright/detector/tests.rs +++ b/src/copyright/detector/tests.rs @@ -873,6 +873,25 @@ fn test_detect_copyright_with_short_holder_and_trailing_punct_email() { assert_eq!(h[0].holder, "bgme"); } +#[test] +fn test_detect_copyright_with_malformed_first_year_range() { + let input = "Copyright (C) 20010-2011 Hauke Heibel "; + let (copyrights, holders, _authors) = detect_copyrights_from_text(input); + + assert!( + copyrights.iter().any(|c| { + c.copyright == "Copyright (c) 20010-2011 Hauke Heibel " + }), + "copyrights: {:?}", + copyrights.iter().map(|c| &c.copyright).collect::>() + ); + assert!( + holders.iter().any(|h| h.holder == "Hauke Heibel"), + "holders: {:?}", + holders.iter().map(|h| &h.holder).collect::>() + ); +} + #[test] fn test_detect_copyright_compact_c_parens_with_lowercase_holder_and_email() { let input = "Copyright(c) 2014 dead_horse "; diff --git a/src/copyright/detector/tree_walk/copyright.rs b/src/copyright/detector/tree_walk/copyright.rs index 351d5d0a6..939df4150 100644 --- a/src/copyright/detector/tree_walk/copyright.rs +++ b/src/copyright/detector/tree_walk/copyright.rs @@ -171,7 +171,7 @@ pub fn extract_from_tree_nodes( let allow_single_word_contributors = detector::token_utils::collect_all_leaves(node) .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); let prefix_token = get_orphaned_copy_prefix(tree, i); let not_prefix = get_orphaned_not_prefix(tree, i, node, allow_not_copyrighted_prefix); let (mut trailing_tokens, mut skip) = collect_trailing_orphan_tokens(node, tree, i + 1); @@ -429,9 +429,9 @@ pub fn extract_from_tree_nodes( if !has_holder && i + 1 < tree.len() { let copyright_ends_with_year = { let leaves = detector::token_utils::collect_all_leaves(node); - leaves.last().is_some_and(|t| { - matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr) - }) + leaves + .last() + .is_some_and(|t| detector::token_utils::is_year_like_token(t)) }; let next_node = &tree[i + 1]; let next_line_ok = { @@ -724,7 +724,7 @@ pub fn extract_from_tree_nodes( if t.tag == PosTag::Cc && t.value == "," { continue; } - if detector::token_utils::YEAR_LIKE_POS_TAGS.contains(&t.tag) { + if detector::token_utils::is_year_like_token(t) { found = true; } break; @@ -762,9 +762,7 @@ pub fn extract_from_tree_nodes( holder_tokens_mini.extend(&node_holder_mini); let node_ends_with_year_mini = detector::token_utils::collect_all_leaves(node) .last() - .is_some_and(|t| { - detector::token_utils::YEAR_LIKE_POS_TAGS.contains(&t.tag) - }); + .is_some_and(|t| detector::token_utils::is_year_like_token(t)); holder_tokens_mini.extend( detector::token_utils::filter_holder_tokens_with_state( &trailing_tokens, @@ -856,7 +854,7 @@ pub fn extract_from_tree_nodes( cr_tokens.extend(&name_leaves); let allow_single_word_contributors = cr_tokens .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); if let Some(det) = detector::token_utils::build_copyright_from_tokens(&cr_tokens) { copyrights.push(det); } @@ -1120,7 +1118,7 @@ fn get_trailing_year_range<'a>( } let node_has_year = detector::token_utils::collect_all_leaves(copyright_node) .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); if node_has_year { return None; } @@ -1287,7 +1285,7 @@ fn is_orphan_copy_name_match(node: &ParseNode) -> bool { let leaves = detector::token_utils::collect_all_leaves(node); leaves .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)) + .any(|t| detector::token_utils::is_year_like_token(t)) } _ => false, } @@ -1369,7 +1367,7 @@ pub fn should_start_absorbing( let same_line = last_line.is_some_and(|l| l == token.start_line); let node_has_year = detector::token_utils::collect_all_leaves(copyright_node) .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); let has_holder_like_tokens = detector::token_utils::collect_all_leaves(copyright_node) .iter() .any(|t| { @@ -1409,7 +1407,7 @@ pub fn should_start_absorbing( if same_line && has_author_keyword { let node_has_year = detector::token_utils::collect_all_leaves(copyright_node) .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); if node_has_year { return true; } @@ -1591,7 +1589,7 @@ pub fn should_start_absorbing( && last_line.is_some_and(|l| leaves.first().is_some_and(|t| t.start_line == l)); let node_has_year = detector::token_utils::collect_all_leaves(copyright_node) .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); let last_tag = detector::token_utils::collect_all_leaves(copyright_node) .last() .map(|t| t.tag); @@ -1624,7 +1622,7 @@ pub fn should_start_absorbing( if last_leaf_ends_with_comma(copyright_node) { let node_has_year = detector::token_utils::collect_all_leaves(copyright_node) .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); if node_has_year { let is_name_like_first = match first { ParseNode::Leaf(token) => matches!( @@ -1911,7 +1909,7 @@ fn collect_following_copyright_clause_tokens( let skip = j - start; let has_year = tokens .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); if !has_year { return (Vec::new(), 0); @@ -1950,7 +1948,7 @@ fn is_year_only_copyright_clause_node(node: &ParseNode) -> bool { let leaves = detector::token_utils::collect_all_leaves(node); let has_year = leaves .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); if !has_year { return false; } @@ -2065,7 +2063,7 @@ fn merge_year_only_copyright_clause_with_preceding_copyrighted_by( let holder_tokens = detector::token_utils::strip_all_rights_reserved(holder_tokens); let allow_single_word_contributors = holder_tokens .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); let holder_det = detector::token_utils::build_holder_from_tokens( &holder_tokens, allow_single_word_contributors, @@ -2202,7 +2200,7 @@ pub fn extract_bare_copyrights( let name_leaves = detector::token_utils::strip_all_rights_reserved(name_leaves); let allow_single_word_contributors = name_leaves .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); cr_tokens.extend(&name_leaves); let mut extra_skip = 0usize; @@ -2395,7 +2393,7 @@ pub fn extract_from_spans( && copy_start == copy_idx && all_leaves[copy_idx..i] .iter() - .any(|t| detector::token_utils::YEAR_LIKE_POS_TAGS.contains(&t.tag)) + .any(|t| detector::token_utils::is_year_like_token(t)) && !all_leaves[copy_idx..i].iter().any(|t| { matches!( t.tag, @@ -2441,7 +2439,7 @@ pub fn extract_from_spans( if span.len() > 1 { let allow_single_word_contributors = span .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); let filtered = detector::token_utils::strip_all_rights_reserved_slice(span); if let Some(det) = detector::token_utils::build_copyright_from_tokens(&filtered) { copyrights.push(det); @@ -2638,7 +2636,7 @@ pub fn extract_copyrights_from_spans( && copy_start == copy_idx && all_leaves[copy_idx..i] .iter() - .any(|t| detector::token_utils::YEAR_LIKE_POS_TAGS.contains(&t.tag)) + .any(|t| detector::token_utils::is_year_like_token(t)) && !all_leaves[copy_idx..i].iter().any(|t| { matches!( t.tag, @@ -2684,7 +2682,7 @@ pub fn extract_copyrights_from_spans( if span.len() > 1 { let allow_single_word_contributors = span .iter() - .any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)); + .any(|t| detector::token_utils::is_year_like_token(t)); let filtered = detector::token_utils::strip_all_rights_reserved_slice(span); if let Some(det) = detector::token_utils::build_copyright_from_tokens(&filtered) {