Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions src/copyright/detector/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,7 @@ fn analyze_tree(nodes: &[ParseNode]) -> TreeAnalysis {
token.tag,
PosTag::Auths | PosTag::AuthDot | PosTag::Contributors | PosTag::Commit
);
analysis.has_year_token |=
matches!(token.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr);
analysis.has_year_token |= token_utils::is_year_like_token(token);

let line = token.start_line.get();
match analysis.single_line {
Expand Down
19 changes: 19 additions & 0 deletions src/copyright/detector/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,25 @@ fn test_detect_copyright_with_short_holder_and_trailing_punct_email() {
assert_eq!(h[0].holder, "bgme");
}

// Regression check: a copyright line whose year range starts with a five-digit
// typo ("20010-2011") must still produce the full statement (with the marker
// rendered as lowercase "(c)") and the holder name without the email address.
#[test]
fn test_detect_copyright_with_malformed_first_year_range() {
    let text = "Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel@gmail.com>";
    let expected_statement = "Copyright (c) 20010-2011 Hauke Heibel <hauke.heibel@gmail.com>";
    let expected_holder = "Hauke Heibel";

    let (copyrights, holders, _authors) = detect_copyrights_from_text(text);

    // The statement must appear among the detections; dump all of them on failure.
    let statement_found = copyrights
        .iter()
        .any(|det| det.copyright == expected_statement);
    assert!(
        statement_found,
        "copyrights: {:?}",
        copyrights.iter().map(|det| &det.copyright).collect::<Vec<_>>()
    );

    // Likewise for the holder.
    let holder_found = holders.iter().any(|det| det.holder == expected_holder);
    assert!(
        holder_found,
        "holders: {:?}",
        holders.iter().map(|det| &det.holder).collect::<Vec<_>>()
    );
}

#[test]
fn test_detect_copyright_compact_c_parens_with_lowercase_holder_and_email() {
let input = "Copyright(c) 2014 dead_horse <dead_horse@qq.com>";
Expand Down
10 changes: 4 additions & 6 deletions src/copyright/detector/token_utils/builders.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ use crate::copyright::types::{
use crate::models::LineNumber;

use super::{
collect_filtered_leaves, collect_holder_filtered_leaves, normalized_tokens_to_string,
strip_all_rights_reserved,
collect_filtered_leaves, collect_holder_filtered_leaves, is_year_like_token,
normalized_tokens_to_string, strip_all_rights_reserved,
};

pub fn extract_original_author_additional_contributors(
Expand Down Expand Up @@ -96,7 +96,7 @@ pub fn build_holder_from_node(
let filtered = strip_all_rights_reserved(leaves);
let allow_single_word_contributors = collect_all_leaves(node)
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| is_year_like_token(t));
build_holder_from_tokens(&filtered, allow_single_word_contributors)
}

Expand Down Expand Up @@ -133,9 +133,7 @@ pub fn build_holder_from_copyright_node(
});
}

let allow_single_word_contributors = all_leaves
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
let allow_single_word_contributors = all_leaves.iter().any(|t| is_year_like_token(t));

build_holder_from_tokens(&filtered, allow_single_word_contributors)
}
Expand Down
23 changes: 21 additions & 2 deletions src/copyright/detector/token_utils/filters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,25 @@ pub fn drop_path_fragment_holders_from_bare_c_code_lines(
/// (e.g. "Name <email>, Name2").
pub const YEAR_LIKE_POS_TAGS: &[PosTag] = &[PosTag::Yr, PosTag::YrPlus, PosTag::BareYr];

/// Lazily compiled, case-insensitive, whole-token pattern for a year range whose
/// components may carry one extra digit (e.g. `20010-2011`, `2010-20224`).
///
/// Shape of an accepted token:
/// - a leading component starting with `19` or `20` followed by two or three digits;
/// - then one or more repetitions of a separator (`.`, `,`, `/`, `-` or `~`)
///   followed by either another such component, a two-digit number, or one of
///   the words `present`, `current_year`, `today.year`.
///
/// The pattern is written in verbose (`x`) mode, so the line breaks inside the
/// literal are not part of what is matched. Note that well-formed ranges such
/// as `2010-2011` also match, while a lone year with no separator does not.
static MALFORMED_COPYRIGHT_YEAR_RANGE_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"(?ix)^
(?:19\d{2}|20\d{2}|19\d{3}|20\d{3})
(?:
[\.,/\-~]
(?:19\d{2}|20\d{2}|19\d{3}|20\d{3}|\d{2}|present|current_year|today\.year)
)+
$",
)
// The pattern is a fixed literal, so a compile failure here would be a programming error.
.unwrap()
});

/// Reports whether `token` counts as a year for copyright detection.
///
/// A token qualifies when its tag is listed in `YEAR_LIKE_POS_TAGS`, or when it
/// is tagged `PosTag::Cd` and its text matches
/// `MALFORMED_COPYRIGHT_YEAR_RANGE_RE`.
pub fn is_year_like_token(token: &Token) -> bool {
    // Tokens already tagged as years need no further inspection.
    if YEAR_LIKE_POS_TAGS.contains(&token.tag) {
        return true;
    }

    // Only `Cd` tokens are eligible for the text-based fallback.
    if token.tag != PosTag::Cd {
        return false;
    }

    MALFORMED_COPYRIGHT_YEAR_RANGE_RE.is_match(token.value.as_str())
}

/// Year-related tree labels whose filtering orphans adjacent commas.
pub const YEAR_LIKE_LABELS: &[TreeLabel] = &[TreeLabel::YrRange, TreeLabel::YrAnd];

Expand Down Expand Up @@ -427,7 +446,7 @@ fn collect_holder_filtered_leaves_inner<'a>(
match node {
ParseNode::Leaf(token) => {
if ignored_pos_tags.contains(&token.tag) {
if YEAR_LIKE_POS_TAGS.contains(&token.tag) {
if is_year_like_token(token) {
state.last_was_year_filtered = true;
}
if matches!(token.tag, PosTag::Email | PosTag::Url | PosTag::Url2) {
Expand Down Expand Up @@ -513,7 +532,7 @@ pub fn filter_holder_tokens_with_state<'a>(

for (i, &token) in tokens.iter().enumerate() {
if non_holder_tags.contains(&token.tag) {
if YEAR_LIKE_POS_TAGS.contains(&token.tag) {
if is_year_like_token(token) {
last_was_year_filtered = true;
}
if matches!(token.tag, PosTag::Email | PosTag::Url | PosTag::Url2) {
Expand Down
23 changes: 23 additions & 0 deletions src/copyright/detector/token_utils_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,26 @@ fn test_collect_filtered_leaves_filters_tree_labels() {
assert_eq!(leaves[0].value, "Copyright");
assert_eq!(leaves[1].value, "Acme");
}

// `Cd`-tagged tokens that look like year ranges with a one-digit typo in either
// endpoint are accepted as year-like; an arbitrary digit range is not.
#[test]
fn test_is_year_like_token_accepts_malformed_year_typos_in_ranges() {
    // Builds a `Cd` token on line one carrying the given text.
    let cd_token = |text: &str| Token {
        value: text.to_string(),
        tag: PosTag::Cd,
        start_line: LineNumber::ONE,
    };

    // Typo in the first endpoint, then typo in the second endpoint.
    for typo_range in ["20010-2011", "2010-20224"] {
        assert!(is_year_like_token(&cd_token(typo_range)));
    }

    // Digits and a dash alone are not enough.
    assert!(!is_year_like_token(&cd_token("12345-67890")));
}
44 changes: 21 additions & 23 deletions src/copyright/detector/tree_walk/copyright.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ pub fn extract_from_tree_nodes(

let allow_single_word_contributors = detector::token_utils::collect_all_leaves(node)
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
let prefix_token = get_orphaned_copy_prefix(tree, i);
let not_prefix = get_orphaned_not_prefix(tree, i, node, allow_not_copyrighted_prefix);
let (mut trailing_tokens, mut skip) = collect_trailing_orphan_tokens(node, tree, i + 1);
Expand Down Expand Up @@ -429,9 +429,9 @@ pub fn extract_from_tree_nodes(
if !has_holder && i + 1 < tree.len() {
let copyright_ends_with_year = {
let leaves = detector::token_utils::collect_all_leaves(node);
leaves.last().is_some_and(|t| {
matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr)
})
leaves
.last()
.is_some_and(|t| detector::token_utils::is_year_like_token(t))
};
let next_node = &tree[i + 1];
let next_line_ok = {
Expand Down Expand Up @@ -724,7 +724,7 @@ pub fn extract_from_tree_nodes(
if t.tag == PosTag::Cc && t.value == "," {
continue;
}
if detector::token_utils::YEAR_LIKE_POS_TAGS.contains(&t.tag) {
if detector::token_utils::is_year_like_token(t) {
found = true;
}
break;
Expand Down Expand Up @@ -762,9 +762,7 @@ pub fn extract_from_tree_nodes(
holder_tokens_mini.extend(&node_holder_mini);
let node_ends_with_year_mini = detector::token_utils::collect_all_leaves(node)
.last()
.is_some_and(|t| {
detector::token_utils::YEAR_LIKE_POS_TAGS.contains(&t.tag)
});
.is_some_and(|t| detector::token_utils::is_year_like_token(t));
holder_tokens_mini.extend(
detector::token_utils::filter_holder_tokens_with_state(
&trailing_tokens,
Expand Down Expand Up @@ -856,7 +854,7 @@ pub fn extract_from_tree_nodes(
cr_tokens.extend(&name_leaves);
let allow_single_word_contributors = cr_tokens
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
if let Some(det) = detector::token_utils::build_copyright_from_tokens(&cr_tokens) {
copyrights.push(det);
}
Expand Down Expand Up @@ -1120,7 +1118,7 @@ fn get_trailing_year_range<'a>(
}
let node_has_year = detector::token_utils::collect_all_leaves(copyright_node)
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
if node_has_year {
return None;
}
Expand Down Expand Up @@ -1287,7 +1285,7 @@ fn is_orphan_copy_name_match(node: &ParseNode) -> bool {
let leaves = detector::token_utils::collect_all_leaves(node);
leaves
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr))
.any(|t| detector::token_utils::is_year_like_token(t))
}
_ => false,
}
Expand Down Expand Up @@ -1369,7 +1367,7 @@ pub fn should_start_absorbing(
let same_line = last_line.is_some_and(|l| l == token.start_line);
let node_has_year = detector::token_utils::collect_all_leaves(copyright_node)
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
let has_holder_like_tokens = detector::token_utils::collect_all_leaves(copyright_node)
.iter()
.any(|t| {
Expand Down Expand Up @@ -1409,7 +1407,7 @@ pub fn should_start_absorbing(
if same_line && has_author_keyword {
let node_has_year = detector::token_utils::collect_all_leaves(copyright_node)
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
if node_has_year {
return true;
}
Expand Down Expand Up @@ -1591,7 +1589,7 @@ pub fn should_start_absorbing(
&& last_line.is_some_and(|l| leaves.first().is_some_and(|t| t.start_line == l));
let node_has_year = detector::token_utils::collect_all_leaves(copyright_node)
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
let last_tag = detector::token_utils::collect_all_leaves(copyright_node)
.last()
.map(|t| t.tag);
Expand Down Expand Up @@ -1624,7 +1622,7 @@ pub fn should_start_absorbing(
if last_leaf_ends_with_comma(copyright_node) {
let node_has_year = detector::token_utils::collect_all_leaves(copyright_node)
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
if node_has_year {
let is_name_like_first = match first {
ParseNode::Leaf(token) => matches!(
Expand Down Expand Up @@ -1911,7 +1909,7 @@ fn collect_following_copyright_clause_tokens(
let skip = j - start;
let has_year = tokens
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));

if !has_year {
return (Vec::new(), 0);
Expand Down Expand Up @@ -1950,7 +1948,7 @@ fn is_year_only_copyright_clause_node(node: &ParseNode) -> bool {
let leaves = detector::token_utils::collect_all_leaves(node);
let has_year = leaves
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
if !has_year {
return false;
}
Expand Down Expand Up @@ -2065,7 +2063,7 @@ fn merge_year_only_copyright_clause_with_preceding_copyrighted_by(
let holder_tokens = detector::token_utils::strip_all_rights_reserved(holder_tokens);
let allow_single_word_contributors = holder_tokens
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
let holder_det = detector::token_utils::build_holder_from_tokens(
&holder_tokens,
allow_single_word_contributors,
Expand Down Expand Up @@ -2202,7 +2200,7 @@ pub fn extract_bare_copyrights(
let name_leaves = detector::token_utils::strip_all_rights_reserved(name_leaves);
let allow_single_word_contributors = name_leaves
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
cr_tokens.extend(&name_leaves);

let mut extra_skip = 0usize;
Expand Down Expand Up @@ -2395,7 +2393,7 @@ pub fn extract_from_spans(
&& copy_start == copy_idx
&& all_leaves[copy_idx..i]
.iter()
.any(|t| detector::token_utils::YEAR_LIKE_POS_TAGS.contains(&t.tag))
.any(|t| detector::token_utils::is_year_like_token(t))
&& !all_leaves[copy_idx..i].iter().any(|t| {
matches!(
t.tag,
Expand Down Expand Up @@ -2441,7 +2439,7 @@ pub fn extract_from_spans(
if span.len() > 1 {
let allow_single_word_contributors = span
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));
let filtered = detector::token_utils::strip_all_rights_reserved_slice(span);
if let Some(det) = detector::token_utils::build_copyright_from_tokens(&filtered) {
copyrights.push(det);
Expand Down Expand Up @@ -2638,7 +2636,7 @@ pub fn extract_copyrights_from_spans(
&& copy_start == copy_idx
&& all_leaves[copy_idx..i]
.iter()
.any(|t| detector::token_utils::YEAR_LIKE_POS_TAGS.contains(&t.tag))
.any(|t| detector::token_utils::is_year_like_token(t))
&& !all_leaves[copy_idx..i].iter().any(|t| {
matches!(
t.tag,
Expand Down Expand Up @@ -2684,7 +2682,7 @@ pub fn extract_copyrights_from_spans(
if span.len() > 1 {
let allow_single_word_contributors = span
.iter()
.any(|t| matches!(t.tag, PosTag::Yr | PosTag::YrPlus | PosTag::BareYr));
.any(|t| detector::token_utils::is_year_like_token(t));

let filtered = detector::token_utils::strip_all_rights_reserved_slice(span);
if let Some(det) = detector::token_utils::build_copyright_from_tokens(&filtered) {
Expand Down
Loading