diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md index 043c3fa8e..e09d90ecd 100644 --- a/docs/BENCHMARKS.md +++ b/docs/BENCHMARKS.md @@ -12,7 +12,7 @@ The chart below uses a log-log scatter plot: file count on the x-axis, wall-cloc ![Scan duration vs. file count for Provenant and ScanCode](benchmarks/scan-duration-vs-files.svg) -> Provenant is faster on 180 of 180 recorded runs, with a **11.7× median speedup** and **11.0× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **18.6×** on 10k+ file targets. +> Provenant is faster on 181 of 181 recorded runs, with a **11.7× median speedup** and **11.1× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.1×** on 10k+ file targets. > Generated from the benchmark timing rows in this document via `cargo run --manifest-path xtask/Cargo.toml --bin generate-benchmark-chart`. ## Current benchmark examples @@ -914,6 +914,13 @@ The quick index below links to benchmark sections. Each benchmark entry then rec - Timing: Provenant `38.15s`; ScanCode `379.55s` - Broader .NET/NuGet package and dependency extraction (`105` vs `3` packages, `145` vs `33` dependencies) from many `*.csproj` files plus `Directory.Packages.props` and `Directory.Build.props` across samples, tooling, and test projects, with zero scan errors where ScanCode trips on `TwitterColorEmoji-SVGinOT.ttf` +##### [dotnet/runtime @ d1163e5](https://github.com/dotnet/runtime/tree/d1163e5a8f3f3aaa374993e8b5805911689aba28) — **31.49× faster** + +- Files: 57,611 +- Run context: 2026-04-29 · runtime-99690 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc +- Timing: Provenant `299.53s`; ScanCode `9432.77s` +- Broader .NET/NuGet and sibling npm package visibility (`2249` vs `5` packages, `983` vs `503` dependencies) across many `*.csproj` files, `Directory.Packages.props`, `Directory.Build.props`, and committed `package-lock.json` inputs, with zero scan-file timeouts where ScanCode aborts on `EncryptedXmlSample4.xml` + ##### [microsoft/onnxruntime @ 97e0a00](https://github.com/microsoft/onnxruntime/tree/97e0a001d43f8783db4507c9b2ac3731dc95a1ed) — **23.89× faster** - Files: 9,802 diff --git a/docs/benchmarks/scan-duration-vs-files.svg b/docs/benchmarks/scan-duration-vs-files.svg index 7213eb5d0..db4c87f8e 100644 --- a/docs/benchmarks/scan-duration-vs-files.svg +++ b/docs/benchmarks/scan-duration-vs-files.svg @@ -593,6 +593,9 @@ ScanCode: 4641.96s mongodb/mongo @ d6877a3 Files: 52443 ScanCode: 4363.53s + dotnet/runtime @ d1163e5 +Files: 57611 +ScanCode: 9432.77s rust-lang/rust @ dab8d9d Files: 58818 ScanCode: 1879.48s @@ -1135,6 +1138,9 @@ Provenant: 312.87s mongodb/mongo @ d6877a3 Files: 52443 Provenant: 313.61s + dotnet/runtime @ d1163e5 +Files: 57611 +Provenant: 299.53s rust-lang/rust @ dab8d9d Files: 58818 Provenant: 61.49s diff --git a/src/assembly/assembly_test.rs b/src/assembly/assembly_test.rs index 7f3ce4dec..befa70f3b 100644 --- a/src/assembly/assembly_test.rs +++ b/src/assembly/assembly_test.rs @@ -1738,7 +1738,7 @@ mod tests { } #[test] - fn test_assemble_npm_package_json_skips_lockfile_with_missing_identity() { + fn test_assemble_npm_package_json_merges_lockfile_with_missing_version_when_name_matches() { let mut files = vec![ create_test_file_info( "project/package.json", @@ -1775,10 +1775,78 @@ mod tests { assert_eq!(result.packages[0].version, Some("1.0.0".to_string())); assert_eq!( result.packages[0].datafile_paths, - vec!["project/package.json".to_string()] + vec![ + "project/package-lock.json".to_string(), + "project/package.json".to_string() + ] ); - assert!(result.dependencies.is_empty()); - assert!(files[1].for_packages.is_empty()); + assert_eq!(result.dependencies.len(), 1); + assert_eq!( + result.dependencies[0].purl.as_deref(), + Some("pkg:npm/left-pad@1.3.0") + ); + assert_eq!( + result.dependencies[0].datafile_path, + "project/package-lock.json" + ); + assert_eq!(files[0].for_packages.len(), 1); + assert_eq!(files[1].for_packages.len(), 1); + } + + #[test] + fn test_assemble_npm_package_json_and_lockfile_merge_when_both_omit_version() { + let mut manifest = create_test_file_info( + "project/package.json", + DatasourceId::NpmPackageJson, + None, + Some("my-app"), + None, + vec![], + ); + manifest.package_data[0].package_type = Some(PackageType::Npm); + + let mut lockfile = create_test_file_info( + "project/package-lock.json", + DatasourceId::NpmPackageLockJson, + None, + Some("my-app"), + None, + vec![Dependency { + purl: Some("pkg:npm/left-pad@1.3.0".to_string()), + extracted_requirement: Some("1.3.0".to_string()), + scope: Some("dependencies".to_string()), + is_runtime: Some(true), + is_optional: Some(false), + is_pinned: Some(true), + is_direct: Some(false), + resolved_package: None, + extra_data: None, + }], + ); + lockfile.package_data[0].package_type = Some(PackageType::Npm); + + let mut files = vec![manifest, lockfile]; + + let result = assemble(&mut files); + + assert_eq!(result.packages.len(), 1); + assert_eq!(result.packages[0].name, Some("my-app".to_string())); + assert_eq!(result.packages[0].version, None); + assert_eq!( + result.packages[0].datafile_paths, + vec![ + "project/package-lock.json".to_string(), + "project/package.json".to_string() + ] + ); + assert_eq!(result.dependencies.len(), 1); + assert_eq!( + result.dependencies[0].purl.as_deref(), + Some("pkg:npm/left-pad@1.3.0") + ); + assert!(result.dependencies[0].for_package_uid.is_some()); + assert_eq!(files[0].for_packages.len(), 1); + assert_eq!(files[1].for_packages.len(), 1); } #[test] diff --git a/src/assembly/sibling_merge.rs b/src/assembly/sibling_merge.rs index 9ae439a87..adb416c9b 100644 --- a/src/assembly/sibling_merge.rs +++ b/src/assembly/sibling_merge.rs @@ -461,20 +461,24 @@ fn should_skip_bun_lock_merge(package: &Package, pkg_data: &PackageData) -> bool } fn npm_package_identity_matches(package: &Package, pkg_data: &PackageData) -> bool { - let Some(package_name) = normalized_identity_value(package.name.as_deref()) else { - return false; - }; - let Some(package_version) = normalized_identity_value(package.version.as_deref()) else { - return false; - }; - let Some(candidate_name) = normalized_identity_value(pkg_data.name.as_deref()) else { + if let (Some(package_name), Some(candidate_name)) = ( + normalized_identity_value(package.name.as_deref()), + normalized_identity_value(pkg_data.name.as_deref()), + ) && package_name != candidate_name + { return false; - }; - let Some(candidate_version) = normalized_identity_value(pkg_data.version.as_deref()) else { + } + + if let (Some(package_version), Some(candidate_version)) = ( + normalized_identity_value(package.version.as_deref()), + normalized_identity_value(pkg_data.version.as_deref()), + ) && package_version != candidate_version + { return false; - }; + } - package_name == candidate_name && package_version == candidate_version + normalized_identity_value(package.name.as_deref()).is_some() + && normalized_identity_value(pkg_data.name.as_deref()).is_some() } fn normalized_identity_value(value: Option<&str>) -> Option<&str> { diff --git a/src/copyright/detector/author_heuristics/cleanup.rs b/src/copyright/detector/author_heuristics/cleanup.rs index 400d03b5f..ceb32ecd6 100644 --- a/src/copyright/detector/author_heuristics/cleanup.rs +++ b/src/copyright/detector/author_heuristics/cleanup.rs @@ -391,6 +391,86 @@ pub(in super::super) fn drop_json_code_example_authors( }); } +pub(in super::super) fn drop_markup_element_value_authors( + raw_lines: &[&str], + authors: &mut Vec, +) { + if raw_lines.is_empty() || authors.is_empty() { + return; + } + + authors.retain(|author| { + let Some(window) = + surrounding_author_window(raw_lines, author.start_line.get(), author.end_line.get()) + else { + return true; + }; + !window_contains_markup_element_author_value(&window, &author.author) + }); +} + +fn window_contains_markup_element_author_value(window: &str, author: &str) -> bool { + let normalized = normalize_whitespace(window); + if !(normalized.contains('<') && normalized.contains('>')) { + return false; + } + + let lower = normalized.to_ascii_lowercase(); + if lower.contains("copyright") || lower.contains("written by") || lower.contains("created by") { + return false; + } + + let has_author_element = (lower.contains("") + || lower.contains("") + || lower.contains("")) + && !lower.contains("author="); + if has_author_element { + return true; + } + + if has_author_element + && (lower.contains("") + || lower.contains("") + || lower.contains("") + || lower.contains("")) + { + return true; + } + + if lower.contains("xml:lang") && author.trim().starts_with("XmlLang ") { + return true; + } + + let trimmed_author = author.trim(); + if trimmed_author.is_empty() { + return false; + } + + let escaped = regex::escape(trimmed_author); + let exact_tag_re = Regex::new(&format!( + r"(?is)<(?:name|title|id|email|uri|updated|first-name|last-name|first\.name|last\.name|firstname|surname)\b[^>]*>\s*{}\s*") + || lower.contains("") + || lower.contains("") + || lower.contains("")) +} + fn surrounding_author_window( raw_lines: &[&str], start_line: usize, diff --git a/src/copyright/detector/phases/postprocess.rs b/src/copyright/detector/phases/postprocess.rs index 3d8ae94e9..b71bd37ef 100644 --- a/src/copyright/detector/phases/postprocess.rs +++ b/src/copyright/detector/phases/postprocess.rs @@ -277,6 +277,7 @@ fn run_author_extraction_and_repairs( authors.extend(new_a); super::author_heuristics::drop_json_code_example_authors(raw_lines, authors); + super::author_heuristics::drop_markup_element_value_authors(raw_lines, authors); seen.rebuild_authors_from(authors); let mut new_a = super::author_heuristics::extract_name_contributed_authors(prepared_cache); @@ -286,6 +287,8 @@ fn run_author_extraction_and_repairs( let mut new_a = super::author_heuristics::extract_comment_author_label_authors(raw_lines); seen.dedup_new_authors(&mut new_a, 0); authors.extend(new_a); + super::author_heuristics::drop_markup_element_value_authors(raw_lines, authors); + seen.rebuild_authors_from(authors); } #[allow(clippy::too_many_arguments)] @@ -597,6 +600,9 @@ fn run_final_variant_and_cleanup_repairs( super::postprocess_transforms::drop_json_description_metadata_copyrights_and_holders( raw_lines, copyrights, holders, ); + super::postprocess_transforms::drop_markup_declaration_and_versioninfo_copyrights_and_holders( + raw_lines, copyrights, holders, + ); super::postprocess_transforms::drop_copyright_like_holders(holders); } diff --git a/src/copyright/detector/postprocess_transforms/metadata_repairs.rs b/src/copyright/detector/postprocess_transforms/metadata_repairs.rs index 7d194edb3..db5895bcb 100644 --- a/src/copyright/detector/postprocess_transforms/metadata_repairs.rs +++ b/src/copyright/detector/postprocess_transforms/metadata_repairs.rs @@ -75,6 +75,79 @@ pub fn drop_json_description_metadata_copyrights_and_holders( }); } +pub fn drop_markup_declaration_and_versioninfo_copyrights_and_holders( + raw_lines: &[&str], + copyrights: &mut Vec, + holders: &mut Vec, +) { + if raw_lines.is_empty() { + return; + } + + let mut retained_spans: HashSet<(usize, usize)> = HashSet::new(); + copyrights.retain(|copyright| { + let keep = !span_has_markup_declaration_or_versioninfo( + raw_lines, + copyright.start_line.get(), + copyright.end_line.get(), + ); + if keep { + retained_spans.insert((copyright.start_line.get(), copyright.end_line.get())); + } + keep + }); + + holders.retain(|holder| { + if retained_spans.contains(&(holder.start_line.get(), holder.end_line.get())) { + return true; + } + + !span_has_markup_declaration_or_versioninfo( + raw_lines, + holder.start_line.get(), + holder.end_line.get(), + ) + }); +} + +fn span_has_markup_declaration_or_versioninfo( + raw_lines: &[&str], + start_line: usize, + end_line: usize, +) -> bool { + if start_line == 0 + || end_line == 0 + || start_line > raw_lines.len() + || end_line > raw_lines.len() + { + return false; + } + + let joined = raw_lines[start_line - 1..end_line].join(" "); + let lower = joined.to_ascii_lowercase(); + let has_year = + Regex::new(r"(?i)\b(?:19\d{2}|20\d{2})(?:\s*[-–/]\s*(?:19\d{2}|20\d{2}|\d{2}))?\b") + .expect("valid year regex") + .is_match(&joined); + + ((lower.contains("Some test

Copyright © 2003-2014

"; + let (c, _h, _a) = detect_copyrights_from_text(input); + assert!( + c.iter().any(|cr| cr.copyright == "Copyright (c) 2003-2014"), + "Expected Copyright (c) year range extracted, got: {:?}", + c.iter().map(|cr| &cr.copyright).collect::>() + ); +} + #[test] fn test_extract_hex_a9_entity_year_range_only_as_bare_c() { let input = "expectedXml = \"

Copyright © 2003-2014

\","; diff --git a/src/copyright/detector/tests_author_pipeline.rs b/src/copyright/detector/tests_author_pipeline.rs index 27398654b..7fa029e02 100644 --- a/src/copyright/detector/tests_author_pipeline.rs +++ b/src/copyright/detector/tests_author_pipeline.rs @@ -435,6 +435,56 @@ fn test_json_sponsor_description_does_not_create_authors() { assert!(authors.is_empty(), "authors: {authors:?}"); } +#[test] +fn test_xml_author_elements_do_not_create_file_authors() { + let input = concat!( + "\n", + " Joe Bob\n", + " Mary Bob\n", + "\n", + ); + let (_copyrights, _holders, authors) = detect_copyrights_from_text(input); + + assert!(authors.is_empty(), "authors: {authors:?}"); +} + +#[test] +fn test_atom_feed_identifier_fields_do_not_create_authors() { + let input = concat!( + "\n", + " tag:contoso.com,2000\n", + " 2006-04-25T12:12:12Z\n", + " jerry@Contoso.com\n", + "\n", + ); + let (_copyrights, _holders, authors) = detect_copyrights_from_text(input); + + assert!(authors.is_empty(), "authors: {authors:?}"); +} + +#[test] +fn test_xml_nested_name_tags_inside_author_do_not_create_file_authors() { + let input = concat!( + "\n", + " \n", + " Joe\n", + " Bob\n", + " \n", + "\n", + ); + let (_copyrights, _holders, authors) = detect_copyrights_from_text(input); + + assert!(authors.is_empty(), "authors: {authors:?}"); +} + +#[test] +fn test_xml_lang_attribute_value_does_not_create_author() { + let input = r#""#; + let (_copyrights, _holders, authors) = detect_copyrights_from_text(input); + + assert!(authors.is_empty(), "authors: {authors:?}"); +} + #[test] fn test_written_by_sentence_trims_following_description_clause() { let input = "JUnit is a regression testing framework written by Erich Gamma and Kent Beck. It is used by the developer who implements unit tests in Java."; diff --git a/src/copyright/detector/tests_false_positives.rs b/src/copyright/detector/tests_false_positives.rs index 0e48e667f..41f984722 100644 --- a/src/copyright/detector/tests_false_positives.rs +++ b/src/copyright/detector/tests_false_positives.rs @@ -220,6 +220,46 @@ fn test_detect_filters_code_like_c_marker_lines() { assert!(authors.is_empty(), "authors: {authors:?}"); } +#[test] +fn test_windows_versioninfo_line_is_not_detected_as_copyright_or_holder() { + let text = "Copyright (c) 2050 VALUE OriginalFilename NativeConsoleApp.exe"; + let (copyrights, holders, authors) = detect_copyrights_from_text(text); + + assert!(copyrights.is_empty(), "copyrights: {copyrights:?}"); + assert!(holders.is_empty(), "holders: {holders:?}"); + assert!(authors.is_empty(), "authors: {authors:?}"); +} + +#[test] +fn test_dtd_declaration_line_is_not_detected_as_copyright_or_holder() { + let text = "copyright aaaa"; + let (copyrights, holders, authors) = detect_copyrights_from_text(text); + + assert!(copyrights.is_empty(), "copyrights: {copyrights:?}"); + assert!(holders.is_empty(), "holders: {holders:?}"); + assert!(authors.is_empty(), "authors: {authors:?}"); +} + +#[test] +fn test_copyright_property_access_line_is_not_detected_as_copyright_or_holder() { + let text = "Copyright clone.Copyright.Text"; + let (copyrights, holders, authors) = detect_copyrights_from_text(text); + + assert!(copyrights.is_empty(), "copyrights: {copyrights:?}"); + assert!(holders.is_empty(), "holders: {holders:?}"); + assert!(authors.is_empty(), "authors: {authors:?}"); +} + +#[test] +fn test_unicode_escape_c_marker_line_is_not_detected_as_copyright_or_holder() { + let text = "(c) HeaderType.Content u00AD u00AE"; + let (copyrights, holders, authors) = detect_copyrights_from_text(text); + + assert!(copyrights.is_empty(), "copyrights: {copyrights:?}"); + assert!(holders.is_empty(), "holders: {holders:?}"); + assert!(authors.is_empty(), "authors: {authors:?}"); +} + #[test] fn test_detect_copyright_does_not_absorb_unexpected_as_represented() { let text = "Copyright 1993 United States Government as represented by the\nDirector, National Security Agency."; diff --git a/src/copyright/refiner/author.rs b/src/copyright/refiner/author.rs index b7fad3a2b..f00907c33 100644 --- a/src/copyright/refiner/author.rs +++ b/src/copyright/refiner/author.rs @@ -100,6 +100,14 @@ fn looks_like_prose_fragment_author(s: &str) -> bool { return true; } + if looks_like_markup_data_identifier(trimmed) { + return true; + } + + if looks_like_markup_attribute_label_value(trimmed) { + return true; + } + if contains_standalone_at_prefixed_token(trimmed) { return true; } @@ -222,7 +230,48 @@ fn looks_like_prose_fragment_author(s: &str) -> bool { fn contains_windows_versioninfo_fragment(s: &str) -> bool { let trimmed = s.trim(); trimmed.starts_with("VALUE ") - && (trimmed.contains("FileDescription") || trimmed.contains("FileVersion")) + && (trimmed.contains("FileDescription") + || trimmed.contains("FileVersion") + || trimmed.contains("OriginalFilename") + || trimmed.contains("ProductVersion") + || trimmed.contains("LegalTrademarks")) +} + +fn looks_like_markup_data_identifier(s: &str) -> bool { + static DOI_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)^doi:[^\s]+$").expect("valid doi regex")); + static TAG_URI_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)^tag:[^,\s]+,\d{4}(?::[^\s]+)?$").expect("valid tag uri regex") + }); + static RELATIVE_ID_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)^(?:id|urn|uuid)/[\p{L}0-9._~:/?#\[\]@!$&'()*+,;=-]+$") + .expect("valid relative id regex") + }); + static NAME_WITH_TIMESTAMP_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?i)^[\p{Lu}][\p{L}'._-]+(?:\s+[\p{Lu}][\p{L}'._-]+){0,3}\s+\d{4}-\d{2}-\d{2}t\d{2}:\d{2}:\d{2}z$", + ) + .expect("valid name timestamp regex") + }); + static DUPLICATED_AUTHOR_WORD_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)^(?:author|name){2,}$").expect("valid duplicated author word regex") + }); + + let trimmed = s.trim(); + DOI_RE.is_match(trimmed) + || TAG_URI_RE.is_match(trimmed) + || RELATIVE_ID_RE.is_match(trimmed) + || NAME_WITH_TIMESTAMP_RE.is_match(trimmed) + || DUPLICATED_AUTHOR_WORD_RE.is_match(trimmed) +} + +fn looks_like_markup_attribute_label_value(s: &str) -> bool { + static MARKUP_ATTRIBUTE_LABEL_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)^(?:xmllang|xml:lang|xmlns(?::[a-z0-9_.-]+)?)\s+\S+$") + .expect("valid markup attribute label regex") + }); + + MARKUP_ATTRIBUTE_LABEL_RE.is_match(s.trim()) } fn looks_like_file_reference_note_author(s: &str) -> bool { diff --git a/src/copyright/refiner/mod.rs b/src/copyright/refiner/mod.rs index 88472499f..cfbd5b763 100644 --- a/src/copyright/refiner/mod.rs +++ b/src/copyright/refiner/mod.rs @@ -473,6 +473,7 @@ fn is_junk_c_sign_path_fragment(s: &str) -> bool { fn is_junk_copyright_code_fragment(s: &str) -> bool { let trimmed = s.trim(); let lower = trimmed.to_ascii_lowercase(); + let has_windows_versioninfo_markers = contains_windows_versioninfo_token(trimmed); let has_code_markers = lower.contains("string?") || lower.contains("bool") || lower.contains("final ") @@ -489,16 +490,30 @@ fn is_junk_copyright_code_fragment(s: &str) -> bool { || lower.contains("$template") || lower.contains("icondata") || lower.contains("static const") + || lower.contains("public void") + || lower.contains("get set") + || lower.contains("assert.equal") || lower.contains("classifiers") || lower.contains("authors.append") || lower == "copyright void" || trimmed.contains("??") + || contains_member_access_code_token(trimmed) + || contains_unicode_escape_token_run(trimmed) + || contains_xml_markup_declaration_token(trimmed) || contains_regex_or_template_marker(trimmed) + || has_windows_versioninfo_markers || contains_generated_resource_token(trimmed) || contains_malformed_spaced_year(trimmed); let has_prose_markers = is_obvious_prose_fragment(trimmed); + if has_windows_versioninfo_markers { + return true; + } + if !lower.starts_with("copyright") { + if lower.starts_with("(c)") && (has_code_markers || has_prose_markers) { + return !has_copyright_year(trimmed); + } return (lower.starts_with("not copyrighted") && !has_copyright_year(trimmed)) || (lower.contains("copyright") && (has_code_markers || has_prose_markers)); } @@ -517,6 +532,7 @@ pub(crate) fn is_junk_holder(s: &str) -> bool { fn is_junk_holder_code_fragment(s: &str) -> bool { let trimmed = s.trim(); let lower = trimmed.to_ascii_lowercase(); + let has_windows_versioninfo_markers = contains_windows_versioninfo_token(trimmed); let has_code_markers = lower.contains("string?") || lower.contains("bool") || lower.contains("final ") @@ -529,13 +545,21 @@ fn is_junk_holder_code_fragment(s: &str) -> bool { || lower.contains("$template") || lower.contains("::") || lower.contains("static const") + || lower.contains("public void") + || lower.contains("get set") + || lower.contains("assert.equal") || lower.contains("icondata") || lower.contains("authors.append") + || contains_member_access_code_token(trimmed) + || contains_unicode_escape_token_run(trimmed) + || contains_xml_markup_declaration_token(trimmed) || contains_regex_or_template_marker(trimmed) + || has_windows_versioninfo_markers || contains_generated_resource_token(trimmed); let has_prose_markers = is_obvious_prose_fragment(trimmed); - (has_code_markers || has_prose_markers) && !has_copyright_year(trimmed) + has_windows_versioninfo_markers + || ((has_code_markers || has_prose_markers) && !has_copyright_year(trimmed)) } fn is_junk_holder_symbol_garbage(s: &str) -> bool { @@ -621,6 +645,64 @@ fn contains_generated_resource_token(s: &str) -> bool { ASSET_RE.is_match(trimmed) } +fn contains_member_access_code_token(s: &str) -> bool { + let trimmed = s.trim(); + let lower = trimmed.to_ascii_lowercase(); + if lower.contains("http://") || lower.contains("https://") || lower.contains("www.") { + return false; + } + + static MEMBER_ACCESS_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"\b(?:[a-z_][A-Za-z0-9_]{1,}\.){1,4}[A-Z][A-Za-z0-9_]{1,}(?:\.[A-Z][A-Za-z0-9_]{1,})*\b").unwrap() + }); + + MEMBER_ACCESS_RE.is_match(trimmed) +} + +fn is_post_refine_copyright_code_fragment(s: &str) -> bool { + let trimmed = s.trim(); + let lower = trimmed.to_ascii_lowercase(); + + contains_windows_versioninfo_token(trimmed) + || contains_member_access_code_token(trimmed) + || contains_unicode_escape_token_run(trimmed) + || lower.contains("public void") + || lower.contains("get set") + || lower.contains("assert.equal") +} + +fn contains_unicode_escape_token_run(s: &str) -> bool { + static UNICODE_ESCAPE_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\bu[0-9a-f]{4}\b").unwrap()); + + UNICODE_ESCAPE_RE.is_match(s.trim()) +} + +fn contains_windows_versioninfo_token(s: &str) -> bool { + static VERSIONINFO_KEY_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?i)\b(?:VALUE\s+)?(?:OriginalFilename|FileDescription|FileVersion|ProductVersion|LegalTrademarks|ProductName|InternalName|CompanyName)\b", + ) + .unwrap() + }); + static VERSIONINFO_FILE_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\b[\p{L}0-9_.-]+\.(?:exe|dll|mui|ocx|sys)\b").unwrap()); + + let trimmed = s.trim(); + VERSIONINFO_KEY_RE.is_match(trimmed) + && (trimmed.contains("VALUE ") + || VERSIONINFO_FILE_RE.is_match(trimmed) + || trimmed.to_ascii_lowercase().contains("legaltrademarks")) +} + +fn contains_xml_markup_declaration_token(s: &str) -> bool { + let lower = s.to_ascii_lowercase(); + lower.contains(" bool { static SPACED_YEAR_RE: LazyLock = LazyLock::new(|| Regex::new(r"\b(?:19|20)\s+\d{2}\b|\b\d{3}\s+\d{1,2}\b").unwrap()); @@ -702,6 +784,11 @@ pub fn refine_copyright(s: &str) -> Option { return None; } let original = normalize_whitespace(s); + if contains_windows_versioninfo_token(&original) + || (contains_xml_markup_declaration_token(&original) && !has_copyright_year(&original)) + { + return None; + } let mut c = original.clone(); c = strip_trailing_quote_before_email(&c); c = normalize_b_dot_angle_emails(&c); @@ -808,7 +895,8 @@ pub fn refine_copyright(s: &str) -> Option { { return None; } - if is_junk_copyright_of_header(&result) + if is_post_refine_copyright_code_fragment(&result) + || is_junk_copyright_of_header(&result) || is_junk_copyrighted_works_header(&result) || is_junk_copyrighted_software_phrase(&result) { diff --git a/src/copyright/refiner/tests.rs b/src/copyright/refiner/tests.rs index ff416eff3..922a6258e 100644 --- a/src/copyright/refiner/tests.rs +++ b/src/copyright/refiner/tests.rs @@ -1373,6 +1373,46 @@ fn test_refine_author_drops_generated_resource_identifiers() { assert_eq!(refine_author("icon-app-20x20@2x.png.img.tmpl"), None); } +#[test] +fn test_refine_author_drops_markup_feed_identifiers() { + assert_eq!(refine_author("doi:10.1038/nature05582"), None); + assert_eq!(refine_author("tag:contoso.com,2000"), None); + assert_eq!(refine_author("id/1234"), None); + assert_eq!(refine_author("James 2006-04-25T12:12:12Z"), None); + assert_eq!(refine_author("authorauthor"), None); + assert_eq!(refine_author("XmlLang en-usabcd"), None); +} + +#[test] +fn test_refine_copyright_drops_versioninfo_and_dtd_junk() { + assert_eq!( + refine_copyright("Copyright (c) 2050 VALUE OriginalFilename NativeConsoleApp.exe"), + None + ); + assert_eq!( + refine_copyright("copyright aaaa"), + None + ); + assert_eq!(refine_copyright("Copyright get set"), None); + assert_eq!(refine_copyright("copyright public void"), None); + assert_eq!(refine_copyright("Copyright clone.Copyright.Text"), None); + assert_eq!( + refine_copyright("Copyright HeaderType.Content u00AD u00AE"), + None + ); +} + +#[test] +fn test_refine_holder_drops_versioninfo_and_dtd_junk() { + assert_eq!( + refine_holder("VALUE OriginalFilename NativeConsoleApp.exe"), + None + ); + assert_eq!(refine_holder("PCDATA"), None); + assert_eq!(refine_holder("clone.Copyright.Text"), None); + assert_eq!(refine_holder("HeaderType.Content u00AD u00AE"), None); +} + #[test] fn test_refine_author_drops_template_token_runs_and_numeric_fragments() { assert_eq!(refine_author("AUTH CONTRIBUTORS AUTHS+ + 2660"), None); diff --git a/src/parsers/autotools.rs b/src/parsers/autotools.rs index 6dd25e84e..815a1407f 100644 --- a/src/parsers/autotools.rs +++ b/src/parsers/autotools.rs @@ -35,6 +35,7 @@ const AUTOCONF_CONFIGURE_MARKERS: &[&str] = &[ "generated by gnu autoconf", "generated automatically using autoconf", "please tell bug-autoconf@gnu.org", + "configure script for ", ]; fn looks_like_autoconf_generated_configure(path: &Path) -> bool { diff --git a/src/parsers/autotools_golden_test.rs b/src/parsers/autotools_golden_test.rs index 9515939cd..fdbd09736 100644 --- a/src/parsers/autotools_golden_test.rs +++ b/src/parsers/autotools_golden_test.rs @@ -32,4 +32,12 @@ mod golden_tests { "testdata/autotools/another-project/configure.ac.expected.json", ); } + + #[test] + fn test_golden_autotools_zlib_style_configure() { + run_golden( + "testdata/autotools/zlib-ng/configure", + "testdata/autotools/zlib-ng/configure.expected.json", + ); + } } diff --git a/src/parsers/autotools_test.rs b/src/parsers/autotools_test.rs index ffc1cc728..31cace453 100644 --- a/src/parsers/autotools_test.rs +++ b/src/parsers/autotools_test.rs @@ -21,6 +21,11 @@ fn test_is_match() { "testdata/autotools/another-project/configure.ac" ))); + // Should match zlib-style generated configure header + assert!(AutotoolsConfigureParser::is_match(&PathBuf::from( + "testdata/autotools/zlib-ng/configure" + ))); + // Should NOT match a custom non-Autoconf configure script assert!(!AutotoolsConfigureParser::is_match(&PathBuf::from( "testdata/autotools/non-autoconf-configure/configure" @@ -86,3 +91,13 @@ fn test_nested_path() { Some("pkg:autotools/my-awesome-project") ); } + +#[test] +fn test_zlib_style_configure_header() { + let path = PathBuf::from("testdata/autotools/zlib-ng/configure"); + let package_data = AutotoolsConfigureParser::extract_first_package(&path); + + assert_eq!(package_data.package_type, Some(PackageType::Autotools)); + assert_eq!(package_data.name, Some("zlib-ng".to_string())); + assert_eq!(package_data.purl.as_deref(), Some("pkg:autotools/zlib-ng")); +} diff --git a/testdata/autotools/zlib-ng/configure b/testdata/autotools/zlib-ng/configure new file mode 100644 index 000000000..99f558ed5 --- /dev/null +++ b/testdata/autotools/zlib-ng/configure @@ -0,0 +1,4 @@ +#!/bin/sh +# configure script for zlib. +# +# Normally configure builds both a static and a shared library. diff --git a/testdata/autotools/zlib-ng/configure.expected.json b/testdata/autotools/zlib-ng/configure.expected.json new file mode 100644 index 000000000..eec0ac8db --- /dev/null +++ b/testdata/autotools/zlib-ng/configure.expected.json @@ -0,0 +1,9 @@ +[ + { + "type": "autotools", + "name": "zlib-ng", + "purl": "pkg:autotools/zlib-ng", + "parties": [], + "datasource_id": "autotools_configure" + } +]