diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md
index 043c3fa8e..e09d90ecd 100644
--- a/docs/BENCHMARKS.md
+++ b/docs/BENCHMARKS.md
@@ -12,7 +12,7 @@ The chart below uses a log-log scatter plot: file count on the x-axis, wall-cloc

-> Provenant is faster on 180 of 180 recorded runs, with a **11.7× median speedup** and **11.0× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **18.6×** on 10k+ file targets.
+> Provenant is faster on 181 of 181 recorded runs, with an **11.7× median speedup** and **11.1× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.1×** on 10k+ file targets.
> Generated from the benchmark timing rows in this document via `cargo run --manifest-path xtask/Cargo.toml --bin generate-benchmark-chart`.
## Current benchmark examples
@@ -914,6 +914,13 @@ The quick index below links to benchmark sections. Each benchmark entry then rec
- Timing: Provenant `38.15s`; ScanCode `379.55s`
- Broader .NET/NuGet package and dependency extraction (`105` vs `3` packages, `145` vs `33` dependencies) from many `*.csproj` files plus `Directory.Packages.props` and `Directory.Build.props` across samples, tooling, and test projects, with zero scan errors where ScanCode trips on `TwitterColorEmoji-SVGinOT.ttf`
+##### [dotnet/runtime @ d1163e5](https://github.com/dotnet/runtime/tree/d1163e5a8f3f3aaa374993e8b5805911689aba28) — **31.49× faster**
+
+- Files: 57,611
+- Run context: 2026-04-29 · runtime-99690 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc
+- Timing: Provenant `299.53s`; ScanCode `9432.77s`
+- Broader .NET/NuGet and sibling npm package visibility (`2249` vs `5` packages, `983` vs `503` dependencies) across many `*.csproj` files, `Directory.Packages.props`, `Directory.Build.props`, and committed `package-lock.json` inputs, with zero scan-file timeouts where ScanCode aborts on `EncryptedXmlSample4.xml`
+
##### [microsoft/onnxruntime @ 97e0a00](https://github.com/microsoft/onnxruntime/tree/97e0a001d43f8783db4507c9b2ac3731dc95a1ed) — **23.89× faster**
- Files: 9,802
diff --git a/docs/benchmarks/scan-duration-vs-files.svg b/docs/benchmarks/scan-duration-vs-files.svg
index 7213eb5d0..db4c87f8e 100644
--- a/docs/benchmarks/scan-duration-vs-files.svg
+++ b/docs/benchmarks/scan-duration-vs-files.svg
@@ -593,6 +593,9 @@ ScanCode: 4641.96s
mongodb/mongo @ d6877a3
Files: 52443
ScanCode: 4363.53s
+ dotnet/runtime @ d1163e5
+Files: 57611
+ScanCode: 9432.77s
rust-lang/rust @ dab8d9d
Files: 58818
ScanCode: 1879.48s
@@ -1135,6 +1138,9 @@ Provenant: 312.87s
mongodb/mongo @ d6877a3
Files: 52443
Provenant: 313.61s
+ dotnet/runtime @ d1163e5
+Files: 57611
+Provenant: 299.53s
rust-lang/rust @ dab8d9d
Files: 58818
Provenant: 61.49s
diff --git a/src/assembly/assembly_test.rs b/src/assembly/assembly_test.rs
index 7f3ce4dec..befa70f3b 100644
--- a/src/assembly/assembly_test.rs
+++ b/src/assembly/assembly_test.rs
@@ -1738,7 +1738,7 @@ mod tests {
}
#[test]
- fn test_assemble_npm_package_json_skips_lockfile_with_missing_identity() {
+ fn test_assemble_npm_package_json_merges_lockfile_with_missing_version_when_name_matches() {
let mut files = vec![
create_test_file_info(
"project/package.json",
@@ -1775,10 +1775,78 @@ mod tests {
assert_eq!(result.packages[0].version, Some("1.0.0".to_string()));
assert_eq!(
result.packages[0].datafile_paths,
- vec!["project/package.json".to_string()]
+ vec![
+ "project/package-lock.json".to_string(),
+ "project/package.json".to_string()
+ ]
);
- assert!(result.dependencies.is_empty());
- assert!(files[1].for_packages.is_empty());
+ assert_eq!(result.dependencies.len(), 1);
+ assert_eq!(
+ result.dependencies[0].purl.as_deref(),
+ Some("pkg:npm/left-pad@1.3.0")
+ );
+ assert_eq!(
+ result.dependencies[0].datafile_path,
+ "project/package-lock.json"
+ );
+ assert_eq!(files[0].for_packages.len(), 1);
+ assert_eq!(files[1].for_packages.len(), 1);
+ }
+
+ #[test]
+ fn test_assemble_npm_package_json_and_lockfile_merge_when_both_omit_version() {
+ let mut manifest = create_test_file_info(
+ "project/package.json",
+ DatasourceId::NpmPackageJson,
+ None,
+ Some("my-app"),
+ None,
+ vec![],
+ );
+ manifest.package_data[0].package_type = Some(PackageType::Npm);
+
+ let mut lockfile = create_test_file_info(
+ "project/package-lock.json",
+ DatasourceId::NpmPackageLockJson,
+ None,
+ Some("my-app"),
+ None,
+ vec![Dependency {
+ purl: Some("pkg:npm/left-pad@1.3.0".to_string()),
+ extracted_requirement: Some("1.3.0".to_string()),
+ scope: Some("dependencies".to_string()),
+ is_runtime: Some(true),
+ is_optional: Some(false),
+ is_pinned: Some(true),
+ is_direct: Some(false),
+ resolved_package: None,
+ extra_data: None,
+ }],
+ );
+ lockfile.package_data[0].package_type = Some(PackageType::Npm);
+
+ let mut files = vec![manifest, lockfile];
+
+ let result = assemble(&mut files);
+
+ assert_eq!(result.packages.len(), 1);
+ assert_eq!(result.packages[0].name, Some("my-app".to_string()));
+ assert_eq!(result.packages[0].version, None);
+ assert_eq!(
+ result.packages[0].datafile_paths,
+ vec![
+ "project/package-lock.json".to_string(),
+ "project/package.json".to_string()
+ ]
+ );
+ assert_eq!(result.dependencies.len(), 1);
+ assert_eq!(
+ result.dependencies[0].purl.as_deref(),
+ Some("pkg:npm/left-pad@1.3.0")
+ );
+ assert!(result.dependencies[0].for_package_uid.is_some());
+ assert_eq!(files[0].for_packages.len(), 1);
+ assert_eq!(files[1].for_packages.len(), 1);
}
#[test]
diff --git a/src/assembly/sibling_merge.rs b/src/assembly/sibling_merge.rs
index 9ae439a87..adb416c9b 100644
--- a/src/assembly/sibling_merge.rs
+++ b/src/assembly/sibling_merge.rs
@@ -461,20 +461,24 @@ fn should_skip_bun_lock_merge(package: &Package, pkg_data: &PackageData) -> bool
}
fn npm_package_identity_matches(package: &Package, pkg_data: &PackageData) -> bool {
- let Some(package_name) = normalized_identity_value(package.name.as_deref()) else {
- return false;
- };
- let Some(package_version) = normalized_identity_value(package.version.as_deref()) else {
- return false;
- };
- let Some(candidate_name) = normalized_identity_value(pkg_data.name.as_deref()) else {
+ if let (Some(package_name), Some(candidate_name)) = (
+ normalized_identity_value(package.name.as_deref()),
+ normalized_identity_value(pkg_data.name.as_deref()),
+ ) && package_name != candidate_name
+ {
return false;
- };
- let Some(candidate_version) = normalized_identity_value(pkg_data.version.as_deref()) else {
+ }
+
+ if let (Some(package_version), Some(candidate_version)) = (
+ normalized_identity_value(package.version.as_deref()),
+ normalized_identity_value(pkg_data.version.as_deref()),
+ ) && package_version != candidate_version
+ {
return false;
- };
+ }
- package_name == candidate_name && package_version == candidate_version
+ normalized_identity_value(package.name.as_deref()).is_some()
+ && normalized_identity_value(pkg_data.name.as_deref()).is_some()
}
fn normalized_identity_value(value: Option<&str>) -> Option<&str> {
diff --git a/src/copyright/detector/author_heuristics/cleanup.rs b/src/copyright/detector/author_heuristics/cleanup.rs
index 400d03b5f..ceb32ecd6 100644
--- a/src/copyright/detector/author_heuristics/cleanup.rs
+++ b/src/copyright/detector/author_heuristics/cleanup.rs
@@ -391,6 +391,86 @@ pub(in super::super) fn drop_json_code_example_authors(
});
}
+pub(in super::super) fn drop_markup_element_value_authors(
+ raw_lines: &[&str],
+ authors: &mut Vec,
+) {
+ if raw_lines.is_empty() || authors.is_empty() {
+ return;
+ }
+
+ authors.retain(|author| {
+ let Some(window) =
+ surrounding_author_window(raw_lines, author.start_line.get(), author.end_line.get())
+ else {
+ return true;
+ };
+ !window_contains_markup_element_author_value(&window, &author.author)
+ });
+}
+
+fn window_contains_markup_element_author_value(window: &str, author: &str) -> bool {
+ let normalized = normalize_whitespace(window);
+ if !(normalized.contains('<') && normalized.contains('>')) {
+ return false;
+ }
+
+ let lower = normalized.to_ascii_lowercase();
+ if lower.contains("copyright") || lower.contains("written by") || lower.contains("created by") {
+ return false;
+ }
+
+ let has_author_element = (lower.contains("")
+ || lower.contains("")
+ || lower.contains(""))
+ && !lower.contains("author=");
+ if has_author_element {
+ return true;
+ }
+
+ if has_author_element
+ && (lower.contains("")
+ || lower.contains("")
+ || lower.contains("")
+ || lower.contains(""))
+ {
+ return true;
+ }
+
+ if lower.contains("xml:lang") && author.trim().starts_with("XmlLang ") {
+ return true;
+ }
+
+ let trimmed_author = author.trim();
+ if trimmed_author.is_empty() {
+ return false;
+ }
+
+ let escaped = regex::escape(trimmed_author);
+ let exact_tag_re = Regex::new(&format!(
+ r"(?is)<(?:name|title|id|email|uri|updated|first-name|last-name|first\.name|last\.name|firstname|surname)\b[^>]*>\s*{}\s*",
+ escaped
+ ))
+ .expect("valid exact markup data tag regex");
+ if exact_tag_re.is_match(&normalized) {
+ return true;
+ }
+
+ let looks_like_identifier = trimmed_author.contains('@')
+ || trimmed_author.contains("http://")
+ || trimmed_author.contains("https://")
+ || trimmed_author.to_ascii_lowercase().starts_with("doi:")
+ || trimmed_author.to_ascii_lowercase().starts_with("tag:")
+ || trimmed_author.contains('T') && trimmed_author.ends_with('Z');
+
+ looks_like_identifier
+ && (lower.contains("")
+ || lower.contains("")
+ || lower.contains("")
+ || lower.contains(""))
+}
+
fn surrounding_author_window(
raw_lines: &[&str],
start_line: usize,
diff --git a/src/copyright/detector/phases/postprocess.rs b/src/copyright/detector/phases/postprocess.rs
index 3d8ae94e9..b71bd37ef 100644
--- a/src/copyright/detector/phases/postprocess.rs
+++ b/src/copyright/detector/phases/postprocess.rs
@@ -277,6 +277,7 @@ fn run_author_extraction_and_repairs(
authors.extend(new_a);
super::author_heuristics::drop_json_code_example_authors(raw_lines, authors);
+ super::author_heuristics::drop_markup_element_value_authors(raw_lines, authors);
seen.rebuild_authors_from(authors);
let mut new_a = super::author_heuristics::extract_name_contributed_authors(prepared_cache);
@@ -286,6 +287,8 @@ fn run_author_extraction_and_repairs(
let mut new_a = super::author_heuristics::extract_comment_author_label_authors(raw_lines);
seen.dedup_new_authors(&mut new_a, 0);
authors.extend(new_a);
+ super::author_heuristics::drop_markup_element_value_authors(raw_lines, authors);
+ seen.rebuild_authors_from(authors);
}
#[allow(clippy::too_many_arguments)]
@@ -597,6 +600,9 @@ fn run_final_variant_and_cleanup_repairs(
super::postprocess_transforms::drop_json_description_metadata_copyrights_and_holders(
raw_lines, copyrights, holders,
);
+ super::postprocess_transforms::drop_markup_declaration_and_versioninfo_copyrights_and_holders(
+ raw_lines, copyrights, holders,
+ );
super::postprocess_transforms::drop_copyright_like_holders(holders);
}
diff --git a/src/copyright/detector/postprocess_transforms/metadata_repairs.rs b/src/copyright/detector/postprocess_transforms/metadata_repairs.rs
index 7d194edb3..db5895bcb 100644
--- a/src/copyright/detector/postprocess_transforms/metadata_repairs.rs
+++ b/src/copyright/detector/postprocess_transforms/metadata_repairs.rs
@@ -75,6 +75,79 @@ pub fn drop_json_description_metadata_copyrights_and_holders(
});
}
+pub fn drop_markup_declaration_and_versioninfo_copyrights_and_holders(
+ raw_lines: &[&str],
+ copyrights: &mut Vec,
+ holders: &mut Vec,
+) {
+ if raw_lines.is_empty() {
+ return;
+ }
+
+ let mut retained_spans: HashSet<(usize, usize)> = HashSet::new();
+ copyrights.retain(|copyright| {
+ let keep = !span_has_markup_declaration_or_versioninfo(
+ raw_lines,
+ copyright.start_line.get(),
+ copyright.end_line.get(),
+ );
+ if keep {
+ retained_spans.insert((copyright.start_line.get(), copyright.end_line.get()));
+ }
+ keep
+ });
+
+ holders.retain(|holder| {
+ if retained_spans.contains(&(holder.start_line.get(), holder.end_line.get())) {
+ return true;
+ }
+
+ !span_has_markup_declaration_or_versioninfo(
+ raw_lines,
+ holder.start_line.get(),
+ holder.end_line.get(),
+ )
+ });
+}
+
+fn span_has_markup_declaration_or_versioninfo(
+ raw_lines: &[&str],
+ start_line: usize,
+ end_line: usize,
+) -> bool {
+ if start_line == 0
+ || end_line == 0
+ || start_line > raw_lines.len()
+ || end_line > raw_lines.len()
+ {
+ return false;
+ }
+
+ let joined = raw_lines[start_line - 1..end_line].join(" ");
+ let lower = joined.to_ascii_lowercase();
+ let has_year =
+ Regex::new(r"(?i)\b(?:19\d{2}|20\d{2})(?:\s*[-–/]\s*(?:19\d{2}|20\d{2}|\d{2}))?\b")
+ .expect("valid year regex")
+ .is_match(&joined);
+
+ ((lower.contains("Some test";
+ let (c, _h, _a) = detect_copyrights_from_text(input);
+ assert!(
+ c.iter().any(|cr| cr.copyright == "Copyright (c) 2003-2014"),
+ "Expected Copyright (c) year range extracted, got: {:?}",
+ c.iter().map(|cr| &cr.copyright).collect::>()
+ );
+}
+
#[test]
fn test_extract_hex_a9_entity_year_range_only_as_bare_c() {
let input = "expectedXml = \"Copyright © 2003-2014
\",";
diff --git a/src/copyright/detector/tests_author_pipeline.rs b/src/copyright/detector/tests_author_pipeline.rs
index 27398654b..7fa029e02 100644
--- a/src/copyright/detector/tests_author_pipeline.rs
+++ b/src/copyright/detector/tests_author_pipeline.rs
@@ -435,6 +435,56 @@ fn test_json_sponsor_description_does_not_create_authors() {
assert!(authors.is_empty(), "authors: {authors:?}");
}
+#[test]
+fn test_xml_author_elements_do_not_create_file_authors() {
+ let input = concat!(
+ "\n",
+ " Joe Bob\n",
+ " Mary Bob\n",
+ "\n",
+ );
+ let (_copyrights, _holders, authors) = detect_copyrights_from_text(input);
+
+ assert!(authors.is_empty(), "authors: {authors:?}");
+}
+
+#[test]
+fn test_atom_feed_identifier_fields_do_not_create_authors() {
+ let input = concat!(
+ "\n",
+ " tag:contoso.com,2000\n",
+ " 2006-04-25T12:12:12Z\n",
+ " jerry@Contoso.com\n",
+ "\n",
+ );
+ let (_copyrights, _holders, authors) = detect_copyrights_from_text(input);
+
+ assert!(authors.is_empty(), "authors: {authors:?}");
+}
+
+#[test]
+fn test_xml_nested_name_tags_inside_author_do_not_create_file_authors() {
+ let input = concat!(
+ "\n",
+ " \n",
+ " Joe\n",
+ " Bob\n",
+ " \n",
+ "\n",
+ );
+ let (_copyrights, _holders, authors) = detect_copyrights_from_text(input);
+
+ assert!(authors.is_empty(), "authors: {authors:?}");
+}
+
+#[test]
+fn test_xml_lang_attribute_value_does_not_create_author() {
+ let input = r#""#;
+ let (_copyrights, _holders, authors) = detect_copyrights_from_text(input);
+
+ assert!(authors.is_empty(), "authors: {authors:?}");
+}
+
#[test]
fn test_written_by_sentence_trims_following_description_clause() {
let input = "JUnit is a regression testing framework written by Erich Gamma and Kent Beck. It is used by the developer who implements unit tests in Java.";
diff --git a/src/copyright/detector/tests_false_positives.rs b/src/copyright/detector/tests_false_positives.rs
index 0e48e667f..41f984722 100644
--- a/src/copyright/detector/tests_false_positives.rs
+++ b/src/copyright/detector/tests_false_positives.rs
@@ -220,6 +220,46 @@ fn test_detect_filters_code_like_c_marker_lines() {
assert!(authors.is_empty(), "authors: {authors:?}");
}
+#[test]
+fn test_windows_versioninfo_line_is_not_detected_as_copyright_or_holder() {
+ let text = "Copyright (c) 2050 VALUE OriginalFilename NativeConsoleApp.exe";
+ let (copyrights, holders, authors) = detect_copyrights_from_text(text);
+
+ assert!(copyrights.is_empty(), "copyrights: {copyrights:?}");
+ assert!(holders.is_empty(), "holders: {holders:?}");
+ assert!(authors.is_empty(), "authors: {authors:?}");
+}
+
+#[test]
+fn test_dtd_declaration_line_is_not_detected_as_copyright_or_holder() {
+ let text = "copyright aaaa";
+ let (copyrights, holders, authors) = detect_copyrights_from_text(text);
+
+ assert!(copyrights.is_empty(), "copyrights: {copyrights:?}");
+ assert!(holders.is_empty(), "holders: {holders:?}");
+ assert!(authors.is_empty(), "authors: {authors:?}");
+}
+
+#[test]
+fn test_copyright_property_access_line_is_not_detected_as_copyright_or_holder() {
+ let text = "Copyright clone.Copyright.Text";
+ let (copyrights, holders, authors) = detect_copyrights_from_text(text);
+
+ assert!(copyrights.is_empty(), "copyrights: {copyrights:?}");
+ assert!(holders.is_empty(), "holders: {holders:?}");
+ assert!(authors.is_empty(), "authors: {authors:?}");
+}
+
+#[test]
+fn test_unicode_escape_c_marker_line_is_not_detected_as_copyright_or_holder() {
+ let text = "(c) HeaderType.Content u00AD u00AE";
+ let (copyrights, holders, authors) = detect_copyrights_from_text(text);
+
+ assert!(copyrights.is_empty(), "copyrights: {copyrights:?}");
+ assert!(holders.is_empty(), "holders: {holders:?}");
+ assert!(authors.is_empty(), "authors: {authors:?}");
+}
+
#[test]
fn test_detect_copyright_does_not_absorb_unexpected_as_represented() {
let text = "Copyright 1993 United States Government as represented by the\nDirector, National Security Agency.";
diff --git a/src/copyright/refiner/author.rs b/src/copyright/refiner/author.rs
index b7fad3a2b..f00907c33 100644
--- a/src/copyright/refiner/author.rs
+++ b/src/copyright/refiner/author.rs
@@ -100,6 +100,14 @@ fn looks_like_prose_fragment_author(s: &str) -> bool {
return true;
}
+ if looks_like_markup_data_identifier(trimmed) {
+ return true;
+ }
+
+ if looks_like_markup_attribute_label_value(trimmed) {
+ return true;
+ }
+
if contains_standalone_at_prefixed_token(trimmed) {
return true;
}
@@ -222,7 +230,48 @@ fn looks_like_prose_fragment_author(s: &str) -> bool {
fn contains_windows_versioninfo_fragment(s: &str) -> bool {
let trimmed = s.trim();
trimmed.starts_with("VALUE ")
- && (trimmed.contains("FileDescription") || trimmed.contains("FileVersion"))
+ && (trimmed.contains("FileDescription")
+ || trimmed.contains("FileVersion")
+ || trimmed.contains("OriginalFilename")
+ || trimmed.contains("ProductVersion")
+ || trimmed.contains("LegalTrademarks"))
+}
+
+fn looks_like_markup_data_identifier(s: &str) -> bool {
+ static DOI_RE: LazyLock =
+ LazyLock::new(|| Regex::new(r"(?i)^doi:[^\s]+$").expect("valid doi regex"));
+ static TAG_URI_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(r"(?i)^tag:[^,\s]+,\d{4}(?::[^\s]+)?$").expect("valid tag uri regex")
+ });
+ static RELATIVE_ID_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(r"(?i)^(?:id|urn|uuid)/[\p{L}0-9._~:/?#\[\]@!$&'()*+,;=-]+$")
+ .expect("valid relative id regex")
+ });
+ static NAME_WITH_TIMESTAMP_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(
+ r"(?i)^[\p{Lu}][\p{L}'._-]+(?:\s+[\p{Lu}][\p{L}'._-]+){0,3}\s+\d{4}-\d{2}-\d{2}t\d{2}:\d{2}:\d{2}z$",
+ )
+ .expect("valid name timestamp regex")
+ });
+ static DUPLICATED_AUTHOR_WORD_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(r"(?i)^(?:author|name){2,}$").expect("valid duplicated author word regex")
+ });
+
+ let trimmed = s.trim();
+ DOI_RE.is_match(trimmed)
+ || TAG_URI_RE.is_match(trimmed)
+ || RELATIVE_ID_RE.is_match(trimmed)
+ || NAME_WITH_TIMESTAMP_RE.is_match(trimmed)
+ || DUPLICATED_AUTHOR_WORD_RE.is_match(trimmed)
+}
+
+fn looks_like_markup_attribute_label_value(s: &str) -> bool {
+ static MARKUP_ATTRIBUTE_LABEL_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(r"(?i)^(?:xmllang|xml:lang|xmlns(?::[a-z0-9_.-]+)?)\s+\S+$")
+ .expect("valid markup attribute label regex")
+ });
+
+ MARKUP_ATTRIBUTE_LABEL_RE.is_match(s.trim())
}
fn looks_like_file_reference_note_author(s: &str) -> bool {
diff --git a/src/copyright/refiner/mod.rs b/src/copyright/refiner/mod.rs
index 88472499f..cfbd5b763 100644
--- a/src/copyright/refiner/mod.rs
+++ b/src/copyright/refiner/mod.rs
@@ -473,6 +473,7 @@ fn is_junk_c_sign_path_fragment(s: &str) -> bool {
fn is_junk_copyright_code_fragment(s: &str) -> bool {
let trimmed = s.trim();
let lower = trimmed.to_ascii_lowercase();
+ let has_windows_versioninfo_markers = contains_windows_versioninfo_token(trimmed);
let has_code_markers = lower.contains("string?")
|| lower.contains("bool")
|| lower.contains("final ")
@@ -489,16 +490,30 @@ fn is_junk_copyright_code_fragment(s: &str) -> bool {
|| lower.contains("$template")
|| lower.contains("icondata")
|| lower.contains("static const")
+ || lower.contains("public void")
+ || lower.contains("get set")
+ || lower.contains("assert.equal")
|| lower.contains("classifiers")
|| lower.contains("authors.append")
|| lower == "copyright void"
|| trimmed.contains("??")
+ || contains_member_access_code_token(trimmed)
+ || contains_unicode_escape_token_run(trimmed)
+ || contains_xml_markup_declaration_token(trimmed)
|| contains_regex_or_template_marker(trimmed)
+ || has_windows_versioninfo_markers
|| contains_generated_resource_token(trimmed)
|| contains_malformed_spaced_year(trimmed);
let has_prose_markers = is_obvious_prose_fragment(trimmed);
+ if has_windows_versioninfo_markers {
+ return true;
+ }
+
if !lower.starts_with("copyright") {
+ if lower.starts_with("(c)") && (has_code_markers || has_prose_markers) {
+ return !has_copyright_year(trimmed);
+ }
return (lower.starts_with("not copyrighted") && !has_copyright_year(trimmed))
|| (lower.contains("copyright") && (has_code_markers || has_prose_markers));
}
@@ -517,6 +532,7 @@ pub(crate) fn is_junk_holder(s: &str) -> bool {
fn is_junk_holder_code_fragment(s: &str) -> bool {
let trimmed = s.trim();
let lower = trimmed.to_ascii_lowercase();
+ let has_windows_versioninfo_markers = contains_windows_versioninfo_token(trimmed);
let has_code_markers = lower.contains("string?")
|| lower.contains("bool")
|| lower.contains("final ")
@@ -529,13 +545,21 @@ fn is_junk_holder_code_fragment(s: &str) -> bool {
|| lower.contains("$template")
|| lower.contains("::")
|| lower.contains("static const")
+ || lower.contains("public void")
+ || lower.contains("get set")
+ || lower.contains("assert.equal")
|| lower.contains("icondata")
|| lower.contains("authors.append")
+ || contains_member_access_code_token(trimmed)
+ || contains_unicode_escape_token_run(trimmed)
+ || contains_xml_markup_declaration_token(trimmed)
|| contains_regex_or_template_marker(trimmed)
+ || has_windows_versioninfo_markers
|| contains_generated_resource_token(trimmed);
let has_prose_markers = is_obvious_prose_fragment(trimmed);
- (has_code_markers || has_prose_markers) && !has_copyright_year(trimmed)
+ has_windows_versioninfo_markers
+ || ((has_code_markers || has_prose_markers) && !has_copyright_year(trimmed))
}
fn is_junk_holder_symbol_garbage(s: &str) -> bool {
@@ -621,6 +645,64 @@ fn contains_generated_resource_token(s: &str) -> bool {
ASSET_RE.is_match(trimmed)
}
+fn contains_member_access_code_token(s: &str) -> bool {
+ let trimmed = s.trim();
+ let lower = trimmed.to_ascii_lowercase();
+ if lower.contains("http://") || lower.contains("https://") || lower.contains("www.") {
+ return false;
+ }
+
+ static MEMBER_ACCESS_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(r"\b(?:[a-z_][A-Za-z0-9_]{1,}\.){1,4}[A-Z][A-Za-z0-9_]{1,}(?:\.[A-Z][A-Za-z0-9_]{1,})*\b").unwrap()
+ });
+
+ MEMBER_ACCESS_RE.is_match(trimmed)
+}
+
+fn is_post_refine_copyright_code_fragment(s: &str) -> bool {
+ let trimmed = s.trim();
+ let lower = trimmed.to_ascii_lowercase();
+
+ contains_windows_versioninfo_token(trimmed)
+ || contains_member_access_code_token(trimmed)
+ || contains_unicode_escape_token_run(trimmed)
+ || lower.contains("public void")
+ || lower.contains("get set")
+ || lower.contains("assert.equal")
+}
+
+fn contains_unicode_escape_token_run(s: &str) -> bool {
+ static UNICODE_ESCAPE_RE: LazyLock =
+ LazyLock::new(|| Regex::new(r"(?i)\bu[0-9a-f]{4}\b").unwrap());
+
+ UNICODE_ESCAPE_RE.is_match(s.trim())
+}
+
+fn contains_windows_versioninfo_token(s: &str) -> bool {
+ static VERSIONINFO_KEY_RE: LazyLock = LazyLock::new(|| {
+ Regex::new(
+ r"(?i)\b(?:VALUE\s+)?(?:OriginalFilename|FileDescription|FileVersion|ProductVersion|LegalTrademarks|ProductName|InternalName|CompanyName)\b",
+ )
+ .unwrap()
+ });
+ static VERSIONINFO_FILE_RE: LazyLock =
+ LazyLock::new(|| Regex::new(r"(?i)\b[\p{L}0-9_.-]+\.(?:exe|dll|mui|ocx|sys)\b").unwrap());
+
+ let trimmed = s.trim();
+ VERSIONINFO_KEY_RE.is_match(trimmed)
+ && (trimmed.contains("VALUE ")
+ || VERSIONINFO_FILE_RE.is_match(trimmed)
+ || trimmed.to_ascii_lowercase().contains("legaltrademarks"))
+}
+
+fn contains_xml_markup_declaration_token(s: &str) -> bool {
+ let lower = s.to_ascii_lowercase();
+ lower.contains(" bool {
static SPACED_YEAR_RE: LazyLock =
LazyLock::new(|| Regex::new(r"\b(?:19|20)\s+\d{2}\b|\b\d{3}\s+\d{1,2}\b").unwrap());
@@ -702,6 +784,11 @@ pub fn refine_copyright(s: &str) -> Option {
return None;
}
let original = normalize_whitespace(s);
+ if contains_windows_versioninfo_token(&original)
+ || (contains_xml_markup_declaration_token(&original) && !has_copyright_year(&original))
+ {
+ return None;
+ }
let mut c = original.clone();
c = strip_trailing_quote_before_email(&c);
c = normalize_b_dot_angle_emails(&c);
@@ -808,7 +895,8 @@ pub fn refine_copyright(s: &str) -> Option {
{
return None;
}
- if is_junk_copyright_of_header(&result)
+ if is_post_refine_copyright_code_fragment(&result)
+ || is_junk_copyright_of_header(&result)
|| is_junk_copyrighted_works_header(&result)
|| is_junk_copyrighted_software_phrase(&result)
{
diff --git a/src/copyright/refiner/tests.rs b/src/copyright/refiner/tests.rs
index ff416eff3..922a6258e 100644
--- a/src/copyright/refiner/tests.rs
+++ b/src/copyright/refiner/tests.rs
@@ -1373,6 +1373,46 @@ fn test_refine_author_drops_generated_resource_identifiers() {
assert_eq!(refine_author("icon-app-20x20@2x.png.img.tmpl"), None);
}
+#[test]
+fn test_refine_author_drops_markup_feed_identifiers() {
+ assert_eq!(refine_author("doi:10.1038/nature05582"), None);
+ assert_eq!(refine_author("tag:contoso.com,2000"), None);
+ assert_eq!(refine_author("id/1234"), None);
+ assert_eq!(refine_author("James 2006-04-25T12:12:12Z"), None);
+ assert_eq!(refine_author("authorauthor"), None);
+ assert_eq!(refine_author("XmlLang en-usabcd"), None);
+}
+
+#[test]
+fn test_refine_copyright_drops_versioninfo_and_dtd_junk() {
+ assert_eq!(
+ refine_copyright("Copyright (c) 2050 VALUE OriginalFilename NativeConsoleApp.exe"),
+ None
+ );
+ assert_eq!(
+ refine_copyright("copyright aaaa"),
+ None
+ );
+ assert_eq!(refine_copyright("Copyright get set"), None);
+ assert_eq!(refine_copyright("copyright public void"), None);
+ assert_eq!(refine_copyright("Copyright clone.Copyright.Text"), None);
+ assert_eq!(
+ refine_copyright("Copyright HeaderType.Content u00AD u00AE"),
+ None
+ );
+}
+
+#[test]
+fn test_refine_holder_drops_versioninfo_and_dtd_junk() {
+ assert_eq!(
+ refine_holder("VALUE OriginalFilename NativeConsoleApp.exe"),
+ None
+ );
+ assert_eq!(refine_holder("PCDATA"), None);
+ assert_eq!(refine_holder("clone.Copyright.Text"), None);
+ assert_eq!(refine_holder("HeaderType.Content u00AD u00AE"), None);
+}
+
#[test]
fn test_refine_author_drops_template_token_runs_and_numeric_fragments() {
assert_eq!(refine_author("AUTH CONTRIBUTORS AUTHS+ + 2660"), None);
diff --git a/src/parsers/autotools.rs b/src/parsers/autotools.rs
index 6dd25e84e..815a1407f 100644
--- a/src/parsers/autotools.rs
+++ b/src/parsers/autotools.rs
@@ -35,6 +35,7 @@ const AUTOCONF_CONFIGURE_MARKERS: &[&str] = &[
"generated by gnu autoconf",
"generated automatically using autoconf",
"please tell bug-autoconf@gnu.org",
+ "configure script for ",
];
fn looks_like_autoconf_generated_configure(path: &Path) -> bool {
diff --git a/src/parsers/autotools_golden_test.rs b/src/parsers/autotools_golden_test.rs
index 9515939cd..fdbd09736 100644
--- a/src/parsers/autotools_golden_test.rs
+++ b/src/parsers/autotools_golden_test.rs
@@ -32,4 +32,12 @@ mod golden_tests {
"testdata/autotools/another-project/configure.ac.expected.json",
);
}
+
+ #[test]
+ fn test_golden_autotools_zlib_style_configure() {
+ run_golden(
+ "testdata/autotools/zlib-ng/configure",
+ "testdata/autotools/zlib-ng/configure.expected.json",
+ );
+ }
}
diff --git a/src/parsers/autotools_test.rs b/src/parsers/autotools_test.rs
index ffc1cc728..31cace453 100644
--- a/src/parsers/autotools_test.rs
+++ b/src/parsers/autotools_test.rs
@@ -21,6 +21,11 @@ fn test_is_match() {
"testdata/autotools/another-project/configure.ac"
)));
+ // Should match zlib-style generated configure header
+ assert!(AutotoolsConfigureParser::is_match(&PathBuf::from(
+ "testdata/autotools/zlib-ng/configure"
+ )));
+
// Should NOT match a custom non-Autoconf configure script
assert!(!AutotoolsConfigureParser::is_match(&PathBuf::from(
"testdata/autotools/non-autoconf-configure/configure"
@@ -86,3 +91,13 @@ fn test_nested_path() {
Some("pkg:autotools/my-awesome-project")
);
}
+
+#[test]
+fn test_zlib_style_configure_header() {
+ let path = PathBuf::from("testdata/autotools/zlib-ng/configure");
+ let package_data = AutotoolsConfigureParser::extract_first_package(&path);
+
+ assert_eq!(package_data.package_type, Some(PackageType::Autotools));
+ assert_eq!(package_data.name, Some("zlib-ng".to_string()));
+ assert_eq!(package_data.purl.as_deref(), Some("pkg:autotools/zlib-ng"));
+}
diff --git a/testdata/autotools/zlib-ng/configure b/testdata/autotools/zlib-ng/configure
new file mode 100644
index 000000000..99f558ed5
--- /dev/null
+++ b/testdata/autotools/zlib-ng/configure
@@ -0,0 +1,4 @@
+#!/bin/sh
+# configure script for zlib.
+#
+# Normally configure builds both a static and a shared library.
diff --git a/testdata/autotools/zlib-ng/configure.expected.json b/testdata/autotools/zlib-ng/configure.expected.json
new file mode 100644
index 000000000..eec0ac8db
--- /dev/null
+++ b/testdata/autotools/zlib-ng/configure.expected.json
@@ -0,0 +1,9 @@
+[
+ {
+ "type": "autotools",
+ "name": "zlib-ng",
+ "purl": "pkg:autotools/zlib-ng",
+ "parties": [],
+ "datasource_id": "autotools_configure"
+ }
+]