9 changes: 8 additions & 1 deletion docs/BENCHMARKS.md
@@ -12,7 +12,7 @@ The chart below uses a log-log scatter plot: file count on the x-axis, wall-cloc

![Scan duration vs. file count for Provenant and ScanCode](benchmarks/scan-duration-vs-files.svg)

> Provenant is faster on 180 of 180 recorded runs, with an **11.7× median speedup** and **11.0× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **18.6×** on 10k+ file targets.
> Provenant is faster on 181 of 181 recorded runs, with an **11.7× median speedup** and **11.1× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.1×** on 10k+ file targets.
> Generated from the benchmark timing rows in this document via `cargo run --manifest-path xtask/Cargo.toml --bin generate-benchmark-chart`.

## Current benchmark examples
@@ -914,6 +914,13 @@ The quick index below links to benchmark sections. Each benchmark entry then rec
- Timing: Provenant `38.15s`; ScanCode `379.55s`
- Broader .NET/NuGet package and dependency extraction (`105` vs `3` packages, `145` vs `33` dependencies) from many `*.csproj` files plus `Directory.Packages.props` and `Directory.Build.props` across samples, tooling, and test projects, with zero scan errors where ScanCode trips on `TwitterColorEmoji-SVGinOT.ttf`

##### [dotnet/runtime @ d1163e5](https://github.com/dotnet/runtime/tree/d1163e5a8f3f3aaa374993e8b5805911689aba28) — **31.49× faster**

- Files: 57,611
- Run context: 2026-04-29 · runtime-99690 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc
- Timing: Provenant `299.53s`; ScanCode `9432.77s`
- Broader .NET/NuGet and sibling npm package visibility (`2249` vs `5` packages, `983` vs `503` dependencies) across many `*.csproj` files, `Directory.Packages.props`, `Directory.Build.props`, and committed `package-lock.json` inputs, with zero scan-file timeouts where ScanCode aborts on `EncryptedXmlSample4.xml`

##### [microsoft/onnxruntime @ 97e0a00](https://github.com/microsoft/onnxruntime/tree/97e0a001d43f8783db4507c9b2ac3731dc95a1ed) — **23.89× faster**

- Files: 9,802
6 changes: 6 additions & 0 deletions docs/benchmarks/scan-duration-vs-files.svg
76 changes: 72 additions & 4 deletions src/assembly/assembly_test.rs
@@ -1738,7 +1738,7 @@ mod tests {
}

#[test]
fn test_assemble_npm_package_json_skips_lockfile_with_missing_identity() {
fn test_assemble_npm_package_json_merges_lockfile_with_missing_version_when_name_matches() {
let mut files = vec![
create_test_file_info(
"project/package.json",
@@ -1775,10 +1775,78 @@
assert_eq!(result.packages[0].version, Some("1.0.0".to_string()));
assert_eq!(
result.packages[0].datafile_paths,
vec!["project/package.json".to_string()]
vec![
"project/package-lock.json".to_string(),
"project/package.json".to_string()
]
);
assert!(result.dependencies.is_empty());
assert!(files[1].for_packages.is_empty());
assert_eq!(result.dependencies.len(), 1);
assert_eq!(
result.dependencies[0].purl.as_deref(),
Some("pkg:npm/left-pad@1.3.0")
);
assert_eq!(
result.dependencies[0].datafile_path,
"project/package-lock.json"
);
assert_eq!(files[0].for_packages.len(), 1);
assert_eq!(files[1].for_packages.len(), 1);
}

#[test]
fn test_assemble_npm_package_json_and_lockfile_merge_when_both_omit_version() {
let mut manifest = create_test_file_info(
"project/package.json",
DatasourceId::NpmPackageJson,
None,
Some("my-app"),
None,
vec![],
);
manifest.package_data[0].package_type = Some(PackageType::Npm);

let mut lockfile = create_test_file_info(
"project/package-lock.json",
DatasourceId::NpmPackageLockJson,
None,
Some("my-app"),
None,
vec![Dependency {
purl: Some("pkg:npm/left-pad@1.3.0".to_string()),
extracted_requirement: Some("1.3.0".to_string()),
scope: Some("dependencies".to_string()),
is_runtime: Some(true),
is_optional: Some(false),
is_pinned: Some(true),
is_direct: Some(false),
resolved_package: None,
extra_data: None,
}],
);
lockfile.package_data[0].package_type = Some(PackageType::Npm);

let mut files = vec![manifest, lockfile];

let result = assemble(&mut files);

assert_eq!(result.packages.len(), 1);
assert_eq!(result.packages[0].name, Some("my-app".to_string()));
assert_eq!(result.packages[0].version, None);
assert_eq!(
result.packages[0].datafile_paths,
vec![
"project/package-lock.json".to_string(),
"project/package.json".to_string()
]
);
assert_eq!(result.dependencies.len(), 1);
assert_eq!(
result.dependencies[0].purl.as_deref(),
Some("pkg:npm/left-pad@1.3.0")
);
assert!(result.dependencies[0].for_package_uid.is_some());
assert_eq!(files[0].for_packages.len(), 1);
assert_eq!(files[1].for_packages.len(), 1);
}

#[test]
26 changes: 15 additions & 11 deletions src/assembly/sibling_merge.rs
@@ -461,20 +461,24 @@ fn should_skip_bun_lock_merge(package: &Package, pkg_data: &PackageData) -> bool
}

fn npm_package_identity_matches(package: &Package, pkg_data: &PackageData) -> bool {
let Some(package_name) = normalized_identity_value(package.name.as_deref()) else {
return false;
};
let Some(package_version) = normalized_identity_value(package.version.as_deref()) else {
return false;
};
let Some(candidate_name) = normalized_identity_value(pkg_data.name.as_deref()) else {
if let (Some(package_name), Some(candidate_name)) = (
normalized_identity_value(package.name.as_deref()),
normalized_identity_value(pkg_data.name.as_deref()),
) && package_name != candidate_name
{
return false;
};
let Some(candidate_version) = normalized_identity_value(pkg_data.version.as_deref()) else {
}

if let (Some(package_version), Some(candidate_version)) = (
normalized_identity_value(package.version.as_deref()),
normalized_identity_value(pkg_data.version.as_deref()),
) && package_version != candidate_version
{
return false;
};
}

package_name == candidate_name && package_version == candidate_version
normalized_identity_value(package.name.as_deref()).is_some()
&& normalized_identity_value(pkg_data.name.as_deref()).is_some()
}

fn normalized_identity_value(value: Option<&str>) -> Option<&str> {
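The reworked rule above only rejects a sibling datafile when a field that both sides actually provide disagrees, and it still requires a name on both sides. A minimal standalone sketch of that relaxed check, written over plain `(name, version)` pairs rather than the crate's `Package`/`PackageData` types (illustrative only; the real `npm_package_identity_matches` also normalizes values through `normalized_identity_value` first):

```rust
// Illustrative sketch of the relaxed npm identity rule, not the crate's code.
fn identities_compatible(
    pkg: (Option<&str>, Option<&str>),       // (name, version) already on the package
    candidate: (Option<&str>, Option<&str>), // (name, version) from the sibling datafile
) -> bool {
    // A mismatch on any field that both sides provide blocks the merge.
    if let (Some(a), Some(b)) = (pkg.0, candidate.0) {
        if a != b {
            return false;
        }
    }
    if let (Some(a), Some(b)) = (pkg.1, candidate.1) {
        if a != b {
            return false;
        }
    }
    // Otherwise both sides must at least name the package; a missing version
    // no longer prevents a package.json/package-lock.json merge.
    pkg.0.is_some() && candidate.0.is_some()
}

fn main() {
    // Lockfile omits the version: still merges with the manifest.
    assert!(identities_compatible(
        (Some("my-app"), Some("1.0.0")),
        (Some("my-app"), None)
    ));
    // Conflicting versions still block the merge.
    assert!(!identities_compatible(
        (Some("my-app"), Some("1.0.0")),
        (Some("my-app"), Some("2.0.0"))
    ));
    // A missing name on either side blocks the merge.
    assert!(!identities_compatible((None, None), (Some("my-app"), Some("1.0.0"))));
}
```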
80 changes: 80 additions & 0 deletions src/copyright/detector/author_heuristics/cleanup.rs
@@ -391,6 +391,86 @@ pub(in super::super) fn drop_json_code_example_authors(
});
}

pub(in super::super) fn drop_markup_element_value_authors(
raw_lines: &[&str],
authors: &mut Vec<AuthorDetection>,
) {
if raw_lines.is_empty() || authors.is_empty() {
return;
}

authors.retain(|author| {
let Some(window) =
surrounding_author_window(raw_lines, author.start_line.get(), author.end_line.get())
else {
return true;
};
!window_contains_markup_element_author_value(&window, &author.author)
});
}

fn window_contains_markup_element_author_value(window: &str, author: &str) -> bool {
let normalized = normalize_whitespace(window);
if !(normalized.contains('<') && normalized.contains('>')) {
return false;
}

let lower = normalized.to_ascii_lowercase();
if lower.contains("copyright") || lower.contains("written by") || lower.contains("created by") {
return false;
}

let has_author_element = (lower.contains("<author>")
|| lower.contains("</author>")
|| lower.contains("<author ")
|| lower.contains(":author>"))
&& !lower.contains("author=");
if has_author_element {
return true;
}

if has_author_element
&& (lower.contains("<first-name>")
|| lower.contains("<last-name>")
|| lower.contains("<first.name>")
|| lower.contains("<last.name>"))
{
return true;
}

if lower.contains("xml:lang") && author.trim().starts_with("XmlLang ") {
return true;
}

let trimmed_author = author.trim();
if trimmed_author.is_empty() {
return false;
}

let escaped = regex::escape(trimmed_author);
let exact_tag_re = Regex::new(&format!(
r"(?is)<(?:name|title|id|email|uri|updated|first-name|last-name|first\.name|last\.name|firstname|surname)\b[^>]*>\s*{}\s*</",
escaped
))
.expect("valid exact markup data tag regex");
if exact_tag_re.is_match(&normalized) {
return true;
}

let looks_like_identifier = trimmed_author.contains('@')
|| trimmed_author.contains("http://")
|| trimmed_author.contains("https://")
|| trimmed_author.to_ascii_lowercase().starts_with("doi:")
|| trimmed_author.to_ascii_lowercase().starts_with("tag:")
|| trimmed_author.contains('T') && trimmed_author.ends_with('Z');

looks_like_identifier
&& (lower.contains("<id>")
|| lower.contains("<email>")
|| lower.contains("<uri>")
|| lower.contains("<updated>"))
}

fn surrounding_author_window(
raw_lines: &[&str],
start_line: usize,
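As a rough illustration of what `window_contains_markup_element_author_value` screens for, here is a minimal standalone sketch of its core `<author>`-element signal (simplified to that one check; the real function also handles `first-name`/`last-name` tags, `xml:lang` artifacts, exact-value data tags such as `<id>`/`<email>`, and identifier-shaped values):

```rust
// Minimal sketch, not the crate's code: treat a name as markup metadata when
// it sits inside an <author> element and the window has no credit wording.
fn looks_like_markup_author_value(window: &str) -> bool {
    let lower = window.to_ascii_lowercase();
    // Credit wording means the name is a real attribution, so keep it.
    if lower.contains("copyright") || lower.contains("written by") || lower.contains("created by") {
        return false;
    }
    // An <author> element (but not an author="..." attribute) marks the value
    // as structured document metadata rather than an authorship statement.
    (lower.contains("<author>") || lower.contains("</author>") || lower.contains("<author "))
        && !lower.contains("author=")
}

fn main() {
    // Atom-style feed metadata: a detection for "Jane Doe" would be dropped.
    assert!(looks_like_markup_author_value(
        "<entry><author><name>Jane Doe</name></author></entry>"
    ));
    // A copyright comment keeps its author.
    assert!(!looks_like_markup_author_value("<!-- Copyright (c) 2024 Jane Doe -->"));
    // An author="..." attribute is not treated as an element value here.
    assert!(!looks_like_markup_author_value(r#"<meta author="Jane Doe">"#));
}
```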
6 changes: 6 additions & 0 deletions src/copyright/detector/phases/postprocess.rs
@@ -277,6 +277,7 @@ fn run_author_extraction_and_repairs(
authors.extend(new_a);

super::author_heuristics::drop_json_code_example_authors(raw_lines, authors);
super::author_heuristics::drop_markup_element_value_authors(raw_lines, authors);
seen.rebuild_authors_from(authors);

let mut new_a = super::author_heuristics::extract_name_contributed_authors(prepared_cache);
@@ -286,6 +287,8 @@ fn run_author_extraction_and_repairs(
let mut new_a = super::author_heuristics::extract_comment_author_label_authors(raw_lines);
seen.dedup_new_authors(&mut new_a, 0);
authors.extend(new_a);
super::author_heuristics::drop_markup_element_value_authors(raw_lines, authors);
seen.rebuild_authors_from(authors);
}

#[allow(clippy::too_many_arguments)]
@@ -597,6 +600,9 @@ fn run_final_variant_and_cleanup_repairs(
super::postprocess_transforms::drop_json_description_metadata_copyrights_and_holders(
raw_lines, copyrights, holders,
);
super::postprocess_transforms::drop_markup_declaration_and_versioninfo_copyrights_and_holders(
raw_lines, copyrights, holders,
);
super::postprocess_transforms::drop_copyright_like_holders(holders);
}

73 changes: 73 additions & 0 deletions src/copyright/detector/postprocess_transforms/metadata_repairs.rs
@@ -75,6 +75,79 @@ pub fn drop_json_description_metadata_copyrights_and_holders(
});
}

pub fn drop_markup_declaration_and_versioninfo_copyrights_and_holders(
raw_lines: &[&str],
copyrights: &mut Vec<CopyrightDetection>,
holders: &mut Vec<HolderDetection>,
) {
if raw_lines.is_empty() {
return;
}

let mut retained_spans: HashSet<(usize, usize)> = HashSet::new();
copyrights.retain(|copyright| {
let keep = !span_has_markup_declaration_or_versioninfo(
raw_lines,
copyright.start_line.get(),
copyright.end_line.get(),
);
if keep {
retained_spans.insert((copyright.start_line.get(), copyright.end_line.get()));
}
keep
});

holders.retain(|holder| {
if retained_spans.contains(&(holder.start_line.get(), holder.end_line.get())) {
return true;
}

!span_has_markup_declaration_or_versioninfo(
raw_lines,
holder.start_line.get(),
holder.end_line.get(),
)
});
}

fn span_has_markup_declaration_or_versioninfo(
raw_lines: &[&str],
start_line: usize,
end_line: usize,
) -> bool {
if start_line == 0
|| end_line == 0
|| start_line > raw_lines.len()
|| end_line > raw_lines.len()
{
return false;
}

let joined = raw_lines[start_line - 1..end_line].join(" ");
let lower = joined.to_ascii_lowercase();
let has_year =
Regex::new(r"(?i)\b(?:19\d{2}|20\d{2})(?:\s*[-–/]\s*(?:19\d{2}|20\d{2}|\d{2}))?\b")
.expect("valid year regex")
.is_match(&joined);

((lower.contains("<!element")
|| lower.contains("<!attlist")
|| lower.contains("<!doctype")
|| lower.contains("pcdata"))
&& !has_year)
|| ((lower.contains("originalfilename")
|| lower.contains("filedescription")
|| lower.contains("fileversion")
|| lower.contains("productversion")
|| lower.contains("legaltrademarks"))
&& (lower.contains("value ")
|| lower.contains(".exe")
|| lower.contains(".dll")
|| lower.contains(".mui")
|| lower.contains(".ocx")
|| lower.contains(".sys")))
}

pub fn json_window_for_span(
raw_lines: &[&str],
start_line: usize,
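A rough standalone sketch of the two signals `span_has_markup_declaration_or_versioninfo` keys on (simplified to plain substring checks; the real function also suppresses the declaration branch when the span contains a four-digit year and lets the VERSIONINFO branch fire on additional module suffixes such as `.mui`, `.ocx`, and `.sys`):

```rust
// Minimal sketch, not the crate's code: the two families of false positives
// the new filter removes from copyright/holder detections.
fn looks_like_dtd_declaration(span: &str) -> bool {
    let lower = span.to_ascii_lowercase();
    // SGML/XML DTD machinery (<!ELEMENT ...>, <!ATTLIST ...>, #PCDATA) often
    // carries holder-shaped tokens but is never a real copyright statement.
    lower.contains("<!element")
        || lower.contains("<!attlist")
        || lower.contains("<!doctype")
        || lower.contains("pcdata")
}

fn looks_like_versioninfo_resource(span: &str) -> bool {
    let lower = span.to_ascii_lowercase();
    // Windows VERSIONINFO resources pair fields like OriginalFilename and
    // FileDescription with module names, not with genuine notices.
    (lower.contains("originalfilename")
        || lower.contains("filedescription")
        || lower.contains("fileversion"))
        && (lower.contains("value ") || lower.contains(".exe") || lower.contains(".dll"))
}

fn main() {
    assert!(looks_like_dtd_declaration("<!ELEMENT author (#PCDATA)>"));
    assert!(looks_like_versioninfo_resource(
        "VALUE \"OriginalFilename\", \"tool.exe\""
    ));
    // An ordinary notice matches neither signal and would be kept.
    assert!(!looks_like_dtd_declaration("Copyright (c) 2024 Example Corp"));
    assert!(!looks_like_versioninfo_resource("Copyright (c) 2024 Example Corp"));
}
```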
11 changes: 11 additions & 0 deletions src/copyright/detector/tests.rs
@@ -937,6 +937,17 @@ fn test_extract_copy_entity_year_range_only() {
);
}

#[test]
fn test_extract_copy_entity_year_range_from_html_fixture_line() {
let input = "<!doctype html><html><head><title>Some test</title></head><body><footer><p>Copyright &copy; 2003-2014</p></footer></body></html>";
let (c, _h, _a) = detect_copyrights_from_text(input);
assert!(
c.iter().any(|cr| cr.copyright == "Copyright (c) 2003-2014"),
"Expected Copyright (c) year range extracted, got: {:?}",
c.iter().map(|cr| &cr.copyright).collect::<Vec<_>>()
);
}

#[test]
fn test_extract_hex_a9_entity_year_range_only_as_bare_c() {
let input = "expectedXml = \"<p>Copyright &#xA9; 2003-2014</p>\",";