Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion docs/BENCHMARKS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ The chart below uses a log-log scatter plot: file count on the x-axis, wall-cloc

![Scan duration vs. file count for Provenant and ScanCode](scan-duration-vs-files.svg)

> Provenant is faster on 190 of 190 recorded runs, with a **12.0× median speedup** and **11.2× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.7×** on 10k+ file targets.
> Provenant is faster on 191 of 191 recorded runs, with a **12.1× median speedup** and **11.2× geometric-mean speedup** overall; the median gap grows from **7.1×** on sub-100-file targets to **19.7×** on 10k+ file targets.
> Generated from the benchmark timing rows in this document via `cargo run --manifest-path xtask/Cargo.toml --bin generate-benchmark-chart`.

## Current benchmark examples
Expand Down Expand Up @@ -819,6 +819,13 @@ The quick index below links to benchmark sections. Each benchmark entry then rec
- Timing: Provenant `290.44s`; ScanCode `5927.08s`
- Broader Bazel and mixed-tree dependency extraction (`8202` vs `8056` packages, `1465` vs `700` dependencies) from root and vendored `MODULE.bazel`, many committed `BUILD` files, Python lockfiles, Dockerfiles, and Debian control metadata, plus direct `CITATION.cff` package visibility

##### [PX4/eigen @ 7cf1c01](https://github.com/PX4/eigen/tree/7cf1c0179eb0f5499dfc1bffbd229783a7865fe1) — **19.96× faster**

- Files: 1,672
- Run context: 2026-05-04 · eigen-62479 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc
- Timing: Provenant `16.12s`; ScanCode `321.68s`
- Cleaner copyright, holder, and author recovery on this manifest-free native source tree, with structured `Copyright Notice (...)` extraction, normalized `Author / Project / Copyright` header splitting, rejection of `.krazy` control-file and disclaimer-list junk, and Unicode-preserving party normalization

##### [ValveSoftware/eigen @ e9c4315](https://github.com/ValveSoftware/eigen/tree/e9c43151265207fd3366bba21cddd61141ff402c) — **19.84× faster**

- Files: 1,784
Expand Down
6 changes: 6 additions & 0 deletions docs/scan-duration-vs-files.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
48 changes: 47 additions & 1 deletion src/copyright/detector/author_heuristics/extraction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1282,7 +1282,9 @@ pub(in super::super) fn extract_author_colon_blocks(
}
}
let combined_raw = segments.join(" ");
let Some(combined) = refine_author_with_optional_handle_suffix(&combined_raw) else {
let Some(combined) = refine_author_with_optional_handle_suffix(&combined_raw)
.or_else(|| refine_explicit_author_label_roster(&combined_raw))
else {
line_number = line_number.next();
continue;
};
Expand Down Expand Up @@ -1363,6 +1365,50 @@ fn sanitize_author_colon_tail(tail: &str) -> Option<String> {
Some(trimmed.to_string())
}

/// Accepts a comma-separated author roster (for example
/// `Antoine YESSAYAN, Paul RASCLE, EDF`) and returns it whitespace-normalized.
///
/// Returns `None` when:
/// - fewer than two non-empty comma-separated entries remain,
/// - any entry is the placeholder `package author` / `package authors`
///   (ASCII case-insensitive), or
/// - either of the first two entries does not look like a roster entry.
///
/// An entry looks like a roster entry when it is either several words that
/// each contain at least one alphabetic character, or a single word whose
/// alphabetic characters are all ASCII uppercase.
fn refine_explicit_author_label_roster(candidate: &str) -> Option<String> {
    const PLACEHOLDERS: [&str; 2] = ["package author", "package authors"];

    /// Shape check applied to one comma-separated entry.
    fn is_rosterish(entry: &str) -> bool {
        let mut words = entry.split_whitespace();
        match (words.next(), words.next()) {
            // No words at all.
            (None, _) => false,
            // Exactly one word: no alphabetic character may be anything but
            // ASCII uppercase.
            (Some(_), None) => !entry
                .chars()
                .any(|ch| ch.is_alphabetic() && !ch.is_ascii_uppercase()),
            // Two or more words: every word needs an alphabetic character.
            (Some(_), Some(_)) => entry
                .split_whitespace()
                .all(|word| word.chars().any(char::is_alphabetic)),
        }
    }

    let roster = normalize_whitespace(candidate.trim());

    // A roster without any comma yields at most one entry, so the length
    // check below also rejects comma-free input.
    let entries: Vec<&str> = roster
        .split(',')
        .map(str::trim)
        .filter(|entry| !entry.is_empty())
        .collect();
    if entries.len() < 2 {
        return None;
    }

    let mentions_placeholder = entries.iter().any(|entry| {
        PLACEHOLDERS
            .iter()
            .any(|&placeholder| entry.eq_ignore_ascii_case(placeholder))
    });
    if mentions_placeholder {
        return None;
    }

    let leading_pair_ok = entries.iter().take(2).all(|entry| is_rosterish(entry));
    leading_pair_ok.then_some(roster)
}

fn is_author_metadata_line(line: &str) -> bool {
let lower = line.trim().to_ascii_lowercase();
lower.starts_with("url:")
Expand Down
40 changes: 40 additions & 0 deletions src/copyright/detector/author_heuristics_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,46 @@ fn test_detect_multiline_comment_authors_block_after_year_only_copyright() {
);
}

/// A single `Author :` line listing two people and a company suffix must be
/// reported as one author entry containing the full roster.
#[test]
fn test_detect_explicit_author_label_roster_with_company_suffix() {
    const INPUT: &str = "// Author : Antoine YESSAYAN, Paul RASCLE, EDF\n";
    const EXPECTED_AUTHOR: &str = "Antoine YESSAYAN, Paul RASCLE, EDF";

    let (_copyrights, _holders, authors) = super::super::detect_copyrights_from_text(INPUT);

    let roster_found = authors
        .iter()
        .any(|detected| detected.author == EXPECTED_AUTHOR);
    assert!(roster_found, "authors: {authors:?}");
}

/// An `Author / Project / Copyright` header block must yield the author
/// roster, a `Copyright EDF 2001` statement, and `EDF` as holder.
#[test]
fn test_split_author_project_copyright_metadata_block() {
    const INPUT: &str = concat!(
        "// Author : Antoine YESSAYAN, Paul RASCLE, EDF\n",
        "// Project : SALOME\n",
        "// Copyright : EDF 2001\n",
    );
    const EXPECTED_AUTHOR: &str = "Antoine YESSAYAN, Paul RASCLE, EDF";
    const EXPECTED_COPYRIGHT: &str = "Copyright EDF 2001";
    const EXPECTED_HOLDER: &str = "EDF";

    let (copyrights, holders, authors) = super::super::detect_copyrights_from_text(INPUT);

    let author_found = authors
        .iter()
        .any(|detected| detected.author == EXPECTED_AUTHOR);
    assert!(author_found, "authors: {authors:?}");

    let copyright_found = copyrights
        .iter()
        .any(|detected| detected.copyright == EXPECTED_COPYRIGHT);
    assert!(copyright_found, "copyrights: {copyrights:?}");

    let holder_found = holders
        .iter()
        .any(|detected| detected.holder == EXPECTED_HOLDER);
    assert!(holder_found, "holders: {holders:?}");
}

#[test]
fn test_extract_collective_author_with_contributors_before_email() {
let input = "authors = [\"Tokio Contributors <team@tokio.rs>\"]\n";
Expand Down
6 changes: 6 additions & 0 deletions src/copyright/detector/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,11 @@ pub fn detect_copyrights_from_text_with_deadline(
postprocess_transforms::drop_shadowed_bare_c_from_year_fragments(&mut copyrights, &mut holders);
drop_path_fragment_holders_from_bare_c_code_lines(&raw_lines, &copyrights, &mut holders);
drop_scan_only_holders_from_copyright_scan_lines(&raw_lines, &copyrights, &mut holders);
drop_test_label_false_positive_copyrights_and_holders(
&raw_lines,
&mut copyrights,
&mut holders,
);

for group in &groups {
extend_dash_obfuscated_email_suffixes(&raw_lines, group, &mut copyrights[..], &holders[..]);
Expand Down Expand Up @@ -407,6 +412,7 @@ pub(super) use token_utils::collect_all_leaves;
use token_utils::{
apply_written_by_for_markers, drop_path_fragment_holders_from_bare_c_code_lines,
drop_scan_only_holders_from_copyright_scan_lines,
drop_test_label_false_positive_copyrights_and_holders,
extract_original_author_additional_contributors,
};
use tree_walk::{
Expand Down
72 changes: 72 additions & 0 deletions src/copyright/detector/pattern_extract/extraction/content.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,78 @@ pub fn extract_spdx_filecopyrighttext_c_without_year(
(copyrights, holders)
}

/// Scans `content` line by line for byte-string literals that start with
/// `Copyright (c)` and contain no four-digit year (19xx / 20xx), and turns
/// each one into a copyright detection plus, when possible, a holder
/// detection on the same line.
///
/// `existing_holders` seeds the `(holder, start_line)` de-duplication set so
/// holders that were already reported for a line are not emitted again.
///
/// Returns the newly found `(copyrights, holders)`; the inputs are not
/// modified.
pub fn extract_bytestring_copyright_c_without_year(
    content: &str,
    existing_holders: &[HolderDetection],
) -> (Vec<CopyrightDetection>, Vec<HolderDetection>) {
    static YEAR_RE: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"\b(?:19\d{2}|20\d{2})\b").unwrap());

    let mut copyrights = Vec::new();
    let mut holders = Vec::new();

    let mut seen_h: HashSet<(String, usize)> = existing_holders
        .iter()
        .map(|h| (h.holder.clone(), h.start_line.get()))
        .collect();

    for (idx, line) in content.lines().enumerate() {
        // Line numbers are 1-based.
        let ln = idx + 1;
        let Some(raw) = extract_bytestring_copyright_literal(line) else {
            continue;
        };
        // Literals carrying a year are skipped by this year-less extractor.
        if raw.is_empty() || YEAR_RE.is_match(&raw) {
            continue;
        }

        let prepared = crate::copyright::prepare_text_line(&raw);
        if let Some(refined) = refine_copyright(&prepared) {
            copyrights.push(CopyrightDetection {
                copyright: refined,
                start_line: LineNumber::new(ln).unwrap(),
                end_line: LineNumber::new(ln).unwrap(),
            });
        }

        // Peel the leading `Copyright` keyword and then the `(c)` marker.
        // Each step falls back to the text it was given, so a missing `(c)`
        // no longer resurrects the already-removed `Copyright` keyword.
        let after_keyword = prepared
            .strip_prefix("Copyright")
            .unwrap_or(prepared.as_str())
            .trim();
        let tail = after_keyword
            .strip_prefix("(c)")
            .unwrap_or(after_keyword)
            .trim();
        if let Some(holder) = refine_holder(tail)
            && seen_h.insert((holder.clone(), ln))
        {
            holders.push(HolderDetection {
                holder,
                start_line: LineNumber::new(ln).unwrap(),
                end_line: LineNumber::new(ln).unwrap(),
            });
        }
    }

    (copyrights, holders)
}

/// Returns the trimmed contents of the first byte-string literal on `line`
/// whose text starts with `copyright (c)` (ASCII case-insensitive).
///
/// Recognized literal openers are `br'`, `rb'`, `b'`, `br"`, `rb"` and `b"`,
/// tried in that order. Every occurrence of an opener on the line is
/// examined, so an earlier unrelated match (for example the `b'` inside
/// `tab'x'`) does not hide a real literal further along the line. The literal
/// ends at the next occurrence of the opener's quote character; escaped
/// quotes are not interpreted.
///
/// Returns `None` when no such literal is found.
fn extract_bytestring_copyright_literal(line: &str) -> Option<String> {
    const MARKER: &str = "copyright (c)";

    for prefix in ["br'", "rb'", "b'", "br\"", "rb\"", "b\""] {
        let quote = prefix.chars().last()?;
        for (start, _) in line.match_indices(prefix) {
            let rest = line.get(start + prefix.len()..)?;
            // Without a closing quote here, later occurrences of this
            // opener cannot have one either.
            let Some(end) = rest.find(quote) else {
                break;
            };
            let candidate = rest[..end].trim();
            // `get` yields `None` when the candidate is shorter than the
            // marker or the cut would split a multi-byte character; neither
            // case can be a match because the marker is pure ASCII.
            let starts_with_marker = candidate
                .get(..MARKER.len())
                .is_some_and(|head| head.eq_ignore_ascii_case(MARKER));
            if starts_with_marker {
                return Some(candidate.to_string());
            }
        }
    }

    None
}
pub fn extract_html_meta_name_copyright_content(
content: &str,
existing_holders: &[HolderDetection],
Expand Down
10 changes: 10 additions & 0 deletions src/copyright/detector/phases/postprocess.rs
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,16 @@ fn run_mid_pipeline_repairs(
seen.dedup_new_holders(holders, h_before);
seen.dedup_new_authors(authors, a_before);

let c_before = copyrights.len();
let h_before = holders.len();
let a_before = authors.len();
super::postprocess_transforms::split_author_project_copyright_metadata_blocks(
copyrights, holders, authors,
);
seen.dedup_new_copyrights(copyrights, c_before);
seen.dedup_new_holders(holders, h_before);
seen.dedup_new_authors(authors, a_before);

super::postprocess_transforms::drop_static_char_string_copyrights(content, copyrights, holders);
super::postprocess_transforms::drop_combined_period_holders(holders);
super::pattern_extract::drop_shadowed_prefix_holders(holders);
Expand Down
9 changes: 9 additions & 0 deletions src/copyright/detector/phases/primary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,15 @@ fn run_content_and_markup_extractions(
copyrights.extend(new_c);
holders.extend(new_h);

let (mut new_c, new_h) =
super::super::pattern_extract::extract_bytestring_copyright_c_without_year(
content, holders,
);
seen.dedup_new_copyrights(&mut new_c, 0);
seen.register_holders(&new_h);
copyrights.extend(new_c);
holders.extend(new_h);

let (mut new_c, new_h) =
super::super::pattern_extract::extract_html_meta_name_copyright_content(content, holders);
seen.dedup_new_copyrights(&mut new_c, 0);
Expand Down
71 changes: 71 additions & 0 deletions src/copyright/detector/postprocess_transforms/author_repairs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -330,3 +330,74 @@ pub fn split_written_by_copyrights_into_holder_prefixed_clauses(
holders.retain(|h| h.holder != "Julian Cowley");
authors.retain(|a| a.author != "Linus Torvalds" && a.author != "Theodore Ts'o");
}

/// Untangles copyright detections whose text has the merged shape
/// `Author <author> [Project <project>] Copyright <holder> <year>`.
///
/// For every matching detection this:
/// 1. records the author (refined when possible, otherwise
///    whitespace-normalized) on the detection's start line, unless the same
///    author is already present on that line;
/// 2. replaces the copyright text with the refined `Copyright <holder> <year>`
///    when refinement succeeds;
/// 3. when the holder can be refined, removes holders covering the same line
///    span whose text contains the raw author text and adds the refined
///    holder if it is not already present for that span.
///
/// Detections that do not match the pattern are left untouched.
pub fn split_author_project_copyright_metadata_blocks(
    copyrights: &mut [CopyrightDetection],
    holders: &mut Vec<HolderDetection>,
    authors: &mut Vec<AuthorDetection>,
) {
    static AUTHOR_PROJECT_COPY_RE: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(
            r"(?ix)
^Author\s+(?P<author>.+?)
(?:\s+Project\s+(?P<project>.+?))?
\s+Copyright\s+(?P<holder>.+?)\s+(?P<year>\d{4})
$",
        )
        .unwrap()
    });

    for detection in copyrights.iter_mut() {
        // Work on an owned snapshot so the captures do not borrow the field
        // that is overwritten further down.
        let snapshot = detection.copyright.clone();
        let Some(groups) = AUTHOR_PROJECT_COPY_RE.captures(snapshot.as_str()) else {
            continue;
        };

        let author_raw = groups.name("author").map_or("", |m| m.as_str()).trim();
        let holder_raw = groups.name("holder").map_or("", |m| m.as_str()).trim();
        let year = groups.name("year").map_or("", |m| m.as_str()).trim();
        if [author_raw, holder_raw, year]
            .iter()
            .any(|piece| piece.is_empty())
        {
            continue;
        }

        let first_line = detection.start_line;
        let last_line = detection.end_line;

        let author = match crate::copyright::refiner::refine_author(author_raw) {
            Some(refined) => refined,
            None => normalize_whitespace(author_raw),
        };
        let author_is_new = !author.is_empty()
            && !authors
                .iter()
                .any(|known| known.author == author && known.start_line == first_line);
        if author_is_new {
            // The author entry is pinned to the start line only.
            authors.push(AuthorDetection {
                author,
                start_line: first_line,
                end_line: first_line,
            });
        }

        if let Some(refined) = refine_copyright(&format!("Copyright {holder_raw} {year}")) {
            detection.copyright = refined;
        }

        if let Some(refined_holder) = refine_holder_in_copyright_context(holder_raw) {
            // Drop holders on this exact span that still embed the author text.
            holders.retain(|known| {
                known.start_line != first_line
                    || known.end_line != last_line
                    || !known.holder.contains(author_raw)
            });

            let holder_is_known = holders.iter().any(|known| {
                known.holder == refined_holder
                    && known.start_line == first_line
                    && known.end_line == last_line
            });
            if !holder_is_known {
                holders.push(HolderDetection {
                    holder: refined_holder,
                    start_line: first_line,
                    end_line: last_line,
                });
            }
        }
    }
}
Loading
Loading