diff --git a/src/license_detection/detection/analysis.rs b/src/license_detection/detection/analysis.rs index e0c7cc7bf..c24f167d0 100644 --- a/src/license_detection/detection/analysis.rs +++ b/src/license_detection/detection/analysis.rs @@ -10,8 +10,8 @@ use crate::license_detection::expression::{ }; use crate::license_detection::models::{LicenseMatch, MatcherKind}; use crate::utils::spdx::{ - ExpressionRelation, combine_license_expressions_preserving_structure, - combine_license_expressions_with_relation_preserving_structure, + ExpressionRelation, combine_license_expressions_preserving_structure_strict, + combine_license_expressions_with_relation_preserving_structure_strict, }; /// Coverage value below which detections are not perfect. @@ -463,8 +463,10 @@ pub fn determine_spdx_expression( let expressions = expressions .ok_or_else(|| "Missing SPDX expressions for one or more matches".to_string())?; - combine_license_expressions_preserving_structure(expressions.into_iter().map(str::to_string)) - .ok_or_else(|| "Failed to combine SPDX expressions".to_string()) + combine_license_expressions_preserving_structure_strict( + expressions.into_iter().map(str::to_string), + ) + .ok_or_else(|| "Failed to combine SPDX expressions".to_string()) } fn determine_alternative_notice_expression( @@ -528,11 +530,12 @@ fn determine_alternative_notice_spdx_expression( let alternative_expressions = alternative_expressions.ok_or_else(|| { "Missing SPDX expressions for one or more alternative-license matches".to_string() })?; - let alternative_expression = combine_license_expressions_with_relation_preserving_structure( - alternative_expressions, - ExpressionRelation::Or, - ) - .ok_or_else(|| "Failed to combine alternative SPDX expressions".to_string())?; + let alternative_expression = + combine_license_expressions_with_relation_preserving_structure_strict( + alternative_expressions, + ExpressionRelation::Or, + ) + .ok_or_else(|| "Failed to combine alternative SPDX expressions".to_string())?; let mut parts = vec![alternative_expression]; let supplemental_expressions: Option> = supplemental @@ -543,9 +546,12 @@ fn determine_alternative_notice_spdx_expression( "Missing SPDX expressions for one or more supplemental matches".to_string() })?); - combine_license_expressions_with_relation_preserving_structure(parts, ExpressionRelation::And) - .ok_or_else(|| "Failed to combine alternative SPDX expression parts".to_string()) - .map(Some) + combine_license_expressions_with_relation_preserving_structure_strict( + parts, + ExpressionRelation::And, + ) + .ok_or_else(|| "Failed to combine alternative SPDX expression parts".to_string()) + .map(Some) } fn has_alternative_license_notice(matches: &[LicenseMatch], source_text: Option<&str>) -> bool { diff --git a/src/license_detection/detection/grouping.rs b/src/license_detection/detection/grouping.rs index cda781601..6fd864b85 100644 --- a/src/license_detection/detection/grouping.rs +++ b/src/license_detection/detection/grouping.rs @@ -63,7 +63,83 @@ pub(super) fn group_matches_by_region_with_threshold( groups.push(DetectionGroup::new(current_group)); } - groups + merge_sandwiched_same_line_clue_groups(groups) +} + +fn merge_sandwiched_same_line_clue_groups(groups: Vec) -> Vec { + let mut merged = Vec::new(); + let mut index = 0; + + while index < groups.len() { + if index + 2 < groups.len() + && should_merge_sandwiched_same_line_clue( + &groups[index], + &groups[index + 1], + &groups[index + 2], + ) + { + let mut combined_matches = groups[index].matches.clone(); + combined_matches.extend(groups[index + 1].matches.clone()); + combined_matches.extend(groups[index + 2].matches.clone()); + merged.push(DetectionGroup::new(combined_matches)); + index += 3; + continue; + } + + merged.push(DetectionGroup::new(groups[index].matches.clone())); + index += 1; + } + + merged +} + +fn should_merge_sandwiched_same_line_clue( + left: &DetectionGroup, + middle: &DetectionGroup, + right: &DetectionGroup, +) -> bool { + let Some(clue_match) = middle.matches.first() else { + return false; + }; + + middle.matches.len() == 1 + && clue_match.is_license_clue() + && left + .matches + .iter() + .all(|match_item| !match_item.is_license_clue()) + && right + .matches + .iter() + .all(|match_item| !match_item.is_license_clue()) + && all_matches_are_single_line_exact(&left.matches) + && all_matches_are_single_line_exact(&middle.matches) + && all_matches_are_single_line_exact(&right.matches) + && group_line_range(left) == group_line_range(middle) + && group_line_range(middle) == group_line_range(right) +} + +fn all_matches_are_single_line_exact(matches: &[LicenseMatch]) -> bool { + !matches.is_empty() + && matches.iter().all(|match_item| { + match_item.start_line == match_item.end_line + && match_item.coverage() == 100.0 + && matches!( + match_item.matcher, + crate::license_detection::models::MatcherKind::Hash + | crate::license_detection::models::MatcherKind::SpdxId + | crate::license_detection::models::MatcherKind::Aho + ) + }) +} + +fn group_line_range( + group: &DetectionGroup, +) -> Option<(crate::models::LineNumber, crate::models::LineNumber)> { + Some(( + group.matches.first()?.start_line, + group.matches.last()?.end_line, + )) } /// Check if two matches should be in the same group based on line proximity. @@ -308,6 +384,48 @@ mod tests { assert_eq!(groups[1].matches, vec![clue]); } + #[test] + fn test_group_matches_merges_sandwiched_same_line_clue_between_exact_matches() { + let mut left = create_test_match(10, 10, "2-aho", "linux-note.RULE"); + left.rule_kind = crate::license_detection::models::RuleKind::Reference; + left.license_expression = "linux-syscall-exception-gpl".to_string(); + left.match_coverage = 100.0; + + let mut clue = create_test_match(10, 10, "2-aho", "gpl_bare_word_only.RULE"); + clue.rule_kind = crate::license_detection::models::RuleKind::Clue; + clue.license_expression = "gpl-1.0-plus".to_string(); + clue.match_coverage = 100.0; + + let mut right = create_test_match(10, 10, "2-aho", "llgpl_3.RULE"); + right.rule_kind = crate::license_detection::models::RuleKind::Reference; + right.license_expression = "llgpl".to_string(); + right.match_coverage = 100.0; + + let groups = group_matches_by_region(&[left.clone(), clue.clone(), right.clone()]); + + assert_eq!(groups.len(), 1); + assert_eq!(groups[0].matches, vec![left, clue, right]); + } + + #[test] + fn test_group_matches_does_not_merge_sandwiched_clue_across_different_lines() { + let mut left = create_test_match(10, 10, "2-aho", "linux-note.RULE"); + left.rule_kind = crate::license_detection::models::RuleKind::Reference; + left.match_coverage = 100.0; + + let mut clue = create_test_match(11, 11, "2-aho", "gpl_bare_word_only.RULE"); + clue.rule_kind = crate::license_detection::models::RuleKind::Clue; + clue.match_coverage = 100.0; + + let mut right = create_test_match(11, 11, "2-aho", "llgpl_3.RULE"); + right.rule_kind = crate::license_detection::models::RuleKind::Reference; + right.match_coverage = 100.0; + + let groups = group_matches_by_region(&[left, clue, right]); + + assert_eq!(groups.len(), 3); + } + #[test] fn test_sort_matches_by_line() { let mut match1 = create_test_match(10, 15, "1-hash", "mit.LICENSE"); diff --git a/src/license_detection/spdx_lid/mod.rs b/src/license_detection/spdx_lid/mod.rs index 9f9cfb47c..62cba51de 100644 --- a/src/license_detection/spdx_lid/mod.rs +++ b/src/license_detection/spdx_lid/mod.rs @@ -29,6 +29,14 @@ use crate::models::MatchScore; pub const MATCH_SPDX_ID: MatcherKind = MatcherKind::SpdxId; +const UNKNOWN_SPDX_LICENSE_REF: &str = "LicenseRef-scancode-unknown-spdx"; + +#[derive(Debug, Clone, PartialEq, Eq)] +struct ResolvedExpression { + scancode_expression: String, + spdx_expression: Option, +} + #[derive(Debug, Clone, PartialEq)] enum RecoveryToken { LicenseKey(String), @@ -323,8 +331,8 @@ pub fn spdx_lid_match(index: &LicenseIndex, query: &Query) -> Vec lowered.clone() }; - if let Some(license_expression) = - find_matching_rule_for_expression(index, &resolved_expression) + if let Some(resolved_match) = + resolve_matching_rule_for_expression(index, &resolved_expression) { let matched_length = end_token.saturating_sub(*start_token); let match_coverage = 100.0; @@ -347,7 +355,8 @@ pub fn spdx_lid_match(index: &LicenseIndex, query: &Query) -> Vec .unwrap_or(0); let rule_relevance = 100; - let rule_identifier = synthetic_spdx_rule_identifier(&license_expression, spdx_text); + let rule_identifier = + synthetic_spdx_rule_identifier(&resolved_match.scancode_expression, spdx_text); let rule_url = String::new(); let rule_length = matched_length; let referenced_filenames = None; @@ -356,8 +365,12 @@ pub fn spdx_lid_match(index: &LicenseIndex, query: &Query) -> Vec let qspan = PositionSpan::range(*start_token, *end_token); let license_match = LicenseMatch { - license_expression, - license_expression_spdx: Some(spdx_expression.clone()), + license_expression: resolved_match.scancode_expression, + license_expression_spdx: Some( + resolved_match + .spdx_expression + .unwrap_or_else(|| spdx_expression.clone()), + ), from_file: None, start_line, end_line, @@ -708,19 +721,106 @@ fn render_boolean_operand(expr: &LicenseExpression, parent_operator: BooleanOper } } -pub(crate) fn find_matching_rule_for_expression( +fn unknown_spdx_license_ref() -> String { + UNKNOWN_SPDX_LICENSE_REF.to_string() +} + +fn rule_spdx_expression(index: &LicenseIndex, rid: usize) -> Option { + let rule = index.rules_by_rid.get(rid)?; + + index + .rule_metadata_by_identifier + .get(&rule.identifier) + .and_then(|metadata| metadata.license_expression_spdx.clone()) + .or_else(|| { + if rule.license_expression == "unknown-spdx" { + Some(unknown_spdx_license_ref()) + } else { + None + } + }) +} + +fn scancode_key_to_spdx_expression(key: &str, index: &LicenseIndex) -> Option { + if key == "unknown-spdx" { + return Some(LicenseExpression::LicenseRef(unknown_spdx_license_ref())); + } + + let license = index.licenses_by_key.get(key)?; + match &license.spdx_license_key { + Some(spdx_key) if spdx_key.starts_with("LicenseRef-") => { + Some(LicenseExpression::LicenseRef(spdx_key.clone())) + } + Some(spdx_key) => Some(LicenseExpression::License(spdx_key.clone())), + None => Some(LicenseExpression::LicenseRef(format!( + "LicenseRef-scancode-{key}" + ))), + } +} + +fn convert_scancode_expression_to_spdx( + expr: &LicenseExpression, index: &LicenseIndex, - expression: &str, +) -> Option { + match expr { + LicenseExpression::License(key) => scancode_key_to_spdx_expression(key, index), + LicenseExpression::LicenseRef(key) => Some(LicenseExpression::LicenseRef(key.clone())), + LicenseExpression::And { left, right } => Some(LicenseExpression::And { + left: Box::new(convert_scancode_expression_to_spdx(left, index)?), + right: Box::new(convert_scancode_expression_to_spdx(right, index)?), + }), + LicenseExpression::Or { left, right } => Some(LicenseExpression::Or { + left: Box::new(convert_scancode_expression_to_spdx(left, index)?), + right: Box::new(convert_scancode_expression_to_spdx(right, index)?), + }), + LicenseExpression::With { left, right } => Some(LicenseExpression::With { + left: Box::new(convert_scancode_expression_to_spdx(left, index)?), + right: Box::new(convert_scancode_expression_to_spdx(right, index)?), + }), + } +} + +fn render_valid_spdx_expression(expr: &LicenseExpression, index: &LicenseIndex) -> Option { + let spdx_expr = convert_scancode_expression_to_spdx(expr, index)?; + Some(render_valid_scancode_expression(&spdx_expr)) +} + +fn render_recovered_spdx_expression( + expr: &LicenseExpression, + index: &LicenseIndex, + render_mode: RecoveryRenderMode, ) -> Option { + let spdx_expr = convert_scancode_expression_to_spdx(expr, index)?; + Some(render_recovered_scancode_expression( + &spdx_expr, + render_mode, + )) +} + +fn resolve_matching_rule_for_expression( + index: &LicenseIndex, + expression: &str, +) -> Option { if let Some(&rid) = index.rid_by_spdx_key.get(expression) { let rule = &index.rules_by_rid[rid]; - return Some(rule.license_expression.clone()); + return Some(ResolvedExpression { + scancode_expression: rule.license_expression.clone(), + spdx_expression: rule_spdx_expression(index, rid), + }); } for rule in &index.rules_by_rid { let normalized = normalize_spdx_key(&rule.license_expression); if normalized == expression { - return Some(rule.license_expression.clone()); + let spdx_expression = index + .rule_metadata_by_identifier + .get(&rule.identifier) + .and_then(|metadata| metadata.license_expression_spdx.clone()); + + return Some(ResolvedExpression { + scancode_expression: rule.license_expression.clone(), + spdx_expression, + }); } } @@ -729,7 +829,10 @@ pub(crate) fn find_matching_rule_for_expression( { let result = render_valid_scancode_expression(&converted); if !result.is_empty() { - return Some(result); + return Some(ResolvedExpression { + scancode_expression: result, + spdx_expression: render_valid_spdx_expression(&converted, index), + }); } } @@ -742,7 +845,10 @@ pub(crate) fn find_matching_rule_for_expression( { let result = render_valid_scancode_expression(&converted); if !result.is_empty() { - return Some(result); + return Some(ResolvedExpression { + scancode_expression: result, + spdx_expression: render_valid_spdx_expression(&converted, index), + }); } } } @@ -752,13 +858,27 @@ pub(crate) fn find_matching_rule_for_expression( let converted = convert_recovered_expression_to_scancode(&recovered, index); let result = render_recovered_scancode_expression(&converted, render_mode); if !result.is_empty() { - return Some(result); + return Some(ResolvedExpression { + scancode_expression: result, + spdx_expression: render_recovered_spdx_expression(&converted, index, render_mode), + }); } } - index - .unknown_spdx_rid - .map(|rid| index.rules_by_rid[rid].license_expression.clone()) + index.unknown_spdx_rid.map(|rid| ResolvedExpression { + scancode_expression: index.rules_by_rid[rid].license_expression.clone(), + spdx_expression: rule_spdx_expression(index, rid) + .or_else(|| Some(unknown_spdx_license_ref())), + }) +} + +#[cfg(test)] +pub(crate) fn find_matching_rule_for_expression( + index: &LicenseIndex, + expression: &str, +) -> Option { + resolve_matching_rule_for_expression(index, expression) + .map(|resolved| resolved.scancode_expression) } fn convert_spdx_expression_to_scancode( diff --git a/src/license_detection/spdx_lid/test.rs b/src/license_detection/spdx_lid/test.rs index 8c79a0588..e437b70dd 100644 --- a/src/license_detection/spdx_lid/test.rs +++ b/src/license_detection/spdx_lid/test.rs @@ -451,6 +451,31 @@ mod tests { assert_eq!(matches[0].score, MatchScore::MAX); } + #[test] + fn test_spdx_lid_match_canonicalizes_recovered_unknown_spdx_expression() { + let engine = crate::license_detection::LicenseDetectionEngine::from_embedded() + .expect("required test dependency available"); + + let index = engine.index(); + let text = concat!( + "// SPDX-License-Identifier: Apache-2.0\n", + "//! such as \"SPDX-License-Identifier: MIT\" or variations with different comment styles and casing." + ); + let query = Query::from_extracted_text(text, index, false).unwrap(); + let matches = spdx_lid_match(index, &query); + + assert_eq!(matches.len(), 2); + assert_eq!( + matches[0].license_expression_spdx.as_deref(), + Some("Apache-2.0") + ); + assert_eq!(matches[1].license_expression, "unknown-spdx"); + assert_eq!( + matches[1].license_expression_spdx.as_deref(), + Some("LicenseRef-scancode-unknown-spdx") + ); + } + #[test] fn test_split_license_expression_with_with() { let expr = "GPL-2.0 WITH Classpath-exception-2.0"; diff --git a/src/license_detection/tests.rs b/src/license_detection/tests.rs index 8f777a4a3..88a146f64 100644 --- a/src/license_detection/tests.rs +++ b/src/license_detection/tests.rs @@ -491,6 +491,70 @@ fn test_engine_surfaces_bare_gpl1_as_clue_not_detection() { ); } +#[test] +fn test_engine_groups_same_line_bare_gpl_clue_with_exact_neighbors() { + let engine = get_engine(); + + let text = concat!( + "matches!(\n", + " lowered.as_str(),\n", + " \"linux-syscall-note\" | \"gpl-cc-1.0\" | \"llgpr\" | \"llgpl\" | \"shl-2.0\" | \"shl-2.1\"\n", + ")" + ); + + let detections = engine + .detect_with_kind(text, false, false) + .expect("Detection should succeed"); + + let grouped_detection = detections.iter().find(|detection| { + detection + .license_expression + .as_ref() + .is_some_and(|expression| { + expression.contains("linux-syscall-exception-gpl") + && expression.contains("gpl-1.0-plus") + && expression.contains("llgpl") + }) + }); + + let grouped_detection = + grouped_detection.expect("sandwiched GPL clue should join exact neighbors"); + assert!( + grouped_detection + .matches + .iter() + .any(|match_item| match_item.rule_identifier == "gpl_bare_word_only.RULE") + ); + assert!( + !grouped_detection + .detection_log + .iter() + .any(|log| log == "license-clues") + ); + assert!( + !detections.iter().any(|detection| { + detection.license_expression.is_none() + && detection + .matches + .iter() + .any(|match_item| match_item.rule_identifier == "gpl_bare_word_only.RULE") + }), + "sandwiched GPL clue should not remain as standalone clue-only evidence: {:?}", + detections + .iter() + .map(|detection| ( + detection.license_expression.as_deref().unwrap_or("none"), + detection.detection_log.clone(), + detection + .matches + .iter() + .map(|match_item| match_item.rule_identifier.as_str()) + .collect::>() + )) + .collect::>() + ); +} + #[test] fn test_engine_does_not_detect_graphics_pipeline_library_as_gpl() { let engine = get_engine(); diff --git a/src/output_schema/file_info.rs b/src/output_schema/file_info.rs index 3442b3145..58a56d338 100644 --- a/src/output_schema/file_info.rs +++ b/src/output_schema/file_info.rs @@ -119,40 +119,51 @@ impl OutputFileInfo { pub(crate) fn detected_license_expression_spdx(&self) -> Option { { - let expressions: Vec = self + let expressions: Option> = self .license_detections .iter() - .filter(|detection| !detection.license_expression_spdx.is_empty()) - .map(|detection| detection.license_expression_spdx.clone()) + .map(|detection| { + (!detection.license_expression_spdx.is_empty()) + .then(|| detection.license_expression_spdx.clone()) + }) .collect(); - crate::utils::spdx::select_primary_license_expression(expressions.clone()).or_else( - || { - crate::utils::spdx::combine_license_expressions_preserving_structure( - expressions, - ) - }, - ) + expressions.and_then(|expressions| { + crate::utils::spdx::select_primary_license_expression_strict(expressions.clone()) + .or_else(|| { + crate::utils::spdx::combine_license_expressions_preserving_structure_strict( + expressions, + ) + }) + }) } .or_else(|| { - let expressions: Vec = self + let expressions: Option> = self .package_data .iter() .flat_map(|package_data| package_data.license_detections.iter()) - .filter(|detection| !detection.license_expression_spdx.is_empty()) - .map(|detection| detection.license_expression_spdx.clone()) + .map(|detection| { + (!detection.license_expression_spdx.is_empty()) + .then(|| detection.license_expression_spdx.clone()) + }) .collect(); - crate::utils::spdx::select_primary_license_expression(expressions.clone()).or_else( - || { - crate::utils::spdx::combine_license_expressions_preserving_structure( - expressions, - ) - }, - ) + expressions.and_then(|expressions| { + crate::utils::spdx::select_primary_license_expression_strict(expressions.clone()) + .or_else(|| { + crate::utils::spdx::combine_license_expressions_preserving_structure_strict( + expressions, + ) + }) + }) }) .or_else(|| { self.license_expression .clone() .filter(|expression| !expression.is_empty()) + .and_then(|expression| { + crate::utils::spdx::combine_license_expressions_preserving_structure_strict([ + expression, + ]) + }) }) } } @@ -460,3 +471,96 @@ impl TryFrom<&OutputFileInfo> for crate::models::FileInfo { }) } } + +#[cfg(test)] +mod tests { + use super::OutputFileInfo; + use crate::models::FileType; + use crate::output_schema::license_detection::OutputLicenseDetection; + + fn base_output_file_info() -> OutputFileInfo { + OutputFileInfo { + name: "mod.rs".to_string(), + base_name: "mod".to_string(), + extension: ".rs".to_string(), + path: "mod.rs".to_string(), + file_type: FileType::File, + mime_type: None, + file_type_label: None, + size: 0, + date: None, + sha1: None, + md5: None, + sha256: None, + sha1_git: None, + programming_language: None, + package_data: Vec::new(), + license_expression: None, + license_detections: Vec::new(), + license_clues: Vec::new(), + percentage_of_license_text: None, + copyrights: Vec::new(), + holders: Vec::new(), + authors: Vec::new(), + emails: Vec::new(), + urls: Vec::new(), + for_packages: Vec::new(), + scan_errors: Vec::new(), + license_policy: None, + is_generated: None, + is_binary: None, + is_text: None, + is_archive: None, + is_media: None, + is_source: None, + is_script: None, + files_count: None, + dirs_count: None, + size_count: None, + source_count: None, + is_legal: false, + is_manifest: false, + is_readme: false, + is_top_level: false, + is_key_file: false, + is_community: false, + facets: Vec::new(), + tallies: None, + } + } + + #[test] + fn detected_license_expression_spdx_does_not_recombine_partial_detection_spdx() { + let mut file_info = base_output_file_info(); + file_info.license_expression = Some("Apache-2.0 AND MIT".to_string()); + file_info.license_detections = vec![ + OutputLicenseDetection { + license_expression: "apache-2.0".to_string(), + license_expression_spdx: "Apache-2.0".to_string(), + matches: Vec::new(), + detection_log: Vec::new(), + identifier: None, + }, + OutputLicenseDetection { + license_expression: "mit".to_string(), + license_expression_spdx: String::new(), + matches: Vec::new(), + detection_log: Vec::new(), + identifier: None, + }, + ]; + + assert_eq!( + file_info.detected_license_expression_spdx().as_deref(), + Some("Apache-2.0 AND MIT") + ); + } + + #[test] + fn detected_license_expression_spdx_rejects_invalid_fallback_expression() { + let mut file_info = base_output_file_info(); + file_info.license_expression = Some("MIT\" or malformed".to_string()); + + assert_eq!(file_info.detected_license_expression_spdx(), None); + } +} diff --git a/src/scanner/process/license.rs b/src/scanner/process/license.rs index 3f5730e55..070f44450 100644 --- a/src/scanner/process/license.rs +++ b/src/scanner/process/license.rs @@ -145,18 +145,20 @@ pub(super) fn extract_license_information( collapse_repeated_sourcemap_license_detections(path, model_detections); if !model_detections.is_empty() { - let expressions: Vec = model_detections + let expressions: Option> = model_detections .iter() - .filter(|d| !d.license_expression_spdx.is_empty()) - .map(|d| d.license_expression_spdx.clone()) + .map(|detection| { + (!detection.license_expression_spdx.is_empty()) + .then(|| detection.license_expression_spdx.clone()) + }) .collect(); - if !expressions.is_empty() { - let combined = crate::utils::spdx::select_primary_license_expression( + if let Some(expressions) = expressions { + let combined = crate::utils::spdx::select_primary_license_expression_strict( expressions.clone(), ) .or_else(|| { - crate::utils::spdx::combine_license_expressions_preserving_structure( + crate::utils::spdx::combine_license_expressions_preserving_structure_strict( expressions, ) }); @@ -566,12 +568,11 @@ fn promote_reference_url_clue_detection( .iter() .map(|license_match| license_match.license_expression.clone()), )?; - let license_expression_spdx = - crate::utils::spdx::combine_license_expressions_preserving_structure( - promoted_matches - .iter() - .filter_map(|license_match| license_match.license_expression_spdx.clone()), - ) + let license_expression_spdx = promoted_matches + .iter() + .map(|license_match| license_match.license_expression_spdx.clone()) + .collect::>>() + .and_then(crate::utils::spdx::combine_license_expressions_preserving_structure_strict) .unwrap_or_default(); let matches = promoted_matches .into_iter() @@ -638,14 +639,12 @@ fn promote_legal_notice_low_quality_detections( else { continue; }; - let license_expression_spdx = - crate::utils::spdx::combine_license_expressions_preserving_structure( - detection - .matches - .iter() - .filter_map(|license_match| license_match.license_expression_spdx.clone()) - .collect::>(), - ); + let license_expression_spdx = detection + .matches + .iter() + .map(|license_match| license_match.license_expression_spdx.clone()) + .collect::>>() + .and_then(crate::utils::spdx::combine_license_expressions_preserving_structure_strict); detection.license_expression = Some(license_expression); detection.license_expression_spdx = license_expression_spdx; @@ -1119,10 +1118,10 @@ fn normalize_optional_spdx_expression(expression: Option<&str>) -> String { return String::new(); }; - crate::utils::spdx::combine_license_expressions_preserving_structure(std::iter::once( + crate::utils::spdx::combine_license_expressions_preserving_structure_strict(std::iter::once( expression.to_string(), )) - .unwrap_or_else(|| expression.to_string()) + .unwrap_or_default() } fn compute_percentage_of_license_text( diff --git a/src/scanner/process/license_test.rs b/src/scanner/process/license_test.rs index 253e4437e..c708f0c0e 100644 --- a/src/scanner/process/license_test.rs +++ b/src/scanner/process/license_test.rs @@ -4,7 +4,8 @@ use super::{ LicenseExtractionInput, MAX_OUTPUT_MATCHED_TEXT_BYTES, MAX_OUTPUT_MATCHED_TEXT_LINE_LENGTH, compute_percentage_of_license_text, convert_detection_to_model, extract_license_information, - promote_legal_notice_low_quality_detections, prune_redundant_readme_conjunctive_detections, + normalize_optional_spdx_expression, promote_legal_notice_low_quality_detections, + prune_redundant_readme_conjunctive_detections, }; use crate::license_detection::LicenseDetection as InternalLicenseDetection; use crate::license_detection::LicenseDetectionEngine; @@ -257,6 +258,29 @@ fn test_convert_detection_to_model_routes_expressionless_detection_to_license_cl assert_eq!(clues[0].matched_text_diagnostics, None); } +#[test] +fn test_convert_detection_to_model_drops_invalid_spdx_expression() { + let mut detection = make_detection(""); + detection.license_expression_spdx = Some("MIT\" or malformed".to_string()); + detection.matches[0].license_expression_spdx = Some("MIT\" or malformed".to_string()); + + let (converted, clues) = + convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None, None); + let converted = converted.expect("detection should convert"); + + assert_eq!(converted.license_expression_spdx, ""); + assert_eq!(converted.matches[0].license_expression_spdx, ""); + assert!(clues.is_empty()); +} + +#[test] +fn test_normalize_optional_spdx_expression_rejects_invalid_input() { + assert_eq!( + normalize_optional_spdx_expression(Some("MIT\" or malformed")), + "" + ); +} + #[test] fn test_convert_detection_to_model_promotes_exact_reference_url_clue() { let mut index = create_test_index(&[], 0); diff --git a/src/utils/spdx.rs b/src/utils/spdx.rs index e9bcb6fa1..94b0023e5 100644 --- a/src/utils/spdx.rs +++ b/src/utils/spdx.rs @@ -32,6 +32,16 @@ pub fn combine_license_expressions_preserving_structure( combine_license_expressions_with_relation_and_mode(expressions, ExpressionRelation::And, true) } +pub(crate) fn combine_license_expressions_preserving_structure_strict( + expressions: impl IntoIterator, +) -> Option { + combine_license_expressions_with_relation_and_mode_strict( + expressions, + ExpressionRelation::And, + true, + ) +} + pub fn select_primary_license_expression( expressions: impl IntoIterator, ) -> Option { @@ -74,11 +84,20 @@ pub fn select_primary_license_expression( .then(|| candidate.clone()) } -pub(crate) fn combine_license_expressions_with_relation_preserving_structure( +pub(crate) fn select_primary_license_expression_strict( + expressions: impl IntoIterator, +) -> Option { + let expressions: Vec = expressions.into_iter().collect(); + select_primary_license_expression(expressions).and_then(|expression| { + combine_license_expressions_preserving_structure_strict([expression]) + }) +} + +pub(crate) fn combine_license_expressions_with_relation_preserving_structure_strict( expressions: impl IntoIterator, relation: ExpressionRelation, ) -> Option { - combine_license_expressions_with_relation_and_mode(expressions, relation, true) + combine_license_expressions_with_relation_and_mode_strict(expressions, relation, true) } pub(crate) fn combine_license_expressions_with_relation( @@ -107,6 +126,24 @@ fn combine_license_expressions_with_relation_and_mode( .or_else(|| combine_license_expressions_fallback(&expressions, relation)) } +fn combine_license_expressions_with_relation_and_mode_strict( + expressions: impl IntoIterator, + relation: ExpressionRelation, + preserve_structure: bool, +) -> Option { + let expressions: Vec = expressions + .into_iter() + .map(|expression| expression.trim().to_string()) + .filter(|expression| !expression.is_empty()) + .collect(); + + if expressions.is_empty() { + return None; + } + + combine_parsed_expressions(&expressions, relation, preserve_structure) +} + fn combine_parsed_expressions( expressions: &[String], relation: ExpressionRelation, @@ -461,4 +498,22 @@ mod tests { assert_eq!(result, None); } + + #[test] + fn combine_license_expressions_preserving_structure_strict_rejects_invalid_expression() { + let result = combine_license_expressions_preserving_structure_strict(vec![ + "Apache-2.0".to_string(), + "MIT\" or malformed".to_string(), + ]); + + assert_eq!(result, None); + } + + #[test] + fn select_primary_license_expression_strict_rejects_invalid_primary_expression() { + let result = + select_primary_license_expression_strict(vec!["MIT\" or malformed".to_string()]); + + assert_eq!(result, None); + } }