diff --git a/hashprep/checks/drift.py b/hashprep/checks/drift.py index e50b8fe..4e5798b 100644 --- a/hashprep/checks/drift.py +++ b/hashprep/checks/drift.py @@ -4,6 +4,9 @@ from .core import Issue from ..config import DEFAULT_CONFIG +from ..utils.logging import get_logger + +_log = get_logger("checks.drift") _DRIFT = DEFAULT_CONFIG.drift CRITICAL_P_VALUE = _DRIFT.critical_p_value @@ -19,6 +22,9 @@ def check_drift( Check for distribution shift between two datasets. Uses Kolmogorov-Smirnov test for numeric columns and Chi-square for categorical. """ + if not isinstance(df_train, pd.DataFrame) or not isinstance(df_test, pd.DataFrame): + raise TypeError("Both df_train and df_test must be pandas DataFrames") + issues = [] issues.extend(_check_numeric_drift(df_train, df_test, threshold)) @@ -132,7 +138,7 @@ def _check_categorical_drift( quick_fix="Options:\n- Re-train model with recent data.\n- Investigate category distribution changes.\n- Consider rebalancing categories.", ) ) - except (ValueError, RuntimeWarning): - pass + except (ValueError, RuntimeWarning) as e: + _log.debug("Chi-square drift test failed for '%s': %s", col, e) return issues diff --git a/hashprep/checks/leakage.py b/hashprep/checks/leakage.py index 26d5901..ecbbc1b 100644 --- a/hashprep/checks/leakage.py +++ b/hashprep/checks/leakage.py @@ -3,8 +3,10 @@ from scipy.stats import chi2_contingency, f_oneway import numpy as np from ..config import DEFAULT_CONFIG +from ..utils.logging import get_logger _LEAK = DEFAULT_CONFIG.leakage +_log = get_logger("checks.leakage") def _check_data_leakage(analyzer): issues = [] @@ -91,7 +93,8 @@ def _check_target_leakage_patterns(analyzer): quick_fix=quick_fix, ) ) - except Exception: + except (ValueError, np.linalg.LinAlgError) as e: + _log.debug("Chi-square leakage test failed for '%s': %s", col, e) continue numeric_cols = analyzer.df.select_dtypes(include="number").drop( columns=[analyzer.target_col], errors="ignore" @@ -127,6 +130,7 @@ def _check_target_leakage_patterns(analyzer): quick_fix=quick_fix, ) ) - except Exception: + except (ValueError, RuntimeWarning) as e: + _log.debug("F-test leakage check failed for '%s': %s", col, e) continue return issues \ No newline at end of file diff --git a/hashprep/checks/missing_values.py b/hashprep/checks/missing_values.py index 36f7ab5..f407360 100644 --- a/hashprep/checks/missing_values.py +++ b/hashprep/checks/missing_values.py @@ -3,7 +3,11 @@ import pandas as pd from collections import defaultdict import numpy as np +from numpy.linalg import LinAlgError from ..config import DEFAULT_CONFIG +from ..utils.logging import get_logger + +_log = get_logger("checks.missing_values") _THRESHOLDS = DEFAULT_CONFIG.missing_values @@ -117,7 +121,8 @@ def cramers_v(table): cramers = cramers_v(table) if p_val < threshold and cramers > _THRESHOLDS.pattern_cramers_v_min: cat_patterns[col].append((other_col, p_val, cramers)) - except Exception: + except (ValueError, LinAlgError) as e: + _log.debug("Chi-square test failed for '%s' vs '%s': %s", col, other_col, e) continue for other_col in analyzer.df.select_dtypes( @@ -140,7 +145,8 @@ def cramers_v(table): if p_val < threshold and cohens_d > _THRESHOLDS.pattern_cohens_d_min: num_patterns[col].append((other_col, p_val, cohens_d)) - except Exception: + except (ValueError, RuntimeWarning) as e: + _log.debug("Mann-Whitney U test failed for '%s' vs '%s': %s", col, other_col, e) continue # Generate grouped issues diff --git a/hashprep/core/analyzer.py b/hashprep/core/analyzer.py index 5062c6a..bdbbc59 100644 --- a/hashprep/core/analyzer.py +++ b/hashprep/core/analyzer.py @@ -69,6 +69,15 @@ def __init__( sampling_config: Optional[SamplingConfig] = None, auto_sample: bool = True, ): + if not isinstance(df, pd.DataFrame): + raise TypeError(f"Expected pandas DataFrame, got {type(df).__name__}") + if df.columns.duplicated().any(): + raise ValueError(f"DataFrame has duplicate column names: {list(df.columns[df.columns.duplicated()])}") + if target_col is not None and target_col not in df.columns: + raise ValueError(f"Target column '{target_col}' not found in DataFrame") + if comparison_df is not None and not isinstance(comparison_df, pd.DataFrame): + raise TypeError(f"comparison_df must be a pandas DataFrame, got {type(comparison_df).__name__}") + self.comparison_df = comparison_df self.target_col = target_col self.selected_checks = selected_checks diff --git a/hashprep/reports/markdown.py b/hashprep/reports/markdown.py index 22b41a0..2b48dea 100644 --- a/hashprep/reports/markdown.py +++ b/hashprep/reports/markdown.py @@ -3,6 +3,9 @@ from typing import Dict, List import pandas as pd +from ..utils.logging import get_logger + +_log = get_logger("reports.markdown") import hashprep @@ -205,7 +208,8 @@ def generate(self, summary, full=False, output_file=None): img_f.write(base64.b64decode(plot_data)) rel_path = os.path.join(f"{report_name}_images", img_filename) content += f"![{plot_name}]({rel_path})\n\n" - except Exception: + except (OSError, ValueError) as e: + _log.warning("Failed to save plot '%s': %s", plot_name, e) content += f"*(Error saving plot {plot_name})*\n\n" content += "---\n\n" @@ -224,8 +228,8 @@ def generate(self, summary, full=False, output_file=None): img_f.write(base64.b64decode(plot_data)) rel_path = os.path.join(f"{report_name}_images", img_filename) content += f"![{method} Correlation]({rel_path})\n\n" - except Exception: - pass + except (OSError, ValueError) as e: + _log.warning("Failed to save correlation plot '%s': %s", method, e) pairs = [] for c1, corrs in num_corr["pearson"].items(): diff --git a/hashprep/summaries/interactions.py b/hashprep/summaries/interactions.py index 39d2259..2ef9ae3 100644 --- a/hashprep/summaries/interactions.py +++ b/hashprep/summaries/interactions.py @@ -1,6 +1,9 @@ import pandas as pd from scipy.stats import chi2_contingency, f_oneway import numpy as np +from ..utils.logging import get_logger + +_log = get_logger("summaries.interactions") def summarize_interactions(df): @@ -45,7 +48,8 @@ def _compute_categorical_correlations(df): r, k = table.shape cramers_v = (phi2 / min(k - 1, r - 1)) ** 0.5 results[f"{c1}__{c2}"] = float(cramers_v) - except Exception: + except (ValueError, np.linalg.LinAlgError) as e: + _log.debug("Categorical correlation failed for '%s' vs '%s': %s", c1, c2, e) continue return results @@ -69,6 +73,7 @@ def _compute_mixed_correlations(df): "f_stat": float(f_stat), "p_value": float(p_val), } - except Exception as e: + except (ValueError, RuntimeWarning) as e: + _log.debug("Mixed correlation failed for '%s' vs '%s': %s", cat, num, e) mixed_corr[f"{cat}__{num}"] = {"error": str(e)} return mixed_corr diff --git a/hashprep/utils/logging.py b/hashprep/utils/logging.py new file mode 100644 index 0000000..7c0efb8 --- /dev/null +++ b/hashprep/utils/logging.py @@ -0,0 +1,28 @@ +"""Structured logging for HashPrep. + +Provides a package-level logger that callers can use via get_logger(). +By default, logs at WARNING level so end users don't see noise. +Library consumers can adjust via standard logging configuration. +""" + +import logging + +LOGGER_NAME = "hashprep" + + +def get_logger(module_name: str = "") -> logging.Logger: + """Get a logger scoped to a hashprep submodule. + + Args: + module_name: Dot-separated submodule path (e.g. "checks.correlations"). + If empty, returns the root hashprep logger. + """ + name = f"{LOGGER_NAME}.{module_name}" if module_name else LOGGER_NAME + return logging.getLogger(name) + + +# Configure root hashprep logger with NullHandler (library best practice). +# This prevents "No handlers could be found" warnings when hashprep is used +# as a library. End users or the CLI can attach their own handlers. +_root_logger = logging.getLogger(LOGGER_NAME) +_root_logger.addHandler(logging.NullHandler())