Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions hashprep/checks/drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

from .core import Issue
from ..config import DEFAULT_CONFIG
from ..utils.logging import get_logger

_log = get_logger("checks.drift")

_DRIFT = DEFAULT_CONFIG.drift
CRITICAL_P_VALUE = _DRIFT.critical_p_value
Expand All @@ -19,6 +22,9 @@ def check_drift(
Check for distribution shift between two datasets.
Uses Kolmogorov-Smirnov test for numeric columns and Chi-square for categorical.
"""
if not isinstance(df_train, pd.DataFrame) or not isinstance(df_test, pd.DataFrame):
raise TypeError("Both df_train and df_test must be pandas DataFrames")

issues = []

issues.extend(_check_numeric_drift(df_train, df_test, threshold))
Expand Down Expand Up @@ -132,7 +138,7 @@ def _check_categorical_drift(
quick_fix="Options:\n- Re-train model with recent data.\n- Investigate category distribution changes.\n- Consider rebalancing categories.",
)
)
except (ValueError, RuntimeWarning):
pass
except (ValueError, RuntimeWarning) as e:
_log.debug("Chi-square drift test failed for '%s': %s", col, e)

return issues
8 changes: 6 additions & 2 deletions hashprep/checks/leakage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
from scipy.stats import chi2_contingency, f_oneway
import numpy as np
from ..config import DEFAULT_CONFIG
from ..utils.logging import get_logger

_LEAK = DEFAULT_CONFIG.leakage
_log = get_logger("checks.leakage")

def _check_data_leakage(analyzer):
issues = []
Expand Down Expand Up @@ -91,7 +93,8 @@ def _check_target_leakage_patterns(analyzer):
quick_fix=quick_fix,
)
)
except Exception:
except (ValueError, np.linalg.LinAlgError) as e:
_log.debug("Chi-square leakage test failed for '%s': %s", col, e)
continue
numeric_cols = analyzer.df.select_dtypes(include="number").drop(
columns=[analyzer.target_col], errors="ignore"
Expand Down Expand Up @@ -127,6 +130,7 @@ def _check_target_leakage_patterns(analyzer):
quick_fix=quick_fix,
)
)
except Exception:
except (ValueError, RuntimeWarning) as e:
_log.debug("F-test leakage check failed for '%s': %s", col, e)
continue
return issues
10 changes: 8 additions & 2 deletions hashprep/checks/missing_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
import pandas as pd
from collections import defaultdict
import numpy as np
from numpy.linalg import LinAlgError
from ..config import DEFAULT_CONFIG
from ..utils.logging import get_logger

_log = get_logger("checks.missing_values")

_THRESHOLDS = DEFAULT_CONFIG.missing_values

Expand Down Expand Up @@ -117,7 +121,8 @@ def cramers_v(table):
cramers = cramers_v(table)
if p_val < threshold and cramers > _THRESHOLDS.pattern_cramers_v_min:
cat_patterns[col].append((other_col, p_val, cramers))
except Exception:
except (ValueError, LinAlgError) as e:
_log.debug("Chi-square test failed for '%s' vs '%s': %s", col, other_col, e)
continue

for other_col in analyzer.df.select_dtypes(
Expand All @@ -140,7 +145,8 @@ def cramers_v(table):

if p_val < threshold and cohens_d > _THRESHOLDS.pattern_cohens_d_min:
num_patterns[col].append((other_col, p_val, cohens_d))
except Exception:
except (ValueError, RuntimeWarning) as e:
_log.debug("Mann-Whitney U test failed for '%s' vs '%s': %s", col, other_col, e)
continue

# Generate grouped issues
Expand Down
9 changes: 9 additions & 0 deletions hashprep/core/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,15 @@ def __init__(
sampling_config: Optional[SamplingConfig] = None,
auto_sample: bool = True,
):
if not isinstance(df, pd.DataFrame):
raise TypeError(f"Expected pandas DataFrame, got {type(df).__name__}")
if df.columns.duplicated().any():
raise ValueError(f"DataFrame has duplicate column names: {list(df.columns[df.columns.duplicated()])}")
if target_col is not None and target_col not in df.columns:
raise ValueError(f"Target column '{target_col}' not found in DataFrame")
if comparison_df is not None and not isinstance(comparison_df, pd.DataFrame):
raise TypeError(f"comparison_df must be a pandas DataFrame, got {type(comparison_df).__name__}")

self.comparison_df = comparison_df
self.target_col = target_col
self.selected_checks = selected_checks
Expand Down
10 changes: 7 additions & 3 deletions hashprep/reports/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
from typing import Dict, List

import pandas as pd
from ..utils.logging import get_logger

_log = get_logger("reports.markdown")

import hashprep

Expand Down Expand Up @@ -205,7 +208,8 @@ def generate(self, summary, full=False, output_file=None):
img_f.write(base64.b64decode(plot_data))
rel_path = os.path.join(f"{report_name}_images", img_filename)
content += f"![{plot_name}]({rel_path})\n\n"
except Exception:
except (OSError, ValueError) as e:
_log.warning("Failed to save plot '%s': %s", plot_name, e)
content += f"*(Error saving plot {plot_name})*\n\n"

content += "---\n\n"
Expand All @@ -224,8 +228,8 @@ def generate(self, summary, full=False, output_file=None):
img_f.write(base64.b64decode(plot_data))
rel_path = os.path.join(f"{report_name}_images", img_filename)
content += f"![{method} Correlation]({rel_path})\n\n"
except Exception:
pass
except (OSError, ValueError) as e:
_log.warning("Failed to save correlation plot '%s': %s", method, e)

pairs = []
for c1, corrs in num_corr["pearson"].items():
Expand Down
9 changes: 7 additions & 2 deletions hashprep/summaries/interactions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import pandas as pd
from scipy.stats import chi2_contingency, f_oneway
import numpy as np
from ..utils.logging import get_logger

_log = get_logger("summaries.interactions")


def summarize_interactions(df):
Expand Down Expand Up @@ -45,7 +48,8 @@ def _compute_categorical_correlations(df):
r, k = table.shape
cramers_v = (phi2 / min(k - 1, r - 1)) ** 0.5
results[f"{c1}__{c2}"] = float(cramers_v)
except Exception:
except (ValueError, np.linalg.LinAlgError) as e:
_log.debug("Categorical correlation failed for '%s' vs '%s': %s", c1, c2, e)
continue
return results

Expand All @@ -69,6 +73,7 @@ def _compute_mixed_correlations(df):
"f_stat": float(f_stat),
"p_value": float(p_val),
}
except Exception as e:
except (ValueError, RuntimeWarning) as e:
_log.debug("Mixed correlation failed for '%s' vs '%s': %s", cat, num, e)
mixed_corr[f"{cat}__{num}"] = {"error": str(e)}
return mixed_corr
28 changes: 28 additions & 0 deletions hashprep/utils/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Structured logging for HashPrep.

Provides a package-level logger that callers can use via get_logger().
No level is set here, so hashprep loggers inherit the standard default
(WARNING) and end users don't see debug noise.
Library consumers can adjust via standard logging configuration.
"""

import logging

LOGGER_NAME = "hashprep"


def get_logger(module_name: str = "") -> logging.Logger:
    """Return the logger for a hashprep submodule.

    The returned logger lives under the package-level ``LOGGER_NAME``
    namespace, so configuring that one logger affects every logger handed
    out by this function.

    Args:
        module_name: Dot-separated submodule path, relative to the package
            (e.g. "checks.correlations"). An empty value selects the
            package-level logger itself.

    Returns:
        The ``logging.Logger`` registered under the resulting name.
    """
    # Falsy module_name -> the package logger, with no trailing dot.
    if not module_name:
        return logging.getLogger(LOGGER_NAME)

    qualified_name = f"{LOGGER_NAME}.{module_name}"
    return logging.getLogger(qualified_name)


# Configure the package-level ("root") hashprep logger with a NullHandler
# (library best practice): importing hashprep attaches no output handler of
# its own, and end users or the CLI can attach their own handlers.
# This prevents "No handlers could be found" warnings when hashprep is used
# as a library.
# NOTE(review): that warning text is Python 2 wording; on Python 3 the
# equivalent fallback is presumably logging.lastResort — confirm and reword.
_root_logger = logging.getLogger(LOGGER_NAME)
_root_logger.addHandler(logging.NullHandler())