-
Notifications
You must be signed in to change notification settings - Fork 87
Helper function to sanitize tables #935
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1fa8f85
d5cd64f
8c3b792
910d79b
fd2ef34
e95cebb
530ba36
6ce065d
f25165a
9f22185
8221d5d
1ec773b
38bb554
ad1c573
a27a6fe
b994960
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,9 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from collections.abc import Iterable | ||
|
|
||
| from anndata import AnnData | ||
|
|
||
| from spatialdata._core.spatialdata import SpatialData | ||
|
|
||
|
|
||
|
|
@@ -25,3 +29,138 @@ def _find_common_table_keys(sdatas: Iterable[SpatialData]) -> set[str]: | |
| common_keys.intersection_update(sdata.tables.keys()) | ||
|
|
||
| return common_keys | ||
|
|
||
|
|
||
| def sanitize_name(name: str, is_dataframe_column: bool = False) -> str: | ||
| """ | ||
| Sanitize a name to comply with SpatialData naming rules. | ||
|
|
||
| This function converts invalid names into valid ones by: | ||
| 1. Converting to string if not already | ||
| 2. Removing invalid characters | ||
| 3. Handling special cases like "__" prefix | ||
| 4. Ensuring the name is not empty | ||
| 5. Handling special cases for dataframe columns | ||
|
|
||
| See a discussion on the naming rules, and how to avoid naming collisions, here: | ||
| https://github.com/scverse/spatialdata/discussions/707 | ||
|
|
||
| Parameters | ||
| ---------- | ||
| name | ||
| The name to sanitize | ||
| is_dataframe_column | ||
| Whether this name is for a dataframe column (additional restrictions apply) | ||
|
|
||
| Returns | ||
| ------- | ||
| A sanitized version of the name that complies with SpatialData naming rules. If a | ||
| santized name cannoted be generated, it returns "unnamed". | ||
|
|
||
| Examples | ||
| -------- | ||
| >>> sanitize_name("my@invalid#name") | ||
| 'my_invalid_name' | ||
| >>> sanitize_name("__private") | ||
| 'private' | ||
| >>> sanitize_name("_index", is_dataframe_column=True) | ||
| 'index' | ||
| """ | ||
| # Convert to string if not already | ||
| name = str(name) | ||
|
|
||
| # Handle empty string case | ||
| if not name: | ||
| return "unnamed" | ||
|
|
||
| # Handle special cases | ||
| if name in {".", ".."}: | ||
| return "unnamed" | ||
|
|
||
| sanitized = "".join(char if char.isalnum() or char in "_-." else "_" for char in name) | ||
|
|
||
| # remove double underscores if found as a prefix | ||
| while sanitized.startswith("__"): | ||
| sanitized = sanitized[1:] | ||
|
|
||
| if is_dataframe_column and sanitized == "_index": | ||
| return "index" | ||
|
|
||
| # Ensure we don't end up with an empty string after sanitization | ||
| return sanitized or "unnamed" | ||
|
|
||
|
|
||
| def sanitize_table(data: AnnData, inplace: bool = True) -> AnnData | None: | ||
| """ | ||
| Sanitize all keys in an AnnData table to comply with SpatialData naming rules. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should maybe have a doc on these naming rules unless we had one already. WDYT @LucaMarconato
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, at some point we will make a doc (when working on better documenting the file format). For the moment I added a link to #707. |
||
|
|
||
| This function sanitizes all keys in obs, var, obsm, obsp, varm, varp, uns, and layers | ||
| while maintaining case-insensitive uniqueness. It can either modify the table in-place | ||
| or return a new sanitized copy. | ||
|
|
||
| See a discussion on the naming rules here: | ||
| https://github.com/scverse/spatialdata/discussions/707 | ||
|
|
||
| Parameters | ||
| ---------- | ||
| data | ||
| The AnnData table to sanitize | ||
| inplace | ||
| Whether to modify the table in-place or return a new copy | ||
|
|
||
| Returns | ||
| ------- | ||
| If inplace is False, returns a new AnnData object with sanitized keys. | ||
| If inplace is True, returns None as the original object is modified. | ||
|
|
||
| Examples | ||
| -------- | ||
| >>> import anndata as ad | ||
| >>> adata = ad.AnnData(obs=pd.DataFrame({"@invalid#": [1, 2]})) | ||
| >>> # Create a new sanitized copy | ||
| >>> sanitized = sanitize_table(adata) | ||
| >>> print(sanitized.obs.columns) | ||
| Index(['invalid_'], dtype='object') | ||
| >>> # Or modify in-place | ||
| >>> sanitize_table(adata, inplace=True) | ||
| >>> print(adata.obs.columns) | ||
| Index(['invalid_'], dtype='object') | ||
| """ | ||
| import copy | ||
| from collections import defaultdict | ||
|
|
||
| # Create a deep copy if not modifying in-place | ||
| sanitized = data if inplace else copy.deepcopy(data) | ||
|
|
||
| # Track used names to maintain case-insensitive uniqueness | ||
| used_names_lower: dict[str, set[str]] = defaultdict(set) | ||
|
|
||
| def get_unique_name(name: str, attr: str, is_dataframe_column: bool = False) -> str: | ||
| base_name = sanitize_name(name, is_dataframe_column) | ||
| normalized_base = base_name.lower() | ||
|
|
||
| # If this exact name is already used, add a number | ||
| if normalized_base in used_names_lower[attr]: | ||
| counter = 1 | ||
| while f"{base_name}_{counter}".lower() in used_names_lower[attr]: | ||
| counter += 1 | ||
| base_name = f"{base_name}_{counter}" | ||
|
|
||
| used_names_lower[attr].add(base_name.lower()) | ||
| return base_name | ||
|
|
||
| # Handle obs and var (dataframe columns) | ||
| for attr in ("obs", "var"): | ||
| df = getattr(sanitized, attr) | ||
| new_columns = {old: get_unique_name(old, attr, is_dataframe_column=True) for old in df.columns} | ||
| df.rename(columns=new_columns, inplace=True) | ||
|
|
||
| # Handle other attributes | ||
| for attr in ("obsm", "obsp", "varm", "varp", "uns", "layers"): | ||
| d = getattr(sanitized, attr) | ||
| new_keys = {old: get_unique_name(old, attr) for old in d} | ||
| # Create new dictionary with sanitized keys | ||
| new_dict = {new_keys[old]: value for old, value in d.items()} | ||
| setattr(sanitized, attr, new_dict) | ||
|
|
||
| return None if inplace else sanitized | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,208 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
| import pytest | ||
| from anndata import AnnData | ||
|
|
||
| from spatialdata import SpatialData | ||
| from spatialdata._core._utils import sanitize_name, sanitize_table | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def invalid_table() -> AnnData: | ||
| """AnnData with invalid obs column names to test basic sanitization.""" | ||
| return AnnData( | ||
| obs=pd.DataFrame( | ||
| { | ||
| "@invalid#": [1, 2], | ||
| "valid_name": [3, 4], | ||
| "__private": [5, 6], | ||
| } | ||
| ) | ||
| ) | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def invalid_table_with_index() -> AnnData: | ||
| """AnnData with a name requiring whitespace→underscore and a dataframe index column.""" | ||
| return AnnData( | ||
| obs=pd.DataFrame( | ||
| { | ||
| "invalid name": [1, 2], | ||
| "_index": [3, 4], | ||
| } | ||
| ) | ||
| ) | ||
|
|
||
|
|
||
| # ----------------------------------------------------------------------------- | ||
| # sanitize_name tests | ||
| # ----------------------------------------------------------------------------- | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "raw,expected", | ||
| [ | ||
| ("valid_name", "valid_name"), | ||
| ("valid-name", "valid-name"), | ||
| ("valid.name", "valid.name"), | ||
| ("invalid@name", "invalid_name"), | ||
| ("invalid#name", "invalid_name"), | ||
| ("invalid name", "invalid_name"), | ||
| ("", "unnamed"), | ||
| (".", "unnamed"), | ||
| ("..", "unnamed"), | ||
| ("__", "_"), | ||
| ("___", "_"), | ||
| ("____#@$@", "_"), | ||
| ("__private", "_private"), | ||
| ], | ||
| ) | ||
| def test_sanitize_name_strips_special_chars(raw, expected): | ||
| assert sanitize_name(raw) == expected | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "raw,is_df_col,expected", | ||
| [ | ||
| ("_index", True, "index"), | ||
| ("_index", False, "_index"), | ||
| ("valid@column", True, "valid_column"), | ||
| ("__private", True, "_private"), | ||
| ], | ||
| ) | ||
| def test_sanitize_name_dataframe_column(raw, is_df_col, expected): | ||
| assert sanitize_name(raw, is_dataframe_column=is_df_col) == expected | ||
|
|
||
|
|
||
| # ----------------------------------------------------------------------------- | ||
| # sanitize_table basic behaviors | ||
| # ----------------------------------------------------------------------------- | ||
|
|
||
|
|
||
| def test_sanitize_table_basic_columns(invalid_table, invalid_table_with_index): | ||
| ad1 = sanitize_table(invalid_table, inplace=False) | ||
| assert isinstance(ad1, AnnData) | ||
| assert list(ad1.obs.columns) == ["_invalid_", "valid_name", "_private"] | ||
|
|
||
| ad2 = sanitize_table(invalid_table_with_index, inplace=False) | ||
| assert list(ad2.obs.columns) == ["invalid_name", "index"] | ||
|
|
||
| # original fixture remains unchanged | ||
| assert list(invalid_table.obs.columns) == ["@invalid#", "valid_name", "__private"] | ||
|
|
||
|
|
||
| def test_sanitize_table_inplace_copy(invalid_table): | ||
| ad = invalid_table.copy() | ||
| sanitize_table(ad) # inplace=True is now default | ||
| assert list(ad.obs.columns) == ["_invalid_", "valid_name", "_private"] | ||
|
|
||
|
|
||
| def test_sanitize_table_case_insensitive_collisions(): | ||
| obs = pd.DataFrame( | ||
| { | ||
| "Column1": [1, 2], | ||
| "column1": [3, 4], | ||
| "COLUMN1": [5, 6], | ||
| } | ||
| ) | ||
| ad = AnnData(obs=obs) | ||
| sanitized = sanitize_table(ad, inplace=False) | ||
| cols = list(sanitized.obs.columns) | ||
| assert sorted(cols) == sorted(["Column1", "column1_1", "COLUMN1_2"]) | ||
|
|
||
|
|
||
| def test_sanitize_table_whitespace_collision(): | ||
| """Ensure 'a b' → 'a_b' doesn't collide silently with existing 'a_b'.""" | ||
| obs = pd.DataFrame({"a b": [1], "a_b": [2]}) | ||
| ad = AnnData(obs=obs) | ||
| sanitized = sanitize_table(ad, inplace=False) | ||
| cols = list(sanitized.obs.columns) | ||
| assert "a_b" in cols | ||
| assert "a_b_1" in cols | ||
|
|
||
|
|
||
| # ----------------------------------------------------------------------------- | ||
| # sanitize_table attribute‐specific tests | ||
| # ----------------------------------------------------------------------------- | ||
|
|
||
|
|
||
| def test_sanitize_table_obs_and_obs_columns(): | ||
| ad = AnnData(obs=pd.DataFrame({"@col": [1, 2]})) | ||
| sanitized = sanitize_table(ad, inplace=False) | ||
| assert list(sanitized.obs.columns) == ["_col"] | ||
|
|
||
|
|
||
| def test_sanitize_table_obsm_and_obsp(): | ||
| ad = AnnData(obs=pd.DataFrame({"@col": [1, 2]})) | ||
| ad.obsm["@col"] = np.array([[1, 2], [3, 4]]) | ||
| ad.obsp["bad name"] = np.array([[1, 2], [3, 4]]) | ||
| sanitized = sanitize_table(ad, inplace=False) | ||
| assert list(sanitized.obsm.keys()) == ["_col"] | ||
| assert list(sanitized.obsp.keys()) == ["bad_name"] | ||
|
|
||
|
|
||
| def test_sanitize_table_varm_and_varp(): | ||
| ad = AnnData(obs=pd.DataFrame({"x": [1, 2]}), var=pd.DataFrame(index=["v1", "v2"])) | ||
| ad.varm["__priv"] = np.array([[1, 2], [3, 4]]) | ||
| ad.varp["_index"] = np.array([[1, 2], [3, 4]]) | ||
| sanitized = sanitize_table(ad, inplace=False) | ||
| assert list(sanitized.varm.keys()) == ["_priv"] | ||
| assert list(sanitized.varp.keys()) == ["_index"] | ||
|
|
||
|
|
||
| def test_sanitize_table_uns_and_layers(): | ||
| ad = AnnData(obs=pd.DataFrame({"x": [1, 2]}), var=pd.DataFrame(index=["v1", "v2"])) | ||
| ad.uns["bad@key"] = "val" | ||
| ad.layers["bad#layer"] = np.array([[0, 1], [1, 0]]) | ||
| sanitized = sanitize_table(ad, inplace=False) | ||
| assert list(sanitized.uns.keys()) == ["bad_key"] | ||
| assert list(sanitized.layers.keys()) == ["bad_layer"] | ||
|
|
||
|
|
||
| def test_sanitize_table_empty_returns_empty(): | ||
| ad = AnnData() | ||
| sanitized = sanitize_table(ad, inplace=False) | ||
| assert isinstance(sanitized, AnnData) | ||
| assert sanitized.obs.empty | ||
| assert sanitized.var.empty | ||
|
|
||
|
|
||
| def test_sanitize_table_preserves_underlying_data(): | ||
| ad = AnnData(obs=pd.DataFrame({"@invalid#": [1, 2], "valid": [3, 4]})) | ||
| ad.obsm["@invalid#"] = np.array([[1, 2], [3, 4]]) | ||
| ad.uns["invalid@key"] = "value" | ||
| sanitized = sanitize_table(ad, inplace=False) | ||
| assert sanitized.obs["_invalid_"].tolist() == [1, 2] | ||
| assert sanitized.obs["valid"].tolist() == [3, 4] | ||
| assert np.array_equal(sanitized.obsm["_invalid_"], np.array([[1, 2], [3, 4]])) | ||
| assert sanitized.uns["invalid_key"] == "value" | ||
|
|
||
|
|
||
| # ----------------------------------------------------------------------------- | ||
| # SpatialData integration | ||
| # ----------------------------------------------------------------------------- | ||
|
|
||
|
|
||
| def test_sanitize_table_in_spatialdata_sanitized_fixture(invalid_table, invalid_table_with_index): | ||
| table1 = invalid_table.copy() | ||
| table2 = invalid_table_with_index.copy() | ||
| sanitize_table(table1) | ||
| sanitize_table(table2) | ||
| sdata_sanitized_tables = SpatialData(tables={"table1": table1, "table2": table2}) | ||
|
|
||
| t1 = sdata_sanitized_tables.tables["table1"] | ||
| t2 = sdata_sanitized_tables.tables["table2"] | ||
| assert list(t1.obs.columns) == ["_invalid_", "valid_name", "_private"] | ||
| assert list(t2.obs.columns) == ["invalid_name", "index"] | ||
|
|
||
|
|
||
| def test_spatialdata_retains_other_elements(full_sdata): | ||
| # Add another sanitized table into an existing full_sdata | ||
| tbl = AnnData(obs=pd.DataFrame({"@foo#": [1, 2], "bar": [3, 4]})) | ||
| sanitize_table(tbl) | ||
| full_sdata.tables["new_table"] = tbl | ||
|
|
||
| # Verify columns and presence of other SpatialData attributes | ||
| assert list(full_sdata.tables["new_table"].obs.columns) == ["_foo_", "bar"] |
Uh oh!
There was an error while loading. Please reload this page.