python-bigquery-dataframes/third_party/bigframes_vendored/sklearn/decomposition/_pca.py at bb696159566a9160f401c89c3f612657db61e495 · googleapis/python-bigquery-dataframes · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
""" Principal Component Analysis.
"""

# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Olivier Grisel <olivier.grisel@ensta.org>
#         Mathieu Blondel <mathieu@mblondel.org>
#         Denis A. Engemann <denis-alexander.engemann@inria.fr>
#         Michael Eickenberg <michael.eickenberg@inria.fr>
#         Giorgio Patrini <giorgio.patrini@anu.edu.au>
#
# License: BSD 3 clause

from abc import ABCMeta

from bigframes_vendored.sklearn.base import BaseEstimator

from bigframes import constants


class PCA(BaseEstimator, metaclass=ABCMeta):
    """Principal component analysis (PCA).

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> from bigframes.ml.decomposition import PCA
        >>> X = bpd.DataFrame({"feat0": [-1, -2, -3, 1, 2, 3], "feat1": [-1, -1, -2, 1, 1, 2]})
        >>> pca = PCA(n_components=2).fit(X)
        >>> pca.predict(X) # doctest:+SKIP
            principal_component_1  principal_component_2
        0              -0.755243               0.157628
        1               -1.05405              -0.141179
        2              -1.809292               0.016449
        3               0.755243              -0.157628
        4                1.05405               0.141179
        5               1.809292              -0.016449
        <BLANKLINE>
        [6 rows x 2 columns]
        >>> pca.explained_variance_ratio_ # doctest:+SKIP
            principal_component_id  explained_variance_ratio
        0                       1                   0.00901
        1                       0                   0.99099
        <BLANKLINE>
        [2 rows x 2 columns]

    Args:
        n_components (int, float or None, default None):
            Number of components to keep. If n_components is not set, all
            components are kept, n_components = min(n_samples, n_features).
            If 0 < n_components < 1, select the number of components such
            that the amount of variance that needs to be explained is
            greater than the percentage specified by n_components.
        svd_solver ("full", "randomized" or "auto", default "auto"):
            The solver to use to calculate the principal components. Details: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-pca#pca_solver.

    """

    # Every member below is an interface stub: its body only raises
    # NotImplementedError with the shared abstract-method message.
    # NOTE(review): presumably a concrete estimator elsewhere overrides these
    # members and reuses the docstrings -- confirm before rewording them.

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
                Series or DataFrame of shape (n_samples, n_features). Training vector,
                where `n_samples` is the number of samples and `n_features` is
                the number of features.

            y (default None):
                Ignored.

        Returns:
            PCA: Fitted estimator.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    def score(self, X=None, y=None):
        """Calculate evaluation metrics of the model.

        .. note::

            Output matches that of the BigQuery ML.EVALUATE function.
            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#pca_models
            for the outputs relevant to this model type.

        Args:
            X (default None):
                Ignored.

            y (default None):
                Ignored.

        Returns:
            bigframes.dataframe.DataFrame: DataFrame that represents model metrics.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    def predict(self, X):
        """Apply the dimensionality reduction to X.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
                Series or a DataFrame to predict.

        Returns:
            bigframes.dataframe.DataFrame: Predicted DataFrame holding the principal component values computed for each sample.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    def fit_predict(
        self,
        X,
        y=None,
    ):
        """Fit the model with X and apply the dimensionality reduction on X.

        Convenience method; equivalent to calling fit(X) followed by predict(X).

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
                DataFrame of shape (n_samples, n_features). Training data.
            y (default None):
                Not used, present here for API consistency by convention.

        Returns:
            bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns the dimensionality-reduced representation of X.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    @property
    def components_(self):
        """Principal axes in feature space, representing the directions of maximum variance in the data.

        Returns:
            bigframes.dataframe.DataFrame: DataFrame of principal components, containing following columns:
                principal_component_id: An integer that identifies the principal component.

                feature: The column name that contains the feature.

                numerical_value: If feature is numeric, the value of feature for the principal component that principal_component_id identifies. If feature isn't numeric, the value is NULL.

                categorical_value: A list of mappings containing information about categorical features. Each mapping contains the following fields:
                    categorical_value.category: The name of each category.

                    categorical_value.value: The value of categorical_value.category for the principal component that principal_component_id identifies.

            The output contains one row per feature per component.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    @property
    def explained_variance_(self):
        """The amount of variance explained by each of the selected components.

        Returns:
            bigframes.dataframe.DataFrame: DataFrame containing following columns:
                principal_component_id: An integer that identifies the principal component.

                explained_variance: The factor by which the eigenvector is scaled. Eigenvalue and explained variance are the same concepts in PCA.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    @property
    def explained_variance_ratio_(self):
        """Percentage of variance explained by each of the selected components.

        Returns:
            bigframes.dataframe.DataFrame: DataFrame containing following columns:
                principal_component_id: An integer that identifies the principal component.

                explained_variance_ratio: The total variance is the sum of variances, also known as eigenvalues, of all
                of the individual principal components. The explained variance ratio by a principal component is
                the ratio between the variance, also known as eigenvalue, of that principal component and the total variance.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)