diff --git a/CHANGELOG.md b/CHANGELOG.md index 5704a59..5ec3fe0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## v1.4.0 - 25-11-2024 + +### Changed + +- (Breaking change) The methods `CholeskyFactoriser::factorise`, `CholeskyFactoriser::solve`, `QRFactoriser::factorise`, + `QRFactoriser::leastSquares`, and `QRFactoriser::getQR` are now `void` and do not + return a status code. Instead, a status code is returned by called `statusCode`. + This change leads to a reduction in data being downloaded from the GPU. +- In `Svd` a status code (`bool`) is returned from `Svd::factorise` only if the + `#GPUTILS_DEBUG_MODE` is defined, otherwise, the method returns always `true`. +- New base class `IStatus` used for a universal implementation of `info()` + + diff --git a/include/tensor.cuh b/include/tensor.cuh index 160dd98..5a08392 100644 --- a/include/tensor.cuh +++ b/include/tensor.cuh @@ -1166,6 +1166,33 @@ std::ostream &DTensor::print(std::ostream &out) const { } + +/* ================================================================================================ + * STATUS INTERFACE + * ================================================================================================ */ +/** + * A class that holds a device pointer to a status code + */ +class IStatus { +protected: + std::unique_ptr> m_info; ///< Status code of computation + + IStatus(size_t n = 1) { + m_info = std::make_unique>(1, 1, n); + } + +public: + + /** + * Provides access to the info stored in this class + * @return tensor of shape (1, 1, n) + */ + virtual DTensor& info() { + return *m_info; + } +}; + + /* ================================================================================================ * SINGULAR VALUE DECOMPOSITION (SVD) * ================================================================================================ */ @@ -1186,6 +1213,7 @@ __global__ void k_countNonzeroSingularValues(const T *d_array, size_t n, unsigne } } + /** * Singular value decomposition (SVD) needs a workspace to be setup for cuSolver before factorisation. * This object can be setup for a specific type and size of (m,n,1)-tensor (i.e., a matrix). @@ -1193,7 +1221,7 @@ __global__ void k_countNonzeroSingularValues(const T *d_array, size_t n, unsigne * @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double) */ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX -class Svd { +class Svd : public IStatus { private: @@ -1203,7 +1231,6 @@ private: std::shared_ptr> m_S; ///< Diagonal matrix S or singular values std::shared_ptr> m_U; ///< Matrix U or left singular vectors std::unique_ptr> m_workspace; ///< Workspace for SVD - std::unique_ptr> m_info; ///< Status code of computation std::shared_ptr> m_rank; ///< Rank of original matrix bool m_computeU = false; ///< Whether to compute U bool m_destroyMatrix = true; ///< Whether to sacrifice original matrix @@ -1225,8 +1252,10 @@ private: */ void computeWorkspaceSize(size_t m, size_t n); + public: + /** * Constructor. * @param mat matrix to be factorised @@ -1235,7 +1264,7 @@ public: */ Svd(DTensor &mat, bool computeU = false, - bool destroyMatrix = true) { + bool destroyMatrix = true) : IStatus(mat.numMats()) { checkMatrix(mat); m_destroyMatrix = destroyMatrix; m_tensor = (destroyMatrix) ? &mat : new DTensor(mat); @@ -1248,7 +1277,6 @@ public: m_workspace = std::make_unique>(m_lwork, 1, 1); ///< Allocates required workspace memory m_Vtr = std::make_shared>(n, n, nMats); m_S = std::make_shared>(minMN, 1, nMats); - m_info = std::make_unique>(1, 1, nMats); m_rank = std::make_unique>(1, 1, nMats, true); if (computeU) m_U = std::make_shared>(m, m, nMats); } @@ -1256,9 +1284,8 @@ public: /** * Perform factorisation. * Warning: the original matrix is destroyed by default! - * @return true if factorisation is successful */ - bool factorise(); + void factorise(); /** * @return diagonal matrix S, or singular values @@ -1308,6 +1335,8 @@ public: return *m_rank; } + + }; @@ -1323,11 +1352,10 @@ inline void Svd::computeWorkspaceSize(size_t m, size_t n) { template<> -inline bool Svd::factorise() { +inline void Svd::factorise() { size_t m = m_tensor->numRows(); size_t n = m_tensor->numCols(); size_t nMats = m_tensor->numMats(); - bool info = true; std::unique_ptr> Ui; for (size_t i = 0; i < nMats; i++) { DTensor Ai(*m_tensor, 2, i, i); // tensor A[:, :, i] @@ -1346,18 +1374,15 @@ inline bool Svd::factorise() { m_workspace->raw(), m_lwork, nullptr, // rwork (used only if SVD fails) - m_info->raw())); - info = info && ((*m_info)(0, 0, 0) == 0); + m_info->raw() + i)); } - return info; } template<> -inline bool Svd::factorise() { +inline void Svd::factorise() { size_t m = m_tensor->numRows(); size_t n = m_tensor->numCols(); size_t nMats = m_tensor->numMats(); - bool info = true; std::unique_ptr> Ui; for (size_t i = 0; i < nMats; i++) { DTensor Ai(*m_tensor, 2, i, i); // tensor A[:, :, i] @@ -1376,10 +1401,8 @@ inline bool Svd::factorise() { m_workspace->raw(), m_lwork, nullptr, // rwork (used only if SVD fails) - m_info->raw())); - info = info && ((*m_info)(0, 0, 0) == 0); + m_info->raw() + i)); } - return info; } @@ -1394,11 +1417,10 @@ inline bool Svd::factorise() { * @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double) */ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX -class CholeskyFactoriser { +class CholeskyFactoriser : public IStatus { private: int m_workspaceSize = 0; ///< Size of workspace needed for CF - std::unique_ptr> m_info; ///< Status code of computation std::unique_ptr> m_workspace; ///< Workspace for CF DTensor *m_matrix; ///< Matrix to factorise. Do not destroy! @@ -1409,20 +1431,18 @@ private: public: - CholeskyFactoriser(DTensor &A) { + CholeskyFactoriser(DTensor &A) : IStatus() { if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser`"); if (A.numRows() != A.numCols()) throw std::invalid_argument("[Cholesky] Matrix A must be square"); m_matrix = &A; computeWorkspaceSize(); m_workspace = std::make_unique>(m_workspaceSize); - m_info = std::make_unique>(1); } /** * Factorise matrix. - * @return status code of computation */ - int factorise(); + void factorise(); /** * Solves for the solution of A \ b using the CF of A. @@ -1430,9 +1450,9 @@ public: * A and b must have compatible dimensions (same number of rows and matrices=1). * A must be square (m=n). * @param b provided matrix - * @return status code of computation */ - int solve(DTensor &b); + void solve(DTensor &b); + }; @@ -1453,30 +1473,28 @@ inline void CholeskyFactoriser::computeWorkspaceSize() { } template<> -inline int CholeskyFactoriser::factorise() { +inline void CholeskyFactoriser::factorise() { size_t n = m_matrix->numRows(); gpuErrChk(cusolverDnDpotrf(Session::getInstance().cuSolverHandle(), CUBLAS_FILL_MODE_LOWER, n, m_matrix->raw(), n, m_workspace->raw(), m_workspaceSize, m_info->raw())); - return (*m_info)(0); } template<> -inline int CholeskyFactoriser::factorise() { +inline void CholeskyFactoriser::factorise() { size_t n = m_matrix->numRows(); gpuErrChk(cusolverDnSpotrf(Session::getInstance().cuSolverHandle(), CUBLAS_FILL_MODE_LOWER, n, m_matrix->raw(), n, m_workspace->raw(), m_workspaceSize, m_info->raw())); - return (*m_info)(0); } template<> -inline int CholeskyFactoriser::solve(DTensor &rhs) { +inline void CholeskyFactoriser::solve(DTensor &rhs) { size_t n = m_matrix->numRows(); gpuErrChk(cusolverDnDpotrs(Session::getInstance().cuSolverHandle(), CUBLAS_FILL_MODE_LOWER, @@ -1484,11 +1502,10 @@ inline int CholeskyFactoriser::solve(DTensor &rhs) { m_matrix->raw(), n, rhs.raw(), n, m_info->raw())); - return (*m_info)(0); } template<> -inline int CholeskyFactoriser::solve(DTensor &rhs) { +inline void CholeskyFactoriser::solve(DTensor &rhs) { size_t n = m_matrix->numRows(); gpuErrChk(cusolverDnSpotrs(Session::getInstance().cuSolverHandle(), CUBLAS_FILL_MODE_LOWER, @@ -1496,7 +1513,6 @@ inline int CholeskyFactoriser::solve(DTensor &rhs) { m_matrix->raw(), n, rhs.raw(), n, m_info->raw())); - return (*m_info)(0); } @@ -1511,11 +1527,10 @@ inline int CholeskyFactoriser::solve(DTensor &rhs) { * @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double) */ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX -class QRFactoriser { +class QRFactoriser : public IStatus { private: int m_workspaceSize = 0; ///< Size of workspace needed for LS - std::unique_ptr> m_info; ///< Status code of computation std::unique_ptr> m_householder; ///< For storing householder reflectors std::unique_ptr> m_workspace; ///< Workspace for LS DTensor *m_matrix; ///< Lhs matrix template. Do not destroy! @@ -1527,21 +1542,19 @@ private: public: - QRFactoriser(DTensor &A) { + QRFactoriser(DTensor &A) : IStatus() { if (A.numMats() > 1) throw std::invalid_argument("[QR] 3D tensors require `leastSquaresBatched`"); if (A.numRows() < A.numCols()) throw std::invalid_argument("[QR] Matrix A must be tall or square"); m_matrix = &A; computeWorkspaceSize(); m_workspace = std::make_unique>(m_workspaceSize); m_householder = std::make_unique>(m_matrix->numCols()); - m_info = std::make_unique>(1); } /** * Factorise matrix. - * @return status code of computation */ - int factorise(); + void factorise(); /** * Solves A \ b using the QR of A. @@ -1549,9 +1562,8 @@ public: * A and b must have compatible dimensions (same number of rows and matrices=1). * A must be tall or square (m>=n). * @param b provided matrix - * @return status code of computation */ - int leastSquares(DTensor &b); + void leastSquares(DTensor &b); /** * Populate the given tensors with Q and R. @@ -1559,11 +1571,11 @@ public: * * @param Q matrix Q (preallocated) * @param R matrix R (preallocated) - * @return status code * * @throws std::invalid_argument if Q or R have invalid dimensions */ - int getQR(DTensor &Q, DTensor &R); + void getQR(DTensor &Q, DTensor &R); + }; @@ -1588,7 +1600,7 @@ inline void QRFactoriser::computeWorkspaceSize() { } template<> -inline int QRFactoriser::factorise() { +inline void QRFactoriser::factorise() { size_t m = m_matrix->numRows(); size_t n = m_matrix->numCols(); gpuErrChk(cusolverDnDgeqrf(Session::getInstance().cuSolverHandle(), @@ -1597,12 +1609,11 @@ inline int QRFactoriser::factorise() { m_householder->raw(), m_workspace->raw(), m_workspaceSize, m_info->raw())); - return (*m_info)(0); } template<> -inline int QRFactoriser::factorise() { +inline void QRFactoriser::factorise() { size_t m = m_matrix->numRows(); size_t n = m_matrix->numCols(); gpuErrChk(cusolverDnSgeqrf(Session::getInstance().cuSolverHandle(), @@ -1611,11 +1622,10 @@ inline int QRFactoriser::factorise() { m_householder->raw(), m_workspace->raw(), m_workspaceSize, m_info->raw())); - return (*m_info)(0); } template<> -inline int QRFactoriser::leastSquares(DTensor &rhs) { +inline void QRFactoriser::leastSquares(DTensor &rhs) { size_t m = m_matrix->numRows(); size_t n = m_matrix->numCols(); double alpha = 1.; @@ -1631,11 +1641,10 @@ inline int QRFactoriser::leastSquares(DTensor &rhs) { &alpha, m_matrix->raw(), m, rhs.raw(), m)); - return (*m_info)(0); } template<> -inline int QRFactoriser::leastSquares(DTensor &rhs) { +inline void QRFactoriser::leastSquares(DTensor &rhs) { size_t m = m_matrix->numRows(); size_t n = m_matrix->numCols(); float alpha = 1.; @@ -1651,11 +1660,10 @@ inline int QRFactoriser::leastSquares(DTensor &rhs) { &alpha, m_matrix->raw(), m, rhs.raw(), m)); - return (*m_info)(0); } template<> -inline int QRFactoriser::getQR(DTensor &Q, DTensor &R) { +inline void QRFactoriser::getQR(DTensor &Q, DTensor &R) { size_t m = m_matrix->numRows(); size_t n = m_matrix->numCols(); if (Q.numRows() != m || Q.numCols() != n) @@ -1686,11 +1694,10 @@ inline int QRFactoriser::getQR(DTensor &Q, DTensor &R) { } } R.upload(vecR, rowMajor); - return (*m_info)(0); } template<> -inline int QRFactoriser::getQR(DTensor &Q, DTensor &R) { +inline void QRFactoriser::getQR(DTensor &Q, DTensor &R) { size_t m = m_matrix->numRows(); size_t n = m_matrix->numCols(); if (Q.numRows() != m || Q.numCols() != n) @@ -1721,7 +1728,6 @@ inline int QRFactoriser::getQR(DTensor &Q, DTensor &R) { } } R.upload(vecR, rowMajor); - return (*m_info)(0); } @@ -1827,13 +1833,12 @@ inline void Nullspace::project(DTensor &b) { * @tparam T data type of tensor (must be float or double) */ TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX -class CholeskyBatchFactoriser { +class CholeskyBatchFactoriser : public IStatus { private: DTensor *m_matrix; ///< Matrix to factorise or lower-triangular decomposed matrix size_t m_numRows = 0; ///< Number of rows in `m_matrix` size_t m_numMats = 0; ///< Number of matrices in `m_matrix` - std::unique_ptr> m_deviceInfo; ///< Info from cusolver functions bool m_factorisationDone = false; ///< Whether `m_matrix` holds original or lower-triangular matrix public: @@ -1844,12 +1849,11 @@ public: * @param A either matrices to be factorised or lower-triangular Cholesky decomposition matrices * @param factorised whether A is the original matrices or the factorised ones (default=original matrices) */ - CholeskyBatchFactoriser(DTensor &A, bool factorised = false) : m_factorisationDone(factorised) { + CholeskyBatchFactoriser(DTensor &A, bool factorised = false) : IStatus(A.numMats()), m_factorisationDone(factorised) { if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square"); m_matrix = &A; m_numRows = A.numRows(); m_numMats = A.numMats(); - m_deviceInfo = std::make_unique>(m_numMats, 1, 1, true); } /** @@ -1863,6 +1867,7 @@ public: */ void solve(DTensor &b); + }; template<> @@ -1873,7 +1878,7 @@ inline void CholeskyBatchFactoriser::factorise() { m_numRows, m_matrix->ptrMatrices(), m_numRows, - m_deviceInfo->raw(), + m_info->raw(), m_numMats)); m_factorisationDone = true; } @@ -1886,7 +1891,7 @@ inline void CholeskyBatchFactoriser::factorise() { m_numRows, m_matrix->ptrMatrices(), m_numRows, - m_deviceInfo->raw(), + m_info->raw(), m_numMats)); m_factorisationDone = true; } @@ -1906,7 +1911,7 @@ inline void CholeskyBatchFactoriser::solve(DTensor &b) { m_numRows, b.ptrMatrices(), m_numRows, - m_deviceInfo->raw(), + m_info->raw(), m_numMats)); } @@ -1925,7 +1930,7 @@ inline void CholeskyBatchFactoriser::solve(DTensor &b) { m_numRows, b.ptrMatrices(), m_numRows, - m_deviceInfo->raw(), + m_info->raw(), m_numMats)); } @@ -2015,7 +2020,8 @@ __global__ void k_givensAnnihilateRHypot(const T *data, res[2] = xkj * (*res); // -sin } -template + +template TEMPLATE_CONSTRAINT_REQUIRES_FPX inline void GivensAnnihilator::annihilate(size_t i, size_t k, size_t j) { /* A few checks */ size_t nR = m_matrix->numRows(), nC = m_matrix->numCols(); diff --git a/main.cu b/main.cu index 2a49684..d9f3421 100644 --- a/main.cu +++ b/main.cu @@ -9,22 +9,19 @@ #define real_t double - - int main() { - size_t m = 10; - size_t n = 6; - std::vector v(m*n); - v.reserve(m*n); - std::iota(v.begin(), v.end(), 1); - DTensor a = DTensor(v, m, n); - - auto ga = GivensAnnihilator(a); - ga.annihilate(0, 1, 2); - - std::cout << a; - + std::vector aData = {10.0, 2.0, 3.0, + 2.0, 20.0, -1.0, + 3.0, -1.0, 30.0}; + DTensor A(3, 3, 2); + DTensor A0(A, 2, 0, 0); + DTensor A1(A, 2, 1, 1); + A0.upload(aData); + A1.upload(aData); + CholeskyBatchFactoriser chol(A); + chol.factorise(); + std::cout << chol.info()(0); return 0; } diff --git a/test/testTensor.cu b/test/testTensor.cu index 4103243..0b0d312 100644 --- a/test/testTensor.cu +++ b/test/testTensor.cu @@ -901,7 +901,10 @@ void singularValuesComputation(float epsilon) { 3, 8, 8, 8, 8, 8, 8, 8,}; DTensor B(bData, 8, 3); Svd svd(B, true, false); - EXPECT_EQ(true, svd.factorise()); + svd.factorise(); + for (size_t i = 0; i < B.numMats(); i++) { + EXPECT_EQ(0, svd.info()(i)); + } auto S = svd.singularValues(); EXPECT_NEAR(32.496241123753592, S(0), epsilon); // value from MATLAB EXPECT_NEAR(0.997152358903242, S(1), epsilon); // value from MATLAB @@ -927,7 +930,8 @@ void singularValuesMemory(float epsilon) { 3, 8, 8, 8, 8, 8, 8, 8,}; DTensor B(bData, 8, 3); Svd svd(B, true, false); - EXPECT_EQ(true, svd.factorise()); + svd.factorise(); + EXPECT_EQ(0, svd.info()(0)); DTensor const &v1 = svd.rightSingularVectors(); DTensor const &v2 = svd.rightSingularVectors(); EXPECT_EQ(&v1, &v2); @@ -1044,6 +1048,7 @@ void choleskyFactorisation(T epsilon) { EXPECT_NEAR(3.162277660168380, A(0, 0), epsilon); EXPECT_NEAR(-0.361403161162101, A(2, 1), epsilon); EXPECT_NEAR(5.382321781081287, A(2, 2), epsilon); + EXPECT_EQ(0, chol.info()(0)); } TEST_F(CholeskyTest, choleskyFactorisation) { @@ -1110,6 +1115,8 @@ void choleskyBatchFactorisation(T epsilon) { EXPECT_NEAR(3.162277660168380, A(0, 0, 1), epsilon); EXPECT_NEAR(-0.361403161162101, A(2, 1, 1), epsilon); EXPECT_NEAR(5.382321781081287, A(2, 2, 1), epsilon); + + EXPECT_EQ(0, chol.info()(0)); } TEST_F(CholeskyTest, choleskyBatchFactorisation) { @@ -1150,6 +1157,7 @@ void choleskyBatchFactorSolve(T epsilon) { DTensor error = A * sol; error -= rhs; EXPECT_TRUE(error.normF() < epsilon); + EXPECT_EQ(0, chol.info()(0)); } TEST_F(CholeskyTest, choleskyBatchFactorSolve) { @@ -1197,6 +1205,7 @@ void choleskyBatchSolve(T epsilon) { DTensor error = A * sol; error -= rhs; EXPECT_TRUE(error.normF() < epsilon); + EXPECT_EQ(0, chol.info()(0)); } TEST_F(CholeskyTest, choleskyBatchSolve) { @@ -1228,13 +1237,13 @@ void qrFactorisation(T epsilon) { DTensor A = DTensor::createRandomTensor(nR, nC, 1, -100, 100); QRFactoriser qr(temp); A.deviceCopyTo(temp); - int status = qr.factorise(); - EXPECT_EQ(status, 0); + qr.factorise(); + EXPECT_EQ(0, qr.info()(0)); DTensor Q(nR, nC); DTensor R(nC, nC, 1, true); DTensor QR(nR, nC); - status = qr.getQR(Q, R); - EXPECT_EQ(status, 0); + qr.getQR(Q, R); + EXPECT_EQ(0, qr.info()(0)); QR.addAB(Q, R); QR -= A; T nrm = QR.normF(); @@ -1259,13 +1268,13 @@ void qrFactorisationTall(T epsilon) { DTensor A = DTensor::createRandomTensor(nR, nC, 1, -100, 100); QRFactoriser qr(temp); A.deviceCopyTo(temp); - int status = qr.factorise(); - EXPECT_EQ(status, 0); + qr.factorise(); + EXPECT_EQ(0, qr.info()(0)); DTensor Q(nR, nC); DTensor R(nC, nC, 1, true); DTensor QR(nR, nC); - status = qr.getQR(Q, R); - EXPECT_EQ(status, 0); + qr.getQR(Q, R); + EXPECT_EQ(0, qr.info()(0)); QR.addAB(Q, R); QR -= A; T nrm = QR.normF(); @@ -1301,11 +1310,11 @@ void qrLeastSquares(T epsilon) { DTensor Ax(nR); QRFactoriser qr(temp); A.deviceCopyTo(temp); - int status = qr.factorise(); - EXPECT_EQ(status, 0); + qr.factorise(); + EXPECT_EQ(0, qr.info()(0)); b.deviceCopyTo(xFull); - status = qr.leastSquares(xFull); - EXPECT_EQ(status, 0); + qr.leastSquares(xFull); + EXPECT_EQ(0, qr.info()(0)); Ax.addAB(A, x); Ax -= b; T nrm = Ax.normF();