uxlfoundation · david-cortes-intel · Feb 21, 2025 · Feb 3, 2025 · Feb 3, 2025 · Feb 3, 2025
@@ -28,6 +28,7 @@
 #include "src/algorithms/service_error_handling.h"
 #include "src/threading/threading.h"
 #include "src/externals/service_profiler.h"
+#include <memory>
 
 namespace daal
 {
@@ -173,6 +174,55 @@ Status UpdateKernel<algorithmFPType, cpu>::compute(const NumericTable & xTable,
     return UpdateKernel<algorithmFPType, cpu>::compute(xTable, yTable, xtxTable, xtyTable, initializeResult, interceptFlag, nullptr);
 }
 
+/// Computes the cross-products t(X)*X and t(X)*y with direct BLAS calls (syrk / gemv / gemm / dot)
+/// on the whole input at once.
+///
+/// \param nRows             Number of rows in X and y.
+/// \param nCols             Number of columns in X, not counting the intercept.
+/// \param nResponses        Number of columns in y.
+/// \param initializeResult  If true, xtx and xty are overwritten (BLAS beta = 0); if false, the new
+///                          products are added to the current contents (BLAS beta = 1).
+/// \param interceptFlag     If true, one extra entry per dimension is produced for the intercept.
+/// \param xPtr              Contiguous X data, passed to BLAS with leading dimension nCols.
+/// \param yPtr              Contiguous y data, passed to BLAS with leading dimension nResponses.
+/// \param xtx               Output with leading dimension (nCols + intercept); only the "U" triangle
+///                          in BLAS terms is written.
+/// \param xty               Output with leading dimension (nCols + intercept).
+/// \return A default-constructed Status.
+template <typename algorithmFPType, CpuType cpu>
+Status computeNonBatchedAggregates(const DAAL_INT nRows, const DAAL_INT nCols, const DAAL_INT nResponses, bool initializeResult, bool interceptFlag,
+                                   const algorithmFPType * xPtr, const algorithmFPType * yPtr, algorithmFPType * xtx, algorithmFPType * xty)
+{
+    DAAL_INT nBetasIntercept = nCols + static_cast<int>(interceptFlag);
+    DAAL_INT one_int         = 1;
+    algorithmFPType one      = 1;
+    algorithmFPType zero     = 0;
+    /// Single 'beta' used by every BLAS call below, so that xtx and xty are always
+    /// either both overwritten or both accumulated into.
+    algorithmFPType * beta = initializeResult ? &zero : &one;
+
+    /// Vector of ones, only needed to obtain column sums for the intercept terms.
+    /// NOTE(review): plain 'new' reports allocation failure by throwing rather than through Status.
+    std::unique_ptr<algorithmFPType[]> ones;
+    if (interceptFlag)
+    {
+        ones = std::unique_ptr<algorithmFPType[]>(new algorithmFPType[nRows]);
+        std::fill(ones.get(), ones.get() + nRows, algorithmFPType(1));
+    }
+
+    /// t(X)*X into the leading nCols x nCols part of xtx.
+    BlasInst<algorithmFPType, cpu>::xsyrk("U", "N", &nCols, &nRows, &one, xPtr, &nCols, beta, xtx, &nBetasIntercept);
+    if (interceptFlag)
+    {
+        /// Column sums of X go into the first nCols entries of the last line of xtx.
+        BlasInst<algorithmFPType, cpu>::xgemv("N", &nCols, &nRows, &one, xPtr, &nCols, ones.get(), &one_int, beta,
+                                              xtx + static_cast<size_t>(nBetasIntercept) * static_cast<size_t>(nCols), &one_int);
+
+        /// Corner element is the number of observations; it follows the same overwrite/accumulate rule.
+        algorithmFPType & nObservations = xtx[static_cast<size_t>(nBetasIntercept) * static_cast<size_t>(nBetasIntercept) - 1];
+        if (initializeResult)
+            nObservations = static_cast<algorithmFPType>(nRows);
+        else
+            nObservations += static_cast<algorithmFPType>(nRows);
+    }
+
+    if (nResponses == 1)
+    {
+        /// Single response: t(X)*y is a matrix-vector product.
+        BlasInst<algorithmFPType, cpu>::xgemv("N", &nCols, &nRows, &one, xPtr, &nCols, yPtr, &one_int, beta, xty, &one_int);
+        if (interceptFlag)
+        {
+            /// Sum of y, obtained as a dot product with the vector of ones.
+            const algorithmFPType last_val = BlasInst<algorithmFPType, cpu>::xxdot(&nRows, yPtr, &one_int, ones.get(), &one_int);
+            if (initializeResult)
+                xty[nCols] = last_val;
+            else
+                xty[nCols] += last_val;
+        }
+    }
+    else
+    {
+        /// Several responses: t(X)*Y is a matrix-matrix product.
+        BlasInst<algorithmFPType, cpu>::xgemm("N", "T", &nCols, &nResponses, &nRows, &one, xPtr, &nCols, yPtr, &nResponses, beta, xty,
+                                              &nBetasIntercept);
+        if (interceptFlag)
+        {
+            /// Per-response sums of Y, written with stride nBetasIntercept starting at xty[nCols].
+            BlasInst<algorithmFPType, cpu>::xgemv("N", &nResponses, &nRows, &one, yPtr, &nResponses, ones.get(), &one_int, beta, xty + nCols,
+                                                  &nBetasIntercept);
+        }
+    }
+    return Status();
+}
+
 template <typename algorithmFPType, CpuType cpu>
 Status UpdateKernel<algorithmFPType, cpu>::compute(const NumericTable & xTable, const NumericTable & yTable, NumericTable & xtxTable,
                                                    NumericTable & xtyTable, bool initializeResult, bool interceptFlag,
@@ -193,6 +243,40 @@ Status UpdateKernel<algorithmFPType, cpu>::compute(const NumericTable & xTable,
     DAAL_CHECK_BLOCK_STATUS(xtyBlock);
     algorithmFPType * xty = xtyBlock.get();
 
+    /// Logic here is as follows: it needs to compute t(X)*X and t(X)*y.
+    /// If both are done together, it's possible to reuse caches of data to speed up computations,
+    /// which the code here does by dividing the data into batches of rows on which both aggregates
+    /// are computed, with the batches processed in parallel. But as the number of columns in the
+    /// data grows, the potential speed gains from calculating both aggregates simultaneously
+    /// decrease, and the memory requirements increase, which can become a problem when there are
+    /// many threads in the system. Hence, if the number of columns is too large, it will compute
+    /// both aggregates independently, in separate calls to BLAS functions, while if the number of
+    /// columns is reasonably small, will prefer the batched procedure which typically ends up
+    /// being faster.
+
+    /// These are the thresholds where the non-batched route should be used.
+    // bool use_non_batched_route = (nBetas >= 4096 || (nRows <= 10000 && nBetas >= 1024)) && getDataLayout() == NumericTable::StorageLayout::aos
+    //                              && (nResponses == 1 || yTable.getDataLayout() == NumericTable::StorageLayout::aos);
+    /// For testing purposes, will enable it regardless of input sizes, but this should be changed later.
+    bool use_non_batched_route =
+        getDataLayout() == NumericTable::StorageLayout::aos && (nResponses == 1 || yTable.getDataLayout() == NumericTable::StorageLayout::aos);
+    if (use_non_batched_route)
+    {
+        /// Note: this is only implemented for row-major arrays, because there's
+        /// currently no mechanism to know if a NumericTable is backed by a single
+        /// continuous column-major array. But if such a mechanism is added, there
+        /// shouldn't be any issue in creating a column-major version of this procedure
+        /// or extending it to more than one response.
+        const DAAL_INT nCols = xTable.getNumberOfColumns();
+        ReadRowsType xBlock(const_cast<NumericTable &>(xTable), 0, nRows);
+        DAAL_CHECK_BLOCK_STATUS(xBlock);
+        ReadRowsType yBlock(const_cast<NumericTable &>(yTable), 0, nRows);
+        DAAL_CHECK_BLOCK_STATUS(yBlock);
+        const algorithmFPType * xPtr = xBlock.get();
+        const algorithmFPType * yPtr = yBlock.get();
+        return computeNonBatchedAggregates<algorithmFPType, cpu>(nRows, nCols, nResponses, true, interceptFlag, xPtr, yPtr, xtx, xty);
+    }
+
     /* Initialize output arrays by zero in case of batch mode */
     if (initializeResult)
     {

@@ -103,14 +103,14 @@ struct Blas
 {
     typedef typename _impl<fpType, cpu>::SizeType SizeType;
 
-    static void xsyrk(char * uplo, char * trans, SizeType * p, SizeType * n, fpType * alpha, fpType * a, SizeType * lda, fpType * beta, fpType * ata,
-                      SizeType * ldata)
+    /// Thin dispatch wrapper: forwards all ten arguments, unchanged and in the same order,
+    /// to the backend implementation _impl<fpType, cpu>::xsyrk.
+    /// Every pointer except the output matrix 'ata' is const-qualified, so callers may pass
+    /// pointers to const data.
+    /// NOTE(review): argument meaning presumably follows the BLAS ?syrk routine - confirm against the backend.
+    static void xsyrk(const char * uplo, const char * trans, const SizeType * p, const SizeType * n, const fpType * alpha, const fpType * a,
+                      const SizeType * lda, const fpType * beta, fpType * ata, const SizeType * ldata)
     {
         _impl<fpType, cpu>::xsyrk(uplo, trans, p, n, alpha, a, lda, beta, ata, ldata);
     }
 
-    static void xxsyrk(char * uplo, char * trans, SizeType * p, SizeType * n, fpType * alpha, fpType * a, SizeType * lda, fpType * beta, fpType * ata,
-                       SizeType * ldata)
+    /// Thin dispatch wrapper: forwards all ten arguments, unchanged and in the same order,
+    /// to the backend implementation _impl<fpType, cpu>::xxsyrk.
+    /// All read-only pointers are const-qualified, matching xsyrk; 'ata' is the only
+    /// pointer through which the backend is expected to write.
+    static void xxsyrk(const char * uplo, const char * trans, const SizeType * p, const SizeType * n, const fpType * alpha, const fpType * a,
+                       const SizeType * lda, const fpType * beta, fpType * ata, const SizeType * ldata)
     {
         _impl<fpType, cpu>::xxsyrk(uplo, trans, p, n, alpha, a, lda, beta, ata, ldata);
     }

@@ -50,14 +50,14 @@ struct MklBlas<double, cpu>
 {
     typedef DAAL_INT SizeType;
 
-    static void xsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, double * alpha, double * a, DAAL_INT * lda, double * beta, double * ata,
-                      DAAL_INT * ldata)
+    /// Double-precision variant: invokes dsyrk through the __DAAL_MKLFN_CALL_BLAS macro,
+    /// passing the arguments in the order received.
+    /// The DAAL_INT pointers (p, n, lda, ldata) are converted to MKL_INT pointers with C-style casts.
+    /// NOTE(review): those casts also drop the const qualifier; presumably the routine never
+    /// writes through them - confirm against the prototype the macro resolves to.
+    static void xsyrk(const char * uplo, const char * trans, const DAAL_INT * p, const DAAL_INT * n, const double * alpha, const double * a,
+                      const DAAL_INT * lda, const double * beta, double * ata, const DAAL_INT * ldata)
     {
         __DAAL_MKLFN_CALL_BLAS(dsyrk, (uplo, trans, (MKL_INT *)p, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, beta, ata, (MKL_INT *)ldata));
     }
 
-    static void xxsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, double * alpha, double * a, DAAL_INT * lda, double * beta, double * ata,
-                       DAAL_INT * ldata)
+    static void xxsyrk(const char * uplo, const char * trans, const DAAL_INT * p, const DAAL_INT * n, const double * alpha, const double * a,
+                       const DAAL_INT * lda, const double * beta, double * ata, const DAAL_INT * ldata)
     {
         int old_nthr = mkl_set_num_threads_local(1);
         __DAAL_MKLFN_CALL_BLAS(dsyrk, (uplo, trans, (MKL_INT *)p, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, beta, ata, (MKL_INT *)ldata));
@@ -155,14 +155,14 @@ struct MklBlas<float, cpu>
 {
     typedef DAAL_INT SizeType;
 
-    static void xsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, float * alpha, float * a, DAAL_INT * lda, float * beta, float * ata,
-                      DAAL_INT * ldata)
+    /// Single-precision variant: invokes ssyrk through the __DAAL_MKLFN_CALL_BLAS macro,
+    /// passing the arguments in the order received.
+    /// The DAAL_INT pointers (p, n, lda, ldata) are converted to MKL_INT pointers with C-style casts.
+    /// NOTE(review): those casts also drop the const qualifier; presumably the routine never
+    /// writes through them - confirm against the prototype the macro resolves to.
+    static void xsyrk(const char * uplo, const char * trans, const DAAL_INT * p, const DAAL_INT * n, const float * alpha, const float * a,
+                      const DAAL_INT * lda, const float * beta, float * ata, const DAAL_INT * ldata)
     {
         __DAAL_MKLFN_CALL_BLAS(ssyrk, (uplo, trans, (MKL_INT *)p, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, beta, ata, (MKL_INT *)ldata));
     }
 
-    static void xxsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, float * alpha, float * a, DAAL_INT * lda, float * beta, float * ata,
-                       DAAL_INT * ldata)
+    static void xxsyrk(const char * uplo, const char * trans, const DAAL_INT * p, const DAAL_INT * n, const float * alpha, const float * a,
+                       const DAAL_INT * lda, const float * beta, float * ata, const DAAL_INT * ldata)
     {
         int old_nthr = mkl_set_num_threads_local(1);
         __DAAL_MKLFN_CALL_BLAS(ssyrk, (uplo, trans, (MKL_INT *)p, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, beta, ata, (MKL_INT *)ldata));

@@ -46,14 +46,14 @@ struct OpenBlas<double, cpu>
 {
     typedef DAAL_INT SizeType;
 
-    static void xsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, double * alpha, double * a, DAAL_INT * lda, double * beta, double * ata,
-                      DAAL_INT * ldata)
+    /// Double-precision variant: passes all ten arguments straight through to dsyrk_,
+    /// in the order received and without any casts.
+    /// 'ata' is the only non-const pointer in the signature.
+    static void xsyrk(const char * uplo, const char * trans, const DAAL_INT * p, const DAAL_INT * n, const double * alpha, const double * a,
+                      const DAAL_INT * lda, const double * beta, double * ata, const DAAL_INT * ldata)
     {
         dsyrk_(uplo, trans, p, n, alpha, a, lda, beta, ata, ldata);
     }
 
-    static void xxsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, double * alpha, double * a, DAAL_INT * lda, double * beta, double * ata,
-                       DAAL_INT * ldata)
+    static void xxsyrk(const char * uplo, const char * trans, const DAAL_INT * p, const DAAL_INT * n, const double * alpha, const double * a,
+                       const DAAL_INT * lda, const double * beta, double * ata, const DAAL_INT * ldata)
     {
         openblas_thread_setter ots(1);
         dsyrk_(uplo, trans, p, n, alpha, a, lda, beta, ata, ldata);
@@ -138,14 +138,14 @@ struct OpenBlas<float, cpu>
 {
     typedef DAAL_INT SizeType;
 
-    static void xsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, float * alpha, float * a, DAAL_INT * lda, float * beta, float * ata,
-                      DAAL_INT * ldata)
+    /// Single-precision variant: passes all ten arguments straight through to ssyrk_,
+    /// in the order received and without any casts.
+    /// 'ata' is the only non-const pointer in the signature.
+    static void xsyrk(const char * uplo, const char * trans, const DAAL_INT * p, const DAAL_INT * n, const float * alpha, const float * a,
+                      const DAAL_INT * lda, const float * beta, float * ata, const DAAL_INT * ldata)
     {
         ssyrk_(uplo, trans, p, n, alpha, a, lda, beta, ata, ldata);
     }
 
-    static void xxsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, float * alpha, float * a, DAAL_INT * lda, float * beta, float * ata,
-                       DAAL_INT * ldata)
+    static void xxsyrk(const char * uplo, const char * trans, const DAAL_INT * p, const DAAL_INT * n, const float * alpha, const float * a,
+                       const DAAL_INT * lda, const float * beta, float * ata, const DAAL_INT * ldata)
     {
         openblas_thread_setter ots(1);
         ssyrk_(uplo, trans, p, n, alpha, a, lda, beta, ata, ldata);