[arch-commits] Commit in root/trunk (5 files)

Konstantin Gizdov kgizdov at archlinux.org
Thu Jul 23 20:24:10 UTC 2020


    Date: Thursday, July 23, 2020 @ 20:24:09
  Author: kgizdov
Revision: 665090

upgpkg: root 6.22.00-1

Added:
  root/trunk/adapt_tmva_to_support_cudnn8.patch
Modified:
  root/trunk/PKGBUILD
  root/trunk/nbman-for-arch.patch
Deleted:
  root/trunk/add_missing_include_ROOT-10315.patch
  root/trunk/fix_relative_paths.patch

--------------------------------------+
 PKGBUILD                             |   20 
 adapt_tmva_to_support_cudnn8.patch   | 1130 +++++++++++++++++++++++++++++++++
 add_missing_include_ROOT-10315.patch |   22 
 fix_relative_paths.patch             |   39 -
 nbman-for-arch.patch                 |   17 
 5 files changed, 1148 insertions(+), 80 deletions(-)

Modified: PKGBUILD
===================================================================
--- PKGBUILD	2020-07-23 20:13:52 UTC (rev 665089)
+++ PKGBUILD	2020-07-23 20:24:09 UTC (rev 665090)
@@ -6,7 +6,7 @@
 
 pkgbase=root
 pkgname=('root' 'root-cuda')
-pkgver=6.20.06
+pkgver=6.22.00
 pkgrel=1
 pkgdesc='C++ data analysis framework and interpreter from CERN'
 arch=('x86_64')
@@ -21,7 +21,7 @@
     'cuda'
     'cudnn'
     'gcc-fortran'
-    'gcc8-fortran'
+    'gcc9-fortran'
     'git'
     'go'
     'libxml2'
@@ -90,12 +90,11 @@
     'settings.cmake'
     'settings-cuda.cmake'
     'jupyter_notebook_config.py'
-    'add_missing_include_ROOT-10315.patch'
-    'fix_relative_paths.patch'
     'nbman-for-arch.patch'
     'thisroot.fail'
+    'adapt_tmva_to_support_cudnn8.patch'
 )
-sha512sums=('232fd5253e83eb02ad33d03941c7c83d39d5f6b0162bd42594e44a0c1c08deade42ae5793e571db767ce4fa0a582aa332b9d1869e621b825d1eb1162819c98c6'
+sha512sums=('9e3c54bbc146b0abb0a2d960af380255ec59d0b3a11a4a97a2a25cb7ac567b07280c4eb48dddf99c1fa2e692881f6396a842ce125d3a253037e52f719739f01e'
             'af8f178fc9df66997d5495b271e38adcd1636aab4c8fc994c6600c2496127829d831250d73d3fc229b02dfe49b9867d0be979beacb959f2f3a05351b8118a4a6'
             '1fe6f4aa09d583d33f27cc766f4935510bb7ab6bbb8d4700baa1aaab92ea6c876500b67da1e4f6e0b510aa5616e4e193b860264b86925de85f2d9f558d75d5dc'
             '3c81d255a17b902ffac0187af1752847036137e16641a88b17eef0d9c944e6f0d3c954bc93307d6270603f43f6c23f2e04f98dc7a68f9d076dbaa8006a2527d6'
@@ -102,10 +101,9 @@
             '9ee5b6606dbd352608a2a4998344ca4026d677c86823e62fff615f6e84efcecdffc07a1e9182a356aa35035e7f35df5a107127722a6bad4b97d1f49cffebf5b9'
             '7665bc8cbe79162e0b969b08802e1b7b2ed22ed8b1402d50cf194172a644f647dcaf0f5abb76f8b6007dfab8dbc811604479be826b345d8fd77edfb51032110b'
             '1c905ee7a3f8f5f3f567d957f9be6b503a8631565d4d9b9bfea5e496ef86865c5a8be1a1f8c7842754029879cf0afd2465249f532a116cc43660aa2e460ae682'
-            '19b46d3b6a8083d9461f5d4f57845a1145b80de699e8b773cee7edecad0b064fe570faffa226720078273f40a88fe8ba933b7d944925391ad869f0b9cdbf9579'
-            'fa2f3b9b8717bfd11b6743a4457bbff104e547112bcec864f212510b1954a92809d9bde5ada8702d11659a150f2957daed4e4d2621858b0bc4e234d9f6924244'
-            'f4152ecddbef8079e7507625c3cfb2dcd6d3c844f38827f293d7c1915ff9be5ea3e4414080b51a83f86128b954520d8ec113349a5a07ba39fc87047b72bf0136'
-            'ff555ac4db568affe139701907f86d919a2206f3e304f69dd317b756ea0904b5934d9364a524060778aa507809ce78448621619bb34039ba34c5a71af71a4a8c')
+            '12814f50b7016bd86d3f91e0e31c052783a0c0fa72b7d6a072d3ae6f86c2437323d585e531235377ebbfdd9cb76abd7da84d9631de821151547f1d4b13417e69'
+            'ff555ac4db568affe139701907f86d919a2206f3e304f69dd317b756ea0904b5934d9364a524060778aa507809ce78448621619bb34039ba34c5a71af71a4a8c'
+            '2ae126795df4127c27a6287a1499bdb8b2bacb74cfbec17dabe378a5fb9fc7c755644e4090a4da1d0045bf5d4f542f06da827a0f48a5927ee8509874045f18b6')
 
 get_pyver () {
     python -c 'import sys; print(str(sys.version_info[0]) + "." + str(sys.version_info[1]))'
@@ -167,8 +165,8 @@
     mkdir -p "${srcdir}/build-cuda"
     cd "${srcdir}/build-cuda"
 
-    CC=/usr/bin/gcc-8 \
-    CXX=/usr/bin/g++-8 \
+    CC=/usr/bin/gcc-9 \
+    CXX=/usr/bin/g++-9 \
     cmake -C "${srcdir}/settings-cuda.cmake" \
         ${CUSTOM_CMAKE_FLAGS} \
         "${srcdir}/${pkgbase}-${pkgver}-cuda"

Added: adapt_tmva_to_support_cudnn8.patch
===================================================================
--- adapt_tmva_to_support_cudnn8.patch	                        (rev 0)
+++ adapt_tmva_to_support_cudnn8.patch	2020-07-23 20:24:09 UTC (rev 665090)
@@ -0,0 +1,1130 @@
+From 05739e6b01fb34b5ef40e1a584107876e68e4b77 Mon Sep 17 00:00:00 2001
+From: Konstantin Gizdov <kgizdov at gmail.com>
+Date: Tue, 21 Jul 2020 15:13:57 +0300
+Subject: [PATCH 01/10] update deprecated function call name to a backward
+ compatible one
+
+---
+ tmva/tmva/src/DNN/Architectures/Cudnn/RecurrentPropagation.cu | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/tmva/tmva/src/DNN/Architectures/Cudnn/RecurrentPropagation.cu b/tmva/tmva/src/DNN/Architectures/Cudnn/RecurrentPropagation.cu
+index 058cee28424..60289ec2fdd 100644
+--- a/tmva/tmva/src/DNN/Architectures/Cudnn/RecurrentPropagation.cu
++++ b/tmva/tmva/src/DNN/Architectures/Cudnn/RecurrentPropagation.cu
+@@ -132,7 +132,11 @@ void TCudnn<AFloat>::InitializeRecurrentDescriptors(TDescriptors *&descriptors,
+    cudnnDataType_t mathPrec = CUDNN_DATA_FLOAT;
+    if      (std::is_same<AFloat, double>::value) { mathPrec = CUDNN_DATA_DOUBLE;}
+ 
++#if (CUDNN_VERSION >= 8000)
++   CUDNNCHECK(cudnnSetRNNDescriptor_v6(handle, rnnDescriptors->LayerDescriptor, hiddenSize, numLayers, rnnDescriptors->HelperDescriptor,
++#else
+    CUDNNCHECK(cudnnSetRNNDescriptor(handle, rnnDescriptors->LayerDescriptor, hiddenSize, numLayers, rnnDescriptors->HelperDescriptor,
++#endif
+       inputMode, direction, mode, algo, mathPrec) );
+ 
+ 
+
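[The hunk above is the whole RNN-path fix: cuDNN 8 removed the unsuffixed cudnnSetRNNDescriptor entry point, while the _v6 variant exists in both the 7.x and 8.x releases, so a single CUDNN_VERSION gate keeps exactly one valid call per build (CUDNN_VERSION is encoded as major*1000 + minor*100 + patchlevel, hence 8000 for 8.0.0). A minimal sketch of the pattern, with the argument lists elided:

    #if (CUDNN_VERSION >= 8000)
       CUDNNCHECK(cudnnSetRNNDescriptor_v6(/* handle, rnnDesc, ... */));
    #else
       CUDNNCHECK(cudnnSetRNNDescriptor(/* handle, rnnDesc, ... */));
    #endif
]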
+From 90baa4f6ad10076fa148f5aa06ef432bd0f34208 Mon Sep 17 00:00:00 2001
+From: Konstantin Gizdov <kgizdov at gmail.com>
+Date: Tue, 21 Jul 2020 19:06:09 +0300
+Subject: [PATCH 02/10] adapt convolution forward to cuDNN 8
+
+---
+ .../src/DNN/Architectures/Cudnn/Propagate.cu  | 77 ++++++++++++++++++-
+ 1 file changed, 76 insertions(+), 1 deletion(-)
+
+diff --git a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+index 7a57b6bf104..cc953ee45f9 100644
+--- a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
++++ b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+@@ -27,6 +27,9 @@
+ // #include "Kernels.cuh"*/
+ // #include <math.h>
+ 
++// for std::numeric_limits<T>::max()
++#include <limits>
++
+ namespace TMVA {
+ namespace DNN  {
+ 
+@@ -378,7 +381,78 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    cudnnHandle_t cudnnHandle = outputTensor.GetCudnnHandle();
+ 
+    // cuDNN decides which algorithm to use
+-   // More detailed alternative: cudnnFindConvolutionForwardAlgorithm
++#if (CUDNN_VERSION >= 8000)
++   /**
++    * I'm sure there may be a faster way, but this works
++    */
++   int convRequestedAlgoCount{8}; // requestedAlgoCount is setting how many algorithms to try, can be tuned, fixed for now as all available
++   cudnnConvolutionDescriptor_t tempConvDescriptor;
++   CUDDNCHECK(cudnnCreateConvolutionDescriptor(&tempConvDescriptor));
++   cudnnTensorDescriptor_t  outputTensorDescriptor;
++   CUDNNCHECK(cudnnCreateTensorDescriptor(&outputTensorDescriptor));
++   CUDNNCHECK(cudnnSetTensor4dDescriptor(outputTensorDescriptor,
++                                             CUDNN_TENSOR_NCHW,  // Layout of the tensor in memory
++                                             Tensor_t::GetDataType(),
++                                             (int)L->GetBatchSize(),
++                                             (int)L->GetDepth(),
++                                             (int)L->GetHeight(),
++                                             (int)L->GetWidth()));
++   int algoCount;
++   cudnnConvolutionFwdAlgoPerf_t convPerfResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
++   CUDNNCHECK(cudnnFindConvolutionForwardAlgorithm(
++      cudnnHandle,
++      inputTensorDescriptor,
++      convDescriptors->WeightsDescriptor,
++      tempConvDescriptor,
++      outputTensorDescriptor,
++      convRequestedAlgoCount,
++      &algoCount,
++      &convPerfResults));
++   // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
++   // but we arrive at an chicken or egg problem:
++   // workspace size is calculated from chosen forward algorithm,
++   // but finding a forward algorithm depends on workspace size...
++   // i.e.
++   // Tensor_t & inputTensor = L->GetInput();
++   // inputTensor = Tensor_t(inputTensor.GetDeviceBuffer(),{ L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth() },GetTensorLayout(),0,0);
++   // CUDNNCHECK(cudnnFindConvolutionForwardAlgorithmEx(
++   //    cudnnHandle,
++   //    inputTensorDescriptor,
++   //    &inputTensor,
++   //    convDescriptors->WeightsDescriptor,
++   //    &filters,
++   //    tempConvDescriptor,
++   //    outputTensorDescriptor,
++   //    &outputTensor,
++   //    convRequestedAlgoCount,
++   //    &algoCount,
++   //    &convPerfResults,
++   //    &convWorkspace,
++   //    convWorkspace->ForwardWorkspaceSize));
++   // instead choose either fastest or lowest memory algo as per preference
++   int algoIdx{0};
++   if (CNNOptions::ConvMaxWorkspaceSize != 0) {  // prefer fastest
++      float temp_runtime{std::numeric_limits<float>::max()};
++      for (int i = 0; i < algoCount; ++i) {
++         if (convPerfResults[i].status != 0) continue;
++         if (convPerfResults[i].time < temp_runtime) {
++            temp_runtime = convPerfResults[i].time;
++            algoIdx = i;
++         }
++      }
++   } else {  // prefer smallest workspace size
++      size_t temp_memsize{std::numeric_limits<size_t>::max()};
++      for (int i = 0; i < algoCount; ++i) {
++         if (convPerfResults[i].status != 0) continue;
++         if (convPerfResults[i].memory < temp_memsize) {
++            temp_memsize = convPerfResults[i].memory;
++            algoIdx = i;
++         }
++      }
++   }
++   convWorkspace->AlgorithmForward = convPerfResults[algoIdx].algo;
++#else
++   // More detailed alternative: cudnnFindConvolutionForwardAlgorithm (only option in newer cuDNN versions)
+    cudnnConvolutionFwdPreference_t preferenceFwd = (CNNOptions::ConvMaxWorkspaceSize !=0) ? CUDNN_CONVOLUTION_FWD_PREFER_FASTEST :
+                                                    CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+ 
+@@ -389,6 +463,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+       outputTensor.GetTensorDescriptor(), preferenceFwd,
+       memLimit, // Memory limit in bytes for mode CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
+       &convWorkspace->AlgorithmForward));
++#endif
+ 
+    // Allocate memory for the convolution
+    //size_t workSpaceSizeInBytes = 0;
+
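[The selection logic added above reduces to one loop over the perf results filled in by cudnnFindConvolutionForwardAlgorithm. A standalone sketch, assuming only <cudnn.h>; pickForwardAlgo is an illustrative name, not part of the patch:

    #include <cudnn.h>
    #include <limits>

    // Index of the fastest successful algorithm, or of the one with the
    // smallest workspace when scratch memory should be avoided.
    static int pickForwardAlgo(const cudnnConvolutionFwdAlgoPerf_t *perf,
                               int count, bool preferFastest) {
       int best = 0;
       if (preferFastest) {
          float bestTime = std::numeric_limits<float>::max();
          for (int i = 0; i < count; ++i) {
             if (perf[i].status == CUDNN_STATUS_SUCCESS && perf[i].time < bestTime) {
                bestTime = perf[i].time;
                best = i;
             }
          }
       } else {
          size_t bestMem = std::numeric_limits<size_t>::max();
          for (int i = 0; i < count; ++i) {
             if (perf[i].status == CUDNN_STATUS_SUCCESS && perf[i].memory < bestMem) {
                bestMem = perf[i].memory;
                best = i;
             }
          }
       }
       return best;
    }

The patch compares status against 0, which is the value of CUDNN_STATUS_SUCCESS.]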
+From d9b5e2f82917e7183b9f45a49135641981741477 Mon Sep 17 00:00:00 2001
+From: Konstantin Gizdov <kgizdov at gmail.com>
+Date: Tue, 21 Jul 2020 19:34:00 +0300
+Subject: [PATCH 03/10] adapt convolution backward to cuDNN 8
+
+---
+ .../src/DNN/Architectures/Cudnn/Propagate.cu  | 72 +++++++++++++++++++
+ 1 file changed, 72 insertions(+)
+
+diff --git a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+index cc953ee45f9..85a5c3aa175 100644
+--- a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
++++ b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+@@ -515,6 +515,77 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    // dx : Activation gradient to be computed                               -> activationGradients [in place op]
+    // dy : Gradient of activation from the following layer (backpropagation)-> activationGradients
+ 
++#if (CUDNN_VERSION >= 8000)
++   /**
++    * I'm sure there may be a faster way, but this works
++    */
++   convRequestedAlgoCount = 6; // reset to max number of available backward algorithms
++   cudnnConvolutionDescriptor_t tempConvBwdDescriptor;
++   CUDDNCHECK(cudnnCreateConvolutionDescriptor(&tempConvBwdDescriptor));
++   cudnnTensorDescriptor_t  outputBwdTensorDescriptor;
++   CUDNNCHECK(cudnnCreateTensorDescriptor(&outputBwdTensorDescriptor));
++   CUDNNCHECK(cudnnSetTensor4dDescriptor(outputBwdTensorDescriptor,
++                                             CUDNN_TENSOR_NCHW,  // Layout of the tensor in memory
++                                             Tensor_t::GetDataType(),
++                                             (int)L->GetBatchSize(),
++                                             (int)L->GetInputDepth(),
++                                             (int)L->GetInputHeight(),
++                                             (int)L->GetInputWidth()));
++   int algoCount;
++   cudnnConvolutionBwdDataAlgoPerf_t convPerfBwdResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
++   CUDNNCHECK(cudnnFindConvolutionBackwardDataAlgorithm(
++      cudnnHandle,
++      convDescriptors->WeightsDescriptor,
++      activationGradientsBackwardDescriptor,
++      tempConvBwdDescriptor,
++      outputBwdTensorDescriptor,
++      convRequestedAlgoCount,
++      &algoCount,
++      &convPerfBwdResults));
++   // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
++   // but we arrive at an chicken or egg problem:
++   // workspace size is calculated from chosen forward algorithm,
++   // but finding a forward algorithm depends on workspace size...
++   // i.e.
++   // Tensor_t & outputBwdTensor = L->GetInput();
++   // outputBwdTensor = Tensor_t(outputBwdTensor.GetDeviceBuffer(),{ L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth() },GetTensorLayout(),0,0);
++   // CUDNNCHECK(cudnnFindConvolutionBackwardDataAlgorithmEx(
++   //    cudnnHandle,
++   //    convDescriptors->WeightsDescriptor,
++   //    &filters,
++   //    activationGradientsBackwardDescriptor,
++   //    &activationGradientsBackwardTensor,
++   //    tempConvBwdDescriptor,
++   //    outputBwdTensorDescriptor,
++   //    &outputBwdTensor,
++   //    convRequestedAlgoCount,
++   //    &algoCount,
++   //    &convPerfBwdResults,
++   //    &convWorkspace,
++   //    convWorkspace->ForwardWorkspaceSize));
++   // instead choose either fastest or lowest memory algo as per preference
++   int algoIdx{0};
++   if (CNNOptions::ConvMaxWorkspaceSize != 0) {  // prefer fastest
++      float temp_runtime{std::numeric_limits<float>::max()};
++      for (int i = 0; i < algoCount; ++i) {
++         if (convPerfBwdResults[i].status != 0) continue;
++         if (convPerfBwdResults[i].time < temp_runtime) {
++            temp_runtime = convPerfBwdResults[i].time;
++            algoIdx = i;
++         }
++      }
++   } else {  // prefer smallest workspace size
++      size_t temp_memsize{std::numeric_limits<size_t>::max()};
++      for (int i = 0; i < algoCount; ++i) {
++         if (convPerfBwdResults[i].status != 0) continue;
++         if (convPerfBwdResults[i].memory < temp_memsize) {
++            temp_memsize = convPerfBwdResults[i].memory;
++            algoIdx = i;
++         }
++      }
++   }
++   convWorkspace->AlgorithmBackward = convPerfBwdResults[algoIdx].algo;
++#else
+    cudnnConvolutionBwdDataPreference_t preferenceBwdData =
+       (CNNOptions::ConvMaxWorkspaceSize != 0) ? CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST : CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+ 
+@@ -525,6 +596,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+                                                       activationGradientsBackwardDescriptor,
+                                                       preferenceBwdData, memLimit,
+                                                       &convWorkspace->AlgorithmBackward));
++#endif
+ 
+    std::cout << "CONV BWD Data Algo used  is "  << convWorkspace->AlgorithmBackward << std::endl;
+    //CUDNNCHECK(cudnnSetConvolutionMathType(convDescriptors->LayerDescriptor, CUDNN_TENSOR_OP_MATH));
+
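[For both directions, the chosen algorithm then determines how much scratch memory to allocate, which is the circular dependency the comments describe: the Ex finder wants a workspace up front, but the workspace size follows from the chosen algorithm. A hedged sketch of that follow-up step for the forward case, reusing descriptor names from patch 02 (the cudaMalloc target is illustrative; the backward-data analogue is cudnnGetConvolutionBackwardDataWorkspaceSize):

    size_t wsSize = 0;
    CUDNNCHECK(cudnnGetConvolutionForwardWorkspaceSize(
       cudnnHandle, inputTensorDescriptor, convDescriptors->WeightsDescriptor,
       convDescriptors->LayerDescriptor, outputTensor.GetTensorDescriptor(),
       convWorkspace->AlgorithmForward, &wsSize));
    void *wsPtr = nullptr;
    cudaMalloc(&wsPtr, wsSize);  // scratch buffer for the chosen algorithm
]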
+From 526b7177c0201be1d0c6b36de0772b7d2ecb90d5 Mon Sep 17 00:00:00 2001
+From: Konstantin Gizdov <kgizdov at gmail.com>
+Date: Wed, 22 Jul 2020 11:50:29 +0300
+Subject: [PATCH 04/10] fix typo and re-declarations
+
+---
+ tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu | 11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+diff --git a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+index 85a5c3aa175..1b7e3e845d8 100644
+--- a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
++++ b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+@@ -387,7 +387,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+     */
+    int convRequestedAlgoCount{8}; // requestedAlgoCount is setting how many algorithms to try, can be tuned, fixed for now as all available
+    cudnnConvolutionDescriptor_t tempConvDescriptor;
+-   CUDDNCHECK(cudnnCreateConvolutionDescriptor(&tempConvDescriptor));
++   CUDNNCHECK(cudnnCreateConvolutionDescriptor(&tempConvDescriptor));
+    cudnnTensorDescriptor_t  outputTensorDescriptor;
+    CUDNNCHECK(cudnnCreateTensorDescriptor(&outputTensorDescriptor));
+    CUDNNCHECK(cudnnSetTensor4dDescriptor(outputTensorDescriptor,
+@@ -407,7 +407,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+       outputTensorDescriptor,
+       convRequestedAlgoCount,
+       &algoCount,
+-      &convPerfResults));
++      convPerfResults));
+    // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
+    // but we arrive at an chicken or egg problem:
+    // workspace size is calculated from chosen forward algorithm,
+@@ -521,7 +521,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+     */
+    convRequestedAlgoCount = 6; // reset to max number of available backward algorithms
+    cudnnConvolutionDescriptor_t tempConvBwdDescriptor;
+-   CUDDNCHECK(cudnnCreateConvolutionDescriptor(&tempConvBwdDescriptor));
++   CUDNNCHECK(cudnnCreateConvolutionDescriptor(&tempConvBwdDescriptor));
+    cudnnTensorDescriptor_t  outputBwdTensorDescriptor;
+    CUDNNCHECK(cudnnCreateTensorDescriptor(&outputBwdTensorDescriptor));
+    CUDNNCHECK(cudnnSetTensor4dDescriptor(outputBwdTensorDescriptor,
+@@ -531,7 +531,6 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+                                              (int)L->GetInputDepth(),
+                                              (int)L->GetInputHeight(),
+                                              (int)L->GetInputWidth()));
+-   int algoCount;
+    cudnnConvolutionBwdDataAlgoPerf_t convPerfBwdResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
+    CUDNNCHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+       cudnnHandle,
+@@ -541,7 +540,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+       outputBwdTensorDescriptor,
+       convRequestedAlgoCount,
+       &algoCount,
+-      &convPerfBwdResults));
++      convPerfBwdResults));
+    // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
+    // but we arrive at an chicken or egg problem:
+    // workspace size is calculated from chosen forward algorithm,
+@@ -564,7 +563,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &convWorkspace,
+    //    convWorkspace->ForwardWorkspaceSize));
+    // instead choose either fastest or lowest memory algo as per preference
+-   int algoIdx{0};
++   algoIdx = 0;
+    if (CNNOptions::ConvMaxWorkspaceSize != 0) {  // prefer fastest
+       float temp_runtime{std::numeric_limits<float>::max()};
+       for (int i = 0; i < algoCount; ++i) {
+
+From 6d84e765322a72c48de00b4a9b7471da8a15fece Mon Sep 17 00:00:00 2001
+From: Konstantin Gizdov <kgizdov at gmail.com>
+Date: Wed, 22 Jul 2020 17:00:01 +0300
+Subject: [PATCH 05/10] implement workspace limits, fix an algorithm preference
+ bug and rewrite relevant sections
+
+---
+ .../src/DNN/Architectures/Cudnn/Propagate.cu  | 273 ++++++++++--------
+ 1 file changed, 151 insertions(+), 122 deletions(-)
+
+diff --git a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+index 1b7e3e845d8..2049e2b9195 100644
+--- a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
++++ b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+@@ -333,35 +333,108 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+                                              TDescriptors * & descriptors,
+                                              const DNN::CNN::TConvParams & /*params*/,
+                                              ConvLayer_t *L) {
+-   auto convWorkspace = new ConvWorkspace_t ();
++   auto convWorkspace = new ConvWorkspace_t();
++   size_t memLimit = (CNNOptions::ConvMaxWorkspaceSize > 0) ? static_cast<size_t>(CNNOptions::ConvMaxWorkspaceSize) : 0;
+    auto convDescriptors = static_cast<ConvDescriptors_t *>(descriptors);
++   // can we do the following and substitute below???
++   // auto weightsDescriptor{convDescriptors->WeightsDescriptor};
++   // auto convDescriptor{convDescriptors->LayerDescriptor};
+ 
++#if (CUDNN_VERSION >= 8000)
++   enum algoPreference { no_workspace, fastest, workspace_limit };
++   algoPreference algoChoice;
++   auto choose_algo = [](algoPreference const& algoPref, auto&& perfResults, size_t memLim = std::numeric_limits<size_t>::max()) -> int {
++      int algoIdx{0};
++      if (algoPref == algoPreference::fastest) {  // prefer fastest
++         float temp_runtime{std::numeric_limits<float>::max()};
++         for (int i = 0; i < algoCount; ++i) {
++            if (PerfResults[i].status == CUDNN_STATUS_SUCCESS && PerfResults[i].time < temp_runtime) {
++               temp_runtime = PerfResults[i].time;
++               algoIdx = i;
++            }
++         }
++      } else if (algoPref == algoPreference::workspace_limit) {  // constrain to workspace size
++         float temp_runtime{std::numeric_limits<float>::max()};
++         for (int i = 0; i < algoCount; ++i) {
++            if (PerfResults[i].status == CUDNN_STATUS_SUCCESS && PerfResults[i].time < temp_runtime && PerfResults[i].memory <= memLim) {
++               temp_runtime = PerfResults[i].time;
++               algoIdx = i;
++            }
++         }
++      } else {  // prefer smallest workspace size
++         size_t temp_memsize{std::numeric_limits<size_t>::max()};
++         for (int i = 0; i < algoCount; ++i) {
++            if (PerfResults[i].status == CUDNN_STATUS_SUCCESS && PerfResults[i].memory < temp_memsize) {
++               temp_memsize = PerfResults[i].memory;
++               algoIdx = i;
++            }
++         }
++      }
++      return algoIdx;
++   };
++#else
++   // More detailed alternative: cudnnFindConvolutionForwardAlgorithm (only option in newer cuDNN versions)
++   cudnnConvolutionFwdPreference_t preferenceFwd;
++   cudnnConvolutionBwdDataPreference_t preferenceBwdData;
++   cudnnConvolutionBwdFilterPreference_t preferenceBwdFilter;
++#endif
++   // decide on algorithm preference early
++   if (CNNOptions::ConvMaxWorkspaceSize < 0) {
++      // no workspace case
++#if (CUDNN_VERSION >= 8000)
++      algoChoice = no_workspace;
++#else
++      preferenceFwd = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
++      preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
++      preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
++#endif
++
++   } else if (CNNOptions::ConvMaxWorkspaceSize == 0) {
++      // fastest overall
++#if (CUDNN_VERSION >= 8000)
++      algoChoice = fastest;
++#else
++      preferenceFwd = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
++      preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
++      preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
++#endif
++
++   } else {
++      // fastest in memory limit
++#if (CUDNN_VERSION >= 8000)
++      algoChoice = workspace_limit;
++#else
++      preferenceFwd = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
++      preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
++      preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
++#endif
++   }
+    // fix the weight tensor shapes
+    // by default the weights are columnmajor, set them to be row major . At this points
+    // they are not yet initialized
+    Tensor_t & filters = L->GetWeightsAt(0);
+-   filters = Tensor_t (filters.GetDeviceBuffer(), {L->GetDepth(),L->GetInputDepth(), L->GetFilterHeight(),L->GetFilterWidth()}, MemoryLayout::RowMajor, 0, 0 );
+-   //PrintTensor(L->GetWeightsAt(0));
++   filters = Tensor_t(filters.GetDeviceBuffer(), {L->GetDepth(), L->GetInputDepth(), L->GetFilterHeight(), L->GetFilterWidth()}, MemoryLayout::RowMajor, 0, 0);
++   // PrintTensor(L->GetWeightsAt(0));
+    Tensor_t & biases = L->GetBiasesAt(0);
+-   biases = Tensor_t (biases.GetDeviceBuffer(), {1, L->GetDepth(),1,1}, GetTensorLayout(), 0, 0 );
++   biases = Tensor_t(biases.GetDeviceBuffer(), {1, L->GetDepth(), 1, 1}, GetTensorLayout(), 0, 0);
+ 
+    Tensor_t & outputTensor = L->GetOutput();
+-   outputTensor = Tensor_t(outputTensor.GetDeviceBuffer(),{ L->GetBatchSize(), L->GetDepth(), L->GetHeight(), L->GetWidth() },GetTensorLayout(),0,0 );
++   outputTensor = Tensor_t(outputTensor.GetDeviceBuffer(), {L->GetBatchSize(), L->GetDepth(), L->GetHeight(), L->GetWidth()}, GetTensorLayout(), 0, 0);
+    Tensor_t & inputActivation = L->GetInputActivation();
+-   inputActivation = Tensor_t(inputActivation.GetDeviceBuffer(),outputTensor.GetShape() ,GetTensorLayout(),0,0 );
++   inputActivation = Tensor_t(inputActivation.GetDeviceBuffer(),outputTensor.GetShape() ,GetTensorLayout(), 0, 0);
+ 
+    Tensor_t &  activationGradients = L->GetActivationGradients();
+-   activationGradients =  Tensor_t(activationGradients.GetDeviceBuffer(),outputTensor.GetShape() ,GetTensorLayout(),0,0 );
++   activationGradients = Tensor_t(activationGradients.GetDeviceBuffer(),outputTensor.GetShape(), GetTensorLayout(), 0, 0);
+ 
+    Tensor_t & weightGradients = L->GetWeightGradientsAt(0);
+-   weightGradients = Tensor_t( weightGradients.GetDeviceBuffer(), filters.GetShape(), GetTensorLayout(), 0, 0 );
++   weightGradients = Tensor_t(weightGradients.GetDeviceBuffer(), filters.GetShape(), GetTensorLayout(), 0, 0);
+ 
+    Tensor_t & biasGradients = L->GetBiasGradientsAt(0);
+-   biasGradients = Tensor_t( biasGradients.GetDeviceBuffer(), biases.GetShape(), GetTensorLayout(), 0, 0 );
++   biasGradients = Tensor_t(biasGradients.GetDeviceBuffer(), biases.GetShape(), GetTensorLayout(), 0, 0);
+ 
+ 
+    // FIXME: Use descriptors instead (Tensor device memory is otherwise allocated during initialization)
+-   //Tensor_t inputTensor  ({L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth()}, MemoryLayout::RowMajor, 0, 0);
++   // Tensor_t inputTensor  ({L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth()}, MemoryLayout::RowMajor, 0, 0);
+    cudnnTensorDescriptor_t  inputTensorDescriptor;
+    CUDNNCHECK(cudnnCreateTensorDescriptor(&inputTensorDescriptor) );
+    CUDNNCHECK(cudnnSetTensor4dDescriptor(inputTensorDescriptor,
+@@ -385,79 +458,44 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    /**
+     * I'm sure there may be a faster way, but this works
+     */
+-   int convRequestedAlgoCount{8}; // requestedAlgoCount is setting how many algorithms to try, can be tuned, fixed for now as all available
+-   cudnnConvolutionDescriptor_t tempConvDescriptor;
+-   CUDNNCHECK(cudnnCreateConvolutionDescriptor(&tempConvDescriptor));
+-   cudnnTensorDescriptor_t  outputTensorDescriptor;
+-   CUDNNCHECK(cudnnCreateTensorDescriptor(&outputTensorDescriptor));
+-   CUDNNCHECK(cudnnSetTensor4dDescriptor(outputTensorDescriptor,
+-                                             CUDNN_TENSOR_NCHW,  // Layout of the tensor in memory
+-                                             Tensor_t::GetDataType(),
+-                                             (int)L->GetBatchSize(),
+-                                             (int)L->GetDepth(),
+-                                             (int)L->GetHeight(),
+-                                             (int)L->GetWidth()));
++   int convRequestedAlgoCount{8};  // requestedAlgoCount is setting how many algorithms to try, can be tuned, fixed for now as all available
++
+    int algoCount;
+    cudnnConvolutionFwdAlgoPerf_t convPerfResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
+-   CUDNNCHECK(cudnnFindConvolutionForwardAlgorithm(
+-      cudnnHandle,
+-      inputTensorDescriptor,
+-      convDescriptors->WeightsDescriptor,
+-      tempConvDescriptor,
+-      outputTensorDescriptor,
+-      convRequestedAlgoCount,
+-      &algoCount,
+-      convPerfResults));
++   CUDNNCHECK(
++      cudnnFindConvolutionForwardAlgorithm(
++         cudnnHandle,
++         inputTensorDescriptor,
++         convDescriptors->WeightsDescriptor,
++         convDescriptors->LayerDescriptor,
++         outputTensor.GetTensorDescriptor(),
++         convRequestedAlgoCount,
++         &algoCount,
++         convPerfResults
++      )
++   );
+    // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
+-   // but we arrive at an chicken or egg problem:
+-   // workspace size is calculated from chosen forward algorithm,
+-   // but finding a forward algorithm depends on workspace size...
+    // i.e.
+-   // Tensor_t & inputTensor = L->GetInput();
+-   // inputTensor = Tensor_t(inputTensor.GetDeviceBuffer(),{ L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth() },GetTensorLayout(),0,0);
++   // create an input tensor before the inputTensorDescriptor
++   // and get the descriptor from there
++   // Tensor_t inputTensor({L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth()}, MemoryLayout::RowMajor, 0, 0);
+    // CUDNNCHECK(cudnnFindConvolutionForwardAlgorithmEx(
+    //    cudnnHandle,
+-   //    inputTensorDescriptor,
++   //    inputTensor.GetTensorDescriptor(),
+    //    &inputTensor,
+    //    convDescriptors->WeightsDescriptor,
+    //    &filters,
+-   //    tempConvDescriptor,
+-   //    outputTensorDescriptor,
++   //    convDescriptors->LayerDescriptor,
++   //    outputTensor.GetTensorDescriptor(),
+    //    &outputTensor,
+    //    convRequestedAlgoCount,
+    //    &algoCount,
+    //    &convPerfResults,
+    //    &convWorkspace,
+-   //    convWorkspace->ForwardWorkspaceSize));
++   //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   int algoIdx{0};
+-   if (CNNOptions::ConvMaxWorkspaceSize != 0) {  // prefer fastest
+-      float temp_runtime{std::numeric_limits<float>::max()};
+-      for (int i = 0; i < algoCount; ++i) {
+-         if (convPerfResults[i].status != 0) continue;
+-         if (convPerfResults[i].time < temp_runtime) {
+-            temp_runtime = convPerfResults[i].time;
+-            algoIdx = i;
+-         }
+-      }
+-   } else {  // prefer smallest workspace size
+-      size_t temp_memsize{std::numeric_limits<size_t>::max()};
+-      for (int i = 0; i < algoCount; ++i) {
+-         if (convPerfResults[i].status != 0) continue;
+-         if (convPerfResults[i].memory < temp_memsize) {
+-            temp_memsize = convPerfResults[i].memory;
+-            algoIdx = i;
+-         }
+-      }
+-   }
+-   convWorkspace->AlgorithmForward = convPerfResults[algoIdx].algo;
++   convWorkspace->AlgorithmForward = convPerfResults[choose_algo(algoChoice, convPerfResults, memLimit)].algo;
+ #else
+-   // More detailed alternative: cudnnFindConvolutionForwardAlgorithm (only option in newer cuDNN versions)
+-   cudnnConvolutionFwdPreference_t preferenceFwd = (CNNOptions::ConvMaxWorkspaceSize !=0) ? CUDNN_CONVOLUTION_FWD_PREFER_FASTEST :
+-                                                   CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+-
+-   size_t memLimit = (CNNOptions::ConvMaxWorkspaceSize > 0) ? (size_t) CNNOptions::ConvMaxWorkspaceSize : 0;
+-
+    CUDNNCHECK(cudnnGetConvolutionForwardAlgorithm(
+       cudnnHandle, inputTensorDescriptor, convDescriptors->WeightsDescriptor, convDescriptors->LayerDescriptor,
+       outputTensor.GetTensorDescriptor(), preferenceFwd,
+@@ -519,75 +557,36 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    /**
+     * I'm sure there may be a faster way, but this works
+     */
+-   convRequestedAlgoCount = 6; // reset to max number of available backward algorithms
+-   cudnnConvolutionDescriptor_t tempConvBwdDescriptor;
+-   CUDNNCHECK(cudnnCreateConvolutionDescriptor(&tempConvBwdDescriptor));
+-   cudnnTensorDescriptor_t  outputBwdTensorDescriptor;
+-   CUDNNCHECK(cudnnCreateTensorDescriptor(&outputBwdTensorDescriptor));
+-   CUDNNCHECK(cudnnSetTensor4dDescriptor(outputBwdTensorDescriptor,
+-                                             CUDNN_TENSOR_NCHW,  // Layout of the tensor in memory
+-                                             Tensor_t::GetDataType(),
+-                                             (int)L->GetBatchSize(),
+-                                             (int)L->GetInputDepth(),
+-                                             (int)L->GetInputHeight(),
+-                                             (int)L->GetInputWidth()));
+-   cudnnConvolutionBwdDataAlgoPerf_t convPerfBwdResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
++   convRequestedAlgoCount = 6;  // reset to max number of available backward algorithms
++   cudnnConvolutionBwdDataAlgoPerf_t convPerfBwdDataResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
+    CUDNNCHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+       cudnnHandle,
+       convDescriptors->WeightsDescriptor,
++      activationGradients.GetTensorDescriptor(),
++      convDescriptors->LayerDescriptor,
+       activationGradientsBackwardDescriptor,
+-      tempConvBwdDescriptor,
+-      outputBwdTensorDescriptor,
+       convRequestedAlgoCount,
+       &algoCount,
+-      convPerfBwdResults));
++      convPerfBwdDataResults));
+    // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
+-   // but we arrive at an chicken or egg problem:
+-   // workspace size is calculated from chosen forward algorithm,
+-   // but finding a forward algorithm depends on workspace size...
+    // i.e.
+-   // Tensor_t & outputBwdTensor = L->GetInput();
+-   // outputBwdTensor = Tensor_t(outputBwdTensor.GetDeviceBuffer(),{ L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth() },GetTensorLayout(),0,0);
+    // CUDNNCHECK(cudnnFindConvolutionBackwardDataAlgorithmEx(
+    //    cudnnHandle,
+    //    convDescriptors->WeightsDescriptor,
+    //    &filters,
++   //    activationGradients.GetTensorDescriptor(),
++   //    &activationGradients,
++   //    convDescriptors->LayerDescriptor,
+    //    activationGradientsBackwardDescriptor,
+-   //    &activationGradientsBackwardTensor,
+-   //    tempConvBwdDescriptor,
+-   //    outputBwdTensorDescriptor,
+-   //    &outputBwdTensor,
++   //    &inputTensor,
+    //    convRequestedAlgoCount,
+    //    &algoCount,
+    //    &convPerfBwdResults,
+    //    &convWorkspace,
+-   //    convWorkspace->ForwardWorkspaceSize));
++   //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   algoIdx = 0;
+-   if (CNNOptions::ConvMaxWorkspaceSize != 0) {  // prefer fastest
+-      float temp_runtime{std::numeric_limits<float>::max()};
+-      for (int i = 0; i < algoCount; ++i) {
+-         if (convPerfBwdResults[i].status != 0) continue;
+-         if (convPerfBwdResults[i].time < temp_runtime) {
+-            temp_runtime = convPerfBwdResults[i].time;
+-            algoIdx = i;
+-         }
+-      }
+-   } else {  // prefer smallest workspace size
+-      size_t temp_memsize{std::numeric_limits<size_t>::max()};
+-      for (int i = 0; i < algoCount; ++i) {
+-         if (convPerfBwdResults[i].status != 0) continue;
+-         if (convPerfBwdResults[i].memory < temp_memsize) {
+-            temp_memsize = convPerfBwdResults[i].memory;
+-            algoIdx = i;
+-         }
+-      }
+-   }
+-   convWorkspace->AlgorithmBackward = convPerfBwdResults[algoIdx].algo;
++   convWorkspace->AlgorithmBackward = convPerfBwdDataResults[choose_algo(algoChoice, convPerfBwdDataResults, memLimit)].algo;
+ #else
+-   cudnnConvolutionBwdDataPreference_t preferenceBwdData =
+-      (CNNOptions::ConvMaxWorkspaceSize != 0) ? CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST : CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+-
+    CUDNNCHECK(cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle,
+                                                       convDescriptors->WeightsDescriptor,
+                                                       activationGradients.GetTensorDescriptor(),
+@@ -628,11 +627,40 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    // here should be able to use inputTensorDescriptor
+    cudnnTensorDescriptor_t activationBackwardDescriptor = inputTensorDescriptor;
+ 
+-   // cudnnConvolutionBwdFilterPreference_t preference =
+-   cudnnConvolutionBwdFilterPreference_t preferenceBwdFilter = (CNNOptions::ConvMaxWorkspaceSize != 0)
+-                                                                  ? CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE
+-                                                                  : CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+-
++#if (CUDNN_VERSION >= 8000)
++   /**
++    * I'm sure there may be a faster way, but this works
++    */
++   convRequestedAlgoCount = 6;  // reset to max number of available backward algorithms
++   cudnnConvolutionBwdDataAlgoPerf_t convPerfBwdFilterResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
++   CUDNNCHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
++      cudnnHandle,
++      activationBackwardDescriptor,
++      activationGradients.GetTensorDescriptor(),
++      convDescriptors->LayerDescriptor,
++      convDescriptors->WeightsDescriptor,
++      convRequestedAlgoCount,
++      &algoCount,
++      convPerfBwdFilterResults));
++   // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
++   // i.e.
++   // CUDNNCHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx(
++   //    cudnnHandle,
++   //    activationBackwardDescriptor,
++   //    &inputTensor,
++   //    activationGradients.GetTensorDescriptor(),
++   //    &activationGradients,
++   //    convDescriptors->LayerDescriptor,
++   //    convDescriptors->WeightsDescriptor,
++   //    &filters,
++   //    convRequestedAlgoCount,
++   //    &algoCount,
++   //    &convPerfBwdFilterResults,
++   //    &convWorkspace,
++   //    memLimit));  // use memLimit for workspace size
++   // instead choose either fastest or lowest memory algo as per preference
++   convWorkspace->AlgorithmBackward = convPerfBwdFilterResults[choose_algo(algoChoice, convPerfBwdFilterResults, memLimit)].algo;
++#else
+    CUDNNCHECK(cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle,
+                                                          activationBackwardDescriptor,
+                                                          activationGradients.GetTensorDescriptor(),
+@@ -641,6 +669,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+                                                          preferenceBwdFilter,
+                                                          memLimit,
+                                                          &convWorkspace->HelperAlgorithm));
++#endif
+ 
+    std::cout << "CONV BWD Filter Algo used  is " << convWorkspace->HelperAlgorithm << std::endl;
+ 
+
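[One caveat in the choose_algo lambda introduced here: a lambda with an auto&& parameter is a generic lambda, which is a C++14 feature, and the body also reads algoCount and PerfResults, neither of which is a parameter or a capture. Both issues are resolved later in the series (patch 09, "make it compile and support C++11"). A two-line illustration of the language restriction:

    auto id = [](auto &&x) { return x; };  // generic lambda: C++14 and later
    // under -std=c++11 this is ill-formed; a concrete parameter type is required
]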
+From a9d39cc9ccf9ae474d90b6671d3e0d69d4cf6872 Mon Sep 17 00:00:00 2001
+From: Konstantin Gizdov <kgizdov at gmail.com>
+Date: Wed, 22 Jul 2020 17:11:30 +0300
+Subject: [PATCH 06/10] implement correct logic behind cudnn algorithm
+ preference
+
+---
+ .../src/DNN/Architectures/Cudnn/Propagate.cu  | 20 +++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+diff --git a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+index 2049e2b9195..b74c99d1a99 100644
+--- a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
++++ b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+@@ -380,18 +380,8 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+ #endif
+    // decide on algorithm preference early
+    if (CNNOptions::ConvMaxWorkspaceSize < 0) {
+-      // no workspace case
+ #if (CUDNN_VERSION >= 8000)
+-      algoChoice = no_workspace;
+-#else
+-      preferenceFwd = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+-      preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+-      preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+-#endif
+-
+-   } else if (CNNOptions::ConvMaxWorkspaceSize == 0) {
+       // fastest overall
+-#if (CUDNN_VERSION >= 8000)
+       algoChoice = fastest;
+ #else
+       preferenceFwd = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+@@ -399,6 +389,16 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+       preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+ #endif
+ 
++   } else if (CNNOptions::ConvMaxWorkspaceSize == 0) {
++      // no workspace case
++#if (CUDNN_VERSION >= 8000)
++      algoChoice = no_workspace;
++#else
++      preferenceFwd = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
++      preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
++      preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
++#endif
++
+    } else {
+       // fastest in memory limit
+ #if (CUDNN_VERSION >= 8000)
+
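[With this correction, the sign of CNNOptions::ConvMaxWorkspaceSize selects the preference as follows: negative means fastest with no memory cap, zero means no scratch workspace at all, positive means fastest within that many bytes. A small sketch with an illustrative helper name:

    enum algoPreference { no_workspace, fastest, workspace_limit };

    algoPreference mapPreference(long long maxWorkspaceSize) {
       if (maxWorkspaceSize < 0)  return fastest;        // no memory cap
       if (maxWorkspaceSize == 0) return no_workspace;   // avoid scratch memory
       return workspace_limit;                           // fastest within the cap
    }
]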
+From 6282dfa816c7f51af5c0ecaa0065514e3f627631 Mon Sep 17 00:00:00 2001
+From: Konstantin Gizdov <kgizdov at gmail.com>
+Date: Wed, 22 Jul 2020 18:51:56 +0300
+Subject: [PATCH 07/10] use decltype instead of auto, fix typos
+
+---
+ .../src/DNN/Architectures/Cudnn/Propagate.cu  | 22 +++++++++----------
+ 1 file changed, 11 insertions(+), 11 deletions(-)
+
+diff --git a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+index b74c99d1a99..6cefd72c099 100644
+--- a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
++++ b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+@@ -343,29 +343,29 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+ #if (CUDNN_VERSION >= 8000)
+    enum algoPreference { no_workspace, fastest, workspace_limit };
+    algoPreference algoChoice;
+-   auto choose_algo = [](algoPreference const& algoPref, auto&& perfResults, size_t memLim = std::numeric_limits<size_t>::max()) -> int {
++   auto choose_algo = [](algoPreference const& algoPref, int const algoCount, decltype(perfResults) const& perfResults, size_t memLim = std::numeric_limits<size_t>::max()) -> int {
+       int algoIdx{0};
+       if (algoPref == algoPreference::fastest) {  // prefer fastest
+          float temp_runtime{std::numeric_limits<float>::max()};
+          for (int i = 0; i < algoCount; ++i) {
+-            if (PerfResults[i].status == CUDNN_STATUS_SUCCESS && PerfResults[i].time < temp_runtime) {
+-               temp_runtime = PerfResults[i].time;
++            if (perfResults[i].status == CUDNN_STATUS_SUCCESS && perfResults[i].time < temp_runtime) {
++               temp_runtime = perfResults[i].time;
+                algoIdx = i;
+             }
+          }
+       } else if (algoPref == algoPreference::workspace_limit) {  // constrain to workspace size
+          float temp_runtime{std::numeric_limits<float>::max()};
+          for (int i = 0; i < algoCount; ++i) {
+-            if (PerfResults[i].status == CUDNN_STATUS_SUCCESS && PerfResults[i].time < temp_runtime && PerfResults[i].memory <= memLim) {
+-               temp_runtime = PerfResults[i].time;
++            if (perfResults[i].status == CUDNN_STATUS_SUCCESS && perfResults[i].time < temp_runtime && perfResults[i].memory <= memLim) {
++               temp_runtime = perfResults[i].time;
+                algoIdx = i;
+             }
+          }
+       } else {  // prefer smallest workspace size
+          size_t temp_memsize{std::numeric_limits<size_t>::max()};
+          for (int i = 0; i < algoCount; ++i) {
+-            if (PerfResults[i].status == CUDNN_STATUS_SUCCESS && PerfResults[i].memory < temp_memsize) {
+-               temp_memsize = PerfResults[i].memory;
++            if (perfResults[i].status == CUDNN_STATUS_SUCCESS && perfResults[i].memory < temp_memsize) {
++               temp_memsize = perfResults[i].memory;
+                algoIdx = i;
+             }
+          }
+@@ -494,7 +494,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &convWorkspace,
+    //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   convWorkspace->AlgorithmForward = convPerfResults[choose_algo(algoChoice, convPerfResults, memLimit)].algo;
++   convWorkspace->AlgorithmForward = convPerfResults[choose_algo(algoChoice, algoCount, convPerfResults, memLimit)].algo;
+ #else
+    CUDNNCHECK(cudnnGetConvolutionForwardAlgorithm(
+       cudnnHandle, inputTensorDescriptor, convDescriptors->WeightsDescriptor, convDescriptors->LayerDescriptor,
+@@ -585,7 +585,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &convWorkspace,
+    //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   convWorkspace->AlgorithmBackward = convPerfBwdDataResults[choose_algo(algoChoice, convPerfBwdDataResults, memLimit)].algo;
++   convWorkspace->AlgorithmBackward = convPerfBwdDataResults[choose_algo(algoChoice, algoCount, convPerfBwdDataResults, memLimit)].algo;
+ #else
+    CUDNNCHECK(cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle,
+                                                       convDescriptors->WeightsDescriptor,
+@@ -632,7 +632,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+     * I'm sure there may be a faster way, but this works
+     */
+    convRequestedAlgoCount = 6;  // reset to max number of available backward algorithms
+-   cudnnConvolutionBwdDataAlgoPerf_t convPerfBwdFilterResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
++   cudnnConvolutionBwdFilterAlgoPerf_t convPerfBwdFilterResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
+    CUDNNCHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+       cudnnHandle,
+       activationBackwardDescriptor,
+@@ -659,7 +659,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &convWorkspace,
+    //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   convWorkspace->AlgorithmBackward = convPerfBwdFilterResults[choose_algo(algoChoice, convPerfBwdFilterResults, memLimit)].algo;
++   convWorkspace->AlgorithmBackward = convPerfBwdFilterResults[choose_algo(algoChoice, algoCount, convPerfBwdFilterResults, memLimit)].algo;
+ #else
+    CUDNNCHECK(cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle,
+                                                          activationBackwardDescriptor,
+
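[Note that decltype(perfResults) in the new signature names the parameter inside its own declaration, before it is in scope, so this revision still cannot compile; patch 09 replaces it with a concrete pointer union. A minimal illustration of the error:

    // error: use of undeclared identifier 'perfResults'
    auto f = [](decltype(perfResults) const &perfResults) -> int { return 0; };
]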
+From 259c1c9c4d86391d1987f6635a2aece8cae587ac Mon Sep 17 00:00:00 2001
+From: Konstantin Gizdov <kgizdov at gmail.com>
+Date: Wed, 22 Jul 2020 19:39:40 +0300
+Subject: [PATCH 08/10] assign backward filter algo to the correct place
+
+---
+ tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+index 6cefd72c099..5a80dfbc03d 100644
+--- a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
++++ b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+@@ -659,7 +659,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &convWorkspace,
+    //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   convWorkspace->AlgorithmBackward = convPerfBwdFilterResults[choose_algo(algoChoice, algoCount, convPerfBwdFilterResults, memLimit)].algo;
++   convWorkspace->HelperAlgorithm = convPerfBwdFilterResults[choose_algo(algoChoice, algoCount, convPerfBwdFilterResults, memLimit)].algo;
+ #else
+    CUDNNCHECK(cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle,
+                                                          activationBackwardDescriptor,
+
+From 2c109efea0e970b380a62f6102a286542676912a Mon Sep 17 00:00:00 2001
+From: Konstantin Gizdov <kgizdov at gmail.com>
+Date: Thu, 23 Jul 2020 17:58:58 +0300
+Subject: [PATCH 09/10] make it compile and support C++11
+
+---
+ .../src/DNN/Architectures/Cudnn/Propagate.cu  | 49 ++++++++++++-------
+ 1 file changed, 30 insertions(+), 19 deletions(-)
+
+diff --git a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+index 5a80dfbc03d..66ce64a5efc 100644
+--- a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
++++ b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+@@ -343,29 +343,37 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+ #if (CUDNN_VERSION >= 8000)
+    enum algoPreference { no_workspace, fastest, workspace_limit };
+    algoPreference algoChoice;
+-   auto choose_algo = [](algoPreference const& algoPref, int const algoCount, decltype(perfResults) const& perfResults, size_t memLim = std::numeric_limits<size_t>::max()) -> int {
++   // C++11 lambdas cannot be templated, so we have to do this HORRIBLE stuff...
++   union LocalPerf_t {
++      // these three type are absolutely equivalent
++      // and one can access them as they wish to get info
++      cudnnConvolutionFwdAlgoPerf_t * fwd;
++      cudnnConvolutionBwdFilterAlgoPerf_t * bwdFilter;
++      cudnnConvolutionBwdDataAlgoPerf_t * bwdData;
++   };
++   auto choose_algo = [](algoPreference const & algoPref, int const algoCount, LocalPerf_t const & perfResults, size_t memLim = std::numeric_limits<size_t>::max()) -> int {
+       int algoIdx{0};
+       if (algoPref == algoPreference::fastest) {  // prefer fastest
+          float temp_runtime{std::numeric_limits<float>::max()};
+          for (int i = 0; i < algoCount; ++i) {
+-            if (perfResults[i].status == CUDNN_STATUS_SUCCESS && perfResults[i].time < temp_runtime) {
+-               temp_runtime = perfResults[i].time;
++            if (perfResults.fwd[i].status == CUDNN_STATUS_SUCCESS && perfResults.fwd[i].time < temp_runtime) {
++               temp_runtime = perfResults.fwd[i].time;
+                algoIdx = i;
+             }
+          }
+       } else if (algoPref == algoPreference::workspace_limit) {  // constrain to workspace size
+          float temp_runtime{std::numeric_limits<float>::max()};
+          for (int i = 0; i < algoCount; ++i) {
+-            if (perfResults[i].status == CUDNN_STATUS_SUCCESS && perfResults[i].time < temp_runtime && perfResults[i].memory <= memLim) {
+-               temp_runtime = perfResults[i].time;
++            if (perfResults.fwd[i].status == CUDNN_STATUS_SUCCESS && perfResults.fwd[i].time < temp_runtime && perfResults.fwd[i].memory <= memLim) {
++               temp_runtime = perfResults.fwd[i].time;
+                algoIdx = i;
+             }
+          }
+       } else {  // prefer smallest workspace size
+          size_t temp_memsize{std::numeric_limits<size_t>::max()};
+          for (int i = 0; i < algoCount; ++i) {
+-            if (perfResults[i].status == CUDNN_STATUS_SUCCESS && perfResults[i].memory < temp_memsize) {
+-               temp_memsize = perfResults[i].memory;
++            if (perfResults.fwd[i].status == CUDNN_STATUS_SUCCESS && perfResults.fwd[i].memory < temp_memsize) {
++               temp_memsize = perfResults.fwd[i].memory;
+                algoIdx = i;
+             }
+          }
+@@ -461,7 +469,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    int convRequestedAlgoCount{8};  // requestedAlgoCount is setting how many algorithms to try, can be tuned, fixed for now as all available
+ 
+    int algoCount;
+-   cudnnConvolutionFwdAlgoPerf_t convPerfResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
++   cudnnConvolutionFwdAlgoPerf_t convFwdPerfResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
+    CUDNNCHECK(
+       cudnnFindConvolutionForwardAlgorithm(
+          cudnnHandle,
+@@ -471,7 +479,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+          outputTensor.GetTensorDescriptor(),
+          convRequestedAlgoCount,
+          &algoCount,
+-         convPerfResults
++         convFwdPerfResults
+       )
+    );
+    // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
+@@ -490,11 +498,12 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &outputTensor,
+    //    convRequestedAlgoCount,
+    //    &algoCount,
+-   //    &convPerfResults,
++   //    &convFwdPerfResults,
+    //    &convWorkspace,
+    //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   convWorkspace->AlgorithmForward = convPerfResults[choose_algo(algoChoice, algoCount, convPerfResults, memLimit)].algo;
++   LocalPerf_t fwdPerfResults{convFwdPerfResults};
++   convWorkspace->AlgorithmForward = convFwdPerfResults[choose_algo(algoChoice, algoCount, fwdPerfResults, memLimit)].algo;
+ #else
+    CUDNNCHECK(cudnnGetConvolutionForwardAlgorithm(
+       cudnnHandle, inputTensorDescriptor, convDescriptors->WeightsDescriptor, convDescriptors->LayerDescriptor,
+@@ -558,7 +567,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+     * I'm sure there may be a faster way, but this works
+     */
+    convRequestedAlgoCount = 6;  // reset to max number of available backward algorithms
+-   cudnnConvolutionBwdDataAlgoPerf_t convPerfBwdDataResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
++   cudnnConvolutionBwdDataAlgoPerf_t convBwdDataPerfResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
+    CUDNNCHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+       cudnnHandle,
+       convDescriptors->WeightsDescriptor,
+@@ -567,7 +576,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+       activationGradientsBackwardDescriptor,
+       convRequestedAlgoCount,
+       &algoCount,
+-      convPerfBwdDataResults));
++      convBwdDataPerfResults));
+    // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
+    // i.e.
+    // CUDNNCHECK(cudnnFindConvolutionBackwardDataAlgorithmEx(
+@@ -581,11 +590,12 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &inputTensor,
+    //    convRequestedAlgoCount,
+    //    &algoCount,
+-   //    &convPerfBwdResults,
++   //    &convBwdDataPerfResults,
+    //    &convWorkspace,
+    //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   convWorkspace->AlgorithmBackward = convPerfBwdDataResults[choose_algo(algoChoice, algoCount, convPerfBwdDataResults, memLimit)].algo;
++   LocalPerf_t bwdDataPerfResults{convBwdDataPerfResults};
++   convWorkspace->AlgorithmBackward = convBwdDataPerfResults[choose_algo(algoChoice, algoCount, bwdDataPerfResults, memLimit)].algo;
+ #else
+    CUDNNCHECK(cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle,
+                                                       convDescriptors->WeightsDescriptor,
+@@ -632,7 +642,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+     * I'm sure there may be a faster way, but this works
+     */
+    convRequestedAlgoCount = 6;  // reset to max number of available backward algorithms
+-   cudnnConvolutionBwdFilterAlgoPerf_t convPerfBwdFilterResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
++   cudnnConvolutionBwdFilterAlgoPerf_t convBwdFilterPerfResults[convRequestedAlgoCount];  // this will store metrics to choose convolution algorithm
+    CUDNNCHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+       cudnnHandle,
+       activationBackwardDescriptor,
+@@ -641,7 +651,7 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+       convDescriptors->WeightsDescriptor,
+       convRequestedAlgoCount,
+       &algoCount,
+-      convPerfBwdFilterResults));
++      convBwdFilterPerfResults));
+    // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
+    // i.e.
+    // CUDNNCHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx(
+@@ -655,11 +665,12 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &filters,
+    //    convRequestedAlgoCount,
+    //    &algoCount,
+-   //    &convPerfBwdFilterResults,
++   //    &convBwdFilterPerfResults,
+    //    &convWorkspace,
+    //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   convWorkspace->HelperAlgorithm = convPerfBwdFilterResults[choose_algo(algoChoice, algoCount, convPerfBwdFilterResults, memLimit)].algo;
++   LocalPerf_t bwdFilterPerfResults{convBwdFilterPerfResults};
++   convWorkspace->HelperAlgorithm = convBwdFilterPerfResults[choose_algo(algoChoice, algoCount, bwdFilterPerfResults, memLimit)].algo;
+ #else
+    CUDNNCHECK(cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle,
+                                                          activationBackwardDescriptor,
+
+From 1f1dfbbac06c29df98bdebdd9367bf566f2e7ce8 Mon Sep 17 00:00:00 2001
+From: Konstantin Gizdov <kgizdov at gmail.com>
+Date: Thu, 23 Jul 2020 21:37:33 +0300
+Subject: [PATCH 10/10] compiles completely
+
+---
+ .../src/DNN/Architectures/Cudnn/Propagate.cu  | 83 ++++++++++---------
+ 1 file changed, 46 insertions(+), 37 deletions(-)
+
+diff --git a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+index 66ce64a5efc..0694369860a 100644
+--- a/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
++++ b/tmva/tmva/src/DNN/Architectures/Cudnn/Propagate.cu
+@@ -344,41 +344,50 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    enum algoPreference { no_workspace, fastest, workspace_limit };
+    algoPreference algoChoice;
+    // C++11 lambdas cannot be templated, so we have to do this HORRIBLE stuff...
+-   union LocalPerf_t {
+-      // these three type are absolutely equivalent
+-      // and one can access them as they wish to get info
+-      cudnnConvolutionFwdAlgoPerf_t * fwd;
+-      cudnnConvolutionBwdFilterAlgoPerf_t * bwdFilter;
+-      cudnnConvolutionBwdDataAlgoPerf_t * bwdData;
+-   };
+-   auto choose_algo = [](algoPreference const & algoPref, int const algoCount, LocalPerf_t const & perfResults, size_t memLim = std::numeric_limits<size_t>::max()) -> int {
+-      int algoIdx{0};
+-      if (algoPref == algoPreference::fastest) {  // prefer fastest
+-         float temp_runtime{std::numeric_limits<float>::max()};
+-         for (int i = 0; i < algoCount; ++i) {
+-            if (perfResults.fwd[i].status == CUDNN_STATUS_SUCCESS && perfResults.fwd[i].time < temp_runtime) {
+-               temp_runtime = perfResults.fwd[i].time;
+-               algoIdx = i;
++   class LocalPerf {
++    public:
++      LocalPerf(cudnnConvolutionFwdAlgoPerf_t * fwd) {m_fwd = fwd;}
++      LocalPerf(cudnnConvolutionBwdFilterAlgoPerf_t * bwdFilter) {m_bwdFilter = bwdFilter;}
++      LocalPerf(cudnnConvolutionBwdDataAlgoPerf_t * bwdData) {m_bwdData = bwdData;}
++      size_t getMemory(int i) {return m_fwd != nullptr ? m_fwd[i].memory : m_bwdFilter != nullptr ? m_bwdFilter[i].memory : m_bwdData != nullptr ? m_bwdData[i].memory : 0;}
++      float getTime(int i) {return m_fwd != nullptr ? m_fwd[i].time : m_bwdFilter != nullptr ? m_bwdFilter[i].time : m_bwdData != nullptr ? m_bwdData[i].time : 0;}
++      cudnnStatus_t getStatus(int i) {return m_fwd != nullptr ? m_fwd[i].status : m_bwdFilter != nullptr ? m_bwdFilter[i].status : m_bwdData != nullptr ? m_bwdData[i].status : CUDNN_STATUS_BAD_PARAM;}
++      int getIdx(algoPreference const & algoPref, int const algoCount, size_t memLim = std::numeric_limits<size_t>::max()) {
++         int algoIdx{0};
++         if (algoPref == algoPreference::fastest) {  // prefer fastest
++            float temp_runtime{std::numeric_limits<float>::max()};
++            for (int i = 0; i < algoCount; ++i) {
++               if (getStatus(i) == CUDNN_STATUS_SUCCESS && getTime(i) < temp_runtime) {
++                  temp_runtime = getTime(i);
++                  algoIdx = i;
++               }
+             }
+-         }
+-      } else if (algoPref == algoPreference::workspace_limit) {  // constrain to workspace size
+-         float temp_runtime{std::numeric_limits<float>::max()};
+-         for (int i = 0; i < algoCount; ++i) {
+-            if (perfResults.fwd[i].status == CUDNN_STATUS_SUCCESS && perfResults.fwd[i].time < temp_runtime && perfResults.fwd[i].memory <= memLim) {
+-               temp_runtime = perfResults.fwd[i].time;
+-               algoIdx = i;
++         } else if (algoPref == algoPreference::workspace_limit) {  // constrain to workspace size
++            float temp_runtime{std::numeric_limits<float>::max()};
++            for (int i = 0; i < algoCount; ++i) {
++               if (getStatus(i) == CUDNN_STATUS_SUCCESS && getTime(i) < temp_runtime && getMemory(i) <= memLim) {
++                  temp_runtime = getTime(i);
++                  algoIdx = i;
++               }
+             }
+-         }
+-      } else {  // prefer smallest workspace size
+-         size_t temp_memsize{std::numeric_limits<size_t>::max()};
+-         for (int i = 0; i < algoCount; ++i) {
+-            if (perfResults.fwd[i].status == CUDNN_STATUS_SUCCESS && perfResults.fwd[i].memory < temp_memsize) {
+-               temp_memsize = perfResults.fwd[i].memory;
+-               algoIdx = i;
++         } else {  // prefer smallest workspace size
++            size_t temp_memsize{std::numeric_limits<size_t>::max()};
++            for (int i = 0; i < algoCount; ++i) {
++               if (getStatus(i) == CUDNN_STATUS_SUCCESS && getMemory(i) < temp_memsize) {
++                  temp_memsize = getMemory(i);
++                  algoIdx = i;
++               }
+             }
+          }
+-      }
+-      return algoIdx;
++         return algoIdx;
++      };
++    private:
++      LocalPerf();
++      // these three types are absolutely equivalent
++      // and one can access whichever was set to get the info
++      cudnnConvolutionFwdAlgoPerf_t * m_fwd = nullptr;  // default to null so the getters' checks are well-defined
++      cudnnConvolutionBwdFilterAlgoPerf_t * m_bwdFilter = nullptr;
++      cudnnConvolutionBwdDataAlgoPerf_t * m_bwdData = nullptr;
+    };
+ #else
+    // More detailed alternative: cudnnFindConvolutionForwardAlgorithm (only option in newer cuDNN versions)
+@@ -502,8 +511,8 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &convWorkspace,
+    //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   LocalPerf_t fwdPerfResults{convFwdPerfResults};
+-   convWorkspace->AlgorithmForward = convFwdPerfResults[choose_algo(algoChoice, algoCount, fwdPerfResults, memLimit)].algo;
++   LocalPerf fwdPerfResults{convFwdPerfResults};
++   convWorkspace->AlgorithmForward = convFwdPerfResults[fwdPerfResults.getIdx(algoChoice, algoCount, memLimit)].algo;
+ #else
+    CUDNNCHECK(cudnnGetConvolutionForwardAlgorithm(
+       cudnnHandle, inputTensorDescriptor, convDescriptors->WeightsDescriptor, convDescriptors->LayerDescriptor,
+@@ -594,8 +603,8 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &convWorkspace,
+    //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   LocalPerf_t bwdDataPerfResults{convBwdDataPerfResults};
+-   convWorkspace->AlgorithmBackward = convBwdDataPerfResults[choose_algo(algoChoice, algoCount, bwdDataPerfResults, memLimit)].algo;
++   LocalPerf bwdDataPerfResults{convBwdDataPerfResults};
++   convWorkspace->AlgorithmBackward = convBwdDataPerfResults[bwdDataPerfResults.getIdx(algoChoice, algoCount, memLimit)].algo;
+ #else
+    CUDNNCHECK(cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle,
+                                                       convDescriptors->WeightsDescriptor,
+@@ -669,8 +678,8 @@ void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
+    //    &convWorkspace,
+    //    memLimit));  // use memLimit for workspace size
+    // instead choose either fastest or lowest memory algo as per preference
+-   LocalPerf_t bwdFilterPerfResults{convBwdFilterPerfResults};
+-   convWorkspace->HelperAlgorithm = convBwdFilterPerfResults[choose_algo(algoChoice, algoCount, bwdFilterPerfResults, memLimit)].algo;
++   LocalPerf bwdFilterPerfResults{convBwdFilterPerfResults};
++   convWorkspace->HelperAlgorithm = convBwdFilterPerfResults[bwdFilterPerfResults.getIdx(algoChoice, algoCount, memLimit)].algo;
+ #else
+    CUDNNCHECK(cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle,
+                                                          activationBackwardDescriptor,

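The LocalPerf helper above picks an algorithm index out of the perf results
returned by the cudnnFindConvolution*Algorithm calls, preferring either the
fastest run, the fastest run that fits a workspace limit, or the smallest
workspace. Below is a standalone sketch of that selection policy; MockPerf
and chooseAlgo are hypothetical stand-ins for the cudnnConvolution*AlgoPerf_t
structs and LocalPerf::getIdx, so the sketch builds without cuDNN. Note that
unlike the earlier union-based LocalPerf_t, whose lambda always read through
the .fwd member no matter which pointer had been stored, the class dispatches
on the pointer that was actually set.

#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

enum class AlgoPreference { no_workspace, fastest, workspace_limit };

struct MockPerf {        // mirrors the fields the selection actually reads
   int algo;             // algorithm id
   bool ok;              // stands in for status == CUDNN_STATUS_SUCCESS
   float time;           // measured runtime in ms
   std::size_t memory;   // required workspace in bytes
};

int chooseAlgo(const std::vector<MockPerf> &results, AlgoPreference pref,
               std::size_t memLimit = std::numeric_limits<std::size_t>::max())
{
   int best = 0;
   float bestTime = std::numeric_limits<float>::max();
   std::size_t bestMem = std::numeric_limits<std::size_t>::max();
   for (int i = 0; i < static_cast<int>(results.size()); ++i) {
      if (!results[i].ok) continue;  // skip failed probes
      if (pref == AlgoPreference::fastest) {
         if (results[i].time < bestTime) { bestTime = results[i].time; best = i; }
      } else if (pref == AlgoPreference::workspace_limit) {
         if (results[i].memory <= memLimit && results[i].time < bestTime) {
            bestTime = results[i].time;
            best = i;
         }
      } else {  // no_workspace: smallest workspace among successful probes
         if (results[i].memory < bestMem) { bestMem = results[i].memory; best = i; }
      }
   }
   return best;
}

int main()
{
   std::vector<MockPerf> perf{{0, true, 1.2f, 1u << 20},   // 1 MiB, slower
                              {1, true, 0.8f, 8u << 20},   // 8 MiB, fastest
                              {2, false, 0.5f, 0}};        // failed probe
   std::cout << chooseAlgo(perf, AlgoPreference::fastest) << '\n';                   // 1
   std::cout << chooseAlgo(perf, AlgoPreference::workspace_limit, 1u << 20) << '\n'; // 0
   std::cout << chooseAlgo(perf, AlgoPreference::no_workspace) << '\n';              // 0
}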
Deleted: add_missing_include_ROOT-10315.patch
===================================================================
--- add_missing_include_ROOT-10315.patch	2020-07-23 20:13:52 UTC (rev 665089)
+++ add_missing_include_ROOT-10315.patch	2020-07-23 20:24:09 UTC (rev 665090)
@@ -1,22 +0,0 @@
-From e31986c6523253f3160753b202e22a1c321e2b1a Mon Sep 17 00:00:00 2001
-From: Sergey Linev <S.Linev at gsi.de>
-Date: Thu, 12 Sep 2019 14:06:13 +0200
-Subject: [PATCH] [rbrowser] add missing include ROOT-10315
-
----
- gui/browserv7/inc/ROOT/RBrowserItem.hxx | 3 +++
- 1 file changed, 3 insertions(+)
-
-diff --git a/gui/browserv7/inc/ROOT/RBrowserItem.hxx b/gui/browserv7/inc/ROOT/RBrowserItem.hxx
-index b21bc87b4bf..e9ae97e90c0 100644
---- a/gui/browserv7/inc/ROOT/RBrowserItem.hxx
-+++ b/gui/browserv7/inc/ROOT/RBrowserItem.hxx
-@@ -17,6 +17,9 @@
- #ifndef ROOT7_RBrowserItem
- #define ROOT7_RBrowserItem
- 
-+#include <string>
-+#include <vector>
-+
- namespace ROOT {
- namespace Experimental {

Deleted: fix_relative_paths.patch
===================================================================
--- fix_relative_paths.patch	2020-07-23 20:13:52 UTC (rev 665089)
+++ fix_relative_paths.patch	2020-07-23 20:24:09 UTC (rev 665090)
@@ -1,39 +0,0 @@
-diff --git a/cmake/modules/RootConfiguration.cmake b/cmake/modules/RootConfiguration.cmake
-index 41a9cffc78..1c70ce8e39 100644
---- a/cmake/modules/RootConfiguration.cmake
-+++ b/cmake/modules/RootConfiguration.cmake
-@@ -678,22 +678,30 @@ file(RELATIVE_PATH ROOT_CMAKE_TO_INCLUDE_DIR "${CMAKE_INSTALL_FULL_CMAKEDIR}" "$
- file(RELATIVE_PATH ROOT_CMAKE_TO_LIB_DIR "${CMAKE_INSTALL_FULL_CMAKEDIR}" "${CMAKE_INSTALL_FULL_LIBDIR}")
- file(RELATIVE_PATH ROOT_CMAKE_TO_BIN_DIR "${CMAKE_INSTALL_FULL_CMAKEDIR}" "${CMAKE_INSTALL_FULL_BINDIR}")
- 
-+# '_' prefixed variables are used to construct the paths,
-+# while the normal variants evaluate to full paths at runtime
- set(ROOT_INCLUDE_DIR_SETUP "
- # ROOT configured for the install with relative paths, so use these
--get_filename_component(ROOT_INCLUDE_DIRS \"\${_thisdir}/${ROOT_CMAKE_TO_INCLUDE_DIR}\" ABSOLUTE)
-+get_filename_component(_ROOT_INCLUDE_DIRS \"\${_thisdir}/${ROOT_CMAKE_TO_INCLUDE_DIR}\" REALPATH)
-+# resolve relative paths to absolute system paths
-+get_filename_component(ROOT_INCLUDE_DIRS \"\${_ROOT_INCLUDE_DIRS}\" REALPATH)
- ")
- set(ROOT_LIBRARY_DIR_SETUP "
- # ROOT configured for the install with relative paths, so use these
--get_filename_component(ROOT_LIBRARY_DIR \"\${_thisdir}/${ROOT_CMAKE_TO_LIB_DIR}\" ABSOLUTE)
-+get_filename_component(_ROOT_LIBRARY_DIR \"\${_thisdir}/${ROOT_CMAKE_TO_LIB_DIR}\" REALPATH)
-+# resolve relative paths to absolute system paths
-+get_filename_component(ROOT_LIBRARY_DIR \"\${_ROOT_LIBRARY_DIR}\" REALPATH)
- ")
- set(ROOT_BINDIR_SETUP "
- # ROOT configured for the install with relative paths, so use these
--get_filename_component(ROOT_BINDIR \"\${_thisdir}/${ROOT_CMAKE_TO_BIN_DIR}\" ABSOLUTE)
-+get_filename_component(_ROOT_BINDIR \"\${_thisdir}/${ROOT_CMAKE_TO_BIN_DIR}\" REALPATH)
-+# resolve relative paths to absolute system paths
-+get_filename_component(ROOT_BINDIR \"\${_ROOT_BINDIR}\" REALPATH)
- ")
- # Deprecated value ROOT_BINARY_DIR
- set(ROOT_BINARY_DIR_SETUP "
- # Deprecated value, please don't use it and use ROOT_BINDIR instead.
--get_filename_component(ROOT_BINARY_DIR \"\${ROOT_BINDIR}\" ABSOLUTE)
-+get_filename_component(ROOT_BINARY_DIR \"\${ROOT_BINDIR}\" REALPATH)
- ")
- 
- # used by ROOTConfig.cmake from the build directory

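fix_relative_paths.patch is dropped as well. It switched the
get_filename_component() calls in RootConfiguration.cmake from ABSOLUTE to
REALPATH so that the exported ROOT_* directories resolve symlinks down to
real system paths. As a rough C++17 analogy only (the actual fix is pure
CMake; std::filesystem is used here just to illustrate the distinction):

#include <filesystem>
#include <iostream>

namespace fs = std::filesystem;

int main()
{
   // Hypothetical relative install path, like the ones the CMake code builds.
   fs::path rel{"lib/../share/root"};
   // ABSOLUTE is roughly: make absolute and collapse "..", symlinks untouched.
   std::cout << fs::absolute(rel).lexically_normal() << '\n';
   // REALPATH additionally resolves symlinks, as far as the path exists.
   std::cout << fs::weakly_canonical(fs::absolute(rel)) << '\n';
}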
Modified: nbman-for-arch.patch
===================================================================
--- nbman-for-arch.patch	2020-07-23 20:13:52 UTC (rev 665089)
+++ nbman-for-arch.patch	2020-07-23 20:24:09 UTC (rev 665090)
@@ -1,6 +1,6 @@
-diff -aur root-6.20.04-old/main/src/nbmain.cxx root-6.20.04-new/main/src/nbmain.cxx
---- root-6.20.04-old/main/src/nbmain.cxx	2020-04-14 00:54:43.902408710 +0300
-+++ root-6.20.04-new/main/src/nbmain.cxx	2020-04-14 01:00:48.022396130 +0300
+diff --color -aur root-6.22.00-old/main/src/nbmain.cxx root-6.22.00-new/main/src/nbmain.cxx
+--- root-6.22.00-old/main/src/nbmain.cxx	2020-07-20 15:26:53.983725609 +0300
++++ root-6.22.00-new/main/src/nbmain.cxx	2020-07-20 15:29:53.940386060 +0300
 @@ -33,10 +33,6 @@
  #define NB_OPT             "notebook"
  #define JUPYTER_CONF_DIR_V "JUPYTER_CONFIG_DIR"
@@ -9,10 +9,10 @@
 -#define ROOTNB_DIR         ".rootnb"
 -#define COMMIT_FILE        ".rootcommit"
 -#define JUPYTER_CONFIG     "jupyter_notebook_config.py"
- 
+
  using namespace std;
- 
-@@ -49,160 +45,12 @@
+
+@@ -46,161 +46,12 @@
  #endif
  
  ////////////////////////////////////////////////////////////////////////////////
@@ -60,10 +60,11 @@
 -
 -   // Copy files in source to dest
 -   TSystemDirectory dir(source.c_str(), source.c_str());
--   TList *files = dir.GetListOfFiles();
+-   std::unique_ptr<TList> files;
+-   files.reset(dir.GetListOfFiles());
 -   if (files) {
 -      TSystemFile *file;
--      TListIter it(files);
+-      TListIter it(files.get());
 -      while ((file = (TSystemFile*)it())) {
 -         TString s = file->GetName();
 -         string fname(s.Data());


