[arch-commits] Commit in python-pytorch/repos/community-x86_64 (16 files)
Sven-Hendrik Haase
svenstaro at gemini.archlinux.org
Sun Mar 13 06:02:00 UTC 2022
Date: Sunday, March 13, 2022 @ 06:01:58
Author: svenstaro
Revision: 1150467
archrelease: copy trunk to community-x86_64
Added:
python-pytorch/repos/community-x86_64/PKGBUILD
(from rev 1150466, python-pytorch/trunk/PKGBUILD)
python-pytorch/repos/community-x86_64/ffmpeg4.4.patch
(from rev 1150466, python-pytorch/trunk/ffmpeg4.4.patch)
python-pytorch/repos/community-x86_64/fix-building-for-torchvision.patch
(from rev 1150466, python-pytorch/trunk/fix-building-for-torchvision.patch)
python-pytorch/repos/community-x86_64/fix_include_system.patch
(from rev 1150466, python-pytorch/trunk/fix_include_system.patch)
python-pytorch/repos/community-x86_64/test.py
(from rev 1150466, python-pytorch/trunk/test.py)
python-pytorch/repos/community-x86_64/use-system-libuv.patch
(from rev 1150466, python-pytorch/trunk/use-system-libuv.patch)
Deleted:
python-pytorch/repos/community-x86_64/66219.patch
python-pytorch/repos/community-x86_64/PKGBUILD
python-pytorch/repos/community-x86_64/ffmpeg4.4.patch
python-pytorch/repos/community-x86_64/fix-building-for-torchvision.patch
python-pytorch/repos/community-x86_64/fix-jit-frontend-nullptr-deref.patch
python-pytorch/repos/community-x86_64/fix_c10.patch
python-pytorch/repos/community-x86_64/fix_include_system.patch
python-pytorch/repos/community-x86_64/fix_old_nnapi_lite_interpreter_config.patch
python-pytorch/repos/community-x86_64/test.py
python-pytorch/repos/community-x86_64/use-system-libuv.patch
---------------------------------------------+
66219.patch | 3002 --------------------------
PKGBUILD | 616 ++---
ffmpeg4.4.patch | 102
fix-building-for-torchvision.patch | 50
fix-jit-frontend-nullptr-deref.patch | 12
fix_c10.patch | 12
fix_include_system.patch | 26
fix_old_nnapi_lite_interpreter_config.patch | 33
test.py | 14
use-system-libuv.patch | 26
10 files changed, 403 insertions(+), 3490 deletions(-)
Deleted: 66219.patch
===================================================================
--- 66219.patch 2022-03-13 06:01:35 UTC (rev 1150466)
+++ 66219.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -1,3002 +0,0 @@
-From 55473c81535db8890d35e29cff852b737954ce80 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Wed, 6 Oct 2021 11:52:58 -0700
-Subject: [PATCH 01/30] Refactor cub namespace
-
----
- aten/src/ATen/cuda/cub.cuh | 102 +++++++++---------
- aten/src/ATen/cuda/cub_definitions.cuh | 17 +++
- caffe2/core/context_gpu.cu | 1 +
- caffe2/operators/accuracy_op.cu | 1 +
- caffe2/operators/affine_channel_op.cu | 1 +
- caffe2/operators/arg_ops.cu | 2 +-
- caffe2/operators/batch_moments_op.cu | 1 +
- caffe2/operators/batch_sparse_to_dense_op.cu | 1 +
- caffe2/operators/boolean_mask_ops.cu | 2 +-
- caffe2/operators/cross_entropy_op.cu | 1 +
- caffe2/operators/distance_op.cu | 1 +
- caffe2/operators/elementwise_div_op.cu | 2 +-
- caffe2/operators/elementwise_linear_op.cu | 1 +
- caffe2/operators/elementwise_mul_op.cu | 2 +-
- caffe2/operators/elementwise_ops.cu | 1 +
- caffe2/operators/find_op.cu | 1 +
- caffe2/operators/generate_proposals_op.cu | 2 +-
- caffe2/operators/normalize_ops.cu | 1 +
- caffe2/operators/one_hot_ops.cu | 1 +
- caffe2/operators/pack_segments.cu | 2 +-
- caffe2/operators/prelu_op.cu | 1 +
- caffe2/operators/reduce_front_back_max_ops.cu | 1 +
- .../reduce_front_back_sum_mean_ops.cu | 1 +
- caffe2/operators/reduction_ops.cu | 2 +-
- caffe2/operators/rmac_regions_op.cu | 7 ++
- caffe2/operators/segment_reduction_op_gpu.cuh | 1 +
- caffe2/operators/sequence_ops.cu | 2 +-
- caffe2/operators/softmax_ops.cu | 1 +
- .../operators/spatial_batch_norm_op_impl.cuh | 2 +-
- caffe2/sgd/adagrad_fused_op_gpu.cu | 1 +
- caffe2/sgd/adagrad_op_gpu.cu | 1 +
- caffe2/sgd/adam_op_gpu.cu | 1 +
- caffe2/utils/math/reduce.cu | 2 +-
- caffe2/utils/math/reduce.cuh | 2 +-
- caffe2/utils/math_gpu.cu | 2 +-
- cmake/Dependencies.cmake | 5 +
- 36 files changed, 115 insertions(+), 60 deletions(-)
- create mode 100644 aten/src/ATen/cuda/cub_definitions.cuh
-
-diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
-index 5d8ae777ebef..39938efc48be 100644
---- a/aten/src/ATen/cuda/cub.cuh
-+++ b/aten/src/ATen/cuda/cub.cuh
-@@ -5,16 +5,24 @@
- #include <iterator>
- #include <limits>
-
--// include cub in a safe manner, see:
--// https://github.com/pytorch/pytorch/pull/55292
-+#include <ATen/cuda/cub_definitions.cuh>
-+
-+#if CUB_SUPPORTS_WRAPPED_NAMESPACE()
-+
-+#include <cub/cub.cuh>
-+
-+#else
-+
- #undef CUB_NS_POSTFIX //undef to avoid redefinition warnings
- #undef CUB_NS_PREFIX
--#define CUB_NS_PREFIX namespace at { namespace cuda { namespace detail {
--#define CUB_NS_POSTFIX }}}
-+#define CUB_NS_PREFIX namespace at_cuda_detail {
-+#define CUB_NS_POSTFIX }
- #include <cub/cub.cuh>
- #undef CUB_NS_POSTFIX
- #undef CUB_NS_PREFIX
-
-+#endif
-+
- #include <ATen/cuda/Exceptions.h>
- #include <c10/cuda/CUDACachingAllocator.h>
- #include <c10/cuda/CUDAStream.h>
-@@ -33,16 +41,41 @@
- #define NO_ROCM(x)
- #else
- #define NO_ROCM(x) x
-+#endif
-
--namespace at { namespace native {
-+#if !defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()
-
--namespace cub = at::cuda::detail::cub;
-+namespace at_cuda_detail {
-+// backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16
-
--}}
-+template <>
-+struct cub::FpLimits<c10::BFloat16>
-+{
-+ static __host__ __device__ __forceinline__ c10::BFloat16 Max() {
-+ unsigned short max_word = 0x7F7F;
-+ return reinterpret_cast<c10::BFloat16&>(max_word);
-+ }
-+
-+ static __host__ __device__ __forceinline__ c10::BFloat16 Lowest() {
-+ unsigned short lowest_word = 0xFF7F;
-+ return reinterpret_cast<c10::BFloat16&>(lowest_word);
-+ }
-+};
-+
-+template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOATING_POINT, true, false, unsigned short, c10::BFloat16> {};
-+}
- #endif
-
-+namespace at { namespace native {
-+namespace cub = at_cuda_detail::cub;
-+}}
-+namespace caffe2 {
-+namespace cub = at_cuda_detail::cub;
-+}
-+
- namespace at {
- namespace cuda {
-+namespace cub {
-
- namespace detail {
-
-@@ -55,44 +88,17 @@ struct cuda_type<c10::Half> {
- using type = __half;
- };
-
--#if defined(CUDA_VERSION) && CUDA_VERSION >= 11050
--// cub sort support for __nv_bfloat16 is added to cub 1.13 in
--// https://github.com/NVIDIA/cub/pull/306 and according to
--// https://github.com/NVIDIA/cub#releases, 1.13 is included in
--// CUDA Toolkit 11.5
-+#if CUB_SUPPORTS_NV_BFLOAT16()
-
--// waiting for https://github.com/NVIDIA/cub/pull/306 to land on CUDA
- template<>
- struct cuda_type<c10::BFloat16> {
- using type = __nv_bfloat16;
- };
-
--#elif !defined(__HIP_PLATFORM_HCC__)
--
--// backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16
--
--template <>
--struct cub::FpLimits<c10::BFloat16>
--{
-- static __host__ __device__ __forceinline__ c10::BFloat16 Max() {
-- unsigned short max_word = 0x7F7F;
-- return reinterpret_cast<c10::BFloat16&>(max_word);
-- }
--
-- static __host__ __device__ __forceinline__ c10::BFloat16 Lowest() {
-- unsigned short lowest_word = 0xFF7F;
-- return reinterpret_cast<c10::BFloat16&>(lowest_word);
-- }
--};
--
--template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOATING_POINT, true, false, unsigned short, c10::BFloat16> {};
--
- #endif
-
- } // namespace detail
-
--namespace cub {
--
- inline int get_num_bits(uint64_t max_key) {
- int num_bits = 1;
- while (max_key > 1) {
-@@ -115,11 +121,11 @@ static inline void sort_keys(
- key_t_ *keys_out_ = reinterpret_cast<key_t_*>(keys_out);
-
- if (descending) {
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceRadixSort::SortKeysDescending,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortKeysDescending,
- keys_in_, keys_out_, n,
- begin_bit, end_bit, c10::cuda::getCurrentCUDAStream());
- } else {
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceRadixSort::SortKeys,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortKeys,
- keys_in_, keys_out_, n,
- begin_bit, end_bit, c10::cuda::getCurrentCUDAStream());
- }
-@@ -147,11 +153,11 @@ static inline void sort_pairs(
- key_t_ *keys_out_ = reinterpret_cast<key_t_*>(keys_out);
-
- if (descending) {
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceRadixSort::SortPairsDescending,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortPairsDescending,
- keys_in_, keys_out_, values_in, values_out, n,
- begin_bit, end_bit, c10::cuda::getCurrentCUDAStream());
- } else {
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceRadixSort::SortPairs,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortPairs,
- keys_in_, keys_out_, values_in, values_out, n,
- begin_bit, end_bit, c10::cuda::getCurrentCUDAStream());
- }
-@@ -183,12 +189,12 @@ static inline void segmented_sort_pairs(
- key_t_ *keys_out_ = reinterpret_cast<key_t_*>(keys_out);
-
- if (descending) {
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceSegmentedRadixSort::SortPairsDescending,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSegmentedRadixSort::SortPairsDescending,
- keys_in_, keys_out_, values_in, values_out,
- num_elements, num_segments, begin_offsets, end_offsets,
- begin_bit, end_bit, c10::cuda::getCurrentCUDAStream());
- } else {
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceSegmentedRadixSort::SortPairs,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSegmentedRadixSort::SortPairs,
- keys_in_, keys_out_, values_in, values_out,
- num_elements, num_segments, begin_offsets, end_offsets,
- begin_bit, end_bit, c10::cuda::getCurrentCUDAStream());
-@@ -240,7 +246,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
- // so split at int_max/2
- constexpr int max_cub_size = std::numeric_limits<int>::max() / 2 + 1; // 2**30
- int size_cub = std::min<int64_t>(num_items, max_cub_size);
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceScan::InclusiveScan,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan,
- input,
- output,
- scan_op,
-@@ -260,7 +266,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
- first_elem_ptr,
- scan_op);
- C10_CUDA_KERNEL_LAUNCH_CHECK();
-- using ArgIndexInputIterator = NO_ROCM(detail)::cub::ArgIndexInputIterator<InputIteratorT>;
-+ using ArgIndexInputIterator = NO_ROCM(at_cuda_detail)::cub::ArgIndexInputIterator<InputIteratorT>;
- using tuple = typename ArgIndexInputIterator::value_type;
- auto input_iter_transform = [=] __device__ (const tuple &x)->input_t {
- if (x.key == 0) {
-@@ -269,9 +275,9 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
- return x.value;
- }
- };
-- auto input_ = NO_ROCM(detail)::cub::TransformInputIterator<input_t, decltype(input_iter_transform), ArgIndexInputIterator>(
-+ auto input_ = NO_ROCM(at_cuda_detail)::cub::TransformInputIterator<input_t, decltype(input_iter_transform), ArgIndexInputIterator>(
- ArgIndexInputIterator(input + i), input_iter_transform);
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceScan::InclusiveScan,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan,
- input_,
- output + i,
- scan_op,
-@@ -287,7 +293,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
- // so split at int_max/2
- constexpr int max_cub_size = std::numeric_limits<int>::max() / 2 + 1; // 2**30
- int size_cub = std::min<int64_t>(num_items, max_cub_size);
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceScan::ExclusiveScan,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan,
- input,
- output,
- scan_op,
-@@ -309,7 +315,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
- C10_CUDA_KERNEL_LAUNCH_CHECK();
- auto input_ = impl::chained_iterator<InitValueT, InputIteratorT>{
- input + i, first_elem_ptr};
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceScan::InclusiveScan,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan,
- input_,
- output + i,
- scan_op,
-@@ -322,7 +328,7 @@ template<typename InputIteratorT , typename OutputIteratorT , typename NumSelect
- inline void unique(InputIteratorT input, OutputIteratorT output, NumSelectedIteratorT num_selected_out, int64_t num_items) {
- TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
- "cub unique does not support more than INT_MAX elements");
-- CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceSelect::Unique,
-+ CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::Unique,
- input, output, num_selected_out, num_items, at::cuda::getCurrentCUDAStream());
- }
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-new file mode 100644
-index 000000000000..a7694fda4168
---- /dev/null
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -0,0 +1,17 @@
-+#include <cub/version.cuh>
-+
-+// cub sort support for __nv_bfloat16 is added to cub 1.13 in:
-+// https://github.com/NVIDIA/cub/pull/306
-+#if CUB_VERSION >= 101300
-+#define CUB_SUPPORTS_NV_BFLOAT16() true
-+#elif
-+#define CUB_SUPPORTS_NV_BFLOAT16() false
-+#endif
-+
-+// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.14 in:
-+// https://github.com/NVIDIA/cub/pull/326
-+#if CUB_VERSION >= 101400
-+#define CUB_SUPPORTS_WRAPPED_NAMESPACE() true
-+#elif
-+#define CUB_SUPPORTS_WRAPPED_NAMESPACE() false
-+#endif
-\ No newline at end of file
-diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
-index c2b89945ada9..475ed61ab4f7 100644
---- a/caffe2/core/context_gpu.cu
-+++ b/caffe2/core/context_gpu.cu
-@@ -4,6 +4,7 @@
- #include <string>
- #include <unordered_map>
-
-+#include <ATen/cuda/cub.cuh>
- #include <ATen/Context.h>
- #include <c10/cuda/CUDAFunctions.h>
- #include <c10/cuda/CUDACachingAllocator.h>
-diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu
-index f06663d71a90..7ad2b09c238a 100644
---- a/caffe2/operators/accuracy_op.cu
-+++ b/caffe2/operators/accuracy_op.cu
-@@ -3,6 +3,7 @@
- #include "caffe2/utils/GpuAtomics.cuh"
- #include "caffe2/utils/math.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu
-index adf4ac55c0fc..f3d9e22c7e8c 100644
---- a/caffe2/operators/affine_channel_op.cu
-+++ b/caffe2/operators/affine_channel_op.cu
-@@ -1,5 +1,6 @@
- #include "caffe2/operators/affine_channel_op.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/arg_ops.cu b/caffe2/operators/arg_ops.cu
-index 7e90d25b836b..fbefe0774376 100644
---- a/caffe2/operators/arg_ops.cu
-+++ b/caffe2/operators/arg_ops.cu
-@@ -2,8 +2,8 @@
-
- #include <limits>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/batch_moments_op.cu b/caffe2/operators/batch_moments_op.cu
-index 4b693b5c04e2..65c43200e5bd 100644
---- a/caffe2/operators/batch_moments_op.cu
-+++ b/caffe2/operators/batch_moments_op.cu
-@@ -1,5 +1,6 @@
- #include "caffe2/operators/batch_moments_op.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/batch_sparse_to_dense_op.cu b/caffe2/operators/batch_sparse_to_dense_op.cu
-index aea2035a5d3d..2cb09deb8668 100644
---- a/caffe2/operators/batch_sparse_to_dense_op.cu
-+++ b/caffe2/operators/batch_sparse_to_dense_op.cu
-@@ -1,5 +1,6 @@
- #include "caffe2/operators/batch_sparse_to_dense_op.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/device/device_scan.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu
-index 214b7c13ba3c..c87688f51d64 100644
---- a/caffe2/operators/boolean_mask_ops.cu
-+++ b/caffe2/operators/boolean_mask_ops.cu
-@@ -3,7 +3,7 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/boolean_mask_ops.h"
-
--#include <cub/cub.cuh>
-+#include <ATen/cuda/cub.cuh>
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu
-index 380e80399fc3..95f3ffddbf1f 100644
---- a/caffe2/operators/cross_entropy_op.cu
-+++ b/caffe2/operators/cross_entropy_op.cu
-@@ -1,4 +1,5 @@
- #include <assert.h>
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu
-index 3a8bb337d541..d94691d5a9d9 100644
---- a/caffe2/operators/distance_op.cu
-+++ b/caffe2/operators/distance_op.cu
-@@ -4,6 +4,7 @@
- #include "caffe2/operators/distance_op.h"
- #include "caffe2/utils/conversions.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/elementwise_div_op.cu b/caffe2/operators/elementwise_div_op.cu
-index 42b103a0f110..ca9682326324 100644
---- a/caffe2/operators/elementwise_div_op.cu
-+++ b/caffe2/operators/elementwise_div_op.cu
-@@ -3,8 +3,8 @@
- #include <algorithm>
- #include <functional>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/elementwise_ops_utils.h"
-diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu
-index cc49115bffc5..c1c45263f34c 100644
---- a/caffe2/operators/elementwise_linear_op.cu
-+++ b/caffe2/operators/elementwise_linear_op.cu
-@@ -5,6 +5,7 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/operator_fallback_gpu.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/elementwise_mul_op.cu b/caffe2/operators/elementwise_mul_op.cu
-index bdbf760cf95b..88c3da00edc3 100644
---- a/caffe2/operators/elementwise_mul_op.cu
-+++ b/caffe2/operators/elementwise_mul_op.cu
-@@ -3,8 +3,8 @@
- #include <algorithm>
- #include <functional>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/elementwise_ops_utils.h"
-diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu
-index c9ced33cf806..1ac0426d2ca7 100644
---- a/caffe2/operators/elementwise_ops.cu
-+++ b/caffe2/operators/elementwise_ops.cu
-@@ -1,5 +1,6 @@
- #include "caffe2/operators/elementwise_ops.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_load.cuh>
- #include <cub/block/block_reduce.cuh>
- #include <cub/device/device_reduce.cuh>
-diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu
-index f8ff2bab1637..666df335ce42 100644
---- a/caffe2/operators/find_op.cu
-+++ b/caffe2/operators/find_op.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/find_op.h"
-diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu
-index cab0ad3d0b88..84906a8e8182 100644
---- a/caffe2/operators/generate_proposals_op.cu
-+++ b/caffe2/operators/generate_proposals_op.cu
-@@ -1,4 +1,4 @@
--#include <cub/cub.cuh>
-+#include <ATen/cuda/cub.cuh>
- #include "caffe2/core/context.h"
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/generate_proposals_op.h"
-diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu
-index 26df05308d88..468175df985f 100644
---- a/caffe2/operators/normalize_ops.cu
-+++ b/caffe2/operators/normalize_ops.cu
-@@ -1,5 +1,6 @@
- #include <algorithm>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu
-index e521b3dd09df..86f82f78bb82 100644
---- a/caffe2/operators/one_hot_ops.cu
-+++ b/caffe2/operators/one_hot_ops.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu
-index 7475100fd368..b9ed413d1e7b 100644
---- a/caffe2/operators/pack_segments.cu
-+++ b/caffe2/operators/pack_segments.cu
-@@ -1,4 +1,4 @@
--#include <cub/cub.cuh>
-+#include <ATen/cuda/cub.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/pack_segments.h"
-
-diff --git a/caffe2/operators/prelu_op.cu b/caffe2/operators/prelu_op.cu
-index 745a393f075f..d29882086754 100644
---- a/caffe2/operators/prelu_op.cu
-+++ b/caffe2/operators/prelu_op.cu
-@@ -1,6 +1,7 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/prelu_op.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu
-index ae91f8a6da72..ba62b2eff671 100644
---- a/caffe2/operators/reduce_front_back_max_ops.cu
-+++ b/caffe2/operators/reduce_front_back_max_ops.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/reduce_front_back_max_ops.h"
-diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.cu b/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-index 476596f08425..586c20fe8d8e 100644
---- a/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-+++ b/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/reduce_front_back_sum_mean_ops.h"
-diff --git a/caffe2/operators/reduction_ops.cu b/caffe2/operators/reduction_ops.cu
-index ba55a66de588..0d94fab22a7f 100644
---- a/caffe2/operators/reduction_ops.cu
-+++ b/caffe2/operators/reduction_ops.cu
-@@ -2,7 +2,7 @@
- #include "caffe2/operators/reduction_ops.h"
- #include "caffe2/utils/conversions.h"
-
--#include <cub/cub.cuh>
-+#include <ATen/cuda/cub.cuh>
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu
-index 76c4d012d71a..39cc5fbc988d 100644
---- a/caffe2/operators/rmac_regions_op.cu
-+++ b/caffe2/operators/rmac_regions_op.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-@@ -10,6 +11,9 @@
- #if defined(USE_ROCM)
- namespace rocprim {
- #else
-+#if CUB_SUPPORTS_WRAPPED_NAMESPACE()
-+namespace at_cuda_detail {
-+#endif
- namespace cub {
- #endif
-
-@@ -22,6 +26,9 @@ inline __host__ __device__ bool operator<(
- }
-
- } // namespace cub
-+#if CUB_SUPPORTS_WRAPPED_NAMESPACE()
-+} // namespace at_cuda_detail
-+#endif
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/segment_reduction_op_gpu.cuh b/caffe2/operators/segment_reduction_op_gpu.cuh
-index 8d51196ee138..447617c6e9de 100644
---- a/caffe2/operators/segment_reduction_op_gpu.cuh
-+++ b/caffe2/operators/segment_reduction_op_gpu.cuh
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include <cub/device/device_reduce.cuh>
- #include <cub/device/device_scan.cuh>
-diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu
-index cc34effd3f22..e66d491f85e6 100644
---- a/caffe2/operators/sequence_ops.cu
-+++ b/caffe2/operators/sequence_ops.cu
-@@ -1,6 +1,6 @@
- #include <algorithm>
-
--#include <cub/cub.cuh>
-+#include <ATen/cuda/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/sequence_ops.h"
-diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu
-index 51c0cbc2bf6a..c01fcf3e0a48 100644
---- a/caffe2/operators/softmax_ops.cu
-+++ b/caffe2/operators/softmax_ops.cu
-@@ -1,4 +1,5 @@
- #include <cfloat>
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/spatial_batch_norm_op_impl.cuh b/caffe2/operators/spatial_batch_norm_op_impl.cuh
-index edc076c7d718..f9b9fb58adc8 100644
---- a/caffe2/operators/spatial_batch_norm_op_impl.cuh
-+++ b/caffe2/operators/spatial_batch_norm_op_impl.cuh
-@@ -5,8 +5,8 @@
-
- #include <limits>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/utils/math.h"
-diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu
-index 2c2ad2cf76ae..396da5195125 100644
---- a/caffe2/sgd/adagrad_fused_op_gpu.cu
-+++ b/caffe2/sgd/adagrad_fused_op_gpu.cu
-@@ -2,6 +2,7 @@
- #include <c10/core/GeneratorImpl.h>
- #include <algorithm>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/device/device_radix_sort.cuh>
- #include "caffe2/sgd/adagrad_fused_op_gpu.cuh"
- #include "caffe2/utils/math.h"
-diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu
-index 8abb3376ca87..a6fa842ddc80 100644
---- a/caffe2/sgd/adagrad_op_gpu.cu
-+++ b/caffe2/sgd/adagrad_op_gpu.cu
-@@ -1,5 +1,6 @@
- #include <algorithm>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu
-index 42ab975faacb..4b59836b6a68 100644
---- a/caffe2/sgd/adam_op_gpu.cu
-+++ b/caffe2/sgd/adam_op_gpu.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu
-index fc3e476b288b..20919334da50 100644
---- a/caffe2/utils/math/reduce.cu
-+++ b/caffe2/utils/math/reduce.cu
-@@ -6,8 +6,8 @@
- #include <numeric>
- #include <vector>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include <thrust/execution_policy.h>
- #include <thrust/reduce.h>
-diff --git a/caffe2/utils/math/reduce.cuh b/caffe2/utils/math/reduce.cuh
-index 0c43ad45a379..39ad553eec76 100644
---- a/caffe2/utils/math/reduce.cuh
-+++ b/caffe2/utils/math/reduce.cuh
-@@ -1,8 +1,8 @@
- #ifndef CAFFE2_UTILS_MATH_REDUCE_CUH_
- #define CAFFE2_UTILS_MATH_REDUCE_CUH_
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include "caffe2/core/common_gpu.h"
-
-diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
-index a37d4b744d73..b0a44fed34fb 100644
---- a/caffe2/utils/math_gpu.cu
-+++ b/caffe2/utils/math_gpu.cu
-@@ -7,8 +7,8 @@
- #include <numeric>
- #include <vector>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include <thrust/host_vector.h>
- #include <thrust/device_vector.h>
-diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
-index ca560288a41a..f127d8f2e5b1 100644
---- a/cmake/Dependencies.cmake
-+++ b/cmake/Dependencies.cmake
-@@ -1622,6 +1622,11 @@ if(NOT INTERN_BUILD_MOBILE)
- list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-fPIC")
- endif()
-
-+ # include cub in a safe manner, see:
-+ # https://github.com/pytorch/pytorch/pull/55292
-+ # https://github.com/NVIDIA/cub/releases/tag/1.14.0
-+ list(APPEND CUDA_NVCC_FLAGS "-DCUB_WRAPPED_NAMESPACE=at_cuda_detail")
-+
- if(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
- message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
- list(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1" "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__"
-
-From f0a6afd7f0a5c6210289869282c77a680d7dfa2e Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Wed, 6 Oct 2021 12:16:48 -0700
-Subject: [PATCH 02/30] fix
-
----
- .github/workflows/lint.yml | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
-index 4c469878e63f..af472e00140a 100644
---- a/.github/workflows/lint.yml
-+++ b/.github/workflows/lint.yml
-@@ -97,7 +97,7 @@ jobs:
- - name: Ensure no direct cub include
- if: always()
- run: |
-- (! git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))
-+ (! git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' ':(exclude)aten/src/ATen/cuda/cub_definitions.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))
- - name: Ensure no raw cuda api calls
- if: always()
- run: |
-
-From 8edc6e961f1911f0fe04cb57af13b251f66c9153 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Wed, 6 Oct 2021 12:26:19 -0700
-Subject: [PATCH 03/30] save
-
----
- .github/workflows/lint.yml | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
-index af472e00140a..0b2abe7a7483 100644
---- a/.github/workflows/lint.yml
-+++ b/.github/workflows/lint.yml
-@@ -97,7 +97,7 @@ jobs:
- - name: Ensure no direct cub include
- if: always()
- run: |
-- (! git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' ':(exclude)aten/src/ATen/cuda/cub_definitions.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))
-+ (! git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub*.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))
- - name: Ensure no raw cuda api calls
- if: always()
- run: |
-
-From 0d50954a3747ed0f2c7cbfcd58f17a2a81d5929c Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Wed, 6 Oct 2021 12:32:16 -0700
-Subject: [PATCH 04/30] fix
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index a7694fda4168..802f67f22a0f 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -14,4 +14,4 @@
- #define CUB_SUPPORTS_WRAPPED_NAMESPACE() true
- #elif
- #define CUB_SUPPORTS_WRAPPED_NAMESPACE() false
--#endif
-\ No newline at end of file
-+#endif
-
-From 2ac5cf508d694995eee4710d9a6499bb3135d324 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Wed, 6 Oct 2021 14:04:40 -0700
-Subject: [PATCH 05/30] fix
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index 802f67f22a0f..1f5628fa9d3a 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -4,7 +4,7 @@
- // https://github.com/NVIDIA/cub/pull/306
- #if CUB_VERSION >= 101300
- #define CUB_SUPPORTS_NV_BFLOAT16() true
--#elif
-+#else
- #define CUB_SUPPORTS_NV_BFLOAT16() false
- #endif
-
-@@ -12,6 +12,6 @@
- // https://github.com/NVIDIA/cub/pull/326
- #if CUB_VERSION >= 101400
- #define CUB_SUPPORTS_WRAPPED_NAMESPACE() true
--#elif
-+#else
- #define CUB_SUPPORTS_WRAPPED_NAMESPACE() false
- #endif
-
-From 815f5a5981919f61aaab2d65597826ddf0495ac9 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Wed, 6 Oct 2021 14:06:32 -0700
-Subject: [PATCH 06/30] fix rocm
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 6 ++++++
- 1 file changed, 6 insertions(+)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index 1f5628fa9d3a..7da3ab716e41 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -1,4 +1,10 @@
-+#pragma once
-+
-+#if !defined(USE_ROCM)
- #include <cub/version.cuh>
-+#else
-+#define CUB_VERSION 0
-+#endif
-
- // cub sort support for __nv_bfloat16 is added to cub 1.13 in:
- // https://github.com/NVIDIA/cub/pull/306
-
-From 02e9ca26bdc658edd1d960dc5d31e277b9b9afc1 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Mon, 11 Oct 2021 10:06:27 -0700
-Subject: [PATCH 07/30] revert caffe2 change
-
----
- caffe2/core/context_gpu.cu | 1 -
- caffe2/operators/accuracy_op.cu | 1 -
- caffe2/operators/affine_channel_op.cu | 1 -
- caffe2/operators/arg_ops.cu | 2 +-
- caffe2/operators/batch_moments_op.cu | 1 -
- caffe2/operators/batch_sparse_to_dense_op.cu | 1 -
- caffe2/operators/boolean_mask_ops.cu | 2 +-
- caffe2/operators/cross_entropy_op.cu | 1 -
- caffe2/operators/distance_op.cu | 1 -
- caffe2/operators/elementwise_div_op.cu | 2 +-
- caffe2/operators/elementwise_linear_op.cu | 1 -
- caffe2/operators/elementwise_mul_op.cu | 2 +-
- caffe2/operators/elementwise_ops.cu | 1 -
- caffe2/operators/find_op.cu | 1 -
- caffe2/operators/generate_proposals_op.cu | 2 +-
- caffe2/operators/normalize_ops.cu | 1 -
- caffe2/operators/one_hot_ops.cu | 1 -
- caffe2/operators/pack_segments.cu | 2 +-
- caffe2/operators/prelu_op.cu | 1 -
- caffe2/operators/reduce_front_back_max_ops.cu | 1 -
- caffe2/operators/reduce_front_back_sum_mean_ops.cu | 1 -
- caffe2/operators/reduction_ops.cu | 2 +-
- caffe2/operators/rmac_regions_op.cu | 7 -------
- caffe2/operators/segment_reduction_op_gpu.cuh | 1 -
- caffe2/operators/sequence_ops.cu | 2 +-
- caffe2/operators/softmax_ops.cu | 1 -
- caffe2/operators/spatial_batch_norm_op_impl.cuh | 2 +-
- caffe2/sgd/adagrad_fused_op_gpu.cu | 1 -
- caffe2/sgd/adagrad_op_gpu.cu | 1 -
- caffe2/sgd/adam_op_gpu.cu | 1 -
- caffe2/utils/math/reduce.cu | 2 +-
- caffe2/utils/math/reduce.cuh | 2 +-
- caffe2/utils/math_gpu.cu | 2 +-
- 33 files changed, 12 insertions(+), 39 deletions(-)
-
-diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
-index 475ed61ab4f7..c2b89945ada9 100644
---- a/caffe2/core/context_gpu.cu
-+++ b/caffe2/core/context_gpu.cu
-@@ -4,7 +4,6 @@
- #include <string>
- #include <unordered_map>
-
--#include <ATen/cuda/cub.cuh>
- #include <ATen/Context.h>
- #include <c10/cuda/CUDAFunctions.h>
- #include <c10/cuda/CUDACachingAllocator.h>
-diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu
-index 7ad2b09c238a..f06663d71a90 100644
---- a/caffe2/operators/accuracy_op.cu
-+++ b/caffe2/operators/accuracy_op.cu
-@@ -3,7 +3,6 @@
- #include "caffe2/utils/GpuAtomics.cuh"
- #include "caffe2/utils/math.h"
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu
-index f3d9e22c7e8c..adf4ac55c0fc 100644
---- a/caffe2/operators/affine_channel_op.cu
-+++ b/caffe2/operators/affine_channel_op.cu
-@@ -1,6 +1,5 @@
- #include "caffe2/operators/affine_channel_op.h"
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/arg_ops.cu b/caffe2/operators/arg_ops.cu
-index fbefe0774376..7e90d25b836b 100644
---- a/caffe2/operators/arg_ops.cu
-+++ b/caffe2/operators/arg_ops.cu
-@@ -2,8 +2,8 @@
-
- #include <limits>
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-+#include <cub/cub.cuh>
-
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/batch_moments_op.cu b/caffe2/operators/batch_moments_op.cu
-index 65c43200e5bd..4b693b5c04e2 100644
---- a/caffe2/operators/batch_moments_op.cu
-+++ b/caffe2/operators/batch_moments_op.cu
-@@ -1,6 +1,5 @@
- #include "caffe2/operators/batch_moments_op.h"
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/batch_sparse_to_dense_op.cu b/caffe2/operators/batch_sparse_to_dense_op.cu
-index 2cb09deb8668..aea2035a5d3d 100644
---- a/caffe2/operators/batch_sparse_to_dense_op.cu
-+++ b/caffe2/operators/batch_sparse_to_dense_op.cu
-@@ -1,6 +1,5 @@
- #include "caffe2/operators/batch_sparse_to_dense_op.h"
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/device/device_scan.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu
-index c87688f51d64..214b7c13ba3c 100644
---- a/caffe2/operators/boolean_mask_ops.cu
-+++ b/caffe2/operators/boolean_mask_ops.cu
-@@ -3,7 +3,7 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/boolean_mask_ops.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include <cub/cub.cuh>
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu
-index 95f3ffddbf1f..380e80399fc3 100644
---- a/caffe2/operators/cross_entropy_op.cu
-+++ b/caffe2/operators/cross_entropy_op.cu
-@@ -1,5 +1,4 @@
- #include <assert.h>
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu
-index d94691d5a9d9..3a8bb337d541 100644
---- a/caffe2/operators/distance_op.cu
-+++ b/caffe2/operators/distance_op.cu
-@@ -4,7 +4,6 @@
- #include "caffe2/operators/distance_op.h"
- #include "caffe2/utils/conversions.h"
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/elementwise_div_op.cu b/caffe2/operators/elementwise_div_op.cu
-index ca9682326324..42b103a0f110 100644
---- a/caffe2/operators/elementwise_div_op.cu
-+++ b/caffe2/operators/elementwise_div_op.cu
-@@ -3,8 +3,8 @@
- #include <algorithm>
- #include <functional>
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-+#include <cub/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/elementwise_ops_utils.h"
-diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu
-index c1c45263f34c..cc49115bffc5 100644
---- a/caffe2/operators/elementwise_linear_op.cu
-+++ b/caffe2/operators/elementwise_linear_op.cu
-@@ -5,7 +5,6 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/operator_fallback_gpu.h"
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/elementwise_mul_op.cu b/caffe2/operators/elementwise_mul_op.cu
-index 88c3da00edc3..bdbf760cf95b 100644
---- a/caffe2/operators/elementwise_mul_op.cu
-+++ b/caffe2/operators/elementwise_mul_op.cu
-@@ -3,8 +3,8 @@
- #include <algorithm>
- #include <functional>
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-+#include <cub/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/elementwise_ops_utils.h"
-diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu
-index 1ac0426d2ca7..c9ced33cf806 100644
---- a/caffe2/operators/elementwise_ops.cu
-+++ b/caffe2/operators/elementwise_ops.cu
-@@ -1,6 +1,5 @@
- #include "caffe2/operators/elementwise_ops.h"
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_load.cuh>
- #include <cub/block/block_reduce.cuh>
- #include <cub/device/device_reduce.cuh>
-diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu
-index 666df335ce42..f8ff2bab1637 100644
---- a/caffe2/operators/find_op.cu
-+++ b/caffe2/operators/find_op.cu
-@@ -1,4 +1,3 @@
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/find_op.h"
-diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu
-index 84906a8e8182..cab0ad3d0b88 100644
---- a/caffe2/operators/generate_proposals_op.cu
-+++ b/caffe2/operators/generate_proposals_op.cu
-@@ -1,4 +1,4 @@
--#include <ATen/cuda/cub.cuh>
-+#include <cub/cub.cuh>
- #include "caffe2/core/context.h"
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/generate_proposals_op.h"
-diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu
-index 468175df985f..26df05308d88 100644
---- a/caffe2/operators/normalize_ops.cu
-+++ b/caffe2/operators/normalize_ops.cu
-@@ -1,6 +1,5 @@
- #include <algorithm>
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu
-index 86f82f78bb82..e521b3dd09df 100644
---- a/caffe2/operators/one_hot_ops.cu
-+++ b/caffe2/operators/one_hot_ops.cu
-@@ -1,4 +1,3 @@
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu
-index b9ed413d1e7b..7475100fd368 100644
---- a/caffe2/operators/pack_segments.cu
-+++ b/caffe2/operators/pack_segments.cu
-@@ -1,4 +1,4 @@
--#include <ATen/cuda/cub.cuh>
-+#include <cub/cub.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/pack_segments.h"
-
-diff --git a/caffe2/operators/prelu_op.cu b/caffe2/operators/prelu_op.cu
-index d29882086754..745a393f075f 100644
---- a/caffe2/operators/prelu_op.cu
-+++ b/caffe2/operators/prelu_op.cu
-@@ -1,7 +1,6 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/prelu_op.h"
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu
-index ba62b2eff671..ae91f8a6da72 100644
---- a/caffe2/operators/reduce_front_back_max_ops.cu
-+++ b/caffe2/operators/reduce_front_back_max_ops.cu
-@@ -1,4 +1,3 @@
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/reduce_front_back_max_ops.h"
-diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.cu b/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-index 586c20fe8d8e..476596f08425 100644
---- a/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-+++ b/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-@@ -1,4 +1,3 @@
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/reduce_front_back_sum_mean_ops.h"
-diff --git a/caffe2/operators/reduction_ops.cu b/caffe2/operators/reduction_ops.cu
-index 0d94fab22a7f..ba55a66de588 100644
---- a/caffe2/operators/reduction_ops.cu
-+++ b/caffe2/operators/reduction_ops.cu
-@@ -2,7 +2,7 @@
- #include "caffe2/operators/reduction_ops.h"
- #include "caffe2/utils/conversions.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include <cub/cub.cuh>
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu
-index 39cc5fbc988d..76c4d012d71a 100644
---- a/caffe2/operators/rmac_regions_op.cu
-+++ b/caffe2/operators/rmac_regions_op.cu
-@@ -1,4 +1,3 @@
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-@@ -11,9 +10,6 @@
- #if defined(USE_ROCM)
- namespace rocprim {
- #else
--#if CUB_SUPPORTS_WRAPPED_NAMESPACE()
--namespace at_cuda_detail {
--#endif
- namespace cub {
- #endif
-
-@@ -26,9 +22,6 @@ inline __host__ __device__ bool operator<(
- }
-
- } // namespace cub
--#if CUB_SUPPORTS_WRAPPED_NAMESPACE()
--} // namespace at_cuda_detail
--#endif
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/segment_reduction_op_gpu.cuh b/caffe2/operators/segment_reduction_op_gpu.cuh
-index 447617c6e9de..8d51196ee138 100644
---- a/caffe2/operators/segment_reduction_op_gpu.cuh
-+++ b/caffe2/operators/segment_reduction_op_gpu.cuh
-@@ -1,4 +1,3 @@
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include <cub/device/device_reduce.cuh>
- #include <cub/device/device_scan.cuh>
-diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu
-index e66d491f85e6..cc34effd3f22 100644
---- a/caffe2/operators/sequence_ops.cu
-+++ b/caffe2/operators/sequence_ops.cu
-@@ -1,6 +1,6 @@
- #include <algorithm>
-
--#include <ATen/cuda/cub.cuh>
-+#include <cub/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/sequence_ops.h"
-diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu
-index c01fcf3e0a48..51c0cbc2bf6a 100644
---- a/caffe2/operators/softmax_ops.cu
-+++ b/caffe2/operators/softmax_ops.cu
-@@ -1,5 +1,4 @@
- #include <cfloat>
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/spatial_batch_norm_op_impl.cuh b/caffe2/operators/spatial_batch_norm_op_impl.cuh
-index f9b9fb58adc8..edc076c7d718 100644
---- a/caffe2/operators/spatial_batch_norm_op_impl.cuh
-+++ b/caffe2/operators/spatial_batch_norm_op_impl.cuh
-@@ -5,8 +5,8 @@
-
- #include <limits>
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-+#include <cub/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/utils/math.h"
-diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu
-index 396da5195125..2c2ad2cf76ae 100644
---- a/caffe2/sgd/adagrad_fused_op_gpu.cu
-+++ b/caffe2/sgd/adagrad_fused_op_gpu.cu
-@@ -2,7 +2,6 @@
- #include <c10/core/GeneratorImpl.h>
- #include <algorithm>
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/device/device_radix_sort.cuh>
- #include "caffe2/sgd/adagrad_fused_op_gpu.cuh"
- #include "caffe2/utils/math.h"
-diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu
-index a6fa842ddc80..8abb3376ca87 100644
---- a/caffe2/sgd/adagrad_op_gpu.cu
-+++ b/caffe2/sgd/adagrad_op_gpu.cu
-@@ -1,6 +1,5 @@
- #include <algorithm>
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu
-index 4b59836b6a68..42ab975faacb 100644
---- a/caffe2/sgd/adam_op_gpu.cu
-+++ b/caffe2/sgd/adam_op_gpu.cu
-@@ -1,4 +1,3 @@
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu
-index 20919334da50..fc3e476b288b 100644
---- a/caffe2/utils/math/reduce.cu
-+++ b/caffe2/utils/math/reduce.cu
-@@ -6,8 +6,8 @@
- #include <numeric>
- #include <vector>
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-+#include <cub/cub.cuh>
-
- #include <thrust/execution_policy.h>
- #include <thrust/reduce.h>
-diff --git a/caffe2/utils/math/reduce.cuh b/caffe2/utils/math/reduce.cuh
-index 39ad553eec76..0c43ad45a379 100644
---- a/caffe2/utils/math/reduce.cuh
-+++ b/caffe2/utils/math/reduce.cuh
-@@ -1,8 +1,8 @@
- #ifndef CAFFE2_UTILS_MATH_REDUCE_CUH_
- #define CAFFE2_UTILS_MATH_REDUCE_CUH_
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-+#include <cub/cub.cuh>
-
- #include "caffe2/core/common_gpu.h"
-
-diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
-index b0a44fed34fb..a37d4b744d73 100644
---- a/caffe2/utils/math_gpu.cu
-+++ b/caffe2/utils/math_gpu.cu
-@@ -7,8 +7,8 @@
- #include <numeric>
- #include <vector>
-
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-+#include <cub/cub.cuh>
-
- #include <thrust/host_vector.h>
- #include <thrust/device_vector.h>
-
-From a9ca6d97ff78a4a7a6a6dca90b21ad24666bf6fe Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Mon, 11 Oct 2021 10:27:36 -0700
-Subject: [PATCH 08/30] save
-
----
- aten/src/ATen/cuda/cub.cuh | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
-index 39938efc48be..23e90a81a16c 100644
---- a/aten/src/ATen/cuda/cub.cuh
-+++ b/aten/src/ATen/cuda/cub.cuh
-@@ -9,6 +9,7 @@
-
- #if CUB_SUPPORTS_WRAPPED_NAMESPACE()
-
-+#define CUB_WRAPPED_NAMESPACE at_cuda_detail
- #include <cub/cub.cuh>
-
- #else
-
-From 3695222a89a011cf58dfdb907515ffc72231f3af Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Mon, 11 Oct 2021 10:51:25 -0700
-Subject: [PATCH 09/30] fix
-
----
- aten/src/ATen/cuda/cub.cuh | 5 ++---
- 1 file changed, 2 insertions(+), 3 deletions(-)
-
-diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
-index 23e90a81a16c..9014d4b7d2ec 100644
---- a/aten/src/ATen/cuda/cub.cuh
-+++ b/aten/src/ATen/cuda/cub.cuh
-@@ -67,12 +67,11 @@ template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOAT
- }
- #endif
-
-+#if !defined(USE_ROCM)
- namespace at { namespace native {
- namespace cub = at_cuda_detail::cub;
- }}
--namespace caffe2 {
--namespace cub = at_cuda_detail::cub;
--}
-+#endif
-
- namespace at {
- namespace cuda {
-
-From 4d94df2cf61598bb6078364a0f628cb0321cdff5 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 07:54:47 -0700
-Subject: [PATCH 10/30] Revert "revert caffe2 change"
-
-This reverts commit 02e9ca26bdc658edd1d960dc5d31e277b9b9afc1.
----
- caffe2/core/context_gpu.cu | 1 +
- caffe2/operators/accuracy_op.cu | 1 +
- caffe2/operators/affine_channel_op.cu | 1 +
- caffe2/operators/arg_ops.cu | 2 +-
- caffe2/operators/batch_moments_op.cu | 1 +
- caffe2/operators/batch_sparse_to_dense_op.cu | 1 +
- caffe2/operators/boolean_mask_ops.cu | 2 +-
- caffe2/operators/cross_entropy_op.cu | 1 +
- caffe2/operators/distance_op.cu | 1 +
- caffe2/operators/elementwise_div_op.cu | 2 +-
- caffe2/operators/elementwise_linear_op.cu | 1 +
- caffe2/operators/elementwise_mul_op.cu | 2 +-
- caffe2/operators/elementwise_ops.cu | 1 +
- caffe2/operators/find_op.cu | 1 +
- caffe2/operators/generate_proposals_op.cu | 2 +-
- caffe2/operators/normalize_ops.cu | 1 +
- caffe2/operators/one_hot_ops.cu | 1 +
- caffe2/operators/pack_segments.cu | 2 +-
- caffe2/operators/prelu_op.cu | 1 +
- caffe2/operators/reduce_front_back_max_ops.cu | 1 +
- caffe2/operators/reduce_front_back_sum_mean_ops.cu | 1 +
- caffe2/operators/reduction_ops.cu | 2 +-
- caffe2/operators/rmac_regions_op.cu | 7 +++++++
- caffe2/operators/segment_reduction_op_gpu.cuh | 1 +
- caffe2/operators/sequence_ops.cu | 2 +-
- caffe2/operators/softmax_ops.cu | 1 +
- caffe2/operators/spatial_batch_norm_op_impl.cuh | 2 +-
- caffe2/sgd/adagrad_fused_op_gpu.cu | 1 +
- caffe2/sgd/adagrad_op_gpu.cu | 1 +
- caffe2/sgd/adam_op_gpu.cu | 1 +
- caffe2/utils/math/reduce.cu | 2 +-
- caffe2/utils/math/reduce.cuh | 2 +-
- caffe2/utils/math_gpu.cu | 2 +-
- 33 files changed, 39 insertions(+), 12 deletions(-)
-
-diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
-index c2b89945ada9..475ed61ab4f7 100644
---- a/caffe2/core/context_gpu.cu
-+++ b/caffe2/core/context_gpu.cu
-@@ -4,6 +4,7 @@
- #include <string>
- #include <unordered_map>
-
-+#include <ATen/cuda/cub.cuh>
- #include <ATen/Context.h>
- #include <c10/cuda/CUDAFunctions.h>
- #include <c10/cuda/CUDACachingAllocator.h>
-diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu
-index f06663d71a90..7ad2b09c238a 100644
---- a/caffe2/operators/accuracy_op.cu
-+++ b/caffe2/operators/accuracy_op.cu
-@@ -3,6 +3,7 @@
- #include "caffe2/utils/GpuAtomics.cuh"
- #include "caffe2/utils/math.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu
-index adf4ac55c0fc..f3d9e22c7e8c 100644
---- a/caffe2/operators/affine_channel_op.cu
-+++ b/caffe2/operators/affine_channel_op.cu
-@@ -1,5 +1,6 @@
- #include "caffe2/operators/affine_channel_op.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/arg_ops.cu b/caffe2/operators/arg_ops.cu
-index 7e90d25b836b..fbefe0774376 100644
---- a/caffe2/operators/arg_ops.cu
-+++ b/caffe2/operators/arg_ops.cu
-@@ -2,8 +2,8 @@
-
- #include <limits>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/batch_moments_op.cu b/caffe2/operators/batch_moments_op.cu
-index 4b693b5c04e2..65c43200e5bd 100644
---- a/caffe2/operators/batch_moments_op.cu
-+++ b/caffe2/operators/batch_moments_op.cu
-@@ -1,5 +1,6 @@
- #include "caffe2/operators/batch_moments_op.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/batch_sparse_to_dense_op.cu b/caffe2/operators/batch_sparse_to_dense_op.cu
-index aea2035a5d3d..2cb09deb8668 100644
---- a/caffe2/operators/batch_sparse_to_dense_op.cu
-+++ b/caffe2/operators/batch_sparse_to_dense_op.cu
-@@ -1,5 +1,6 @@
- #include "caffe2/operators/batch_sparse_to_dense_op.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/device/device_scan.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu
-index 214b7c13ba3c..c87688f51d64 100644
---- a/caffe2/operators/boolean_mask_ops.cu
-+++ b/caffe2/operators/boolean_mask_ops.cu
-@@ -3,7 +3,7 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/boolean_mask_ops.h"
-
--#include <cub/cub.cuh>
-+#include <ATen/cuda/cub.cuh>
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu
-index 380e80399fc3..95f3ffddbf1f 100644
---- a/caffe2/operators/cross_entropy_op.cu
-+++ b/caffe2/operators/cross_entropy_op.cu
-@@ -1,4 +1,5 @@
- #include <assert.h>
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu
-index 3a8bb337d541..d94691d5a9d9 100644
---- a/caffe2/operators/distance_op.cu
-+++ b/caffe2/operators/distance_op.cu
-@@ -4,6 +4,7 @@
- #include "caffe2/operators/distance_op.h"
- #include "caffe2/utils/conversions.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/elementwise_div_op.cu b/caffe2/operators/elementwise_div_op.cu
-index 42b103a0f110..ca9682326324 100644
---- a/caffe2/operators/elementwise_div_op.cu
-+++ b/caffe2/operators/elementwise_div_op.cu
-@@ -3,8 +3,8 @@
- #include <algorithm>
- #include <functional>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/elementwise_ops_utils.h"
-diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu
-index cc49115bffc5..c1c45263f34c 100644
---- a/caffe2/operators/elementwise_linear_op.cu
-+++ b/caffe2/operators/elementwise_linear_op.cu
-@@ -5,6 +5,7 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/operator_fallback_gpu.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/elementwise_mul_op.cu b/caffe2/operators/elementwise_mul_op.cu
-index bdbf760cf95b..88c3da00edc3 100644
---- a/caffe2/operators/elementwise_mul_op.cu
-+++ b/caffe2/operators/elementwise_mul_op.cu
-@@ -3,8 +3,8 @@
- #include <algorithm>
- #include <functional>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/elementwise_ops_utils.h"
-diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu
-index c9ced33cf806..1ac0426d2ca7 100644
---- a/caffe2/operators/elementwise_ops.cu
-+++ b/caffe2/operators/elementwise_ops.cu
-@@ -1,5 +1,6 @@
- #include "caffe2/operators/elementwise_ops.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_load.cuh>
- #include <cub/block/block_reduce.cuh>
- #include <cub/device/device_reduce.cuh>
-diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu
-index f8ff2bab1637..666df335ce42 100644
---- a/caffe2/operators/find_op.cu
-+++ b/caffe2/operators/find_op.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/find_op.h"
-diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu
-index cab0ad3d0b88..84906a8e8182 100644
---- a/caffe2/operators/generate_proposals_op.cu
-+++ b/caffe2/operators/generate_proposals_op.cu
-@@ -1,4 +1,4 @@
--#include <cub/cub.cuh>
-+#include <ATen/cuda/cub.cuh>
- #include "caffe2/core/context.h"
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/generate_proposals_op.h"
-diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu
-index 26df05308d88..468175df985f 100644
---- a/caffe2/operators/normalize_ops.cu
-+++ b/caffe2/operators/normalize_ops.cu
-@@ -1,5 +1,6 @@
- #include <algorithm>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu
-index e521b3dd09df..86f82f78bb82 100644
---- a/caffe2/operators/one_hot_ops.cu
-+++ b/caffe2/operators/one_hot_ops.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu
-index 7475100fd368..b9ed413d1e7b 100644
---- a/caffe2/operators/pack_segments.cu
-+++ b/caffe2/operators/pack_segments.cu
-@@ -1,4 +1,4 @@
--#include <cub/cub.cuh>
-+#include <ATen/cuda/cub.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/pack_segments.h"
-
-diff --git a/caffe2/operators/prelu_op.cu b/caffe2/operators/prelu_op.cu
-index 745a393f075f..d29882086754 100644
---- a/caffe2/operators/prelu_op.cu
-+++ b/caffe2/operators/prelu_op.cu
-@@ -1,6 +1,7 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/prelu_op.h"
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu
-index ae91f8a6da72..ba62b2eff671 100644
---- a/caffe2/operators/reduce_front_back_max_ops.cu
-+++ b/caffe2/operators/reduce_front_back_max_ops.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/reduce_front_back_max_ops.h"
-diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.cu b/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-index 476596f08425..586c20fe8d8e 100644
---- a/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-+++ b/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/reduce_front_back_sum_mean_ops.h"
-diff --git a/caffe2/operators/reduction_ops.cu b/caffe2/operators/reduction_ops.cu
-index ba55a66de588..0d94fab22a7f 100644
---- a/caffe2/operators/reduction_ops.cu
-+++ b/caffe2/operators/reduction_ops.cu
-@@ -2,7 +2,7 @@
- #include "caffe2/operators/reduction_ops.h"
- #include "caffe2/utils/conversions.h"
-
--#include <cub/cub.cuh>
-+#include <ATen/cuda/cub.cuh>
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu
-index 76c4d012d71a..39cc5fbc988d 100644
---- a/caffe2/operators/rmac_regions_op.cu
-+++ b/caffe2/operators/rmac_regions_op.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-@@ -10,6 +11,9 @@
- #if defined(USE_ROCM)
- namespace rocprim {
- #else
-+#if CUB_SUPPORTS_WRAPPED_NAMESPACE()
-+namespace at_cuda_detail {
-+#endif
- namespace cub {
- #endif
-
-@@ -22,6 +26,9 @@ inline __host__ __device__ bool operator<(
- }
-
- } // namespace cub
-+#if CUB_SUPPORTS_WRAPPED_NAMESPACE()
-+} // namespace at_cuda_detail
-+#endif
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/segment_reduction_op_gpu.cuh b/caffe2/operators/segment_reduction_op_gpu.cuh
-index 8d51196ee138..447617c6e9de 100644
---- a/caffe2/operators/segment_reduction_op_gpu.cuh
-+++ b/caffe2/operators/segment_reduction_op_gpu.cuh
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include <cub/device/device_reduce.cuh>
- #include <cub/device/device_scan.cuh>
-diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu
-index cc34effd3f22..e66d491f85e6 100644
---- a/caffe2/operators/sequence_ops.cu
-+++ b/caffe2/operators/sequence_ops.cu
-@@ -1,6 +1,6 @@
- #include <algorithm>
-
--#include <cub/cub.cuh>
-+#include <ATen/cuda/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/sequence_ops.h"
-diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu
-index 51c0cbc2bf6a..c01fcf3e0a48 100644
---- a/caffe2/operators/softmax_ops.cu
-+++ b/caffe2/operators/softmax_ops.cu
-@@ -1,4 +1,5 @@
- #include <cfloat>
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/spatial_batch_norm_op_impl.cuh b/caffe2/operators/spatial_batch_norm_op_impl.cuh
-index edc076c7d718..f9b9fb58adc8 100644
---- a/caffe2/operators/spatial_batch_norm_op_impl.cuh
-+++ b/caffe2/operators/spatial_batch_norm_op_impl.cuh
-@@ -5,8 +5,8 @@
-
- #include <limits>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/utils/math.h"
-diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu
-index 2c2ad2cf76ae..396da5195125 100644
---- a/caffe2/sgd/adagrad_fused_op_gpu.cu
-+++ b/caffe2/sgd/adagrad_fused_op_gpu.cu
-@@ -2,6 +2,7 @@
- #include <c10/core/GeneratorImpl.h>
- #include <algorithm>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/device/device_radix_sort.cuh>
- #include "caffe2/sgd/adagrad_fused_op_gpu.cuh"
- #include "caffe2/utils/math.h"
-diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu
-index 8abb3376ca87..a6fa842ddc80 100644
---- a/caffe2/sgd/adagrad_op_gpu.cu
-+++ b/caffe2/sgd/adagrad_op_gpu.cu
-@@ -1,5 +1,6 @@
- #include <algorithm>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu
-index 42ab975faacb..4b59836b6a68 100644
---- a/caffe2/sgd/adam_op_gpu.cu
-+++ b/caffe2/sgd/adam_op_gpu.cu
-@@ -1,3 +1,4 @@
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu
-index fc3e476b288b..20919334da50 100644
---- a/caffe2/utils/math/reduce.cu
-+++ b/caffe2/utils/math/reduce.cu
-@@ -6,8 +6,8 @@
- #include <numeric>
- #include <vector>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include <thrust/execution_policy.h>
- #include <thrust/reduce.h>
-diff --git a/caffe2/utils/math/reduce.cuh b/caffe2/utils/math/reduce.cuh
-index 0c43ad45a379..39ad553eec76 100644
---- a/caffe2/utils/math/reduce.cuh
-+++ b/caffe2/utils/math/reduce.cuh
-@@ -1,8 +1,8 @@
- #ifndef CAFFE2_UTILS_MATH_REDUCE_CUH_
- #define CAFFE2_UTILS_MATH_REDUCE_CUH_
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include "caffe2/core/common_gpu.h"
-
-diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
-index a37d4b744d73..b0a44fed34fb 100644
---- a/caffe2/utils/math_gpu.cu
-+++ b/caffe2/utils/math_gpu.cu
-@@ -7,8 +7,8 @@
- #include <numeric>
- #include <vector>
-
-+#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
--#include <cub/cub.cuh>
-
- #include <thrust/host_vector.h>
- #include <thrust/device_vector.h>
-
-From 359666307907780e946284445e609131652a4739 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 08:04:01 -0700
-Subject: [PATCH 11/30] save
-
----
- aten/src/ATen/cuda/cub.cuh | 4 +++-
- aten/src/ATen/cuda/cub_definitions.cuh | 5 ++++-
- cmake/Dependencies.cmake | 4 ++++
- 3 files changed, 11 insertions(+), 2 deletions(-)
-
-diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
-index 9014d4b7d2ec..4f39bde4b3c8 100644
---- a/aten/src/ATen/cuda/cub.cuh
-+++ b/aten/src/ATen/cuda/cub.cuh
-@@ -9,7 +9,6 @@
-
- #if CUB_SUPPORTS_WRAPPED_NAMESPACE()
-
--#define CUB_WRAPPED_NAMESPACE at_cuda_detail
- #include <cub/cub.cuh>
-
- #else
-@@ -71,6 +70,9 @@ template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOAT
- namespace at { namespace native {
- namespace cub = at_cuda_detail::cub;
- }}
-+namespace caffew {
-+namespace cub = at_cuda_detail::cub;
-+}
- #endif
-
- namespace at {
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index 7da3ab716e41..b4c82c9ae975 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -1,6 +1,7 @@
- #pragma once
-
- #if !defined(USE_ROCM)
-+#include <cuda.h>
- #include <cub/version.cuh>
- #else
- #define CUB_VERSION 0
-@@ -16,7 +17,9 @@
-
- // cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.14 in:
- // https://github.com/NVIDIA/cub/pull/326
--#if CUB_VERSION >= 101400
-+// CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
-+// starting from CUDA 11.4
-+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11040
- #define CUB_SUPPORTS_WRAPPED_NAMESPACE() true
- #else
- #define CUB_SUPPORTS_WRAPPED_NAMESPACE() false
-diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
-index 564fcebc0b08..bfee8f6dc78a 100644
---- a/cmake/Dependencies.cmake
-+++ b/cmake/Dependencies.cmake
-@@ -1618,6 +1618,10 @@ if(NOT INTERN_BUILD_MOBILE)
- set(CMAKE_CXX_STANDARD 14)
- endif()
-
-+ if(NOT ${CUDA_VERSION} LESS 11.4)
-+ string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")
-+ endif()
-+
- if(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
- message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
- string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
-
-From d4b3679d45cadbf9123a869cfefa001832bce04a Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 08:09:00 -0700
-Subject: [PATCH 12/30] save
-
----
- aten/src/ATen/cuda/cub.cuh | 2 +-
- aten/src/ATen/cuda/cub_definitions.cuh | 4 ++--
- 2 files changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
-index 4f39bde4b3c8..2583b7e640ec 100644
---- a/aten/src/ATen/cuda/cub.cuh
-+++ b/aten/src/ATen/cuda/cub.cuh
-@@ -7,7 +7,7 @@
-
- #include <ATen/cuda/cub_definitions.cuh>
-
--#if CUB_SUPPORTS_WRAPPED_NAMESPACE()
-+#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE()
-
- #include <cub/cub.cuh>
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index b4c82c9ae975..160d6f157396 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -20,7 +20,7 @@
- // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
- // starting from CUDA 11.4
- #if defined(CUDA_VERSION) && CUDA_VERSION >= 11040
--#define CUB_SUPPORTS_WRAPPED_NAMESPACE() true
-+#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
- #else
--#define CUB_SUPPORTS_WRAPPED_NAMESPACE() false
-+#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
- #endif
-
-From 2b710dab756fceadf393d1c38cec91ef000e8fa8 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 08:23:39 -0700
-Subject: [PATCH 13/30] fix
-
----
- aten/src/ATen/cuda/cub.cuh | 6 +++---
- 1 file changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
-index 2583b7e640ec..042009f3d8b4 100644
---- a/aten/src/ATen/cuda/cub.cuh
-+++ b/aten/src/ATen/cuda/cub.cuh
-@@ -68,10 +68,10 @@ template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOAT
-
- #if !defined(USE_ROCM)
- namespace at { namespace native {
--namespace cub = at_cuda_detail::cub;
-+namespace cub = ::at_cuda_detail::cub;
- }}
--namespace caffew {
--namespace cub = at_cuda_detail::cub;
-+namespace caffe2 {
-+namespace cub = ::at_cuda_detail::cub;
- }
- #endif
-
-
-From 34c57ca996c3249fe23e7037f98d1a9638187371 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 08:31:35 -0700
-Subject: [PATCH 14/30] fix
-
----
- caffe2/operators/rmac_regions_op.cu | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu
-index 39cc5fbc988d..2f6c230574b0 100644
---- a/caffe2/operators/rmac_regions_op.cu
-+++ b/caffe2/operators/rmac_regions_op.cu
-@@ -11,7 +11,7 @@
- #if defined(USE_ROCM)
- namespace rocprim {
- #else
--#if CUB_SUPPORTS_WRAPPED_NAMESPACE()
-+#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE()
- namespace at_cuda_detail {
- #endif
- namespace cub {
-@@ -26,7 +26,7 @@ inline __host__ __device__ bool operator<(
- }
-
- } // namespace cub
--#if CUB_SUPPORTS_WRAPPED_NAMESPACE()
-+#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE()
- } // namespace at_cuda_detail
- #endif
-
-
-From 3f6bce369e20fc75160c8151064e5cf3c4d871ce Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 08:56:27 -0700
-Subject: [PATCH 16/30] save
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index 160d6f157396..7746885d6428 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -15,11 +15,11 @@
- #define CUB_SUPPORTS_NV_BFLOAT16() false
- #endif
-
--// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.14 in:
-+// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
- // https://github.com/NVIDIA/cub/pull/326
- // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
- // starting from CUDA 11.4
--#if defined(CUDA_VERSION) && CUDA_VERSION >= 11040
-+#if CUB_VERSION >= 101301
- #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
- #else
- #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
-
-From a2346ec6c5804085e49e81007ed70fd1aae0f333 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 09:04:04 -0700
-Subject: [PATCH 17/30] save
-
----
- aten/src/ATen/cuda/cub.cuh | 3 +--
- 1 file changed, 1 insertion(+), 2 deletions(-)
-
-diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
-index 042009f3d8b4..17062586b0e3 100644
---- a/aten/src/ATen/cuda/cub.cuh
-+++ b/aten/src/ATen/cuda/cub.cuh
-@@ -17,9 +17,8 @@
- #undef CUB_NS_PREFIX
- #define CUB_NS_PREFIX namespace at_cuda_detail {
- #define CUB_NS_POSTFIX }
-+#define CUB_NS_QUALIFIER ::at_cuda_detail::cub
- #include <cub/cub.cuh>
--#undef CUB_NS_POSTFIX
--#undef CUB_NS_PREFIX
-
- #endif
-
-
-From add451ed3a579a794609bb26e82b31cec8527d03 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 09:06:53 -0700
-Subject: [PATCH 18/30] save
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index 7746885d6428..fb43441148fc 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -19,7 +19,7 @@
- // https://github.com/NVIDIA/cub/pull/326
- // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
- // starting from CUDA 11.4
--#if CUB_VERSION >= 101301
-+#if CUB_VERSION >= 101400
- #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
- #else
- #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
-
-From d1113be1236633978a51581ad9270069ddea5c30 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 09:07:12 -0700
-Subject: [PATCH 19/30] save
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index fb43441148fc..4b339b92c6dc 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -18,7 +18,7 @@
- // cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
- // https://github.com/NVIDIA/cub/pull/326
- // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
--// starting from CUDA 11.4
-+// starting from CUDA 11.5
- #if CUB_VERSION >= 101400
- #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
- #else
-
-From 59949c2c4945f7df89888bbbfb44060ea19e31ff Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 11:45:18 -0700
-Subject: [PATCH 20/30] save
-
----
- aten/src/ATen/cuda/cub.cuh | 8 ++++----
- aten/src/ATen/cuda/cub_definitions.cuh | 8 ++++----
- caffe2/core/context_gpu.cu | 2 +-
- caffe2/operators/accuracy_op.cu | 2 +-
- caffe2/operators/affine_channel_op.cu | 2 +-
- caffe2/operators/arg_ops.cu | 2 +-
- caffe2/operators/batch_moments_op.cu | 2 +-
- caffe2/operators/batch_sparse_to_dense_op.cu | 2 +-
- caffe2/operators/boolean_mask_ops.cu | 4 ++--
- caffe2/operators/cross_entropy_op.cu | 2 +-
- caffe2/operators/distance_op.cu | 2 +-
- caffe2/operators/elementwise_div_op.cu | 2 +-
- caffe2/operators/elementwise_linear_op.cu | 2 +-
- caffe2/operators/elementwise_mul_op.cu | 2 +-
- caffe2/operators/elementwise_ops.cu | 2 +-
- caffe2/operators/find_op.cu | 2 +-
- caffe2/operators/generate_proposals_op.cu | 3 ++-
- caffe2/operators/normalize_ops.cu | 2 +-
- caffe2/operators/one_hot_ops.cu | 2 +-
- caffe2/operators/pack_segments.cu | 3 ++-
- caffe2/operators/prelu_op.cu | 2 +-
- caffe2/operators/reduce_front_back_max_ops.cu | 2 +-
- caffe2/operators/reduce_front_back_sum_mean_ops.cu | 2 +-
- caffe2/operators/reduction_ops.cu | 2 +-
- caffe2/operators/rmac_regions_op.cu | 2 +-
- caffe2/operators/segment_reduction_op_gpu.cuh | 2 +-
- caffe2/operators/sequence_ops.cu | 3 ++-
- caffe2/operators/softmax_ops.cu | 2 +-
- caffe2/operators/spatial_batch_norm_op_impl.cuh | 2 +-
- caffe2/sgd/adagrad_fused_op_gpu.cu | 2 +-
- caffe2/sgd/adagrad_op_gpu.cu | 2 +-
- caffe2/sgd/adam_op_gpu.cu | 2 +-
- caffe2/utils/cub_namespace.cuh | 7 +++++++
- caffe2/utils/math/reduce.cu | 3 +--
- caffe2/utils/math/reduce.cuh | 2 +-
- caffe2/utils/math_gpu.cu | 3 ++-
- cmake/Dependencies.cmake | 2 +-
- 37 files changed, 54 insertions(+), 44 deletions(-)
- create mode 100644 caffe2/utils/cub_namespace.cuh
-
-diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
-index 17062586b0e3..de14455adb98 100644
---- a/aten/src/ATen/cuda/cub.cuh
-+++ b/aten/src/ATen/cuda/cub.cuh
-@@ -15,11 +15,14 @@
-
- #undef CUB_NS_POSTFIX //undef to avoid redefinition warnings
- #undef CUB_NS_PREFIX
-+#undef CUB_NS_QUALIFIER
- #define CUB_NS_PREFIX namespace at_cuda_detail {
- #define CUB_NS_POSTFIX }
- #define CUB_NS_QUALIFIER ::at_cuda_detail::cub
- #include <cub/cub.cuh>
--
-+#undef CUB_NS_POSTFIX
-+#undef CUB_NS_PREFIX
-+#undef CUB_NS_QUALIFIER
- #endif
-
- #include <ATen/cuda/Exceptions.h>
-@@ -69,9 +72,6 @@ template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOAT
- namespace at { namespace native {
- namespace cub = ::at_cuda_detail::cub;
- }}
--namespace caffe2 {
--namespace cub = ::at_cuda_detail::cub;
--}
- #endif
-
- namespace at {
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index 4b339b92c6dc..07cffe9d34e4 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -15,12 +15,12 @@
- #define CUB_SUPPORTS_NV_BFLOAT16() false
- #endif
-
--// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
-+// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.14 in:
- // https://github.com/NVIDIA/cub/pull/326
- // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
--// starting from CUDA 11.5
--#if CUB_VERSION >= 101400
-+// starting from CUDA 11.6
-+#if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE)
- #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
- #else
- #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
--#endif
-+#endif
-\ No newline at end of file
-diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
-index 475ed61ab4f7..9ba9f74d5376 100644
---- a/caffe2/core/context_gpu.cu
-+++ b/caffe2/core/context_gpu.cu
-@@ -4,7 +4,7 @@
- #include <string>
- #include <unordered_map>
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <ATen/Context.h>
- #include <c10/cuda/CUDAFunctions.h>
- #include <c10/cuda/CUDACachingAllocator.h>
-diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu
-index 7ad2b09c238a..29df54e752d3 100644
---- a/caffe2/operators/accuracy_op.cu
-+++ b/caffe2/operators/accuracy_op.cu
-@@ -3,7 +3,7 @@
- #include "caffe2/utils/GpuAtomics.cuh"
- #include "caffe2/utils/math.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu
-index f3d9e22c7e8c..efae0a3fc695 100644
---- a/caffe2/operators/affine_channel_op.cu
-+++ b/caffe2/operators/affine_channel_op.cu
-@@ -1,6 +1,6 @@
- #include "caffe2/operators/affine_channel_op.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/arg_ops.cu b/caffe2/operators/arg_ops.cu
-index fbefe0774376..56deaa636356 100644
---- a/caffe2/operators/arg_ops.cu
-+++ b/caffe2/operators/arg_ops.cu
-@@ -2,7 +2,7 @@
-
- #include <limits>
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/common_gpu.h"
-diff --git a/caffe2/operators/batch_moments_op.cu b/caffe2/operators/batch_moments_op.cu
-index 65c43200e5bd..81359f6440b0 100644
---- a/caffe2/operators/batch_moments_op.cu
-+++ b/caffe2/operators/batch_moments_op.cu
-@@ -1,6 +1,6 @@
- #include "caffe2/operators/batch_moments_op.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/batch_sparse_to_dense_op.cu b/caffe2/operators/batch_sparse_to_dense_op.cu
-index 2cb09deb8668..3e7ad8af9a5b 100644
---- a/caffe2/operators/batch_sparse_to_dense_op.cu
-+++ b/caffe2/operators/batch_sparse_to_dense_op.cu
-@@ -1,6 +1,6 @@
- #include "caffe2/operators/batch_sparse_to_dense_op.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/device/device_scan.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu
-index c87688f51d64..501dd3b191c8 100644
---- a/caffe2/operators/boolean_mask_ops.cu
-+++ b/caffe2/operators/boolean_mask_ops.cu
-@@ -2,8 +2,8 @@
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/boolean_mask_ops.h"
--
--#include <ATen/cuda/cub.cuh>
-+#include <cub/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu
-index 95f3ffddbf1f..15cb8a4f574a 100644
---- a/caffe2/operators/cross_entropy_op.cu
-+++ b/caffe2/operators/cross_entropy_op.cu
-@@ -1,5 +1,5 @@
- #include <assert.h>
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu
-index d94691d5a9d9..a360166854ff 100644
---- a/caffe2/operators/distance_op.cu
-+++ b/caffe2/operators/distance_op.cu
-@@ -4,7 +4,7 @@
- #include "caffe2/operators/distance_op.h"
- #include "caffe2/utils/conversions.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/elementwise_div_op.cu b/caffe2/operators/elementwise_div_op.cu
-index ca9682326324..33118a8f5e16 100644
---- a/caffe2/operators/elementwise_div_op.cu
-+++ b/caffe2/operators/elementwise_div_op.cu
-@@ -3,7 +3,7 @@
- #include <algorithm>
- #include <functional>
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu
-index c1c45263f34c..8f749644b295 100644
---- a/caffe2/operators/elementwise_linear_op.cu
-+++ b/caffe2/operators/elementwise_linear_op.cu
-@@ -5,7 +5,7 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/operator_fallback_gpu.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/elementwise_mul_op.cu b/caffe2/operators/elementwise_mul_op.cu
-index 88c3da00edc3..1991b8b513af 100644
---- a/caffe2/operators/elementwise_mul_op.cu
-+++ b/caffe2/operators/elementwise_mul_op.cu
-@@ -3,7 +3,7 @@
- #include <algorithm>
- #include <functional>
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu
-index 1ac0426d2ca7..932bd5dafda0 100644
---- a/caffe2/operators/elementwise_ops.cu
-+++ b/caffe2/operators/elementwise_ops.cu
-@@ -1,6 +1,6 @@
- #include "caffe2/operators/elementwise_ops.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_load.cuh>
- #include <cub/block/block_reduce.cuh>
- #include <cub/device/device_reduce.cuh>
-diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu
-index 666df335ce42..20d42560b506 100644
---- a/caffe2/operators/find_op.cu
-+++ b/caffe2/operators/find_op.cu
-@@ -1,4 +1,4 @@
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/find_op.h"
-diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu
-index 84906a8e8182..b63726651939 100644
---- a/caffe2/operators/generate_proposals_op.cu
-+++ b/caffe2/operators/generate_proposals_op.cu
-@@ -1,10 +1,11 @@
--#include <ATen/cuda/cub.cuh>
-+#include <cub/cub.cuh>
- #include "caffe2/core/context.h"
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/generate_proposals_op.h"
- #include "caffe2/operators/generate_proposals_op_util_boxes.h" // BBOX_XFORM_CLIP_DEFAULT
- #include "caffe2/operators/generate_proposals_op_util_nms.h"
- #include "caffe2/operators/generate_proposals_op_util_nms_gpu.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- #if defined(USE_ROCM)
- #include <cfloat>
-diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu
-index 468175df985f..952c4a772fa5 100644
---- a/caffe2/operators/normalize_ops.cu
-+++ b/caffe2/operators/normalize_ops.cu
-@@ -1,6 +1,6 @@
- #include <algorithm>
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu
-index 86f82f78bb82..4b1e054b0806 100644
---- a/caffe2/operators/one_hot_ops.cu
-+++ b/caffe2/operators/one_hot_ops.cu
-@@ -1,4 +1,4 @@
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu
-index b9ed413d1e7b..372638abdd24 100644
---- a/caffe2/operators/pack_segments.cu
-+++ b/caffe2/operators/pack_segments.cu
-@@ -1,6 +1,7 @@
--#include <ATen/cuda/cub.cuh>
-+#include <cub/cub.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/pack_segments.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/prelu_op.cu b/caffe2/operators/prelu_op.cu
-index d29882086754..6303b70b4a89 100644
---- a/caffe2/operators/prelu_op.cu
-+++ b/caffe2/operators/prelu_op.cu
-@@ -1,7 +1,7 @@
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/prelu_op.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- namespace caffe2 {
-diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu
-index ba62b2eff671..d6bb862e4dbb 100644
---- a/caffe2/operators/reduce_front_back_max_ops.cu
-+++ b/caffe2/operators/reduce_front_back_max_ops.cu
-@@ -1,4 +1,4 @@
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/reduce_front_back_max_ops.h"
-diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.cu b/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-index 586c20fe8d8e..2b5cb7110edf 100644
---- a/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-+++ b/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-@@ -1,4 +1,4 @@
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/reduce_front_back_sum_mean_ops.h"
-diff --git a/caffe2/operators/reduction_ops.cu b/caffe2/operators/reduction_ops.cu
-index 0d94fab22a7f..9649b85d015c 100644
---- a/caffe2/operators/reduction_ops.cu
-+++ b/caffe2/operators/reduction_ops.cu
-@@ -2,7 +2,7 @@
- #include "caffe2/operators/reduction_ops.h"
- #include "caffe2/utils/conversions.h"
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/rmac_regions_op.cu b/caffe2/operators/rmac_regions_op.cu
-index 2f6c230574b0..6b79e0c4c8b4 100644
---- a/caffe2/operators/rmac_regions_op.cu
-+++ b/caffe2/operators/rmac_regions_op.cu
-@@ -1,5 +1,5 @@
--#include <ATen/cuda/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/rmac_regions_op.h"
-diff --git a/caffe2/operators/segment_reduction_op_gpu.cuh b/caffe2/operators/segment_reduction_op_gpu.cuh
-index 447617c6e9de..bb3f3be13c72 100644
---- a/caffe2/operators/segment_reduction_op_gpu.cuh
-+++ b/caffe2/operators/segment_reduction_op_gpu.cuh
-@@ -1,4 +1,4 @@
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include <cub/device/device_reduce.cuh>
- #include <cub/device/device_scan.cuh>
-diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu
-index e66d491f85e6..2ceb5236ef72 100644
---- a/caffe2/operators/sequence_ops.cu
-+++ b/caffe2/operators/sequence_ops.cu
-@@ -1,6 +1,7 @@
- #include <algorithm>
-
--#include <ATen/cuda/cub.cuh>
-+#include <cub/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/sequence_ops.h"
-diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu
-index c01fcf3e0a48..b0afac3332a6 100644
---- a/caffe2/operators/softmax_ops.cu
-+++ b/caffe2/operators/softmax_ops.cu
-@@ -1,5 +1,5 @@
- #include <cfloat>
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/operators/spatial_batch_norm_op_impl.cuh b/caffe2/operators/spatial_batch_norm_op_impl.cuh
-index f9b9fb58adc8..6fdb4c63f8ef 100644
---- a/caffe2/operators/spatial_batch_norm_op_impl.cuh
-+++ b/caffe2/operators/spatial_batch_norm_op_impl.cuh
-@@ -5,7 +5,7 @@
-
- #include <limits>
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu
-index 396da5195125..63d0712e3970 100644
---- a/caffe2/sgd/adagrad_fused_op_gpu.cu
-+++ b/caffe2/sgd/adagrad_fused_op_gpu.cu
-@@ -2,7 +2,7 @@
- #include <c10/core/GeneratorImpl.h>
- #include <algorithm>
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/device/device_radix_sort.cuh>
- #include "caffe2/sgd/adagrad_fused_op_gpu.cuh"
- #include "caffe2/utils/math.h"
-diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu
-index a6fa842ddc80..0b7f499345be 100644
---- a/caffe2/sgd/adagrad_op_gpu.cu
-+++ b/caffe2/sgd/adagrad_op_gpu.cu
-@@ -1,6 +1,6 @@
- #include <algorithm>
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu
-index 4b59836b6a68..a93812fabbe8 100644
---- a/caffe2/sgd/adam_op_gpu.cu
-+++ b/caffe2/sgd/adam_op_gpu.cu
-@@ -1,4 +1,4 @@
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
-diff --git a/caffe2/utils/cub_namespace.cuh b/caffe2/utils/cub_namespace.cuh
-new file mode 100644
-index 000000000000..c7a5db0dc013
---- /dev/null
-+++ b/caffe2/utils/cub_namespace.cuh
-@@ -0,0 +1,7 @@
-+#include <ATen/cuda/cub_definitions.cuh>
-+
-+#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE()
-+namespace caffe2 {
-+namespace cub = ::CUB_WRAPPED_NAMESPACE::cub;
-+}
-+#endif
-diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu
-index 20919334da50..69a6469d8ed1 100644
---- a/caffe2/utils/math/reduce.cu
-+++ b/caffe2/utils/math/reduce.cu
-@@ -5,8 +5,7 @@
- #include <limits>
- #include <numeric>
- #include <vector>
--
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include <thrust/execution_policy.h>
-diff --git a/caffe2/utils/math/reduce.cuh b/caffe2/utils/math/reduce.cuh
-index 39ad553eec76..18bdca11b9de 100644
---- a/caffe2/utils/math/reduce.cuh
-+++ b/caffe2/utils/math/reduce.cuh
-@@ -1,7 +1,7 @@
- #ifndef CAFFE2_UTILS_MATH_REDUCE_CUH_
- #define CAFFE2_UTILS_MATH_REDUCE_CUH_
-
--#include <ATen/cuda/cub.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/common_gpu.h"
-diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
-index b0a44fed34fb..54b0a9391c26 100644
---- a/caffe2/utils/math_gpu.cu
-+++ b/caffe2/utils/math_gpu.cu
-@@ -7,8 +7,9 @@
- #include <numeric>
- #include <vector>
-
--#include <ATen/cuda/cub.cuh>
-+#include <cub/cub.cuh>
- #include <cub/block/block_reduce.cuh>
-+#include "caffe2/utils/cub_namespace.cuh"
-
- #include <thrust/host_vector.h>
- #include <thrust/device_vector.h>
-diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
-index bfee8f6dc78a..a6a3946bac8e 100644
---- a/cmake/Dependencies.cmake
-+++ b/cmake/Dependencies.cmake
-@@ -1618,7 +1618,7 @@ if(NOT INTERN_BUILD_MOBILE)
- set(CMAKE_CXX_STANDARD 14)
- endif()
-
-- if(NOT ${CUDA_VERSION} LESS 11.4)
-+ if(NOT ${CUDA_VERSION} LESS 11.6)
- string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")
- endif()
-
-
-From c22df90d62acfcb572b4b7bf55646fe338843a69 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 11:47:52 -0700
-Subject: [PATCH 21/30] save
-
----
- aten/src/ATen/cuda/cub.cuh | 1 +
- aten/src/ATen/cuda/cub_definitions.cuh | 2 +-
- 2 files changed, 2 insertions(+), 1 deletion(-)
-
-diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
-index de14455adb98..fecf9e077a6f 100644
---- a/aten/src/ATen/cuda/cub.cuh
-+++ b/aten/src/ATen/cuda/cub.cuh
-@@ -23,6 +23,7 @@
- #undef CUB_NS_POSTFIX
- #undef CUB_NS_PREFIX
- #undef CUB_NS_QUALIFIER
-+
- #endif
-
- #include <ATen/cuda/Exceptions.h>
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index 07cffe9d34e4..d639d28e66b6 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -23,4 +23,4 @@
- #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
- #else
- #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
--#endif
-\ No newline at end of file
-+#endif
-
-From 64adfaaaca8267cd6cf03ae7a524eee239530425 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 12:29:41 -0700
-Subject: [PATCH 22/30] save
-
----
- caffe2/core/context_gpu.cu | 2 +-
- caffe2/operators/cross_entropy_op.cu | 2 +-
- caffe2/operators/find_op.cu | 2 +-
- caffe2/operators/normalize_ops.cu | 2 +-
- caffe2/operators/one_hot_ops.cu | 2 +-
- caffe2/operators/reduce_front_back_max_ops.cu | 2 +-
- caffe2/operators/reduce_front_back_sum_mean_ops.cu | 2 +-
- caffe2/operators/softmax_ops.cu | 2 +-
- caffe2/sgd/adagrad_op_gpu.cu | 2 +-
- caffe2/sgd/adam_op_gpu.cu | 2 +-
- cmake/Dependencies.cmake | 2 +-
- 11 files changed, 11 insertions(+), 11 deletions(-)
-
-diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
-index 9ba9f74d5376..6d537400913e 100644
---- a/caffe2/core/context_gpu.cu
-+++ b/caffe2/core/context_gpu.cu
-@@ -4,7 +4,6 @@
- #include <string>
- #include <unordered_map>
-
--#include "caffe2/utils/cub_namespace.cuh"
- #include <ATen/Context.h>
- #include <c10/cuda/CUDAFunctions.h>
- #include <c10/cuda/CUDACachingAllocator.h>
-@@ -22,6 +21,7 @@
- #include "caffe2/core/logging.h"
- #include "caffe2/core/tensor.h"
- #include "caffe2/utils/string_utils.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- C10_DEFINE_string(
- caffe2_cuda_memory_pool,
-diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu
-index 15cb8a4f574a..c23f05f8e5c2 100644
---- a/caffe2/operators/cross_entropy_op.cu
-+++ b/caffe2/operators/cross_entropy_op.cu
-@@ -1,10 +1,10 @@
- #include <assert.h>
--#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/cross_entropy_op.h"
- #include "caffe2/operators/operator_fallback_gpu.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/find_op.cu b/caffe2/operators/find_op.cu
-index 20d42560b506..0418a71fbcda 100644
---- a/caffe2/operators/find_op.cu
-+++ b/caffe2/operators/find_op.cu
-@@ -1,7 +1,7 @@
--#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/find_op.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu
-index 952c4a772fa5..e4d1f34b754c 100644
---- a/caffe2/operators/normalize_ops.cu
-+++ b/caffe2/operators/normalize_ops.cu
-@@ -1,11 +1,11 @@
- #include <algorithm>
-
--#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/normalize_l1_op.h"
- #include "caffe2/operators/normalize_op.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu
-index 4b1e054b0806..87e8196765ef 100644
---- a/caffe2/operators/one_hot_ops.cu
-+++ b/caffe2/operators/one_hot_ops.cu
-@@ -1,8 +1,8 @@
--#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/one_hot_ops.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/reduce_front_back_max_ops.cu b/caffe2/operators/reduce_front_back_max_ops.cu
-index d6bb862e4dbb..2ea25de46009 100644
---- a/caffe2/operators/reduce_front_back_max_ops.cu
-+++ b/caffe2/operators/reduce_front_back_max_ops.cu
-@@ -1,7 +1,7 @@
--#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/reduce_front_back_max_ops.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- #if defined(USE_ROCM)
- #include <cfloat>
-diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.cu b/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-index 2b5cb7110edf..a7ad6dd50084 100644
---- a/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-+++ b/caffe2/operators/reduce_front_back_sum_mean_ops.cu
-@@ -1,7 +1,7 @@
--#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/reduce_front_back_sum_mean_ops.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu
-index b0afac3332a6..ebf0700c9ef0 100644
---- a/caffe2/operators/softmax_ops.cu
-+++ b/caffe2/operators/softmax_ops.cu
-@@ -1,11 +1,11 @@
- #include <cfloat>
--#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
-
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/operators/softmax_op.h"
- #include "caffe2/operators/softmax_with_loss_op.h"
- #include "caffe2/operators/spatial_softmax_with_loss_op.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu
-index 0b7f499345be..b80d29700c3f 100644
---- a/caffe2/sgd/adagrad_op_gpu.cu
-+++ b/caffe2/sgd/adagrad_op_gpu.cu
-@@ -1,10 +1,10 @@
- #include <algorithm>
-
--#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/sgd/adagrad_op.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu
-index a93812fabbe8..6f9c3234204d 100644
---- a/caffe2/sgd/adam_op_gpu.cu
-+++ b/caffe2/sgd/adam_op_gpu.cu
-@@ -1,8 +1,8 @@
--#include "caffe2/utils/cub_namespace.cuh"
- #include <cub/block/block_reduce.cuh>
- #include "caffe2/core/common_gpu.h"
- #include "caffe2/core/context_gpu.h"
- #include "caffe2/sgd/adam_op.h"
-+#include "caffe2/utils/cub_namespace.cuh"
-
- namespace caffe2 {
-
-diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
-index a6a3946bac8e..5074bab1e999 100644
---- a/cmake/Dependencies.cmake
-+++ b/cmake/Dependencies.cmake
-@@ -1618,7 +1618,7 @@ if(NOT INTERN_BUILD_MOBILE)
- set(CMAKE_CXX_STANDARD 14)
- endif()
-
-- if(NOT ${CUDA_VERSION} LESS 11.6)
-+ if(NOT ${CUDA_VERSION} LESS 11.5)
- string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")
- endif()
-
-
-From 21abede594471e4f225def12796c1d7c73a96b20 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 12:46:32 -0700
-Subject: [PATCH 23/30] save
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index d639d28e66b6..9828abdfc99a 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -15,10 +15,10 @@
- #define CUB_SUPPORTS_NV_BFLOAT16() false
- #endif
-
--// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.14 in:
-+// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
- // https://github.com/NVIDIA/cub/pull/326
- // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
--// starting from CUDA 11.6
-+// starting from CUDA 11.5
- #if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE)
- #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
- #else
-
-From c73b101377d5adb8f96eb31988ceef7c2e760839 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Tue, 12 Oct 2021 12:59:12 -0700
-Subject: [PATCH 24/30] comment
-
----
- aten/src/ATen/cuda/cub.cuh | 2 ++
- cmake/Dependencies.cmake | 2 ++
- 2 files changed, 4 insertions(+)
-
-diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
-index fecf9e077a6f..0532470d74ec 100644
---- a/aten/src/ATen/cuda/cub.cuh
-+++ b/aten/src/ATen/cuda/cub.cuh
-@@ -13,6 +13,8 @@
-
- #else
-
-+// include cub in a safe manner, see:
-+// https://github.com/pytorch/pytorch/pull/55292
- #undef CUB_NS_POSTFIX //undef to avoid redefinition warnings
- #undef CUB_NS_PREFIX
- #undef CUB_NS_QUALIFIER
-diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
-index 5074bab1e999..bde42bf1719f 100644
---- a/cmake/Dependencies.cmake
-+++ b/cmake/Dependencies.cmake
-@@ -1618,6 +1618,8 @@ if(NOT INTERN_BUILD_MOBILE)
- set(CMAKE_CXX_STANDARD 14)
- endif()
-
-+ # use cub in a safe manner, see:
-+ # https://github.com/pytorch/pytorch/pull/55292
- if(NOT ${CUDA_VERSION} LESS 11.5)
- string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")
- endif()
-
-From 1cb29c9816e795767236854644a1ba2c28da6e0d Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Fri, 22 Oct 2021 10:42:32 -0700
-Subject: [PATCH 25/30] fix
-
----
- caffe2/utils/cub_namespace.cuh | 19 ++++++++++++++++++-
- 1 file changed, 18 insertions(+), 1 deletion(-)
-
-diff --git a/caffe2/utils/cub_namespace.cuh b/caffe2/utils/cub_namespace.cuh
-index c7a5db0dc013..752f273128ee 100644
---- a/caffe2/utils/cub_namespace.cuh
-+++ b/caffe2/utils/cub_namespace.cuh
-@@ -1,4 +1,21 @@
--#include <ATen/cuda/cub_definitions.cuh>
-+#pragma once
-+
-+#if !defined(USE_ROCM)
-+#include <cuda.h>
-+#include <cub/version.cuh>
-+#else
-+#define CUB_VERSION 0
-+#endif
-+
-+// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
-+// https://github.com/NVIDIA/cub/pull/326
-+// CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
-+// starting from CUDA 11.5
-+#if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE)
-+#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
-+#else
-+#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
-+#endif
-
- #if USE_GLOBAL_CUB_WRAPPED_NAMESPACE()
- namespace caffe2 {
-
-From 3ca182d3c6d1dcc45396c3f2f3169cc84496ca1f Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Sun, 24 Oct 2021 14:27:09 -0700
-Subject: [PATCH 26/30] fix some cuda 10 build
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index 9828abdfc99a..5243e551f4a7 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -1,6 +1,6 @@
- #pragma once
-
--#if !defined(USE_ROCM)
-+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
- #include <cuda.h>
- #include <cub/version.cuh>
- #else
-
-From 3a7c2bc6da6f2be060be879d0b2277b833ab7cc9 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Sun, 24 Oct 2021 14:28:53 -0700
-Subject: [PATCH 27/30] fix caffe2
-
----
- caffe2/utils/cub_namespace.cuh | 7 -------
- 1 file changed, 7 deletions(-)
-
-diff --git a/caffe2/utils/cub_namespace.cuh b/caffe2/utils/cub_namespace.cuh
-index 752f273128ee..188a9936f9c6 100644
---- a/caffe2/utils/cub_namespace.cuh
-+++ b/caffe2/utils/cub_namespace.cuh
-@@ -1,12 +1,5 @@
- #pragma once
-
--#if !defined(USE_ROCM)
--#include <cuda.h>
--#include <cub/version.cuh>
--#else
--#define CUB_VERSION 0
--#endif
--
- // cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
- // https://github.com/NVIDIA/cub/pull/326
- // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
-
-From 960606e0dc1da7c16b6cb0ae11b04bace95816ac Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Sun, 24 Oct 2021 14:50:13 -0700
-Subject: [PATCH 28/30] fix
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 5 ++++-
- 1 file changed, 4 insertions(+), 1 deletion(-)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index 5243e551f4a7..b921af480655 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -1,7 +1,10 @@
- #pragma once
-
--#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-+#if !define(USE_ROCM)
- #include <cuda.h>
-+#endif
-+
-+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
- #include <cub/version.cuh>
- #else
- #define CUB_VERSION 0
-
-From 48f128f55396622bfb952b474888813334fae303 Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Sun, 24 Oct 2021 14:51:12 -0700
-Subject: [PATCH 29/30] save
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index b921af480655..8c99f4951c6c 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -1,7 +1,7 @@
- #pragma once
-
- #if !define(USE_ROCM)
--#include <cuda.h>
-+#include <cuda.h> // for CUDA_VERSION
- #endif
-
- #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-
-From b2023a036afe9ddff4f710953ad6ba55f8396ecf Mon Sep 17 00:00:00 2001
-From: Xiang Gao <qasdfgtyuiop at gmail.com>
-Date: Sun, 24 Oct 2021 14:51:59 -0700
-Subject: [PATCH 30/30] fixed
-
----
- aten/src/ATen/cuda/cub_definitions.cuh | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh
-index 8c99f4951c6c..61119fc17458 100644
---- a/aten/src/ATen/cuda/cub_definitions.cuh
-+++ b/aten/src/ATen/cuda/cub_definitions.cuh
-@@ -1,6 +1,6 @@
- #pragma once
-
--#if !define(USE_ROCM)
-+#if !defined(USE_ROCM)
- #include <cuda.h> // for CUDA_VERSION
- #endif
-
Deleted: PKGBUILD
===================================================================
--- PKGBUILD 2022-03-13 06:01:35 UTC (rev 1150466)
+++ PKGBUILD 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -1,322 +0,0 @@
-# Maintainer: Sven-Hendrik Haase <svenstaro at archlinux.org>
-# Contributor: Stephen Zhang <zsrkmyn at gmail dot com>
-
-pkgbase=python-pytorch
-pkgname=("python-pytorch" "python-pytorch-cuda")
-_pkgname="pytorch"
-pkgver=1.11.0rc5
-_pkgver=1.11.0-rc5
-pkgrel=2
-_pkgdesc='Tensors and Dynamic neural networks in Python with strong GPU acceleration'
-pkgdesc="${_pkgdesc}"
-arch=('x86_64')
-url="https://pytorch.org"
-license=('BSD')
-depends=('google-glog' 'gflags' 'opencv' 'openmp' 'nccl' 'pybind11' 'python' 'python-yaml' 'libuv'
- 'python-numpy' 'protobuf' 'ffmpeg4.4' 'python-future' 'qt5-base' 'onednn' 'intel-mkl'
- 'python-typing_extensions')
-makedepends=('python' 'python-setuptools' 'python-yaml' 'python-numpy' 'cmake' 'cuda'
- 'cudnn' 'git' 'magma' 'ninja' 'pkgconfig' 'doxygen')
-source=("${_pkgname}-${pkgver}::git+https://github.com/pytorch/pytorch.git#tag=v$_pkgver"
- # generated using parse-submodules
- # "${pkgname}::https://github.com/pytorch/pytorch#commit=7cc129e60c"
- "${pkgname}-pthreadpool::git+https://github.com/Maratyszcza/pthreadpool.git"
- "${pkgname}-NNPACK::git+https://github.com/Maratyszcza/NNPACK.git"
- "${pkgname}-ios-cmake::git+https://github.com/Yangqing/ios-cmake.git"
- "${pkgname}-psimd::git+https://github.com/Maratyszcza/psimd.git"
- "${pkgname}-FXdiv::git+https://github.com/Maratyszcza/FXdiv.git"
- "${pkgname}-FP16::git+https://github.com/Maratyszcza/FP16.git"
- "${pkgname}-cub::git+https://github.com/NVlabs/cub.git"
- "${pkgname}-eigen-git-mirror::git+https://github.com/eigenteam/eigen-git-mirror.git"
- "${pkgname}-cpuinfo::git+https://github.com/pytorch/cpuinfo.git"
- "${pkgname}-enum34::git+https://github.com/PeachPy/enum34.git"
- "${pkgname}-PeachPy::git+https://github.com/Maratyszcza/PeachPy.git"
- "${pkgname}-gloo::git+https://github.com/facebookincubator/gloo"
- "${pkgname}-foxi::git+https://github.com/houseroad/foxi.git"
- "${pkgname}-ideep::git+https://github.com/intel/ideep"
- "${pkgname}-benchmark::git+https://github.com/google/benchmark.git"
- "${pkgname}-QNNPACK::git+https://github.com/pytorch/QNNPACK"
- "${pkgname}-six::git+https://github.com/benjaminp/six.git"
- "${pkgname}-ARM_NEON_2_x86_SSE::git+https://github.com/intel/ARM_NEON_2_x86_SSE.git"
- "${pkgname}-sleef::git+https://github.com/shibatch/sleef"
- "${pkgname}-fbjni::git+https://github.com/facebookincubator/fbjni.git"
- "${pkgname}-gemmlowp::git+https://github.com/google/gemmlowp.git"
- "${pkgname}-onnx-tensorrt::git+https://github.com/onnx/onnx-tensorrt"
- "${pkgname}-pocketfft::git+https://github.com/mreineck/pocketfft"
- "${pkgname}-cudnn-frontend::git+https://github.com/NVIDIA/cudnn-frontend.git"
- "${pkgname}-nccl::git+https://github.com/NVIDIA/nccl"
- "${pkgname}-googletest::git+https://github.com/google/googletest.git"
- "${pkgname}-pybind11::git+https://github.com/pybind/pybind11.git"
- "${pkgname}-breakpad::git+https://github.com/driazati/breakpad.git"
- "${pkgname}-zstd::git+https://github.com/facebook/zstd.git"
- "${pkgname}-tbb::git+https://github.com/01org/tbb"
- "${pkgname}-fbgemm::git+https://github.com/pytorch/fbgemm"
- "${pkgname}-tensorpipe::git+https://github.com/pytorch/tensorpipe.git"
- "${pkgname}-kineto::git+https://github.com/pytorch/kineto"
- "${pkgname}-onnx::git+https://github.com/onnx/onnx.git"
- "${pkgname}-fmt::git+https://github.com/fmtlib/fmt.git"
- "${pkgname}-protobuf::git+https://github.com/protocolbuffers/protobuf.git"
- "${pkgname}-XNNPACK::git+https://github.com/google/XNNPACK.git"
- "${pkgname}-flatbuffers::git+https://github.com/google/flatbuffers.git"
- https://github.com/oneapi-src/oneDNN/commit/1fe0f2594a1bfc6386fd8f6537f971d5ae9c1214.patch
- fix_old_nnapi_lite_interpreter_config.patch
- fix-jit-frontend-nullptr-deref.patch
- fix_include_system.patch
- use-system-libuv.patch
- fix-building-for-torchvision.patch
- fix_c10.patch
- ffmpeg4.4.patch
- 66219.patch)
-sha256sums=('SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- '7728e99500d8034c837bbbe2b48b780d8563de4e56fff38a96766caad08cce05'
- '21476edfa61573892a325cb8a91e13f601142e39b34e24e4575d2cdebb063b3f'
- 'c272684a4c747f034163fcfd9dbb7264d5fe821dd25a060f0b791760ad0083ae'
- '557761502bbd994d9795bef46779e4b8c60ba0b45e7d60841f477d3b7f28a00a'
- 'cd9ac4aaa9f946ac5eafc57cf66c5c16b3ea7ac8af32c2558fad0705411bb669'
- '600bd6a4bbcec9f99ab815d82cee1c2875530b2b75f4010da5ba72ce9bf31aff'
- '4d0d7da4a3fb099ed75f3007559fad04ac96eed87c523b274fb3bb6020e6b9b8'
- '75001b59e76831b0c93a547f851cb980e00b0d8cc7b66fb507eaeac217dc6ff9'
- 'd86efbe915386989d75d313fc76785e6d9c5638b983f17e98cca32174ac1fcee')
-options=('!lto')
-
-get_pyver () {
- python -c 'import sys; print(str(sys.version_info[0]) + "." + str(sys.version_info[1]))'
-}
-
-prepare() {
- cd "${srcdir}/${_pkgname}-${pkgver}"
-
- # generated using parse-submodules
- git submodule init
-
- git config submodule."third_party/pybind11".url "${srcdir}/${pkgname}"-pybind11
- git config submodule."third_party/cub".url "${srcdir}/${pkgname}"-cub
- git config submodule."third_party/eigen".url "${srcdir}/${pkgname}"-eigen-git-mirror
- git config submodule."third_party/googletest".url "${srcdir}/${pkgname}"-googletest
- git config submodule."third_party/benchmark".url "${srcdir}/${pkgname}"-benchmark
- git config submodule."third_party/protobuf".url "${srcdir}/${pkgname}"-protobuf
- git config submodule."third_party/ios-cmake".url "${srcdir}/${pkgname}"-ios-cmake
- git config submodule."third_party/NNPACK".url "${srcdir}/${pkgname}"-NNPACK
- git config submodule."third_party/gloo".url "${srcdir}/${pkgname}"-gloo
- git config submodule."third_party/NNPACK_deps/pthreadpool".url "${srcdir}/${pkgname}"-pthreadpool
- git config submodule."third_party/NNPACK_deps/FXdiv".url "${srcdir}/${pkgname}"-FXdiv
- git config submodule."third_party/NNPACK_deps/FP16".url "${srcdir}/${pkgname}"-FP16
- git config submodule."third_party/NNPACK_deps/psimd".url "${srcdir}/${pkgname}"-psimd
- git config submodule."third_party/zstd".url "${srcdir}/${pkgname}"-zstd
- git config submodule."third_party/cpuinfo".url "${srcdir}/${pkgname}"-cpuinfo
- git config submodule."third_party/python-enum".url "${srcdir}/${pkgname}"-enum34
- git config submodule."third_party/python-peachpy".url "${srcdir}/${pkgname}"-PeachPy
- git config submodule."third_party/python-six".url "${srcdir}/${pkgname}"-six
- git config submodule."third_party/onnx".url "${srcdir}/${pkgname}"-onnx
- git config submodule."third_party/onnx-tensorrt".url "${srcdir}/${pkgname}"-onnx-tensorrt
- git config submodule."third_party/sleef".url "${srcdir}/${pkgname}"-sleef
- git config submodule."third_party/ideep".url "${srcdir}/${pkgname}"-ideep
- git config submodule."third_party/nccl/nccl".url "${srcdir}/${pkgname}"-nccl
- git config submodule."third_party/gemmlowp/gemmlowp".url "${srcdir}/${pkgname}"-gemmlowp
- git config submodule."third_party/QNNPACK".url "${srcdir}/${pkgname}"-QNNPACK
- git config submodule."third_party/neon2sse".url "${srcdir}/${pkgname}"-ARM_NEON_2_x86_SSE
- git config submodule."third_party/fbgemm".url "${srcdir}/${pkgname}"-fbgemm
- git config submodule."third_party/foxi".url "${srcdir}/${pkgname}"-foxi
- git config submodule."third_party/tbb".url "${srcdir}/${pkgname}"-tbb
- git config submodule."android/libs/fbjni".url "${srcdir}/${pkgname}"-fbjni
- git config submodule."third_party/XNNPACK".url "${srcdir}/${pkgname}"-XNNPACK
- git config submodule."third_party/fmt".url "${srcdir}/${pkgname}"-fmt
- git config submodule."third_party/tensorpipe".url "${srcdir}/${pkgname}"-tensorpipe
- git config submodule."third_party/cudnn_frontend".url "${srcdir}/${pkgname}"-cudnn-frontend
- git config submodule."third_party/kineto".url "${srcdir}/${pkgname}"-kineto
- git config submodule."third_party/pocketfft".url "${srcdir}/${pkgname}"-pocketfft
- git config submodule."third_party/breakpad".url "${srcdir}/${pkgname}"-breakpad
- git config submodule."third_party/flatbuffers".url "${srcdir}/${pkgname}"-flatbuffers
-
- git submodule update --init --recursive
-
- # https://bugs.archlinux.org/task/64981
- patch -N torch/utils/cpp_extension.py "${srcdir}"/fix_include_system.patch
-
- # Use system libuv
- patch -Np1 -i "${srcdir}"/use-system-libuv.patch
-
- # fix https://github.com/pytorch/vision/issues/3695
- patch -Np1 -i "${srcdir}/fix-building-for-torchvision.patch"
-
- # cuda 11.4.1 fix
- # patch -Np1 -i "${srcdir}/fix_c10.patch"
-
- # https://discuss.pytorch.org/t/about-build-android-sh-lite-and-nnapi/133581
- # patch -Np1 -i "${srcdir}/fix_old_nnapi_lite_interpreter_config.patch"
-
- # fix nullptr dereference
- # patch -Np1 -i "${srcdir}/fix-jit-frontend-nullptr-deref.patch"
-
- # disable vec tests
- # sed -e '/set(ATen_VEC_TEST_SRCS ${ATen_VEC_TEST_SRCS} PARENT_SCOPE)/d' -i aten/CMakeLists.txt
-
- # https://github.com/pytorch/pytorch/issues/67153, https://github.com/pytorch/pytorch/pull/66219
- # patch -Np1 -i "${srcdir}/66219.patch"
-
- # fix ideep/mkl-dnn
- # patch -Np1 -d third_party/ideep/mkl-dnn -i "${srcdir}/1fe0f2594a1bfc6386fd8f6537f971d5ae9c1214.patch"
-
- # remove local nccl
- # rm -rf third_party/nccl/nccl
- # also remove path from nccl module, so it's not checked
- # sed -e '/path = third_party\/nccl\/nccl/d' -i ./.gitmodules
-
- # fix build with google-glog 0.5 https://github.com/pytorch/pytorch/issues/58054
- sed -e '/namespace glog_internal_namespace_/d' -e 's|::glog_internal_namespace_||' -i c10/util/Logging.cpp
-
- # build against ffmpeg4.4
- patch -Np1 -i "${srcdir}/ffmpeg4.4.patch"
-
- # Backport https://github.com/driazati/breakpad/commit/605c51ed96ad44b34c457bbca320e74e194c317e for fixing build with glibc > 2.33
- git -C third_party/breakpad cherry-pick -n 605c51ed96ad44b34c457bbca320e74e194c317e
-
- cd "${srcdir}"
-
- cp -r "${_pkgname}-${pkgver}" "${_pkgname}-${pkgver}-cuda"
-
- export VERBOSE=1
- export PYTORCH_BUILD_VERSION="${pkgver}"
- export PYTORCH_BUILD_NUMBER=1
-
- # Check tools/setup_helpers/cmake.py, setup.py and CMakeLists.txt for a list of flags that can be set via env vars.
- export ATEN_NO_TEST=ON # do not build ATen tests
- export USE_MKLDNN=ON
- export BUILD_CUSTOM_PROTOBUF=OFF
- export BUILD_CAFFE2=ON
- export BUILD_CAFFE2_OPS=ON
- # export BUILD_SHARED_LIBS=OFF
- export USE_FFMPEG=ON
- export USE_GFLAGS=ON
- export USE_GLOG=ON
- export BUILD_BINARY=ON
- export USE_OBSERVERS=ON
- export USE_OPENCV=ON
- # export USE_SYSTEM_LIBS=ON # experimental, not all libs present in repos
- export USE_SYSTEM_NCCL=ON
- export NCCL_VERSION=$(pkg-config nccl --modversion)
- export NCCL_VER_CODE=$(sed -n 's/^#define NCCL_VERSION_CODE\s*\(.*\).*/\1/p' /usr/include/nccl.h)
- # export BUILD_SPLIT_CUDA=ON # modern preferred build, but splits libs and symbols, ABI break
- # export USE_FAST_NVCC=ON # parallel build with nvcc, spawns too many processes
- export USE_CUPTI_SO=ON # make sure cupti.so is used as shared lib
- export CUDAHOSTCXX=/usr/bin/g++
- export CUDA_HOST_COMPILER="${CUDAHOSTCXX}"
- export CUDA_HOME=/opt/cuda
- # hide buildt-time CUDA devices
- export CUDA_VISIBLE_DEVICES=""
- export CUDNN_LIB_DIR=/usr/lib
- export CUDNN_INCLUDE_DIR=/usr/include
- export TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
- export TORCH_CUDA_ARCH_LIST="5.2;6.0;6.2;7.0;7.2;7.5;8.0;8.6;8.6+PTX" #include latest PTX for future compat
- export OVERRIDE_TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"
-}
-
-build() {
- echo "Building without cuda and with non-x86-64 optimizations"
- export USE_CUDA=0
- export USE_CUDNN=0
- cd "${srcdir}/${_pkgname}-${pkgver}"
- echo "add_definitions(-march=haswell)" >> cmake/MiscCheck.cmake
- # this horrible hack is necessary because the current release
- # ships inconsistent CMake which tries to build objects before
- # thier dependencies, build twice when dependencies are available
- python setup.py build || python setup.py build
-
-
- echo "Building with cuda and with non-x86-64 optimizations"
- export USE_CUDA=1
- export USE_CUDNN=1
- cd "${srcdir}/${_pkgname}-${pkgver}-cuda"
- echo "add_definitions(-march=haswell)" >> cmake/MiscCheck.cmake
- # same horrible hack as above
- python setup.py build || python setup.py build
-}
-
-_package() {
- # Prevent setup.py from re-running CMake and rebuilding
- sed -e 's/RUN_BUILD_DEPS = True/RUN_BUILD_DEPS = False/g' -i setup.py
-
- python setup.py install --root="${pkgdir}"/ --optimize=1 --skip-build
-
- install -Dm644 LICENSE "${pkgdir}/usr/share/licenses/${pkgname}/LICENSE"
-
- pytorchpath="usr/lib/python$(get_pyver)/site-packages/torch"
- install -d "${pkgdir}/usr/lib"
-
- # put CMake files in correct place
- mv "${pkgdir}/${pytorchpath}/share/cmake" "${pkgdir}/usr/lib/cmake"
-
- # put C++ API in correct place
- mv "${pkgdir}/${pytorchpath}/include" "${pkgdir}/usr/include"
- mv "${pkgdir}/${pytorchpath}/lib"/*.so* "${pkgdir}/usr/lib/"
-
- # clean up duplicates
- # TODO: move towards direct shared library dependecy of:
- # c10, caffe2, libcpuinfo, CUDA RT, gloo, GTest, Intel MKL,
- # NVRTC, ONNX, protobuf, libthreadpool, QNNPACK
- rm -rf "${pkgdir}/usr/include/pybind11"
-
- # python module is hardcoded to look there at runtime
- ln -s /usr/include "${pkgdir}/${pytorchpath}/include"
- find "${pkgdir}"/usr/lib -type f -name "*.so*" -print0 | while read -rd $'\0' _lib; do
- ln -s ${_lib#"$pkgdir"} "${pkgdir}/${pytorchpath}/lib/"
- done
-}
-
-package_python-pytorch() {
- pkgdesc="${_pkgdesc} (with AVX2 CPU optimizations)"
- replaces=(python-pytorch-opt)
- cd "${srcdir}/${_pkgname}-${pkgver}"
- _package
-}
-
-package_python-pytorch-cuda() {
- pkgdesc="${_pkgdesc} (with CUDA and AVX2 CPU optimizations)"
- depends+=(cuda cudnn magma)
- replaces=(python-pytorch-opt-cuda)
- conflicts=(python-pytorch)
- provides=(python-pytorch)
-
- cd "${srcdir}/${_pkgname}-${pkgver}-cuda"
- _package
-}
-
-# vim:set ts=2 sw=2 et:
Copied: python-pytorch/repos/community-x86_64/PKGBUILD (from rev 1150466, python-pytorch/trunk/PKGBUILD)
===================================================================
--- PKGBUILD (rev 0)
+++ PKGBUILD 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -0,0 +1,294 @@
+# Maintainer: Sven-Hendrik Haase <svenstaro at archlinux.org>
+# Contributor: Stephen Zhang <zsrkmyn at gmail dot com>
+
+pkgbase=python-pytorch
+pkgname=("python-pytorch" "python-pytorch-cuda")
+_pkgname="pytorch" # upstream project name; names the checkout directory "${_pkgname}-${pkgver}"
+pkgver=1.11.0
+_pkgver=1.11.0 # upstream version; the source array checks out git tag "v$_pkgver"
+pkgrel=1
+_pkgdesc='Tensors and Dynamic neural networks in Python with strong GPU acceleration' # base text; each package_*() appends its variant suffix
+pkgdesc="${_pkgdesc}"
+arch=('x86_64')
+url="https://pytorch.org"
+license=('BSD')
+depends=('google-glog' 'gflags' 'opencv' 'openmp' 'nccl' 'pybind11' 'python' 'python-yaml' 'libuv'
+ 'python-numpy' 'protobuf' 'ffmpeg4.4' 'python-future' 'qt5-base' 'onednn' 'intel-mkl'
+ 'python-typing_extensions')
+makedepends=('python' 'python-setuptools' 'python-yaml' 'python-numpy' 'cmake' 'cuda'
+ 'cudnn' 'git' 'magma' 'ninja' 'pkgconfig' 'doxygen')
+source=("${_pkgname}-${pkgver}::git+https://github.com/pytorch/pytorch.git#tag=v$_pkgver"
+ # generated using parse-submodules
+ # "${pkgname}::https://github.com/pytorch/pytorch#commit=7cc129e60c"
+ "${pkgname}-pthreadpool::git+https://github.com/Maratyszcza/pthreadpool.git"
+ "${pkgname}-NNPACK::git+https://github.com/Maratyszcza/NNPACK.git"
+ "${pkgname}-ios-cmake::git+https://github.com/Yangqing/ios-cmake.git"
+ "${pkgname}-psimd::git+https://github.com/Maratyszcza/psimd.git"
+ "${pkgname}-FXdiv::git+https://github.com/Maratyszcza/FXdiv.git"
+ "${pkgname}-FP16::git+https://github.com/Maratyszcza/FP16.git"
+ "${pkgname}-cub::git+https://github.com/NVlabs/cub.git"
+ "${pkgname}-eigen-git-mirror::git+https://github.com/eigenteam/eigen-git-mirror.git"
+ "${pkgname}-cpuinfo::git+https://github.com/pytorch/cpuinfo.git"
+ "${pkgname}-enum34::git+https://github.com/PeachPy/enum34.git"
+ "${pkgname}-PeachPy::git+https://github.com/Maratyszcza/PeachPy.git"
+ "${pkgname}-gloo::git+https://github.com/facebookincubator/gloo"
+ "${pkgname}-foxi::git+https://github.com/houseroad/foxi.git"
+ "${pkgname}-ideep::git+https://github.com/intel/ideep"
+ "${pkgname}-benchmark::git+https://github.com/google/benchmark.git"
+ "${pkgname}-QNNPACK::git+https://github.com/pytorch/QNNPACK"
+ "${pkgname}-six::git+https://github.com/benjaminp/six.git"
+ "${pkgname}-ARM_NEON_2_x86_SSE::git+https://github.com/intel/ARM_NEON_2_x86_SSE.git"
+ "${pkgname}-sleef::git+https://github.com/shibatch/sleef"
+ "${pkgname}-fbjni::git+https://github.com/facebookincubator/fbjni.git"
+ "${pkgname}-gemmlowp::git+https://github.com/google/gemmlowp.git"
+ "${pkgname}-onnx-tensorrt::git+https://github.com/onnx/onnx-tensorrt"
+ "${pkgname}-pocketfft::git+https://github.com/mreineck/pocketfft"
+ "${pkgname}-cudnn-frontend::git+https://github.com/NVIDIA/cudnn-frontend.git"
+ "${pkgname}-nccl::git+https://github.com/NVIDIA/nccl"
+ "${pkgname}-googletest::git+https://github.com/google/googletest.git"
+ "${pkgname}-pybind11::git+https://github.com/pybind/pybind11.git"
+ "${pkgname}-breakpad::git+https://github.com/driazati/breakpad.git"
+ "${pkgname}-zstd::git+https://github.com/facebook/zstd.git"
+ "${pkgname}-tbb::git+https://github.com/01org/tbb"
+ "${pkgname}-fbgemm::git+https://github.com/pytorch/fbgemm"
+ "${pkgname}-tensorpipe::git+https://github.com/pytorch/tensorpipe.git"
+ "${pkgname}-kineto::git+https://github.com/pytorch/kineto"
+ "${pkgname}-onnx::git+https://github.com/onnx/onnx.git"
+ "${pkgname}-fmt::git+https://github.com/fmtlib/fmt.git"
+ "${pkgname}-protobuf::git+https://github.com/protocolbuffers/protobuf.git"
+ "${pkgname}-XNNPACK::git+https://github.com/google/XNNPACK.git"
+ "${pkgname}-flatbuffers::git+https://github.com/google/flatbuffers.git"
+ fix_include_system.patch
+ use-system-libuv.patch
+ fix-building-for-torchvision.patch
+ ffmpeg4.4.patch)
+sha256sums=('SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+ '557761502bbd994d9795bef46779e4b8c60ba0b45e7d60841f477d3b7f28a00a'
+ 'cd9ac4aaa9f946ac5eafc57cf66c5c16b3ea7ac8af32c2558fad0705411bb669'
+ '600bd6a4bbcec9f99ab815d82cee1c2875530b2b75f4010da5ba72ce9bf31aff'
+ '75001b59e76831b0c93a547f851cb980e00b0d8cc7b66fb507eaeac217dc6ff9')
+options=('!lto')
+
+get_pyver () { # Print the "major.minor" version of the python found in PATH (used to build the site-packages path)
+ python -c 'import sys; print(str(sys.version_info[0]) + "." + str(sys.version_info[1]))'
+}
+
+prepare() { # Wire up submodules, apply local patches, clone the tree for the CUDA build and export build settings
+ cd "${srcdir}/${_pkgname}-${pkgver}"
+
+ # Point each submodule at its clone under ${srcdir} (list generated using parse-submodules)
+ git submodule init
+
+ git config submodule."third_party/pybind11".url "${srcdir}/${pkgname}"-pybind11
+ git config submodule."third_party/cub".url "${srcdir}/${pkgname}"-cub
+ git config submodule."third_party/eigen".url "${srcdir}/${pkgname}"-eigen-git-mirror
+ git config submodule."third_party/googletest".url "${srcdir}/${pkgname}"-googletest
+ git config submodule."third_party/benchmark".url "${srcdir}/${pkgname}"-benchmark
+ git config submodule."third_party/protobuf".url "${srcdir}/${pkgname}"-protobuf
+ git config submodule."third_party/ios-cmake".url "${srcdir}/${pkgname}"-ios-cmake
+ git config submodule."third_party/NNPACK".url "${srcdir}/${pkgname}"-NNPACK
+ git config submodule."third_party/gloo".url "${srcdir}/${pkgname}"-gloo
+ git config submodule."third_party/NNPACK_deps/pthreadpool".url "${srcdir}/${pkgname}"-pthreadpool
+ git config submodule."third_party/NNPACK_deps/FXdiv".url "${srcdir}/${pkgname}"-FXdiv
+ git config submodule."third_party/NNPACK_deps/FP16".url "${srcdir}/${pkgname}"-FP16
+ git config submodule."third_party/NNPACK_deps/psimd".url "${srcdir}/${pkgname}"-psimd
+ git config submodule."third_party/zstd".url "${srcdir}/${pkgname}"-zstd
+ git config submodule."third_party/cpuinfo".url "${srcdir}/${pkgname}"-cpuinfo
+ git config submodule."third_party/python-enum".url "${srcdir}/${pkgname}"-enum34
+ git config submodule."third_party/python-peachpy".url "${srcdir}/${pkgname}"-PeachPy
+ git config submodule."third_party/python-six".url "${srcdir}/${pkgname}"-six
+ git config submodule."third_party/onnx".url "${srcdir}/${pkgname}"-onnx
+ git config submodule."third_party/onnx-tensorrt".url "${srcdir}/${pkgname}"-onnx-tensorrt
+ git config submodule."third_party/sleef".url "${srcdir}/${pkgname}"-sleef
+ git config submodule."third_party/ideep".url "${srcdir}/${pkgname}"-ideep
+ git config submodule."third_party/nccl/nccl".url "${srcdir}/${pkgname}"-nccl
+ git config submodule."third_party/gemmlowp/gemmlowp".url "${srcdir}/${pkgname}"-gemmlowp
+ git config submodule."third_party/QNNPACK".url "${srcdir}/${pkgname}"-QNNPACK
+ git config submodule."third_party/neon2sse".url "${srcdir}/${pkgname}"-ARM_NEON_2_x86_SSE
+ git config submodule."third_party/fbgemm".url "${srcdir}/${pkgname}"-fbgemm
+ git config submodule."third_party/foxi".url "${srcdir}/${pkgname}"-foxi
+ git config submodule."third_party/tbb".url "${srcdir}/${pkgname}"-tbb
+ git config submodule."android/libs/fbjni".url "${srcdir}/${pkgname}"-fbjni
+ git config submodule."third_party/XNNPACK".url "${srcdir}/${pkgname}"-XNNPACK
+ git config submodule."third_party/fmt".url "${srcdir}/${pkgname}"-fmt
+ git config submodule."third_party/tensorpipe".url "${srcdir}/${pkgname}"-tensorpipe
+ git config submodule."third_party/cudnn_frontend".url "${srcdir}/${pkgname}"-cudnn-frontend
+ git config submodule."third_party/kineto".url "${srcdir}/${pkgname}"-kineto
+ git config submodule."third_party/pocketfft".url "${srcdir}/${pkgname}"-pocketfft
+ git config submodule."third_party/breakpad".url "${srcdir}/${pkgname}"-breakpad
+ git config submodule."third_party/flatbuffers".url "${srcdir}/${pkgname}"-flatbuffers
+
+ git submodule update --init --recursive
+
+ # Pass system include dirs with -I instead of -isystem, see https://bugs.archlinux.org/task/64981
+ patch -N torch/utils/cpp_extension.py "${srcdir}"/fix_include_system.patch
+
+ # Use system libuv (turns TP_BUILD_LIBUV off)
+ patch -Np1 -i "${srcdir}"/use-system-libuv.patch
+
+ # fix https://github.com/pytorch/vision/issues/3695
+ patch -Np1 -i "${srcdir}/fix-building-for-torchvision.patch"
+
+ # remove local nccl (currently disabled: both commands below are commented out)
+ # rm -rf third_party/nccl/nccl
+ # also remove path from nccl module, so it's not checked
+ # sed -e '/path = third_party\/nccl\/nccl/d' -i ./.gitmodules
+
+ # fix build with google-glog 0.5 https://github.com/pytorch/pytorch/issues/58054
+ sed -e '/namespace glog_internal_namespace_/d' -e 's|::glog_internal_namespace_||' -i c10/util/Logging.cpp
+
+ # build against ffmpeg4.4 (search only the ffmpeg4.4 include and library directories)
+ patch -Np1 -i "${srcdir}/ffmpeg4.4.patch"
+
+ # Backport https://github.com/driazati/breakpad/commit/605c51ed96ad44b34c457bbca320e74e194c317e for fixing build with glibc > 2.33
+ git -C third_party/breakpad cherry-pick -n 605c51ed96ad44b34c457bbca320e74e194c317e
+
+ cd "${srcdir}"
+
+ cp -r "${_pkgname}-${pkgver}" "${_pkgname}-${pkgver}-cuda" # second, already patched tree for the CUDA build
+ # NOTE(review): nothing in prepare() reads the exports below; presumably build() inherits them from here - confirm this still holds when prepare() is skipped
+ export VERBOSE=1
+ export PYTORCH_BUILD_VERSION="${pkgver}"
+ export PYTORCH_BUILD_NUMBER=1
+
+ # Check tools/setup_helpers/cmake.py, setup.py and CMakeLists.txt for a list of flags that can be set via env vars.
+ export ATEN_NO_TEST=ON # do not build ATen tests
+ export USE_MKLDNN=ON
+ export BUILD_CUSTOM_PROTOBUF=OFF
+ export BUILD_CAFFE2=ON
+ export BUILD_CAFFE2_OPS=ON
+ # export BUILD_SHARED_LIBS=OFF
+ export USE_FFMPEG=ON
+ export USE_GFLAGS=ON
+ export USE_GLOG=ON
+ export BUILD_BINARY=ON
+ export USE_OBSERVERS=ON
+ export USE_OPENCV=ON
+ # export USE_SYSTEM_LIBS=ON # experimental, not all libs present in repos
+ export USE_SYSTEM_NCCL=ON
+ export NCCL_VERSION=$(pkg-config nccl --modversion)
+ export NCCL_VER_CODE=$(sed -n 's/^#define NCCL_VERSION_CODE\s*\(.*\).*/\1/p' /usr/include/nccl.h)
+ # export BUILD_SPLIT_CUDA=ON # modern preferred build, but splits libs and symbols, ABI break
+ # export USE_FAST_NVCC=ON # parallel build with nvcc, spawns too many processes
+ export USE_CUPTI_SO=ON # make sure cupti.so is used as shared lib
+ export CUDAHOSTCXX=/usr/bin/g++
+ export CUDA_HOST_COMPILER="${CUDAHOSTCXX}"
+ export CUDA_HOME=/opt/cuda
+ # hide build-time CUDA devices
+ export CUDA_VISIBLE_DEVICES=""
+ export CUDNN_LIB_DIR=/usr/lib
+ export CUDNN_INCLUDE_DIR=/usr/include
+ export TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
+ export TORCH_CUDA_ARCH_LIST="5.2;6.0;6.2;7.0;7.2;7.5;8.0;8.6;8.6+PTX" #include latest PTX for future compat
+ export OVERRIDE_TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"
+}
+
+build() { # Build twice: first the plain tree with CUDA disabled, then the "-cuda" copy with CUDA and cuDNN enabled
+ echo "Building without cuda and with non-x86-64 optimizations"
+ export USE_CUDA=0
+ export USE_CUDNN=0
+ cd "${srcdir}/${_pkgname}-${pkgver}"
+ echo "add_definitions(-march=haswell)" >> cmake/MiscCheck.cmake
+ # this horrible hack is necessary because the current release
+ # ships inconsistent CMake which tries to build objects before
+ # their dependencies; if the first attempt fails, run the build a second time
+ python setup.py build || python setup.py build
+
+
+ echo "Building with cuda and with non-x86-64 optimizations"
+ export USE_CUDA=1
+ export USE_CUDNN=1
+ cd "${srcdir}/${_pkgname}-${pkgver}-cuda"
+ echo "add_definitions(-march=haswell)" >> cmake/MiscCheck.cmake
+ # same horrible hack as above: retry once if the first build attempt fails
+ python setup.py build || python setup.py build
+}
+
+_package() { # Shared install steps; expects the caller to have cd'ed into the build tree to install from
+ # Prevent setup.py from re-running CMake and rebuilding
+ sed -e 's/RUN_BUILD_DEPS = True/RUN_BUILD_DEPS = False/g' -i setup.py
+
+ python setup.py install --root="${pkgdir}"/ --optimize=1 --skip-build
+
+ install -Dm644 LICENSE "${pkgdir}/usr/share/licenses/${pkgname}/LICENSE"
+
+ pytorchpath="usr/lib/python$(get_pyver)/site-packages/torch"
+ install -d "${pkgdir}/usr/lib"
+
+ # put CMake files in correct place
+ mv "${pkgdir}/${pytorchpath}/share/cmake" "${pkgdir}/usr/lib/cmake"
+
+ # put C++ API in correct place
+ mv "${pkgdir}/${pytorchpath}/include" "${pkgdir}/usr/include"
+ mv "${pkgdir}/${pytorchpath}/lib"/*.so* "${pkgdir}/usr/lib/"
+
+ # clean up duplicates
+ # TODO: move towards direct shared library dependency of:
+ # c10, caffe2, libcpuinfo, CUDA RT, gloo, GTest, Intel MKL,
+ # NVRTC, ONNX, protobuf, libthreadpool, QNNPACK
+ rm -rf "${pkgdir}/usr/include/pybind11"
+
+ # python module is hardcoded to look in its own include/ and lib/ at runtime, so link the relocated files back
+ ln -s /usr/include "${pkgdir}/${pytorchpath}/include"
+ find "${pkgdir}"/usr/lib -type f -name "*.so*" -print0 | while IFS= read -rd $'\0' _lib; do # IFS= keeps each NUL-delimited path verbatim
+ ln -s "${_lib#"$pkgdir"}" "${pkgdir}/${pytorchpath}/lib/" # link target is the path with the ${pkgdir} prefix stripped; quoted so it stays one argument
+ done
+}
+
+package_python-pytorch() { # Package the tree that build() compiled with USE_CUDA=0
+ pkgdesc="${_pkgdesc} (with AVX2 CPU optimizations)"
+ replaces=(python-pytorch-opt)
+ cd "${srcdir}/${_pkgname}-${pkgver}" # _package installs from the current directory
+ _package
+}
+
+package_python-pytorch-cuda() { # Package the "-cuda" tree that build() compiled with USE_CUDA=1
+ pkgdesc="${_pkgdesc} (with CUDA and AVX2 CPU optimizations)"
+ depends+=(cuda cudnn magma) # runtime CUDA libraries on top of the shared depends array
+ replaces=(python-pytorch-opt-cuda)
+ conflicts=(python-pytorch) # conflicts with, and provides, the non-CUDA package name
+ provides=(python-pytorch)
+
+ cd "${srcdir}/${_pkgname}-${pkgver}-cuda" # _package installs from the current directory
+ _package
+}
+
+# vim:set ts=2 sw=2 et:
Deleted: ffmpeg4.4.patch
===================================================================
--- ffmpeg4.4.patch 2022-03-13 06:01:35 UTC (rev 1150466)
+++ ffmpeg4.4.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -1,51 +0,0 @@
-diff --git a/cmake/Modules/FindFFmpeg.cmake b/cmake/Modules/FindFFmpeg.cmake
-index 04437562ee..dc8fe4aa5c 100644
---- a/cmake/Modules/FindFFmpeg.cmake
-+++ b/cmake/Modules/FindFFmpeg.cmake
-@@ -14,34 +14,40 @@ else (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
-
- find_path(FFMPEG_AVCODEC_INCLUDE_DIR
- NAMES libavcodec/avcodec.h
-- PATHS ${_FFMPEG_AVCODEC_INCLUDE_DIRS} /usr/include /usr/local/include /opt/local/include /sw/include
-+ PATHS /usr/include/ffmpeg4.4 /usr/local/include /opt/local/include /sw/include
- PATH_SUFFIXES ffmpeg libav
-+ NO_DEFAULT_PATH
- )
-
- find_library(FFMPEG_LIBAVCODEC
- NAMES avcodec
-- PATHS ${_FFMPEG_AVCODEC_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
-+ PATHS /usr/lib/ffmpeg4.4 /usr/local/lib /opt/local/lib /sw/lib
-+ NO_DEFAULT_PATH
- )
-
- find_library(FFMPEG_LIBAVFORMAT
- NAMES avformat
-- PATHS ${_FFMPEG_AVFORMAT_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
-+ PATHS /usr/lib/ffmpeg4.4 /usr/local/lib /opt/local/lib /sw/lib
-+ NO_DEFAULT_PATH
- )
-
- find_library(FFMPEG_LIBAVUTIL
- NAMES avutil
-- PATHS ${_FFMPEG_AVUTIL_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
-+ PATHS /usr/lib/ffmpeg4.4 /usr/local/lib /opt/local/lib /sw/lib
-+ NO_DEFAULT_PATH
- )
-
-
- find_library(FFMPEG_LIBSWSCALE
- NAMES swscale
-- PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
-+ PATHS /usr/lib/ffmpeg4.4 /usr/local/lib /opt/local/lib /sw/lib
-+ NO_DEFAULT_PATH
- )
-
- find_library(FFMPEG_LIBSWRESAMPLE
- NAMES swresample
-- PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
-+ PATHS /usr/lib/ffmpeg4.4 /usr/local/lib /opt/local/lib /sw/lib
-+ NO_DEFAULT_PATH
- )
-
- if (FFMPEG_LIBAVCODEC AND FFMPEG_LIBAVFORMAT)
Copied: python-pytorch/repos/community-x86_64/ffmpeg4.4.patch (from rev 1150466, python-pytorch/trunk/ffmpeg4.4.patch)
===================================================================
--- ffmpeg4.4.patch (rev 0)
+++ ffmpeg4.4.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -0,0 +1,51 @@
+diff --git a/cmake/Modules/FindFFmpeg.cmake b/cmake/Modules/FindFFmpeg.cmake
+index 04437562ee..dc8fe4aa5c 100644
+--- a/cmake/Modules/FindFFmpeg.cmake
++++ b/cmake/Modules/FindFFmpeg.cmake
+@@ -14,34 +14,40 @@ else (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
+
+ find_path(FFMPEG_AVCODEC_INCLUDE_DIR
+ NAMES libavcodec/avcodec.h
+- PATHS ${_FFMPEG_AVCODEC_INCLUDE_DIRS} /usr/include /usr/local/include /opt/local/include /sw/include
++ PATHS /usr/include/ffmpeg4.4 /usr/local/include /opt/local/include /sw/include
+ PATH_SUFFIXES ffmpeg libav
++ NO_DEFAULT_PATH
+ )
+
+ find_library(FFMPEG_LIBAVCODEC
+ NAMES avcodec
+- PATHS ${_FFMPEG_AVCODEC_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
++ PATHS /usr/lib/ffmpeg4.4 /usr/local/lib /opt/local/lib /sw/lib
++ NO_DEFAULT_PATH
+ )
+
+ find_library(FFMPEG_LIBAVFORMAT
+ NAMES avformat
+- PATHS ${_FFMPEG_AVFORMAT_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
++ PATHS /usr/lib/ffmpeg4.4 /usr/local/lib /opt/local/lib /sw/lib
++ NO_DEFAULT_PATH
+ )
+
+ find_library(FFMPEG_LIBAVUTIL
+ NAMES avutil
+- PATHS ${_FFMPEG_AVUTIL_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
++ PATHS /usr/lib/ffmpeg4.4 /usr/local/lib /opt/local/lib /sw/lib
++ NO_DEFAULT_PATH
+ )
+
+
+ find_library(FFMPEG_LIBSWSCALE
+ NAMES swscale
+- PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
++ PATHS /usr/lib/ffmpeg4.4 /usr/local/lib /opt/local/lib /sw/lib
++ NO_DEFAULT_PATH
+ )
+
+ find_library(FFMPEG_LIBSWRESAMPLE
+ NAMES swresample
+- PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
++ PATHS /usr/lib/ffmpeg4.4 /usr/local/lib /opt/local/lib /sw/lib
++ NO_DEFAULT_PATH
+ )
+
+ if (FFMPEG_LIBAVCODEC AND FFMPEG_LIBAVFORMAT)
Deleted: fix-building-for-torchvision.patch
===================================================================
--- fix-building-for-torchvision.patch 2022-03-13 06:01:35 UTC (rev 1150466)
+++ fix-building-for-torchvision.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -1,25 +0,0 @@
-From 011495d8045c44527fbd7796ce860618120ae127 Mon Sep 17 00:00:00 2001
-From: Butui Hu <hot123tea123 at gmail.com>
-Date: Fri, 30 Apr 2021 11:36:30 +0800
-Subject: [PATCH] fix building torchvision
-
----
- aten/src/ATen/core/op_registration/op_allowlist.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/aten/src/ATen/core/op_registration/op_allowlist.h b/aten/src/ATen/core/op_registration/op_allowlist.h
-index f93462bb2cf..12903d1cc09 100644
---- a/aten/src/ATen/core/op_registration/op_allowlist.h
-+++ b/aten/src/ATen/core/op_registration/op_allowlist.h
-@@ -59,7 +59,7 @@ constexpr bool op_allowlist_contains(string_view allowlist, string_view item) {
- // Returns true iff the given op name is on the allowlist
- // and should be registered
- constexpr bool op_allowlist_check(string_view op_name) {
-- assert(op_name.find("::") != string_view::npos);
-+// assert(op_name.find("::") != string_view::npos);
- // Use assert() instead of throw() due to a gcc bug. See:
- // https://stackoverflow.com/questions/34280729/throw-in-constexpr-function
- // https://github.com/fmtlib/fmt/issues/682
---
-2.31.1
-
Copied: python-pytorch/repos/community-x86_64/fix-building-for-torchvision.patch (from rev 1150466, python-pytorch/trunk/fix-building-for-torchvision.patch)
===================================================================
--- fix-building-for-torchvision.patch (rev 0)
+++ fix-building-for-torchvision.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -0,0 +1,25 @@
+From 011495d8045c44527fbd7796ce860618120ae127 Mon Sep 17 00:00:00 2001
+From: Butui Hu <hot123tea123 at gmail.com>
+Date: Fri, 30 Apr 2021 11:36:30 +0800
+Subject: [PATCH] fix building torchvision
+
+---
+ aten/src/ATen/core/op_registration/op_allowlist.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/aten/src/ATen/core/op_registration/op_allowlist.h b/aten/src/ATen/core/op_registration/op_allowlist.h
+index f93462bb2cf..12903d1cc09 100644
+--- a/aten/src/ATen/core/op_registration/op_allowlist.h
++++ b/aten/src/ATen/core/op_registration/op_allowlist.h
+@@ -59,7 +59,7 @@ constexpr bool op_allowlist_contains(string_view allowlist, string_view item) {
+ // Returns true iff the given op name is on the allowlist
+ // and should be registered
+ constexpr bool op_allowlist_check(string_view op_name) {
+- assert(op_name.find("::") != string_view::npos);
++// assert(op_name.find("::") != string_view::npos);
+ // Use assert() instead of throw() due to a gcc bug. See:
+ // https://stackoverflow.com/questions/34280729/throw-in-constexpr-function
+ // https://github.com/fmtlib/fmt/issues/682
+--
+2.31.1
+
Deleted: fix-jit-frontend-nullptr-deref.patch
===================================================================
--- fix-jit-frontend-nullptr-deref.patch 2022-03-13 06:01:35 UTC (rev 1150466)
+++ fix-jit-frontend-nullptr-deref.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -1,12 +0,0 @@
-diff --color -aur pytorch-1.10.0-old/torch/csrc/jit/frontend/ir_emitter.cpp pytorch-1.10.0-new/torch/csrc/jit/frontend/ir_emitter.cpp
---- pytorch-1.10.0-old/torch/csrc/jit/frontend/ir_emitter.cpp 2021-10-26 01:41:27.453059792 +0300
-+++ pytorch-1.10.0-new/torch/csrc/jit/frontend/ir_emitter.cpp 2021-10-26 02:00:09.783068924 +0300
-@@ -1678,7 +1678,7 @@
- << "Union type annotation `" << type_hint->repr_str()
- << "` can hold " << vector_repr.str() << ", but none of "
- << "those list types can hold the types of the given dict"
-- << " elements, which were unified to " << candidate->repr_str();
-+ << " elements, which were unified to " << (*unified_value_type)->repr_str();
- } else {
- refined_type_hint = candidate;
- }
Deleted: fix_c10.patch
===================================================================
--- fix_c10.patch 2022-03-13 06:01:35 UTC (rev 1150466)
+++ fix_c10.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -1,12 +0,0 @@
-diff --color -aur pytorch-1.9.0-old/c10/core/TensorImpl.h pytorch-1.9.0-new/c10/core/TensorImpl.h
---- pytorch-1.9.0-old/c10/core/TensorImpl.h 2021-08-17 19:33:40.324974399 +0300
-+++ pytorch-1.9.0-new/c10/core/TensorImpl.h 2021-08-18 01:25:00.005901707 +0300
-@@ -2177,7 +2177,7 @@
- // DispatchKeySet
- //
- static_assert(
-- sizeof(void*) != sizeof(int64_t) || // if 64-bit...
-+ sizeof(void*) <= sizeof(int64_t) || // if 64-bit...
- sizeof(TensorImpl) == sizeof(int64_t) * 24,
- "You changed the size of TensorImpl on 64-bit arch."
- "See Note [TensorImpl size constraints] on how to proceed.");
Deleted: fix_include_system.patch
===================================================================
--- fix_include_system.patch 2022-03-13 06:01:35 UTC (rev 1150466)
+++ fix_include_system.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -1,13 +0,0 @@
-diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
-index ca673033e1..c79ce8d37b 100644
---- a/torch/utils/cpp_extension.py
-+++ b/torch/utils/cpp_extension.py
-@@ -1760,7 +1760,7 @@ def _write_ninja_file_to_build_library(path,
- common_cflags.append(f'-DPYBIND11_{pname}=\\"{pval}\\"')
-
- common_cflags += [f'-I{include}' for include in user_includes]
-- common_cflags += [f'-isystem {include}' for include in system_includes]
-+ common_cflags += [f'-I{include}' for include in system_includes]
-
- common_cflags += ['-D_GLIBCXX_USE_CXX11_ABI=' + str(int(torch._C._GLIBCXX_USE_CXX11_ABI))]
-
Copied: python-pytorch/repos/community-x86_64/fix_include_system.patch (from rev 1150466, python-pytorch/trunk/fix_include_system.patch)
===================================================================
--- fix_include_system.patch (rev 0)
+++ fix_include_system.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -0,0 +1,13 @@
+diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
+index ca673033e1..c79ce8d37b 100644
+--- a/torch/utils/cpp_extension.py
++++ b/torch/utils/cpp_extension.py
+@@ -1760,7 +1760,7 @@ def _write_ninja_file_to_build_library(path,
+ common_cflags.append(f'-DPYBIND11_{pname}=\\"{pval}\\"')
+
+ common_cflags += [f'-I{include}' for include in user_includes]
+- common_cflags += [f'-isystem {include}' for include in system_includes]
++ common_cflags += [f'-I{include}' for include in system_includes]
+
+ common_cflags += ['-D_GLIBCXX_USE_CXX11_ABI=' + str(int(torch._C._GLIBCXX_USE_CXX11_ABI))]
+
Deleted: fix_old_nnapi_lite_interpreter_config.patch
===================================================================
--- fix_old_nnapi_lite_interpreter_config.patch 2022-03-13 06:01:35 UTC (rev 1150466)
+++ fix_old_nnapi_lite_interpreter_config.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -1,33 +0,0 @@
-# Relevant discussion: https://discuss.pytorch.org/t/about-build-android-sh-lite-and-nnapi/133581
-diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
-index baf9666f11..19f9a78443 100644
---- a/aten/src/ATen/CMakeLists.txt
-+++ b/aten/src/ATen/CMakeLists.txt
-@@ -130,7 +130,7 @@ add_subdirectory(quantized)
- add_subdirectory(nnapi)
-
- if(BUILD_LITE_INTERPRETER)
-- set(all_cpu_cpp ${generated_cpp} ${core_generated_cpp} ${cpu_kernel_cpp})
-+ set(all_cpu_cpp ${generated_cpp} ${core_generated_cpp} ${ATen_NNAPI_SRCS} ${cpu_kernel_cpp})
- append_filelist("jit_core_sources" all_cpu_cpp)
- append_filelist("aten_cpu_source_non_codegen_list" all_cpu_cpp)
- append_filelist("aten_native_source_non_codegen_list" all_cpu_cpp)
-diff --git a/scripts/build_android.sh b/scripts/build_android.sh
-index daad46e8fb..211f5bb429 100755
---- a/scripts/build_android.sh
-+++ b/scripts/build_android.sh
-@@ -147,7 +147,11 @@ if [ "${ANDROID_DEBUG_SYMBOLS:-}" == '1' ]; then
- fi
-
- if [ -n "${USE_VULKAN}" ]; then
-- CMAKE_ARGS+=("-DUSE_VULKAN=ON")
-+ CMAKE_ARGS+=("-DUSE_VULKAN=${USE_VULKAN}")
-+fi
-+
-+if [ -n "${USE_NNAPI}" ]; then
-+ CMAKE_ARGS+=("-DUSE_NNAPI=${USE_NNAPI}")
- fi
-
- # Use-specified CMake arguments go last to allow overridding defaults
-
-
Deleted: test.py
===================================================================
--- test.py 2022-03-13 06:01:35 UTC (rev 1150466)
+++ test.py 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -1,7 +0,0 @@
-#!/usr/bin/env python
-
-import torch
-
-d = torch.device('cuda')
-a = torch.rand(1, 2).to(d)
-print(a + 0)
Copied: python-pytorch/repos/community-x86_64/test.py (from rev 1150466, python-pytorch/trunk/test.py)
===================================================================
--- test.py (rev 0)
+++ test.py 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+
+import torch
+
+d = torch.device('cuda')
+a = torch.rand(1, 2).to(d)
+print(a + 0)
Deleted: use-system-libuv.patch
===================================================================
--- use-system-libuv.patch 2022-03-13 06:01:35 UTC (rev 1150466)
+++ use-system-libuv.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -1,13 +0,0 @@
-diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
-index 06464e799a..93410bc210 100644
---- a/cmake/Dependencies.cmake
-+++ b/cmake/Dependencies.cmake
-@@ -1346,7 +1346,7 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
- set(TP_USE_CUDA ON CACHE BOOL "" FORCE)
- set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE)
- endif()
-- set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE)
-+ set(TP_BUILD_LIBUV OFF CACHE BOOL "" FORCE)
- set(TP_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE)
-
- add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe)
Copied: python-pytorch/repos/community-x86_64/use-system-libuv.patch (from rev 1150466, python-pytorch/trunk/use-system-libuv.patch)
===================================================================
--- use-system-libuv.patch (rev 0)
+++ use-system-libuv.patch 2022-03-13 06:01:58 UTC (rev 1150467)
@@ -0,0 +1,13 @@
+diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
+index 06464e799a..93410bc210 100644
+--- a/cmake/Dependencies.cmake
++++ b/cmake/Dependencies.cmake
+@@ -1346,7 +1346,7 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
+ set(TP_USE_CUDA ON CACHE BOOL "" FORCE)
+ set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE)
+ endif()
+- set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE)
++ set(TP_BUILD_LIBUV OFF CACHE BOOL "" FORCE)
+ set(TP_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE)
+
+ add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe)
More information about the arch-commits
mailing list