11 #ifndef CUBBYFLOW_CUDA_ARRAY_IMPL_HPP 12 #define CUBBYFLOW_CUDA_ARRAY_IMPL_HPP 14 #ifdef CUBBYFLOW_USE_CUDA 21 template <
typename T,
size_t N,
size_t I>
22 struct CUDABlockCopyHelper
24 template <
typename... RemainingIndices>
25 CUBBYFLOW_CUDA_HOST_DEVICE
static void Call(CUDAArrayView<const T, N> src,
26 CUDAStdArray<size_t, N> size,
27 CUDAArrayView<T, N> dst,
28 RemainingIndices... indices)
30 for (
size_t i = 0; i < size[I - 1]; ++i)
32 CUDABlockCopyHelper<T, N, I - 1>::Call(src, size, dst, i,
38 template <
typename T,
size_t N>
39 struct CUDABlockCopyHelper<T, N, 1>
41 template <
typename... RemainingIndices>
42 CUBBYFLOW_CUDA_HOST_DEVICE
static void Call(CUDAArrayView<const T, N> src,
43 CUDAStdArray<size_t, N> size,
44 CUDAArrayView<T, N> dst,
45 RemainingIndices... indices)
47 for (
size_t i = 0; i < size[0]; ++i)
49 dst(i, indices...) = src(i, indices...);
54 template <
typename T,
size_t N>
55 __global__
void CUDABlockCopyKernelN(CUDAArrayView<const T, N> src,
56 CUDAStdArray<size_t, N> size,
57 CUDAArrayView<T, N> dst)
59 size_t i = blockIdx.x * blockDim.x + threadIdx.x;
62 CUDABlockCopyHelper<T, N, N - 1>::Call(src, size, dst, i);
67 __global__
void CUDABlockCopyKernel1(CUDAArrayView<const T, 1> src,
68 CUDAStdArray<size_t, 1> size,
69 CUDAArrayView<T, 1> dst)
71 size_t i = blockIdx.x * blockDim.x + threadIdx.x;
78 template <
typename T,
size_t N>
81 static void Call(CUDAArrayView<const T, N> src,
82 CUDAStdArray<size_t, N> size, CUDAArrayView<T, N> dst)
90 unsigned int numBlocks, numThreads;
91 CUDAComputeGridSize((
unsigned int)size[N - 1], 256, numBlocks,
93 CUDABlockCopyKernelN<<<numBlocks, numThreads>>>(src, size, dst);
94 CUBBYFLOW_CUDA_CHECK_LAST_ERROR(
95 "Failed executing CUDABlockCopyKernelN");
100 struct CUDABlockCopy<T, 1>
102 static void Call(CUDAArrayView<const T, 1> src,
103 CUDAStdArray<size_t, 1> size, CUDAArrayView<T, 1> dst)
111 unsigned int numBlocks, numThreads;
112 CUDAComputeGridSize((
unsigned int)size[0], 256, numBlocks, numThreads);
113 CUDABlockCopyKernel1<<<numBlocks, numThreads>>>(src, size, dst);
114 CUBBYFLOW_CUDA_CHECK_LAST_ERROR(
115 "Failed executing CUDABlockCopyKernel1");
121 template <
typename T,
size_t N>
122 CUDAArray<T, N>::CUDAArray() : Base()
127 template <
typename T,
size_t N>
128 CUDAArray<T, N>::CUDAArray(
const CUDAStdArray<size_t, N>& size,
134 for (
size_t i = 1; i < N; ++i)
139 m_data.Resize(l, initVal);
144 template <
typename T,
size_t N>
145 template <
typename... Args>
146 CUDAArray<T, N>::CUDAArray(
size_t nx, Args... args) : CUDAArray{}
148 Vector<size_t, N> newSizeVec;
152 CUDAStdArray<size_t, N> newSize(newSizeVec);
153 CUDAArray newArray(newSize, initVal);
154 *
this = std::move(newArray);
157 template <
typename T,
size_t N>
158 CUDAArray<T, N>::CUDAArray(NestedInitializerListsT<T, N> lst) : CUDAArray{}
160 Vector<size_t, N> newSize;
163 Array<T, N> newCpuArray(newSize);
165 CopyFrom(newCpuArray);
169 template <
typename T,
size_t N>
171 CUDAArray<T, N>::CUDAArray(
172 const std::enable_if_t<(M == 1), std::vector<T>>& vec)
178 template <
typename T,
size_t N>
179 template <
typename OtherDerived>
180 CUDAArray<T, N>::CUDAArray(
const ArrayBase<T, N, OtherDerived>& other)
186 template <
typename T,
size_t N>
187 template <
typename OtherDerived>
188 CUDAArray<T, N>::CUDAArray(
const CUDAArrayBase<T, N, OtherDerived>& other)
194 template <
typename T,
size_t N>
195 CUDAArray<T, N>::CUDAArray(
const CUDAArray& other) : CUDAArray{}
200 template <
typename T,
size_t N>
201 CUDAArray<T, N>::CUDAArray(CUDAArray&& other) noexcept : CUDAArray{}
203 *
this = std::move(other);
206 template <
typename T,
size_t N>
208 CUDAArray<T, N>& CUDAArray<T, N>::operator=(
209 const std::enable_if_t<(M == 1), std::vector<T>>& vec)
215 template <
typename T,
size_t N>
216 template <
typename OtherDerived>
217 CUDAArray<T, N>& CUDAArray<T, N>::operator=(
218 const ArrayBase<T, N, OtherDerived>& other)
224 template <
typename T,
size_t N>
225 template <
typename OtherDerived>
226 CUDAArray<T, N>& CUDAArray<T, N>::operator=(
227 const ArrayBase<const T, N, OtherDerived>& other)
233 template <
typename T,
size_t N>
234 template <
typename OtherDerived>
235 CUDAArray<T, N>& CUDAArray<T, N>::operator=(
236 const CUDAArrayBase<T, N, OtherDerived>& other)
242 template <
typename T,
size_t N>
243 template <
typename OtherDerived>
244 CUDAArray<T, N>& CUDAArray<T, N>::operator=(
245 const CUDAArrayBase<const T, N, OtherDerived>& other)
251 template <
typename T,
size_t N>
252 CUDAArray<T, N>& CUDAArray<T, N>::operator=(
const CUDAArray& other)
254 m_data = other.m_data;
259 template <
typename T,
size_t N>
260 CUDAArray<T, N>& CUDAArray<T, N>::operator=(CUDAArray&& other) noexcept
267 template <
typename T,
size_t N>
268 template <
typename A,
size_t M>
269 std::enable_if_t<(M == 1), void> CUDAArray<T, N>::CopyFrom(
270 const std::vector<T, A>& vec)
272 CUDAArray newArray(vec.size());
273 newArray.m_data.CopyFrom(vec);
274 newArray.SetPtrAndSize(newArray.m_data.data(), newArray.Size());
275 *
this = std::move(newArray);
278 template <
typename T,
size_t N>
279 template <
typename OtherDerived>
280 void CUDAArray<T, N>::CopyFrom(
const ArrayBase<T, N, OtherDerived>& other)
282 CUDAArray newArray(other.Size());
283 CUDACopyHostToDevice(other.data(), other.Length(), newArray.data());
284 *
this = std::move(newArray);
287 template <
typename T,
size_t N>
288 template <
typename OtherDerived>
289 void CUDAArray<T, N>::CopyFrom(
const ArrayBase<const T, N, OtherDerived>& other)
291 CUDAArray newArray(other.Size());
292 CUDACopyHostToDevice(other.data(), other.Length(), newArray.data());
293 *
this = std::move(newArray);
296 template <
typename T,
size_t N>
297 template <
typename OtherDerived>
298 void CUDAArray<T, N>::CopyFrom(
const CUDAArrayBase<T, N, OtherDerived>& other)
300 CUDAArray newArray(other.Size());
301 CUDACopyDeviceToDevice(other.data(), other.Length(), newArray.data());
302 *
this = std::move(newArray);
305 template <
typename T,
size_t N>
306 template <
typename OtherDerived>
307 void CUDAArray<T, N>::CopyFrom(
308 const CUDAArrayBase<const T, N, OtherDerived>& other)
310 CUDAArray newArray(other.Size());
311 CUDACopyDeviceToDevice(other.data(), other.Length(), newArray.data());
312 *
this = std::move(newArray);
315 template <
typename T,
size_t N>
316 template <
typename A,
size_t M>
317 std::enable_if_t<(M == 1), void> CUDAArray<T, N>::CopyTo(std::vector<T, A>& vec)
320 CUDACopyDeviceToHost(
data(),
Length(), vec.data());
323 template <
typename T,
size_t N>
324 void CUDAArray<T, N>::CopyTo(Array<T, N>& other)
326 other.Resize(
m_size.ToVector());
327 CUDACopyDeviceToHost(
data(),
Length(), other.data());
330 template <
typename T,
size_t N>
331 void CUDAArray<T, N>::CopyTo(ArrayView<T, N>& other)
333 assert(
m_size.ToVector() == other.Size());
334 CUDACopyDeviceToHost(
data(),
Length(), other.data());
337 template <
typename T,
size_t N>
338 void CUDAArray<T, N>::CopyTo(CUDAArray<T, N>& other)
340 other.Resize(
m_size.ToVector());
341 CUDACopyDeviceToDevice(
data(),
Length(), other.data());
344 template <
typename T,
size_t N>
345 void CUDAArray<T, N>::CopyTo(CUDAArrayView<T, N>& other)
347 assert(
Length() == other.Length());
348 CUDACopyDeviceToDevice(
data(),
Length(), other.data());
351 template <
typename T,
size_t N>
358 template <
typename T,
size_t N>
359 void CUDAArray<T, N>::Resize(CUDAStdArray<size_t, N> newSize,
const T& initVal)
361 CUDAArray newArray(newSize, initVal);
362 CUDAStdArray<size_t, N> minSize;
364 for (
size_t i = 0; i < N; ++i)
366 minSize[i] = std::min(
m_size[i], newArray.m_size[i]);
369 Internal::CUDABlockCopy<T, N>::Call(View(), minSize, newArray.View());
371 *
this = std::move(newArray);
374 template <
typename T,
size_t N>
375 template <
typename... Args>
376 void CUDAArray<T, N>::Resize(
size_t nx, Args... args)
378 Vector<size_t, N> newSizeVec;
384 CUDAStdArray<size_t, N> newSize(newSizeVec);
385 Resize(newSize, initVal);
389 template <
typename T,
size_t N>
391 std::enable_if_t<(M == 1), void> CUDAArray<T, N>::Append(
const T& val)
393 m_data.PushBack(val);
397 template <
typename T,
size_t N>
398 template <
typename A,
size_t M>
399 std::enable_if_t<(M == 1), void> CUDAArray<T, N>::Append(
400 const std::vector<T, A>& extra)
402 m_data.Append(extra);
403 m_size[0] = m_data.size();
406 template <
typename T,
size_t N>
407 template <
typename OtherDerived,
size_t M>
408 std::enable_if_t<(M == 1), void> CUDAArray<T, N>::Append(
409 const ArrayBase<T, N, OtherDerived>& extra)
411 CUDAArray newArray(
Length() + extra.Length());
413 CUDACopyHostToDevice(extra.data(), extra.Length(),
414 newArray.data() +
m_size[0]);
418 template <
typename T,
size_t N>
419 template <
typename OtherDerived,
size_t M>
420 std::enable_if_t<(M == 1), void> CUDAArray<T, N>::Append(
421 const CUDAArrayBase<T, N, OtherDerived>& extra)
423 CUDAArray newArray(
Length() + extra.Length());
425 CUDACopy(extra.data(), extra.Length(), newArray.data() +
m_size[0]);
429 template <
typename T,
size_t N>
430 void CUDAArray<T, N>::Clear()
436 template <
typename T,
size_t N>
437 void CUDAArray<T, N>::Swap(CUDAArray& other)
440 m_data.Swap(other.m_data);
443 template <
typename T,
size_t N>
444 CUDAArrayView<T, N> CUDAArray<T, N>::View()
446 return CUDAArrayView<T, N>(*this);
449 template <
typename T,
size_t N>
450 CUDAArrayView<const T, N> CUDAArray<T, N>::View()
const 452 return CUDAArrayView<const T, N>(*this);
void SwapPtrAndSize(ArrayBase &other)
Definition: ArrayBase-Impl.hpp:269
void ClearPtrAndSize()
Definition: ArrayBase-Impl.hpp:263
static size_t Call(Vector< size_t, N > &size, NestedInitializerListsT< T, I > lst)
Definition: Array-Impl.hpp:54
Vector< size_t, N > m_size
Definition: ArrayBase.hpp:125
Definition: pybind11Utils.hpp:20
void Fill(ArrayView< T, N > a, const Vector< size_t, N > &begin, const Vector< size_t, N > &end, const T &val)
Definition: ArrayUtils-Impl.hpp:19
static void Call(Vector< size_t, N > &size, T &value, size_t n, Args... args)
Definition: Array-Impl.hpp:27
size_t Length() const
Definition: ArrayBase-Impl.hpp:84
Pointer data()
Definition: ArrayBase-Impl.hpp:39
void SetPtrAndSize(Pointer ptr, size_t ni, Args... args)
Definition: ArrayBase-Impl.hpp:250
static void Call(Array< T, N > &arr, NestedInitializerListsT< T, I > lst)
Definition: Array-Impl.hpp:98