Ifpack2 Templated Preconditioning Package Version 1.0
Loading...
Searching...
No Matches
Ifpack2_BlockTriDiContainer_impl.hpp
1/*@HEADER
2// ***********************************************************************
3//
4// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ***********************************************************************
40//@HEADER
41*/
42
43#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
44#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
45
46#include <Teuchos_Details_MpiTypeTraits.hpp>
47
48#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
49#include <Tpetra_Distributor.hpp>
50#include <Tpetra_BlockMultiVector.hpp>
51
52#include <Kokkos_ArithTraits.hpp>
53#include <KokkosBatched_Util.hpp>
54#include <KokkosBatched_Vector.hpp>
55#include <KokkosBatched_Copy_Decl.hpp>
56#include <KokkosBatched_Copy_Impl.hpp>
57#include <KokkosBatched_AddRadial_Decl.hpp>
58#include <KokkosBatched_AddRadial_Impl.hpp>
59#include <KokkosBatched_SetIdentity_Decl.hpp>
60#include <KokkosBatched_SetIdentity_Impl.hpp>
61#include <KokkosBatched_Gemm_Decl.hpp>
62#include <KokkosBatched_Gemm_Serial_Impl.hpp>
63#include <KokkosBatched_Gemm_Team_Impl.hpp>
64#include <KokkosBatched_Gemv_Decl.hpp>
65#include <KokkosBatched_Gemv_Team_Impl.hpp>
66#include <KokkosBatched_Trsm_Decl.hpp>
67#include <KokkosBatched_Trsm_Serial_Impl.hpp>
68#include <KokkosBatched_Trsm_Team_Impl.hpp>
69#include <KokkosBatched_Trsv_Decl.hpp>
70#include <KokkosBatched_Trsv_Serial_Impl.hpp>
71#include <KokkosBatched_Trsv_Team_Impl.hpp>
72#include <KokkosBatched_LU_Decl.hpp>
73#include <KokkosBatched_LU_Serial_Impl.hpp>
74#include <KokkosBatched_LU_Team_Impl.hpp>
75
76#include <KokkosBlas1_nrm1.hpp>
77#include <KokkosBlas1_nrm2.hpp>
78
79#include <memory>
80
81// need to interface this into cmake variable (or only use this flag when it is necessary)
82//#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
83//#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
84#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
85#include "cuda_profiler_api.h"
86#endif
87
88// I am not 100% sure about the mpi 3 on cuda
89#if MPI_VERSION >= 3
90#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
91#endif
92
93// ::: Experiments :::
94// define either pinned memory or cudamemory for mpi
95// if both macros are disabled, it will use tpetra memory space which is uvm space for cuda
96// if defined, this use pinned memory instead of device pointer
97// by default, we enable pinned memory
98#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
99//#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
100
101// if defined, all views are allocated on cuda space intead of cuda uvm space
102#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
103
104// if defined, btdm_scalar_type is used (if impl_scala_type is double, btdm_scalar_type is float)
105#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
106#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
107#endif
108
109// if defined, it uses multiple execution spaces
110#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
111
112namespace Ifpack2 {
113
115
116 namespace KB = KokkosBatched;
117
121 using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
122
123 template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
124 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
125 MemoryTraitsType::is_random_access |
126 flag>;
127
128 template <typename ViewType>
129 using Unmanaged = Kokkos::View<typename ViewType::data_type,
130 typename ViewType::array_layout,
131 typename ViewType::device_type,
132 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
133 template <typename ViewType>
134 using Atomic = Kokkos::View<typename ViewType::data_type,
135 typename ViewType::array_layout,
136 typename ViewType::device_type,
137 MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
138 template <typename ViewType>
139 using Const = Kokkos::View<typename ViewType::const_data_type,
140 typename ViewType::array_layout,
141 typename ViewType::device_type,
142 typename ViewType::memory_traits>;
143 template <typename ViewType>
144 using ConstUnmanaged = Const<Unmanaged<ViewType> >;
145
146 template <typename ViewType>
147 using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
148
149 template <typename ViewType>
150 using Unmanaged = Kokkos::View<typename ViewType::data_type,
151 typename ViewType::array_layout,
152 typename ViewType::device_type,
153 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
154
155
156 template <typename ViewType>
157 using Scratch = Kokkos::View<typename ViewType::data_type,
158 typename ViewType::array_layout,
159 typename ViewType::execution_space::scratch_memory_space,
160 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
161
165 template<typename LayoutType> struct TpetraLittleBlock;
166 template<> struct TpetraLittleBlock<Kokkos::LayoutLeft> {
167 template<typename T> KOKKOS_INLINE_FUNCTION
168 static T getFlatIndex(const T i, const T j, const T blksize) { return i+j*blksize; }
169 };
170 template<> struct TpetraLittleBlock<Kokkos::LayoutRight> {
171 template<typename T> KOKKOS_INLINE_FUNCTION
172 static T getFlatIndex(const T i, const T j, const T blksize) { return i*blksize+j; }
173 };
174
178 template<typename T> struct BlockTridiagScalarType { typedef T type; };
179#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
180 template<> struct BlockTridiagScalarType<double> { typedef float type; };
181 //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
182#endif
183
187 template<typename T> struct is_cuda { enum : bool { value = false }; };
188#if defined(KOKKOS_ENABLE_CUDA)
189 template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
190#endif
191
195 template<typename T> struct is_hip { enum : bool { value = false }; };
196#if defined(KOKKOS_ENABLE_HIP)
197 template<> struct is_hip<Kokkos::Experimental::HIP> { enum : bool { value = true }; };
198#endif
199
203 template<typename T> struct is_sycl { enum : bool { value = false }; };
204#if defined(KOKKOS_ENABLE_SYCL)
205 template<> struct is_sycl<Kokkos::Experimental::SYCL> { enum : bool { value = true }; };
206#endif
207
208 template<typename T> struct is_device { enum : bool { value = is_cuda<T>::value || is_hip<T>::value || is_sycl<T>::value }; };
209
210
214 template<typename T>
216 static void createInstance(T &exec_instance) {
217 exec_instance = T();
218 }
219#if defined(KOKKOS_ENABLE_CUDA)
220 static void createInstance(const cudaStream_t &s, T &exec_instance) {
221 exec_instance = T();
222 }
223#endif
224 };
225
226#if defined(KOKKOS_ENABLE_CUDA)
227 template<>
228 struct ExecutionSpaceFactory<Kokkos::Cuda> {
229 static void createInstance(Kokkos::Cuda &exec_instance) {
230 exec_instance = Kokkos::Cuda();
231 }
232 static void createInstance(const cudaStream_t &s, Kokkos::Cuda &exec_instance) {
233 exec_instance = Kokkos::Cuda(s);
234 }
235 };
236#endif
237
238#if defined(KOKKOS_ENABLE_HIP)
239 template<>
240 struct ExecutionSpaceFactory<Kokkos::Experimental::HIP> {
241 static void createInstance(Kokkos::Experimental::HIP &exec_instance) {
242 exec_instance = Kokkos::Experimental::HIP();
243 }
244 };
245#endif
246
247#if defined(KOKKOS_ENABLE_SYCL)
248 template<>
249 struct ExecutionSpaceFactory<Kokkos::Experimental::SYCL> {
250 static void createInstance(Kokkos::Experimental::SYCL &exec_instance) {
251 exec_instance = Kokkos::Experimental::SYCL();
252 }
253 };
254#endif
255
256
257
261 template<typename CommPtrType>
262 std::string get_msg_prefix (const CommPtrType &comm) {
263 const auto rank = comm->getRank();
264 const auto nranks = comm->getSize();
265 std::stringstream ss;
266 ss << "Rank " << rank << " of " << nranks << ": ";
267 return ss.str();
268 }
269
273 template<typename T, int N>
274 struct ArrayValueType {
275 T v[N];
276 KOKKOS_INLINE_FUNCTION
277 ArrayValueType() {
278 for (int i=0;i<N;++i)
279 this->v[i] = 0;
280 }
281 KOKKOS_INLINE_FUNCTION
282 ArrayValueType(const ArrayValueType &b) {
283 for (int i=0;i<N;++i)
284 this->v[i] = b.v[i];
285 }
286 };
287 template<typename T, int N>
288 static
289 KOKKOS_INLINE_FUNCTION
290 void
291 operator+=(ArrayValueType<T,N> &a,
292 const ArrayValueType<T,N> &b) {
293 for (int i=0;i<N;++i)
294 a.v[i] += b.v[i];
295 }
296
300 template<typename T, int N, typename ExecSpace>
301 struct SumReducer {
302 typedef SumReducer reducer;
303 typedef ArrayValueType<T,N> value_type;
304 typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
305 value_type *value;
306
307 KOKKOS_INLINE_FUNCTION
308 SumReducer(value_type &val) : value(&val) {}
309
310 KOKKOS_INLINE_FUNCTION
311 void join(value_type &dst, value_type const &src) const {
312 for (int i=0;i<N;++i)
313 dst.v[i] += src.v[i];
314 }
315 KOKKOS_INLINE_FUNCTION
316 void init(value_type &val) const {
317 for (int i=0;i<N;++i)
318 val.v[i] = Kokkos::reduction_identity<T>::sum();
319 }
320 KOKKOS_INLINE_FUNCTION
321 value_type& reference() {
322 return *value;
323 }
324 KOKKOS_INLINE_FUNCTION
325 result_view_type view() const {
326 return result_view_type(value);
327 }
328 };
329
330#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
331#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
332#define IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) execution_space().fence();
333#else
334#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
335#define IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
336#endif
337
338#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
339#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
340 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
341
342#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
343 { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
344#else
346#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
347#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
348#endif
349
353 template <typename MatrixType>
354 struct ImplType {
358 typedef size_t size_type;
359 typedef typename MatrixType::scalar_type scalar_type;
360 typedef typename MatrixType::local_ordinal_type local_ordinal_type;
361 typedef typename MatrixType::global_ordinal_type global_ordinal_type;
362 typedef typename MatrixType::node_type node_type;
363
367 typedef typename Kokkos::ArithTraits<scalar_type>::val_type impl_scalar_type;
368 typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
369
370 typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
371 typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;
372
376 typedef Kokkos::DefaultHostExecutionSpace host_execution_space;
377
381 typedef typename node_type::device_type node_device_type;
382 typedef typename node_device_type::execution_space node_execution_space;
383 typedef typename node_device_type::memory_space node_memory_space;
384
385#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE)
387 typedef node_execution_space execution_space;
388 typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
389 Kokkos::CudaSpace,
390 node_memory_space>::type memory_space;
391 typedef Kokkos::Device<execution_space,memory_space> device_type;
392#else
393 typedef node_device_type device_type;
394 typedef node_execution_space execution_space;
395 typedef node_memory_space memory_space;
396#endif
397
398 typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
399 typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
400 typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
401 typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
402 typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
403 typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
404 typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
405 typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type;
406
410 template<typename T, int l> using Vector = KB::Vector<T,l>;
411 template<typename T> using SIMD = KB::SIMD<T>;
412 template<typename T, typename M> using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
413 template<typename T, typename M> using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
414
415 static constexpr int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
416 static constexpr int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
417 typedef Vector<SIMD<btdm_scalar_type>,vector_length> vector_type;
418 typedef Vector<SIMD<btdm_scalar_type>,internal_vector_length> internal_vector_type;
419
423 typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
424 typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
425 // tpetra block crs values
426 typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
427 typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
428
429 // tpetra multivector values (layout left): may need to change the typename more explicitly
430 typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
431 typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
432
433 // packed data always use layout right
434 typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
435 typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
436 typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
437 typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
438 typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
439 typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
440 };
441
445 template<typename MatrixType>
446 typename Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
447 createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
448 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
449 using impl_type = ImplType<MatrixType>;
450 using tpetra_map_type = typename impl_type::tpetra_map_type;
451 using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
452 using tpetra_import_type = typename impl_type::tpetra_import_type;
453
454 const auto g = A->getCrsGraph(); // tpetra crs graph object
455 const auto blocksize = A->getBlockSize();
456 const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
457 const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
458
459 auto blockCrsTpetraImporter = Teuchos::rcp(new tpetra_import_type(src, tgt));
460 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType<MatrixType>::execution_space)
461
462 return blockCrsTpetraImporter;
463 }
464
465 // Partial replacement for forward-mode MultiVector::doImport.
466 // Permits overlapped communication and computation, but also supports sync'ed.
467 // I'm finding that overlapped comm/comp can give quite poor performance on some
468 // platforms, so we can't just use it straightforwardly always.
469
470 template<typename MatrixType>
471 struct AsyncableImport {
472 public:
473 using impl_type = ImplType<MatrixType>;
474
475 private:
479#if !defined(HAVE_IFPACK2_MPI)
480 typedef int MPI_Request;
481 typedef int MPI_Comm;
482#endif
485 using scalar_type = typename impl_type::scalar_type;
486
487 static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
488#ifdef HAVE_IFPACK2_MPI
489 MPI_Request ureq;
490 int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
491 if (ireq == NULL) MPI_Request_free(&ureq);
492 return ret;
493#else
494 return 0;
495#endif
496 }
497
498 static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
499#ifdef HAVE_IFPACK2_MPI
500 MPI_Request ureq;
501 int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
502 if (ireq == NULL) MPI_Request_free(&ureq);
503 return ret;
504#else
505 return 0;
506#endif
507 }
508
509 static int waitany(int count, MPI_Request* reqs, int* index) {
510#ifdef HAVE_IFPACK2_MPI
511 return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
512#else
513 return 0;
514#endif
515 }
516
517 static int waitall(int count, MPI_Request* reqs) {
518#ifdef HAVE_IFPACK2_MPI
519 return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
520#else
521 return 0;
522#endif
523 }
524
525 public:
526 using tpetra_map_type = typename impl_type::tpetra_map_type;
527 using tpetra_import_type = typename impl_type::tpetra_import_type;
528
529 using local_ordinal_type = typename impl_type::local_ordinal_type;
530 using global_ordinal_type = typename impl_type::global_ordinal_type;
531 using size_type = typename impl_type::size_type;
532 using impl_scalar_type = typename impl_type::impl_scalar_type;
533
534 using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
535 using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
536
537 using execution_space = typename impl_type::execution_space;
538 using memory_space = typename impl_type::memory_space;
539 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
540 using size_type_1d_view = typename impl_type::size_type_1d_view;
541 using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
542
543#if defined(KOKKOS_ENABLE_CUDA)
544 using impl_scalar_type_1d_view =
545 typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
546# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
547 Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
548# elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
549 Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
550# else // no experimental macros are defined
551 typename impl_type::impl_scalar_type_1d_view,
552# endif
553 typename impl_type::impl_scalar_type_1d_view>::type;
554#else
555 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
556#endif
557 using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
558 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
559
560#ifdef HAVE_IFPACK2_MPI
561 MPI_Comm comm;
562#endif
563
564 impl_scalar_type_2d_view_tpetra remote_multivector;
565 local_ordinal_type blocksize;
566
567 template<typename T>
568 struct SendRecvPair {
569 T send, recv;
570 };
571
572 // (s)end and (r)eceive data:
573 SendRecvPair<int_1d_view_host> pids; // mpi ranks
574 SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is pointer, cannot use kokkos view
575 SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
576 SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
577 SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
578 SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
579
580 local_ordinal_type_1d_view dm2cm; // permutation
581
582#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
583 using exec_instance_1d_std_vector = std::vector<execution_space>;
584 exec_instance_1d_std_vector exec_instances;
585#endif
586
587 // for cuda
588 public:
589 void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
590 const size_type_1d_view &offs) {
591 // wrap lens to kokkos view and deep copy to device
592 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
593 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
594
595 // exclusive scan
596 const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
597 const local_ordinal_type lens_size = lens_device.extent(0);
598 Kokkos::parallel_scan
599 ("AsyncableImport::RangePolicy::setOffsetValues",
600 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
601 if (final)
602 offs(i) = update;
603 update += (i < lens_size ? lens_device[i] : 0);
604 });
605 }
606
607 void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
608 const size_type_1d_view_host &offs) {
609 // wrap lens to kokkos view and deep copy to device
610 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
611 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
612
613 // exclusive scan
614 offs(0) = 0;
615 for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
616 offs(i) = offs(i-1) + lens[i-1];
617 }
618 }
619
620 private:
621 void createMpiRequests(const tpetra_import_type &import) {
622 Tpetra::Distributor &distributor = import.getDistributor();
623
624 // copy pids from distributor
625 const auto pids_from = distributor.getProcsFrom();
626 pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
627 memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
628
629 const auto pids_to = distributor.getProcsTo();
630 pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
631 memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
632
633 // mpi requests
634 reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
635 reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
636
637 // construct offsets
638#if 0
639 const auto lengths_to = distributor.getLengthsTo();
640 offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
641
642 const auto lengths_from = distributor.getLengthsFrom();
643 offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
644
645 setOffsetValues(lengths_to, offset.send);
646 offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
647
648 setOffsetValues(lengths_from, offset.recv);
649 offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
650#else
651 const auto lengths_to = distributor.getLengthsTo();
652 offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
653
654 const auto lengths_from = distributor.getLengthsFrom();
655 offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
656
657 setOffsetValuesHost(lengths_to, offset_host.send);
658 //offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
659
660 setOffsetValuesHost(lengths_from, offset_host.recv);
661 //offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
662#endif
663 }
664
665 void createSendRecvIDs(const tpetra_import_type &import) {
666 // For each remote PID, the list of LIDs to receive.
667 const auto remote_lids = import.getRemoteLIDs();
668 const local_ordinal_type_1d_view_host
669 remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
670 lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
671 Kokkos::deep_copy(lids.recv, remote_lids_view_host);
672
673 // For each export PID, the list of LIDs to send.
674 auto epids = import.getExportPIDs();
675 auto elids = import.getExportLIDs();
676 TEUCHOS_ASSERT(epids.size() == elids.size());
677 lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
678 auto lids_send_host = Kokkos::create_mirror_view(lids.send);
679
680 // naive search (not sure if pids or epids are sorted)
681 for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
682 const auto pid_send_value = pids.send[i];
683 for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
684 if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
685 TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
686 }
687 Kokkos::deep_copy(lids.send, lids_send_host);
688 }
689
690 void createExecutionSpaceInstances() {
691#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
692 //The following line creates 8 streams:
693 exec_instances =
694 Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
695#endif
696 }
697
698 public:
699 // for cuda, all tag types are public
700 struct ToBuffer {};
701 struct ToMultiVector {};
702
703 AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
704 const Teuchos::RCP<const tpetra_map_type>& tgt_map,
705 const local_ordinal_type blocksize_,
706 const local_ordinal_type_1d_view dm2cm_) {
707 blocksize = blocksize_;
708 dm2cm = dm2cm_;
709
710#ifdef HAVE_IFPACK2_MPI
711 comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
712#endif
713 const tpetra_import_type import(src_map, tgt_map);
714
715 createMpiRequests(import);
716 createSendRecvIDs(import);
717 createExecutionSpaceInstances();
718 }
719
720 void createDataBuffer(const local_ordinal_type &num_vectors) {
721 const size_type extent_0 = lids.recv.extent(0)*blocksize;
722 const size_type extent_1 = num_vectors;
723 if (remote_multivector.extent(0) == extent_0 &&
724 remote_multivector.extent(1) == extent_1) {
725 // skip
726 } else {
727 remote_multivector =
728 impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
729
730 const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
731 const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
732
733 buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
734 buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
735 }
736 }
737
738 void cancel () {
739#ifdef HAVE_IFPACK2_MPI
740 waitall(reqs.recv.size(), reqs.recv.data());
741 waitall(reqs.send.size(), reqs.send.data());
742#endif
743 }
744
745 // ======================================================================
746 // Async version using execution space instances
747 // ======================================================================
748
749#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
750 template<typename PackTag>
751 static
752 void copy(const local_ordinal_type_1d_view &lids_,
753 const impl_scalar_type_1d_view &buffer_,
754 const local_ordinal_type ibeg_,
755 const local_ordinal_type iend_,
756 const impl_scalar_type_2d_view_tpetra &multivector_,
757 const local_ordinal_type blocksize_,
758 const execution_space &exec_instance_) {
759 const local_ordinal_type num_vectors = multivector_.extent(1);
760 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
761 const local_ordinal_type idiff = iend_ - ibeg_;
762 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
763
764 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
765 local_ordinal_type vector_size(0);
766 if (blocksize_ <= 4) vector_size = 4;
767 else if (blocksize_ <= 8) vector_size = 8;
768 else if (blocksize_ <= 16) vector_size = 16;
769 else vector_size = 32;
770
771 const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
772 const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
773 Kokkos::parallel_for
774 (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
775 Kokkos::Experimental::require(policy, work_item_property),
776 KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
777 const local_ordinal_type i = member.league_rank();
778 Kokkos::parallel_for
779 (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
780 auto aptr = abase + blocksize_*(i + idiff*j);
781 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
782 if (std::is_same<PackTag,ToBuffer>::value)
783 Kokkos::parallel_for
784 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
785 aptr[k] = bptr[k];
786 });
787 else
788 Kokkos::parallel_for
789 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
790 bptr[k] = aptr[k];
791 });
792 });
793 });
794 }
795
796 void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
797 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
798
799#ifdef HAVE_IFPACK2_MPI
800 // constants and reallocate data buffers if necessary
801 const local_ordinal_type num_vectors = mv.extent(1);
802 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
803
804 // 0. post receive async
805 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
806 irecv(comm,
807 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
808 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
809 pids.recv[i],
810 42,
811 &reqs.recv[i]);
812 }
813
815 execution_space().fence();
816
817 // 1. async memcpy
818 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
819 // 1.0. enqueue pack buffer
820 if (i<8) exec_instances[i%8].fence();
821 copy<ToBuffer>(lids.send, buffer.send,
822 offset_host.send(i), offset_host.send(i+1),
823 mv, blocksize,
824 //execution_space());
825 exec_instances[i%8]);
826
827 }
829 //execution_space().fence();
830 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
831 // 1.1. sync the stream and isend
832 if (i<8) exec_instances[i%8].fence();
833 isend(comm,
834 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
835 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
836 pids.send[i],
837 42,
838 &reqs.send[i]);
839 }
840
841 // 2. poke communication
842 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
843 int flag;
844 MPI_Status stat;
845 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
846 }
847#endif // HAVE_IFPACK2_MPI
848 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
849 }
850
851 void syncRecvVar1() {
852 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
853#ifdef HAVE_IFPACK2_MPI
854 // 0. wait for receive async.
855 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
856 local_ordinal_type idx = i;
857
858 // 0.0. wait any
859 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
860
861 // 0.1. unpack data after data is moved into a device
862 copy<ToMultiVector>(lids.recv, buffer.recv,
863 offset_host.recv(idx), offset_host.recv(idx+1),
864 remote_multivector, blocksize,
865 exec_instances[idx%8]);
866 }
867
868 // 1. fire up all cuda events
869 Kokkos::fence();
870
871 // 2. cleanup all open comm
872 waitall(reqs.send.size(), reqs.send.data());
873#endif // HAVE_IFPACK2_MPI
874 }
875#endif //defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
876
877 // ======================================================================
878 // Generic version without using execution space instances
879 // - only difference between device and host architecture is on using team
880 // or range policies.
881 // ======================================================================
882 template<typename PackTag>
883 static
884 void copy(const local_ordinal_type_1d_view &lids_,
885 const impl_scalar_type_1d_view &buffer_,
886 const local_ordinal_type &ibeg_,
887 const local_ordinal_type &iend_,
888 const impl_scalar_type_2d_view_tpetra &multivector_,
889 const local_ordinal_type blocksize_) {
890 const local_ordinal_type num_vectors = multivector_.extent(1);
891 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
892 const local_ordinal_type idiff = iend_ - ibeg_;
893 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
894 if constexpr (is_device<execution_space>::value) {
895 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
896 local_ordinal_type vector_size(0);
897 if (blocksize_ <= 4) vector_size = 4;
898 else if (blocksize_ <= 8) vector_size = 8;
899 else if (blocksize_ <= 16) vector_size = 16;
900 else vector_size = 32;
901 const team_policy_type policy(idiff, 1, vector_size);
902 Kokkos::parallel_for
903 ("AsyncableImport::TeamPolicy::copy",
904 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
905 const local_ordinal_type i = member.league_rank();
906 Kokkos::parallel_for
907 (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
908 auto aptr = abase + blocksize_*(i + idiff*j);
909 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
910 if (std::is_same<PackTag,ToBuffer>::value)
911 Kokkos::parallel_for
912 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
913 aptr[k] = bptr[k];
914 });
915 else
916 Kokkos::parallel_for
917 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
918 bptr[k] = aptr[k];
919 });
920 });
921 });
922 } else {
923 const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
924 Kokkos::parallel_for
925 ("AsyncableImport::RangePolicy::copy",
926 policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
927 const local_ordinal_type i = ij%idiff;
928 const local_ordinal_type j = ij/idiff;
929 auto aptr = abase + blocksize_*(i + idiff*j);
930 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
931 auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
932 auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
933 memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
934 });
935 }
936 }
937
938
942 void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
943 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
944
945#ifdef HAVE_IFPACK2_MPI
946 // constants and reallocate data buffers if necessary
947 const local_ordinal_type num_vectors = mv.extent(1);
948 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
949
950 // receive async
951 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
952 irecv(comm,
953 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
954 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
955 pids.recv[i],
956 42,
957 &reqs.recv[i]);
958 }
959
960 // send async
961 for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
962 copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
963 mv, blocksize);
964 Kokkos::fence();
965 isend(comm,
966 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
967 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
968 pids.send[i],
969 42,
970 &reqs.send[i]);
971 }
972
973 // I find that issuing an Iprobe seems to nudge some MPIs into action,
974 // which helps with overlapped comm/comp performance.
975 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
976 int flag;
977 MPI_Status stat;
978 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
979 }
980#endif
981 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
982 }
983
984 void syncRecvVar0() {
985 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
986#ifdef HAVE_IFPACK2_MPI
987 // receive async.
988 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
989 local_ordinal_type idx = i;
990 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
991 copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
992 remote_multivector, blocksize);
993 }
994 // wait on the sends to match all Isends with a cleanup operation.
995 waitall(reqs.send.size(), reqs.send.data());
996#endif
997 }
998
1002 void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
1003#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
1004#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
1005 asyncSendRecvVar1(mv);
1006#else
1007 asyncSendRecvVar0(mv);
1008#endif
1009#else
1010 asyncSendRecvVar0(mv);
1011#endif
1012 }
1013 void syncRecv() {
1014#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
1015#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
1016 syncRecvVar1();
1017#else
1018 syncRecvVar0();
1019#endif
1020#else
1021 syncRecvVar0();
1022#endif
1023 }
1024
1025 void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
1026 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
1027 asyncSendRecv(mv);
1028 syncRecv();
1029 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
1030 }
1031
1032 impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
1033 };
1034
1038 template<typename MatrixType>
1039 Teuchos::RCP<AsyncableImport<MatrixType> >
1040 createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
1041 using impl_type = ImplType<MatrixType>;
1042 using tpetra_map_type = typename impl_type::tpetra_map_type;
1043 using local_ordinal_type = typename impl_type::local_ordinal_type;
1044 using global_ordinal_type = typename impl_type::global_ordinal_type;
1045 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1046
1047 const auto g = A->getCrsGraph(); // tpetra crs graph object
1048 const auto blocksize = A->getBlockSize();
1049 const auto domain_map = g.getDomainMap();
1050 const auto column_map = g.getColMap();
1051
1052 std::vector<global_ordinal_type> gids;
1053 bool separate_remotes = true, found_first = false, need_owned_permutation = false;
1054 for (size_t i=0;i<column_map->getLocalNumElements();++i) {
1055 const global_ordinal_type gid = column_map->getGlobalElement(i);
1056 if (!domain_map->isNodeGlobalElement(gid)) {
1057 found_first = true;
1058 gids.push_back(gid);
1059 } else if (found_first) {
1060 separate_remotes = false;
1061 break;
1062 }
1063 if (!need_owned_permutation &&
1064 domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
1065 // The owned part of the domain and column maps are different
1066 // orderings. We *could* do a super efficient impl of this case in the
1067 // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
1068 // really, if a caller cares about speed, they wouldn't make different
1069 // local permutations like this. So we punt on the best impl and go for
1070 // a pretty good one: the permutation is done in place in
1071 // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
1072 // is the presumably worse memory access pattern of the input vector.
1073 need_owned_permutation = true;
1074 }
1075 }
1076
1077 if (separate_remotes) {
1078 const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
1079 const auto parsimonious_col_map
1080 = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
1081 if (parsimonious_col_map->getGlobalNumElements() > 0) {
1082 // make the importer only if needed.
1083 local_ordinal_type_1d_view dm2cm;
1084 if (need_owned_permutation) {
1085 dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
1086 const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
1087 for (size_t i=0;i<domain_map->getLocalNumElements();++i)
1088 dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
1089 Kokkos::deep_copy(dm2cm, dm2cm_host);
1090 }
1091 return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
1092 }
1093 }
1094 return Teuchos::null;
1095 }
1096
1097 template<typename MatrixType>
1098 struct PartInterface {
1099 using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
1100 using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;
1101
1102 PartInterface() = default;
1103 PartInterface(const PartInterface &b) = default;
1104
1105 // Some terms:
1106 // The matrix A is split as A = D + R, where D is the matrix of tridiag
1107 // blocks and R is the remainder.
1108 // A part is roughly a synonym for a tridiag. The distinction is that a part
1109 // is the set of rows belonging to one tridiag and, equivalently, the off-diag
1110 // rows in R associated with that tridiag. In contrast, the term tridiag is
1111 // used to refer specifically to tridiag data, such as the pointer into the
1112 // tridiag data array.
1113 // Local (lcl) row arge the LIDs. lclrow lists the LIDs belonging to each
1114 // tridiag, and partptr points to the beginning of each tridiag. This is the
1115 // LID space.
1116 // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
1117 // by this ordinal. This is the 'index' space.
1118 // A flat index is the mathematical index into an array. A pack index
1119 // accounts for SIMD packing.
1120
1121 // Local row LIDs. Permutation from caller's index space to tridiag index
1122 // space.
1123 local_ordinal_type_1d_view lclrow;
1124 // partptr_ is the pointer array into lclrow_.
1125 local_ordinal_type_1d_view partptr; // np+1
1126 // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
1127 // is the start of the i'th pack.
1128 local_ordinal_type_1d_view packptr; // npack+1
1129 // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
1130 // an alias of partptr_ in the case of no overlap.
1131 local_ordinal_type_1d_view part2rowidx0; // np+1
1132 // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
1133 // it's the same as part2rowidx0_; if it's > 1, then the value is combined
1134 // with i % vector_length to get the location in the packed data.
1135 local_ordinal_type_1d_view part2packrowidx0; // np+1
1136 local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
1137 // rowidx2part_ maps the row index to the part index.
1138 local_ordinal_type_1d_view rowidx2part; // nr
1139 // True if lcl{row|col} is at most a constant away from row{idx|col}. In
1140 // practice, this knowledge is not particularly useful, as packing for batched
1141 // processing is done at the same time as the permutation from LID to index
1142 // space. But it's easy to detect, so it's recorded in case an optimization
1143 // can be made based on it.
1144 bool row_contiguous;
1145
1146 local_ordinal_type max_partsz;
1147 };
1148
1152 template<typename MatrixType>
1153 PartInterface<MatrixType>
1154 createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1155 const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
1156 using impl_type = ImplType<MatrixType>;
1157 using local_ordinal_type = typename impl_type::local_ordinal_type;
1158 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1159
1160 constexpr int vector_length = impl_type::vector_length;
1161
1162 const auto comm = A->getRowMap()->getComm();
1163
1164 PartInterface<MatrixType> interf;
1165
1166 const bool jacobi = partitions.size() == 0;
1167 const local_ordinal_type A_n_lclrows = A->getLocalNumRows();
1168 const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1169
1170#if defined(BLOCKTRIDICONTAINER_DEBUG)
1171 local_ordinal_type nrows = 0;
1172 if (jacobi)
1173 nrows = nparts;
1174 else
1175 for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1176
1177 TEUCHOS_TEST_FOR_EXCEPT_MSG
1178 (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1179 << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
1180#endif
1181
1182 // permutation vector
1183 std::vector<local_ordinal_type> p;
1184 if (jacobi) {
1185 interf.max_partsz = 1;
1186 } else {
1187 // reorder parts to maximize simd packing efficiency
1188 p.resize(nparts);
1189
1190 typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1191 std::vector<size_idx_pair_type> partsz(nparts);
1192 for (local_ordinal_type i=0;i<nparts;++i)
1193 partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1194 std::sort(partsz.begin(), partsz.end(),
1195 [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
1196 return x.first > y.first;
1197 });
1198 for (local_ordinal_type i=0;i<nparts;++i)
1199 p[i] = partsz[i].second;
1200
1201 interf.max_partsz = partsz[0].first;
1202 }
1203
1204 // allocate parts
1205 interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1206 interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1207 interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1208 interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1209 interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1210
1211 // mirror to host and compute on host execution space
1212 const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1213 const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1214 const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1215 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1216 const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1217
1218 // Determine parts.
1219 interf.row_contiguous = true;
1220 partptr(0) = 0;
1221 part2rowidx0(0) = 0;
1222 part2packrowidx0(0) = 0;
1223 local_ordinal_type pack_nrows = 0;
1224 if (jacobi) {
1225 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1226 const local_ordinal_type ipnrows = 1;
1227 TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1228 get_msg_prefix(comm)
1229 << "partition " << p[ip]
1230 << " is empty, which is not allowed.");
1231 //assume No overlap.
1232 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1233 // Since parts are ordered in nonincreasing size, the size of the first
1234 // part in a pack is the size for all parts in the pack.
1235 if (ip % vector_length == 0) pack_nrows = ipnrows;
1236 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1237 const local_ordinal_type os = partptr(ip);
1238 for (local_ordinal_type i=0;i<ipnrows;++i) {
1239 const auto lcl_row = ip;
1240 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1241 get_msg_prefix(comm)
1242 << "partitions[" << p[ip] << "]["
1243 << i << "] = " << lcl_row
1244 << " but input matrix implies limits of [0, " << A_n_lclrows-1
1245 << "].");
1246 lclrow(os+i) = lcl_row;
1247 rowidx2part(os+i) = ip;
1248 if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1249 interf.row_contiguous = false;
1250 }
1251 partptr(ip+1) = os + ipnrows;
1252 }
1253 } else {
1254 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1255 const auto* part = &partitions[p[ip]];
1256 const local_ordinal_type ipnrows = part->size();
1257 TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1258 TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1259 get_msg_prefix(comm)
1260 << "partition " << p[ip]
1261 << " is empty, which is not allowed.");
1262 //assume No overlap.
1263 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1264 // Since parts are ordered in nonincreasing size, the size of the first
1265 // part in a pack is the size for all parts in the pack.
1266 if (ip % vector_length == 0) pack_nrows = ipnrows;
1267 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1268 const local_ordinal_type os = partptr(ip);
1269 for (local_ordinal_type i=0;i<ipnrows;++i) {
1270 const auto lcl_row = (*part)[i];
1271 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1272 get_msg_prefix(comm)
1273 << "partitions[" << p[ip] << "]["
1274 << i << "] = " << lcl_row
1275 << " but input matrix implies limits of [0, " << A_n_lclrows-1
1276 << "].");
1277 lclrow(os+i) = lcl_row;
1278 rowidx2part(os+i) = ip;
1279 if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1280 interf.row_contiguous = false;
1281 }
1282 partptr(ip+1) = os + ipnrows;
1283 }
1284 }
1285#if defined(BLOCKTRIDICONTAINER_DEBUG)
1286 TEUCHOS_ASSERT(partptr(nparts) == nrows);
1287#endif
1288 if (lclrow(0) != 0) interf.row_contiguous = false;
1289
1290 Kokkos::deep_copy(interf.partptr, partptr);
1291 Kokkos::deep_copy(interf.lclrow, lclrow);
1292
1293 //assume No overlap. Thus:
1294 interf.part2rowidx0 = interf.partptr;
1295 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1296
1297 interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
1298 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1299
1300 { // Fill packptr.
1301 local_ordinal_type npacks = 0;
1302 for (local_ordinal_type ip=1;ip<=nparts;++ip)
1303 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1304 ++npacks;
1305 interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1306 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1307 packptr(0) = 0;
1308 for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1309 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1310 packptr(k++) = ip;
1311 Kokkos::deep_copy(interf.packptr, packptr);
1312 }
1313
1314 return interf;
1315 }
1316
1320 template <typename MatrixType>
1321 struct BlockTridiags {
1322 using impl_type = ImplType<MatrixType>;
1323 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1324 using size_type_1d_view = typename impl_type::size_type_1d_view;
1325 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1326
1327 // flat_td_ptr(i) is the index into flat-array values of the start of the
1328 // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1329 // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1330 // vector_length is the position in the pack.
1331 size_type_1d_view flat_td_ptr, pack_td_ptr;
1332 // List of local column indices into A from which to grab
1333 // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1334 local_ordinal_type_1d_view A_colindsub;
1335 // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1336 // tridiag's pack, and i % vector_length gives the position in the pack.
1337 vector_type_3d_view values;
1338
1339 bool is_diagonal_only;
1340
1341 BlockTridiags() = default;
1342 BlockTridiags(const BlockTridiags &b) = default;
1343
1344 // Index into row-major block of a tridiag.
1345 template <typename idx_type>
1346 static KOKKOS_FORCEINLINE_FUNCTION
1347 idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1348 // Given a row of a row-major tridiag, return the index of the first block
1349 // in that row.
1350 template <typename idx_type>
1351 static KOKKOS_FORCEINLINE_FUNCTION
1352 idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1353 // Number of blocks in a tridiag having a given number of rows.
1354 template <typename idx_type>
1355 static KOKKOS_FORCEINLINE_FUNCTION
1356 idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
1357 };
1358
1359
1363 template<typename MatrixType>
1365 createBlockTridiags(const PartInterface<MatrixType> &interf) {
1366 using impl_type = ImplType<MatrixType>;
1367 using execution_space = typename impl_type::execution_space;
1368 using local_ordinal_type = typename impl_type::local_ordinal_type;
1369 using size_type = typename impl_type::size_type;
1370 using size_type_1d_view = typename impl_type::size_type_1d_view;
1371
1372 constexpr int vector_length = impl_type::vector_length;
1373
1375
1376 const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;
1377
1378 { // construct the flat index pointers into the tridiag values array.
1379 btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
1380 const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
1381 Kokkos::parallel_scan
1382 ("createBlockTridiags::RangePolicy::flat_td_ptr",
1383 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1384 if (final)
1385 btdm.flat_td_ptr(i) = update;
1386 if (i < ntridiags) {
1387 const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
1388 update += btdm.NumBlocks(nrows);
1389 }
1390 });
1391
1392 const auto nblocks = Kokkos::create_mirror_view_and_copy
1393 (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
1394 btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1395 }
1396
1397 // And the packed index pointers.
1398 if (vector_length == 1) {
1399 btdm.pack_td_ptr = btdm.flat_td_ptr;
1400 } else {
1401 const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
1402 btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
1403 const Kokkos::RangePolicy<execution_space> policy(0,npacks);
1404 Kokkos::parallel_scan
1405 ("createBlockTridiags::RangePolicy::pack_td_ptr",
1406 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1407 const local_ordinal_type parti = interf.packptr(i);
1408 const local_ordinal_type parti_next = interf.packptr(i+1);
1409 if (final) {
1410 const size_type nblks = update;
1411 for (local_ordinal_type pti=parti;pti<parti_next;++pti)
1412 btdm.pack_td_ptr(pti) = nblks;
1413 const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1414 // last one
1415 if (i == npacks-1)
1416 btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
1417 }
1418 {
1419 const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1420 update += btdm.NumBlocks(nrows);
1421 }
1422 });
1423 }
1424
1425 // values and A_colindsub are created in the symbolic phase
1426
1427 return btdm;
1428 }
1429
1430 // Set the tridiags to be I to the full pack block size. That way, if a
1431 // tridiag within a pack is shorter than the longest one, the extra blocks are
1432 // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1433 // in the packed multvector are 0, and the tridiag LU reflects the extra I
1434 // blocks, then the solve proceeds as though the extra blocks aren't
1435 // present. Since this extra work is part of the SIMD calls, it's not actually
1436 // extra work. Instead, it means we don't have to put checks or masks in, or
1437 // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1438 // since the numeric phase fills in only the used entries, leaving these I
1439 // blocks intact.
1440 template<typename MatrixType>
1441 void
1442 setTridiagsToIdentity
1443 (const BlockTridiags<MatrixType>& btdm,
1444 const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1445 {
1446 using impl_type = ImplType<MatrixType>;
1447 using execution_space = typename impl_type::execution_space;
1448 using local_ordinal_type = typename impl_type::local_ordinal_type;
1449 using size_type_1d_view = typename impl_type::size_type_1d_view;
1450
1451 const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
1452 const local_ordinal_type blocksize = btdm.values.extent(1);
1453
1454 {
1455 const int vector_length = impl_type::vector_length;
1456 const int internal_vector_length = impl_type::internal_vector_length;
1457
1458 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1459 using internal_vector_type = typename impl_type::internal_vector_type;
1460 using internal_vector_type_4d_view =
1461 typename impl_type::internal_vector_type_4d_view;
1462
1463 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1464 const internal_vector_type_4d_view values
1465 (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1466 btdm.values.extent(0),
1467 btdm.values.extent(1),
1468 btdm.values.extent(2),
1469 vector_length/internal_vector_length);
1470 const local_ordinal_type vector_loop_size = values.extent(3);
1471#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1472 local_ordinal_type total_team_size(0);
1473 if (blocksize <= 5) total_team_size = 32;
1474 else if (blocksize <= 9) total_team_size = 64;
1475 else if (blocksize <= 12) total_team_size = 96;
1476 else if (blocksize <= 16) total_team_size = 128;
1477 else if (blocksize <= 20) total_team_size = 160;
1478 else total_team_size = 160;
1479 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1480 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1481#elif defined(KOKKOS_ENABLE_HIP)
1482 // FIXME: HIP
1483 // These settings might be completely wrong
1484 // will have to do some experiments to decide
1485 // what makes sense on AMD GPUs
1486 local_ordinal_type total_team_size(0);
1487 if (blocksize <= 5) total_team_size = 32;
1488 else if (blocksize <= 9) total_team_size = 64;
1489 else if (blocksize <= 12) total_team_size = 96;
1490 else if (blocksize <= 16) total_team_size = 128;
1491 else if (blocksize <= 20) total_team_size = 160;
1492 else total_team_size = 160;
1493 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1494 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1495#elif defined(KOKKOS_ENABLE_SYCL)
1496 // SYCL: FIXME
1497 local_ordinal_type total_team_size(0);
1498 if (blocksize <= 5) total_team_size = 32;
1499 else if (blocksize <= 9) total_team_size = 64;
1500 else if (blocksize <= 12) total_team_size = 96;
1501 else if (blocksize <= 16) total_team_size = 128;
1502 else if (blocksize <= 20) total_team_size = 160;
1503 else total_team_size = 160;
1504 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1505 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1506#else
1507 // Host architecture: team size is always one
1508 const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1509#endif
1510 Kokkos::parallel_for
1511 ("setTridiagsToIdentity::TeamPolicy",
1512 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1513 const local_ordinal_type k = member.league_rank();
1514 const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
1515 const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
1516 const local_ordinal_type diff = iend - ibeg;
1517 const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1518 const btdm_scalar_type one(1);
1519 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1520 Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1521 const local_ordinal_type i = ibeg + ii*3;
1522 for (local_ordinal_type j=0;j<blocksize;++j)
1523 values(i,j,j,v) = one;
1524 });
1525 });
1526 });
1527 }
1528 }
1529
1533 template <typename MatrixType>
1534 struct AmD {
1535 using impl_type = ImplType<MatrixType>;
1536 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1537 using size_type_1d_view = typename impl_type::size_type_1d_view;
1538 using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
1539 // rowptr points to the start of each row of A_colindsub.
1540 size_type_1d_view rowptr, rowptr_remote;
1541 // Indices into A's rows giving the blocks to extract. rowptr(i) points to
1542 // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
1543 // where g is A's graph, are the columns AmD uses. If seq_method_, then
1544 // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
1545 // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
1546 // contains the remote ones.
1547 local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
1548
1549 // Currently always true.
1550 bool is_tpetra_block_crs;
1551
1552 // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
1553 impl_scalar_type_1d_view_tpetra tpetra_values;
1554
1555 AmD() = default;
1556 AmD(const AmD &b) = default;
1557 };
1558
1562 template<typename MatrixType>
1563 void
1564 performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1565 const PartInterface<MatrixType> &interf,
1567 AmD<MatrixType> &amd,
1568 const bool overlap_communication_and_computation) {
1569 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");
1570
1571 using impl_type = ImplType<MatrixType>;
1572 // using node_memory_space = typename impl_type::node_memory_space;
1573 using host_execution_space = typename impl_type::host_execution_space;
1574
1575 using local_ordinal_type = typename impl_type::local_ordinal_type;
1576 using global_ordinal_type = typename impl_type::global_ordinal_type;
1577 using size_type = typename impl_type::size_type;
1578 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1579 using size_type_1d_view = typename impl_type::size_type_1d_view;
1580 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1581 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1582
1583 constexpr int vector_length = impl_type::vector_length;
1584
1585 const auto comm = A->getRowMap()->getComm();
1586 const auto& g = A->getCrsGraph();
1587 const auto blocksize = A->getBlockSize();
1588
1589 // mirroring to host
1590 const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1591 const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1592 const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1593 const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1594 const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1595
1596 const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1597
1598 // find column to row map on host
1599 Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());
1600 Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1601 {
1602 const auto rowmap = g.getRowMap();
1603 const auto colmap = g.getColMap();
1604 const auto dommap = g.getDomainMap();
1605 TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1606
1607#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1608 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1609 Kokkos::parallel_for
1610 ("performSymbolicPhase::RangePolicy::col2row",
1611 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1612 const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1613 TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
1614 if (dommap->isNodeGlobalElement(gid)) {
1615 const local_ordinal_type lc = colmap->getLocalElement(gid);
1616# if defined(BLOCKTRIDICONTAINER_DEBUG)
1617 TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1618 get_msg_prefix(comm) << "GID " << gid
1619 << " gives an invalid local column.");
1620# endif
1621 col2row(lc) = lr;
1622 }
1623 });
1624#endif
1625 }
1626
1627 // construct the D and R graphs in A = D + R.
1628 {
1629 const auto local_graph = g.getLocalGraphHost();
1630 const auto local_graph_rowptr = local_graph.row_map;
1631 TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1632 const auto local_graph_colidx = local_graph.entries;
1633
1634 //assume no overlap.
1635
1636 Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1637 {
1638 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1639 Kokkos::parallel_for
1640 ("performSymbolicPhase::RangePolicy::lclrow2idx",
1641 policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1642 lclrow2idx[lclrow(i)] = i;
1643 });
1644 }
1645
1646 // count (block) nnzs in D and R.
1647 typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
1648 typename sum_reducer_type::value_type sum_reducer_value;
1649 {
1650 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1651 Kokkos::parallel_reduce
1652 // profiling interface does not work
1653 (//"performSymbolicPhase::RangePolicy::count_nnz",
1654 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1655 // LID -> index.
1656 const local_ordinal_type ri0 = lclrow2idx[lr];
1657 const local_ordinal_type pi0 = rowidx2part(ri0);
1658 for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1659 const local_ordinal_type lc = local_graph_colidx(j);
1660 const local_ordinal_type lc2r = col2row[lc];
1661 bool incr_R = false;
1662 do { // breakable
1663 if (lc2r == (local_ordinal_type) -1) {
1664 incr_R = true;
1665 break;
1666 }
1667 const local_ordinal_type ri = lclrow2idx[lc2r];
1668 const local_ordinal_type pi = rowidx2part(ri);
1669 if (pi != pi0) {
1670 incr_R = true;
1671 break;
1672 }
1673 // Test for being in the tridiag. This is done in index space. In
1674 // LID space, tridiag LIDs in a row are not necessarily related by
1675 // {-1, 0, 1}.
1676 if (ri0 + 1 >= ri && ri0 <= ri + 1)
1677 ++update.v[0]; // D_nnz
1678 else
1679 incr_R = true;
1680 } while (0);
1681 if (incr_R) {
1682 if (lc < nrows) ++update.v[1]; // R_nnz_owned
1683 else ++update.v[2]; // R_nnz_remote
1684 }
1685 }
1686 }, sum_reducer_type(sum_reducer_value));
1687 }
1688 size_type D_nnz = sum_reducer_value.v[0];
1689 size_type R_nnz_owned = sum_reducer_value.v[1];
1690 size_type R_nnz_remote = sum_reducer_value.v[2];
1691
1692 if (!overlap_communication_and_computation) {
1693 R_nnz_owned += R_nnz_remote;
1694 R_nnz_remote = 0;
1695 }
1696
1697 // construct the D graph.
1698 {
1699 const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1700
1701 btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
1702 const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
1703
1704#if defined(BLOCKTRIDICONTAINER_DEBUG)
1705 Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1706#endif
1707
1708 const local_ordinal_type nparts = partptr.extent(0) - 1;
1709 {
1710 const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
1711 Kokkos::parallel_for
1712 ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
1713 policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
1714 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
1715 local_ordinal_type offset = 0;
1716 for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
1717 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
1718 offset = 1;
1719 const local_ordinal_type lr0 = lclrow(ri0);
1720 const size_type j0 = local_graph_rowptr(lr0);
1721 for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
1722 const local_ordinal_type lc = local_graph_colidx(j);
1723 const local_ordinal_type lc2r = col2row[lc];
1724 if (lc2r == (local_ordinal_type) -1) continue;
1725 const local_ordinal_type ri = lclrow2idx[lc2r];
1726 const local_ordinal_type pi = rowidx2part(ri);
1727 if (pi != pi0) continue;
1728 if (ri + 1 < ri0 || ri > ri0 + 1) continue;
1729 const local_ordinal_type row_entry = j - j0;
1730 D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
1731 }
1732 }
1733 });
1734 }
1735#if defined(BLOCKTRIDICONTAINER_DEBUG)
1736 for (size_t i=0;i<D_A_colindsub.extent(0);++i)
1737 TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1738#endif
1739 Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
1740
1741 // Allocate values.
1742 {
1743 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
1744 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
1745 btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
1746 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
1747 }
1748 }
1749
1750 // Construct the R graph.
1751 {
1752 amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
1753 amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
1754
1755 const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
1756 const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
1757
1758 amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
1759 amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
1760
1761 const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
1762 const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
1763
1764 {
1765 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1766 Kokkos::parallel_for
1767 ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
1768 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1769 const local_ordinal_type ri0 = lclrow2idx[lr];
1770 const local_ordinal_type pi0 = rowidx2part(ri0);
1771 const size_type j0 = local_graph_rowptr(lr);
1772 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1773 const local_ordinal_type lc = local_graph_colidx(j);
1774 const local_ordinal_type lc2r = col2row[lc];
1775 if (lc2r != (local_ordinal_type) -1) {
1776 const local_ordinal_type ri = lclrow2idx[lc2r];
1777 const local_ordinal_type pi = rowidx2part(ri);
1778 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
1779 continue;
1780 }
1781 }
1782 // exclusive scan will be performed later
1783 if (!overlap_communication_and_computation || lc < nrows) {
1784 ++R_rowptr(lr);
1785 } else {
1786 ++R_rowptr_remote(lr);
1787 }
1788 }
1789 });
1790 }
1791
1792 // exclusive scan
1793 typedef ArrayValueType<size_type,2> update_type;
1794 {
1795 Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
1796 Kokkos::parallel_scan
1797 ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
1798 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
1799 update_type &update,
1800 const bool &final) {
1801 update_type val;
1802 val.v[0] = R_rowptr(lr);
1803 if (overlap_communication_and_computation)
1804 val.v[1] = R_rowptr_remote(lr);
1805
1806 if (final) {
1807 R_rowptr(lr) = update.v[0];
1808 if (overlap_communication_and_computation)
1809 R_rowptr_remote(lr) = update.v[1];
1810
1811 if (lr < nrows) {
1812 const local_ordinal_type ri0 = lclrow2idx[lr];
1813 const local_ordinal_type pi0 = rowidx2part(ri0);
1814
1815 size_type cnt_rowptr = R_rowptr(lr);
1816 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
1817
1818 const size_type j0 = local_graph_rowptr(lr);
1819 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1820 const local_ordinal_type lc = local_graph_colidx(j);
1821 const local_ordinal_type lc2r = col2row[lc];
1822 if (lc2r != (local_ordinal_type) -1) {
1823 const local_ordinal_type ri = lclrow2idx[lc2r];
1824 const local_ordinal_type pi = rowidx2part(ri);
1825 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
1826 continue;
1827 }
1828 const local_ordinal_type row_entry = j - j0;
1829 if (!overlap_communication_and_computation || lc < nrows)
1830 R_A_colindsub(cnt_rowptr++) = row_entry;
1831 else
1832 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
1833 }
1834 }
1835 }
1836 update += val;
1837 });
1838 }
1839 TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
1840 Kokkos::deep_copy(amd.rowptr, R_rowptr);
1841 Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
1842 if (overlap_communication_and_computation) {
1843 TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
1844 Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
1845 Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
1846 }
1847
1848 // Allocate or view values.
1849 amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->getValuesDeviceNonConst());
1850
1851 }
1852 }
1853 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType<MatrixType>::execution_space)
1854 }
1855
1856
1860 template<typename ArgActiveExecutionMemorySpace>
1862
1863 template<>
1864 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
1865 typedef KB::Mode::Serial mode_type;
1866#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
1867 typedef KB::Algo::Level3::CompactMKL algo_type;
1868#else
1869 typedef KB::Algo::Level3::Blocked algo_type;
1870#endif
1871 static int recommended_team_size(const int /* blksize */,
1872 const int /* vector_length */,
1873 const int /* internal_vector_length */) {
1874 return 1;
1875 }
1876
1877 };
1878
1879#if defined(KOKKOS_ENABLE_CUDA)
1880 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
1881 const int vector_length,
1882 const int internal_vector_length) {
1883 const int vector_size = vector_length/internal_vector_length;
1884 int total_team_size(0);
1885 if (blksize <= 5) total_team_size = 32;
1886 else if (blksize <= 9) total_team_size = 32; // 64
1887 else if (blksize <= 12) total_team_size = 96;
1888 else if (blksize <= 16) total_team_size = 128;
1889 else if (blksize <= 20) total_team_size = 160;
1890 else total_team_size = 160;
1891 return 2*total_team_size/vector_size;
1892 }
1893 template<>
1894 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
1895 typedef KB::Mode::Team mode_type;
1896 typedef KB::Algo::Level3::Unblocked algo_type;
1897 static int recommended_team_size(const int blksize,
1898 const int vector_length,
1899 const int internal_vector_length) {
1900 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1901 }
1902 };
1903 template<>
1904 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
1905 typedef KB::Mode::Team mode_type;
1906 typedef KB::Algo::Level3::Unblocked algo_type;
1907 static int recommended_team_size(const int blksize,
1908 const int vector_length,
1909 const int internal_vector_length) {
1910 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1911 }
1912 };
1913#endif
1914
1915#if defined(KOKKOS_ENABLE_HIP)
1916 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
1917 const int vector_length,
1918 const int internal_vector_length) {
1919 const int vector_size = vector_length/internal_vector_length;
1920 int total_team_size(0);
1921 if (blksize <= 5) total_team_size = 32;
1922 else if (blksize <= 9) total_team_size = 32; // 64
1923 else if (blksize <= 12) total_team_size = 96;
1924 else if (blksize <= 16) total_team_size = 128;
1925 else if (blksize <= 20) total_team_size = 160;
1926 else total_team_size = 160;
1927 return 2*total_team_size/vector_size;
1928 }
1929 template<>
1930 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPSpace> {
1931 typedef KB::Mode::Team mode_type;
1932 typedef KB::Algo::Level3::Unblocked algo_type;
1933 static int recommended_team_size(const int blksize,
1934 const int vector_length,
1935 const int internal_vector_length) {
1936 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
1937 }
1938 };
1939 template<>
1940 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPHostPinnedSpace> {
1941 typedef KB::Mode::Team mode_type;
1942 typedef KB::Algo::Level3::Unblocked algo_type;
1943 static int recommended_team_size(const int blksize,
1944 const int vector_length,
1945 const int internal_vector_length) {
1946 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
1947 }
1948 };
1949#endif
1950
1951#if defined(KOKKOS_ENABLE_SYCL)
1952 static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
1953 const int vector_length,
1954 const int internal_vector_length) {
1955 const int vector_size = vector_length/internal_vector_length;
1956 int total_team_size(0);
1957 if (blksize <= 5) total_team_size = 32;
1958 else if (blksize <= 9) total_team_size = 32; // 64
1959 else if (blksize <= 12) total_team_size = 96;
1960 else if (blksize <= 16) total_team_size = 128;
1961 else if (blksize <= 20) total_team_size = 160;
1962 else total_team_size = 160;
1963 return 2*total_team_size/vector_size;
1964 }
1965 template<>
1966 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
1967 typedef KB::Mode::Team mode_type;
1968 typedef KB::Algo::Level3::Unblocked algo_type;
1969 static int recommended_team_size(const int blksize,
1970 const int vector_length,
1971 const int internal_vector_length) {
1972 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
1973 }
1974 };
1975 template<>
1976 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
1977 typedef KB::Mode::Team mode_type;
1978 typedef KB::Algo::Level3::Unblocked algo_type;
1979 static int recommended_team_size(const int blksize,
1980 const int vector_length,
1981 const int internal_vector_length) {
1982 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
1983 }
1984 };
1985#endif
1986
1987
1988 template<typename MatrixType>
1989 struct ExtractAndFactorizeTridiags {
1990 public:
1991 using impl_type = ImplType<MatrixType>;
1992 // a functor cannot have both device_type and execution_space; specialization error in kokkos
1993 using execution_space = typename impl_type::execution_space;
1994 using memory_space = typename impl_type::memory_space;
1996 using local_ordinal_type = typename impl_type::local_ordinal_type;
1997 using size_type = typename impl_type::size_type;
1998 using impl_scalar_type = typename impl_type::impl_scalar_type;
1999 using magnitude_type = typename impl_type::magnitude_type;
2001 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
2003 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2004 using size_type_1d_view = typename impl_type::size_type_1d_view;
2005 using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2007 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2008 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2009 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2010 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2011 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2012 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2013 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2014
2015 using internal_vector_type = typename impl_type::internal_vector_type;
2016 static constexpr int vector_length = impl_type::vector_length;
2017 static constexpr int internal_vector_length = impl_type::internal_vector_length;
2018
2020 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2021 using member_type = typename team_policy_type::member_type;
2022
2023 private:
2024 // part interface
2025 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
2026 const local_ordinal_type max_partsz;
2027 // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
2028 using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
2029 const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
2030 const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2031 // block tridiags
2032 const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
2033 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2034 const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
2035 const Unmanaged<btdm_scalar_type_4d_view> scalar_values;
2036 // shared information
2037 const local_ordinal_type blocksize, blocksize_square;
2038 // diagonal safety
2039 const magnitude_type tiny;
2040 const local_ordinal_type vector_loop_size;
2041 const local_ordinal_type vector_length_value;
2042
2043 public:
2044 ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2045 const PartInterface<MatrixType> &interf_,
2046 const Teuchos::RCP<const block_crs_matrix_type> &A_,
2047 const magnitude_type& tiny_) :
2048 // interface
2049 partptr(interf_.partptr),
2050 lclrow(interf_.lclrow),
2051 packptr(interf_.packptr),
2052 max_partsz(interf_.max_partsz),
2053 // block crs matrix
2054 A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map),
2055 A_values(const_cast<block_crs_matrix_type*>(A_.get())->getValuesDeviceNonConst()),
2056 // block tridiags
2057 pack_td_ptr(btdm_.pack_td_ptr),
2058 flat_td_ptr(btdm_.flat_td_ptr),
2059 A_colindsub(btdm_.A_colindsub),
2060 internal_vector_values((internal_vector_type*)btdm_.values.data(),
2061 btdm_.values.extent(0),
2062 btdm_.values.extent(1),
2063 btdm_.values.extent(2),
2064 vector_length/internal_vector_length),
2065 scalar_values((btdm_scalar_type*)btdm_.values.data(),
2066 btdm_.values.extent(0),
2067 btdm_.values.extent(1),
2068 btdm_.values.extent(2),
2069 vector_length),
2070 blocksize(btdm_.values.extent(1)),
2071 blocksize_square(blocksize*blocksize),
2072 // diagonal weight to avoid zero pivots
2073 tiny(tiny_),
2074 vector_loop_size(vector_length/internal_vector_length),
2075 vector_length_value(vector_length) {}
2076
2077 private:
2078
2079 KOKKOS_INLINE_FUNCTION
2080 void
2081 extract(local_ordinal_type partidx,
2082 local_ordinal_type npacks) const {
2083 using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2084 const size_type kps = pack_td_ptr(partidx);
2085 local_ordinal_type kfs[vector_length] = {};
2086 local_ordinal_type ri0[vector_length] = {};
2087 local_ordinal_type nrows[vector_length] = {};
2088
2089 for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2090 kfs[vi] = flat_td_ptr(partidx);
2091 ri0[vi] = partptr(partidx);
2092 nrows[vi] = partptr(partidx+1) - ri0[vi];
2093 }
2094 for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
2095 for (local_ordinal_type e=0;e<3;++e) {
2096 const impl_scalar_type* block[vector_length] = {};
2097 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2098 const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2099 block[vi] = &A_values(Aj*blocksize_square);
2100 }
2101 const size_type pi = kps + j;
2102 ++j;
2103 for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2104 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2105 //const auto idx = ii*blocksize + jj;
2106 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2107 auto& v = internal_vector_values(pi, ii, jj, 0);
2108 for (local_ordinal_type vi=0;vi<npacks;++vi)
2109 v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2110 }
2111 }
2112
2113 if (nrows[0] == 1) break;
2114 if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
2115 for (local_ordinal_type vi=1;vi<npacks;++vi) {
2116 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
2117 npacks = vi;
2118 break;
2119 }
2120 }
2121 }
2122 }
2123 }
2124
2125 KOKKOS_INLINE_FUNCTION
2126 void
2127 extract(const member_type &member,
2128 const local_ordinal_type &partidxbeg,
2129 const local_ordinal_type &npacks,
2130 const local_ordinal_type &vbeg) const {
2131 using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2132 local_ordinal_type kfs_vals[internal_vector_length] = {};
2133 local_ordinal_type ri0_vals[internal_vector_length] = {};
2134 local_ordinal_type nrows_vals[internal_vector_length] = {};
2135
2136 const size_type kps = pack_td_ptr(partidxbeg);
2137 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2138 kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
2139 ri0_vals[vi] = partptr(partidxbeg+vi);
2140 nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
2141 }
2142
2143 local_ordinal_type j_vals[internal_vector_length] = {};
2144 for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {
2145 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2146 const local_ordinal_type nrows = nrows_vals[vi];
2147 if (tr < nrows) {
2148 auto &j = j_vals[vi];
2149 const local_ordinal_type kfs = kfs_vals[vi];
2150 const local_ordinal_type ri0 = ri0_vals[vi];
2151 const local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
2152 const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
2153 for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
2154 const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
2155 const impl_scalar_type* block = &A_values(Aj*blocksize_square);
2156 const size_type pi = kps + j;
2157 Kokkos::parallel_for
2158 (Kokkos::TeamThreadRange(member,blocksize),
2159 [&](const local_ordinal_type &ii) {
2160 for (local_ordinal_type jj=0;jj<blocksize;++jj)
2161 scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
2162 });
2163 }
2164 }
2165 }
2166 }
2167 }
2168
2169 template<typename AAViewType,
2170 typename WWViewType>
2171 KOKKOS_INLINE_FUNCTION
2172 void
2173 factorize(const member_type &member,
2174 const local_ordinal_type &i0,
2175 const local_ordinal_type &nrows,
2176 const local_ordinal_type &v,
2177 const AAViewType &AA,
2178 const WWViewType &WW) const {
2179
2180 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
2181 <typename execution_space::memory_space> default_mode_and_algo_type;
2182
2183 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2184 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
2185
2186 // constant
2187 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2188
2189 // subview pattern
2190 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2191 KB::LU<member_type,
2192 default_mode_type,KB::Algo::LU::Unblocked>
2193 ::invoke(member, A , tiny);
2194
2195 if (nrows > 1) {
2196 auto B = A;
2197 auto C = A;
2198 local_ordinal_type i = i0;
2199 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2200 B.assign_data( &AA(i+1,0,0,v) );
2201 KB::Trsm<member_type,
2202 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2203 default_mode_type,default_algo_type>
2204 ::invoke(member, one, A, B);
2205 C.assign_data( &AA(i+2,0,0,v) );
2206 KB::Trsm<member_type,
2207 KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2208 default_mode_type,default_algo_type>
2209 ::invoke(member, one, A, C);
2210 A.assign_data( &AA(i+3,0,0,v) );
2211
2212 member.team_barrier();
2213 KB::Gemm<member_type,
2214 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2215 default_mode_type,default_algo_type>
2216 ::invoke(member, -one, C, B, one, A);
2217 KB::LU<member_type,
2218 default_mode_type,KB::Algo::LU::Unblocked>
2219 ::invoke(member, A, tiny);
2220 }
2221 } else {
2222 // for block jacobi invert a matrix here
2223 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2224 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2225 ::invoke(member, A, W);
2226 KB::SetIdentity<member_type,default_mode_type>
2227 ::invoke(member, A);
2228 member.team_barrier();
2229 KB::Trsm<member_type,
2230 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2231 default_mode_type,default_algo_type>
2232 ::invoke(member, one, W, A);
2233 KB::Trsm<member_type,
2234 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2235 default_mode_type,default_algo_type>
2236 ::invoke(member, one, W, A);
2237 }
2238 }
2239
2240 public:
2241
2242 struct ExtractAndFactorizeTag {};
2243
2244 KOKKOS_INLINE_FUNCTION
2245 void
2246 operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
2247 // btdm is packed and sorted from largest one
2248 const local_ordinal_type packidx = member.league_rank();
2249
2250 const local_ordinal_type partidx = packptr(packidx);
2251 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2252 const local_ordinal_type i0 = pack_td_ptr(partidx);
2253 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2254
2255 internal_vector_scratch_type_3d_view
2256 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
2257 if (vector_loop_size == 1) {
2258 extract(partidx, npacks);
2259 factorize(member, i0, nrows, 0, internal_vector_values, WW);
2260 } else {
2261 Kokkos::parallel_for
2262 (Kokkos::ThreadVectorRange(member, vector_loop_size),
2263 [&](const local_ordinal_type &v) {
2264 const local_ordinal_type vbeg = v*internal_vector_length;
2265 if (vbeg < npacks)
2266 extract(member, partidx+vbeg, npacks, vbeg);
2267 // this is not safe if vector loop size is different from vector size of
2268 // the team policy. we always make sure this when constructing the team policy
2269 member.team_barrier();
2270 factorize(member, i0, nrows, v, internal_vector_values, WW);
2271 });
2272 }
2273 }
2274
2275 void run() {
2276 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2277 const local_ordinal_type team_size =
2278 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2279 recommended_team_size(blocksize, vector_length, internal_vector_length);
2280 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
2281 shmem_size(blocksize, blocksize, vector_loop_size);
2282
2283 Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
2284 policy(packptr.extent(0)-1, team_size, vector_loop_size);
2285#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2286 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2287 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
2288#else
2289 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
2290 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2291 policy, *this);
2292#endif
2293 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2294 }
2295
2296 };
2297
2301 template<typename MatrixType>
2302 void
2303 performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
2304 const PartInterface<MatrixType> &interf,
2306 const typename ImplType<MatrixType>::magnitude_type tiny) {
2307 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
2308 ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
2309 function.run();
2310 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType<MatrixType>::execution_space)
2311 }
2312
2316 template<typename MatrixType>
2317 struct MultiVectorConverter {
2318 public:
2319 using impl_type = ImplType<MatrixType>;
2320 using execution_space = typename impl_type::execution_space;
2321 using memory_space = typename impl_type::memory_space;
2322
2323 using local_ordinal_type = typename impl_type::local_ordinal_type;
2324 using impl_scalar_type = typename impl_type::impl_scalar_type;
2325 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2326 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
2327 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2328 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2329 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2330 using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
2331 static constexpr int vector_length = impl_type::vector_length;
2332
2333 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
2334
2335 private:
2336 // part interface
2337 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2338 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2339 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2340 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
2341 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2342 const local_ordinal_type blocksize;
2343 const local_ordinal_type num_vectors;
2344
2345 // packed multivector output (or input)
2346 vector_type_3d_view packed_multivector;
2347 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
2348
2349 template<typename TagType>
2350 KOKKOS_INLINE_FUNCTION
2351 void copy_multivectors(const local_ordinal_type &j,
2352 const local_ordinal_type &vi,
2353 const local_ordinal_type &pri,
2354 const local_ordinal_type &ri0) const {
2355 for (local_ordinal_type col=0;col<num_vectors;++col)
2356 for (local_ordinal_type i=0;i<blocksize;++i)
2357 packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2358 }
2359
2360 public:
2361
2362 MultiVectorConverter(const PartInterface<MatrixType> &interf,
2363 const vector_type_3d_view &pmv)
2364 : partptr(interf.partptr),
2365 packptr(interf.packptr),
2366 part2packrowidx0(interf.part2packrowidx0),
2367 part2rowidx0(interf.part2rowidx0),
2368 lclrow(interf.lclrow),
2369 blocksize(pmv.extent(1)),
2370 num_vectors(pmv.extent(2)),
2371 packed_multivector(pmv) {}
2372
2373 // TODO:: modify this routine similar to the team level functions
2374 // inline ---> FIXME HIP: should not need the KOKKOS_INLINE_FUNCTION below...
2375 KOKKOS_INLINE_FUNCTION
2376 void
2377 operator() (const local_ordinal_type &packidx) const {
2378 local_ordinal_type partidx = packptr(packidx);
2379 local_ordinal_type npacks = packptr(packidx+1) - partidx;
2380 const local_ordinal_type pri0 = part2packrowidx0(partidx);
2381
2382 local_ordinal_type ri0[vector_length] = {};
2383 local_ordinal_type nrows[vector_length] = {};
2384 for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
2385 ri0[v] = part2rowidx0(partidx);
2386 nrows[v] = part2rowidx0(partidx+1) - ri0[v];
2387 }
2388 for (local_ordinal_type j=0;j<nrows[0];++j) {
2389 local_ordinal_type cnt = 1;
2390 for (;cnt<npacks && j!= nrows[cnt];++cnt);
2391 npacks = cnt;
2392 const local_ordinal_type pri = pri0 + j;
2393 for (local_ordinal_type col=0;col<num_vectors;++col)
2394 for (local_ordinal_type i=0;i<blocksize;++i)
2395 for (local_ordinal_type v=0;v<npacks;++v)
2396 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
2397 }
2398 }
2399
2400 KOKKOS_INLINE_FUNCTION
2401 void
2402 operator() (const member_type &member) const {
2403 const local_ordinal_type packidx = member.league_rank();
2404 const local_ordinal_type partidx_begin = packptr(packidx);
2405 const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
2406 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
2407 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
2408 const local_ordinal_type partidx = partidx_begin + v;
2409 const local_ordinal_type ri0 = part2rowidx0(partidx);
2410 const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
2411
2412 if (nrows == 1) {
2413 const local_ordinal_type pri = pri0;
2414 for (local_ordinal_type col=0;col<num_vectors;++col) {
2415 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
2416 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
2417 });
2418 }
2419 } else {
2420 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
2421 const local_ordinal_type pri = pri0 + j;
2422 for (local_ordinal_type col=0;col<num_vectors;++col)
2423 for (local_ordinal_type i=0;i<blocksize;++i)
2424 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2425 });
2426 }
2427 });
2428 }
2429
2430 void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
2431 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2432 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter");
2433
2434 scalar_multivector = scalar_multivector_;
2435 if constexpr (is_device<execution_space>::value) {
2436 const local_ordinal_type vl = vector_length;
2437 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
2438 Kokkos::parallel_for
2439 ("MultiVectorConverter::TeamPolicy", policy, *this);
2440 } else {
2441 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
2442 Kokkos::parallel_for
2443 ("MultiVectorConverter::RangePolicy", policy, *this);
2444 }
2445 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2446 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
2447 }
2448 };
2449
2453 template<typename ArgActiveExecutionMemorySpace>
2455
2456 template<>
2457 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2458 typedef KB::Mode::Serial mode_type;
2459 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2460#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2461 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
2462#else
2463 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
2464#endif
2465 static int recommended_team_size(const int /* blksize */,
2466 const int /* vector_length */,
2467 const int /* internal_vector_length */) {
2468 return 1;
2469 }
2470 };
2471
2472#if defined(KOKKOS_ENABLE_CUDA)
2473 static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
2474 const int vector_length,
2475 const int internal_vector_length) {
2476 const int vector_size = vector_length/internal_vector_length;
2477 int total_team_size(0);
2478 if (blksize <= 5) total_team_size = 32;
2479 else if (blksize <= 9) total_team_size = 32; // 64
2480 else if (blksize <= 12) total_team_size = 96;
2481 else if (blksize <= 16) total_team_size = 128;
2482 else if (blksize <= 20) total_team_size = 160;
2483 else total_team_size = 160;
2484 return total_team_size/vector_size;
2485 }
2486
2487 template<>
2488 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2489 typedef KB::Mode::Team mode_type;
2490 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2491 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2492 static int recommended_team_size(const int blksize,
2493 const int vector_length,
2494 const int internal_vector_length) {
2495 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2496 }
2497 };
2498 template<>
2499 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2500 typedef KB::Mode::Team mode_type;
2501 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2502 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2503 static int recommended_team_size(const int blksize,
2504 const int vector_length,
2505 const int internal_vector_length) {
2506 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2507 }
2508 };
2509#endif
2510
2511#if defined(KOKKOS_ENABLE_HIP)
2512 static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
2513 const int vector_length,
2514 const int internal_vector_length) {
2515 const int vector_size = vector_length/internal_vector_length;
2516 int total_team_size(0);
2517 if (blksize <= 5) total_team_size = 32;
2518 else if (blksize <= 9) total_team_size = 32; // 64
2519 else if (blksize <= 12) total_team_size = 96;
2520 else if (blksize <= 16) total_team_size = 128;
2521 else if (blksize <= 20) total_team_size = 160;
2522 else total_team_size = 160;
2523 return total_team_size/vector_size;
2524 }
2525
2526 template<>
2527 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPSpace> {
2528 typedef KB::Mode::Team mode_type;
2529 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2530 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2531 static int recommended_team_size(const int blksize,
2532 const int vector_length,
2533 const int internal_vector_length) {
2534 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2535 }
2536 };
2537 template<>
2538 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPHostPinnedSpace> {
2539 typedef KB::Mode::Team mode_type;
2540 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2541 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2542 static int recommended_team_size(const int blksize,
2543 const int vector_length,
2544 const int internal_vector_length) {
2545 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2546 }
2547 };
2548#endif
2549
2550#if defined(KOKKOS_ENABLE_SYCL)
2551 static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
2552 const int vector_length,
2553 const int internal_vector_length) {
2554 const int vector_size = vector_length/internal_vector_length;
2555 int total_team_size(0);
2556 if (blksize <= 5) total_team_size = 32;
2557 else if (blksize <= 9) total_team_size = 32; // 64
2558 else if (blksize <= 12) total_team_size = 96;
2559 else if (blksize <= 16) total_team_size = 128;
2560 else if (blksize <= 20) total_team_size = 160;
2561 else total_team_size = 160;
2562 return total_team_size/vector_size;
2563 }
2564
2565 template<>
2566 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2567 typedef KB::Mode::Team mode_type;
2568 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2569 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2570 static int recommended_team_size(const int blksize,
2571 const int vector_length,
2572 const int internal_vector_length) {
2573 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2574 }
2575 };
2576 template<>
2577 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2578 typedef KB::Mode::Team mode_type;
2579 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2580 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2581 static int recommended_team_size(const int blksize,
2582 const int vector_length,
2583 const int internal_vector_length) {
2584 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2585 }
2586 };
2587#endif
2588
2589
2590
2591
2592 template<typename MatrixType>
2593 struct SolveTridiags {
2594 public:
2595 using impl_type = ImplType<MatrixType>;
2596 using execution_space = typename impl_type::execution_space;
2597
2598 using local_ordinal_type = typename impl_type::local_ordinal_type;
2599 using size_type = typename impl_type::size_type;
2600 using impl_scalar_type = typename impl_type::impl_scalar_type;
2601 using magnitude_type = typename impl_type::magnitude_type;
2602 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2603 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2605 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2606 using size_type_1d_view = typename impl_type::size_type_1d_view;
2608 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2609 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2610 //using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2611
2612 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2613
2614 using internal_vector_type =typename impl_type::internal_vector_type;
2615 static constexpr int vector_length = impl_type::vector_length;
2616 static constexpr int internal_vector_length = impl_type::internal_vector_length;
2617
2619 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
2620 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2621
2623 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2624 using member_type = typename team_policy_type::member_type;
2625
2626 private:
2627 // part interface
2628 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2629 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2630 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2631 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2632
2633 // block tridiags
2634 const ConstUnmanaged<size_type_1d_view> pack_td_ptr;
2635
2636 // block tridiags values
2637 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
2638 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
2639
2640 const local_ordinal_type vector_loop_size;
2641
2642 // copy to multivectors : damping factor and Y_scalar_multivector
2643 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
2644#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
2645 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2646#else
2647 /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2648#endif
2649 const impl_scalar_type df;
2650 const bool compute_diff;
2651
2652 public:
2653 SolveTridiags(const PartInterface<MatrixType> &interf,
2654 const BlockTridiags<MatrixType> &btdm,
2655 const vector_type_3d_view &pmv,
2656 const impl_scalar_type damping_factor,
2657 const bool is_norm_manager_active)
2658 :
2659 // interface
2660 partptr(interf.partptr),
2661 packptr(interf.packptr),
2662 part2packrowidx0(interf.part2packrowidx0),
2663 lclrow(interf.lclrow),
2664 // block tridiags and multivector
2665 pack_td_ptr(btdm.pack_td_ptr),
2666 D_internal_vector_values((internal_vector_type*)btdm.values.data(),
2667 btdm.values.extent(0),
2668 btdm.values.extent(1),
2669 btdm.values.extent(2),
2670 vector_length/internal_vector_length),
2671 X_internal_vector_values((internal_vector_type*)pmv.data(),
2672 pmv.extent(0),
2673 pmv.extent(1),
2674 pmv.extent(2),
2675 vector_length/internal_vector_length),
2676 vector_loop_size(vector_length/internal_vector_length),
2677 Y_scalar_multivector(),
2678 Z_scalar_vector(),
2679 df(damping_factor),
2680 compute_diff(is_norm_manager_active)
2681 {}
2682
2683 public:
2684
2686 KOKKOS_INLINE_FUNCTION
2687 void
2688 copyToFlatMultiVector(const member_type &member,
2689 const local_ordinal_type partidxbeg, // partidx for v = 0
2690 const local_ordinal_type npacks,
2691 const local_ordinal_type pri0,
2692 const local_ordinal_type v, // index with a loop of vector_loop_size
2693 const local_ordinal_type blocksize,
2694 const local_ordinal_type num_vectors) const {
2695 const local_ordinal_type vbeg = v*internal_vector_length;
2696 if (vbeg < npacks) {
2697 local_ordinal_type ri0_vals[internal_vector_length] = {};
2698 local_ordinal_type nrows_vals[internal_vector_length] = {};
2699 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2700 const local_ordinal_type partidx = partidxbeg+vv;
2701 ri0_vals[vi] = partptr(partidx);
2702 nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
2703 }
2704
2705 impl_scalar_type z_partial_sum(0);
2706 if (nrows_vals[0] == 1) {
2707 const local_ordinal_type j=0, pri=pri0;
2708 {
2709 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2710 const local_ordinal_type ri0 = ri0_vals[vi];
2711 const local_ordinal_type nrows = nrows_vals[vi];
2712 if (j < nrows) {
2713 Kokkos::parallel_for
2714 (Kokkos::TeamThreadRange(member, blocksize),
2715 [&](const local_ordinal_type &i) {
2716 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2717 for (local_ordinal_type col=0;col<num_vectors;++col) {
2718 impl_scalar_type &y = Y_scalar_multivector(row,col);
2719 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2720 y += df*yd;
2721
2722 {//if (compute_diff) {
2723 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2724 z_partial_sum += yd_abs*yd_abs;
2725 }
2726 }
2727 });
2728 }
2729 }
2730 }
2731 } else {
2732 Kokkos::parallel_for
2733 (Kokkos::TeamThreadRange(member, nrows_vals[0]),
2734 [&](const local_ordinal_type &j) {
2735 const local_ordinal_type pri = pri0 + j;
2736 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2737 const local_ordinal_type ri0 = ri0_vals[vi];
2738 const local_ordinal_type nrows = nrows_vals[vi];
2739 if (j < nrows) {
2740 for (local_ordinal_type col=0;col<num_vectors;++col) {
2741 for (local_ordinal_type i=0;i<blocksize;++i) {
2742 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2743 impl_scalar_type &y = Y_scalar_multivector(row,col);
2744 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2745 y += df*yd;
2746
2747 {//if (compute_diff) {
2748 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2749 z_partial_sum += yd_abs*yd_abs;
2750 }
2751 }
2752 }
2753 }
2754 }
2755 });
2756 }
2757 //if (compute_diff)
2758 Z_scalar_vector(member.league_rank()) += z_partial_sum;
2759 }
2760 }
2761
2765 template<typename WWViewType>
2766 KOKKOS_INLINE_FUNCTION
2767 void
2768 solveSingleVector(const member_type &member,
2769 const local_ordinal_type &blocksize,
2770 const local_ordinal_type &i0,
2771 const local_ordinal_type &r0,
2772 const local_ordinal_type &nrows,
2773 const local_ordinal_type &v,
2774 const WWViewType &WW) const {
2775
2776 typedef SolveTridiagsDefaultModeAndAlgo
2777 <typename execution_space::memory_space> default_mode_and_algo_type;
2778
2779 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2780 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2781
2782 // base pointers
2783 auto A = D_internal_vector_values.data();
2784 auto X = X_internal_vector_values.data();
2785
2786 // constant
2787 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2788 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2789 //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2790
2791 // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2792 const local_ordinal_type astep = D_internal_vector_values.stride_0();
2793 const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2794 const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2795 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2796 const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2797
2798 // for multiple rhs
2799 //const local_ordinal_type xs0 = num_vectors*vector_length; //X_scalar_values.stride_1();
2800 //const local_ordinal_type xs1 = vector_length; //X_scalar_values.stride_2();
2801
2802 // move to starting point
2803 A += i0*astep + v;
2804 X += r0*xstep + v;
2805
2806 //for (local_ordinal_type col=0;col<num_vectors;++col)
2807 if (nrows > 1) {
2808 // solve Lx = x
2809 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2810 (default_mode_type,default_algo_type,
2811 member,
2812 KB::Diag::Unit,
2813 blocksize,blocksize,
2814 one,
2815 A, as0, as1,
2816 X, xs0);
2817
2818 for (local_ordinal_type tr=1;tr<nrows;++tr) {
2819 member.team_barrier();
2820 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2821 (default_mode_type,default_algo_type,
2822 member,
2823 blocksize, blocksize,
2824 -one,
2825 A+2*astep, as0, as1,
2826 X, xs0,
2827 one,
2828 X+1*xstep, xs0);
2829 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2830 (default_mode_type,default_algo_type,
2831 member,
2832 KB::Diag::Unit,
2833 blocksize,blocksize,
2834 one,
2835 A+3*astep, as0, as1,
2836 X+1*xstep, xs0);
2837
2838 A += 3*astep;
2839 X += 1*xstep;
2840 }
2841
2842 // solve Ux = x
2843 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2844 (default_mode_type,default_algo_type,
2845 member,
2846 KB::Diag::NonUnit,
2847 blocksize, blocksize,
2848 one,
2849 A, as0, as1,
2850 X, xs0);
2851
2852 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2853 A -= 3*astep;
2854 member.team_barrier();
2855 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2856 (default_mode_type,default_algo_type,
2857 member,
2858 blocksize, blocksize,
2859 -one,
2860 A+1*astep, as0, as1,
2861 X, xs0,
2862 one,
2863 X-1*xstep, xs0);
2864 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2865 (default_mode_type,default_algo_type,
2866 member,
2867 KB::Diag::NonUnit,
2868 blocksize, blocksize,
2869 one,
2870 A, as0, as1,
2871 X-1*xstep,xs0);
2872 X -= 1*xstep;
2873 }
2874 // for multiple rhs
2875 //X += xs1;
2876 } else {
2877 const local_ordinal_type ws0 = WW.stride_0();
2878 auto W = WW.data() + v;
2879 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2880 (default_mode_type,
2881 member, blocksize, X, xs0, W, ws0);
2882 member.team_barrier();
2883 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2884 (default_mode_type,default_algo_type,
2885 member,
2886 blocksize, blocksize,
2887 one,
2888 A, as0, as1,
2889 W, xs0,
2890 zero,
2891 X, xs0);
2892 }
2893 }
2894
2895 template<typename WWViewType>
2896 KOKKOS_INLINE_FUNCTION
2897 void
2898 solveMultiVector(const member_type &member,
2899 const local_ordinal_type &/* blocksize */,
2900 const local_ordinal_type &i0,
2901 const local_ordinal_type &r0,
2902 const local_ordinal_type &nrows,
2903 const local_ordinal_type &v,
2904 const WWViewType &WW) const {
2905
2906 typedef SolveTridiagsDefaultModeAndAlgo
2907 <typename execution_space::memory_space> default_mode_and_algo_type;
2908
2909 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2910 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2911
2912 // constant
2913 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2914 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2915
2916 // subview pattern
2917 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2918 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2919 auto X2 = X1;
2920
2921 local_ordinal_type i = i0, r = r0;
2922
2923
2924 if (nrows > 1) {
2925 // solve Lx = x
2926 KB::Trsm<member_type,
2927 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2928 default_mode_type,default_algo_type>
2929 ::invoke(member, one, A, X1);
2930 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2931 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2932 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2933 member.team_barrier();
2934 KB::Gemm<member_type,
2935 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2936 default_mode_type,default_algo_type>
2937 ::invoke(member, -one, A, X1, one, X2);
2938 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2939 KB::Trsm<member_type,
2940 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2941 default_mode_type,default_algo_type>
2942 ::invoke(member, one, A, X2);
2943 X1.assign_data( X2.data() );
2944 }
2945
2946 // solve Ux = x
2947 KB::Trsm<member_type,
2948 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2949 default_mode_type,default_algo_type>
2950 ::invoke(member, one, A, X1);
2951 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2952 i -= 3;
2953 A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2954 X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2955 member.team_barrier();
2956 KB::Gemm<member_type,
2957 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2958 default_mode_type,default_algo_type>
2959 ::invoke(member, -one, A, X1, one, X2);
2960
2961 A.assign_data( &D_internal_vector_values(i,0,0,v) );
2962 KB::Trsm<member_type,
2963 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2964 default_mode_type,default_algo_type>
2965 ::invoke(member, one, A, X2);
2966 X1.assign_data( X2.data() );
2967 }
2968 } else {
2969 // matrix is already inverted
2970 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2971 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2972 ::invoke(member, X1, W);
2973 member.team_barrier();
2974 KB::Gemm<member_type,
2975 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2976 default_mode_type,default_algo_type>
2977 ::invoke(member, one, A, W, zero, X1);
2978 }
2979 }
2980
2981 template<int B> struct SingleVectorTag {};
2982 template<int B> struct MultiVectorTag {};
2983
2984 template<int B>
2985 KOKKOS_INLINE_FUNCTION
2986 void
2987 operator() (const SingleVectorTag<B> &, const member_type &member) const {
2988 const local_ordinal_type packidx = member.league_rank();
2989 const local_ordinal_type partidx = packptr(packidx);
2990 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2991 const local_ordinal_type pri0 = part2packrowidx0(partidx);
2992 const local_ordinal_type i0 = pack_td_ptr(partidx);
2993 const local_ordinal_type r0 = part2packrowidx0(partidx);
2994 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2995 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2996 const local_ordinal_type num_vectors = 1;
2997 internal_vector_scratch_type_3d_view
2998 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
2999 Kokkos::single(Kokkos::PerTeam(member), [&]() {
3000 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
3001 });
3002 Kokkos::parallel_for
3003 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3004 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
3005 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
3006 });
3007 }
3008
3009 template<int B>
3010 KOKKOS_INLINE_FUNCTION
3011 void
3012 operator() (const MultiVectorTag<B> &, const member_type &member) const {
3013 const local_ordinal_type packidx = member.league_rank();
3014 const local_ordinal_type partidx = packptr(packidx);
3015 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
3016 const local_ordinal_type pri0 = part2packrowidx0(partidx);
3017 const local_ordinal_type i0 = pack_td_ptr(partidx);
3018 const local_ordinal_type r0 = part2packrowidx0(partidx);
3019 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
3020 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
3021 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
3022
3023 internal_vector_scratch_type_3d_view
3024 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
3025 Kokkos::single(Kokkos::PerTeam(member), [&]() {
3026 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
3027 });
3028 Kokkos::parallel_for
3029 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3030 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
3031 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
3032 });
3033 }
3034
3035 void run(const impl_scalar_type_2d_view_tpetra &Y,
3036 const impl_scalar_type_1d_view &Z) {
3037 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3038 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SolveTridiags");
3039
3041 this->Y_scalar_multivector = Y;
3042 this->Z_scalar_vector = Z;
3043
3044 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
3045 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
3046
3047 const local_ordinal_type team_size =
3048 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3049 recommended_team_size(blocksize, vector_length, internal_vector_length);
3050 const int per_team_scratch = internal_vector_scratch_type_3d_view
3051 ::shmem_size(blocksize, num_vectors, vector_loop_size);
3052
3053#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
3054#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
3055 if (num_vectors == 1) { \
3056 const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
3057 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
3058 Kokkos::parallel_for \
3059 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
3060 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
3061 } else { \
3062 const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
3063 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
3064 Kokkos::parallel_for \
3065 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
3066 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
3067 } break
3068#else
3069#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
3070 if (num_vectors == 1) { \
3071 Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
3072 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
3073 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
3074 Kokkos::parallel_for \
3075 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
3076 policy, *this); \
3077 } else { \
3078 Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
3079 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
3080 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
3081 Kokkos::parallel_for \
3082 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
3083 policy, *this); \
3084 } break
3085#endif
3086 switch (blocksize) {
3087 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
3088 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
3089 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
3090 case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
3091 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
3092 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
3093 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
3094 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
3095 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
3096 default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
3097 }
3098#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
3099
3100 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3101 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
3102 }
3103 };
3104
3108 static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
3109 const int team_size) {
3110 int total_team_size(0);
3111 if (blksize <= 5) total_team_size = 32;
3112 else if (blksize <= 9) total_team_size = 32; // 64
3113 else if (blksize <= 12) total_team_size = 96;
3114 else if (blksize <= 16) total_team_size = 128;
3115 else if (blksize <= 20) total_team_size = 160;
3116 else total_team_size = 160;
3117 return total_team_size/team_size;
3118 }
3119
3120 static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
3121 const int team_size) {
3122 int total_team_size(0);
3123 if (blksize <= 5) total_team_size = 32;
3124 else if (blksize <= 9) total_team_size = 32; // 64
3125 else if (blksize <= 12) total_team_size = 96;
3126 else if (blksize <= 16) total_team_size = 128;
3127 else if (blksize <= 20) total_team_size = 160;
3128 else total_team_size = 160;
3129 return total_team_size/team_size;
3130 }
3131
3132 static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
3133 const int team_size) {
3134 int total_team_size(0);
3135 if (blksize <= 5) total_team_size = 32;
3136 else if (blksize <= 9) total_team_size = 32; // 64
3137 else if (blksize <= 12) total_team_size = 96;
3138 else if (blksize <= 16) total_team_size = 128;
3139 else if (blksize <= 20) total_team_size = 160;
3140 else total_team_size = 160;
3141 return total_team_size/team_size;
3142 }
3143
3144 template<typename T>
3145 static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
3146 const int team_size) {
3147 if ( is_cuda<T>::value )
3148 return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
3149 if ( is_hip<T>::value )
3150 return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
3151 if ( is_sycl<T>::value )
3152 return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
3153 return -1;
3154 }
3155
3156
3157 template<typename MatrixType>
3158 struct ComputeResidualVector {
3159 public:
3160 using impl_type = ImplType<MatrixType>;
3161 using node_device_type = typename impl_type::node_device_type;
3162 using execution_space = typename impl_type::execution_space;
3163 using memory_space = typename impl_type::memory_space;
3164
3165 using local_ordinal_type = typename impl_type::local_ordinal_type;
3166 using size_type = typename impl_type::size_type;
3167 using impl_scalar_type = typename impl_type::impl_scalar_type;
3168 using magnitude_type = typename impl_type::magnitude_type;
3169 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3170 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
3172 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3173 using size_type_1d_view = typename impl_type::size_type_1d_view;
3174 using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
3175 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3176 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
3177 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3178 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
3179 static constexpr int vector_length = impl_type::vector_length;
3180
3182 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3183
3184 // enum for max blocksize and vector length
3185 enum : int { max_blocksize = 32 };
3186
3187 private:
3188 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
3189 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
3190 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
3191 Unmanaged<impl_scalar_type_2d_view_tpetra> y;
3192 Unmanaged<vector_type_3d_view> y_packed;
3193 Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
3194
3195 // AmD information
3196 const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
3197 const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
3198 const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
3199
3200 // block crs graph information
3201 // for cuda (kokkos crs graph uses a different size_type from size_t)
3202 const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_rowptr;
3203 const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
3204
3205 // blocksize
3206 const local_ordinal_type blocksize_requested;
3207
3208 // part interface
3209 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3210 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3211 const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
3212 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3213 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3214 const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
3215 const bool is_dm2cm_active;
3216
3217 public:
3218 template<typename LocalCrsGraphType>
3219 ComputeResidualVector(const AmD<MatrixType> &amd,
3220 const LocalCrsGraphType &graph,
3221 const local_ordinal_type &blocksize_requested_,
3222 const PartInterface<MatrixType> &interf,
3223 const local_ordinal_type_1d_view &dm2cm_)
3224 : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
3225 colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
3226 tpetra_values(amd.tpetra_values),
3227 A_rowptr(graph.row_map),
3228 A_colind(graph.entries),
3229 blocksize_requested(blocksize_requested_),
3230 part2packrowidx0(interf.part2packrowidx0),
3231 part2rowidx0(interf.part2rowidx0),
3232 rowidx2part(interf.rowidx2part),
3233 partptr(interf.partptr),
3234 lclrow(interf.lclrow),
3235 dm2cm(dm2cm_),
3236 is_dm2cm_active(dm2cm_.span() > 0)
3237 {}
3238
3239 inline
3240 void
3241 SerialGemv(const local_ordinal_type &blocksize,
3242 const impl_scalar_type * const KOKKOS_RESTRICT AA,
3243 const impl_scalar_type * const KOKKOS_RESTRICT xx,
3244 /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
3245 using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3246 for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3247 impl_scalar_type val = 0;
3248#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
3249# pragma ivdep
3250#endif
3251#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
3252# pragma unroll
3253#endif
3254 for (local_ordinal_type k1=0;k1<blocksize;++k1)
3255 val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
3256 yy[k0] -= val;
3257 }
3258 }
3259
3260 template<typename bbViewType, typename yyViewType>
3261 KOKKOS_INLINE_FUNCTION
3262 void
3263 VectorCopy(const member_type &member,
3264 const local_ordinal_type &blocksize,
3265 const bbViewType &bb,
3266 const yyViewType &yy) const {
3267 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
3268 yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
3269 });
3270 }
3271
3272 template<typename AAViewType, typename xxViewType, typename yyViewType>
3273 KOKKOS_INLINE_FUNCTION
3274 void
3275 TeamVectorGemv(const member_type &member,
3276 const local_ordinal_type &blocksize,
3277 const AAViewType &AA,
3278 const xxViewType &xx,
3279 const yyViewType &yy) const {
3280 Kokkos::parallel_for
3281 (Kokkos::TeamThreadRange(member, blocksize),
3282 [&](const local_ordinal_type &k0) {
3283 impl_scalar_type val = 0;
3284 Kokkos::parallel_for
3285 (Kokkos::ThreadVectorRange(member, blocksize),
3286 [&](const local_ordinal_type &k1) {
3287 val += AA(k0,k1)*xx(k1);
3288 });
3289 Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3290 });
3291 }
3292
3293 template<typename AAViewType, typename xxViewType, typename yyViewType>
3294 KOKKOS_INLINE_FUNCTION
3295 void
3296 VectorGemv(const member_type &member,
3297 const local_ordinal_type &blocksize,
3298 const AAViewType &AA,
3299 const xxViewType &xx,
3300 const yyViewType &yy) const {
3301 Kokkos::parallel_for
3302 (Kokkos::ThreadVectorRange(member, blocksize),
3303 [&](const local_ordinal_type &k0) {
3304 impl_scalar_type val(0);
3305 for (local_ordinal_type k1=0;k1<blocksize;++k1) {
3306 val += AA(k0,k1)*xx(k1);
3307 }
3308 Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3309 });
3310 }
3311
3312 // template<typename AAViewType, typename xxViewType, typename yyViewType>
3313 // KOKKOS_INLINE_FUNCTION
3314 // void
3315 // VectorGemv(const member_type &member,
3316 // const local_ordinal_type &blocksize,
3317 // const AAViewType &AA,
3318 // const xxViewType &xx,
3319 // const yyViewType &yy) const {
3320 // for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3321 // impl_scalar_type val = 0;
3322 // Kokkos::parallel_for
3323 // (Kokkos::ThreadVectorRange(member, blocksize),
3324 // [&](const local_ordinal_type &k1) {
3325 // val += AA(k0,k1)*xx(k1);
3326 // });
3327 // Kokkos::atomic_fetch_add(&yy(k0), -val);
3328 // }
3329 // }
3330
3331 struct SeqTag {};
3332
3333 // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3334 KOKKOS_INLINE_FUNCTION
3335 void
3336 operator() (const SeqTag &, const local_ordinal_type& i) const {
3337 const local_ordinal_type blocksize = blocksize_requested;
3338 const local_ordinal_type blocksize_square = blocksize*blocksize;
3339
3340 // constants
3341 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3342 const local_ordinal_type num_vectors = y.extent(1);
3343 const local_ordinal_type row = i*blocksize;
3344 for (local_ordinal_type col=0;col<num_vectors;++col) {
3345 // y := b
3346 impl_scalar_type *yy = &y(row, col);
3347 const impl_scalar_type * const bb = &b(row, col);
3348 memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
3349
3350 // y -= Rx
3351 const size_type A_k0 = A_rowptr[i];
3352 for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
3353 const size_type j = A_k0 + colindsub[k];
3354 const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3355 const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
3356 SerialGemv(blocksize,AA,xx,yy);
3357 }
3358 }
3359 }
3360
3361 KOKKOS_INLINE_FUNCTION
3362 void
3363 operator() (const SeqTag &, const member_type &member) const {
3364
3365 // constants
3366 const local_ordinal_type blocksize = blocksize_requested;
3367 const local_ordinal_type blocksize_square = blocksize*blocksize;
3368
3369 const local_ordinal_type lr = member.league_rank();
3370 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3371 const local_ordinal_type num_vectors = y.extent(1);
3372
3373 // subview pattern
3374 auto bb = Kokkos::subview(b, block_range, 0);
3375 auto xx = bb;
3376 auto yy = Kokkos::subview(y, block_range, 0);
3377 auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3378
3379 const local_ordinal_type row = lr*blocksize;
3380 for (local_ordinal_type col=0;col<num_vectors;++col) {
3381 // y := b
3382 yy.assign_data(&y(row, col));
3383 bb.assign_data(&b(row, col));
3384 if (member.team_rank() == 0)
3385 VectorCopy(member, blocksize, bb, yy);
3386 member.team_barrier();
3387
3388 // y -= Rx
3389 const size_type A_k0 = A_rowptr[lr];
3390 Kokkos::parallel_for
3391 (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3392 [&](const local_ordinal_type &k) {
3393 const size_type j = A_k0 + colindsub[k];
3394 A_block.assign_data( &tpetra_values(j*blocksize_square) );
3395 xx.assign_data( &x(A_colind[j]*blocksize, col) );
3396 VectorGemv(member, blocksize, A_block, xx, yy);
3397 });
3398 }
3399 }
3400
3401 template<int B>
3402 struct AsyncTag {};
3403
3404 template<int B>
3405 // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3406 KOKKOS_INLINE_FUNCTION
3407 void
3408 operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
3409 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3410 const local_ordinal_type blocksize_square = blocksize*blocksize;
3411
3412 // constants
3413 const local_ordinal_type partidx = rowidx2part(rowidx);
3414 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3415 const local_ordinal_type v = partidx % vector_length;
3416
3417 const local_ordinal_type num_vectors = y_packed.extent(2);
3418 const local_ordinal_type num_local_rows = lclrow.extent(0);
3419
3420 // temporary buffer for y flat
3421 impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
3422
3423 const local_ordinal_type lr = lclrow(rowidx);
3424 const local_ordinal_type row = lr*blocksize;
3425 for (local_ordinal_type col=0;col<num_vectors;++col) {
3426 // y := b
3427 memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3428
3429 // y -= Rx
3430 const size_type A_k0 = A_rowptr[lr];
3431 for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
3432 const size_type j = A_k0 + colindsub[k];
3433 const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3434 const local_ordinal_type A_colind_at_j = A_colind[j];
3435 if (A_colind_at_j < num_local_rows) {
3436 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3437 const impl_scalar_type * const xx = &x(loc*blocksize, col);
3438 SerialGemv(blocksize, AA,xx,yy);
3439 } else {
3440 const auto loc = A_colind_at_j - num_local_rows;
3441 const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3442 SerialGemv(blocksize, AA,xx_remote,yy);
3443 }
3444 }
3445 // move yy to y_packed
3446 for (local_ordinal_type k=0;k<blocksize;++k)
3447 y_packed(pri, k, col)[v] = yy[k];
3448 }
3449 }
3450
3451 template<int B>
3452 KOKKOS_INLINE_FUNCTION
3453 void
3454 operator() (const AsyncTag<B> &, const member_type &member) const {
3455 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3456 const local_ordinal_type blocksize_square = blocksize*blocksize;
3457
3458 // constants
3459 const local_ordinal_type rowidx = member.league_rank();
3460 const local_ordinal_type partidx = rowidx2part(rowidx);
3461 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3462 const local_ordinal_type v = partidx % vector_length;
3463
3464 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3465 const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3466 const local_ordinal_type num_local_rows = lclrow.extent(0);
3467
3468 // subview pattern
3469 auto bb = Kokkos::subview(b, block_range, 0);
3470 auto xx = Kokkos::subview(x, block_range, 0);
3471 auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
3472 auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3473 auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3474
3475 const local_ordinal_type lr = lclrow(rowidx);
3476 const local_ordinal_type row = lr*blocksize;
3477 for (local_ordinal_type col=0;col<num_vectors;++col) {
3478 // y := b
3479 bb.assign_data(&b(row, col));
3480 yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3481 if (member.team_rank() == 0)
3482 VectorCopy(member, blocksize, bb, yy);
3483 member.team_barrier();
3484
3485 // y -= Rx
3486 const size_type A_k0 = A_rowptr[lr];
3487 Kokkos::parallel_for
3488 (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3489 [&](const local_ordinal_type &k) {
3490 const size_type j = A_k0 + colindsub[k];
3491 A_block.assign_data( &tpetra_values(j*blocksize_square) );
3492
3493 const local_ordinal_type A_colind_at_j = A_colind[j];
3494 if (A_colind_at_j < num_local_rows) {
3495 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3496 xx.assign_data( &x(loc*blocksize, col) );
3497 VectorGemv(member, blocksize, A_block, xx, yy);
3498 } else {
3499 const auto loc = A_colind_at_j - num_local_rows;
3500 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3501 VectorGemv(member, blocksize, A_block, xx_remote, yy);
3502 }
3503 });
3504 }
3505 }
3506
3507 template <int P, int B> struct OverlapTag {};
3508
3509 template<int P, int B>
3510 // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3511 KOKKOS_INLINE_FUNCTION
3512 void
3513 operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
3514 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3515 const local_ordinal_type blocksize_square = blocksize*blocksize;
3516
3517 // constants
3518 const local_ordinal_type partidx = rowidx2part(rowidx);
3519 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3520 const local_ordinal_type v = partidx % vector_length;
3521
3522 const local_ordinal_type num_vectors = y_packed.extent(2);
3523 const local_ordinal_type num_local_rows = lclrow.extent(0);
3524
3525 // temporary buffer for y flat
3526 impl_scalar_type yy[max_blocksize] = {};
3527
3528 auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3529 auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3530
3531 const local_ordinal_type lr = lclrow(rowidx);
3532 const local_ordinal_type row = lr*blocksize;
3533 for (local_ordinal_type col=0;col<num_vectors;++col) {
3534 if (P == 0) {
3535 // y := b
3536 memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3537 } else {
3538 // y (temporary) := 0
3539 memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
3540 }
3541
3542 // y -= Rx
3543 const size_type A_k0 = A_rowptr[lr];
3544 for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
3545 const size_type j = A_k0 + colindsub_used[k];
3546 const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3547 const local_ordinal_type A_colind_at_j = A_colind[j];
3548 if (P == 0) {
3549 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3550 const impl_scalar_type * const xx = &x(loc*blocksize, col);
3551 SerialGemv(blocksize,AA,xx,yy);
3552 } else {
3553 const auto loc = A_colind_at_j - num_local_rows;
3554 const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3555 SerialGemv(blocksize,AA,xx_remote,yy);
3556 }
3557 }
3558 // move yy to y_packed
3559 if (P == 0) {
3560 for (local_ordinal_type k=0;k<blocksize;++k)
3561 y_packed(pri, k, col)[v] = yy[k];
3562 } else {
3563 for (local_ordinal_type k=0;k<blocksize;++k)
3564 y_packed(pri, k, col)[v] += yy[k];
3565 }
3566 }
3567 }
3568
3569 template<int P, int B>
3570 KOKKOS_INLINE_FUNCTION
3571 void
3572 operator() (const OverlapTag<P,B> &, const member_type &member) const {
3573 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3574 const local_ordinal_type blocksize_square = blocksize*blocksize;
3575
3576 // constants
3577 const local_ordinal_type rowidx = member.league_rank();
3578 const local_ordinal_type partidx = rowidx2part(rowidx);
3579 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3580 const local_ordinal_type v = partidx % vector_length;
3581
3582 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3583 const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3584 const local_ordinal_type num_local_rows = lclrow.extent(0);
3585
3586 // subview pattern
3587 auto bb = Kokkos::subview(b, block_range, 0);
3588 auto xx = bb; //Kokkos::subview(x, block_range, 0);
3589 auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0);
3590 auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3591 auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3592 auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3593 auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3594
3595 const local_ordinal_type lr = lclrow(rowidx);
3596 const local_ordinal_type row = lr*blocksize;
3597 for (local_ordinal_type col=0;col<num_vectors;++col) {
3598 yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3599 if (P == 0) {
3600 // y := b
3601 bb.assign_data(&b(row, col));
3602 if (member.team_rank() == 0)
3603 VectorCopy(member, blocksize, bb, yy);
3604 member.team_barrier();
3605 }
3606
3607 // y -= Rx
3608 const size_type A_k0 = A_rowptr[lr];
3609 Kokkos::parallel_for
3610 (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
3611 [&](const local_ordinal_type &k) {
3612 const size_type j = A_k0 + colindsub_used[k];
3613 A_block.assign_data( &tpetra_values(j*blocksize_square) );
3614
3615 const local_ordinal_type A_colind_at_j = A_colind[j];
3616 if (P == 0) {
3617 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3618 xx.assign_data( &x(loc*blocksize, col) );
3619 VectorGemv(member, blocksize, A_block, xx, yy);
3620 } else {
3621 const auto loc = A_colind_at_j - num_local_rows;
3622 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3623 VectorGemv(member, blocksize, A_block, xx_remote, yy);
3624 }
3625 });
3626 }
3627 }
3628
3629 // y = b - Rx; seq method
3630 template<typename MultiVectorLocalViewTypeY,
3631 typename MultiVectorLocalViewTypeB,
3632 typename MultiVectorLocalViewTypeX>
3633 void run(const MultiVectorLocalViewTypeY &y_,
3634 const MultiVectorLocalViewTypeB &b_,
3635 const MultiVectorLocalViewTypeX &x_) {
3636 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3637 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");
3638
3639 y = y_; b = b_; x = x_;
3640 if constexpr (is_device<execution_space>::value) {
3641 const local_ordinal_type blocksize = blocksize_requested;
3642 const local_ordinal_type team_size = 8;
3643 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
3644 const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3645 Kokkos::parallel_for
3646 ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3647 } else {
3648 const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
3649 Kokkos::parallel_for
3650 ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
3651 }
3652 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3653 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
3654 }
3655
3656 // y = b - R (x , x_remote)
3657 template<typename MultiVectorLocalViewTypeB,
3658 typename MultiVectorLocalViewTypeX,
3659 typename MultiVectorLocalViewTypeX_Remote>
3660 void run(const vector_type_3d_view &y_packed_,
3661 const MultiVectorLocalViewTypeB &b_,
3662 const MultiVectorLocalViewTypeX &x_,
3663 const MultiVectorLocalViewTypeX_Remote &x_remote_) {
3664 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3665 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");
3666
3667 b = b_; x = x_; x_remote = x_remote_;
3668 if constexpr (is_device<execution_space>::value) {
3669 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3670 y_packed_.extent(0),
3671 y_packed_.extent(1),
3672 y_packed_.extent(2),
3673 vector_length);
3674 } else {
3675 y_packed = y_packed_;
3676 }
3677
3678 if constexpr(is_device<execution_space>::value) {
3679 const local_ordinal_type blocksize = blocksize_requested;
3680 const local_ordinal_type team_size = 8;
3681 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
3682 // local_ordinal_type vl_power_of_two = 1;
3683 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3684 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3685 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3686#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3687 const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3688 policy(rowidx2part.extent(0), team_size, vector_size); \
3689 Kokkos::parallel_for \
3690 ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3691 policy, *this); } break
3692 switch (blocksize_requested) {
3693 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3694 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3695 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3696 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3697 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3698 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3699 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3700 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3701 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3702 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3703 }
3704#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3705 } else {
3706#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3707 const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
3708 Kokkos::parallel_for \
3709 ("ComputeResidual::RangePolicy::run<AsyncTag>", \
3710 policy, *this); } break
3711 switch (blocksize_requested) {
3712 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3713 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3714 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3715 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3716 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3717 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3718 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3719 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3720 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3721 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3722 }
3723#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3724 }
3725 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3726 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
3727 }
3728
3729 // y = b - R (y , y_remote)
3730 template<typename MultiVectorLocalViewTypeB,
3731 typename MultiVectorLocalViewTypeX,
3732 typename MultiVectorLocalViewTypeX_Remote>
3733 void run(const vector_type_3d_view &y_packed_,
3734 const MultiVectorLocalViewTypeB &b_,
3735 const MultiVectorLocalViewTypeX &x_,
3736 const MultiVectorLocalViewTypeX_Remote &x_remote_,
3737 const bool compute_owned) {
3738 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3739 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");
3740
3741 b = b_; x = x_; x_remote = x_remote_;
3742 if constexpr (is_device<execution_space>::value) {
3743 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3744 y_packed_.extent(0),
3745 y_packed_.extent(1),
3746 y_packed_.extent(2),
3747 vector_length);
3748 } else {
3749 y_packed = y_packed_;
3750 }
3751
3752 if constexpr (is_device<execution_space>::value) {
3753 const local_ordinal_type blocksize = blocksize_requested;
3754 const local_ordinal_type team_size = 8;
3755 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
3756 // local_ordinal_type vl_power_of_two = 1;
3757 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3758 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3759 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3760#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3761 if (compute_owned) { \
3762 const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3763 policy(rowidx2part.extent(0), team_size, vector_size); \
3764 Kokkos::parallel_for \
3765 ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3766 } else { \
3767 const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3768 policy(rowidx2part.extent(0), team_size, vector_size); \
3769 Kokkos::parallel_for \
3770 ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3771 } break
3772 switch (blocksize_requested) {
3773 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3774 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3775 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3776 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3777 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3778 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3779 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3780 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3781 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3782 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3783 }
3784#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3785 } else {
3786#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3787 if (compute_owned) { \
3788 const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
3789 policy(0, rowidx2part.extent(0)); \
3790 Kokkos::parallel_for \
3791 ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
3792 } else { \
3793 const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
3794 policy(0, rowidx2part.extent(0)); \
3795 Kokkos::parallel_for \
3796 ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
3797 } break
3798
3799 switch (blocksize_requested) {
3800 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3801 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3802 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3803 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3804 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3805 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3806 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3807 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3808 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3809 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3810 }
3811#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3812 }
3813 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3814 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
3815 }
3816 };
3817
3818 template<typename MatrixType>
3819 void reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
3820 /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
3821 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3822 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ReduceVector");
3823
3824 using impl_type = ImplType<MatrixType>;
3825 using local_ordinal_type = typename impl_type::local_ordinal_type;
3826 using impl_scalar_type = typename impl_type::impl_scalar_type;
3827#if 0
3828 const auto norm2 = KokkosBlas::nrm1(zz);
3829#else
3830 impl_scalar_type norm2(0);
3831 Kokkos::parallel_reduce
3832 ("ReduceMultiVector::Device",
3833 Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
3834 KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
3835 update += zz(i);
3836 }, norm2);
3837#endif
3838 vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);
3839
3840 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3841 IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType<MatrixType>::execution_space)
3842 }
3843
3847 template<typename MatrixType>
3848 struct NormManager {
3849 public:
3850 using impl_type = ImplType<MatrixType>;
3851 using host_execution_space = typename impl_type::host_execution_space;
3852 using magnitude_type = typename impl_type::magnitude_type;
3853
3854 private:
3855 bool collective_;
3856 int sweep_step_, sweep_step_upper_bound_;
3857#ifdef HAVE_IFPACK2_MPI
3858 MPI_Request mpi_request_;
3859 MPI_Comm comm_;
3860#endif
3861 magnitude_type work_[3];
3862
3863 public:
3864 NormManager() = default;
3865 NormManager(const NormManager &b) = default;
3866 NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
3867 sweep_step_ = 1;
3868 sweep_step_upper_bound_ = 1;
3869 collective_ = comm->getSize() > 1;
3870 if (collective_) {
3871#ifdef HAVE_IFPACK2_MPI
3872 const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
3873 TEUCHOS_ASSERT( ! mpi_comm.is_null());
3874 comm_ = *mpi_comm->getRawMpiComm();
3875#endif
3876 }
3877 const magnitude_type zero(0), minus_one(-1);
3878 work_[0] = zero;
3879 work_[1] = zero;
3880 work_[2] = minus_one;
3881 }
3882
3883 // Check the norm every sweep_step sweeps.
3884 void setCheckFrequency(const int sweep_step) {
3885 TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
3886 sweep_step_upper_bound_ = sweep_step;
3887 sweep_step_ = 1;
3888 }
3889
3890 // Get the buffer into which to store rank-local squared norms.
3891 magnitude_type* getBuffer() { return &work_[0]; }
3892
3893 // Call MPI_Iallreduce to find the global squared norms.
3894 void ireduce(const int sweep, const bool force = false) {
3895 if ( ! force && sweep % sweep_step_) return;
3896
3897 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");
3898
3899 work_[1] = work_[0];
3900#ifdef HAVE_IFPACK2_MPI
3901 auto send_data = &work_[1];
3902 auto recv_data = &work_[0];
3903 if (collective_) {
3904# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3905 MPI_Iallreduce(send_data, recv_data, 1,
3906 Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3907 MPI_SUM, comm_, &mpi_request_);
3908# else
3909 MPI_Allreduce (send_data, recv_data, 1,
3910 Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3911 MPI_SUM, comm_);
3912# endif
3913 }
3914#endif
3915 }
3916
3917 // Check if the norm-based termination criterion is met. tol2 is the
3918 // tolerance squared. Sweep is the sweep index. If not every iteration is
3919 // being checked, this function immediately returns false. If a check must
3920 // be done at this iteration, it waits for the reduction triggered by
3921 // ireduce to complete, then checks the global norm against the tolerance.
3922 bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
3923 // early return
3924 if (sweep <= 0) return false;
3925
3926 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");
3927
3928 TEUCHOS_ASSERT(sweep >= 1);
3929 if ( ! force && (sweep - 1) % sweep_step_) return false;
3930 if (collective_) {
3931#ifdef HAVE_IFPACK2_MPI
3932# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3933 MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
3934# else
3935 // Do nothing.
3936# endif
3937#endif
3938 }
3939 bool r_val = false;
3940 if (sweep == 1) {
3941 work_[2] = work_[0];
3942 } else {
3943 r_val = (work_[0] < tol2*work_[2]);
3944 }
3945
3946 // adjust sweep step
3947 const auto adjusted_sweep_step = 2*sweep_step_;
3948 if (adjusted_sweep_step < sweep_step_upper_bound_) {
3949 sweep_step_ = adjusted_sweep_step;
3950 } else {
3951 sweep_step_ = sweep_step_upper_bound_;
3952 }
3953 return r_val;
3954 }
3955
3956 // After termination has occurred, finalize the norms for use in
3957 // get_norms{0,final}.
3958 void finalize () {
3959 work_[0] = std::sqrt(work_[0]); // after converged
3960 if (work_[2] >= 0)
3961 work_[2] = std::sqrt(work_[2]); // first norm
3962 // if work_[2] is minus one, then norm is not requested.
3963 }
3964
3965 // Report norms to the caller.
3966 const magnitude_type getNorms0 () const { return work_[2]; }
3967 const magnitude_type getNormsFinal () const { return work_[0]; }
3968 };
3969
3973 template<typename MatrixType>
3974 int
3975 applyInverseJacobi(// importer
3976 const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
3977 const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
3978 const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
3979 const bool overlap_communication_and_computation,
3980 // tpetra interface
3981 const typename ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
3982 /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
3983 /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
3984 /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
3985 // local object interface
3986 const PartInterface<MatrixType> &interf, // mesh interface
3987 const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
3988 const AmD<MatrixType> &amd, // R = A - D
3989 /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
3990 /* */ NormManager<MatrixType> &norm_manager,
3991 // preconditioner parameters
3992 const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
3993 /* */ bool is_y_zero,
3994 const int max_num_sweeps,
3995 const typename ImplType<MatrixType>::magnitude_type tol,
3996 const int check_tol_every) {
3997 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");
3998
3999 using impl_type = ImplType<MatrixType>;
4000 using node_memory_space = typename impl_type::node_memory_space;
4001 using local_ordinal_type = typename impl_type::local_ordinal_type;
4002 using size_type = typename impl_type::size_type;
4003 using impl_scalar_type = typename impl_type::impl_scalar_type;
4004 using magnitude_type = typename impl_type::magnitude_type;
4005 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4006 using vector_type_1d_view = typename impl_type::vector_type_1d_view;
4007 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4008 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
4009
4010 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4011
4012 // either tpetra importer or async importer must be active
4013 TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
4014 "Neither Tpetra importer nor Async importer is null.");
4015 // max number of sweeps should be positive number
4016 TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
4017 "Maximum number of sweeps must be >= 1.");
4018
4019 // const parameters
4020 const bool is_seq_method_requested = !tpetra_importer.is_null();
4021 const bool is_async_importer_active = !async_importer.is_null();
4022 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4023 const magnitude_type tolerance = tol*tol;
4024 const local_ordinal_type blocksize = btdm.values.extent(1);
4025 const local_ordinal_type num_vectors = Y.getNumVectors();
4026 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4027
4028 const impl_scalar_type zero(0.0);
4029
4030 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4031 "The seq method for applyInverseJacobi, " <<
4032 "which in any case is for developer use only, " <<
4033 "does not support norm-based termination.");
4034 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4035 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4036 TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4037 std::invalid_argument,
4038 "The seq method for applyInverseJacobi, " <<
4039 "which in any case is for developer use only, " <<
4040 "only supports memory spaces accessible from host.");
4041
4042 // if workspace is needed more, resize it
4043 const size_type work_span_required = num_blockrows*num_vectors*blocksize;
4044 if (work.span() < work_span_required)
4045 work = vector_type_1d_view("vector workspace 1d view", work_span_required);
4046
4047 // construct W
4048 const local_ordinal_type W_size = interf.packptr.extent(0)-1;
4049 if (local_ordinal_type(W.extent(0)) < W_size)
4050 W = impl_scalar_type_1d_view("W", W_size);
4051
4052 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4053 {
4054 if (is_seq_method_requested) {
4055 if (Z.getNumVectors() != Y.getNumVectors())
4056 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
4057 } else {
4058 if (is_async_importer_active) {
4059 // create comm data buffer and keep it here
4060 async_importer->createDataBuffer(num_vectors);
4061 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4062 }
4063 }
4064 }
4065
4066 // wrap the workspace with 3d view
4067 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4068 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4069 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4070 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4071 if (is_y_zero) Kokkos::deep_copy(YY, zero);
4072
4073 MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
4074 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4075 damping_factor, is_norm_manager_active);
4076
4077 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4078 ComputeResidualVector<MatrixType>
4079 compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf,
4080 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
4081
4082 // norm manager workspace resize
4083 if (is_norm_manager_active)
4084 norm_manager.setCheckFrequency(check_tol_every);
4085
4086 // iterate
4087 int sweep = 0;
4088 for (;sweep<max_num_sweeps;++sweep) {
4089 {
4090 if (is_y_zero) {
4091 // pmv := x(lclrow)
4092 multivector_converter.run(XX);
4093 } else {
4094 if (is_seq_method_requested) {
4095 // SEQ METHOD IS TESTING ONLY
4096
4097 // y := x - R y
4098 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
4099 compute_residual_vector.run(YY, XX, ZZ);
4100
4101 // pmv := y(lclrow).
4102 multivector_converter.run(YY);
4103 } else {
4104 // fused y := x - R y and pmv := y(lclrow);
4105 // real use case does not use overlap comp and comm
4106 if (overlap_communication_and_computation || !is_async_importer_active) {
4107 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
4108 compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
4109 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
4110 if (is_async_importer_active) async_importer->cancel();
4111 break;
4112 }
4113 if (is_async_importer_active) {
4114 async_importer->syncRecv();
4115 compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
4116 }
4117 } else {
4118 if (is_async_importer_active)
4119 async_importer->syncExchange(YY);
4120 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
4121 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
4122 }
4123 }
4124 }
4125 }
4126
4127 // pmv := inv(D) pmv.
4128 {
4129 solve_tridiags.run(YY, W);
4130 }
4131 {
4132 if (is_norm_manager_active) {
4133 // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
4134 reduceVector<MatrixType>(W, norm_manager.getBuffer());
4135 if (sweep + 1 == max_num_sweeps) {
4136 norm_manager.ireduce(sweep, true);
4137 norm_manager.checkDone(sweep + 1, tolerance, true);
4138 } else {
4139 norm_manager.ireduce(sweep);
4140 }
4141 }
4142 }
4143 is_y_zero = false;
4144 }
4145
4146 //sqrt the norms for the caller's use.
4147 if (is_norm_manager_active) norm_manager.finalize();
4148
4149 return sweep;
4150 }
4151
4152
4153 template<typename MatrixType>
4154 struct ImplObject {
4155 using impl_type = ImplType<MatrixType>;
4156 using part_interface_type = PartInterface<MatrixType>;
4157 using block_tridiags_type = BlockTridiags<MatrixType>;
4158 using amd_type = AmD<MatrixType>;
4159 using norm_manager_type = NormManager<MatrixType>;
4160 using async_import_type = AsyncableImport<MatrixType>;
4161
4162 // distructed objects
4163 Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
4164 Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
4165 Teuchos::RCP<async_import_type> async_importer;
4166 bool overlap_communication_and_computation;
4167
4168 // copy of Y (mutable to penentrate const)
4169 mutable typename impl_type::tpetra_multivector_type Z;
4170 mutable typename impl_type::impl_scalar_type_1d_view W;
4171
4172 // local objects
4173 part_interface_type part_interface;
4174 block_tridiags_type block_tridiags; // D
4175 amd_type a_minus_d; // R = A - D
4176 mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
4177 mutable norm_manager_type norm_manager;
4178 };
4179
4180 } // namespace BlockTriDiContainerDetails
4181
4182} // namespace Ifpack2
4183
4184#endif
Definition Ifpack2_BlockTriDiContainer_decl.hpp:111
Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:447
BlockTridiags< MatrixType > createBlockTridiags(const PartInterface< MatrixType > &interf)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1365
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1040
void performNumericPhase(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename ImplType< MatrixType >::magnitude_type tiny)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:2303
PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const Teuchos::Array< Teuchos::Array< typename ImplType< MatrixType >::local_ordinal_type > > &partitions)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1154
static int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize, const int team_size)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:3108
int applyInverseJacobi(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, const bool overlap_communication_and_computation, const typename ImplType< MatrixType >::tpetra_multivector_type &X, typename ImplType< MatrixType >::tpetra_multivector_type &Y, typename ImplType< MatrixType >::tpetra_multivector_type &Z, typename ImplType< MatrixType >::impl_scalar_type_1d_view &W, const PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const AmD< MatrixType > &amd, typename ImplType< MatrixType >::vector_type_1d_view &work, NormManager< MatrixType > &norm_manager, const typename ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:3975
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition Ifpack2_BlockTriDiContainer_impl.hpp:121
void performSymbolicPhase(const Teuchos::RCP< const typename ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, AmD< MatrixType > &amd, const bool overlap_communication_and_computation)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1564
std::string get_msg_prefix(const CommPtrType &comm)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:262
Preconditioners and smoothers for Tpetra sparse matrices.
Definition Ifpack2_AdditiveSchwarz_decl.hpp:74
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1534
Definition Ifpack2_BlockTriDiContainer_impl.hpp:274
Definition Ifpack2_BlockTriDiContainer_impl.hpp:178
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1321
Definition Ifpack2_BlockTriDiContainer_impl.hpp:215
Definition Ifpack2_BlockTriDiContainer_impl.hpp:354
Kokkos::DefaultHostExecutionSpace host_execution_space
Definition Ifpack2_BlockTriDiContainer_impl.hpp:376
Kokkos::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition Ifpack2_BlockTriDiContainer_impl.hpp:367
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition Ifpack2_BlockTriDiContainer_impl.hpp:423
KB::Vector< T, l > Vector
Definition Ifpack2_BlockTriDiContainer_impl.hpp:410
size_t size_type
Definition Ifpack2_BlockTriDiContainer_impl.hpp:358
node_type::device_type node_device_type
Definition Ifpack2_BlockTriDiContainer_impl.hpp:381
Definition Ifpack2_BlockTriDiContainer_impl.hpp:2454
Definition Ifpack2_BlockTriDiContainer_impl.hpp:301
Definition Ifpack2_BlockTriDiContainer_impl.hpp:165
Definition Ifpack2_BlockTriDiContainer_impl.hpp:187
Definition Ifpack2_BlockTriDiContainer_impl.hpp:195
Definition Ifpack2_BlockTriDiContainer_impl.hpp:203