Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// ************************************************************************
38// @HEADER
39
40#ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
41#define TPETRA_BLOCKCRSMATRIX_DEF_HPP
42
45
49#include "Tpetra_BlockMultiVector.hpp"
50#include "Tpetra_BlockView.hpp"
51
52#include "Teuchos_TimeMonitor.hpp"
53#ifdef HAVE_TPETRA_DEBUG
54# include <set>
55#endif // HAVE_TPETRA_DEBUG
56
57//
58// mfh 25 May 2016: Temporary fix for #393.
59//
60// Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
61// 4.7.2 compiler bug ("internal compiler error") when compiling them.
62// Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
63// use them in that case, either.
64//
65// mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
66// compiler error") too. Ditto for GCC 5.1. I'll just disable the
67// thing for any GCC version.
68//
69#if defined(__CUDACC__)
70 // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
71# if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
72# undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
73# endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
74
75#elif defined(__GNUC__)
76
77# if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
78# undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
79# endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
80
81#else // some other compiler
82
83 // Optimistically assume that other compilers aren't broken.
84# if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
85# define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
86# endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
87#endif // defined(__CUDACC__), defined(__GNUC__)
88
89
90namespace Tpetra {
91
92namespace Impl {
93
94 template<typename T>
95 struct BlockCrsRowStruct {
96 T totalNumEntries, totalNumBytes, maxRowLength;
97 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
98 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
99 KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
100 : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
101 };
102
103 template<typename T>
104 static
105 KOKKOS_INLINE_FUNCTION
106 void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
107 a.totalNumEntries += b.totalNumEntries;
108 a.totalNumBytes += b.totalNumBytes;
109 a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
110 }
111
  // Custom Kokkos reducer wrapping a caller-owned accumulator of type T.
  // The typedefs (reducer, value_type, result_view_type) and the
  // join/init/reference/view members satisfy the Kokkos reducer concept,
  // so this can be passed to Kokkos::parallel_reduce.
  template<typename T, typename ExecSpace>
  struct BlockCrsReducer {
    typedef BlockCrsReducer reducer;  // required by the Kokkos reducer concept
    typedef T value_type;
    typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
    value_type *value;  // points to caller-owned result storage (not owned here)

    KOKKOS_INLINE_FUNCTION
    BlockCrsReducer(value_type &val) : value(&val) {}

    // Combine partial results; delegates to value_type's operator+=
    // (e.g., the BlockCrsRowStruct combiner above).
    KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
    KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const { dst += src; }
    // Reduction identity: value-initialize (zeros for arithmetic fields).
    KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
    KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
    KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
  };
128
  /// \brief Functor for the no-transpose block-CRS mat-vec
  ///   Y := beta*Y + alpha*A*X, applied to ONE column of X and Y.
  ///
  /// This is the generic (IsBuiltInType == false) version, launched
  /// with a Kokkos::RangePolicy: each iteration processes one block
  /// row serially using the little-block kernels FILL/SCAL/GEMV.
  /// The team-policy operator() here is a stub; the partial
  /// specialization with IsBuiltInType == true provides the
  /// hierarchical-parallel kernel for built-in scalar types.
  template<class AlphaCoeffType,
           class GraphType,
           class MatrixValuesType,
           class InVecType,
           class BetaCoeffType,
           class OutVecType,
           bool IsBuiltInType>
  class BcrsApplyNoTransFunctor {
  private:
    static_assert (Kokkos::is_view<MatrixValuesType>::value,
                   "MatrixValuesType must be a Kokkos::View.");
    static_assert (Kokkos::is_view<OutVecType>::value,
                   "OutVecType must be a Kokkos::View.");
    static_assert (Kokkos::is_view<InVecType>::value,
                   "InVecType must be a Kokkos::View.");
    static_assert (std::is_same<MatrixValuesType,
                     typename MatrixValuesType::const_type>::value,
                   "MatrixValuesType must be a const Kokkos::View.");
    static_assert (std::is_same<typename OutVecType::value_type,
                     typename OutVecType::non_const_value_type>::value,
                   "OutVecType must be a nonconst Kokkos::View.");
    static_assert (std::is_same<typename InVecType::value_type,
                     typename InVecType::const_value_type>::value,
                   "InVecType must be a const Kokkos::View.");
    static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
                   "MatrixValuesType must be a rank-1 Kokkos::View.");
    static_assert (static_cast<int> (InVecType::rank) == 1,
                   "InVecType must be a rank-1 Kokkos::View.");
    static_assert (static_cast<int> (OutVecType::rank) == 1,
                   "OutVecType must be a rank-1 Kokkos::View.");
    typedef typename MatrixValuesType::non_const_value_type scalar_type;

  public:
    typedef typename GraphType::device_type device_type;

    // Local ordinal type, derived from the graph's entry type.
    typedef typename std::remove_const<typename GraphType::data_type>::type
      local_ordinal_type;

    // Rebind the input column so one functor instance can be reused
    // for every column of a multivector (avoids re-instantiation).
    void setX (const InVecType& X) { X_ = X; }

    // Rebind the output column; see setX.
    void setY (const OutVecType& Y) { Y_ = Y; }

    typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
    typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;

    // Capture coefficients, graph structure (row offsets + column
    // indices), block values, the block size, and the two columns.
    BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
                             const GraphType& graph,
                             const MatrixValuesType& val,
                             const local_ordinal_type blockSize,
                             const InVecType& X,
                             const beta_coeff_type& beta,
                             const OutVecType& Y) :
      alpha_ (alpha),
      ptr_ (graph.row_map),
      ind_ (graph.entries),
      val_ (val),
      blockSize_ (blockSize),
      X_ (X),
      beta_ (beta),
      Y_ (Y)
    {}

    // Dummy team version (only the IsBuiltInType == true
    // specialization implements team parallelism).
    KOKKOS_INLINE_FUNCTION void
    operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
    {
      Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
    }

    // Range Policy for non built-in types: one iteration == one block row.
    KOKKOS_INLINE_FUNCTION void
    operator () (const local_ordinal_type& lclRow) const
    {
      using ::Tpetra::FILL;
      using ::Tpetra::SCAL;
      using ::Tpetra::GEMV;
      using Kokkos::ArithTraits;
      // I'm not writing 'using Kokkos::make_pair;' here, because that
      // may break builds for users who make the mistake of putting
      // 'using namespace std;' in the global namespace. Please don't
      // ever do that! But just in case you do, I'll take this
      // precaution.
      using Kokkos::parallel_for;
      using Kokkos::subview;
      typedef typename decltype (ptr_)::non_const_value_type offset_type;
      // Unmanaged 2-D view of one blockSize_ x blockSize_ block.
      // NOTE(review): a layout template argument (likely the BCRS
      // little-block layout) appears to have been elided by the doc
      // generator between these lines — confirm against the
      // repository source.
      typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
                           device_type,
                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >
        little_block_type;

      // Point-entry range of Y belonging to this block row.
      const offset_type Y_ptBeg = lclRow * blockSize_;
      const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
      auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));

      // This version of the code does not use temporary storage.
      // Each thread writes to its own block of the target vector.
      if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
        // beta == 0: overwrite Y (also avoids reading NaN/uninit data).
        FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
      }
      else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
        SCAL (beta_, Y_cur);
      }

      if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
        const offset_type blkBeg = ptr_[lclRow];
        const offset_type blkEnd = ptr_[lclRow+1];
        // Precompute to save integer math in the inner loop.
        const offset_type bs2 = blockSize_ * blockSize_;
        for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
             ++absBlkOff) {
          // View the current block in place within the values array.
          little_block_type A_cur (val_.data () + absBlkOff * bs2,
                                   blockSize_, blockSize_);
          // Point-entry range of X for this block's column.
          const offset_type X_blkCol = ind_[absBlkOff];
          const offset_type X_ptBeg = X_blkCol * blockSize_;
          const offset_type X_ptEnd = X_ptBeg + blockSize_;
          auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));

          GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
        } // for each entry in current local block row of matrix
      }
    }

  private:
    alpha_coeff_type alpha_;
    typename GraphType::row_map_type::const_type ptr_;  // row offsets (in blocks)
    typename GraphType::entries_type::const_type ind_;  // block column indices
    MatrixValuesType val_;                              // rank-1 block values
    local_ordinal_type blockSize_;
    InVecType X_;
    beta_coeff_type beta_;
    OutVecType Y_;
  };
276
  /// \brief Specialization of BcrsApplyNoTransFunctor for built-in
  ///   scalar types (IsBuiltInType == true).
  ///
  /// Launched with a Kokkos::TeamPolicy: one team (league rank) per
  /// block row, TeamThreadRange over the row's block entries, and
  /// ThreadVectorRange over the rows of each little block.  On
  /// non-host execution spaces the update into Y uses atomic_add,
  /// since multiple team threads may contribute to the same Y entries.
  template<class AlphaCoeffType,
           class GraphType,
           class MatrixValuesType,
           class InVecType,
           class BetaCoeffType,
           class OutVecType>
  class BcrsApplyNoTransFunctor<AlphaCoeffType,
                                GraphType,
                                MatrixValuesType,
                                InVecType,
                                BetaCoeffType,
                                OutVecType,
                                true> {
  private:
    static_assert (Kokkos::is_view<MatrixValuesType>::value,
                   "MatrixValuesType must be a Kokkos::View.");
    static_assert (Kokkos::is_view<OutVecType>::value,
                   "OutVecType must be a Kokkos::View.");
    static_assert (Kokkos::is_view<InVecType>::value,
                   "InVecType must be a Kokkos::View.");
    static_assert (std::is_same<MatrixValuesType,
                     typename MatrixValuesType::const_type>::value,
                   "MatrixValuesType must be a const Kokkos::View.");
    static_assert (std::is_same<typename OutVecType::value_type,
                     typename OutVecType::non_const_value_type>::value,
                   "OutVecType must be a nonconst Kokkos::View.");
    static_assert (std::is_same<typename InVecType::value_type,
                     typename InVecType::const_value_type>::value,
                   "InVecType must be a const Kokkos::View.");
    static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
                   "MatrixValuesType must be a rank-1 Kokkos::View.");
    static_assert (static_cast<int> (InVecType::rank) == 1,
                   "InVecType must be a rank-1 Kokkos::View.");
    static_assert (static_cast<int> (OutVecType::rank) == 1,
                   "OutVecType must be a rank-1 Kokkos::View.");
    typedef typename MatrixValuesType::non_const_value_type scalar_type;

  public:
    typedef typename GraphType::device_type device_type;

    // True when this functor runs on a host-like execution space
    // (either the device_type's space is not the default space, or the
    // default space IS the default host space).  Host teams have size
    // 1, so a plain (non-atomic) update into Y is safe there.
    static constexpr bool runOnHost = !std::is_same_v<typename device_type::execution_space, Kokkos::DefaultExecutionSpace> || std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::DefaultHostExecutionSpace>;



    // Local ordinal type, derived from the graph's entry type.
    typedef typename std::remove_const<typename GraphType::data_type>::type
      local_ordinal_type;

    // Rebind the input column so one functor instance can be reused
    // for every column of a multivector (avoids re-instantiation).
    void setX (const InVecType& X) { X_ = X; }

    // Rebind the output column; see setX.
    void setY (const OutVecType& Y) { Y_ = Y; }

    typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
    typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;

    // Capture coefficients, graph structure, block values, the block
    // size, and the two columns (same contract as the primary template).
    BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
                             const GraphType& graph,
                             const MatrixValuesType& val,
                             const local_ordinal_type blockSize,
                             const InVecType& X,
                             const beta_coeff_type& beta,
                             const OutVecType& Y) :
      alpha_ (alpha),
      ptr_ (graph.row_map),
      ind_ (graph.entries),
      val_ (val),
      blockSize_ (blockSize),
      X_ (X),
      beta_ (beta),
      Y_ (Y)
    {}

    // dummy Range version (this specialization is only ever launched
    // with a TeamPolicy).
    KOKKOS_INLINE_FUNCTION void
    operator () (const local_ordinal_type& lclRow) const
    {
      Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
    }

    // Team policy for built-in types: one team per block row.
    KOKKOS_INLINE_FUNCTION void
    operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
    {

      const local_ordinal_type lclRow = member.league_rank();

      using Kokkos::ArithTraits;
      // I'm not writing 'using Kokkos::make_pair;' here, because that
      // may break builds for users who make the mistake of putting
      // 'using namespace std;' in the global namespace. Please don't
      // ever do that! But just in case you do, I'll take this
      // precaution.
      using Kokkos::parallel_for;
      using Kokkos::subview;
      typedef typename decltype (ptr_)::non_const_value_type offset_type;
      // Unmanaged 2-D view of one blockSize_ x blockSize_ block.
      // NOTE(review): a layout template argument (likely the BCRS
      // little-block layout) appears to have been elided by the doc
      // generator between these lines — confirm against the
      // repository source.
      typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
                           device_type,
                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >
        little_block_type;

      // Point-entry range of Y belonging to this block row.
      const offset_type Y_ptBeg = lclRow * blockSize_;
      const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
      auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));

      // This version of the code does not use temporary storage.
      // Each thread writes to its own block of the target vector.
      if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
        // beta == 0: overwrite Y, parallelized over vector lanes.
        Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
                             [&](const local_ordinal_type &i) {
                               Y_cur(i) = ArithTraits<beta_coeff_type>::zero ();
                             });
      }
      else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
        Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
                             [&](const local_ordinal_type &i) {
                               Y_cur(i) *= beta_;
                             });
      }
      // Make the scaled/zeroed Y visible to the whole team before
      // accumulating the mat-vec contributions below.
      member.team_barrier();

      if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
        const offset_type blkBeg = ptr_[lclRow];
        const offset_type blkEnd = ptr_[lclRow+1];
        // Precompute to save integer math in the inner loop.
        const offset_type bs2 = blockSize_ * blockSize_;
        // Construct the block/column views once; assign_data below
        // repoints them per entry instead of reconstructing them.
        little_block_type A_cur (val_.data (), blockSize_, blockSize_);
        auto X_cur = subview (X_, ::Kokkos::make_pair (0, blockSize_));
        Kokkos::parallel_for
          (Kokkos::TeamThreadRange(member, blkBeg, blkEnd),
           [&](const local_ordinal_type &absBlkOff) {
             A_cur.assign_data(val_.data () + absBlkOff * bs2);
             const offset_type X_blkCol = ind_[absBlkOff];
             const offset_type X_ptBeg = X_blkCol * blockSize_;
             X_cur.assign_data(&X_(X_ptBeg));

             // Each vector lane computes one row (k0) of the little
             // block's contribution: dot(A_cur(k0,:), X_cur).
             Kokkos::parallel_for
               (Kokkos::ThreadVectorRange(member, blockSize_),
                [&](const local_ordinal_type &k0) {
                  scalar_type val(0);
                  for (local_ordinal_type k1=0;k1<blockSize_;++k1)
                    val += A_cur(k0,k1)*X_cur(k1);
                  if constexpr(runOnHost) {
                    // host space team size is always 1
                    Y_cur(k0) += alpha_*val;
                  }
                  else {
                    // cuda space team size can be larger than 1
                    // atomic is not allowed for sacado type;
                    // thus this needs to be specialized or
                    // sacado atomic should be supported.
                    Kokkos::atomic_add(&Y_cur(k0), alpha_*val);
                  }
                });
           }); // for each entry in current local block row of matrix
      }
    }

  private:
    alpha_coeff_type alpha_;
    typename GraphType::row_map_type::const_type ptr_;  // row offsets (in blocks)
    typename GraphType::entries_type::const_type ind_;  // block column indices
    MatrixValuesType val_;                              // rank-1 block values
    local_ordinal_type blockSize_;
    InVecType X_;
    beta_coeff_type beta_;
    OutVecType Y_;
  };
459
460
461 template<class AlphaCoeffType,
462 class GraphType,
463 class MatrixValuesType,
464 class InMultiVecType,
465 class BetaCoeffType,
466 class OutMultiVecType>
467 void
468 bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
469 const GraphType& graph,
470 const MatrixValuesType& val,
471 const typename std::remove_const<typename GraphType::data_type>::type blockSize,
472 const InMultiVecType& X,
473 const BetaCoeffType& beta,
474 const OutMultiVecType& Y
475 )
476 {
477 static_assert (Kokkos::is_view<MatrixValuesType>::value,
478 "MatrixValuesType must be a Kokkos::View.");
479 static_assert (Kokkos::is_view<OutMultiVecType>::value,
480 "OutMultiVecType must be a Kokkos::View.");
481 static_assert (Kokkos::is_view<InMultiVecType>::value,
482 "InMultiVecType must be a Kokkos::View.");
483 static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
484 "MatrixValuesType must be a rank-1 Kokkos::View.");
485 static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
486 "OutMultiVecType must be a rank-2 Kokkos::View.");
487 static_assert (static_cast<int> (InMultiVecType::rank) == 2,
488 "InMultiVecType must be a rank-2 Kokkos::View.");
489
490 typedef typename MatrixValuesType::device_type::execution_space execution_space;
491 typedef typename MatrixValuesType::device_type::memory_space memory_space;
492 typedef typename MatrixValuesType::const_type matrix_values_type;
493 typedef typename OutMultiVecType::non_const_type out_multivec_type;
494 typedef typename InMultiVecType::const_type in_multivec_type;
495 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
496 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
497 typedef typename std::remove_const<typename GraphType::data_type>::type LO;
498
499 constexpr bool is_builtin_type_enabled = std::is_arithmetic<typename InMultiVecType::non_const_value_type>::value;
500 constexpr bool is_host_memory_space = std::is_same<memory_space,Kokkos::HostSpace>::value;
501 constexpr bool use_team_policy = (is_builtin_type_enabled && !is_host_memory_space);
502
503 const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
504 static_cast<LO> (0) :
505 static_cast<LO> (graph.row_map.extent (0) - 1);
506 const LO numVecs = Y.extent (1);
507 if (numLocalMeshRows == 0 || numVecs == 0) {
508 return; // code below doesn't handle numVecs==0 correctly
509 }
510
511 // These assignments avoid instantiating the functor extra times
512 // unnecessarily, e.g., for X const vs. nonconst. We only need the
513 // X const case, so only instantiate for that case.
514 in_multivec_type X_in = X;
515 out_multivec_type Y_out = Y;
516
517 // The functor only knows how to handle one vector at a time, and it
518 // expects 1-D Views. Thus, we need to know the type of each column
519 // of X and Y.
520 typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
521 typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
522 typedef BcrsApplyNoTransFunctor<alpha_type, GraphType,
523 matrix_values_type, in_vec_type, beta_type, out_vec_type,
524 use_team_policy> functor_type;
525
526 auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
527 auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
528
529 // Compute the first column of Y.
530 if (use_team_policy) {
531 functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
532 // Built-in version uses atomic add which might not be supported from sacado or any user-defined types.
533 typedef Kokkos::TeamPolicy<execution_space> policy_type;
534 policy_type policy(1,1);
535#if defined(KOKKOS_ENABLE_CUDA)
536 constexpr bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
537#else
538 constexpr bool is_cuda = false;
539#endif // defined(KOKKOS_ENABLE_CUDA)
540 if (is_cuda) {
541 LO team_size = 8, vector_size = 1;
542 if (blockSize <= 5) vector_size = 4;
543 else if (blockSize <= 9) vector_size = 8;
544 else if (blockSize <= 12) vector_size = 12;
545 else if (blockSize <= 20) vector_size = 20;
546 else vector_size = 20;
547 policy = policy_type(numLocalMeshRows, team_size, vector_size);
548 } else {
549 policy = policy_type(numLocalMeshRows, 1, 1);
550 }
551 Kokkos::parallel_for (policy, functor);
552
553 // Compute the remaining columns of Y.
554 for (LO j = 1; j < numVecs; ++j) {
555 auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
556 auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
557 functor.setX (X_j);
558 functor.setY (Y_j);
559 Kokkos::parallel_for (policy, functor);
560 }
561 } else {
562 functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
563 Kokkos::RangePolicy<execution_space,LO> policy(0, numLocalMeshRows);
564 Kokkos::parallel_for (policy, functor);
565 for (LO j = 1; j < numVecs; ++j) {
566 auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
567 auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
568 functor.setX (X_j);
569 functor.setY (Y_j);
570 Kokkos::parallel_for (policy, functor);
571 }
572 }
573 }
574} // namespace Impl
575
576namespace { // (anonymous)
577
578// Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
579// version that takes two Kokkos::View arguments).
580template<class Scalar, class LO, class GO, class Node>
581class GetLocalDiagCopy {
582public:
583 typedef typename Node::device_type device_type;
584 typedef size_t diag_offset_type;
585 typedef Kokkos::View<const size_t*, device_type,
586 Kokkos::MemoryUnmanaged> diag_offsets_type;
587 typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
588 typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
589 typedef typename local_graph_device_type::row_map_type row_offsets_type;
590 typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
591 typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
592 typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
593
594 // Constructor
595 GetLocalDiagCopy (const diag_type& diag,
596 const values_type& val,
597 const diag_offsets_type& diagOffsets,
598 const row_offsets_type& ptr,
599 const LO blockSize) :
600 diag_ (diag),
601 diagOffsets_ (diagOffsets),
602 ptr_ (ptr),
603 blockSize_ (blockSize),
604 offsetPerBlock_ (blockSize_*blockSize_),
605 val_(val)
606 {}
607
608 KOKKOS_INLINE_FUNCTION void
609 operator() (const LO& lclRowInd) const
610 {
611 using Kokkos::ALL;
612
613 // Get row offset
614 const size_t absOffset = ptr_[lclRowInd];
615
616 // Get offset relative to start of row
617 const size_t relOffset = diagOffsets_[lclRowInd];
618
619 // Get the total offset
620 const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
621
622 // Get a view of the block. BCRS currently uses LayoutRight
623 // regardless of the device.
624 typedef Kokkos::View<const IST**, Impl::BlockCrsMatrixLittleBlockArrayLayout,
625 device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
626 const_little_block_type;
627 const_little_block_type D_in (val_.data () + pointOffset,
628 blockSize_, blockSize_);
629 auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
630 COPY (D_in, D_out);
631 }
632
633 private:
634 diag_type diag_;
635 diag_offsets_type diagOffsets_;
636 row_offsets_type ptr_;
637 LO blockSize_;
638 LO offsetPerBlock_;
639 values_type val_;
640 };
641} // namespace (anonymous)
642
  // Flag this process as being in a local error state and return the
  // stream to which the caller may append an error message.  The
  // ostringstream is created lazily on first use.
  // NOTE(review): the qualified function-name line (presumably
  // BlockCrsMatrix<Scalar, LO, GO, Node>::markLocalErrorAndGetStream)
  // appears to have been elided by the doc generator here — confirm
  // against the repository source.
  template<class Scalar, class LO, class GO, class Node>
  std::ostream&
  {
    * (this->localError_) = true;
    if ((*errs_).is_null ()) {
      *errs_ = Teuchos::rcp (new std::ostringstream ());
    }
    return **errs_;
  }
654
  // Default constructor: builds an empty, unusable matrix (block size
  // 0, empty graph).  Exists so DistObject has a nonnull Map.
  // NOTE(review): the qualified constructor-signature line (presumably
  // BlockCrsMatrix<Scalar, LO, GO, Node>::BlockCrsMatrix () :) appears
  // to have been elided by the doc generator here — confirm against
  // the repository source.
  template<class Scalar, class LO, class GO, class Node>
    dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
    graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
    blockSize_ (static_cast<LO> (0)),
    X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
    offsetPerBlock_ (0),
    localError_ (new bool (false)),
    errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
  {
  }
669
  // Construct from a mesh CrsGraph and a block size.  Builds the
  // domain/range point Maps, the column point Map and its Import,
  // host mirrors of the graph's row offsets and column indices, and
  // a zero-initialized value array of numEntries * blockSize^2.
  //
  // Throws std::invalid_argument if the graph's rows are not sorted,
  // or if blockSize is nonpositive.
  // NOTE(review): the qualifying "BlockCrsMatrix<Scalar, LO, GO,
  // Node>::" line appears to have been elided by the doc generator —
  // confirm against the repository source.
  template<class Scalar, class LO, class GO, class Node>
  BlockCrsMatrix (const crs_graph_type& graph,
                  const LO blockSize) :
    dist_object_type (graph.getMap ()),
    graph_ (graph),
    rowMeshMap_ (* (graph.getRowMap ())),
    blockSize_ (blockSize),
    X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
    offsetPerBlock_ (blockSize * blockSize),
    localError_ (new bool (false)),
    errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
  {

    TEUCHOS_TEST_FOR_EXCEPTION(
      ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
      "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
      "rows (isSorted() is false). This class assumes sorted rows.");

    graphRCP_ = Teuchos::rcpFromRef(graph_);

    // Trick to test whether LO is nonpositive, without a compiler
    // warning in case LO is unsigned (which is generally a bad idea
    // anyway). I don't promise that the trick works, but it
    // generally does with gcc at least, in my experience.
    const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
    TEUCHOS_TEST_FOR_EXCEPTION(
      blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
      "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
      " <= 0. The block size must be positive.");

    // Expand the mesh domain/range Maps to point Maps.
    domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
    rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);

    {
      // These are rcp
      const auto domainPointMap = getDomainMap();
      const auto colPointMap = Teuchos::rcp
        (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
      *pointImporter_ = Teuchos::rcp
        (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
    }
    {
      // Copy the graph's structure into host mirrors for fast host access.
      auto local_graph_h = graph.getLocalGraphHost ();
      auto ptr_h = local_graph_h.row_map;
      ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
      Kokkos::deep_copy(ptrHost_, ptr_h);

      auto ind_h = local_graph_h.entries;
      indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
      // DEEP_COPY REVIEW - HOST-TO-HOST
      Kokkos::deep_copy (indHost_, ind_h);

      // One blockSize^2 value block per graph entry.
      const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
      val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
    }
  }
730
  // Construct from a mesh CrsGraph, an existing device values array,
  // and a block size.  Same setup as the (graph, blockSize)
  // constructor, except that the given values view is adopted
  // (shallow) instead of allocating a new value array; its length must
  // equal numEntries * blockSize^2.
  //
  // Throws std::invalid_argument if the graph's rows are not sorted,
  // or if blockSize is nonpositive.
  // NOTE(review): the qualifying "BlockCrsMatrix<Scalar, LO, GO,
  // Node>::" line appears to have been elided by the doc generator —
  // confirm against the repository source.
  template<class Scalar, class LO, class GO, class Node>
  BlockCrsMatrix (const crs_graph_type& graph,
                  const typename local_matrix_device_type::values_type& values,
                  const LO blockSize) :
    dist_object_type (graph.getMap ()),
    graph_ (graph),
    rowMeshMap_ (* (graph.getRowMap ())),
    blockSize_ (blockSize),
    X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
    offsetPerBlock_ (blockSize * blockSize),
    localError_ (new bool (false)),
    errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
  {
    TEUCHOS_TEST_FOR_EXCEPTION(
      ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
      "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
      "rows (isSorted() is false). This class assumes sorted rows.");

    graphRCP_ = Teuchos::rcpFromRef(graph_);

    // Trick to test whether LO is nonpositive, without a compiler
    // warning in case LO is unsigned (which is generally a bad idea
    // anyway). I don't promise that the trick works, but it
    // generally does with gcc at least, in my experience.
    const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
    TEUCHOS_TEST_FOR_EXCEPTION(
      blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
      "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
      " <= 0. The block size must be positive.");

    // Expand the mesh domain/range Maps to point Maps.
    domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
    rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);

    {
      // These are rcp
      const auto domainPointMap = getDomainMap();
      const auto colPointMap = Teuchos::rcp
        (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
      *pointImporter_ = Teuchos::rcp
        (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
    }
    {
      // Copy the graph's structure into host mirrors for fast host access.
      auto local_graph_h = graph.getLocalGraphHost ();
      auto ptr_h = local_graph_h.row_map;
      ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
      Kokkos::deep_copy(ptrHost_, ptr_h);

      auto ind_h = local_graph_h.entries;
      indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
      Kokkos::deep_copy (indHost_, ind_h);

      // Adopt the caller's values; length must match the graph.
      const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
      TEUCHOS_ASSERT_EQUALITY(numValEnt, values.size());
      val_ = decltype (val_) (values);
    }
  }
791
  // Construct from a mesh CrsGraph, caller-supplied domain and range
  // POINT Maps, and a block size.  Unlike the (graph, blockSize)
  // constructor, the point Maps are taken as given rather than derived
  // from the graph's domain/range Maps.  Allocates a zero-initialized
  // value array of numEntries * blockSize^2.
  //
  // Throws std::invalid_argument if the graph's rows are not sorted,
  // or if blockSize is nonpositive.
  // NOTE(review): the qualifying "BlockCrsMatrix<Scalar, LO, GO,
  // Node>::" line appears to have been elided by the doc generator —
  // confirm against the repository source.
  template<class Scalar, class LO, class GO, class Node>
  BlockCrsMatrix (const crs_graph_type& graph,
                  const map_type& domainPointMap,
                  const map_type& rangePointMap,
                  const LO blockSize) :
    dist_object_type (graph.getMap ()),
    graph_ (graph),
    rowMeshMap_ (* (graph.getRowMap ())),
    domainPointMap_ (domainPointMap),
    rangePointMap_ (rangePointMap),
    blockSize_ (blockSize),
    X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
    offsetPerBlock_ (blockSize * blockSize),
    localError_ (new bool (false)),
    errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
  {
    TEUCHOS_TEST_FOR_EXCEPTION(
      ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
      "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
      "rows (isSorted() is false). This class assumes sorted rows.");

    graphRCP_ = Teuchos::rcpFromRef(graph_);

    // Trick to test whether LO is nonpositive, without a compiler
    // warning in case LO is unsigned (which is generally a bad idea
    // anyway). I don't promise that the trick works, but it
    // generally does with gcc at least, in my experience.
    const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
    TEUCHOS_TEST_FOR_EXCEPTION(
      blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
      "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
      " <= 0. The block size must be positive.");
    {
      // These are rcp
      const auto rcpDomainPointMap = getDomainMap();
      const auto colPointMap = Teuchos::rcp
        (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
      *pointImporter_ = Teuchos::rcp
        (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
    }
    {
      // Copy the graph's structure into host mirrors for fast host access.
      auto local_graph_h = graph.getLocalGraphHost ();
      auto ptr_h = local_graph_h.row_map;
      ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
      // DEEP_COPY REVIEW - HOST-TO-HOST
      Kokkos::deep_copy(ptrHost_, ptr_h);

      auto ind_h = local_graph_h.entries;
      indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
      // DEEP_COPY REVIEW - HOST-TO-HOST
      Kokkos::deep_copy (indHost_, ind_h);

      // One blockSize^2 value block per graph entry.
      const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
      val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));

    }
  }
852
853 template<class Scalar, class LO, class GO, class Node>
854 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
856 getDomainMap () const
857 { // Copy constructor of map_type does a shallow copy.
858 // We're only returning by RCP for backwards compatibility.
859 return Teuchos::rcp (new map_type (domainPointMap_));
860 }
861
862 template<class Scalar, class LO, class GO, class Node>
863 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
865 getRangeMap () const
866 { // Copy constructor of map_type does a shallow copy.
867 // We're only returning by RCP for backwards compatibility.
868 return Teuchos::rcp (new map_type (rangePointMap_));
869 }
870
871 template<class Scalar, class LO, class GO, class Node>
872 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
874 getRowMap () const
875 {
876 return graph_.getRowMap();
877 }
878
879 template<class Scalar, class LO, class GO, class Node>
880 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
882 getColMap () const
883 {
884 return graph_.getColMap();
885 }
886
887 template<class Scalar, class LO, class GO, class Node>
890 getGlobalNumRows() const
891 {
892 return graph_.getGlobalNumRows();
893 }
894
895 template<class Scalar, class LO, class GO, class Node>
896 size_t
898 getLocalNumRows() const
899 {
900 return graph_.getLocalNumRows();
901 }
902
903 template<class Scalar, class LO, class GO, class Node>
904 size_t
907 {
908 return graph_.getLocalMaxNumRowEntries();
909 }
910
template<class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
apply (const mv_type& X,
       mv_type& Y,
       Teuchos::ETransp mode,
       Scalar alpha,
       Scalar beta)  const
{
  // Operator-apply on point-wise MultiVectors: Y = beta*Y + alpha*op(A)*X.
  // Wraps X and Y as BlockMultiVectors and delegates to applyBlock,
  // translating every exception into one with a descriptive prefix.
  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
  TEUCHOS_TEST_FOR_EXCEPTION(
    mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
    std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
    "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
    "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");

  // Views of X and Y with block structure imposed; construction throws
  // if the point Maps are incompatible with this matrix's graph.
  BMV X_view;
  BMV Y_view;
  const LO blockSize = getBlockSize ();
  try {
    X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
    Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
  }
  catch (std::invalid_argument& e) {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
      "apply: Either the input MultiVector X or the output MultiVector Y "
      "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
      "graph. BlockMultiVector's constructor threw the following exception: "
      << e.what ());
  }

  try {
    // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
    // either that or mark fields of this class as 'mutable'. The
    // problem is that applyBlock wants to do lazy initialization of
    // temporary block multivectors.
    const_cast<this_BCRS_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
  } catch (std::invalid_argument& e) {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
      "apply: The implementation method applyBlock complained about having "
      "an invalid argument. It reported the following: " << e.what ());
  } catch (std::logic_error& e) {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
      "apply: The implementation method applyBlock complained about a "
      "possible bug in its implementation. It reported the following: "
      << e.what ());
  } catch (std::exception& e) {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
      "apply: The implementation method applyBlock threw an exception which "
      "is neither std::invalid_argument nor std::logic_error, but is a "
      "subclass of std::exception. It reported the following: "
      << e.what ());
  } catch (...) {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::logic_error, "Tpetra::BlockCrsMatrix::"
      "apply: The implementation method applyBlock threw an exception which "
      "is not an instance of a subclass of std::exception. This probably "
      "indicates a bug. Please report this to the Tpetra developers.");
  }
}
975
template<class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
applyBlock (const BlockMultiVector<Scalar, LO, GO, Node>& X,
            BlockMultiVector<Scalar, LO, GO, Node>& Y,
            Teuchos::ETransp mode,
            const Scalar alpha,
            const Scalar beta)
{
  // Block-level apply: dispatch to the (no-)transpose implementation.
  // X and Y must have the same block size as each other.
  TEUCHOS_TEST_FOR_EXCEPTION(
    X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
    "Tpetra::BlockCrsMatrix::applyBlock: "
    "X and Y have different block sizes. X.getBlockSize() = "
    << X.getBlockSize () << " != Y.getBlockSize() = "
    << Y.getBlockSize () << ".");

  if (mode == Teuchos::NO_TRANS) {
    applyBlockNoTrans (X, Y, alpha, beta);
  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
    // Note: the transpose path currently throws "not yet implemented".
    applyBlockTrans (X, Y, mode, alpha, beta);
  } else {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
      "applyBlock: Invalid 'mode' argument. Valid values are "
      "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
  }
}
1003
template<class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
importAndFillComplete (Teuchos::RCP<BlockCrsMatrix<Scalar, LO, GO, Node> >& destMatrix,
                       const Import<LO, GO, Node>& importer) const
{
  // Build a new BlockCrsMatrix redistributed according to 'importer'.
  // destMatrix must come in null; it is allocated here.
  using Teuchos::RCP;
  using Teuchos::rcp;
  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;

  // Right now, we make many assumptions...
  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
                             "destMatrix is required to be null.");

  // BlockCrsMatrix requires a complete graph at construction.
  // So first step is to import and fill complete the destGraph.
  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
  RCP<crs_graph_type> destGraph = importAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, importer,
                                                                                srcGraph->getDomainMap(),
                                                                                srcGraph->getRangeMap());


  // Final step, create and import the destMatrix.
  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
  destMatrix->doImport(*this, importer, Tpetra::INSERT);
}
1030
1031
template<class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
exportAndFillComplete (Teuchos::RCP<BlockCrsMatrix<Scalar, LO, GO, Node> >& destMatrix,
                       const Export<LO, GO, Node>& exporter) const
{
  // Build a new BlockCrsMatrix redistributed according to 'exporter'.
  // Mirror image of importAndFillComplete; destMatrix must come in null.
  using Teuchos::RCP;
  using Teuchos::rcp;
  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;

  // Right now, we make many assumptions...
  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
                             "destMatrix is required to be null.");

  // BlockCrsMatrix requires a complete graph at construction.
  // So first step is to import and fill complete the destGraph.
  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
  RCP<crs_graph_type> destGraph = exportAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, exporter,
                                                                                srcGraph->getDomainMap(),
                                                                                srcGraph->getRangeMap());


  // Final step, create and import the destMatrix.
  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
  destMatrix->doExport(*this, exporter, Tpetra::INSERT);
}
1058
1059
1060 template<class Scalar, class LO, class GO, class Node>
1061 void
1063 setAllToScalar (const Scalar& alpha)
1064 {
1065 auto val_d = val_.getDeviceView(Access::OverwriteAll);
1066 Kokkos::deep_copy(val_d, alpha);
1067 }
1068
1069 template<class Scalar, class LO, class GO, class Node>
1070 LO
1072 replaceLocalValues (const LO localRowInd,
1073 const LO colInds[],
1074 const Scalar vals[],
1075 const LO numColInds) const
1076 {
1077 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1078 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
1079 ptrdiff_t * offsets = offsets_host_view.data();
1080 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1081 const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1082 return validCount;
1083 }
1084
1085 template <class Scalar, class LO, class GO, class Node>
1086 void
1088 getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
1089 Kokkos::MemoryUnmanaged>& offsets) const
1090 {
1091 graph_.getLocalDiagOffsets (offsets);
1092 }
1093
template <class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
                    Kokkos::MemoryUnmanaged>& diag,
                  const Kokkos::View<const size_t*, device_type,
                    Kokkos::MemoryUnmanaged>& offsets) const
{
  // Copy the diagonal blocks into 'diag' (mesh row, block row, block col)
  // on device, using precomputed diagonal offsets (see getLocalDiagOffsets).
  using Kokkos::parallel_for;
  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";

  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getLocalNumElements ());
  const LO blockSize = this->getBlockSize ();
  // Validate output extents: one blockSize x blockSize block per mesh row.
  TEUCHOS_TEST_FOR_EXCEPTION
    (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
     static_cast<LO> (diag.extent (1)) < blockSize ||
     static_cast<LO> (diag.extent (2)) < blockSize,
     std::invalid_argument, prefix <<
     "The input Kokkos::View is not big enough to hold all the data.");
  TEUCHOS_TEST_FOR_EXCEPTION
    (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
     prefix << "offsets.size() = " << offsets.size () << " < local number of "
     "diagonal blocks " << lclNumMeshRows << ".");

  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;

  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
  // we reserve the right to do lazy allocation of device data. (We
  // don't plan to do lazy allocation for host data; the host
  // version of the data always exists.)
  auto val_d = val_.getDeviceView(Access::ReadOnly);
  // One parallel iteration per local mesh row; the functor extracts
  // that row's diagonal block.
  parallel_for (policy_type (0, lclNumMeshRows),
                functor_type (diag, val_d, offsets,
                              graph_.getLocalGraphDevice ().row_map, blockSize_));
}
1130
1131 template<class Scalar, class LO, class GO, class Node>
1132 LO
1134 absMaxLocalValues (const LO localRowInd,
1135 const LO colInds[],
1136 const Scalar vals[],
1137 const LO numColInds) const
1138 {
1139 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1140 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
1141 ptrdiff_t * offsets = offsets_host_view.data();
1142 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1143 const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1144 return validCount;
1145 }
1146
1147
1148 template<class Scalar, class LO, class GO, class Node>
1149 LO
1151 sumIntoLocalValues (const LO localRowInd,
1152 const LO colInds[],
1153 const Scalar vals[],
1154 const LO numColInds) const
1155 {
1156 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1157 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
1158 ptrdiff_t * offsets = offsets_host_view.data();
1159 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1160 const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1161 return validCount;
1162 }
template<class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
getLocalRowCopy (LO LocalRow,
                 nonconst_local_inds_host_view_type &Indices,
                 nonconst_values_host_view_type &Values,
                 size_t& NumEntries) const
{
  // Copy one local row's (mesh) column indices and all of its block
  // values into caller-provided arrays.  NumEntries is the number of
  // block entries; Values receives numInds * blockSize_^2 scalars.
  auto vals = getValuesHost(LocalRow);
  const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
  if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
    TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
      "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
      << numInds << " row entries");
  }
  // Column indices for this row start at offset ptrHost_(LocalRow).
  const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
  for (LO i=0; i<numInds; ++i) {
    Indices[i] = colInds[i];
  }
  // vals is already a subview of just this row's values.
  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
    Values[i] = vals[i];
  }
  NumEntries = numInds;
}
1187
1188 template<class Scalar, class LO, class GO, class Node>
1189 LO
1191 getLocalRowOffsets (const LO localRowInd,
1192 ptrdiff_t offsets[],
1193 const LO colInds[],
1194 const LO numColInds) const
1195 {
1196 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1197 // We got no offsets, because the input local row index is
1198 // invalid on the calling process. That may not be an error, if
1199 // numColInds is zero anyway; it doesn't matter. This is the
1200 // advantage of returning the number of valid indices.
1201 return static_cast<LO> (0);
1202 }
1203
1204 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1205 LO hint = 0; // Guess for the relative offset into the current row
1206 LO validCount = 0; // number of valid column indices in colInds
1207
1208 for (LO k = 0; k < numColInds; ++k) {
1209 const LO relBlockOffset =
1210 this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1211 if (relBlockOffset != LINV) {
1212 offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1213 hint = relBlockOffset + 1;
1214 ++validCount;
1215 }
1216 }
1217 return validCount;
1218 }
1219
1220
template<class Scalar, class LO, class GO, class Node>
LO
BlockCrsMatrix<Scalar, LO, GO, Node>::
replaceLocalValuesByOffsets (const LO localRowInd,
                             const ptrdiff_t offsets[],
                             const Scalar vals[],
                             const LO numOffsets) const
{
  // Overwrite whole blocks in the given local row, addressed by
  // precomputed relative block offsets (see getLocalRowOffsets).
  // vals supplies numOffsets consecutive blockSize_^2 blocks.
  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
    // We modified no values, because the input local row index is
    // invalid on the calling process. That may not be an error, if
    // numColInds is zero anyway; it doesn't matter. This is the
    // advantage of returning the number of valid indices.
    return static_cast<LO> (0);
  }
  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
  // Writable host view of just this row's values.
  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
  impl_scalar_type* vOut = val_out.data();

  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
  const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
  size_t pointOffset = 0; // Current offset into input values
  LO validCount = 0; // number of valid offsets

  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
    const size_t blockOffset = offsets[k]*perBlockSize;
    // Skip entries whose offset is the invalid sentinel (column was
    // not found in this row).
    if (offsets[k] != STINV) {
      little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
      const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
      COPY (A_new, A_old);
      ++validCount;
    }
  }
  return validCount;
}
1257
1258
1259 template<class Scalar, class LO, class GO, class Node>
1260 LO
1262 absMaxLocalValuesByOffsets (const LO localRowInd,
1263 const ptrdiff_t offsets[],
1264 const Scalar vals[],
1265 const LO numOffsets) const
1266 {
1267 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1268 // We modified no values, because the input local row index is
1269 // invalid on the calling process. That may not be an error, if
1270 // numColInds is zero anyway; it doesn't matter. This is the
1271 // advantage of returning the number of valid indices.
1272 return static_cast<LO> (0);
1273 }
1274 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1275 auto val_out = getValuesHost(localRowInd);
1276 impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
1277
1278 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1279 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1280 size_t pointOffset = 0; // Current offset into input values
1281 LO validCount = 0; // number of valid offsets
1282
1283 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1284 const size_t blockOffset = offsets[k]*perBlockSize;
1285 if (blockOffset != STINV) {
1286 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1287 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1288 ::Tpetra::Impl::absMax (A_old, A_new);
1289 ++validCount;
1290 }
1291 }
1292 return validCount;
1293 }
1294
1295
1296 template<class Scalar, class LO, class GO, class Node>
1297 LO
1299 sumIntoLocalValuesByOffsets (const LO localRowInd,
1300 const ptrdiff_t offsets[],
1301 const Scalar vals[],
1302 const LO numOffsets) const
1303 {
1304 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1305 // We modified no values, because the input local row index is
1306 // invalid on the calling process. That may not be an error, if
1307 // numColInds is zero anyway; it doesn't matter. This is the
1308 // advantage of returning the number of valid indices.
1309 return static_cast<LO> (0);
1310 }
1311 const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1312 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1313 typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1314 auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
1315 impl_scalar_type* vOut = val_out.data();
1316
1317 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1318 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1319 size_t pointOffset = 0; // Current offset into input values
1320 LO validCount = 0; // number of valid offsets
1321
1322 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1323 const size_t blockOffset = offsets[k]*perBlockSize;
1324 if (blockOffset != STINV) {
1325 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1326 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1327 AXPY (ONE, A_new, A_old);
1328 ++validCount;
1329 }
1330 }
1331 return validCount;
1332 }
1333
1334 template<class Scalar, class LO, class GO, class Node>
1336 impl_scalar_type_dualview::t_host::const_type
1338 getValuesHost () const
1339 {
1340 return val_.getHostView(Access::ReadOnly);
1341 }
1342
1343 template<class Scalar, class LO, class GO, class Node>
1344 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1345 impl_scalar_type_dualview::t_dev::const_type
1346 BlockCrsMatrix<Scalar, LO, GO, Node>::
1347 getValuesDevice () const
1348 {
1349 return val_.getDeviceView(Access::ReadOnly);
1350 }
1351
1352 template<class Scalar, class LO, class GO, class Node>
1354 impl_scalar_type_dualview::t_host
1356 getValuesHostNonConst () const
1357 {
1358 return val_.getHostView(Access::ReadWrite);
1359 }
1360
1361 template<class Scalar, class LO, class GO, class Node>
1363 impl_scalar_type_dualview::t_dev
1366 {
1367 return val_.getDeviceView(Access::ReadWrite);
1368 }
1369
1370 template<class Scalar, class LO, class GO, class Node>
1371 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1372 impl_scalar_type_dualview::t_host::const_type
1374 getValuesHost (const LO& lclRow) const
1375 {
1376 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1377 auto val = val_.getHostView(Access::ReadOnly);
1378 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1379 return r_val;
1380 }
1381
1382 template<class Scalar, class LO, class GO, class Node>
1384 impl_scalar_type_dualview::t_dev::const_type
1386 getValuesDevice (const LO& lclRow) const
1387 {
1388 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1389 auto val = val_.getDeviceView(Access::ReadOnly);
1390 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1391 return r_val;
1392 }
1393
1394 template<class Scalar, class LO, class GO, class Node>
1397 getValuesHostNonConst (const LO& lclRow)
1398 {
1399 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1400 auto val = val_.getHostView(Access::ReadWrite);
1401 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1402 return r_val;
1403 }
1404
1405 template<class Scalar, class LO, class GO, class Node>
1408 getValuesDeviceNonConst (const LO& lclRow)
1409 {
1410 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1411 auto val = val_.getDeviceView(Access::ReadWrite);
1412 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1413 return r_val;
1414 }
1415
1416 template<class Scalar, class LO, class GO, class Node>
1417 size_t
1419 getNumEntriesInLocalRow (const LO localRowInd) const
1420 {
1421 const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1422 if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1423 return static_cast<LO> (0); // the calling process doesn't have that row
1424 } else {
1425 return static_cast<LO> (numEntInGraph);
1426 }
1427 }
1428
1429 template<class Scalar, class LO, class GO, class Node>
1430 typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
1432 getLocalMatrixDevice () const
1433 {
1434 auto numCols = this->graph_.getColMap()->getLocalNumElements();
1435 auto val = val_.getDeviceView(Access::ReadWrite);
1436 const LO blockSize = this->getBlockSize ();
1437 const auto graph = this->graph_.getLocalGraphDevice ();
1438
1439 return local_matrix_device_type("Tpetra::BlockCrsMatrix::lclMatrixDevice",
1440 numCols,
1441 val,
1442 graph,
1443 blockSize);
1444 }
1445
template<class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
applyBlockTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
                 BlockMultiVector<Scalar, LO, GO, Node>& Y,
                 const Teuchos::ETransp mode,
                 const Scalar alpha,
                 const Scalar beta)
{
  // Stub: the (conjugate) transpose apply is not implemented.  The
  // casts to void silence unused-parameter warnings.
  (void) X;
  (void) Y;
  (void) mode;
  (void) alpha;
  (void) beta;

  TEUCHOS_TEST_FOR_EXCEPTION(
    true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
    "transpose and conjugate transpose modes are not yet implemented.");
}
1465
template<class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
                   BlockMultiVector<Scalar, LO, GO, Node>& Y,
                   const Scalar alpha,
                   const Scalar beta)
{
  // Y = beta*Y + alpha*A*X at the block level.  Handles the alpha == 0
  // short-circuit per BLAS rules, imports X into the column Map if an
  // Importer exists, applies locally, and exports into Y if needed.
  using Teuchos::RCP;
  using Teuchos::rcp;
  typedef ::Tpetra::Import<LO, GO, Node> import_type;
  typedef ::Tpetra::Export<LO, GO, Node> export_type;
  const Scalar zero = STS::zero ();
  const Scalar one = STS::one ();
  RCP<const import_type> import = graph_.getImporter ();
  // "export" is a reserved C++ keyword, so we can't use it.
  RCP<const export_type> theExport = graph_.getExporter ();
  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";

  if (alpha == zero) {
    if (beta == zero) {
      Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
    }
    else if (beta != one) {
      Y.scale (beta);
    }
  }
  else { // alpha != 0
    const BMV* X_colMap = NULL;
    if (import.is_null ()) {
      // No Importer: X already has the needed (column Map) distribution.
      try {
        X_colMap = &X;
      }
      catch (std::exception& e) {
        TEUCHOS_TEST_FOR_EXCEPTION
          (true, std::logic_error, prefix << "Tpetra::MultiVector::"
           "operator= threw an exception: " << std::endl << e.what ());
      }
    }
    else {
      // X_colMap_ is a pointer to a pointer to BMV. Ditto for
      // Y_rowMap_ below. This lets us do lazy initialization
      // correctly with view semantics of BlockCrsMatrix. All views
      // of this BlockCrsMatrix have the same outer pointer. That
      // way, we can set the inner pointer in one view, and all
      // other views will see it.
      if ((*X_colMap_).is_null () ||
          (**X_colMap_).getNumVectors () != X.getNumVectors () ||
          (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
        *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
                                   static_cast<LO> (X.getNumVectors ())));
      }
      // Import X's point data into the cached column-Map BMV.
      // (Combine-mode line reconstructed from context — TODO confirm
      // against upstream; REPLACE and INSERT behave the same here.)
      (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
                                                   **pointImporter_,
                                                   ::Tpetra::REPLACE);
      try {
        X_colMap = &(**X_colMap_);
      }
      catch (std::exception& e) {
        TEUCHOS_TEST_FOR_EXCEPTION
          (true, std::logic_error, prefix << "Tpetra::MultiVector::"
           "operator= threw an exception: " << std::endl << e.what ());
      }
    }

    BMV* Y_rowMap = NULL;
    if (theExport.is_null ()) {
      Y_rowMap = &Y;
    }
    // NOTE(review): if theExport is nonnull AND the cached *Y_rowMap_
    // is already valid (non-null, matching sizes), neither branch
    // below runs and Y_rowMap stays NULL — looks like a latent bug;
    // confirm whether a trailing else should set Y_rowMap from the
    // cache.
    else if ((*Y_rowMap_).is_null () ||
             (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
             (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
      *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
                                 static_cast<LO> (X.getNumVectors ())));
      try {
        Y_rowMap = &(**Y_rowMap_);
      }
      catch (std::exception& e) {
        TEUCHOS_TEST_FOR_EXCEPTION(
          true, std::logic_error, prefix << "Tpetra::MultiVector::"
          "operator= threw an exception: " << std::endl << e.what ());
      }
    }

    try {
      localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
    }
    catch (std::exception& e) {
      TEUCHOS_TEST_FOR_EXCEPTION
        (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
         "an exception: " << e.what ());
    }
    catch (...) {
      TEUCHOS_TEST_FOR_EXCEPTION
        (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
         "an exception not a subclass of std::exception.");
    }

    if (! theExport.is_null ()) {
      // Combine the overlapped row-Map result back into Y.
      Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
    }
  }
}
1569
template<class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
                        BlockMultiVector<Scalar, LO, GO, Node>& Y,
                        const Scalar alpha,
                        const Scalar beta)
{
  // Process-local block SpMV: Y = beta*Y + alpha*A*X, no communication.
  using ::Tpetra::Impl::bcrsLocalApplyNoTrans;

  const impl_scalar_type alpha_impl = alpha;
  const auto graph = this->graph_.getLocalGraphDevice ();
  const impl_scalar_type beta_impl = beta;
  const LO blockSize = this->getBlockSize ();

  auto X_mv = X.getMultiVectorView ();
  auto Y_mv = Y.getMultiVectorView ();

  //auto X_lcl = X_mv.template getLocalView<device_type> (Access::ReadOnly);
  //auto Y_lcl = Y_mv.template getLocalView<device_type> (Access::ReadWrite);
  auto X_lcl = X_mv.getLocalViewDevice (Access::ReadOnly);
  auto Y_lcl = Y_mv.getLocalViewDevice (Access::ReadWrite);
  // NOTE(review): the matrix values are only read here; ReadWrite
  // needlessly marks the device copy modified — ReadOnly looks
  // sufficient.  Confirm before changing.
  auto val = val_.getDeviceView(Access::ReadWrite);

  bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
                         beta_impl, Y_lcl);
}
1597
1598 template<class Scalar, class LO, class GO, class Node>
1599 LO
1601 findRelOffsetOfColumnIndex (const LO localRowIndex,
1602 const LO colIndexToFind,
1603 const LO hint) const
1604 {
1605 const size_t absStartOffset = ptrHost_[localRowIndex];
1606 const size_t absEndOffset = ptrHost_[localRowIndex+1];
1607 const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1608 // Amortize pointer arithmetic over the search loop.
1609 const LO* const curInd = indHost_.data () + absStartOffset;
1610
1611 // If the hint was correct, then the hint is the offset to return.
1612 if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1613 // Always return the offset relative to the current row.
1614 return hint;
1615 }
1616
1617 // The hint was wrong, so we must search for the given column
1618 // index in the column indices for the given row.
1619 LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1620
1621 // We require that the graph have sorted rows. However, binary
1622 // search only pays if the current row is longer than a certain
1623 // amount. We set this to 32, but you might want to tune this.
1624 const LO maxNumEntriesForLinearSearch = 32;
1625 if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1626 // Use binary search. It would probably be better for us to
1627 // roll this loop by hand. If we wrote it right, a smart
1628 // compiler could perhaps use conditional loads and avoid
1629 // branches (according to Jed Brown on May 2014).
1630 const LO* beg = curInd;
1631 const LO* end = curInd + numEntriesInRow;
1632 std::pair<const LO*, const LO*> p =
1633 std::equal_range (beg, end, colIndexToFind);
1634 if (p.first != p.second) {
1635 // offset is relative to the current row
1636 relOffset = static_cast<LO> (p.first - beg);
1637 }
1638 }
1639 else { // use linear search
1640 for (LO k = 0; k < numEntriesInRow; ++k) {
1641 if (colIndexToFind == curInd[k]) {
1642 relOffset = k; // offset is relative to the current row
1643 break;
1644 }
1645 }
1646 }
1647
1648 return relOffset;
1649 }
1650
1651 template<class Scalar, class LO, class GO, class Node>
1652 LO
1654 offsetPerBlock () const
1655 {
1656 return offsetPerBlock_;
1657 }
1658
template<class Scalar, class LO, class GO, class Node>
typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
BlockCrsMatrix<Scalar, LO, GO, Node>::
getConstLocalBlockFromInput (const impl_scalar_type* val,
                             const size_t pointOffset) const
{
  // View one blockSize_ x blockSize_ block starting at val + pointOffset.
  // Note: pointOffset here is a scalar ("point") offset, NOT a block
  // offset — unlike getNonConstLocalBlockFromInputHost, which scales by
  // blockSize_^2.
  // Row major blocks
  const LO rowStride = blockSize_;
  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
}
1669
template<class Scalar, class LO, class GO, class Node>
typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
BlockCrsMatrix<Scalar, LO, GO, Node>::
getNonConstLocalBlockFromInput (impl_scalar_type* val,
                                const size_t pointOffset) const
{
  // Writable view of one blockSize_ x blockSize_ block starting at
  // val + pointOffset (a scalar offset; see getConstLocalBlockFromInput).
  // Row major blocks
  const LO rowStride = blockSize_;
  return little_block_type (val + pointOffset, blockSize_, rowStride);
}
1680
template<class Scalar, class LO, class GO, class Node>
typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
BlockCrsMatrix<Scalar, LO, GO, Node>::
getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
                                    const size_t pointOffset) const
{
  // Writable host view of one block.  Here pointOffset is a BLOCK
  // offset: it is scaled by blockSize_^2 to reach the block's first
  // scalar — different convention from getNonConstLocalBlockFromInput.
  // Row major blocks
  const LO rowStride = blockSize_;
  const size_t bs2 = blockSize_ * blockSize_;
  return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
}
1692
template<class Scalar, class LO, class GO, class Node>
typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
BlockCrsMatrix<Scalar, LO, GO, Node>::
getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
{
  // Writable device view of the block at (localRowInd, localColInd),
  // or an empty view if that entry is not in the graph.
  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;

  const size_t absRowBlockOffset = ptrHost_[localRowInd];
  // Search the row for the column (hint argument defaults).
  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
    const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
    auto vals = const_cast<this_BCRS_type&>(*this).getValuesDeviceNonConst();
    auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
    return r_val;
  }
  else {
    // Entry absent: return an empty block view.
    return little_block_type ();
  }
}
1712
template<class Scalar, class LO, class GO, class Node>
typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
BlockCrsMatrix<Scalar, LO, GO, Node>::
getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
{
  // Writable host view of the block at (localRowInd, localColInd),
  // or an empty view if that entry is not in the graph.
  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;

  const size_t absRowBlockOffset = ptrHost_[localRowInd];
  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
    const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
    auto vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst();
    // Host accessor expects a BLOCK offset (it scales by blockSize^2).
    auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
    return r_val;
  }
  else {
    return little_block_host_type ();
  }
}
1732
1733
1734 template<class Scalar, class LO, class GO, class Node>
1735 bool
     // Import/Export precondition check. Verifies that 'source' is a
     // BlockCrsMatrix with the same template parameters, that both source and
     // target have matching block sizes, fill-complete graphs, and non-null
     // column Maps. Every failure is recorded via markLocalErrorAndGetStream;
     // returns true iff this process has recorded no local error.
1737 checkSizes (const ::Tpetra::SrcDistObject& source)
1738 {
1739 using std::endl;
1740 typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1741 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1742
1743 if (src == NULL) {
1744 std::ostream& err = markLocalErrorAndGetStream ();
1745 err << "checkSizes: The source object of the Import or Export "
1746 "must be a BlockCrsMatrix with the same template parameters as the "
1747 "target object." << endl;
1748 }
1749 else {
1750 // Use a string of ifs, not if-elseifs, because we want to know
1751 // all the errors.
1752 if (src->getBlockSize () != this->getBlockSize ()) {
1753 std::ostream& err = markLocalErrorAndGetStream ();
1754 err << "checkSizes: The source and target objects of the Import or "
1755 << "Export must have the same block sizes. The source's block "
1756 << "size = " << src->getBlockSize () << " != the target's block "
1757 << "size = " << this->getBlockSize () << "." << endl;
1758 }
1759 if (! src->graph_.isFillComplete ()) {
1760 std::ostream& err = markLocalErrorAndGetStream ();
1761 err << "checkSizes: The source object of the Import or Export is "
1762 "not fill complete. Both source and target objects must be fill "
1763 "complete." << endl;
1764 }
1765 if (! this->graph_.isFillComplete ()) {
1766 std::ostream& err = markLocalErrorAndGetStream ();
1767 err << "checkSizes: The target object of the Import or Export is "
1768 "not fill complete. Both source and target objects must be fill "
1769 "complete." << endl;
1770 }
1771 if (src->graph_.getColMap ().is_null ()) {
1772 std::ostream& err = markLocalErrorAndGetStream ();
1773 err << "checkSizes: The source object of the Import or Export does "
1774 "not have a column Map. Both source and target objects must have "
1775 "column Maps." << endl;
1776 }
1777 if (this->graph_.getColMap ().is_null ()) {
1778 std::ostream& err = markLocalErrorAndGetStream ();
1779 err << "checkSizes: The target object of the Import or Export does "
1780 "not have a column Map. Both source and target objects must have "
1781 "column Maps." << endl;
1782 }
1783 }
1784
     // localError_ is cumulative: it may already have been set before this
     // call; any of the branches above may also have set it.
1785 return ! (* (this->localError_));
1786 }
1787
1788 template<class Scalar, class LO, class GO, class Node>
1789 void
     // (Function-name line elided by this listing; this is the DistObject
     // hook copyAndPermute.) Receiving-side phase 1 of an Import/Export:
     // copy the first 'numSameIDs' rows straight across, then copy rows whose
     // local IDs differ, mapping permuteFromLIDs[k] (source) to
     // permuteToLIDs[k] (target). Errors are recorded, not thrown, via
     // markLocalErrorAndGetStream.
1792 (const ::Tpetra::SrcDistObject& source,
1793 const size_t numSameIDs,
1794 const Kokkos::DualView<const local_ordinal_type*,
1795 buffer_device_type>& permuteToLIDs,
1796 const Kokkos::DualView<const local_ordinal_type*,
1797 buffer_device_type>& permuteFromLIDs,
1798 const CombineMode /*CM*/)
1799 {
1803 using std::endl;
1804 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1805
1806 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1807 const bool debug = Behavior::debug();
1808 const bool verbose = Behavior::verbose();
1809
1810 // Define this function prefix
1811 std::string prefix;
1812 {
1813 std::ostringstream os;
1814 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1815 os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1816 prefix = os.str();
1817 }
1818
1819 // check if this already includes a local error
1820 if (* (this->localError_)) {
1821 std::ostream& err = this->markLocalErrorAndGetStream ();
1822 err << prefix
1823 << "The target object of the Import or Export is already in an error state."
1824 << endl;
1825 return;
1826 }
1827
1828 //
1829 // Verbose input dual view status
1830 //
1831 if (verbose) {
1832 std::ostringstream os;
1833 os << prefix << endl
1834 << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
1835 << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
1836 std::cerr << os.str ();
1837 }
1838
     // Sanity checks on the permutation lists: equal lengths, and both
     // already sync'd to host (this function reads them on host only).
1842 if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1843 std::ostream& err = this->markLocalErrorAndGetStream ();
1844 err << prefix
1845 << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1846 << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1847 << "." << endl;
1848 return;
1849 }
1850 if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1851 std::ostream& err = this->markLocalErrorAndGetStream ();
1852 err << prefix
1853 << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1854 << endl;
1855 return;
1856 }
1857
1858 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1859 if (src == NULL) {
1860 std::ostream& err = this->markLocalErrorAndGetStream ();
1861 err << prefix
1862 << "The source (input) object of the Import or "
1863 "Export is either not a BlockCrsMatrix, or does not have the right "
1864 "template parameters. checkSizes() should have caught this. "
1865 "Please report this bug to the Tpetra developers." << endl;
1866 return;
1867 }
1868
1869 bool lclErr = false;
1870#ifdef HAVE_TPETRA_DEBUG
     // Debug-only bookkeeping of which rows/columns failed, so the error
     // message at the end can list them.
1871 std::set<LO> invalidSrcCopyRows;
1872 std::set<LO> invalidDstCopyRows;
1873 std::set<LO> invalidDstCopyCols;
1874 std::set<LO> invalidDstPermuteCols;
1875 std::set<LO> invalidPermuteFromRows;
1876#endif // HAVE_TPETRA_DEBUG
1877
1878 // Copy the initial sequence of rows that are the same.
1879 //
1880 // The two graphs might have different column Maps, so we need to
1881 // do this using global column indices. This is purely local, so
1882 // we only need to check for local sameness of the two column
1883 // Maps.
1884
1885#ifdef HAVE_TPETRA_DEBUG
1886 const map_type& srcRowMap = * (src->graph_.getRowMap ());
1887#endif // HAVE_TPETRA_DEBUG
1888 const map_type& dstRowMap = * (this->graph_.getRowMap ());
1889 const map_type& srcColMap = * (src->graph_.getColMap ());
1890 const map_type& dstColMap = * (this->graph_.getColMap ());
     // If the two column Maps agree locally, source-local column indices are
     // valid target-local indices and the global-index round trip is skipped.
1891 const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1892
1893 const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
1894 if (verbose) {
1895 std::ostringstream os;
1896 os << prefix
1897 << "canUseLocalColumnIndices: "
1898 << (canUseLocalColumnIndices ? "true" : "false")
1899 << ", numPermute: " << numPermute
1900 << endl;
1901 std::cerr << os.str ();
1902 }
1903
1904 const auto permuteToLIDsHost = permuteToLIDs.view_host();
1905 const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1906
1907 if (canUseLocalColumnIndices) {
1908 // Copy local rows that are the "same" in both source and target.
1909 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1910#ifdef HAVE_TPETRA_DEBUG
1911 if (! srcRowMap.isNodeLocalElement (localRow)) {
1912 lclErr = true;
1913 invalidSrcCopyRows.insert (localRow);
1914 continue; // skip invalid rows
1915 }
1916#endif // HAVE_TPETRA_DEBUG
1917
1918 local_inds_host_view_type lclSrcCols;
1919 values_host_view_type vals;
1920 LO numEntries;
1921 // If this call fails, that means the mesh row local index is
1922 // invalid. That means the Import or Export is invalid somehow.
1923 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1924 if (numEntries > 0) {
     // replaceLocalValues returns the number of entries it actually
     // replaced; anything short of numEntries is an error.
1925 LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1926 if (err != numEntries) {
1927 lclErr = true;
1928 if (! dstRowMap.isNodeLocalElement (localRow)) {
1929#ifdef HAVE_TPETRA_DEBUG
1930 invalidDstCopyRows.insert (localRow);
1931#endif // HAVE_TPETRA_DEBUG
1932 }
1933 else {
1934 // Once there's an error, there's no sense in saving
1935 // time, so we check whether the column indices were
1936 // invalid. However, only remember which ones were
1937 // invalid in a debug build, because that might take a
1938 // lot of space.
1939 for (LO k = 0; k < numEntries; ++k) {
1940 if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1941 lclErr = true;
1942#ifdef HAVE_TPETRA_DEBUG
1943 (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1944#endif // HAVE_TPETRA_DEBUG
1945 }
1946 }
1947 }
1948 }
1949 }
1950 } // for each "same" local row
1951
1952 // Copy the "permute" local rows.
1953 for (size_t k = 0; k < numPermute; ++k) {
1954 const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1955 const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1956
1957 local_inds_host_view_type lclSrcCols;
1958 values_host_view_type vals;
1959 LO numEntries;
1960 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1961 if (numEntries > 0) {
1962 LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1963 if (err != numEntries) {
1964 lclErr = true;
1965#ifdef HAVE_TPETRA_DEBUG
1966 for (LO c = 0; c < numEntries; ++c) {
1967 if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1968 invalidDstPermuteCols.insert (lclSrcCols[c]);
1969 }
1970 }
1971#endif // HAVE_TPETRA_DEBUG
1972 }
1973 }
1974 }
1975 }
1976 else { // must convert column indices to global
1977 // Reserve space to store the destination matrix's local column indices.
1978 const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries ();
1979 Teuchos::Array<LO> lclDstCols (maxNumEnt);
1980
1981 // Copy local rows that are the "same" in both source and target.
1982 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1983 local_inds_host_view_type lclSrcCols;
1984 values_host_view_type vals;
1985 LO numEntries;
1986
1987 // If this call fails, that means the mesh row local index is
1988 // invalid. That means the Import or Export is invalid somehow.
1989 try {
1990 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1991 } catch (std::exception& e) {
1992 if (debug) {
1993 std::ostringstream os;
1994 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1995 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1996 << localRow << ", src->getLocalRowView() threw an exception: "
1997 << e.what ();
1998 std::cerr << os.str ();
1999 }
2000 throw e;
2001 }
2002
2003 if (numEntries > 0) {
2004 if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2005 lclErr = true;
2006 if (debug) {
2007 std::ostringstream os;
2008 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2009 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2010 << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
2011 << maxNumEnt << endl;
2012 std::cerr << os.str ();
2013 }
2014 }
2015 else {
2016 // Convert the source matrix's local column indices to the
2017 // destination matrix's local column indices.
2018 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2019 for (LO j = 0; j < numEntries; ++j) {
     // Local (src) -> global -> local (dst); "invalid" means the
     // destination column Map does not own that global column.
2020 lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2021 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2022 lclErr = true;
2023#ifdef HAVE_TPETRA_DEBUG
2024 invalidDstCopyCols.insert (lclDstColsView[j]);
2025#endif // HAVE_TPETRA_DEBUG
2026 }
2027 }
2028 LO err(0);
2029 try {
2030 err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
2031 } catch (std::exception& e) {
2032 if (debug) {
2033 std::ostringstream os;
2034 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2035 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2036 << localRow << ", this->replaceLocalValues() threw an exception: "
2037 << e.what ();
2038 std::cerr << os.str ();
2039 }
2040 throw e;
2041 }
2042 if (err != numEntries) {
2043 lclErr = true;
2044 if (debug) {
2045 std::ostringstream os;
2046 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2047 os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
2048 "localRow " << localRow << ", this->replaceLocalValues "
2049 "returned " << err << " instead of numEntries = "
2050 << numEntries << endl;
2051 std::cerr << os.str ();
2052 }
2053 }
2054 }
2055 }
2056 }
2057
2058 // Copy the "permute" local rows.
2059 for (size_t k = 0; k < numPermute; ++k) {
2060 const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2061 const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2062
2063 local_inds_host_view_type lclSrcCols;
2064 values_host_view_type vals;
2065 LO numEntries;
2066
2067 try {
2068 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
2069 } catch (std::exception& e) {
2070 if (debug) {
2071 std::ostringstream os;
2072 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2073 os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
2074 "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
2075 << ", src->getLocalRowView() threw an exception: " << e.what ();
2076 std::cerr << os.str ();
2077 }
2078 throw e;
2079 }
2080
2081 if (numEntries > 0) {
2082 if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2083 lclErr = true;
2084 }
2085 else {
2086 // Convert the source matrix's local column indices to the
2087 // destination matrix's local column indices.
2088 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2089 for (LO j = 0; j < numEntries; ++j) {
2090 lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2091 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2092 lclErr = true;
2093#ifdef HAVE_TPETRA_DEBUG
2094 invalidDstPermuteCols.insert (lclDstColsView[j]);
2095#endif // HAVE_TPETRA_DEBUG
2096 }
2097 }
2098 LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
2099 if (err != numEntries) {
2100 lclErr = true;
2101 }
2102 }
2103 }
2104 }
2105 }
2106
     // Report: in debug builds, list every offending row/column set; in
     // release builds, emit a single summary message.
2107 if (lclErr) {
2108 std::ostream& err = this->markLocalErrorAndGetStream ();
2109#ifdef HAVE_TPETRA_DEBUG
2110 err << "copyAndPermute: The graph structure of the source of the "
2111 "Import or Export must be a subset of the graph structure of the "
2112 "target. ";
2113 err << "invalidSrcCopyRows = [";
2114 for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2115 it != invalidSrcCopyRows.end (); ++it) {
2116 err << *it;
2117 typename std::set<LO>::const_iterator itp1 = it;
2118 itp1++;
2119 if (itp1 != invalidSrcCopyRows.end ()) {
2120 err << ",";
2121 }
2122 }
2123 err << "], invalidDstCopyRows = [";
2124 for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2125 it != invalidDstCopyRows.end (); ++it) {
2126 err << *it;
2127 typename std::set<LO>::const_iterator itp1 = it;
2128 itp1++;
2129 if (itp1 != invalidDstCopyRows.end ()) {
2130 err << ",";
2131 }
2132 }
2133 err << "], invalidDstCopyCols = [";
2134 for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2135 it != invalidDstCopyCols.end (); ++it) {
2136 err << *it;
2137 typename std::set<LO>::const_iterator itp1 = it;
2138 itp1++;
2139 if (itp1 != invalidDstCopyCols.end ()) {
2140 err << ",";
2141 }
2142 }
2143 err << "], invalidDstPermuteCols = [";
2144 for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2145 it != invalidDstPermuteCols.end (); ++it) {
2146 err << *it;
2147 typename std::set<LO>::const_iterator itp1 = it;
2148 itp1++;
2149 if (itp1 != invalidDstPermuteCols.end ()) {
2150 err << ",";
2151 }
2152 }
2153 err << "], invalidPermuteFromRows = [";
2154 for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2155 it != invalidPermuteFromRows.end (); ++it) {
2156 err << *it;
2157 typename std::set<LO>::const_iterator itp1 = it;
2158 itp1++;
2159 if (itp1 != invalidPermuteFromRows.end ()) {
2160 err << ",";
2161 }
2162 }
2163 err << "]" << endl;
2164#else
2165 err << "copyAndPermute: The graph structure of the source of the "
2166 "Import or Export must be a subset of the graph structure of the "
2167 "target." << endl;
2168#endif // HAVE_TPETRA_DEBUG
2169 }
2170
2171 if (debug) {
2172 std::ostringstream os;
2173 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2174 const bool lclSuccess = ! (* (this->localError_));
2175 os << "*** Proc " << myRank << ": copyAndPermute "
2176 << (lclSuccess ? "succeeded" : "FAILED");
2177 if (lclSuccess) {
2178 os << endl;
2179 } else {
2180 os << ": error messages: " << this->errorMessages (); // comes w/ endl
2181 }
2182 std::cerr << os.str ();
2183 }
2184 }
2185
2186 namespace { // (anonymous)
2187
2206 template<class LO, class GO>
2207 size_t
     // Number of bytes needed to pack one block row with 'numEnt' block
     // entries: the entry count (one LO), then numEnt global column indices
     // (GO), then numEnt dense blocks of blkSize*blkSize scalars of
     // 'numBytesPerValue' bytes each. Must stay consistent with
     // packRowForBlockCrs / unpackRowForBlockCrs below.
2208 packRowCount (const size_t numEnt,
2209 const size_t numBytesPerValue,
2210 const size_t blkSize)
2211 {
2213
2214 if (numEnt == 0) {
2215 // Empty rows always take zero bytes, to ensure sparsity.
2216 return 0;
2217 }
2218 else {
2219 // We store the number of entries as a local index (LO).
2220 LO numEntLO = 0; // packValueCount wants this.
2221 GO gid {};
2222 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2223 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2224 const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2225 return numEntLen + gidsLen + valsLen;
2226 }
2227 }
2228
2239 template<class ST, class LO, class GO>
2240 size_t
     // Read back the entry count of the packed row that starts at byte
     // 'offset' of 'imports'. A row packed into zero bytes has zero entries.
     // Throws std::logic_error if the buffer region is too small to hold
     // even the LO entry count, or if more bytes were consumed than the row
     // was said to occupy.
2241 unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
2242 const size_t offset,
2243 const size_t numBytes,
2244 const size_t /* numBytesPerValue */)
2245 {
2246 using Kokkos::subview;
2247 using ::Tpetra::Details::PackTraits;
2248
2249 if (numBytes == 0) {
2250 // Empty rows always take zero bytes, to ensure sparsity.
2251 return static_cast<size_t> (0);
2252 }
2253 else {
2254 LO numEntLO = 0;
2255 const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
     // BUGFIX: the messages below previously printed " < " although the
     // tested (error) condition in each case is ">".
2256 TEUCHOS_TEST_FOR_EXCEPTION
2257 (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2258 "theNumBytes = " << theNumBytes << " > numBytes = " << numBytes
2259 << ".");
2260 const char* const inBuf = imports.data () + offset;
2261 const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
2262 TEUCHOS_TEST_FOR_EXCEPTION
2263 (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2264 "actualNumBytes = " << actualNumBytes << " > numBytes = " << numBytes
2265 << ".");
2266 return static_cast<size_t> (numEntLO);
2267 }
2268 }
2269
2273 template<class ST, class LO, class GO>
2274 size_t
     // Pack one block row into 'exports' starting at byte 'offset', in the
     // layout assumed by packRowCount / unpackRowForBlockCrs: entry count
     // (LO), then numEnt global column indices, then numEnt blockSize^2
     // scalar values. Returns the number of bytes written (0 for an empty
     // row); throws std::logic_error / std::runtime_error on layout or
     // PackTraits failures.
2275 packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
2276 const size_t offset,
2277 const size_t numEnt,
2278 const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
2279 const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
2280 const size_t numBytesPerValue,
2281 const size_t blockSize)
2282 {
2283 using Kokkos::subview;
2284 using ::Tpetra::Details::PackTraits;
2285
2286 if (numEnt == 0) {
2287 // Empty rows always take zero bytes, to ensure sparsity.
2288 return 0;
2289 }
2290 const size_t numScalarEnt = numEnt * blockSize * blockSize;
2291
2292 const GO gid = 0; // packValueCount wants this
     // BUGFIX: was static_cast<size_t> (numEnt), which cast to the wrong
     // type and then implicitly narrowed to LO; cast directly to LO.
2293 const LO numEntLO = static_cast<LO> (numEnt);
2294
     // Byte offsets of the three sections of the packed row.
2295 const size_t numEntBeg = offset;
2296 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2297 const size_t gidsBeg = numEntBeg + numEntLen;
2298 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2299 const size_t valsBeg = gidsBeg + gidsLen;
2300 const size_t valsLen = numScalarEnt * numBytesPerValue;
2301
2302 char* const numEntOut = exports.data () + numEntBeg;
2303 char* const gidsOut = exports.data () + gidsBeg;
2304 char* const valsOut = exports.data () + valsBeg;
2305
2306 size_t numBytesOut = 0;
2307 int errorCode = 0;
2308 numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
2309
2310 {
2311 Kokkos::pair<int, size_t> p;
2312 p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
2313 errorCode += p.first;
2314 numBytesOut += p.second;
2315
2316 p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
2317 errorCode += p.first;
2318 numBytesOut += p.second;
2319 }
2320
     // Cross-check actual bytes written against the precomputed layout.
2321 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2322 TEUCHOS_TEST_FOR_EXCEPTION
2323 (numBytesOut != expectedNumBytes, std::logic_error,
2324 "packRowForBlockCrs: numBytesOut = " << numBytesOut
2325 << " != expectedNumBytes = " << expectedNumBytes << ".");
2326
2327 TEUCHOS_TEST_FOR_EXCEPTION
2328 (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
2329 "PackTraits::packArray returned a nonzero error code");
2330
2331 return numBytesOut;
2332 }
2333
2334 // Return the number of bytes actually read / used.
2335 template<class ST, class LO, class GO>
2336 size_t
     // Inverse of packRowForBlockCrs: unpack one block row (entry count,
     // then global column indices into 'gidsOut', then blockSize^2 scalars
     // per entry into 'valsOut') from 'imports' starting at byte 'offset'.
     // 'numEnt' and 'numBytes' are the caller's expectations; mismatches
     // throw std::logic_error.
2337 unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
2338 const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
2339 const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
2340 const size_t offset,
2341 const size_t numBytes,
2342 const size_t numEnt,
2343 const size_t numBytesPerValue,
2344 const size_t blockSize)
2345 {
2346 using ::Tpetra::Details::PackTraits;
2347
2348 if (numBytes == 0) {
2349 // Rows with zero bytes always have zero entries.
2350 return 0;
2351 }
2352 const size_t numScalarEnt = numEnt * blockSize * blockSize;
     // Bounds checks: the requested region must lie inside 'imports'.
2353 TEUCHOS_TEST_FOR_EXCEPTION
2354 (static_cast<size_t> (imports.extent (0)) <= offset,
2355 std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2356 << imports.extent (0) << " <= offset = " << offset << ".");
2357 TEUCHOS_TEST_FOR_EXCEPTION
2358 (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
2359 std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2360 << imports.extent (0) << " < offset + numBytes = "
2361 << (offset + numBytes) << ".");
2362
2363 const GO gid = 0; // packValueCount wants this
2364 const LO lid = 0; // packValueCount wants this
2365
     // Recompute the packed-row layout (must match packRowForBlockCrs).
2366 const size_t numEntBeg = offset;
2367 const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
2368 const size_t gidsBeg = numEntBeg + numEntLen;
2369 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2370 const size_t valsBeg = gidsBeg + gidsLen;
2371 const size_t valsLen = numScalarEnt * numBytesPerValue;
2372
2373 const char* const numEntIn = imports.data () + numEntBeg;
2374 const char* const gidsIn = imports.data () + gidsBeg;
2375 const char* const valsIn = imports.data () + valsBeg;
2376
2377 size_t numBytesOut = 0;
2378 int errorCode = 0;
2379 LO numEntOut;
2380 numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
     // The entry count stored in the buffer must agree with the caller's.
2381 TEUCHOS_TEST_FOR_EXCEPTION
2382 (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2383 "unpackRowForBlockCrs: Expected number of entries " << numEnt
2384 << " != actual number of entries " << numEntOut << ".");
2385
2386 {
2387 Kokkos::pair<int, size_t> p;
2388 p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
2389 errorCode += p.first;
2390 numBytesOut += p.second;
2391
2392 p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
2393 errorCode += p.first;
2394 numBytesOut += p.second;
2395 }
2396
2397 TEUCHOS_TEST_FOR_EXCEPTION
2398 (numBytesOut != numBytes, std::logic_error,
2399 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2400 << " != numBytes = " << numBytes << ".");
2401
2402 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2403 TEUCHOS_TEST_FOR_EXCEPTION
2404 (numBytesOut != expectedNumBytes, std::logic_error,
2405 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2406 << " != expectedNumBytes = " << expectedNumBytes << ".");
2407
2408 TEUCHOS_TEST_FOR_EXCEPTION
2409 (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
2410 "PackTraits::unpackArray returned a nonzero error code");
2411
2412 return numBytesOut;
2413 }
2414 } // namespace (anonymous)
2415
2416 template<class Scalar, class LO, class GO, class Node>
2417 void
2420 (const ::Tpetra::SrcDistObject& source,
2421 const Kokkos::DualView<const local_ordinal_type*,
2422 buffer_device_type>& exportLIDs,
2423 Kokkos::DualView<packet_type*,
2424 buffer_device_type>& exports, // output
2425 Kokkos::DualView<size_t*,
2426 buffer_device_type> numPacketsPerLID, // output
2427 size_t& constantNumPackets)
2428 {
2433
2434 typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
2435
2436 typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
2437
2438 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
2439
2440 const bool debug = Behavior::debug();
2441 const bool verbose = Behavior::verbose();
2442
2443 // Define this function prefix
2444 std::string prefix;
2445 {
2446 std::ostringstream os;
2447 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2448 os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
2449 prefix = os.str();
2450 }
2451
2452 // check if this already includes a local error
2453 if (* (this->localError_)) {
2454 std::ostream& err = this->markLocalErrorAndGetStream ();
2455 err << prefix
2456 << "The target object of the Import or Export is already in an error state."
2457 << std::endl;
2458 return;
2459 }
2460
2461 //
2462 // Verbose input dual view status
2463 //
2464 if (verbose) {
2465 std::ostringstream os;
2466 os << prefix << std::endl
2467 << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
2468 << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
2469 << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
2470 std::cerr << os.str ();
2471 }
2472
2476 if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2477 std::ostream& err = this->markLocalErrorAndGetStream ();
2478 err << prefix
2479 << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
2480 << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2481 << "." << std::endl;
2482 return;
2483 }
2484 if (exportLIDs.need_sync_host ()) {
2485 std::ostream& err = this->markLocalErrorAndGetStream ();
2486 err << prefix << "exportLIDs be sync'd to host." << std::endl;
2487 return;
2488 }
2489
2490 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
2491 if (src == NULL) {
2492 std::ostream& err = this->markLocalErrorAndGetStream ();
2493 err << prefix
2494 << "The source (input) object of the Import or "
2495 "Export is either not a BlockCrsMatrix, or does not have the right "
2496 "template parameters. checkSizes() should have caught this. "
2497 "Please report this bug to the Tpetra developers." << std::endl;
2498 return;
2499 }
2500
2501 // Graphs and matrices are allowed to have a variable number of
2502 // entries per row. We could test whether all rows have the same
2503 // number of entries, but DistObject can only use this
2504 // optimization if all rows on _all_ processes have the same
2505 // number of entries. Rather than do the all-reduce necessary to
2506 // test for this unlikely case, we tell DistObject (by setting
2507 // constantNumPackets to zero) to assume that different rows may
2508 // have different numbers of entries.
2509 constantNumPackets = 0;
2510
2511 // const values
2512 const crs_graph_type& srcGraph = src->graph_;
2513 const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2514 const size_t numExportLIDs = exportLIDs.extent (0);
2515 size_t numBytesPerValue(0);
2516 {
2517 auto val_host = val_.getHostView(Access::ReadOnly);
2518 numBytesPerValue =
2519 PackTraits<impl_scalar_type>
2520 ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2521 }
2522
2523 // Compute the number of bytes ("packets") per row to pack. While
2524 // we're at it, compute the total # of block entries to send, and
2525 // the max # of block entries in any of the rows we're sending.
2526
2527 Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2528
2529 // Graph information is on host; let's do this on host parallel reduce
2530 auto exportLIDsHost = exportLIDs.view_host();
2531 auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2532 numPacketsPerLID.modify_host ();
2533 {
2534 using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2535 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs);
2536 Kokkos::parallel_reduce
2537 (policy,
2538 [=](const size_t &i, typename reducer_type::value_type &update) {
2539 const LO lclRow = exportLIDsHost(i);
2540 size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2541 numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2542
2543 const size_t numBytes =
2544 packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2545 numPacketsPerLIDHost(i) = numBytes;
2546 update += typename reducer_type::value_type(numEnt, numBytes, numEnt);
2547 }, rowReducerStruct);
2548 }
2549
2550 // Compute the number of bytes ("packets") per row to pack. While
2551 // we're at it, compute the total # of block entries to send, and
2552 // the max # of block entries in any of the rows we're sending.
2553 const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2554 const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2555 const size_t maxRowLength = rowReducerStruct.maxRowLength;
2556
2557 if (verbose) {
2558 std::ostringstream os;
2559 os << prefix
2560 << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2561 << std::endl;
2562 std::cerr << os.str ();
2563 }
2564
2565 // We use a "struct of arrays" approach to packing each row's
2566 // entries. All the column indices (as global indices) go first,
2567 // then all their owning process ranks, and then the values.
2568 if(exports.extent(0) != totalNumBytes)
2569 {
2570 const std::string oldLabel = exports.d_view.label ();
2571 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2572 exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2573 }
2574 if (totalNumEntries > 0) {
2575 // Current position (in bytes) in the 'exports' output array.
2576 Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2577 {
2578 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2579 Kokkos::parallel_scan
2580 (policy,
2581 [=](const size_t &i, size_t &update, const bool &final) {
2582 if (final) offset(i) = update;
2583 update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2584 });
2585 }
2586 if (offset(numExportLIDs) != totalNumBytes) {
2587 std::ostream& err = this->markLocalErrorAndGetStream ();
2588 err << prefix
2589 << "At end of method, the final offset (in bytes) "
2590 << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2591 << totalNumBytes << ". "
2592 << "Please report this bug to the Tpetra developers." << std::endl;
2593 return;
2594 }
2595
2596 // For each block row of the matrix owned by the calling
2597 // process, pack that block row's column indices and values into
2598 // the exports array.
2599
2600 // Source matrix's column Map. We verified in checkSizes() that
2601 // the column Map exists (is not null).
2602 const map_type& srcColMap = * (srcGraph.getColMap ());
2603
2604 // Pack the data for each row to send, into the 'exports' buffer.
2605 // exports will be modified on host.
2606 exports.modify_host();
2607 {
2608 typedef Kokkos::TeamPolicy<host_exec> policy_type;
2609 const auto policy =
2610 policy_type(numExportLIDs, 1, 1)
2611 .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2612 // The following parallel_for needs const access to the local values of src.
2613 // (the local graph is also accessed on host, but this does not use WDVs).
2614 getValuesHost();
2616 Kokkos::parallel_for
2617 (policy,
2618 [=](const typename policy_type::member_type &member) {
2619 const size_t i = member.league_rank();
2620 Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2621 gblColInds(member.team_scratch(0), maxRowLength);
2622
2623 const LO lclRowInd = exportLIDsHost(i);
2624 local_inds_host_view_type lclColInds;
2625 values_host_view_type vals;
2626
2627 // It's OK to ignore the return value, since if the calling
2628 // process doesn't own that local row, then the number of
2629 // entries in that row on the calling process is zero.
2630 src->getLocalRowView (lclRowInd, lclColInds, vals);
2631 const size_t numEnt = lclColInds.extent(0);
2632
2633 // Convert column indices from local to global.
2634 for (size_t j = 0; j < numEnt; ++j)
2635 gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2636
2637 // Kyungjoo: additional wrapping scratch view is necessary
2638 // the following function interface need the same execution space
2639 // host scratch space somehow is not considered same as the host_exec
2640 // Copy the row's data into the current spot in the exports array.
2641 const size_t numBytes =
2642 packRowForBlockCrs<impl_scalar_type, LO, GO>
2643 (exports.view_host(),
2644 offset(i),
2645 numEnt,
2646 Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2647 vals,
2648 numBytesPerValue,
2649 blockSize);
2650
2651 // numBytes should be same as the difference between offsets
2652 if (debug) {
2653 const size_t offsetDiff = offset(i+1) - offset(i);
2654 if (numBytes != offsetDiff) {
2655 std::ostringstream os;
2656 os << prefix
2657 << "numBytes computed from packRowForBlockCrs is different from "
2658 << "precomputed offset values, LID = " << i << std::endl;
2659 std::cerr << os.str ();
2660 }
2661 }
2662 }); // for each LID (of a row) to send
2664 }
2665 } // if totalNumEntries > 0
2666
2667 if (debug) {
2668 std::ostringstream os;
2669 const bool lclSuccess = ! (* (this->localError_));
2670 os << prefix
2671 << (lclSuccess ? "succeeded" : "FAILED")
2672 << std::endl;
2673 std::cerr << os.str ();
2674 }
2675 }
2676
  // unpackAndCombine: the receive side of an Import/Export (doTransfer).
  // Unpack block rows that a remote process packed (see packAndPrepare),
  // and combine them into this matrix's values according to combineMode
  // (ADD, INSERT, REPLACE, ABSMAX, or ZERO).
  //
  // importLIDs: local (mesh) row indices into which to unpack, one per
  //   received row.
  // imports: packed receive buffer (per row: entry count, global column
  //   ids, then blockSize*blockSize values per entry).
  // numPacketsPerLID: number of bytes received for each import LID.
  //
  // Errors are reported through markLocalErrorAndGetStream() rather than
  // thrown, because this runs inside the distributed-object transfer
  // machinery.  All unpacking happens on host.
  template<class Scalar, class LO, class GO, class Node>
  void
  (const Kokkos::DualView<const local_ordinal_type*,
                          buffer_device_type>& importLIDs,
   Kokkos::DualView<packet_type*,
                    buffer_device_type> imports,
   Kokkos::DualView<size_t*,
                    buffer_device_type> numPacketsPerLID,
   const size_t /* constantNumPackets */,
   const CombineMode combineMode)
  {
    using std::endl;
    // Unpacking runs on host, so use the host mirror's execution space.
    using host_exec =
      typename Kokkos::View<int*, device_type>::HostMirror::execution_space;

    ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
    const bool verbose = Behavior::verbose ();

    // Define this function prefix (rank-tagged, for diagnostics).
    std::string prefix;
    {
      std::ostringstream os;
      auto map = this->graph_.getRowMap ();
      auto comm = map.is_null () ? Teuchos::null : map->getComm ();
      const int myRank = comm.is_null () ? -1 : comm->getRank ();
      os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
      prefix = os.str ();
      if (verbose) {
        os << "Start" << endl;
        std::cerr << os.str ();
      }
    }

    // check if this already includes a local error; if so, don't make
    // things worse by unpacking on top of a known-bad state.
    if (* (this->localError_)) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      std::ostringstream os;
      os << prefix << "{Im/Ex}port target is already in error." << endl;
      if (verbose) {
        std::cerr << os.str ();
      }
      err << os.str ();
      return;
    }

    // Sanity check: one byte count per import LID.
    if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      std::ostringstream os;
      os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
         << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
         << "." << endl;
      if (verbose) {
        std::cerr << os.str ();
      }
      err << os.str ();
      return;
    }

    if (combineMode != ADD && combineMode != INSERT &&
        combineMode != REPLACE && combineMode != ABSMAX &&
        combineMode != ZERO) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      std::ostringstream os;
      os << prefix
         << "Invalid CombineMode value " << combineMode << ".  Valid "
         << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
         << std::endl;
      if (verbose) {
        std::cerr << os.str ();
      }
      err << os.str ();
      return;
    }

    if (this->graph_.getColMap ().is_null ()) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      std::ostringstream os;
      os << prefix << "Target matrix's column Map is null." << endl;
      if (verbose) {
        std::cerr << os.str ();
      }
      err << os.str ();
      return;
    }

    // Target matrix's column Map.  Use to convert the global column
    // indices in the receive buffer to local indices.  We verified in
    // checkSizes() that the column Map exists (is not null).
    const map_type& tgtColMap = * (this->graph_.getColMap ());

    // Const values
    const size_t blockSize = this->getBlockSize ();
    const size_t numImportLIDs = importLIDs.extent(0);
    // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
    // default-constructed instance could have a different size than
    // other instances.  (We assume all nominally constructed
    // instances have the same size; that's not the issue here.)  This
    // could be bad if the calling process has no entries, but other
    // processes have entries that they want to send to us.
    size_t numBytesPerValue(0);
    {
      auto val_host = val_.getHostView(Access::ReadOnly);
      numBytesPerValue =
        PackTraits<impl_scalar_type>::packValueCount
        (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
    }
    const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries ();
    const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;

    if (verbose) {
      std::ostringstream os;
      os << prefix << "combineMode: "
         << ::Tpetra::combineModeToString (combineMode)
         << ", blockSize: " << blockSize
         << ", numImportLIDs: " << numImportLIDs
         << ", numBytesPerValue: " << numBytesPerValue
         << ", maxRowNumEnt: " << maxRowNumEnt
         << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
      std::cerr << os.str ();
    }

    // ZERO means "combine nothing"; no imports means nothing to do.
    if (combineMode == ZERO || numImportLIDs == 0) {
      if (verbose) {
        std::ostringstream os;
        os << prefix << "Nothing to unpack. Done!" << std::endl;
        std::cerr << os.str ();
      }
      return;
    }

    // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
    // we can remove this sync.
    if (imports.need_sync_host ()) {
      imports.sync_host ();
    }

    // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
    // sync'd numPacketsPerLID to host, since it needs to do that in
    // order to post MPI_Irecv messages with the correct lengths.  A
    // hypothetical device-specific boundary exchange implementation
    // could possibly receive data without sync'ing lengths to host,
    // but we don't need to design for that nonexistent thing yet.

    if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
        importLIDs.need_sync_host ()) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      std::ostringstream os;
      os << prefix << "All input DualViews must be sync'd to host by now. "
         << "imports_nc.need_sync_host()="
         << (imports.need_sync_host () ? "true" : "false")
         << ", numPacketsPerLID.need_sync_host()="
         << (numPacketsPerLID.need_sync_host () ? "true" : "false")
         << ", importLIDs.need_sync_host()="
         << (importLIDs.need_sync_host () ? "true" : "false")
         << "." << endl;
      if (verbose) {
        std::cerr << os.str ();
      }
      err << os.str ();
      return;
    }

    const auto importLIDsHost = importLIDs.view_host ();
    const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();

    // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
    // loop, by only unpacking on final==true (when we know the
    // current offset's value).

    // Exclusive prefix sum of per-LID byte counts: offset(i) is the byte
    // position in 'imports' where row i's packed data begins.
    Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
    {
      const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
      Kokkos::parallel_scan
        ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
         [=] (const size_t &i, size_t &update, const bool &final) {
           if (final) offset(i) = update;
           update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
         });
    }

    // Races on this flag are harmless: worker threads only ever set it
    // to 1, and we read it after the parallel loop completes.
    //
    // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
    // or atomics on bool, so we use int instead.  (I know we're not
    // launching a CUDA loop here, but we could in the future, and I'd
    // like to avoid potential trouble.)
    Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
      errorDuringUnpack ("errorDuringUnpack");
    errorDuringUnpack () = 0;
    {
      using policy_type = Kokkos::TeamPolicy<host_exec>;
      // Per-team scratch holds one row's worth of global ids, local ids,
      // and unpacked block values.
      size_t scratch_per_row = sizeof(GO) * maxRowNumEnt + sizeof (LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt
        + 2 * sizeof(GO); // Yeah, this is a fudge factor

      const auto policy = policy_type (numImportLIDs, 1, 1)
        .set_scratch_size (0, Kokkos::PerTeam (scratch_per_row));
      using host_scratch_space = typename host_exec::scratch_memory_space;

      using pair_type = Kokkos::pair<size_t, size_t>;

      //The following parallel_for modifies values on host while unpacking.
      Kokkos::parallel_for
        ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
         [=] (const typename policy_type::member_type& member) {
           const size_t i = member.league_rank();
           Kokkos::View<GO*, host_scratch_space> gblColInds
             (member.team_scratch (0), maxRowNumEnt);
           Kokkos::View<LO*, host_scratch_space> lclColInds
             (member.team_scratch (0), maxRowNumEnt);
           Kokkos::View<impl_scalar_type*, host_scratch_space> vals
             (member.team_scratch (0), maxRowNumScalarEnt);


           const size_t offval = offset(i);
           const LO lclRow = importLIDsHost(i);
           const size_t numBytes = numPacketsPerLIDHost(i);
           // Read the entry count stored at the front of this row's packet.
           const size_t numEnt =
             unpackRowCount<impl_scalar_type, LO, GO>
             (imports.view_host (), offval, numBytes, numBytesPerValue);

           if (numBytes > 0) {
             // A row claiming more entries than any local row can hold
             // indicates a corrupt packet; flag and skip this row.
             if (numEnt > maxRowNumEnt) {
               errorDuringUnpack() = 1;
               if (verbose) {
                 std::ostringstream os;
                 os << prefix
                    << "At i = " << i << ", numEnt = " << numEnt
                    << " > maxRowNumEnt = " << maxRowNumEnt
                    << std::endl;
                 std::cerr << os.str ();
               }
               return;
             }
           }
           const size_t numScalarEnt = numEnt*blockSize*blockSize;
           auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
           auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
           auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));

           // Kyungjoo: additional wrapping scratch view is necessary
           //   the following function interface need the same execution space
           //   host scratch space somehow is not considered same as the host_exec
           size_t numBytesOut = 0;
           try {
             numBytesOut =
               unpackRowForBlockCrs<impl_scalar_type, LO, GO>
               (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
                Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
                imports.view_host(),
                offval, numBytes, numEnt,
                numBytesPerValue, blockSize);
           }
           catch (std::exception& e) {
             errorDuringUnpack() = 1;
             if (verbose) {
               std::ostringstream os;
               os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
                  << e.what () << endl;
               std::cerr << os.str ();
             }
             return;
           }

           // Consistency check: unpacked byte count must match the
           // count the sender reported for this LID.
           if (numBytes != numBytesOut) {
             errorDuringUnpack() = 1;
             if (verbose) {
               std::ostringstream os;
               os << prefix
                  << "At i = " << i << ", numBytes = " << numBytes
                  << " != numBytesOut = " << numBytesOut << "."
                  << std::endl;
               std::cerr << os.str();
             }
             return;
           }

           // Convert incoming global indices to local indices.
           for (size_t k = 0; k < numEnt; ++k) {
             lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
             if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
               if (verbose) {
                 std::ostringstream os;
                 os << prefix
                    << "At i = " << i << ", GID " << gidsOut(k)
                    << " is not owned by the calling process."
                    << std::endl;
                 std::cerr << os.str();
               }
               return;
             }
           }

           // Combine the incoming data with the matrix's current data.
           LO numCombd = 0;
           const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
           const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
             (const_cast<const impl_scalar_type*> (valsOut.data ()));
           if (combineMode == ADD) {
             numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
           } else if (combineMode == INSERT || combineMode == REPLACE) {
             numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
           } else if (combineMode == ABSMAX) {
             numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
           }

           // Every incoming entry should have found a matching column in
           // the target row's sparsity pattern.
           if (static_cast<LO> (numEnt) != numCombd) {
             errorDuringUnpack() = 1;
             if (verbose) {
               std::ostringstream os;
               os << prefix << "At i = " << i << ", numEnt = " << numEnt
                  << " != numCombd = " << numCombd << "."
                  << endl;
               std::cerr << os.str ();
             }
             return;
           }
         }); // for each import LID i
    }

    // Promote any per-row failure flagged above to this object's
    // persistent local-error state.
    if (errorDuringUnpack () != 0) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix << "Unpacking failed.";
      if (! verbose) {
        err << "  Please run again with the environment variable setting "
          "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
      }
      err << endl;
    }

    if (verbose) {
      std::ostringstream os;
      const bool lclSuccess = ! (* (this->localError_));
      os << prefix
         << (lclSuccess ? "succeeded" : "FAILED")
         << std::endl;
      std::cerr << os.str ();
    }
  }
3027
3028 template<class Scalar, class LO, class GO, class Node>
3029 std::string
3031 {
3032 using Teuchos::TypeNameTraits;
3033 std::ostringstream os;
3034 os << "\"Tpetra::BlockCrsMatrix\": { "
3035 << "Template parameters: { "
3036 << "Scalar: " << TypeNameTraits<Scalar>::name ()
3037 << "LO: " << TypeNameTraits<LO>::name ()
3038 << "GO: " << TypeNameTraits<GO>::name ()
3039 << "Node: " << TypeNameTraits<Node>::name ()
3040 << " }"
3041 << ", Label: \"" << this->getObjectLabel () << "\""
3042 << ", Global dimensions: ["
3043 << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3044 << graph_.getRangeMap ()->getGlobalNumElements () << "]"
3045 << ", Block size: " << getBlockSize ()
3046 << " }";
3047 return os.str ();
3048 }
3049
3050
  /// \brief Print a (possibly very verbose) description of this matrix.
  ///
  /// \param out       Output stream; only Process 0 in the row Map's
  ///                  communicator actually prints (higher verbosity
  ///                  gathers every process' data to Process 0 first).
  /// \param verbLevel Verbosity: NONE prints nothing; LOW (the default)
  ///                  prints a summary; MEDIUM adds the constituent Maps;
  ///                  EXTREME prints every entry of the matrix.
  template<class Scalar, class LO, class GO, class Node>
  void
  describe (Teuchos::FancyOStream& out,
            const Teuchos::EVerbosityLevel verbLevel) const
  {
    using Teuchos::ArrayRCP;
    using Teuchos::CommRequest;
    using Teuchos::FancyOStream;
    using Teuchos::getFancyOStream;
    using Teuchos::ireceive;
    using Teuchos::isend;
    using Teuchos::outArg;
    using Teuchos::TypeNameTraits;
    using Teuchos::VERB_DEFAULT;
    using Teuchos::VERB_NONE;
    using Teuchos::VERB_LOW;
    using Teuchos::VERB_MEDIUM;
    // using Teuchos::VERB_HIGH;
    using Teuchos::VERB_EXTREME;
    using Teuchos::RCP;
    using Teuchos::wait;
    using std::endl;

    // Set default verbosity if applicable.
    const Teuchos::EVerbosityLevel vl =
      (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;

    if (vl == VERB_NONE) {
      return; // print nothing
    }

    // describe() always starts with a tab before it prints anything.
    Teuchos::OSTab tab0 (out);

    out << "\"Tpetra::BlockCrsMatrix\":" << endl;
    Teuchos::OSTab tab1 (out);

    out << "Template parameters:" << endl;
    {
      Teuchos::OSTab tab2 (out);
      out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
          << "LO: " << TypeNameTraits<LO>::name () << endl
          << "GO: " << TypeNameTraits<GO>::name () << endl
          << "Node: " << TypeNameTraits<Node>::name () << endl;
    }
    out << "Label: \"" << this->getObjectLabel () << "\"" << endl
        << "Global dimensions: ["
        << graph_.getDomainMap ()->getGlobalNumElements () << ", "
        << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;

    const LO blockSize = getBlockSize ();
    out << "Block size: " << blockSize << endl;

    // constituent objects (Maps); print each only once, noting aliasing
    // (e.g. "Column Map: same as row Map") instead of repeating it.
    if (vl >= VERB_MEDIUM) {
      const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
      const int myRank = comm.getRank ();
      if (myRank == 0) {
        out << "Row Map:" << endl;
      }
      getRowMap()->describe (out, vl);

      if (! getColMap ().is_null ()) {
        if (getColMap() == getRowMap()) {
          if (myRank == 0) {
            out << "Column Map: same as row Map" << endl;
          }
        }
        else {
          if (myRank == 0) {
            out << "Column Map:" << endl;
          }
          getColMap ()->describe (out, vl);
        }
      }
      if (! getDomainMap ().is_null ()) {
        if (getDomainMap () == getRowMap ()) {
          if (myRank == 0) {
            out << "Domain Map: same as row Map" << endl;
          }
        }
        else if (getDomainMap () == getColMap ()) {
          if (myRank == 0) {
            out << "Domain Map: same as column Map" << endl;
          }
        }
        else {
          if (myRank == 0) {
            out << "Domain Map:" << endl;
          }
          getDomainMap ()->describe (out, vl);
        }
      }
      if (! getRangeMap ().is_null ()) {
        if (getRangeMap () == getDomainMap ()) {
          if (myRank == 0) {
            out << "Range Map: same as domain Map" << endl;
          }
        }
        else if (getRangeMap () == getRowMap ()) {
          if (myRank == 0) {
            out << "Range Map: same as row Map" << endl;
          }
        }
        else {
          if (myRank == 0) {
            out << "Range Map: " << endl;
          }
          getRangeMap ()->describe (out, vl);
        }
      }
    }

    if (vl >= VERB_EXTREME) {
      const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
      const int myRank = comm.getRank ();
      const int numProcs = comm.getSize ();

      // Print the calling process' data to the given output stream.
      // Each process formats its rows into a local string first; the
      // strings are then funneled to Process 0 for printing in rank
      // order.
      RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
      RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
      FancyOStream& os = *osPtr;
      os << "Process " << myRank << ":" << endl;
      Teuchos::OSTab tab2 (os);

      const map_type& meshRowMap = * (graph_.getRowMap ());
      const map_type& meshColMap = * (graph_.getColMap ());
      for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
           meshLclRow <= meshRowMap.getMaxLocalIndex ();
           ++meshLclRow) {
        const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
        os << "Row " << meshGblRow << ": {";

        local_inds_host_view_type lclColInds;
        values_host_view_type vals;
        LO numInds = 0;
        this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);

        for (LO k = 0; k < numInds; ++k) {
          const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);

          // Print the k-th block of this row: rows separated by ';',
          // entries within a row by ','.
          os << "Col " << gblCol << ": [";
          for (LO i = 0; i < blockSize; ++i) {
            for (LO j = 0; j < blockSize; ++j) {
              os << vals[blockSize*blockSize*k + i*blockSize + j];
              if (j + 1 < blockSize) {
                os << ", ";
              }
            }
            if (i + 1 < blockSize) {
              os << "; ";
            }
          }
          os << "]";
          if (k + 1 < numInds) {
            os << ", ";
          }
        }
        os << "}" << endl;
      }

      // Print data on Process 0.  This will automatically respect the
      // current indentation level.
      if (myRank == 0) {
        out << lclOutStrPtr->str ();
        lclOutStrPtr = Teuchos::null; // clear it to save space
      }

      const int sizeTag = 1337;
      const int dataTag = 1338;

      ArrayRCP<char> recvDataBuf; // only used on Process 0

      // Send string sizes and data from each process in turn to
      // Process 0, and print on that process.
      for (int p = 1; p < numProcs; ++p) {
        if (myRank == 0) {
          // Receive the incoming string's length.
          ArrayRCP<size_t> recvSize (1);
          recvSize[0] = 0;
          RCP<CommRequest<int> > recvSizeReq =
            ireceive<int, size_t> (recvSize, p, sizeTag, comm);
          wait<int> (comm, outArg (recvSizeReq));
          const size_t numCharsToRecv = recvSize[0];

          // Allocate space for the string to receive.  Reuse receive
          // buffer space if possible.  We can do this because in the
          // current implementation, we only have one receive in
          // flight at a time.  Leave space for the '\0' at the end,
          // in case the sender doesn't send it.
          if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
            recvDataBuf.resize (numCharsToRecv + 1);
          }
          ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
          // Post the receive of the actual string data.
          RCP<CommRequest<int> > recvDataReq =
            ireceive<int, char> (recvData, p, dataTag, comm);
          wait<int> (comm, outArg (recvDataReq));

          // Print the received data.  This will respect the current
          // indentation level.  Make sure that the string is
          // null-terminated.
          recvDataBuf[numCharsToRecv] = '\0';
          out << recvDataBuf.getRawPtr ();
        }
        else if (myRank == p) { // if I am not Process 0, and my rank is p
          // This deep-copies the string at most twice, depending on
          // whether std::string reference counts internally (it
          // generally does, so this won't deep-copy at all).
          const std::string stringToSend = lclOutStrPtr->str ();
          lclOutStrPtr = Teuchos::null; // clear original to save space

          // Send the string's length to Process 0.
          const size_t numCharsToSend = stringToSend.size ();
          ArrayRCP<size_t> sendSize (1);
          sendSize[0] = numCharsToSend;
          RCP<CommRequest<int> > sendSizeReq =
            isend<int, size_t> (sendSize, 0, sizeTag, comm);
          wait<int> (comm, outArg (sendSizeReq));

          // Send the actual string to Process 0.  We know that the
          // string has length > 0, so it's safe to take the address
          // of the first entry.  Make a nonowning ArrayRCP to hold
          // the string.  Process 0 will add a null termination
          // character at the end of the string, after it receives the
          // message.
          ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
          RCP<CommRequest<int> > sendDataReq =
            isend<int, char> (sendData, 0, dataTag, comm);
          wait<int> (comm, outArg (sendDataReq));
        }
      } // for each process rank p other than 0
    } // extreme verbosity level (print the whole matrix)
  }
3286
3287 template<class Scalar, class LO, class GO, class Node>
3288 Teuchos::RCP<const Teuchos::Comm<int> >
3290 getComm() const
3291 {
3292 return graph_.getComm();
3293 }
3294
3295
3296 template<class Scalar, class LO, class GO, class Node>
3299 getGlobalNumCols() const
3300 {
3301 return graph_.getGlobalNumCols();
3302 }
3303
3304
3305 template<class Scalar, class LO, class GO, class Node>
3306 size_t
3308 getLocalNumCols() const
3309 {
3310 return graph_.getLocalNumCols();
3311 }
3312
3313 template<class Scalar, class LO, class GO, class Node>
3314 GO
3316 getIndexBase() const
3317 {
3318 return graph_.getIndexBase();
3319 }
3320
3321 template<class Scalar, class LO, class GO, class Node>
3325 {
3326 return graph_.getGlobalNumEntries();
3327 }
3328
3329 template<class Scalar, class LO, class GO, class Node>
3330 size_t
3332 getLocalNumEntries() const
3333 {
3334 return graph_.getLocalNumEntries();
3335 }
3336
3337 template<class Scalar, class LO, class GO, class Node>
3338 size_t
3340 getNumEntriesInGlobalRow (GO globalRow) const
3341 {
3342 return graph_.getNumEntriesInGlobalRow(globalRow);
3343 }
3344
3345
3346 template<class Scalar, class LO, class GO, class Node>
3347 size_t
3350 {
3351 return graph_.getGlobalMaxNumRowEntries();
3352 }
3353
3354 template<class Scalar, class LO, class GO, class Node>
3355 bool
3357 hasColMap() const
3358 {
3359 return graph_.hasColMap();
3360 }
3361
3362
3363 template<class Scalar, class LO, class GO, class Node>
3364 bool
3366 isLocallyIndexed() const
3367 {
3368 return graph_.isLocallyIndexed();
3369 }
3370
3371 template<class Scalar, class LO, class GO, class Node>
3372 bool
3374 isGloballyIndexed() const
3375 {
3376 return graph_.isGloballyIndexed();
3377 }
3378
3379 template<class Scalar, class LO, class GO, class Node>
3380 bool
3382 isFillComplete() const
3383 {
3384 return graph_.isFillComplete ();
3385 }
3386
3387 template<class Scalar, class LO, class GO, class Node>
3388 bool
3390 supportsRowViews() const
3391 {
3392 return false;
3393 }
3394
  /// \brief Not implemented: BlockCrsMatrix is locally indexed only.
  ///
  /// Always throws std::logic_error; the parameters are never read.
  template<class Scalar, class LO, class GO, class Node>
  void
  getGlobalRowCopy (GO /*GlobalRow*/,
                    nonconst_global_inds_host_view_type &/*Indices*/,
                    nonconst_values_host_view_type &/*Values*/,
                    size_t& /*NumEntries*/) const
  {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
      "This class doesn't support global matrix indexing.");

  }
3408
  /// \brief Not implemented: BlockCrsMatrix is locally indexed only.
  ///
  /// Always throws std::logic_error; the parameters are never read.
  template<class Scalar, class LO, class GO, class Node>
  void
  getGlobalRowView (GO /* GlobalRow */,
                    global_inds_host_view_type &/* indices */,
                    values_host_view_type &/* values */) const
  {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
      "This class doesn't support global matrix indexing.");

  }
3421
3422 template<class Scalar, class LO, class GO, class Node>
3423 void
3425 getLocalRowView (LO localRowInd,
3426 local_inds_host_view_type &colInds,
3427 values_host_view_type &vals) const
3428 {
3429 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3430 colInds = local_inds_host_view_type();
3431 vals = values_host_view_type();
3432 }
3433 else {
3434 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3435 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3436 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3437
3438 vals = getValuesHost (localRowInd);
3439 }
3440 }
3441
3442 template<class Scalar, class LO, class GO, class Node>
3443 void
3445 getLocalRowViewNonConst (LO localRowInd,
3446 local_inds_host_view_type &colInds,
3447 nonconst_values_host_view_type &vals) const
3448 {
3449 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3450 colInds = nonconst_local_inds_host_view_type();
3451 vals = nonconst_values_host_view_type();
3452 }
3453 else {
3454 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3455 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3456 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3457
3458 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
3459 vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
3460 }
3461 }
3462
  // getLocalDiagCopy: copy the scalar diagonal entries of this matrix
  // into `diag` (a Tpetra vector indexed by point, not mesh, rows), one
  // value per point row r*bs+b.
  // NOTE(review): the signature line is not visible in this rendering;
  // `diag` is presumably a ::Tpetra::Vector<Scalar, LO, GO, Node>& --
  // confirm against the declaration.
  template<class Scalar, class LO, class GO, class Node>
  void
  {
    const size_t lclNumMeshRows = graph_.getLocalNumRows ();

    // Ask the graph where each row's diagonal block sits (offset within
    // the row, in units of blocks).
    Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
    graph_.getLocalDiagOffsets (diagOffsets);

    // The code below works on host, so use a host View.
    auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
    // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
    Kokkos::deep_copy (execution_space(), diagOffsetsHost, diagOffsets);

    auto vals_host_out = val_.getHostView(Access::ReadOnly);
    const impl_scalar_type* vals_host_out_raw = vals_host_out.data();

    // TODO amk: This is a temporary measure to make the code run with Ifpack2
    size_t rowOffset = 0;  // running offset (in scalars) of row r's first block
    size_t offset = 0;     // offset (in scalars) of row r's diagonal block
    LO bs = getBlockSize();
    for(size_t r=0; r<getLocalNumRows(); r++)
    {
      // move pointer to start of diagonal block
      offset = rowOffset + diagOffsetsHost(r)*bs*bs;
      for(int b=0; b<bs; b++)
      {
        // b-th diagonal entry of the bs x bs diagonal block (row-major:
        // stride bs+1 walks the block's diagonal).
        diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
      }
      // move pointer to start of next block row
      rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
    }

    //diag.template sync<memory_space> (); // sync vec of diag entries back to dev
  }
3499
  /// \brief Not implemented for BlockCrsMatrix.
  ///
  /// Always throws std::logic_error; the vector argument is never read.
  template<class Scalar, class LO, class GO, class Node>
  void
  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
  {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
      "not implemented.");

  }
3510
  /// \brief Not implemented for BlockCrsMatrix.
  ///
  /// Always throws std::logic_error; the vector argument is never read.
  template<class Scalar, class LO, class GO, class Node>
  void
  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
  {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
      "not implemented.");

  }
3521
3522 template<class Scalar, class LO, class GO, class Node>
3523 Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3525 getGraph() const
3526 {
3527 return graphRCP_;
3528 }
3529
  /// \brief Not implemented for BlockCrsMatrix.
  ///
  /// Always throws std::logic_error, so control never reaches the end
  /// of this non-void function.
  template<class Scalar, class LO, class GO, class Node>
  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
  getFrobeniusNorm () const
  {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
      "not implemented.");
  }
3539
3540} // namespace Tpetra
3541
3542//
3543// Explicit instantiation macro
3544//
3545// Must be expanded from within the Tpetra namespace!
3546//
3547#define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3548 template class BlockCrsMatrix< S, LO, GO, NODE >;
3549
3550#endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
Linear algebra kernels for small dense matrices and vectors.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual LO getBlockSize() const override
The number of degrees of freedom per mesh point.
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix's communicator.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
std::string errorMessages() const
The current stream of error messages.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of sumIntoLocalValues that uses the ABSMAX combine mode: each target entry is replaced by the maximum of the magnitudes of the old and new values.
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
LO local_ordinal_type
The type of local indices.
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row's entries.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
::Tpetra::MultiVector< Scalar, LO, GO, node_type > mv_type
The implementation of MultiVector that this class uses.
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
size_t getLocalNumRows() const override
get the local number of block rows
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix's values (val_).
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
local_matrix_device_type getLocalMatrixDevice() const
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
std::string description() const override
One-line description of this object.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row.
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
Kokkos::View< const impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
char packet_type
Implementation detail; the type of each packet (a raw byte) used in DistObject communication buffers.
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
::Tpetra::Map< LO, GO, node_type > map_type
The implementation of Map that this class uses.
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
::Tpetra::CrsGraph< LO, GO, node_type > crs_graph_type
The implementation of CrsGraph that this class uses.
global_size_t getGlobalNumRows() const override
get the global number of block rows
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
virtual bool checkSizes(const ::Tpetra::SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
MultiVector for multiple degrees of freedom per mesh point.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
Tpetra::Map< LO, GO, Node > map_type
The specialization of Tpetra::Map that this class uses.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
::Tpetra::Import< LO, GO, node_type > import_type
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Description of Tpetra's behavior.
virtual Teuchos::RCP< const map_type > getMap() const
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
A distributed dense vector.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
void disableWDVTracking()
Disable WrappedDualView reference-count tracking and syncing. Call this before entering a host-parall...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void enableWDVTracking()
Enable WrappedDualView reference-count tracking and syncing. Call this after exiting a host-parallel ...
Namespace for new Tpetra features that are not ready for public release, but are ready for evaluation...
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
Layout used for the little dense blocks; currently Kokkos::LayoutRight, with an option to use LayoutLeft instead.
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra's ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
Teuchos::RCP< CrsGraphType > exportAndFillCompleteCrsGraph(const Teuchos::RCP< const CrsGraphType > &sourceGraph, const Export< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > &exporter, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Nonmember CrsGraph constructor that fuses Export and fillComplete().
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
Teuchos::RCP< CrsGraphType > importAndFillCompleteCrsGraph(const Teuchos::RCP< const CrsGraphType > &sourceGraph, const Import< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > &importer, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Nonmember CrsGraph constructor that fuses Import and fillComplete().
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.
Traits class for packing / unpacking data of type T.