40#ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
41#define TPETRA_BLOCKCRSMATRIX_DEF_HPP
49#include "Tpetra_BlockMultiVector.hpp"
52#include "Teuchos_TimeMonitor.hpp"
53#ifdef HAVE_TPETRA_DEBUG
69#if defined(__CUDACC__)
71# if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
72# undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
75#elif defined(__GNUC__)
77# if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
78# undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
84# if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
85# define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
95 struct BlockCrsRowStruct {
96 T totalNumEntries, totalNumBytes, maxRowLength;
97 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() =
default;
98 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(
const BlockCrsRowStruct &b) =
default;
99 KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(
const T& numEnt,
const T& numBytes,
const T& rowLength)
100 : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
105 KOKKOS_INLINE_FUNCTION
106 void operator+=(BlockCrsRowStruct<T> &a,
const BlockCrsRowStruct<T> &b) {
107 a.totalNumEntries += b.totalNumEntries;
108 a.totalNumBytes += b.totalNumBytes;
109 a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
112 template<
typename T,
typename ExecSpace>
113 struct BlockCrsReducer {
114 typedef BlockCrsReducer reducer;
115 typedef T value_type;
116 typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
119 KOKKOS_INLINE_FUNCTION
120 BlockCrsReducer(value_type &val) : value(&val) {}
122 KOKKOS_INLINE_FUNCTION
void join(value_type &dst, value_type &src)
const { dst += src; }
123 KOKKOS_INLINE_FUNCTION
void join(value_type &dst,
const value_type &src)
const { dst += src; }
124 KOKKOS_INLINE_FUNCTION
void init(value_type &val)
const { val = value_type(); }
125 KOKKOS_INLINE_FUNCTION value_type& reference() {
return *value; }
126 KOKKOS_INLINE_FUNCTION result_view_type view()
const {
return result_view_type(value); }
129 template<
class AlphaCoeffType,
131 class MatrixValuesType,
136 class BcrsApplyNoTransFunctor {
138 static_assert (Kokkos::is_view<MatrixValuesType>::value,
139 "MatrixValuesType must be a Kokkos::View.");
140 static_assert (Kokkos::is_view<OutVecType>::value,
141 "OutVecType must be a Kokkos::View.");
142 static_assert (Kokkos::is_view<InVecType>::value,
143 "InVecType must be a Kokkos::View.");
144 static_assert (std::is_same<MatrixValuesType,
145 typename MatrixValuesType::const_type>::value,
146 "MatrixValuesType must be a const Kokkos::View.");
147 static_assert (std::is_same<
typename OutVecType::value_type,
148 typename OutVecType::non_const_value_type>::value,
149 "OutVecType must be a nonconst Kokkos::View.");
150 static_assert (std::is_same<
typename InVecType::value_type,
151 typename InVecType::const_value_type>::value,
152 "InVecType must be a const Kokkos::View.");
153 static_assert (
static_cast<int> (MatrixValuesType::rank) == 1,
154 "MatrixValuesType must be a rank-1 Kokkos::View.");
155 static_assert (
static_cast<int> (InVecType::rank) == 1,
156 "InVecType must be a rank-1 Kokkos::View.");
157 static_assert (
static_cast<int> (OutVecType::rank) == 1,
158 "OutVecType must be a rank-1 Kokkos::View.");
159 typedef typename MatrixValuesType::non_const_value_type scalar_type;
162 typedef typename GraphType::device_type device_type;
165 typedef typename std::remove_const<typename GraphType::data_type>::type
174 void setX (
const InVecType& X) { X_ = X; }
182 void setY (
const OutVecType& Y) { Y_ = Y; }
184 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
185 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
188 BcrsApplyNoTransFunctor (
const alpha_coeff_type& alpha,
189 const GraphType& graph,
190 const MatrixValuesType& val,
191 const local_ordinal_type blockSize,
193 const beta_coeff_type& beta,
194 const OutVecType& Y) :
196 ptr_ (graph.row_map),
197 ind_ (graph.entries),
199 blockSize_ (blockSize),
206 KOKKOS_INLINE_FUNCTION
void
207 operator () (
const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member)
const
209 Kokkos::abort(
"Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
213 KOKKOS_INLINE_FUNCTION
void
214 operator () (
const local_ordinal_type& lclRow)
const
219 using Kokkos::ArithTraits;
225 using Kokkos::parallel_for;
226 using Kokkos::subview;
227 typedef typename decltype (ptr_)::non_const_value_type offset_type;
228 typedef Kokkos::View<
typename MatrixValuesType::const_value_type**,
231 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
234 const offset_type Y_ptBeg = lclRow * blockSize_;
235 const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
236 auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
240 if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
241 FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
243 else if (beta_ != ArithTraits<beta_coeff_type>::one ()) {
247 if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
248 const offset_type blkBeg = ptr_[lclRow];
249 const offset_type blkEnd = ptr_[lclRow+1];
251 const offset_type bs2 = blockSize_ * blockSize_;
252 for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
254 little_block_type A_cur (val_.data () + absBlkOff * bs2,
255 blockSize_, blockSize_);
256 const offset_type X_blkCol = ind_[absBlkOff];
257 const offset_type X_ptBeg = X_blkCol * blockSize_;
258 const offset_type X_ptEnd = X_ptBeg + blockSize_;
259 auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
261 GEMV (alpha_, A_cur, X_cur, Y_cur);
267 alpha_coeff_type alpha_;
268 typename GraphType::row_map_type::const_type ptr_;
269 typename GraphType::entries_type::const_type ind_;
270 MatrixValuesType val_;
271 local_ordinal_type blockSize_;
273 beta_coeff_type beta_;
277 template<
class AlphaCoeffType,
279 class MatrixValuesType,
283 class BcrsApplyNoTransFunctor<AlphaCoeffType,
291 static_assert (Kokkos::is_view<MatrixValuesType>::value,
292 "MatrixValuesType must be a Kokkos::View.");
293 static_assert (Kokkos::is_view<OutVecType>::value,
294 "OutVecType must be a Kokkos::View.");
295 static_assert (Kokkos::is_view<InVecType>::value,
296 "InVecType must be a Kokkos::View.");
297 static_assert (std::is_same<MatrixValuesType,
298 typename MatrixValuesType::const_type>::value,
299 "MatrixValuesType must be a const Kokkos::View.");
300 static_assert (std::is_same<
typename OutVecType::value_type,
301 typename OutVecType::non_const_value_type>::value,
302 "OutVecType must be a nonconst Kokkos::View.");
303 static_assert (std::is_same<
typename InVecType::value_type,
304 typename InVecType::const_value_type>::value,
305 "InVecType must be a const Kokkos::View.");
306 static_assert (
static_cast<int> (MatrixValuesType::rank) == 1,
307 "MatrixValuesType must be a rank-1 Kokkos::View.");
308 static_assert (
static_cast<int> (InVecType::rank) == 1,
309 "InVecType must be a rank-1 Kokkos::View.");
310 static_assert (
static_cast<int> (OutVecType::rank) == 1,
311 "OutVecType must be a rank-1 Kokkos::View.");
312 typedef typename MatrixValuesType::non_const_value_type scalar_type;
315 typedef typename GraphType::device_type device_type;
318 static constexpr bool runOnHost = !std::is_same_v<typename device_type::execution_space, Kokkos::DefaultExecutionSpace> || std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::DefaultHostExecutionSpace>;
323 typedef typename std::remove_const<typename GraphType::data_type>::type
332 void setX (
const InVecType& X) { X_ = X; }
340 void setY (
const OutVecType& Y) { Y_ = Y; }
342 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
343 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
346 BcrsApplyNoTransFunctor (
const alpha_coeff_type& alpha,
347 const GraphType& graph,
348 const MatrixValuesType& val,
349 const local_ordinal_type blockSize,
351 const beta_coeff_type& beta,
352 const OutVecType& Y) :
354 ptr_ (graph.row_map),
355 ind_ (graph.entries),
357 blockSize_ (blockSize),
364 KOKKOS_INLINE_FUNCTION
void
365 operator () (
const local_ordinal_type& lclRow)
const
367 Kokkos::abort(
"Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
371 KOKKOS_INLINE_FUNCTION
void
372 operator () (
const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member)
const
375 const local_ordinal_type lclRow = member.league_rank();
377 using Kokkos::ArithTraits;
383 using Kokkos::parallel_for;
384 using Kokkos::subview;
385 typedef typename decltype (ptr_)::non_const_value_type offset_type;
386 typedef Kokkos::View<
typename MatrixValuesType::const_value_type**,
389 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
392 const offset_type Y_ptBeg = lclRow * blockSize_;
393 const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
394 auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
398 if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
399 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
400 [&](
const local_ordinal_type &i) {
401 Y_cur(i) = ArithTraits<beta_coeff_type>::zero ();
404 else if (beta_ != ArithTraits<beta_coeff_type>::one ()) {
405 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
406 [&](
const local_ordinal_type &i) {
410 member.team_barrier();
412 if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
413 const offset_type blkBeg = ptr_[lclRow];
414 const offset_type blkEnd = ptr_[lclRow+1];
416 const offset_type bs2 = blockSize_ * blockSize_;
417 little_block_type A_cur (val_.data (), blockSize_, blockSize_);
418 auto X_cur = subview (X_, ::Kokkos::make_pair (0, blockSize_));
420 (Kokkos::TeamThreadRange(member, blkBeg, blkEnd),
421 [&](
const local_ordinal_type &absBlkOff) {
422 A_cur.assign_data(val_.data () + absBlkOff * bs2);
423 const offset_type X_blkCol = ind_[absBlkOff];
424 const offset_type X_ptBeg = X_blkCol * blockSize_;
425 X_cur.assign_data(&X_(X_ptBeg));
428 (Kokkos::ThreadVectorRange(member, blockSize_),
429 [&](
const local_ordinal_type &k0) {
431 for (local_ordinal_type k1=0;k1<blockSize_;++k1)
432 val += A_cur(k0,k1)*X_cur(k1);
433 if constexpr(runOnHost) {
435 Y_cur(k0) += alpha_*val;
442 Kokkos::atomic_add(&Y_cur(k0), alpha_*val);
450 alpha_coeff_type alpha_;
451 typename GraphType::row_map_type::const_type ptr_;
452 typename GraphType::entries_type::const_type ind_;
453 MatrixValuesType val_;
454 local_ordinal_type blockSize_;
456 beta_coeff_type beta_;
461 template<
class AlphaCoeffType,
463 class MatrixValuesType,
464 class InMultiVecType,
466 class OutMultiVecType>
468 bcrsLocalApplyNoTrans (
const AlphaCoeffType& alpha,
469 const GraphType& graph,
470 const MatrixValuesType& val,
471 const typename std::remove_const<typename GraphType::data_type>::type blockSize,
472 const InMultiVecType& X,
473 const BetaCoeffType& beta,
474 const OutMultiVecType& Y
477 static_assert (Kokkos::is_view<MatrixValuesType>::value,
478 "MatrixValuesType must be a Kokkos::View.");
479 static_assert (Kokkos::is_view<OutMultiVecType>::value,
480 "OutMultiVecType must be a Kokkos::View.");
481 static_assert (Kokkos::is_view<InMultiVecType>::value,
482 "InMultiVecType must be a Kokkos::View.");
483 static_assert (
static_cast<int> (MatrixValuesType::rank) == 1,
484 "MatrixValuesType must be a rank-1 Kokkos::View.");
485 static_assert (
static_cast<int> (OutMultiVecType::rank) == 2,
486 "OutMultiVecType must be a rank-2 Kokkos::View.");
487 static_assert (
static_cast<int> (InMultiVecType::rank) == 2,
488 "InMultiVecType must be a rank-2 Kokkos::View.");
490 typedef typename MatrixValuesType::device_type::execution_space execution_space;
491 typedef typename MatrixValuesType::device_type::memory_space memory_space;
492 typedef typename MatrixValuesType::const_type matrix_values_type;
493 typedef typename OutMultiVecType::non_const_type out_multivec_type;
494 typedef typename InMultiVecType::const_type in_multivec_type;
495 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
496 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
497 typedef typename std::remove_const<typename GraphType::data_type>::type LO;
499 constexpr bool is_builtin_type_enabled = std::is_arithmetic<typename InMultiVecType::non_const_value_type>::value;
500 constexpr bool is_host_memory_space = std::is_same<memory_space,Kokkos::HostSpace>::value;
501 constexpr bool use_team_policy = (is_builtin_type_enabled && !is_host_memory_space);
503 const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
504 static_cast<LO
> (0) :
505 static_cast<LO> (graph.row_map.extent (0) - 1);
506 const LO numVecs = Y.extent (1);
507 if (numLocalMeshRows == 0 || numVecs == 0) {
514 in_multivec_type X_in = X;
515 out_multivec_type Y_out = Y;
520 typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
521 typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
522 typedef BcrsApplyNoTransFunctor<alpha_type, GraphType,
523 matrix_values_type, in_vec_type, beta_type, out_vec_type,
524 use_team_policy> functor_type;
526 auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
527 auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
530 if (use_team_policy) {
531 functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
533 typedef Kokkos::TeamPolicy<execution_space> policy_type;
534 policy_type policy(1,1);
535#if defined(KOKKOS_ENABLE_CUDA)
536 constexpr bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
538 constexpr bool is_cuda =
false;
541 LO team_size = 8, vector_size = 1;
542 if (blockSize <= 5) vector_size = 4;
543 else if (blockSize <= 9) vector_size = 8;
544 else if (blockSize <= 12) vector_size = 12;
545 else if (blockSize <= 20) vector_size = 20;
546 else vector_size = 20;
547 policy = policy_type(numLocalMeshRows, team_size, vector_size);
549 policy = policy_type(numLocalMeshRows, 1, 1);
551 Kokkos::parallel_for (policy, functor);
554 for (LO j = 1; j < numVecs; ++j) {
555 auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
556 auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
559 Kokkos::parallel_for (policy, functor);
562 functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
563 Kokkos::RangePolicy<execution_space,LO> policy(0, numLocalMeshRows);
564 Kokkos::parallel_for (policy, functor);
565 for (LO j = 1; j < numVecs; ++j) {
566 auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
567 auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
570 Kokkos::parallel_for (policy, functor);
580template<
class Scalar,
class LO,
class GO,
class Node>
581class GetLocalDiagCopy {
583 typedef typename Node::device_type device_type;
584 typedef size_t diag_offset_type;
585 typedef Kokkos::View<
const size_t*, device_type,
586 Kokkos::MemoryUnmanaged> diag_offsets_type;
587 typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
589 typedef typename local_graph_device_type::row_map_type row_offsets_type;
590 typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
591 typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
592 typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
595 GetLocalDiagCopy (
const diag_type& diag,
596 const values_type& val,
597 const diag_offsets_type& diagOffsets,
598 const row_offsets_type& ptr,
599 const LO blockSize) :
601 diagOffsets_ (diagOffsets),
603 blockSize_ (blockSize),
604 offsetPerBlock_ (blockSize_*blockSize_),
608 KOKKOS_INLINE_FUNCTION
void
609 operator() (
const LO& lclRowInd)
const
614 const size_t absOffset = ptr_[lclRowInd];
617 const size_t relOffset = diagOffsets_[lclRowInd];
620 const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
625 device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
626 const_little_block_type;
627 const_little_block_type D_in (val_.data () + pointOffset,
628 blockSize_, blockSize_);
629 auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
635 diag_offsets_type diagOffsets_;
636 row_offsets_type ptr_;
643 template<
class Scalar,
class LO,
class GO,
class Node>
648 * (this->localError_) =
true;
649 if ((*errs_).is_null ()) {
650 *errs_ = Teuchos::rcp (
new std::ostringstream ());
655 template<
class Scalar,
class LO,
class GO,
class Node>
658 dist_object_type (Teuchos::rcp (new
map_type ())),
659 graph_ (Teuchos::rcp (new
map_type ()), 0),
660 blockSize_ (static_cast<LO> (0)),
661 X_colMap_ (new Teuchos::RCP<BMV> ()),
662 Y_rowMap_ (new Teuchos::RCP<BMV> ()),
665 localError_ (new bool (false)),
666 errs_ (new Teuchos::RCP<std::ostringstream> ())
670 template<
class Scalar,
class LO,
class GO,
class Node>
673 const LO blockSize) :
674 dist_object_type (graph.
getMap ()),
677 blockSize_ (blockSize),
678 X_colMap_ (new Teuchos::RCP<BMV> ()),
679 Y_rowMap_ (new Teuchos::RCP<BMV> ()),
681 offsetPerBlock_ (blockSize * blockSize),
682 localError_ (new bool (false)),
683 errs_ (new Teuchos::RCP<std::ostringstream> ())
687 TEUCHOS_TEST_FOR_EXCEPTION(
688 ! graph_.isSorted (), std::invalid_argument,
"Tpetra::"
689 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
690 "rows (isSorted() is false). This class assumes sorted rows.");
692 graphRCP_ = Teuchos::rcpFromRef(graph_);
698 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
699 TEUCHOS_TEST_FOR_EXCEPTION(
700 blockSizeIsNonpositive, std::invalid_argument,
"Tpetra::"
701 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
702 " <= 0. The block size must be positive.");
710 const auto colPointMap = Teuchos::rcp
712 *pointImporter_ = Teuchos::rcp
716 auto local_graph_h = graph.getLocalGraphHost ();
717 auto ptr_h = local_graph_h.row_map;
718 ptrHost_ =
decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph row offset"), ptr_h.extent(0));
719 Kokkos::deep_copy(ptrHost_, ptr_h);
721 auto ind_h = local_graph_h.entries;
722 indHost_ =
decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph column indices"), ind_h.extent(0));
724 Kokkos::deep_copy (indHost_, ind_h);
726 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
727 val_ =
decltype (val_) (impl_scalar_type_dualview(
"val", numValEnt));
731 template<
class Scalar,
class LO,
class GO,
class Node>
734 const typename local_matrix_device_type::values_type& values,
735 const LO blockSize) :
736 dist_object_type (graph.
getMap ()),
739 blockSize_ (blockSize),
740 X_colMap_ (new Teuchos::RCP<BMV> ()),
741 Y_rowMap_ (new Teuchos::RCP<BMV> ()),
743 offsetPerBlock_ (blockSize * blockSize),
744 localError_ (new bool (false)),
745 errs_ (new Teuchos::RCP<std::ostringstream> ())
748 TEUCHOS_TEST_FOR_EXCEPTION(
749 ! graph_.isSorted (), std::invalid_argument,
"Tpetra::"
750 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
751 "rows (isSorted() is false). This class assumes sorted rows.");
753 graphRCP_ = Teuchos::rcpFromRef(graph_);
759 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
760 TEUCHOS_TEST_FOR_EXCEPTION(
761 blockSizeIsNonpositive, std::invalid_argument,
"Tpetra::"
762 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
763 " <= 0. The block size must be positive.");
771 const auto colPointMap = Teuchos::rcp
773 *pointImporter_ = Teuchos::rcp
777 auto local_graph_h = graph.getLocalGraphHost ();
778 auto ptr_h = local_graph_h.row_map;
779 ptrHost_ =
decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph row offset"), ptr_h.extent(0));
780 Kokkos::deep_copy(ptrHost_, ptr_h);
782 auto ind_h = local_graph_h.entries;
783 indHost_ =
decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph column indices"), ind_h.extent(0));
784 Kokkos::deep_copy (indHost_, ind_h);
786 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
787 TEUCHOS_ASSERT_EQUALITY(numValEnt, values.size());
788 val_ =
decltype (val_) (values);
792 template<
class Scalar,
class LO,
class GO,
class Node>
797 const LO blockSize) :
798 dist_object_type (graph.
getMap ()),
801 domainPointMap_ (domainPointMap),
802 rangePointMap_ (rangePointMap),
803 blockSize_ (blockSize),
804 X_colMap_ (new Teuchos::RCP<BMV> ()),
805 Y_rowMap_ (new Teuchos::RCP<BMV> ()),
807 offsetPerBlock_ (blockSize * blockSize),
808 localError_ (new bool (false)),
809 errs_ (new Teuchos::RCP<std::ostringstream> ())
811 TEUCHOS_TEST_FOR_EXCEPTION(
812 ! graph_.isSorted (), std::invalid_argument,
"Tpetra::"
813 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
814 "rows (isSorted() is false). This class assumes sorted rows.");
816 graphRCP_ = Teuchos::rcpFromRef(graph_);
822 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
823 TEUCHOS_TEST_FOR_EXCEPTION(
824 blockSizeIsNonpositive, std::invalid_argument,
"Tpetra::"
825 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
826 " <= 0. The block size must be positive.");
830 const auto colPointMap = Teuchos::rcp
832 *pointImporter_ = Teuchos::rcp
836 auto local_graph_h = graph.getLocalGraphHost ();
837 auto ptr_h = local_graph_h.row_map;
838 ptrHost_ =
decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph row offset"), ptr_h.extent(0));
840 Kokkos::deep_copy(ptrHost_, ptr_h);
842 auto ind_h = local_graph_h.entries;
843 indHost_ =
decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph column indices"), ind_h.extent(0));
845 Kokkos::deep_copy (indHost_, ind_h);
847 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
848 val_ =
decltype (val_) (impl_scalar_type_dualview(
"val", numValEnt));
853 template<
class Scalar,
class LO,
class GO,
class Node>
854 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
859 return Teuchos::rcp (
new map_type (domainPointMap_));
862 template<
class Scalar,
class LO,
class GO,
class Node>
863 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
868 return Teuchos::rcp (
new map_type (rangePointMap_));
871 template<
class Scalar,
class LO,
class GO,
class Node>
872 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
876 return graph_.getRowMap();
879 template<
class Scalar,
class LO,
class GO,
class Node>
880 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
884 return graph_.getColMap();
887 template<
class Scalar,
class LO,
class GO,
class Node>
892 return graph_.getGlobalNumRows();
895 template<
class Scalar,
class LO,
class GO,
class Node>
900 return graph_.getLocalNumRows();
903 template<
class Scalar,
class LO,
class GO,
class Node>
908 return graph_.getLocalMaxNumRowEntries();
911 template<
class Scalar,
class LO,
class GO,
class Node>
916 Teuchos::ETransp mode,
921 TEUCHOS_TEST_FOR_EXCEPTION(
922 mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
923 std::invalid_argument,
"Tpetra::BlockCrsMatrix::apply: "
924 "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
925 "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
931 X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
932 Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
934 catch (std::invalid_argument& e) {
935 TEUCHOS_TEST_FOR_EXCEPTION(
936 true, std::invalid_argument,
"Tpetra::BlockCrsMatrix::"
937 "apply: Either the input MultiVector X or the output MultiVector Y "
938 "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
939 "graph. BlockMultiVector's constructor threw the following exception: "
948 const_cast<this_BCRS_type&
> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
949 }
catch (std::invalid_argument& e) {
950 TEUCHOS_TEST_FOR_EXCEPTION(
951 true, std::invalid_argument,
"Tpetra::BlockCrsMatrix::"
952 "apply: The implementation method applyBlock complained about having "
953 "an invalid argument. It reported the following: " << e.what ());
954 }
catch (std::logic_error& e) {
955 TEUCHOS_TEST_FOR_EXCEPTION(
956 true, std::invalid_argument,
"Tpetra::BlockCrsMatrix::"
957 "apply: The implementation method applyBlock complained about a "
958 "possible bug in its implementation. It reported the following: "
960 }
catch (std::exception& e) {
961 TEUCHOS_TEST_FOR_EXCEPTION(
962 true, std::invalid_argument,
"Tpetra::BlockCrsMatrix::"
963 "apply: The implementation method applyBlock threw an exception which "
964 "is neither std::invalid_argument nor std::logic_error, but is a "
965 "subclass of std::exception. It reported the following: "
968 TEUCHOS_TEST_FOR_EXCEPTION(
969 true, std::logic_error,
"Tpetra::BlockCrsMatrix::"
970 "apply: The implementation method applyBlock threw an exception which "
971 "is not an instance of a subclass of std::exception. This probably "
972 "indicates a bug. Please report this to the Tpetra developers.");
976 template<
class Scalar,
class LO,
class GO,
class Node>
981 Teuchos::ETransp mode,
985 TEUCHOS_TEST_FOR_EXCEPTION(
987 "Tpetra::BlockCrsMatrix::applyBlock: "
988 "X and Y have different block sizes. X.getBlockSize() = "
992 if (mode == Teuchos::NO_TRANS) {
993 applyBlockNoTrans (X, Y, alpha, beta);
994 }
else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
995 applyBlockTrans (X, Y, mode, alpha, beta);
997 TEUCHOS_TEST_FOR_EXCEPTION(
998 true, std::invalid_argument,
"Tpetra::BlockCrsMatrix::"
999 "applyBlock: Invalid 'mode' argument. Valid values are "
1000 "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
1004 template<
class Scalar,
class LO,
class GO,
class Node>
1015 TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
1016 "destMatrix is required to be null.");
1020 RCP<crs_graph_type> srcGraph = rcp (
new crs_graph_type(this->getCrsGraph()));
1022 srcGraph->getDomainMap(),
1023 srcGraph->getRangeMap());
1027 destMatrix = rcp (
new this_BCRS_type (*destGraph,
getBlockSize()));
1032 template<
class Scalar,
class LO,
class GO,
class Node>
1043 TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
1044 "destMatrix is required to be null.");
1048 RCP<crs_graph_type> srcGraph = rcp (
new crs_graph_type(this->getCrsGraph()));
1050 srcGraph->getDomainMap(),
1051 srcGraph->getRangeMap());
1055 destMatrix = rcp (
new this_BCRS_type (*destGraph,
getBlockSize()));
1060 template<
class Scalar,
class LO,
class GO,
class Node>
1065 auto val_d = val_.getDeviceView(Access::OverwriteAll);
1066 Kokkos::deep_copy(val_d, alpha);
1069 template<
class Scalar,
class LO,
class GO,
class Node>
1074 const Scalar vals[],
1075 const LO numColInds)
const
1077 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1078 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing(
"offsets"), numColInds);
1079 ptrdiff_t * offsets = offsets_host_view.data();
1080 const LO numOffsets = this->
getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1085 template <
class Scalar,
class LO,
class GO,
class Node>
1089 Kokkos::MemoryUnmanaged>& offsets)
const
1091 graph_.getLocalDiagOffsets (offsets);
1094 template <
class Scalar,
class LO,
class GO,
class Node>
1098 Kokkos::MemoryUnmanaged>& diag,
1100 Kokkos::MemoryUnmanaged>& offsets)
const
1102 using Kokkos::parallel_for;
1103 const char prefix[] =
"Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1105 const LO lclNumMeshRows =
static_cast<LO
> (rowMeshMap_.getLocalNumElements ());
1107 TEUCHOS_TEST_FOR_EXCEPTION
1108 (
static_cast<LO
> (diag.extent (0)) < lclNumMeshRows ||
1109 static_cast<LO
> (diag.extent (1)) < blockSize ||
1110 static_cast<LO
> (diag.extent (2)) < blockSize,
1111 std::invalid_argument, prefix <<
1112 "The input Kokkos::View is not big enough to hold all the data.");
1113 TEUCHOS_TEST_FOR_EXCEPTION
1114 (
static_cast<LO
> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1115 prefix <<
"offsets.size() = " << offsets.size () <<
" < local number of "
1116 "diagonal blocks " << lclNumMeshRows <<
".");
1118 typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1119 typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
1125 auto val_d = val_.getDeviceView(Access::ReadOnly);
1126 parallel_for (policy_type (0, lclNumMeshRows),
1127 functor_type (diag, val_d, offsets,
1128 graph_.getLocalGraphDevice ().row_map, blockSize_));
1131 template<
class Scalar,
class LO,
class GO,
class Node>
1136 const Scalar vals[],
1137 const LO numColInds)
const
1139 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1140 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing(
"offsets"), numColInds);
1141 ptrdiff_t * offsets = offsets_host_view.data();
1142 const LO numOffsets = this->
getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1143 const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1148 template<
class Scalar,
class LO,
class GO,
class Node>
1153 const Scalar vals[],
1154 const LO numColInds)
const
1156 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1157 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing(
"offsets"), numColInds);
1158 ptrdiff_t * offsets = offsets_host_view.data();
1159 const LO numOffsets = this->
getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1163 template<
class Scalar,
class LO,
class GO,
class Node>
1167 nonconst_local_inds_host_view_type &Indices,
1168 nonconst_values_host_view_type &Values,
1169 size_t& NumEntries)
const
1171 auto vals = getValuesHost(LocalRow);
1172 const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
1173 if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
1174 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::runtime_error,
1175 "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1176 << numInds <<
" row entries");
1178 const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
1179 for (LO i=0; i<numInds; ++i) {
1180 Indices[i] = colInds[i];
1182 for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1183 Values[i] = vals[i];
1185 NumEntries = numInds;
1188 template<
class Scalar,
class LO,
class GO,
class Node>
1192 ptrdiff_t offsets[],
1194 const LO numColInds)
const
1196 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1201 return static_cast<LO
> (0);
1204 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1208 for (LO k = 0; k < numColInds; ++k) {
1209 const LO relBlockOffset =
1210 this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1211 if (relBlockOffset != LINV) {
1212 offsets[k] =
static_cast<ptrdiff_t
> (relBlockOffset);
1213 hint = relBlockOffset + 1;
1221 template<
class Scalar,
class LO,
class GO,
class Node>
1225 const ptrdiff_t offsets[],
1226 const Scalar vals[],
1227 const LO numOffsets)
const
1229 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1234 return static_cast<LO
> (0);
1238 auto val_out =
const_cast<this_BCRS_type&
>(*this).getValuesHostNonConst(localRowInd);
1241 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1242 const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
1243 size_t pointOffset = 0;
1246 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1247 const size_t blockOffset = offsets[k]*perBlockSize;
1248 if (offsets[k] != STINV) {
1251 COPY (A_new, A_old);
1259 template<
class Scalar,
class LO,
class GO,
class Node>
1263 const ptrdiff_t offsets[],
1264 const Scalar vals[],
1265 const LO numOffsets)
const
1267 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1272 return static_cast<LO
> (0);
1274 const impl_scalar_type*
const vIn =
reinterpret_cast<const impl_scalar_type*
> (vals);
1275 auto val_out = getValuesHost(localRowInd);
1276 impl_scalar_type* vOut =
const_cast<impl_scalar_type*
>(val_out.data());
1278 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1279 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1280 size_t pointOffset = 0;
1283 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1284 const size_t blockOffset = offsets[k]*perBlockSize;
1285 if (blockOffset != STINV) {
1286 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1287 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1296 template<
class Scalar,
class LO,
class GO,
class Node>
1300 const ptrdiff_t offsets[],
1301 const Scalar vals[],
1302 const LO numOffsets)
const
1304 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1309 return static_cast<LO
> (0);
1314 auto val_out =
const_cast<this_BCRS_type&
>(*this).getValuesHostNonConst(localRowInd);
1317 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1318 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1319 size_t pointOffset = 0;
1322 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1323 const size_t blockOffset = offsets[k]*perBlockSize;
1324 if (blockOffset != STINV) {
1327 AXPY (ONE, A_new, A_old);
1334 template<
class Scalar,
class LO,
class GO,
class Node>
1336 impl_scalar_type_dualview::t_host::const_type
1340 return val_.getHostView(Access::ReadOnly);
1343 template<
class Scalar,
class LO,
class GO,
class Node>
1344 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1345 impl_scalar_type_dualview::t_dev::const_type
1346 BlockCrsMatrix<Scalar, LO, GO, Node>::
1347 getValuesDevice ()
const
1349 return val_.getDeviceView(Access::ReadOnly);
1352 template<
class Scalar,
class LO,
class GO,
class Node>
1354 impl_scalar_type_dualview::t_host
1358 return val_.getHostView(Access::ReadWrite);
1361 template<
class Scalar,
class LO,
class GO,
class Node>
1363 impl_scalar_type_dualview::t_dev
1367 return val_.getDeviceView(Access::ReadWrite);
1370 template<
class Scalar,
class LO,
class GO,
class Node>
1371 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1372 impl_scalar_type_dualview::t_host::const_type
1376 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1377 auto val = val_.getHostView(Access::ReadOnly);
1378 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1382 template<
class Scalar,
class LO,
class GO,
class Node>
1384 impl_scalar_type_dualview::t_dev::const_type
1388 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1389 auto val = val_.getDeviceView(Access::ReadOnly);
1390 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1394 template<
class Scalar,
class LO,
class GO,
class Node>
1399 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1400 auto val = val_.getHostView(Access::ReadWrite);
1401 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1405 template<
class Scalar,
class LO,
class GO,
class Node>
1410 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1411 auto val = val_.getDeviceView(Access::ReadWrite);
1412 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1416 template<
class Scalar,
class LO,
class GO,
class Node>
1421 const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1422 if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1423 return static_cast<LO
> (0);
1425 return static_cast<LO
> (numEntInGraph);
1429 template<
class Scalar,
class LO,
class GO,
class Node>
1430 typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
1434 auto numCols = this->graph_.
getColMap()->getLocalNumElements();
1435 auto val = val_.getDeviceView(Access::ReadWrite);
1437 const auto graph = this->graph_.getLocalGraphDevice ();
1439 return local_matrix_device_type(
"Tpetra::BlockCrsMatrix::lclMatrixDevice",
1446 template<
class Scalar,
class LO,
class GO,
class Node>
1451 const Teuchos::ETransp mode,
1461 TEUCHOS_TEST_FOR_EXCEPTION(
1462 true, std::logic_error,
"Tpetra::BlockCrsMatrix::apply: "
1463 "transpose and conjugate transpose modes are not yet implemented.");
1466 template<
class Scalar,
class LO,
class GO,
class Node>
1468 BlockCrsMatrix<Scalar, LO, GO, Node>::
1469 applyBlockNoTrans (
const BlockMultiVector<Scalar, LO, GO, Node>& X,
1470 BlockMultiVector<Scalar, LO, GO, Node>& Y,
1476 typedef ::Tpetra::Import<LO, GO, Node> import_type;
1477 typedef ::Tpetra::Export<LO, GO, Node> export_type;
1478 const Scalar zero = STS::zero ();
1479 const Scalar one = STS::one ();
1480 RCP<const import_type>
import = graph_.getImporter ();
1482 RCP<const export_type> theExport = graph_.getExporter ();
1483 const char prefix[] =
"Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1485 if (alpha == zero) {
1489 else if (beta != one) {
1494 const BMV* X_colMap = NULL;
1495 if (
import.is_null ()) {
1499 catch (std::exception& e) {
1500 TEUCHOS_TEST_FOR_EXCEPTION
1501 (
true, std::logic_error, prefix <<
"Tpetra::MultiVector::"
1502 "operator= threw an exception: " << std::endl << e.what ());
1512 if ((*X_colMap_).is_null () ||
1513 (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1514 (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1515 *X_colMap_ = rcp (
new BMV (* (graph_.getColMap ()), getBlockSize (),
1516 static_cast<LO
> (X.getNumVectors ())));
1518 (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1522 X_colMap = &(**X_colMap_);
1524 catch (std::exception& e) {
1525 TEUCHOS_TEST_FOR_EXCEPTION
1526 (
true, std::logic_error, prefix <<
"Tpetra::MultiVector::"
1527 "operator= threw an exception: " << std::endl << e.what ());
1531 BMV* Y_rowMap = NULL;
1532 if (theExport.is_null ()) {
1535 else if ((*Y_rowMap_).is_null () ||
1536 (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1537 (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1538 *Y_rowMap_ = rcp (
new BMV (* (graph_.getRowMap ()), getBlockSize (),
1539 static_cast<LO
> (X.getNumVectors ())));
1541 Y_rowMap = &(**Y_rowMap_);
1543 catch (std::exception& e) {
1544 TEUCHOS_TEST_FOR_EXCEPTION(
1545 true, std::logic_error, prefix <<
"Tpetra::MultiVector::"
1546 "operator= threw an exception: " << std::endl << e.what ());
1551 localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1553 catch (std::exception& e) {
1554 TEUCHOS_TEST_FOR_EXCEPTION
1555 (
true, std::runtime_error, prefix <<
"localApplyBlockNoTrans threw "
1556 "an exception: " << e.what ());
1559 TEUCHOS_TEST_FOR_EXCEPTION
1560 (
true, std::runtime_error, prefix <<
"localApplyBlockNoTrans threw "
1561 "an exception not a subclass of std::exception.");
1564 if (! theExport.is_null ()) {
1570 template<
class Scalar,
class LO,
class GO,
class Node>
1578 using ::Tpetra::Impl::bcrsLocalApplyNoTrans;
1581 const auto graph = this->graph_.getLocalGraphDevice ();
1583 const LO blockSize = this->getBlockSize ();
1585 auto X_mv = X.getMultiVectorView ();
1586 auto Y_mv = Y.getMultiVectorView ();
1590 auto X_lcl = X_mv.getLocalViewDevice (Access::ReadOnly);
1591 auto Y_lcl = Y_mv.getLocalViewDevice (Access::ReadWrite);
1592 auto val = val_.getDeviceView(Access::ReadWrite);
1594 bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1598 template<
class Scalar,
class LO,
class GO,
class Node>
1602 const LO colIndexToFind,
1603 const LO hint)
const
1605 const size_t absStartOffset = ptrHost_[localRowIndex];
1606 const size_t absEndOffset = ptrHost_[localRowIndex+1];
1607 const LO numEntriesInRow =
static_cast<LO
> (absEndOffset - absStartOffset);
1609 const LO*
const curInd = indHost_.data () + absStartOffset;
1612 if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1619 LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1624 const LO maxNumEntriesForLinearSearch = 32;
1625 if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1630 const LO* beg = curInd;
1631 const LO* end = curInd + numEntriesInRow;
1632 std::pair<const LO*, const LO*> p =
1633 std::equal_range (beg, end, colIndexToFind);
1634 if (p.first != p.second) {
1636 relOffset =
static_cast<LO
> (p.first - beg);
1640 for (LO k = 0; k < numEntriesInRow; ++k) {
1641 if (colIndexToFind == curInd[k]) {
1651 template<
class Scalar,
class LO,
class GO,
class Node>
1656 return offsetPerBlock_;
1659 template<
class Scalar,
class LO,
class GO,
class Node>
1663 const size_t pointOffset)
const
1666 const LO rowStride = blockSize_;
1667 return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1670 template<
class Scalar,
class LO,
class GO,
class Node>
1674 const size_t pointOffset)
const
1677 const LO rowStride = blockSize_;
1678 return little_block_type (val + pointOffset, blockSize_, rowStride);
1681 template<
class Scalar,
class LO,
class GO,
class Node>
1682 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1685 const size_t pointOffset)
const
1688 const LO rowStride = blockSize_;
1689 const size_t bs2 = blockSize_ * blockSize_;
1690 return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1693 template<
class Scalar,
class LO,
class GO,
class Node>
1700 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1701 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1702 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1703 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1704 auto vals =
const_cast<this_BCRS_type&
>(*this).getValuesDeviceNonConst();
1705 auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1709 return little_block_type ();
1713 template<
class Scalar,
class LO,
class GO,
class Node>
1714 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1720 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1721 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1722 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1723 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1724 auto vals =
const_cast<this_BCRS_type&
>(*this).getValuesHostNonConst();
1725 auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1729 return little_block_host_type ();
1734 template<
class Scalar,
class LO,
class GO,
class Node>
1737 checkSizes (const ::Tpetra::SrcDistObject& source)
1741 const this_BCRS_type* src =
dynamic_cast<const this_BCRS_type*
> (&source);
1744 std::ostream& err = markLocalErrorAndGetStream ();
1745 err <<
"checkSizes: The source object of the Import or Export "
1746 "must be a BlockCrsMatrix with the same template parameters as the "
1747 "target object." << endl;
1752 if (src->getBlockSize () != this->getBlockSize ()) {
1753 std::ostream& err = markLocalErrorAndGetStream ();
1754 err <<
"checkSizes: The source and target objects of the Import or "
1755 <<
"Export must have the same block sizes. The source's block "
1756 <<
"size = " << src->getBlockSize () <<
" != the target's block "
1759 if (! src->graph_.isFillComplete ()) {
1760 std::ostream& err = markLocalErrorAndGetStream ();
1761 err <<
"checkSizes: The source object of the Import or Export is "
1762 "not fill complete. Both source and target objects must be fill "
1763 "complete." << endl;
1765 if (! this->graph_.isFillComplete ()) {
1766 std::ostream& err = markLocalErrorAndGetStream ();
1767 err <<
"checkSizes: The target object of the Import or Export is "
1768 "not fill complete. Both source and target objects must be fill "
1769 "complete." << endl;
1771 if (src->graph_.getColMap ().is_null ()) {
1772 std::ostream& err = markLocalErrorAndGetStream ();
1773 err <<
"checkSizes: The source object of the Import or Export does "
1774 "not have a column Map. Both source and target objects must have "
1775 "column Maps." << endl;
1777 if (this->graph_.getColMap ().is_null ()) {
1778 std::ostream& err = markLocalErrorAndGetStream ();
1779 err <<
"checkSizes: The target object of the Import or Export does "
1780 "not have a column Map. Both source and target objects must have "
1781 "column Maps." << endl;
1785 return ! (* (this->localError_));
1788 template<
class Scalar,
class LO,
class GO,
class Node>
1792 (const ::Tpetra::SrcDistObject& source,
1793 const size_t numSameIDs,
1806 ProfilingRegion profile_region(
"Tpetra::BlockCrsMatrix::copyAndPermute");
1807 const bool debug = Behavior::debug();
1808 const bool verbose = Behavior::verbose();
1813 std::ostringstream os;
1814 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1815 os <<
"Proc " << myRank <<
": BlockCrsMatrix::copyAndPermute : " << endl;
1820 if (* (this->localError_)) {
1821 std::ostream& err = this->markLocalErrorAndGetStream ();
1823 <<
"The target object of the Import or Export is already in an error state."
1832 std::ostringstream os;
1833 os << prefix << endl
1834 << prefix <<
" " << dualViewStatusToString (permuteToLIDs,
"permuteToLIDs") << endl
1835 << prefix <<
" " << dualViewStatusToString (permuteFromLIDs,
"permuteFromLIDs") << endl;
1836 std::cerr << os.str ();
1842 if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1843 std::ostream& err = this->markLocalErrorAndGetStream ();
1845 <<
"permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1846 <<
" != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1850 if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1851 std::ostream& err = this->markLocalErrorAndGetStream ();
1853 <<
"Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1858 const this_BCRS_type* src =
dynamic_cast<const this_BCRS_type*
> (&source);
1860 std::ostream& err = this->markLocalErrorAndGetStream ();
1862 <<
"The source (input) object of the Import or "
1863 "Export is either not a BlockCrsMatrix, or does not have the right "
1864 "template parameters. checkSizes() should have caught this. "
1865 "Please report this bug to the Tpetra developers." << endl;
1869 bool lclErr =
false;
1870#ifdef HAVE_TPETRA_DEBUG
1871 std::set<LO> invalidSrcCopyRows;
1872 std::set<LO> invalidDstCopyRows;
1873 std::set<LO> invalidDstCopyCols;
1874 std::set<LO> invalidDstPermuteCols;
1875 std::set<LO> invalidPermuteFromRows;
1885#ifdef HAVE_TPETRA_DEBUG
1886 const map_type& srcRowMap = * (src->graph_.getRowMap ());
1888 const map_type& dstRowMap = * (this->graph_.getRowMap ());
1889 const map_type& srcColMap = * (src->graph_.getColMap ());
1890 const map_type& dstColMap = * (this->graph_.getColMap ());
1891 const bool canUseLocalColumnIndices = srcColMap.
locallySameAs (dstColMap);
1893 const size_t numPermute =
static_cast<size_t> (permuteFromLIDs.extent(0));
1895 std::ostringstream os;
1897 <<
"canUseLocalColumnIndices: "
1898 << (canUseLocalColumnIndices ?
"true" :
"false")
1899 <<
", numPermute: " << numPermute
1901 std::cerr << os.str ();
1904 const auto permuteToLIDsHost = permuteToLIDs.view_host();
1905 const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1907 if (canUseLocalColumnIndices) {
1909 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1910#ifdef HAVE_TPETRA_DEBUG
1913 invalidSrcCopyRows.insert (localRow);
1918 local_inds_host_view_type lclSrcCols;
1919 values_host_view_type vals;
1923 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1924 if (numEntries > 0) {
1926 if (err != numEntries) {
1929#ifdef HAVE_TPETRA_DEBUG
1930 invalidDstCopyRows.insert (localRow);
1939 for (LO k = 0; k < numEntries; ++k) {
1942#ifdef HAVE_TPETRA_DEBUG
1943 (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1953 for (
size_t k = 0; k < numPermute; ++k) {
1954 const LO srcLclRow =
static_cast<LO
> (permuteFromLIDsHost(k));
1955 const LO dstLclRow =
static_cast<LO
> (permuteToLIDsHost(k));
1957 local_inds_host_view_type lclSrcCols;
1958 values_host_view_type vals;
1960 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1961 if (numEntries > 0) {
1963 if (err != numEntries) {
1965#ifdef HAVE_TPETRA_DEBUG
1966 for (LO c = 0; c < numEntries; ++c) {
1968 invalidDstPermuteCols.insert (lclSrcCols[c]);
1978 const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries ();
1979 Teuchos::Array<LO> lclDstCols (maxNumEnt);
1982 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1983 local_inds_host_view_type lclSrcCols;
1984 values_host_view_type vals;
1990 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1991 }
catch (std::exception& e) {
1993 std::ostringstream os;
1994 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1995 os <<
"Proc " << myRank <<
": copyAndPermute: At \"same\" localRow "
1996 << localRow <<
", src->getLocalRowView() threw an exception: "
1998 std::cerr << os.str ();
2003 if (numEntries > 0) {
2004 if (
static_cast<size_t> (numEntries) >
static_cast<size_t> (lclDstCols.size ())) {
2007 std::ostringstream os;
2008 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2009 os <<
"Proc " << myRank <<
": copyAndPermute: At \"same\" localRow "
2010 << localRow <<
", numEntries = " << numEntries <<
" > maxNumEnt = "
2011 << maxNumEnt << endl;
2012 std::cerr << os.str ();
2018 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2019 for (LO j = 0; j < numEntries; ++j) {
2021 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2023#ifdef HAVE_TPETRA_DEBUG
2024 invalidDstCopyCols.insert (lclDstColsView[j]);
2031 }
catch (std::exception& e) {
2033 std::ostringstream os;
2034 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2035 os <<
"Proc " << myRank <<
": copyAndPermute: At \"same\" localRow "
2036 << localRow <<
", this->replaceLocalValues() threw an exception: "
2038 std::cerr << os.str ();
2042 if (err != numEntries) {
2045 std::ostringstream os;
2046 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2047 os <<
"Proc " << myRank <<
": copyAndPermute: At \"same\" "
2048 "localRow " << localRow <<
", this->replaceLocalValues "
2049 "returned " << err <<
" instead of numEntries = "
2050 << numEntries << endl;
2051 std::cerr << os.str ();
2059 for (
size_t k = 0; k < numPermute; ++k) {
2060 const LO srcLclRow =
static_cast<LO
> (permuteFromLIDsHost(k));
2061 const LO dstLclRow =
static_cast<LO
> (permuteToLIDsHost(k));
2063 local_inds_host_view_type lclSrcCols;
2064 values_host_view_type vals;
2068 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
2069 }
catch (std::exception& e) {
2071 std::ostringstream os;
2072 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2073 os <<
"Proc " << myRank <<
": copyAndPermute: At \"permute\" "
2074 "srcLclRow " << srcLclRow <<
" and dstLclRow " << dstLclRow
2075 <<
", src->getLocalRowView() threw an exception: " << e.what ();
2076 std::cerr << os.str ();
2081 if (numEntries > 0) {
2082 if (
static_cast<size_t> (numEntries) >
static_cast<size_t> (lclDstCols.size ())) {
2088 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2089 for (LO j = 0; j < numEntries; ++j) {
2091 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2093#ifdef HAVE_TPETRA_DEBUG
2094 invalidDstPermuteCols.insert (lclDstColsView[j]);
2099 if (err != numEntries) {
2108 std::ostream& err = this->markLocalErrorAndGetStream ();
2109#ifdef HAVE_TPETRA_DEBUG
2110 err <<
"copyAndPermute: The graph structure of the source of the "
2111 "Import or Export must be a subset of the graph structure of the "
2113 err <<
"invalidSrcCopyRows = [";
2114 for (
typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2115 it != invalidSrcCopyRows.end (); ++it) {
2117 typename std::set<LO>::const_iterator itp1 = it;
2119 if (itp1 != invalidSrcCopyRows.end ()) {
2123 err <<
"], invalidDstCopyRows = [";
2124 for (
typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2125 it != invalidDstCopyRows.end (); ++it) {
2127 typename std::set<LO>::const_iterator itp1 = it;
2129 if (itp1 != invalidDstCopyRows.end ()) {
2133 err <<
"], invalidDstCopyCols = [";
2134 for (
typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2135 it != invalidDstCopyCols.end (); ++it) {
2137 typename std::set<LO>::const_iterator itp1 = it;
2139 if (itp1 != invalidDstCopyCols.end ()) {
2143 err <<
"], invalidDstPermuteCols = [";
2144 for (
typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2145 it != invalidDstPermuteCols.end (); ++it) {
2147 typename std::set<LO>::const_iterator itp1 = it;
2149 if (itp1 != invalidDstPermuteCols.end ()) {
2153 err <<
"], invalidPermuteFromRows = [";
2154 for (
typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2155 it != invalidPermuteFromRows.end (); ++it) {
2157 typename std::set<LO>::const_iterator itp1 = it;
2159 if (itp1 != invalidPermuteFromRows.end ()) {
2165 err <<
"copyAndPermute: The graph structure of the source of the "
2166 "Import or Export must be a subset of the graph structure of the "
2172 std::ostringstream os;
2173 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2174 const bool lclSuccess = ! (* (this->localError_));
2175 os <<
"*** Proc " << myRank <<
": copyAndPermute "
2176 << (lclSuccess ?
"succeeded" :
"FAILED");
2182 std::cerr << os.str ();
2206 template<
class LO,
class GO>
2208 packRowCount (
const size_t numEnt,
2209 const size_t numBytesPerValue,
2210 const size_t blkSize)
2222 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2223 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2224 const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2225 return numEntLen + gidsLen + valsLen;
2239 template<
class ST,
class LO,
class GO>
2241 unpackRowCount (
const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
2242 const size_t offset,
2243 const size_t numBytes,
2246 using Kokkos::subview;
2247 using ::Tpetra::Details::PackTraits;
2249 if (numBytes == 0) {
2251 return static_cast<size_t> (0);
2255 const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
2256 TEUCHOS_TEST_FOR_EXCEPTION
2257 (theNumBytes > numBytes, std::logic_error,
"unpackRowCount: "
2258 "theNumBytes = " << theNumBytes <<
" < numBytes = " << numBytes
2260 const char*
const inBuf = imports.data () + offset;
2261 const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
2262 TEUCHOS_TEST_FOR_EXCEPTION
2263 (actualNumBytes > numBytes, std::logic_error,
"unpackRowCount: "
2264 "actualNumBytes = " << actualNumBytes <<
" < numBytes = " << numBytes
2266 return static_cast<size_t> (numEntLO);
2273 template<
class ST,
class LO,
class GO>
2275 packRowForBlockCrs (
const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
2276 const size_t offset,
2277 const size_t numEnt,
2278 const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
2279 const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
2280 const size_t numBytesPerValue,
2281 const size_t blockSize)
2283 using Kokkos::subview;
2284 using ::Tpetra::Details::PackTraits;
2290 const size_t numScalarEnt = numEnt * blockSize * blockSize;
2293 const LO numEntLO =
static_cast<size_t> (numEnt);
2295 const size_t numEntBeg = offset;
2296 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2297 const size_t gidsBeg = numEntBeg + numEntLen;
2298 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2299 const size_t valsBeg = gidsBeg + gidsLen;
2300 const size_t valsLen = numScalarEnt * numBytesPerValue;
2302 char*
const numEntOut = exports.data () + numEntBeg;
2303 char*
const gidsOut = exports.data () + gidsBeg;
2304 char*
const valsOut = exports.data () + valsBeg;
2306 size_t numBytesOut = 0;
2308 numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
2311 Kokkos::pair<int, size_t> p;
2312 p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
2313 errorCode += p.first;
2314 numBytesOut += p.second;
2316 p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
2317 errorCode += p.first;
2318 numBytesOut += p.second;
2321 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2322 TEUCHOS_TEST_FOR_EXCEPTION
2323 (numBytesOut != expectedNumBytes, std::logic_error,
2324 "packRowForBlockCrs: numBytesOut = " << numBytesOut
2325 <<
" != expectedNumBytes = " << expectedNumBytes <<
".");
2327 TEUCHOS_TEST_FOR_EXCEPTION
2328 (errorCode != 0, std::runtime_error,
"packRowForBlockCrs: "
2329 "PackTraits::packArray returned a nonzero error code");
2335 template<
class ST,
class LO,
class GO>
2337 unpackRowForBlockCrs (
const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
2338 const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
2339 const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
2340 const size_t offset,
2341 const size_t numBytes,
2342 const size_t numEnt,
2343 const size_t numBytesPerValue,
2344 const size_t blockSize)
2346 using ::Tpetra::Details::PackTraits;
2348 if (numBytes == 0) {
2352 const size_t numScalarEnt = numEnt * blockSize * blockSize;
2353 TEUCHOS_TEST_FOR_EXCEPTION
2354 (
static_cast<size_t> (imports.extent (0)) <= offset,
2355 std::logic_error,
"unpackRowForBlockCrs: imports.extent(0) = "
2356 << imports.extent (0) <<
" <= offset = " << offset <<
".");
2357 TEUCHOS_TEST_FOR_EXCEPTION
2358 (
static_cast<size_t> (imports.extent (0)) < offset + numBytes,
2359 std::logic_error,
"unpackRowForBlockCrs: imports.extent(0) = "
2360 << imports.extent (0) <<
" < offset + numBytes = "
2361 << (offset + numBytes) <<
".");
2366 const size_t numEntBeg = offset;
2367 const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
2368 const size_t gidsBeg = numEntBeg + numEntLen;
2369 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2370 const size_t valsBeg = gidsBeg + gidsLen;
2371 const size_t valsLen = numScalarEnt * numBytesPerValue;
2373 const char*
const numEntIn = imports.data () + numEntBeg;
2374 const char*
const gidsIn = imports.data () + gidsBeg;
2375 const char*
const valsIn = imports.data () + valsBeg;
2377 size_t numBytesOut = 0;
2380 numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
2381 TEUCHOS_TEST_FOR_EXCEPTION
2382 (
static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2383 "unpackRowForBlockCrs: Expected number of entries " << numEnt
2384 <<
" != actual number of entries " << numEntOut <<
".");
2387 Kokkos::pair<int, size_t> p;
2388 p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
2389 errorCode += p.first;
2390 numBytesOut += p.second;
2392 p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
2393 errorCode += p.first;
2394 numBytesOut += p.second;
2397 TEUCHOS_TEST_FOR_EXCEPTION
2398 (numBytesOut != numBytes, std::logic_error,
2399 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2400 <<
" != numBytes = " << numBytes <<
".");
2402 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2403 TEUCHOS_TEST_FOR_EXCEPTION
2404 (numBytesOut != expectedNumBytes, std::logic_error,
2405 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2406 <<
" != expectedNumBytes = " << expectedNumBytes <<
".");
2408 TEUCHOS_TEST_FOR_EXCEPTION
2409 (errorCode != 0, std::runtime_error,
"unpackRowForBlockCrs: "
2410 "PackTraits::unpackArray returned a nonzero error code");
2416 template<
class Scalar,
class LO,
class GO,
class Node>
2420 (const ::Tpetra::SrcDistObject& source,
2425 Kokkos::DualView<
size_t*,
2427 size_t& constantNumPackets)
2434 typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
2438 ProfilingRegion profile_region(
"Tpetra::BlockCrsMatrix::packAndPrepare");
2440 const bool debug = Behavior::debug();
2441 const bool verbose = Behavior::verbose();
2446 std::ostringstream os;
2447 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2448 os <<
"Proc " << myRank <<
": BlockCrsMatrix::packAndPrepare: " << std::endl;
2453 if (* (this->localError_)) {
2454 std::ostream& err = this->markLocalErrorAndGetStream ();
2456 <<
"The target object of the Import or Export is already in an error state."
2465 std::ostringstream os;
2466 os << prefix << std::endl
2467 << prefix <<
" " << dualViewStatusToString (exportLIDs,
"exportLIDs") << std::endl
2468 << prefix <<
" " << dualViewStatusToString (exports,
"exports") << std::endl
2469 << prefix <<
" " << dualViewStatusToString (numPacketsPerLID,
"numPacketsPerLID") << std::endl;
2470 std::cerr << os.str ();
2476 if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2477 std::ostream& err = this->markLocalErrorAndGetStream ();
2479 <<
"exportLIDs.extent(0) = " << exportLIDs.extent (0)
2480 <<
" != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2481 <<
"." << std::endl;
2484 if (exportLIDs.need_sync_host ()) {
2485 std::ostream& err = this->markLocalErrorAndGetStream ();
2486 err << prefix <<
"exportLIDs be sync'd to host." << std::endl;
2490 const this_BCRS_type* src =
dynamic_cast<const this_BCRS_type*
> (&source);
2492 std::ostream& err = this->markLocalErrorAndGetStream ();
2494 <<
"The source (input) object of the Import or "
2495 "Export is either not a BlockCrsMatrix, or does not have the right "
2496 "template parameters. checkSizes() should have caught this. "
2497 "Please report this bug to the Tpetra developers." << std::endl;
2509 constantNumPackets = 0;
2513 const size_t blockSize =
static_cast<size_t> (src->getBlockSize ());
2514 const size_t numExportLIDs = exportLIDs.extent (0);
2515 size_t numBytesPerValue(0);
2517 auto val_host = val_.getHostView(Access::ReadOnly);
2519 PackTraits<impl_scalar_type>
2527 Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2530 auto exportLIDsHost = exportLIDs.view_host();
2531 auto numPacketsPerLIDHost = numPacketsPerLID.view_host();
2532 numPacketsPerLID.modify_host ();
2534 using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2535 const auto policy = Kokkos::RangePolicy<host_exec>(
size_t(0), numExportLIDs);
2536 Kokkos::parallel_reduce
2538 [=](
const size_t &i,
typename reducer_type::value_type &update) {
2539 const LO lclRow = exportLIDsHost(i);
2541 numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2543 const size_t numBytes =
2544 packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2545 numPacketsPerLIDHost(i) = numBytes;
2546 update +=
typename reducer_type::value_type(numEnt, numBytes, numEnt);
2547 }, rowReducerStruct);
2553 const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2554 const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2555 const size_t maxRowLength = rowReducerStruct.maxRowLength;
2558 std::ostringstream os;
2560 <<
"totalNumBytes = " << totalNumBytes <<
", totalNumEntries = " << totalNumEntries
2562 std::cerr << os.str ();
2568 if(exports.extent(0) != totalNumBytes)
2570 const std::string oldLabel = exports.d_view.label ();
2571 const std::string newLabel = (oldLabel ==
"") ?
"exports" : oldLabel;
2572 exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2574 if (totalNumEntries > 0) {
2576 Kokkos::View<size_t*, host_exec> offset(
"offset", numExportLIDs+1);
2578 const auto policy = Kokkos::RangePolicy<host_exec>(
size_t(0), numExportLIDs+1);
2579 Kokkos::parallel_scan
2581 [=](
const size_t &i,
size_t &update,
const bool &
final) {
2582 if (
final) offset(i) = update;
2583 update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2586 if (offset(numExportLIDs) != totalNumBytes) {
2587 std::ostream& err = this->markLocalErrorAndGetStream ();
2589 <<
"At end of method, the final offset (in bytes) "
2590 << offset(numExportLIDs) <<
" does not equal the total number of bytes packed "
2591 << totalNumBytes <<
". "
2592 <<
"Please report this bug to the Tpetra developers." << std::endl;
2606 exports.modify_host();
2608 typedef Kokkos::TeamPolicy<host_exec> policy_type;
2610 policy_type(numExportLIDs, 1, 1)
2611 .set_scratch_size(0, Kokkos::PerTeam(
sizeof(GO)*maxRowLength));
2616 Kokkos::parallel_for
2618 [=](
const typename policy_type::member_type &member) {
2619 const size_t i = member.league_rank();
2620 Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2621 gblColInds(member.team_scratch(0), maxRowLength);
2623 const LO lclRowInd = exportLIDsHost(i);
2624 local_inds_host_view_type lclColInds;
2625 values_host_view_type vals;
2630 src->getLocalRowView (lclRowInd, lclColInds, vals);
2631 const size_t numEnt = lclColInds.extent(0);
2634 for (
size_t j = 0; j < numEnt; ++j)
2641 const size_t numBytes =
2642 packRowForBlockCrs<impl_scalar_type, LO, GO>
2643 (exports.view_host(),
2646 Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2653 const size_t offsetDiff = offset(i+1) - offset(i);
2654 if (numBytes != offsetDiff) {
2655 std::ostringstream os;
2657 <<
"numBytes computed from packRowForBlockCrs is different from "
2658 <<
"precomputed offset values, LID = " << i << std::endl;
2659 std::cerr << os.str ();
2668 std::ostringstream os;
2669 const bool lclSuccess = ! (* (this->localError_));
2671 << (lclSuccess ?
"succeeded" :
"FAILED")
2673 std::cerr << os.str ();
2677 template<
class Scalar,
class LO,
class GO,
class Node>
2685 Kokkos::DualView<
size_t*,
2696 typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2698 ProfilingRegion profile_region(
"Tpetra::BlockCrsMatrix::unpackAndCombine");
2699 const bool verbose = Behavior::verbose ();
2704 std::ostringstream os;
2705 auto map = this->graph_.getRowMap ();
2706 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2707 const int myRank = comm.is_null () ? -1 : comm->getRank ();
2708 os <<
"Proc " << myRank <<
": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2711 os <<
"Start" << endl;
2712 std::cerr << os.str ();
2717 if (* (this->localError_)) {
2718 std::ostream& err = this->markLocalErrorAndGetStream ();
2719 std::ostringstream os;
2720 os << prefix <<
"{Im/Ex}port target is already in error." << endl;
2722 std::cerr << os.str ();
2731 if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2732 std::ostream& err = this->markLocalErrorAndGetStream ();
2733 std::ostringstream os;
2734 os << prefix <<
"importLIDs.extent(0) = " << importLIDs.extent (0)
2735 <<
" != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2738 std::cerr << os.str ();
2744 if (combineMode !=
ADD && combineMode !=
INSERT &&
2746 combineMode !=
ZERO) {
2747 std::ostream& err = this->markLocalErrorAndGetStream ();
2748 std::ostringstream os;
2750 <<
"Invalid CombineMode value " << combineMode <<
". Valid "
2751 <<
"values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2754 std::cerr << os.str ();
2760 if (this->graph_.getColMap ().is_null ()) {
2761 std::ostream& err = this->markLocalErrorAndGetStream ();
2762 std::ostringstream os;
2763 os << prefix <<
"Target matrix's column Map is null." << endl;
2765 std::cerr << os.str ();
2774 const map_type& tgtColMap = * (this->graph_.getColMap ());
2778 const size_t numImportLIDs = importLIDs.extent(0);
2785 size_t numBytesPerValue(0);
2787 auto val_host = val_.getHostView(Access::ReadOnly);
2789 PackTraits<impl_scalar_type>::packValueCount
2792 const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries ();
2793 const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2796 std::ostringstream os;
2797 os << prefix <<
"combineMode: "
2799 <<
", blockSize: " << blockSize
2800 <<
", numImportLIDs: " << numImportLIDs
2801 <<
", numBytesPerValue: " << numBytesPerValue
2802 <<
", maxRowNumEnt: " << maxRowNumEnt
2803 <<
", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2804 std::cerr << os.str ();
2807 if (combineMode ==
ZERO || numImportLIDs == 0) {
2809 std::ostringstream os;
2810 os << prefix <<
"Nothing to unpack. Done!" << std::endl;
2811 std::cerr << os.str ();
2818 if (imports.need_sync_host ()) {
2819 imports.sync_host ();
2829 if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2830 importLIDs.need_sync_host ()) {
2831 std::ostream& err = this->markLocalErrorAndGetStream ();
2832 std::ostringstream os;
2833 os << prefix <<
"All input DualViews must be sync'd to host by now. "
2834 <<
"imports_nc.need_sync_host()="
2835 << (imports.need_sync_host () ?
"true" :
"false")
2836 <<
", numPacketsPerLID.need_sync_host()="
2837 << (numPacketsPerLID.need_sync_host () ?
"true" :
"false")
2838 <<
", importLIDs.need_sync_host()="
2839 << (importLIDs.need_sync_host () ?
"true" :
"false")
2842 std::cerr << os.str ();
2848 const auto importLIDsHost = importLIDs.view_host ();
2849 const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2855 Kokkos::View<size_t*, host_exec> offset (
"offset", numImportLIDs+1);
2857 const auto policy = Kokkos::RangePolicy<host_exec>(
size_t(0), numImportLIDs+1);
2858 Kokkos::parallel_scan
2859 (
"Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2860 [=] (
const size_t &i,
size_t &update,
const bool &
final) {
2861 if (
final) offset(i) = update;
2862 update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2872 Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2873 errorDuringUnpack (
"errorDuringUnpack");
2874 errorDuringUnpack () = 0;
2876 using policy_type = Kokkos::TeamPolicy<host_exec>;
2877 size_t scratch_per_row =
sizeof(GO) * maxRowNumEnt +
sizeof (LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt
2880 const auto policy = policy_type (numImportLIDs, 1, 1)
2881 .set_scratch_size (0, Kokkos::PerTeam (scratch_per_row));
2882 using host_scratch_space =
typename host_exec::scratch_memory_space;
2884 using pair_type = Kokkos::pair<size_t, size_t>;
2889 Kokkos::parallel_for
2890 (
"Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2891 [=] (
const typename policy_type::member_type& member) {
2892 const size_t i = member.league_rank();
2893 Kokkos::View<GO*, host_scratch_space> gblColInds
2894 (member.team_scratch (0), maxRowNumEnt);
2895 Kokkos::View<LO*, host_scratch_space> lclColInds
2896 (member.team_scratch (0), maxRowNumEnt);
2897 Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2898 (member.team_scratch (0), maxRowNumScalarEnt);
2901 const size_t offval = offset(i);
2902 const LO lclRow = importLIDsHost(i);
2903 const size_t numBytes = numPacketsPerLIDHost(i);
2904 const size_t numEnt =
2905 unpackRowCount<impl_scalar_type, LO, GO>
2906 (imports.view_host (), offval, numBytes, numBytesPerValue);
2909 if (numEnt > maxRowNumEnt) {
2910 errorDuringUnpack() = 1;
2912 std::ostringstream os;
2914 <<
"At i = " << i <<
", numEnt = " << numEnt
2915 <<
" > maxRowNumEnt = " << maxRowNumEnt
2917 std::cerr << os.str ();
2922 const size_t numScalarEnt = numEnt*blockSize*blockSize;
2923 auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2924 auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2925 auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2930 size_t numBytesOut = 0;
2933 unpackRowForBlockCrs<impl_scalar_type, LO, GO>
2934 (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2935 Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2936 imports.view_host(),
2937 offval, numBytes, numEnt,
2938 numBytesPerValue, blockSize);
2940 catch (std::exception& e) {
2941 errorDuringUnpack() = 1;
2943 std::ostringstream os;
2944 os << prefix <<
"At i = " << i <<
", unpackRowForBlockCrs threw: "
2945 << e.what () << endl;
2946 std::cerr << os.str ();
2951 if (numBytes != numBytesOut) {
2952 errorDuringUnpack() = 1;
2954 std::ostringstream os;
2956 <<
"At i = " << i <<
", numBytes = " << numBytes
2957 <<
" != numBytesOut = " << numBytesOut <<
"."
2959 std::cerr << os.str();
2965 for (
size_t k = 0; k < numEnt; ++k) {
2967 if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2969 std::ostringstream os;
2971 <<
"At i = " << i <<
", GID " << gidsOut(k)
2972 <<
" is not owned by the calling process."
2974 std::cerr << os.str();
2982 const LO*
const lidsRaw =
const_cast<const LO*
> (lidsOut.data ());
2983 const Scalar*
const valsRaw =
reinterpret_cast<const Scalar*
>
2985 if (combineMode ==
ADD) {
2987 }
else if (combineMode ==
INSERT || combineMode ==
REPLACE) {
2989 }
else if (combineMode ==
ABSMAX) {
2993 if (
static_cast<LO
> (numEnt) != numCombd) {
2994 errorDuringUnpack() = 1;
2996 std::ostringstream os;
2997 os << prefix <<
"At i = " << i <<
", numEnt = " << numEnt
2998 <<
" != numCombd = " << numCombd <<
"."
3000 std::cerr << os.str ();
3008 if (errorDuringUnpack () != 0) {
3009 std::ostream& err = this->markLocalErrorAndGetStream ();
3010 err << prefix <<
"Unpacking failed.";
3012 err <<
" Please run again with the environment variable setting "
3013 "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
3019 std::ostringstream os;
3020 const bool lclSuccess = ! (* (this->localError_));
3022 << (lclSuccess ?
"succeeded" :
"FAILED")
3024 std::cerr << os.str ();
3028 template<
class Scalar,
class LO,
class GO,
class Node>
3032 using Teuchos::TypeNameTraits;
3033 std::ostringstream os;
3034 os <<
"\"Tpetra::BlockCrsMatrix\": { "
3035 <<
"Template parameters: { "
3036 <<
"Scalar: " << TypeNameTraits<Scalar>::name ()
3037 <<
"LO: " << TypeNameTraits<LO>::name ()
3038 <<
"GO: " << TypeNameTraits<GO>::name ()
3039 <<
"Node: " << TypeNameTraits<Node>::name ()
3041 <<
", Label: \"" << this->getObjectLabel () <<
"\""
3042 <<
", Global dimensions: ["
3043 << graph_.getDomainMap ()->getGlobalNumElements () <<
", "
3044 << graph_.getRangeMap ()->getGlobalNumElements () <<
"]"
3051 template<
class Scalar,
class LO,
class GO,
class Node>
3054 describe (Teuchos::FancyOStream& out,
3055 const Teuchos::EVerbosityLevel verbLevel)
const
3057 using Teuchos::ArrayRCP;
3058 using Teuchos::CommRequest;
3059 using Teuchos::FancyOStream;
3060 using Teuchos::getFancyOStream;
3061 using Teuchos::ireceive;
3062 using Teuchos::isend;
3063 using Teuchos::outArg;
3064 using Teuchos::TypeNameTraits;
3065 using Teuchos::VERB_DEFAULT;
3066 using Teuchos::VERB_NONE;
3067 using Teuchos::VERB_LOW;
3068 using Teuchos::VERB_MEDIUM;
3070 using Teuchos::VERB_EXTREME;
3072 using Teuchos::wait;
3076 const Teuchos::EVerbosityLevel vl =
3077 (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3079 if (vl == VERB_NONE) {
3084 Teuchos::OSTab tab0 (out);
3086 out <<
"\"Tpetra::BlockCrsMatrix\":" << endl;
3087 Teuchos::OSTab tab1 (out);
3089 out <<
"Template parameters:" << endl;
3091 Teuchos::OSTab tab2 (out);
3092 out <<
"Scalar: " << TypeNameTraits<Scalar>::name () << endl
3093 <<
"LO: " << TypeNameTraits<LO>::name () << endl
3094 <<
"GO: " << TypeNameTraits<GO>::name () << endl
3095 <<
"Node: " << TypeNameTraits<Node>::name () << endl;
3097 out <<
"Label: \"" << this->getObjectLabel () <<
"\"" << endl
3098 <<
"Global dimensions: ["
3099 << graph_.getDomainMap ()->getGlobalNumElements () <<
", "
3100 << graph_.getRangeMap ()->getGlobalNumElements () <<
"]" << endl;
3103 out <<
"Block size: " << blockSize << endl;
3106 if (vl >= VERB_MEDIUM) {
3107 const Teuchos::Comm<int>& comm = * (graph_.getMap ()->
getComm ());
3108 const int myRank = comm.getRank ();
3110 out <<
"Row Map:" << endl;
3117 out <<
"Column Map: same as row Map" << endl;
3122 out <<
"Column Map:" << endl;
3130 out <<
"Domain Map: same as row Map" << endl;
3135 out <<
"Domain Map: same as column Map" << endl;
3140 out <<
"Domain Map:" << endl;
3148 out <<
"Range Map: same as domain Map" << endl;
3153 out <<
"Range Map: same as row Map" << endl;
3158 out <<
"Range Map: " << endl;
3165 if (vl >= VERB_EXTREME) {
3166 const Teuchos::Comm<int>& comm = * (graph_.getMap ()->
getComm ());
3167 const int myRank = comm.getRank ();
3168 const int numProcs = comm.getSize ();
3171 RCP<std::ostringstream> lclOutStrPtr (
new std::ostringstream ());
3172 RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3173 FancyOStream& os = *osPtr;
3174 os <<
"Process " << myRank <<
":" << endl;
3175 Teuchos::OSTab tab2 (os);
3177 const map_type& meshRowMap = * (graph_.getRowMap ());
3178 const map_type& meshColMap = * (graph_.getColMap ());
3183 os <<
"Row " << meshGblRow <<
": {";
3185 local_inds_host_view_type lclColInds;
3186 values_host_view_type vals;
3188 this->
getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
3190 for (LO k = 0; k < numInds; ++k) {
3193 os <<
"Col " << gblCol <<
": [";
3194 for (LO i = 0; i < blockSize; ++i) {
3195 for (LO j = 0; j < blockSize; ++j) {
3196 os << vals[blockSize*blockSize*k + i*blockSize + j];
3197 if (j + 1 < blockSize) {
3201 if (i + 1 < blockSize) {
3206 if (k + 1 < numInds) {
3216 out << lclOutStrPtr->str ();
3217 lclOutStrPtr = Teuchos::null;
3220 const int sizeTag = 1337;
3221 const int dataTag = 1338;
3223 ArrayRCP<char> recvDataBuf;
3227 for (
int p = 1; p < numProcs; ++p) {
3230 ArrayRCP<size_t> recvSize (1);
3232 RCP<CommRequest<int> > recvSizeReq =
3233 ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3234 wait<int> (comm, outArg (recvSizeReq));
3235 const size_t numCharsToRecv = recvSize[0];
3242 if (
static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3243 recvDataBuf.resize (numCharsToRecv + 1);
3245 ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3247 RCP<CommRequest<int> > recvDataReq =
3248 ireceive<int, char> (recvData, p, dataTag, comm);
3249 wait<int> (comm, outArg (recvDataReq));
3254 recvDataBuf[numCharsToRecv] =
'\0';
3255 out << recvDataBuf.getRawPtr ();
3257 else if (myRank == p) {
3261 const std::string stringToSend = lclOutStrPtr->str ();
3262 lclOutStrPtr = Teuchos::null;
3265 const size_t numCharsToSend = stringToSend.size ();
3266 ArrayRCP<size_t> sendSize (1);
3267 sendSize[0] = numCharsToSend;
3268 RCP<CommRequest<int> > sendSizeReq =
3269 isend<int, size_t> (sendSize, 0, sizeTag, comm);
3270 wait<int> (comm, outArg (sendSizeReq));
3278 ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend,
false);
3279 RCP<CommRequest<int> > sendDataReq =
3280 isend<int, char> (sendData, 0, dataTag, comm);
3281 wait<int> (comm, outArg (sendDataReq));
3287 template<
class Scalar,
class LO,
class GO,
class Node>
3288 Teuchos::RCP<const Teuchos::Comm<int> >
3292 return graph_.getComm();
3296 template<
class Scalar,
class LO,
class GO,
class Node>
3301 return graph_.getGlobalNumCols();
3305 template<
class Scalar,
class LO,
class GO,
class Node>
3310 return graph_.getLocalNumCols();
3313 template<
class Scalar,
class LO,
class GO,
class Node>
3318 return graph_.getIndexBase();
3321 template<
class Scalar,
class LO,
class GO,
class Node>
3326 return graph_.getGlobalNumEntries();
3329 template<
class Scalar,
class LO,
class GO,
class Node>
3334 return graph_.getLocalNumEntries();
3337 template<
class Scalar,
class LO,
class GO,
class Node>
3342 return graph_.getNumEntriesInGlobalRow(globalRow);
3346 template<
class Scalar,
class LO,
class GO,
class Node>
3351 return graph_.getGlobalMaxNumRowEntries();
3354 template<
class Scalar,
class LO,
class GO,
class Node>
3359 return graph_.hasColMap();
3363 template<
class Scalar,
class LO,
class GO,
class Node>
3368 return graph_.isLocallyIndexed();
3371 template<
class Scalar,
class LO,
class GO,
class Node>
3376 return graph_.isGloballyIndexed();
3379 template<
class Scalar,
class LO,
class GO,
class Node>
3384 return graph_.isFillComplete ();
3387 template<
class Scalar,
class LO,
class GO,
class Node>
3395 template<
class Scalar,
class LO,
class GO,
class Node>
3399 nonconst_global_inds_host_view_type &,
3400 nonconst_values_host_view_type &,
3403 TEUCHOS_TEST_FOR_EXCEPTION(
3404 true, std::logic_error,
"Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
3405 "This class doesn't support global matrix indexing.");
3409 template<
class Scalar,
class LO,
class GO,
class Node>
3413 global_inds_host_view_type &,
3414 values_host_view_type &)
const
3416 TEUCHOS_TEST_FOR_EXCEPTION(
3417 true, std::logic_error,
"Tpetra::BlockCrsMatrix::getGlobalRowView: "
3418 "This class doesn't support global matrix indexing.");
3422 template<
class Scalar,
class LO,
class GO,
class Node>
3426 local_inds_host_view_type &colInds,
3427 values_host_view_type &vals)
const
3429 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3430 colInds = local_inds_host_view_type();
3431 vals = values_host_view_type();
3434 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3435 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3436 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3438 vals = getValuesHost (localRowInd);
3442 template<
class Scalar,
class LO,
class GO,
class Node>
3446 local_inds_host_view_type &colInds,
3447 nonconst_values_host_view_type &vals)
const
3449 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3450 colInds = nonconst_local_inds_host_view_type();
3451 vals = nonconst_values_host_view_type();
3454 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3455 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3456 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3459 vals =
const_cast<this_BCRS_type&
>(*this).getValuesHostNonConst(localRowInd);
3463 template<
class Scalar,
class LO,
class GO,
class Node>
3468 const size_t lclNumMeshRows = graph_.getLocalNumRows ();
3470 Kokkos::View<size_t*, device_type> diagOffsets (
"diagOffsets", lclNumMeshRows);
3471 graph_.getLocalDiagOffsets (diagOffsets);
3474 auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3478 auto vals_host_out = val_.getHostView(Access::ReadOnly);
3482 size_t rowOffset = 0;
3488 offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3489 for(
int b=0; b<bs; b++)
3500 template<
class Scalar,
class LO,
class GO,
class Node>
3503 leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& )
3505 TEUCHOS_TEST_FOR_EXCEPTION(
3506 true, std::logic_error,
"Tpetra::BlockCrsMatrix::leftScale: "
3507 "not implemented.");
3511 template<
class Scalar,
class LO,
class GO,
class Node>
3514 rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& )
3516 TEUCHOS_TEST_FOR_EXCEPTION(
3517 true, std::logic_error,
"Tpetra::BlockCrsMatrix::rightScale: "
3518 "not implemented.");
3522 template<
class Scalar,
class LO,
class GO,
class Node>
3523 Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3530 template<
class Scalar,
class LO,
class GO,
class Node>
3531 typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3535 TEUCHOS_TEST_FOR_EXCEPTION(
3536 true, std::logic_error,
"Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3537 "not implemented.");
3547#define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3548 template class BlockCrsMatrix< S, LO, GO, NODE >;
Linear algebra kernels for small dense matrices and vectors.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual LO getBlockSize() const override
The number of degrees of freedom per mesh point.
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix's communicator.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
std::string errorMessages() const
The current stream of error messages.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
LO local_ordinal_type
The type of local indices.
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row's entries.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
::Tpetra::MultiVector< Scalar, LO, GO, node_type > mv_type
The implementation of MultiVector that this class uses.
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
size_t getLocalNumRows() const override
get the local number of block rows
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix's values (val_).
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
local_matrix_device_type getLocalMatrixDevice() const
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
std::string description() const override
One-line description of this object.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row.
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
Kokkos::View< const impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
char packet_type
Implementation detail; tells.
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
::Tpetra::Map< LO, GO, node_type > map_type
The implementation of Map that this class uses.
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
::Tpetra::CrsGraph< LO, GO, node_type > crs_graph_type
The implementation of CrsGraph that this class uses.
global_size_t getGlobalNumRows() const override
get the global number of block rows
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
virtual bool checkSizes(const ::Tpetra::SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
MultiVector for multiple degrees of freedom per mesh point.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
Tpetra::Map< LO, GO, Node > map_type
The specialization of Tpetra::Map that this class uses.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
::Tpetra::Import< LO, GO, node_type > import_type
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Description of Tpetra's behavior.
virtual Teuchos::RCP< const map_type > getMap() const
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
A distributed dense vector.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
void disableWDVTracking()
Disable WrappedDualView reference-count tracking and syncing. Call this before entering a host-parall...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void enableWDVTracking()
Enable WrappedDualView reference-count tracking and syncing. Call this after exiting a host-parallel ...
Namespace for new Tpetra features that are not ready for public release, but are ready for evaluation...
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra's ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
Teuchos::RCP< CrsGraphType > exportAndFillCompleteCrsGraph(const Teuchos::RCP< const CrsGraphType > &sourceGraph, const Export< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > &exporter, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Nonmember CrsGraph constructor that fuses Export and fillComplete().
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
Teuchos::RCP< CrsGraphType > importAndFillCompleteCrsGraph(const Teuchos::RCP< const CrsGraphType > &sourceGraph, const Import< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > &importer, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Nonmember CrsGraph constructor that fuses Import and fillComplete().
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.
Traits class for packing / unpacking data of type T.