Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// ************************************************************************
38// @HEADER
39
40// clang-format off
41#ifndef TPETRA_MULTIVECTOR_DEF_HPP
42#define TPETRA_MULTIVECTOR_DEF_HPP
43
51
52#include "Tpetra_Core.hpp"
53#include "Tpetra_Util.hpp"
54#include "Tpetra_Vector.hpp"
66#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
67# include "Teuchos_SerialDenseMatrix.hpp"
68#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
69#include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
70#include "KokkosCompat_View.hpp"
71#include "KokkosBlas.hpp"
72#include "KokkosKernels_Utils.hpp"
73#include "Kokkos_Random.hpp"
74#include "Kokkos_ArithTraits.hpp"
75#include <memory>
76#include <sstream>
77
78#ifdef HAVE_TPETRA_INST_FLOAT128
79namespace Kokkos {
80 // FIXME (mfh 04 Sep 2015) Just a stub for now!
81 template<class Generator>
82 struct rand<Generator, __float128> {
83 static KOKKOS_INLINE_FUNCTION __float128 max ()
84 {
85 return static_cast<__float128> (1.0);
86 }
87 static KOKKOS_INLINE_FUNCTION __float128
88 draw (Generator& gen)
89 {
90 // Half the smallest normalized double, is the scaling factor of
91 // the lower-order term in the double-double representation.
92 const __float128 scalingFactor =
93 static_cast<__float128> (std::numeric_limits<double>::min ()) /
94 static_cast<__float128> (2.0);
95 const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
96 const __float128 lowerOrderTerm =
97 static_cast<__float128> (gen.drand ()) * scalingFactor;
98 return higherOrderTerm + lowerOrderTerm;
99 }
100 static KOKKOS_INLINE_FUNCTION __float128
101 draw (Generator& gen, const __float128& range)
102 {
103 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
104 const __float128 scalingFactor =
105 static_cast<__float128> (std::numeric_limits<double>::min ()) /
106 static_cast<__float128> (2.0);
107 const __float128 higherOrderTerm =
108 static_cast<__float128> (gen.drand (range));
109 const __float128 lowerOrderTerm =
110 static_cast<__float128> (gen.drand (range)) * scalingFactor;
111 return higherOrderTerm + lowerOrderTerm;
112 }
113 static KOKKOS_INLINE_FUNCTION __float128
114 draw (Generator& gen, const __float128& start, const __float128& end)
115 {
116 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
117 const __float128 scalingFactor =
118 static_cast<__float128> (std::numeric_limits<double>::min ()) /
119 static_cast<__float128> (2.0);
120 const __float128 higherOrderTerm =
121 static_cast<__float128> (gen.drand (start, end));
122 const __float128 lowerOrderTerm =
123 static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
124 return higherOrderTerm + lowerOrderTerm;
125 }
126 };
127} // namespace Kokkos
128#endif // HAVE_TPETRA_INST_FLOAT128
129
130namespace { // (anonymous)
131
138 /// "Local" means "local to the calling MPI process."
139 /// \param numCols [in] Number of columns in the DualView.
140 /// \param zeroOut [in] Whether to initialize all the entries of the
141 /// DualView to zero. Kokkos does first-touch initialization.
142 /// \param allowPadding [in] Whether to give Kokkos the option to
143 /// pad Views for alignment.
144 ///
145 /// \return The allocated Kokkos::DualView.
146 template<class ST, class LO, class GO, class NT>
147 typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type
148 allocDualView (const size_t lclNumRows,
149 const size_t numCols,
150 const bool zeroOut = true,
151 const bool allowPadding = false)
152 {
154 using Kokkos::AllowPadding;
155 using Kokkos::view_alloc;
156 using Kokkos::WithoutInitializing;
158 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type wrapped_dual_view_type;
159 typedef typename dual_view_type::t_dev dev_view_type;
160
161 // This needs to be a string and not a char*, if given as an
162 // argument to Kokkos::view_alloc. This is because view_alloc
163 // also allows a raw pointer as its first argument. See
164 // https://github.com/kokkos/kokkos/issues/434.
165 const std::string label ("MV::DualView");
166 const bool debug = Behavior::debug ();
167
168 // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
169 // creation of the DualView's Views works around
170 // Kokkos::DualView's current inability to accept an
171 // AllocationProperties initial argument (as Kokkos::View does).
172 // However, the work-around is harmless, since it does what the
173 // (currently nonexistent) equivalent DualView constructor would
174 // have done anyway.
175
176 dev_view_type d_view;
177 if (zeroOut) {
178 if (allowPadding) {
179 d_view = dev_view_type (view_alloc (label, AllowPadding),
180 lclNumRows, numCols);
181 }
182 else {
183 d_view = dev_view_type (view_alloc (label),
184 lclNumRows, numCols);
185 }
186 }
187 else {
188 if (allowPadding) {
189 d_view = dev_view_type (view_alloc (label,
190 WithoutInitializing,
191 AllowPadding),
192 lclNumRows, numCols);
193 }
194 else {
195 d_view = dev_view_type (view_alloc (label, WithoutInitializing),
196 lclNumRows, numCols);
197 }
198 if (debug) {
199 // Filling with NaN is a cheap and effective way to tell if
200 // downstream code is trying to use a MultiVector's data
201 // without them having been initialized. ArithTraits lets us
202 // call nan() even if the scalar type doesn't define it; it
203 // just returns some undefined value in the latter case. This
204 // won't hurt anything because by setting zeroOut=false, users
205 // already agreed that they don't care about the contents of
206 // the MultiVector.
207 const ST nan = Kokkos::ArithTraits<ST>::nan ();
208 KokkosBlas::fill (d_view, nan);
209 }
210 }
211 if (debug) {
212 TEUCHOS_TEST_FOR_EXCEPTION
213 (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
214 static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
215 "allocDualView: d_view's dimensions actual dimensions do not match "
216 "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
217 d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
218 << ". Please report this bug to the Tpetra developers.");
219 }
220
221 return wrapped_dual_view_type(d_view);
222 }
223
224 // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
225 //
226 // T: The type of the entries of the View.
227 // ExecSpace: The Kokkos execution space.
228 template<class T, class ExecSpace>
229 struct MakeUnmanagedView {
230 // The 'false' part of the branch carefully ensures that this
231 // won't attempt to use a host execution space that hasn't been
232 // initialized. For example, if Kokkos::OpenMP is disabled and
233 // Kokkos::Threads is enabled, the latter is always the default
234 // execution space of Kokkos::HostSpace, even when ExecSpace is
235 // Kokkos::Serial. That's why we go through the trouble of asking
236 // Kokkos::DualView what _its_ space is. That seems to work
237 // around this default execution space issue.
238 //
239 typedef typename std::conditional<
240 Kokkos::SpaceAccessibility<
241 typename ExecSpace::memory_space,
242 Kokkos::HostSpace>::accessible,
243 typename ExecSpace::device_type,
244 typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
245 typedef Kokkos::LayoutLeft array_layout;
246 typedef Kokkos::View<T*, array_layout, host_exec_space,
247 Kokkos::MemoryUnmanaged> view_type;
248
249 static view_type getView (const Teuchos::ArrayView<T>& x_in)
250 {
251 const size_t numEnt = static_cast<size_t> (x_in.size ());
252 if (numEnt == 0) {
253 return view_type ();
254 } else {
255 return view_type (x_in.getRawPtr (), numEnt);
256 }
257 }
258 };
259
260
261 template<class WrappedDualViewType>
262 WrappedDualViewType
263 takeSubview (const WrappedDualViewType& X,
264 const std::pair<size_t, size_t>& rowRng,
265 const Kokkos::ALL_t& colRng)
266
267 {
268 // The bug we saw below should be harder to trigger here.
269 return WrappedDualViewType(X,rowRng,colRng);
270 }
271
272 template<class WrappedDualViewType>
273 WrappedDualViewType
274 takeSubview (const WrappedDualViewType& X,
275 const Kokkos::ALL_t& rowRng,
276 const std::pair<size_t, size_t>& colRng)
277 {
278 using DualViewType = typename WrappedDualViewType::DVT;
279 // Look carefullly at the comment in the below version of this function.
280 // The comment applies here as well.
281 if (X.extent (0) == 0 && X.extent (1) != 0) {
282 return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
283 }
284 else {
285 return WrappedDualViewType(X,rowRng,colRng);
286 }
287 }
288
289 template<class WrappedDualViewType>
290 WrappedDualViewType
291 takeSubview (const WrappedDualViewType& X,
292 const std::pair<size_t, size_t>& rowRng,
293 const std::pair<size_t, size_t>& colRng)
294 {
295 using DualViewType = typename WrappedDualViewType::DVT;
296 // If you take a subview of a view with zero rows Kokkos::subview()
297 // always returns a DualView with the same data pointers. This will break
298 // pointer equality testing in between two subviews of the same 2D View if
299 // it has zero row extent. While the one (known) case where this was actually used
300 // has been fixed, that sort of check could very easily be reintroduced in the future,
301 // hence I've added this if check here.
302 //
303 // This is not a bug in Kokkos::subview(), just some very subtle behavior which
304 // future developers should be wary of.
305 if (X.extent (0) == 0 && X.extent (1) != 0) {
306 return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
307 }
308 else {
309 return WrappedDualViewType(X,rowRng,colRng);
310 }
311 }
312
313 template<class WrappedOrNotDualViewType>
314 size_t
315 getDualViewStride (const WrappedOrNotDualViewType& dv)
316 {
317 // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
318 // method yet, but its Views do.
319 // NOTE: dv.stride() returns a vector of length one
320 // more than its rank
321 size_t strides[WrappedOrNotDualViewType::t_dev::rank+1];
322 dv.stride(strides);
323 const size_t LDA = strides[1];
324 const size_t numRows = dv.extent (0);
325
326 if (LDA == 0) {
327 return (numRows == 0) ? size_t (1) : numRows;
328 }
329 else {
330 return LDA;
331 }
332 }
333
334 template<class ViewType>
335 size_t
336 getViewStride (const ViewType& view)
337 {
338 const size_t LDA = view.stride (1);
339 const size_t numRows = view.extent (0);
340
341 if (LDA == 0) {
342 return (numRows == 0) ? size_t (1) : numRows;
343 }
344 else {
345 return LDA;
346 }
347 }
348
349 template <class impl_scalar_type, class buffer_device_type>
350 bool
351 runKernelOnHost (
352 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports
353 )
354 {
355 if (! imports.need_sync_device ()) {
356 return false; // most up-to-date on device
357 }
358 else { // most up-to-date on host,
359 // but if large enough, worth running on device anyway
360 size_t localLengthThreshold =
362 return imports.extent(0) <= localLengthThreshold;
363 }
364 }
365
366
367 template <class SC, class LO, class GO, class NT>
368 bool
369 runKernelOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
370 {
371 if (! X.need_sync_device ()) {
372 return false; // most up-to-date on device
373 }
374 else { // most up-to-date on host
375 // but if large enough, worth running on device anyway
376 size_t localLengthThreshold =
378 return X.getLocalLength () <= localLengthThreshold;
379 }
380 }
381
382 template <class SC, class LO, class GO, class NT>
383 void
384 multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
386 const ::Tpetra::Details::EWhichNorm whichNorm)
387 {
388 using namespace Tpetra;
391 using val_type = typename MV::impl_scalar_type;
392 using mag_type = typename MV::mag_type;
393 using dual_view_type = typename MV::dual_view_type;
394
395 auto map = X.getMap ();
396 auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
397 auto whichVecs = getMultiVectorWhichVectors (X);
398 const bool isConstantStride = X.isConstantStride ();
399 const bool isDistributed = X.isDistributed ();
400
401 const bool runOnHost = runKernelOnHost (X);
402 if (runOnHost) {
403 using view_type = typename dual_view_type::t_host;
404 using array_layout = typename view_type::array_layout;
405 using device_type = typename view_type::device_type;
406
407 auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
408 normImpl<val_type, array_layout, device_type,
409 mag_type> (norms, X_lcl, whichNorm, whichVecs,
410 isConstantStride, isDistributed, comm);
411 }
412 else {
413 using view_type = typename dual_view_type::t_dev;
414 using array_layout = typename view_type::array_layout;
415 using device_type = typename view_type::device_type;
416
417 auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
418 normImpl<val_type, array_layout, device_type,
419 mag_type> (norms, X_lcl, whichNorm, whichVecs,
420 isConstantStride, isDistributed, comm);
421 }
422 }
423} // namespace (anonymous)
424
425
426namespace Tpetra {
427
428 namespace Details {
429 template <typename DstView, typename SrcView>
430 struct AddAssignFunctor {
431 // This functor would be better as a lambda, but CUDA cannot compile
432 // lambdas in protected functions. It compiles fine with the functor.
433 AddAssignFunctor(DstView &tgt_, SrcView &src_) : tgt(tgt_), src(src_) {}
434
435 KOKKOS_INLINE_FUNCTION void
436 operator () (const size_t k) const { tgt(k) += src(k); }
437
438 DstView tgt;
439 SrcView src;
440 };
441 }
442
443 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
444 bool
446 vectorIndexOutOfRange (const size_t VectorIndex) const {
447 return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
448 }
449
450 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
452 MultiVector () :
453 base_type (Teuchos::rcp (new map_type ()))
454 {}
455
456 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
458 MultiVector (const Teuchos::RCP<const map_type>& map,
459 const size_t numVecs,
460 const bool zeroOut) : /* default is true */
461 base_type (map)
462 {
463 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
464
465 const size_t lclNumRows = this->getLocalLength ();
466 view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
467 }
468
469 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
472 const Teuchos::DataAccess copyOrView) :
473 base_type (source),
474 view_ (source.view_),
476 {
478 const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
479 "const Teuchos::DataAccess): ";
480
481 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
482
483 if (copyOrView == Teuchos::Copy) {
484 // Reuse the conveniently already existing function that creates
485 // a deep copy.
486 MV cpy = createCopy (source);
487 this->view_ = cpy.view_;
488 this->whichVectors_ = cpy.whichVectors_;
489 }
490 else if (copyOrView == Teuchos::View) {
491 }
492 else {
493 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
494 true, std::invalid_argument, "Second argument 'copyOrView' has an "
495 "invalid value " << copyOrView << ". Valid values include "
496 "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
497 << Teuchos::View << ".");
498 }
499 }
500
501 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
503 MultiVector (const Teuchos::RCP<const map_type>& map,
504 const dual_view_type& view) :
505 base_type (map),
506 view_ (wrapped_dual_view_type(view))
507 {
508 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
509 const size_t lclNumRows_map = map.is_null () ? size_t (0) :
510 map->getLocalNumElements ();
511 const size_t lclNumRows_view = view.extent (0);
512 const size_t LDA = getDualViewStride (view_);
513
514 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
515 (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
516 std::invalid_argument, "Kokkos::DualView does not match Map. "
517 "map->getLocalNumElements() = " << lclNumRows_map
518 << ", view.extent(0) = " << lclNumRows_view
519 << ", and getStride() = " << LDA << ".");
520
522 const bool debug = Behavior::debug ();
523 if (debug) {
524 using ::Tpetra::Details::checkGlobalDualViewValidity;
525 std::ostringstream gblErrStrm;
526 const bool verbose = Behavior::verbose ();
527 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
528 const bool gblValid =
529 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
530 comm.getRawPtr ());
531 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
532 (! gblValid, std::runtime_error, gblErrStrm.str ());
533 }
534 }
535
536
537 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
539 MultiVector (const Teuchos::RCP<const map_type>& map,
540 const wrapped_dual_view_type& view) :
541 base_type (map),
542 view_ (view)
543 {
544 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
545 const size_t lclNumRows_map = map.is_null () ? size_t (0) :
546 map->getLocalNumElements ();
547 const size_t lclNumRows_view = view.extent (0);
548 const size_t LDA = getDualViewStride (view);
549
550 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
551 (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
552 std::invalid_argument, "Kokkos::DualView does not match Map. "
553 "map->getLocalNumElements() = " << lclNumRows_map
554 << ", view.extent(0) = " << lclNumRows_view
555 << ", and getStride() = " << LDA << ".");
556
558 const bool debug = Behavior::debug ();
559 if (debug) {
560 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
561 std::ostringstream gblErrStrm;
562 const bool verbose = Behavior::verbose ();
563 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
564 const bool gblValid =
565 checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
566 comm.getRawPtr ());
567 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
568 (! gblValid, std::runtime_error, gblErrStrm.str ());
569 }
570 }
571
572
573
574 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
576 MultiVector (const Teuchos::RCP<const map_type>& map,
577 const typename dual_view_type::t_dev& d_view) :
578 base_type (map)
579 {
580 using Teuchos::ArrayRCP;
581 using Teuchos::RCP;
582 const char tfecfFuncName[] = "MultiVector(map,d_view): ";
583
584 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
585
586 const size_t LDA = getViewStride (d_view);
587 const size_t lclNumRows = map->getLocalNumElements ();
588 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
589 (LDA < lclNumRows, std::invalid_argument, "Map does not match "
590 "Kokkos::View. map->getLocalNumElements() = " << lclNumRows
591 << ", View's column stride = " << LDA
592 << ", and View's extent(0) = " << d_view.extent (0) << ".");
593
594 auto h_view = Kokkos::create_mirror_view (d_view);
595 auto dual_view = dual_view_type (d_view, h_view);
596 view_ = wrapped_dual_view_type(dual_view);
597
599 const bool debug = Behavior::debug ();
600 if (debug) {
601 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
602 std::ostringstream gblErrStrm;
603 const bool verbose = Behavior::verbose ();
604 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
605 const bool gblValid =
606 checkGlobalWrappedDualViewValidity (&gblErrStrm, view_, verbose,
607 comm.getRawPtr ());
608 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
609 (! gblValid, std::runtime_error, gblErrStrm.str ());
610 }
611 // The user gave us a device view. In order to respect its
612 // initial contents, we mark the DualView as "modified on device."
613 // That way, the next sync will synchronize it with the host view.
614 dual_view.modify_device ();
615 }
616
617 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
619 MultiVector (const Teuchos::RCP<const map_type>& map,
620 const dual_view_type& view,
621 const dual_view_type& origView) :
622 base_type (map),
623 view_ (wrapped_dual_view_type(view,origView))
624 {
625 const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
626
627 const size_t LDA = getDualViewStride (origView);
628 const size_t lclNumRows = this->getLocalLength (); // comes from the Map
629 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
630 LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
631 "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
632 << ". This may also mean that the input origView's first dimension (number "
633 "of rows = " << origView.extent (0) << ") does not not match the number "
634 "of entries in the Map on the calling process.");
635
637 const bool debug = Behavior::debug ();
638 if (debug) {
639 using ::Tpetra::Details::checkGlobalDualViewValidity;
640 std::ostringstream gblErrStrm;
641 const bool verbose = Behavior::verbose ();
642 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
643 const bool gblValid_0 =
644 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
645 comm.getRawPtr ());
646 const bool gblValid_1 =
647 checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
648 comm.getRawPtr ());
649 const bool gblValid = gblValid_0 && gblValid_1;
650 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
651 (! gblValid, std::runtime_error, gblErrStrm.str ());
652 }
653 }
654
655
656 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
658 MultiVector (const Teuchos::RCP<const map_type>& map,
659 const dual_view_type& view,
660 const Teuchos::ArrayView<const size_t>& whichVectors) :
661 base_type (map),
662 view_ (view),
663 whichVectors_ (whichVectors.begin (), whichVectors.end ())
664 {
665 using Kokkos::ALL;
666 using Kokkos::subview;
667 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
668
670 const bool debug = Behavior::debug ();
671 if (debug) {
672 using ::Tpetra::Details::checkGlobalDualViewValidity;
673 std::ostringstream gblErrStrm;
674 const bool verbose = Behavior::verbose ();
675 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
676 const bool gblValid =
677 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
678 comm.getRawPtr ());
679 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
680 (! gblValid, std::runtime_error, gblErrStrm.str ());
681 }
682
683 const size_t lclNumRows = map.is_null () ? size_t (0) :
684 map->getLocalNumElements ();
685 // Check dimensions of the input DualView. We accept that Kokkos
686 // might not allow construction of a 0 x m (Dual)View with m > 0,
687 // so we only require the number of rows to match if the
688 // (Dual)View has more than zero columns. Likewise, we only
689 // require the number of columns to match if the (Dual)View has
690 // more than zero rows.
691 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
692 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
693 std::invalid_argument, "view.extent(0) = " << view.extent (0)
694 << " < map->getLocalNumElements() = " << lclNumRows << ".");
695 if (whichVectors.size () != 0) {
696 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
697 view.extent (1) != 0 && view.extent (1) == 0,
698 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
699 " = " << whichVectors.size () << " > 0.");
700 size_t maxColInd = 0;
701 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
702 for (size_type k = 0; k < whichVectors.size (); ++k) {
703 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
704 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
705 std::invalid_argument, "whichVectors[" << k << "] = "
706 "Teuchos::OrdinalTraits<size_t>::invalid().");
707 maxColInd = std::max (maxColInd, whichVectors[k]);
708 }
709 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
710 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
711 std::invalid_argument, "view.extent(1) = " << view.extent (1)
712 << " <= max(whichVectors) = " << maxColInd << ".");
713 }
714
715 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
716 // zero strides, so modify in that case.
717 const size_t LDA = getDualViewStride (view);
718 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
719 (LDA < lclNumRows, std::invalid_argument,
720 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
721
722 if (whichVectors.size () == 1) {
723 // If whichVectors has only one entry, we don't need to bother
724 // with nonconstant stride. Just shift the view over so it
725 // points to the desired column.
726 //
727 // NOTE (mfh 10 May 2014) This is a special case where we set
728 // origView_ just to view that one column, not all of the
729 // original columns. This ensures that the use of origView_ in
730 // offsetView works correctly.
731 //
732 const std::pair<size_t, size_t> colRng (whichVectors[0],
733 whichVectors[0] + 1);
734 view_ = takeSubview (view_, ALL (), colRng);
735 // whichVectors_.size() == 0 means "constant stride."
736 whichVectors_.clear ();
737 }
738 }
739
740 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
742 MultiVector (const Teuchos::RCP<const map_type>& map,
743 const wrapped_dual_view_type& view,
744 const Teuchos::ArrayView<const size_t>& whichVectors) :
745 base_type (map),
746 view_ (view),
747 whichVectors_ (whichVectors.begin (), whichVectors.end ())
748 {
749 using Kokkos::ALL;
750 using Kokkos::subview;
751 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
752
754 const bool debug = Behavior::debug ();
755 if (debug) {
756 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
757 std::ostringstream gblErrStrm;
758 const bool verbose = Behavior::verbose ();
759 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
760 const bool gblValid =
761 checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
762 comm.getRawPtr ());
763 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
764 (! gblValid, std::runtime_error, gblErrStrm.str ());
765 }
766
767 const size_t lclNumRows = map.is_null () ? size_t (0) :
768 map->getLocalNumElements ();
769 // Check dimensions of the input DualView. We accept that Kokkos
770 // might not allow construction of a 0 x m (Dual)View with m > 0,
771 // so we only require the number of rows to match if the
772 // (Dual)View has more than zero columns. Likewise, we only
773 // require the number of columns to match if the (Dual)View has
774 // more than zero rows.
775 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
776 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
777 std::invalid_argument, "view.extent(0) = " << view.extent (0)
778 << " < map->getLocalNumElements() = " << lclNumRows << ".");
779 if (whichVectors.size () != 0) {
780 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
781 view.extent (1) != 0 && view.extent (1) == 0,
782 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
783 " = " << whichVectors.size () << " > 0.");
784 size_t maxColInd = 0;
785 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
786 for (size_type k = 0; k < whichVectors.size (); ++k) {
787 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
788 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
789 std::invalid_argument, "whichVectors[" << k << "] = "
790 "Teuchos::OrdinalTraits<size_t>::invalid().");
791 maxColInd = std::max (maxColInd, whichVectors[k]);
792 }
793 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
794 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
795 std::invalid_argument, "view.extent(1) = " << view.extent (1)
796 << " <= max(whichVectors) = " << maxColInd << ".");
797 }
798
799 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
800 // zero strides, so modify in that case.
801 const size_t LDA = getDualViewStride (view);
802 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
803 (LDA < lclNumRows, std::invalid_argument,
804 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
805
806 if (whichVectors.size () == 1) {
807 // If whichVectors has only one entry, we don't need to bother
808 // with nonconstant stride. Just shift the view over so it
809 // points to the desired column.
810 //
811 // NOTE (mfh 10 May 2014) This is a special case where we set
812 // origView_ just to view that one column, not all of the
813 // original columns. This ensures that the use of origView_ in
814 // offsetView works correctly.
815 //
816 const std::pair<size_t, size_t> colRng (whichVectors[0],
817 whichVectors[0] + 1);
818 view_ = takeSubview (view_, ALL (), colRng);
819 // whichVectors_.size() == 0 means "constant stride."
820 whichVectors_.clear ();
821 }
822 }
823
824
825 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
827 MultiVector (const Teuchos::RCP<const map_type>& map,
828 const dual_view_type& view,
829 const dual_view_type& origView,
830 const Teuchos::ArrayView<const size_t>& whichVectors) :
831 base_type (map),
832 view_ (wrapped_dual_view_type(view,origView)),
833 whichVectors_ (whichVectors.begin (), whichVectors.end ())
834 {
835 using Kokkos::ALL;
836 using Kokkos::subview;
837 const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
838
840 const bool debug = Behavior::debug ();
841 if (debug) {
842 using ::Tpetra::Details::checkGlobalDualViewValidity;
843 std::ostringstream gblErrStrm;
844 const bool verbose = Behavior::verbose ();
845 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
846 const bool gblValid_0 =
847 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
848 comm.getRawPtr ());
849 const bool gblValid_1 =
850 checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
851 comm.getRawPtr ());
852 const bool gblValid = gblValid_0 && gblValid_1;
853 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
854 (! gblValid, std::runtime_error, gblErrStrm.str ());
855 }
856
857 const size_t lclNumRows = this->getLocalLength ();
858 // Check dimensions of the input DualView. We accept that Kokkos
859 // might not allow construction of a 0 x m (Dual)View with m > 0,
860 // so we only require the number of rows to match if the
861 // (Dual)View has more than zero columns. Likewise, we only
862 // require the number of columns to match if the (Dual)View has
863 // more than zero rows.
864 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
865 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
866 std::invalid_argument, "view.extent(0) = " << view.extent (0)
867 << " < map->getLocalNumElements() = " << lclNumRows << ".");
868 if (whichVectors.size () != 0) {
869 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
870 view.extent (1) != 0 && view.extent (1) == 0,
871 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
872 " = " << whichVectors.size () << " > 0.");
873 size_t maxColInd = 0;
874 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
875 for (size_type k = 0; k < whichVectors.size (); ++k) {
876 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
877 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
878 std::invalid_argument, "whichVectors[" << k << "] = "
879 "Teuchos::OrdinalTraits<size_t>::invalid().");
880 maxColInd = std::max (maxColInd, whichVectors[k]);
881 }
882 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
883 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
884 std::invalid_argument, "view.extent(1) = " << view.extent (1)
885 << " <= max(whichVectors) = " << maxColInd << ".");
886 }
887
888 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
889 // zero strides, so modify in that case.
890 const size_t LDA = getDualViewStride (origView);
891 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
892 (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
893 "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
894 lclNumRows << ". origView.extent(0) = " << origView.extent(0)
895 << ", origView.stride(1) = " << origView.d_view.stride(1) << ".");
896
897 if (whichVectors.size () == 1) {
898 // If whichVectors has only one entry, we don't need to bother
899 // with nonconstant stride. Just shift the view over so it
900 // points to the desired column.
901 //
902 // NOTE (mfh 10 May 2014) This is a special case where we set
903 // origView_ just to view that one column, not all of the
904 // original columns. This ensures that the use of origView_ in
905 // offsetView works correctly.
906 const std::pair<size_t, size_t> colRng (whichVectors[0],
907 whichVectors[0] + 1);
908 view_ = takeSubview (view_, ALL (), colRng);
909 // origView_ = takeSubview (origView_, ALL (), colRng);
910 // whichVectors_.size() == 0 means "constant stride."
911 whichVectors_.clear ();
912 }
913 }
914
915 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
917 MultiVector (const Teuchos::RCP<const map_type>& map,
918 const Teuchos::ArrayView<const Scalar>& data,
919 const size_t LDA,
920 const size_t numVecs) :
921 base_type (map)
922 {
923 typedef LocalOrdinal LO;
924 typedef GlobalOrdinal GO;
925 const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
926 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
927
928 // Deep copy constructor, constant stride (NO whichVectors_).
929 // There is no need for a deep copy constructor with nonconstant stride.
930
931 const size_t lclNumRows =
932 map.is_null () ? size_t (0) : map->getLocalNumElements ();
933 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
934 (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
935 "map->getLocalNumElements() = " << lclNumRows << ".");
936 if (numVecs != 0) {
937 const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
938 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
939 (static_cast<size_t> (data.size ()) < minNumEntries,
940 std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
941 "entries, given the input Map and number of vectors in the MultiVector."
942 " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
943 "map->getLocalNumElements () = " << minNumEntries << ".");
944 }
945
946 this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
947 //Note: X_out will be completely overwritten
948 auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
949
950 // Make an unmanaged host Kokkos::View of the input data. First
951 // create a View (X_in_orig) with the original stride. Then,
952 // create a subview (X_in) with the right number of columns.
953 const impl_scalar_type* const X_in_raw =
954 reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
955 Kokkos::View<const impl_scalar_type**,
956 Kokkos::LayoutLeft,
957 Kokkos::HostSpace,
958 Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
959 const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
960 auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
961
962 // If LDA != X_out's column stride, then we need to copy one
963 // column at a time; Kokkos::deep_copy refuses to work in that
964 // case.
965 const size_t outStride =
966 X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
967 if (LDA == outStride) { // strides are the same; deep_copy once
968 // This only works because MultiVector uses LayoutLeft.
969 // We would need a custom copy functor otherwise.
970 // DEEP_COPY REVIEW - HOST-TO-DEVICE
971 Kokkos::deep_copy (execution_space(), X_out, X_in);
972 }
973 else { // strides differ; copy one column at a time
974 typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
975 out_col_view_type;
976 typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
977 in_col_view_type;
978 for (size_t j = 0; j < numVecs; ++j) {
979 out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
980 in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
981 // DEEP_COPY REVIEW - HOST-TO-DEVICE
982 Kokkos::deep_copy (execution_space(), X_out_j, X_in_j);
983 }
984 }
985 }
986
987 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
989 MultiVector (const Teuchos::RCP<const map_type>& map,
990 const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
991 const size_t numVecs) :
992 base_type (map)
993 {
994 typedef impl_scalar_type IST;
995 typedef LocalOrdinal LO;
996 typedef GlobalOrdinal GO;
997 const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
998 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
999
1000 const size_t lclNumRows =
1001 map.is_null () ? size_t (0) : map->getLocalNumElements ();
1002 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1003 (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
1004 std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
1005 "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
1006 for (size_t j = 0; j < numVecs; ++j) {
1007 Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
1008 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1009 static_cast<size_t> (X_j_av.size ()) < lclNumRows,
1010 std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
1011 << X_j_av.size () << " < map->getLocalNumElements() = " << lclNumRows
1012 << ".");
1013 }
1014
1015 view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
1016 auto X_out = this->getLocalViewDevice(Access::ReadWrite);
1017
1018 // Make sure that the type of a single input column has the same
1019 // array layout as each output column does, so we can deep_copy.
1020 using array_layout = typename decltype (X_out)::array_layout;
1021 using input_col_view_type = typename Kokkos::View<const IST*,
1022 array_layout,
1023 Kokkos::HostSpace,
1024 Kokkos::MemoryUnmanaged>;
1026 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
1027 for (size_t j = 0; j < numVecs; ++j) {
1028 Teuchos::ArrayView<const IST> X_j_av =
1029 Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
1030 input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
1031 auto X_j_out = Kokkos::subview (X_out, rowRng, j);
1032 // DEEP_COPY REVIEW - HOST-TO-DEVICE
1033 Kokkos::deep_copy (execution_space(), X_j_out, X_j_in);
1034 }
1035 }
1036
1037 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1042
1043 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1044 size_t
1046 getLocalLength () const
1047 {
1048 if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1049 return static_cast<size_t> (0);
1050 } else {
1051 return this->getMap ()->getLocalNumElements ();
1052 }
1053 }
1054
1055 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1058 getGlobalLength () const
1059 {
1060 if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1061 return static_cast<size_t> (0);
1062 } else {
1063 return this->getMap ()->getGlobalNumElements ();
1064 }
1065 }
1066
1067 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1068 size_t
1070 getStride () const
1071 {
1072 return isConstantStride () ? getDualViewStride (view_) : size_t (0);
1073 }
1074
1075 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1076 bool
1079 {
1080 //Don't actually get a view, just get pointers.
1081 auto thisData = view_.getDualView().h_view.data();
1082 auto otherData = other.view_.getDualView().h_view.data();
1083 size_t thisSpan = view_.getDualView().h_view.span();
1084 size_t otherSpan = other.view_.getDualView().h_view.span();
1085 return (otherData <= thisData && thisData < otherData + otherSpan)
1086 || (thisData <= otherData && otherData < thisData + thisSpan);
1087 }
1088
1089 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1090 bool
1092 checkSizes (const SrcDistObject& sourceObj)
1093 {
1094 // Check whether the source object is a MultiVector. If not, then
1095 // we can't even compare sizes, so it's definitely not OK to
1096 // Import or Export from it.
1098 const MV* src = dynamic_cast<const MV*> (&sourceObj);
1099 if (src == nullptr) {
1100 return false;
1101 }
1102 else {
1103 // The target of the Import or Export calls checkSizes() in
1104 // DistObject::doTransfer(). By that point, we've already
1105 // constructed an Import or Export object using the two
1106 // multivectors' Maps, which means that (hopefully) we've
1107 // already checked other attributes of the multivectors. Thus,
1108 // all we need to do here is check the number of columns.
1109 return src->getNumVectors () == this->getNumVectors ();
1110 }
1111 }
1112
1113 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1114 size_t
1119
1120 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1121 void
1124 (const SrcDistObject& sourceObj,
1125 const size_t numSameIDs,
1126 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1127 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1128 const CombineMode CM,
1129 const execution_space &space)
1130 {
1134 using std::endl;
1135 using KokkosRefactor::Details::permute_array_multi_column;
1136 using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1137 using Kokkos::Compat::create_const_view;
1139 const char tfecfFuncName[] = "copyAndPermute: ";
1140 ProfilingRegion regionCAP ("Tpetra::MultiVector::copyAndPermute");
1141
1142 const bool verbose = Behavior::verbose ();
1143 std::unique_ptr<std::string> prefix;
1144 if (verbose) {
1145 auto map = this->getMap ();
1146 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1147 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1148 std::ostringstream os;
1149 os << "Proc " << myRank << ": MV::copyAndPermute: ";
1150 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1151 os << "Start" << endl;
1152 std::cerr << os.str ();
1153 }
1154
1155 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1156 (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
1157 std::logic_error, "permuteToLIDs.extent(0) = "
1158 << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
1159 << permuteFromLIDs.extent (0) << ".");
1160
1161 // We've already called checkSizes(), so this cast must succeed.
1162 MV& sourceMV = const_cast<MV &>(dynamic_cast<const MV&> (sourceObj));
1163 const size_t numCols = this->getNumVectors ();
1164
1165 // sourceMV doesn't belong to us, so we can't sync it. Do the
1166 // copying where it's currently sync'd.
1167 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1168 (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1169 std::logic_error, "Input MultiVector needs sync to both host "
1170 "and device.");
1171 const bool copyOnHost = runKernelOnHost(sourceMV);
1172 if (verbose) {
1173 std::ostringstream os;
1174 os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1175 std::cerr << os.str ();
1176 }
1177
1178 if (verbose) {
1179 std::ostringstream os;
1180 os << *prefix << "Copy" << endl;
1181 std::cerr << os.str ();
1183
1184 // TODO (mfh 15 Sep 2013) When we replace
1185 // KokkosClassic::MultiVector with a Kokkos::View, there are two
1186 // ways to copy the data:
1187 //
1188 // 1. Get a (sub)view of each column and call deep_copy on that.
1189 // 2. Write a custom kernel to copy the data.
1190 //
1191 // The first is easier, but the second might be more performant in
1192 // case we decide to use layouts other than LayoutLeft. It might
1193 // even make sense to hide whichVectors_ in an entirely new layout
1194 // for Kokkos Views.
1195
1196 // Copy rows [0, numSameIDs-1] of the local multivectors.
1197 //
1198 // Note (ETP 2 Jul 2014) We need to always copy one column at a
1199 // time, even when both multivectors are constant-stride, since
1200 // deep_copy between strided subviews with more than one column
1201 // doesn't currently work.
1202
1203 // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1204 // both source and target are constant stride and have multiple
1205 // columns.
1206 if (numSameIDs > 0) {
1207 const std::pair<size_t, size_t> rows (0, numSameIDs);
1208 if (copyOnHost) {
1209 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1210 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1211
1212 for (size_t j = 0; j < numCols; ++j) {
1213 const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1214 const size_t srcCol =
1215 sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1216
1217 auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1218 auto src_j = Kokkos::subview (src_h, rows, srcCol);
1219 if (CM == ADD_ASSIGN) {
1220 // Sum src_j into tgt_j
1221 using range_t =
1222 Kokkos::RangePolicy<execution_space, size_t>;
1223 range_t rp(space, 0,numSameIDs);
1224 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1225 aaf(tgt_j, src_j);
1226 Kokkos::parallel_for(rp, aaf);
1227 }
1228 else {
1229 // Copy src_j into tgt_j
1230 // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR
1231 Kokkos::deep_copy (space, tgt_j, src_j);
1232 space.fence();
1233 }
1234 }
1235 }
1236 else { // copy on device
1237 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1238 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1239
1240 for (size_t j = 0; j < numCols; ++j) {
1241 const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1242 const size_t srcCol =
1243 sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1244
1245 auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1246 auto src_j = Kokkos::subview (src_d, rows, srcCol);
1247 if (CM == ADD_ASSIGN) {
1248 // Sum src_j into tgt_j
1249 using range_t =
1250 Kokkos::RangePolicy<execution_space, size_t>;
1251 range_t rp(space, 0,numSameIDs);
1252 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1253 aaf(tgt_j, src_j);
1254 Kokkos::parallel_for(rp, aaf);
1255 }
1256 else {
1257 // Copy src_j into tgt_j
1258 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
1259 Kokkos::deep_copy (space, tgt_j, src_j);
1260 space.fence();
1261 }
1262 }
1263 }
1264 }
1265
1266
1267 // For the remaining GIDs, execute the permutations. This may
1268 // involve noncontiguous access of both source and destination
1269 // vectors, depending on the LID lists.
1270 //
1271 // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1272 // the same process, this merges their values by replacement of
1273 // the last encountered GID, not by the specified merge rule
1274 // (such as ADD).
1275
1276 // If there are no permutations, we are done
1277 if (permuteFromLIDs.extent (0) == 0 ||
1278 permuteToLIDs.extent (0) == 0) {
1279 if (verbose) {
1280 std::ostringstream os;
1281 os << *prefix << "No permutations. Done!" << endl;
1282 std::cerr << os.str ();
1283 }
1284 return;
1285 }
1286
1287 if (verbose) {
1288 std::ostringstream os;
1289 os << *prefix << "Permute" << endl;
1290 std::cerr << os.str ();
1291 }
1292
1293 // We could in theory optimize for the case where exactly one of
1294 // them is constant stride, but we don't currently do that.
1295 const bool nonConstStride =
1296 ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1297
1298 if (verbose) {
1299 std::ostringstream os;
1300 os << *prefix << "nonConstStride="
1301 << (nonConstStride ? "true" : "false") << endl;
1302 std::cerr << os.str ();
1303 }
1304
1305 // We only need the "which vectors" arrays if either the source or
1306 // target MV is not constant stride. Since we only have one
1307 // kernel that must do double-duty, we have to create a "which
1308 // vectors" array for the MV that _is_ constant stride.
1309 Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1310 Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1311 if (nonConstStride) {
1312 if (this->whichVectors_.size () == 0) {
1313 Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1314 tmpTgt.modify_host ();
1315 for (size_t j = 0; j < numCols; ++j) {
1316 tmpTgt.h_view(j) = j;
1317 }
1318 if (! copyOnHost) {
1319 tmpTgt.sync_device ();
1320 }
1321 tgtWhichVecs = tmpTgt;
1322 }
1323 else {
1324 Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1325 tgtWhichVecs =
1326 getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1327 "tgtWhichVecs",
1328 copyOnHost);
1329 }
1330 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1331 (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1332 this->getNumVectors (),
1333 std::logic_error, "tgtWhichVecs.extent(0) = " <<
1334 tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1335 this->getNumVectors () << ".");
1336
1337 if (sourceMV.whichVectors_.size () == 0) {
1338 Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1339 tmpSrc.modify_host ();
1340 for (size_t j = 0; j < numCols; ++j) {
1341 tmpSrc.h_view(j) = j;
1342 }
1343 if (! copyOnHost) {
1344 tmpSrc.sync_device ();
1345 }
1346 srcWhichVecs = tmpSrc;
1347 }
1348 else {
1349 Teuchos::ArrayView<const size_t> srcWhichVecsT =
1350 sourceMV.whichVectors_ ();
1351 srcWhichVecs =
1352 getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1353 "srcWhichVecs",
1354 copyOnHost);
1355 }
1356 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1357 (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1358 sourceMV.getNumVectors (), std::logic_error,
1359 "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1360 << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1361 << ".");
1362 }
1363
1364 if (copyOnHost) { // permute on host too
1365 if (verbose) {
1366 std::ostringstream os;
1367 os << *prefix << "Get permute LIDs on host" << std::endl;
1368 std::cerr << os.str ();
1369 }
1370 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1371 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1372
1373 TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1374 auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1375 TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1376 auto permuteFromLIDs_h =
1377 create_const_view (permuteFromLIDs.view_host ());
1378
1379 if (verbose) {
1380 std::ostringstream os;
1381 os << *prefix << "Permute on host" << endl;
1382 std::cerr << os.str ();
1383 }
1384 if (nonConstStride) {
1385 // No need to sync first, because copyOnHost argument to
1386 // getDualViewCopyFromArrayView puts them in the right place.
1387 auto tgtWhichVecs_h =
1388 create_const_view (tgtWhichVecs.view_host ());
1389 auto srcWhichVecs_h =
1390 create_const_view (srcWhichVecs.view_host ());
1391 if (CM == ADD_ASSIGN) {
1392 using op_type = KokkosRefactor::Details::AddOp;
1393 permute_array_multi_column_variable_stride (tgt_h, src_h,
1394 permuteToLIDs_h,
1395 permuteFromLIDs_h,
1396 tgtWhichVecs_h,
1397 srcWhichVecs_h, numCols,
1398 op_type());
1399 }
1400 else {
1401 using op_type = KokkosRefactor::Details::InsertOp;
1402 permute_array_multi_column_variable_stride (tgt_h, src_h,
1403 permuteToLIDs_h,
1404 permuteFromLIDs_h,
1405 tgtWhichVecs_h,
1406 srcWhichVecs_h, numCols,
1407 op_type());
1409 }
1410 else {
1411 if (CM == ADD_ASSIGN) {
1412 using op_type = KokkosRefactor::Details::AddOp;
1413 permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1414 permuteFromLIDs_h, numCols, op_type());
1415 }
1416 else {
1417 using op_type = KokkosRefactor::Details::InsertOp;
1418 permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1419 permuteFromLIDs_h, numCols, op_type());
1420 }
1422 }
1423 else { // permute on device
1424 if (verbose) {
1425 std::ostringstream os;
1426 os << *prefix << "Get permute LIDs on device" << endl;
1427 std::cerr << os.str ();
1428 }
1429 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1430 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1432 TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1433 auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1434 TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1435 auto permuteFromLIDs_d =
1436 create_const_view (permuteFromLIDs.view_device ());
1437
1438 if (verbose) {
1439 std::ostringstream os;
1440 os << *prefix << "Permute on device" << endl;
1441 std::cerr << os.str ();
1442 }
1443 if (nonConstStride) {
1444 // No need to sync first, because copyOnHost argument to
1445 // getDualViewCopyFromArrayView puts them in the right place.
1446 auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1447 auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1448 if (CM == ADD_ASSIGN) {
1449 using op_type = KokkosRefactor::Details::AddOp;
1450 permute_array_multi_column_variable_stride (space, tgt_d, src_d,
1451 permuteToLIDs_d,
1452 permuteFromLIDs_d,
1453 tgtWhichVecs_d,
1454 srcWhichVecs_d, numCols,
1455 op_type());
1456 }
1457 else {
1458 using op_type = KokkosRefactor::Details::InsertOp;
1459 permute_array_multi_column_variable_stride (space, tgt_d, src_d,
1460 permuteToLIDs_d,
1461 permuteFromLIDs_d,
1462 tgtWhichVecs_d,
1463 srcWhichVecs_d, numCols,
1464 op_type());
1465 }
1466 }
1467 else {
1468 if (CM == ADD_ASSIGN) {
1469 using op_type = KokkosRefactor::Details::AddOp;
1470 permute_array_multi_column (space, tgt_d, src_d, permuteToLIDs_d,
1471 permuteFromLIDs_d, numCols, op_type());
1472 }
1473 else {
1474 using op_type = KokkosRefactor::Details::InsertOp;
1475 permute_array_multi_column (space, tgt_d, src_d, permuteToLIDs_d,
1476 permuteFromLIDs_d, numCols, op_type());
1477 }
1478 }
1479 }
1480
1481 if (verbose) {
1482 std::ostringstream os;
1483 os << *prefix << "Done!" << endl;
1484 std::cerr << os.str ();
1485 }
1486 }
1488// clang-format on
1489template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1490void MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::copyAndPermute(
1491 const SrcDistObject &sourceObj, const size_t numSameIDs,
1492 const Kokkos::DualView<const local_ordinal_type *, buffer_device_type>
1493 &permuteToLIDs,
1494 const Kokkos::DualView<const local_ordinal_type *, buffer_device_type>
1495 &permuteFromLIDs,
1496 const CombineMode CM) {
1497 copyAndPermute(sourceObj, numSameIDs, permuteToLIDs, permuteFromLIDs, CM,
1498 execution_space());
1499}
1500// clang-format off
1501
1502 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1503 void
1506 (const SrcDistObject& sourceObj,
1507 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1508 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1509 Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1510 size_t& constantNumPackets,
1511 const execution_space &space)
1512 {
1516 using Kokkos::Compat::create_const_view;
1517 using Kokkos::Compat::getKokkosViewDeepCopy;
1518 using std::endl;
1520 const char tfecfFuncName[] = "packAndPrepare: ";
1521 ProfilingRegion regionPAP ("Tpetra::MultiVector::packAndPrepare");
1522
1523 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1524 // have the option to check indices. We do so when Tpetra is in
1525 // debug mode. It is in debug mode by default in a debug build,
1526 // but you may control this at run time, before launching the
1527 // executable, by setting the TPETRA_DEBUG environment variable to
1528 // "1" (or "TRUE").
1529 const bool debugCheckIndices = Behavior::debug ();
1530 // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1531 // environment variable to "1" (or "TRUE") for copious debug
1532 // output to std::cerr on every MPI process. This is unwise for
1533 // runs with large numbers of MPI processes.
1534 const bool printDebugOutput = Behavior::verbose ();
1535
1536 std::unique_ptr<std::string> prefix;
1537 if (printDebugOutput) {
1538 auto map = this->getMap ();
1539 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1540 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1541 std::ostringstream os;
1542 os << "Proc " << myRank << ": MV::packAndPrepare: ";
1543 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1544 os << "Start" << endl;
1545 std::cerr << os.str ();
1546 }
1547
1548 // We've already called checkSizes(), so this cast must succeed.
1549 MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&> (sourceObj));
1550
1551 const size_t numCols = sourceMV.getNumVectors ();
1552
1553 // This spares us from needing to fill numExportPacketsPerLID.
1554 // Setting constantNumPackets to a nonzero value signals that
1555 // all packets have the same number of entries.
1556 constantNumPackets = numCols;
1558 // If we have no exports, there is nothing to do. Make sure this
1559 // goes _after_ setting constantNumPackets correctly.
1560 if (exportLIDs.extent (0) == 0) {
1561 if (printDebugOutput) {
1562 std::ostringstream os;
1563 os << *prefix << "No exports on this proc, DONE" << std::endl;
1564 std::cerr << os.str ();
1565 }
1566 return;
1567 }
1568
1569 /* The layout in the export for MultiVectors is as follows:
1570 exports = { all of the data from row exportLIDs.front() ;
1571 ....
1572 all of the data from row exportLIDs.back() }
1573 This doesn't have the best locality, but is necessary because
1574 the data for a Packet (all data associated with an LID) is
1575 required to be contiguous. */
1576
1577 // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1578 // packing scheme in the above comment? The data going to a
1579 // particular process must be contiguous, of course, but those
1580 // data could include entries from multiple LIDs. DistObject just
1581 // needs to know how to index into that data. Kokkos is good at
1582 // decoupling storage intent from data layout choice.
1583
1584 const size_t numExportLIDs = exportLIDs.extent (0);
1585 const size_t newExportsSize = numCols * numExportLIDs;
1586 if (printDebugOutput) {
1587 std::ostringstream os;
1588 os << *prefix << "realloc: "
1589 << "numExportLIDs: " << numExportLIDs
1590 << ", exports.extent(0): " << exports.extent (0)
1591 << ", newExportsSize: " << newExportsSize << std::endl;
1592 std::cerr << os.str ();
1593 }
1594 reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1595
1596 // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1597 // sync it. Pack it where it's currently sync'd.
1598 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1599 (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1600 std::logic_error, "Input MultiVector needs sync to both host "
1601 "and device.");
1602 const bool packOnHost = runKernelOnHost(sourceMV);
1603 if (printDebugOutput) {
1604 std::ostringstream os;
1605 os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1606 std::cerr << os.str ();
1607 }
1608
1609 // Mark 'exports' here, since we might have resized it above.
1610 // Resizing currently requires calling the constructor, which
1611 // clears out the 'modified' flags.
1612 if (packOnHost) {
1613 // nde 06 Feb 2020: If 'exports' does not require resize
1614 // when reallocDualViewIfNeeded is called, the modified flags
1615 // are not cleared out. This can result in host and device views
1616 // being out-of-sync, resuling in an error in exports.modify_* calls.
1617 // Clearing the sync flags prevents this possible case.
1618 exports.clear_sync_state ();
1619 exports.modify_host ();
1620 }
1621 else {
1622 // nde 06 Feb 2020: If 'exports' does not require resize
1623 // when reallocDualViewIfNeeded is called, the modified flags
1624 // are not cleared out. This can result in host and device views
1625 // being out-of-sync, resuling in an error in exports.modify_* calls.
1626 // Clearing the sync flags prevents this possible case.
1627 exports.clear_sync_state ();
1628 exports.modify_device ();
1629 }
1630
1631 if (numCols == 1) { // special case for one column only
1632 // MultiVector always represents a single column with constant
1633 // stride, but it doesn't hurt to implement both cases anyway.
1634 //
1635 // ETP: I'm not sure I agree with the above statement. Can't a single-
1636 // column multivector be a subview of another multi-vector, in which case
1637 // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1638 // separately.
1639 //
1640 // mfh 18 Jan 2016: In answer to ETP's comment above:
1641 // MultiVector treats single-column MultiVectors created using a
1642 // "nonconstant stride constructor" as a special case, and makes
1643 // them constant stride (by making whichVectors_ have length 0).
1644 if (sourceMV.isConstantStride ()) {
1645 using KokkosRefactor::Details::pack_array_single_column;
1646 if (printDebugOutput) {
1647 std::ostringstream os;
1648 os << *prefix << "Pack numCols=1 const stride" << endl;
1649 std::cerr << os.str ();
1650 }
1651 if (packOnHost) {
1652 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1653 pack_array_single_column (exports.view_host (),
1654 src_host,
1655 exportLIDs.view_host (),
1656 0,
1657 debugCheckIndices);
1658 }
1659 else { // pack on device
1660 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1661 pack_array_single_column (exports.view_device (),
1662 src_dev,
1663 exportLIDs.view_device (),
1664 0,
1665 debugCheckIndices,
1666 space);
1667 }
1668 }
1669 else {
1670 using KokkosRefactor::Details::pack_array_single_column;
1671 if (printDebugOutput) {
1672 std::ostringstream os;
1673 os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1674 std::cerr << os.str ();
1675 }
1676 if (packOnHost) {
1677 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1678 pack_array_single_column (exports.view_host (),
1679 src_host,
1680 exportLIDs.view_host (),
1681 sourceMV.whichVectors_[0],
1682 debugCheckIndices);
1683 }
1684 else { // pack on device
1685 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1686 pack_array_single_column (exports.view_device (),
1687 src_dev,
1688 exportLIDs.view_device (),
1689 sourceMV.whichVectors_[0],
1690 debugCheckIndices,
1691 space);
1692 }
1693 }
1694 }
1695 else { // the source MultiVector has multiple columns
1696 if (sourceMV.isConstantStride ()) {
1697 using KokkosRefactor::Details::pack_array_multi_column;
1698 if (printDebugOutput) {
1699 std::ostringstream os;
1700 os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1701 std::cerr << os.str ();
1702 }
1703 if (packOnHost) {
1704 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1705 pack_array_multi_column (exports.view_host (),
1706 src_host,
1707 exportLIDs.view_host (),
1708 numCols,
1709 debugCheckIndices);
1710 }
1711 else { // pack on device
1712 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1713 pack_array_multi_column (exports.view_device (),
1714 src_dev,
1715 exportLIDs.view_device (),
1716 numCols,
1717 debugCheckIndices,
1718 space);
1720 }
1721 else {
1722 using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1723 if (printDebugOutput) {
1724 std::ostringstream os;
1725 os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1726 << endl;
1727 std::cerr << os.str ();
1728 }
1729 // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1730 // whichVectors_ can be expensive, but pack and unpack for
1731 // nonconstant-stride MultiVectors is slower anyway.
1732 using IST = impl_scalar_type;
1733 using DV = Kokkos::DualView<IST*, device_type>;
1734 using HES = typename DV::t_host::execution_space;
1735 using DES = typename DV::t_dev::execution_space;
1736 Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1737 if (packOnHost) {
1738 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1739 pack_array_multi_column_variable_stride
1740 (exports.view_host (),
1741 src_host,
1742 exportLIDs.view_host (),
1743 getKokkosViewDeepCopy<HES> (whichVecs),
1744 numCols,
1745 debugCheckIndices);
1746 }
1747 else { // pack on device
1748 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1749 pack_array_multi_column_variable_stride
1750 (exports.view_device (),
1751 src_dev,
1752 exportLIDs.view_device (),
1753 getKokkosViewDeepCopy<DES> (whichVecs),
1754 numCols,
1755 debugCheckIndices, space);
1756 }
1757 }
1758 }
1759
1760 if (printDebugOutput) {
1761 std::ostringstream os;
1762 os << *prefix << "Done!" << endl;
1763 std::cerr << os.str ();
1764 }
1765
1766 }
1767
1768// clang-format on
1769 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1770 void
1773 (const SrcDistObject& sourceObj,
1774 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1775 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1776 Kokkos::DualView<size_t*, buffer_device_type> numExportPacketsPerLID,
1777 size_t& constantNumPackets) {
1778 packAndPrepare(sourceObj, exportLIDs, exports, numExportPacketsPerLID, constantNumPackets, execution_space());
1779 }
1780// clang-format off
1781
1782 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1783 template <class NO>
1784 typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1785 typename NO::device_type::memory_space>::value,
1786 bool>::type
1788 reallocImportsIfNeededImpl (const size_t newSize,
1789 const bool verbose,
1790 const std::string* prefix,
1791 const bool areRemoteLIDsContiguous,
1792 const CombineMode CM)
1793 {
1794 // This implementation of reallocImportsIfNeeded is an
1795 // optimization that is specific to MultiVector. We check if the
1796 // imports_ view can be aliased to the end of the data view_. If
1797 // that is the case, we can skip the unpackAndCombine call.
1798
1799 if (verbose) {
1800 std::ostringstream os;
1801 os << *prefix << "Realloc (if needed) imports_ from "
1802 << this->imports_.extent (0) << " to " << newSize << std::endl;
1803 std::cerr << os.str ();
1804 }
1805
1806 bool reallocated = false;
1807
1808 using IST = impl_scalar_type;
1809 using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1810
1811 // Conditions for aliasing memory:
1812 // - When both sides of the dual view are in the same memory
1813 // space, we do not need to worry about syncing things.
1814 // - When both memory spaces are different, we only alias if this
1815 // does not incur additional sync'ing.
1816 // - The remote LIDs need to be contiguous, so that we do not need
1817 // to reorder received information.
1818 // - CombineMode needs to be INSERT.
1819 // - The number of vectors needs to be 1, otherwise we need to
1820 // reorder the received data.
1821 if ((dual_view_type::impl_dualview_is_single_device::value ||
1822 (Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_device()) ||
1823 (!Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_host())) &&
1824 areRemoteLIDsContiguous &&
1825 (CM == INSERT || CM == REPLACE) &&
1826 (getNumVectors() == 1) &&
1827 (newSize > 0)) {
1828
1829 size_t offset = getLocalLength () - newSize;
1830 reallocated = this->imports_.d_view.data() != view_.getDualView().d_view.data() + offset;
1831 if (reallocated) {
1832 typedef std::pair<size_t, size_t> range_type;
1833 this->imports_ = DV(view_.getDualView(),
1834 range_type (offset, getLocalLength () ),
1835 0);
1836
1837 if (verbose) {
1838 std::ostringstream os;
1839 os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1840 std::cerr << os.str ();
1841 }
1843 return reallocated;
1844 }
1845 {
1847 reallocated =
1848 reallocDualViewIfNeeded (this->unaliased_imports_, newSize, "imports");
1849 if (verbose) {
1850 std::ostringstream os;
1851 os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1852 std::cerr << os.str ();
1853 }
1854 this->imports_ = this->unaliased_imports_;
1855 }
1856 return reallocated;
1857 }
1858
1859 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1860 template <class NO>
1861 typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1862 typename NO::device_type::memory_space>::value,
1863 bool>::type
1865 reallocImportsIfNeededImpl (const size_t newSize,
1866 const bool verbose,
1867 const std::string* prefix,
1868 const bool areRemoteLIDsContiguous,
1869 const CombineMode CM)
1870 {
1871 return DistObject<Scalar, LocalOrdinal, GlobalOrdinal, Node>::reallocImportsIfNeeded(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1872 }
1873
1874 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1875 bool
1877 reallocImportsIfNeeded(const size_t newSize,
1878 const bool verbose,
1879 const std::string* prefix,
1880 const bool areRemoteLIDsContiguous,
1881 const CombineMode CM) {
1883 return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1884 }
1885
1886 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1887 bool
1890 return (this->imports_.d_view.data() + this->imports_.d_view.extent(0) ==
1891 view_.getDualView().d_view.data() + view_.getDualView().d_view.extent(0));
1892 }
1893
1894
1895 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1896 void
1899 (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1900 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1901 Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1902 const size_t constantNumPackets,
1903 const CombineMode CM,
1904 const execution_space &space)
1905 {
1908 using KokkosRefactor::Details::unpack_array_multi_column;
1909 using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1910 using Kokkos::Compat::getKokkosViewDeepCopy;
1911 using std::endl;
1912 const char longFuncName[] = "Tpetra::MultiVector::unpackAndCombine";
1913 const char tfecfFuncName[] = "unpackAndCombine: ";
1914 ProfilingRegion regionUAC (longFuncName);
1915
1916 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1917 // have the option to check indices. We do so when Tpetra is in
1918 // debug mode. It is in debug mode by default in a debug build,
1919 // but you may control this at run time, before launching the
1920 // executable, by setting the TPETRA_DEBUG environment variable to
1921 // "1" (or "TRUE").
1922 const bool debugCheckIndices = Behavior::debug ();
1923
1924 const bool printDebugOutput = Behavior::verbose ();
1925 std::unique_ptr<std::string> prefix;
1926 if (printDebugOutput) {
1927 auto map = this->getMap ();
1928 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1929 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1930 std::ostringstream os;
1931 os << "Proc " << myRank << ": " << longFuncName << ": ";
1932 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1933 os << "Start" << endl;
1934 std::cerr << os.str ();
1935 }
1936
1937 // If we have no imports, there is nothing to do
1938 if (importLIDs.extent (0) == 0) {
1939 if (printDebugOutput) {
1940 std::ostringstream os;
1941 os << *prefix << "No imports. Done!" << endl;
1942 std::cerr << os.str ();
1943 }
1944 return;
1945 }
1946
1947 // Check, whether imports_ is a subview of the MV view.
1948 if (importsAreAliased()) {
1949 if (printDebugOutput) {
1950 std::ostringstream os;
1951 os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1952 std::cerr << os.str ();
1953 }
1954 return;
1955 }
1956
1957
1958 const size_t numVecs = getNumVectors ();
1959 if (debugCheckIndices) {
1960 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1961 (static_cast<size_t> (imports.extent (0)) !=
1962 numVecs * importLIDs.extent (0),
1963 std::runtime_error,
1964 "imports.extent(0) = " << imports.extent (0)
1965 << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1966 << " * " << importLIDs.extent (0) << " = "
1967 << numVecs * importLIDs.extent (0) << ".");
1968
1969 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1970 (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1971 "constantNumPackets input argument must be nonzero.");
1972
1973 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1974 (static_cast<size_t> (numVecs) !=
1975 static_cast<size_t> (constantNumPackets),
1976 std::runtime_error, "constantNumPackets must equal numVecs.");
1978
1979 // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1980 // the memory space in which the imports buffer was last modified and
1981 // the size of the imports buffer.
1982 // DistObject::doTransferNew decides where it was last modified (based on
1983 // whether communication buffers used were on host or device).
1984 const bool unpackOnHost = runKernelOnHost(imports);
1985 if (unpackOnHost) {
1986 if (this->imports_.need_sync_host()) this->imports_.sync_host();
1987 }
1988 else {
1989 if (this->imports_.need_sync_device()) this->imports_.sync_device();
1990 }
1991
1992 if (printDebugOutput) {
1993 std::ostringstream os;
1994 os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1995 << endl;
1996 std::cerr << os.str ();
1997 }
1998
1999 // We have to sync before modifying, because this method may read
2000 // as well as write (depending on the CombineMode).
2001 auto imports_d = imports.view_device ();
2002 auto imports_h = imports.view_host ();
2003 auto importLIDs_d = importLIDs.view_device ();
2004 auto importLIDs_h = importLIDs.view_host ();
2005
2006 Kokkos::DualView<size_t*, device_type> whichVecs;
2007 if (! isConstantStride ()) {
2008 Kokkos::View<const size_t*, Kokkos::HostSpace,
2009 Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
2010 numVecs);
2011 whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
2012 if (unpackOnHost) {
2013 whichVecs.modify_host ();
2014 // DEEP_COPY REVIEW - NOT TESTED FOR CUDA BUILD
2015 Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
2016 }
2017 else {
2018 whichVecs.modify_device ();
2019 // DEEP_COPY REVIEW - HOST-TO-DEVICE
2020 Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
2021 }
2022 }
2023 auto whichVecs_d = whichVecs.view_device ();
2024 auto whichVecs_h = whichVecs.view_host ();
2025
2026 /* The layout in the export for MultiVectors is as follows:
2027 imports = { all of the data from row exportLIDs.front() ;
2028 ....
2029 all of the data from row exportLIDs.back() }
2030 This doesn't have the best locality, but is necessary because
2031 the data for a Packet (all data associated with an LID) is
2032 required to be contiguous. */
2033
2034 if (numVecs > 0 && importLIDs.extent (0) > 0) {
2035 using dev_exec_space = typename dual_view_type::t_dev::execution_space;
2036 using host_exec_space = typename dual_view_type::t_host::execution_space;
2037
2038 // This fixes GitHub Issue #4418.
2039 const bool use_atomic_updates = unpackOnHost ?
2040 host_exec_space().concurrency () != 1 :
2041 dev_exec_space().concurrency () != 1;
2042
2043 if (printDebugOutput) {
2044 std::ostringstream os;
2045 os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
2046 std::cerr << os.str ();
2047 }
2048
2049 // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
2050 // custom combine modes, start editing here.
2051
2052 if (CM == INSERT || CM == REPLACE) {
2053 using op_type = KokkosRefactor::Details::InsertOp;
2054 if (isConstantStride ()) {
2055 if (unpackOnHost) {
2056 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2057 unpack_array_multi_column (host_exec_space (),
2058 X_h, imports_h, importLIDs_h,
2059 op_type (), numVecs,
2060 use_atomic_updates,
2061 debugCheckIndices);
2062
2064 else { // unpack on device
2065 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2066 unpack_array_multi_column (space,
2067 X_d, imports_d, importLIDs_d,
2068 op_type (), numVecs,
2069 use_atomic_updates,
2070 debugCheckIndices);
2071 }
2073 else { // not constant stride
2074 if (unpackOnHost) {
2075 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2076 unpack_array_multi_column_variable_stride (host_exec_space (),
2077 X_h, imports_h,
2078 importLIDs_h,
2079 whichVecs_h,
2080 op_type (),
2081 numVecs,
2082 use_atomic_updates,
2083 debugCheckIndices);
2084 }
2085 else { // unpack on device
2086 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2087 unpack_array_multi_column_variable_stride (space,
2088 X_d, imports_d,
2089 importLIDs_d,
2090 whichVecs_d,
2091 op_type (),
2092 numVecs,
2093 use_atomic_updates,
2094 debugCheckIndices);
2096 }
2097 }
2098 else if (CM == ADD || CM == ADD_ASSIGN) {
2099 using op_type = KokkosRefactor::Details::AddOp;
2100 if (isConstantStride ()) {
2101 if (unpackOnHost) {
2102 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2103 unpack_array_multi_column (host_exec_space (),
2104 X_h, imports_h, importLIDs_h,
2105 op_type (), numVecs,
2106 use_atomic_updates,
2107 debugCheckIndices);
2108 }
2109 else { // unpack on device
2110 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2111 unpack_array_multi_column (space,
2112 X_d, imports_d, importLIDs_d,
2113 op_type (), numVecs,
2114 use_atomic_updates,
2115 debugCheckIndices);
2116 }
2117 }
2118 else { // not constant stride
2119 if (unpackOnHost) {
2120 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2121 unpack_array_multi_column_variable_stride (host_exec_space (),
2122 X_h, imports_h,
2123 importLIDs_h,
2124 whichVecs_h,
2125 op_type (),
2126 numVecs,
2127 use_atomic_updates,
2128 debugCheckIndices);
2129 }
2130 else { // unpack on device
2131 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2132 unpack_array_multi_column_variable_stride (space,
2133 X_d, imports_d,
2134 importLIDs_d,
2135 whichVecs_d,
2136 op_type (),
2137 numVecs,
2138 use_atomic_updates,
2139 debugCheckIndices);
2140 }
2141 }
2142 }
2143 else if (CM == ABSMAX) {
2144 using op_type = KokkosRefactor::Details::AbsMaxOp;
2145 if (isConstantStride ()) {
2146 if (unpackOnHost) {
2147 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2148 unpack_array_multi_column (host_exec_space (),
2149 X_h, imports_h, importLIDs_h,
2150 op_type (), numVecs,
2151 use_atomic_updates,
2152 debugCheckIndices);
2154 else { // unpack on device
2155 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2156 unpack_array_multi_column (space,
2157 X_d, imports_d, importLIDs_d,
2158 op_type (), numVecs,
2159 use_atomic_updates,
2160 debugCheckIndices);
2161 }
2162 }
2163 else {
2164 if (unpackOnHost) {
2165 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2166 unpack_array_multi_column_variable_stride (host_exec_space (),
2167 X_h, imports_h,
2168 importLIDs_h,
2169 whichVecs_h,
2170 op_type (),
2171 numVecs,
2172 use_atomic_updates,
2173 debugCheckIndices);
2174 }
2175 else { // unpack on device
2176 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2177 unpack_array_multi_column_variable_stride (space,
2178 X_d, imports_d,
2179 importLIDs_d,
2180 whichVecs_d,
2181 op_type (),
2182 numVecs,
2183 use_atomic_updates,
2184 debugCheckIndices);
2185 }
2186 }
2187 }
2188 else {
2189 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2190 (true, std::logic_error, "Invalid CombineMode");
2191 }
2192 }
2193 else {
2194 if (printDebugOutput) {
2195 std::ostringstream os;
2196 os << *prefix << "Nothing to unpack" << endl;
2197 std::cerr << os.str ();
2198 }
2199 }
2200
2201 if (printDebugOutput) {
2202 std::ostringstream os;
2203 os << *prefix << "Done!" << endl;
2204 std::cerr << os.str ();
2205 }
2206 }
2208 // clang-format on
2209 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2210 void
2213 (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
2214 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
2215 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
2216 const size_t constantNumPackets,
2217 const CombineMode CM) {
2218 unpackAndCombine(importLIDs, imports, numPacketsPerLID, constantNumPackets, CM, execution_space());
2219 }
2220 // clang-format off
2221
2222 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2223 size_t
2225 getNumVectors () const
2226 {
2227 if (isConstantStride ()) {
2228 return static_cast<size_t> (view_.extent (1));
2229 } else {
2230 return static_cast<size_t> (whichVectors_.size ());
2231 }
2232 }
2233
2234 namespace { // (anonymous)
2235
2236 template<class RV>
2237 void
2238 gblDotImpl (const RV& dotsOut,
2239 const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2240 const bool distributed)
2241 {
2242 using Teuchos::REDUCE_MAX;
2243 using Teuchos::REDUCE_SUM;
2244 using Teuchos::reduceAll;
2245 typedef typename RV::non_const_value_type dot_type;
2246
2247 const size_t numVecs = dotsOut.extent (0);
2248
2249 // If the MultiVector is distributed over multiple processes, do
2250 // the distributed (interprocess) part of the dot product. We
2251 // assume that the MPI implementation can read from and write to
2252 // device memory.
2253 //
2254 // replaceMap() may have removed some processes. Those
2255 // processes have a null Map. They must not participate in any
2256 // collective operations. We ask first whether the Map is null,
2257 // because isDistributed() defers that question to the Map. We
2258 // still compute and return local dots for processes not
2259 // participating in collective operations; those probably don't
2260 // make any sense, but it doesn't hurt to do them, since it's
2261 // illegal to call dot() on those processes anyway.
2262 if (distributed && ! comm.is_null ()) {
2263 // The calling process only participates in the collective if
2264 // both the Map and its Comm on that process are nonnull.
2265 const int nv = static_cast<int> (numVecs);
2266 const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
2267
2268 if (commIsInterComm) {
2269 // If comm is an intercomm, then we may not alias input and
2270 // output buffers, so we have to make a copy of the local
2271 // sum.
2272 typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
2273 // DEEP_COPY REVIEW - NOT TESTED
2274 Kokkos::deep_copy (lclDots, dotsOut);
2275 const dot_type* const lclSum = lclDots.data ();
2276 dot_type* const gblSum = dotsOut.data ();
2277 reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
2278 }
2279 else {
2280 dot_type* const inout = dotsOut.data ();
2281 reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
2282 }
2283 }
2284 }
2285 } // namespace (anonymous)
2286
2287 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2288 void
2291 const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
2292 {
2294 using Kokkos::subview;
2295 using Teuchos::Comm;
2296 using Teuchos::null;
2297 using Teuchos::RCP;
2298 // View of all the dot product results.
2299 typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2300 typedef typename dual_view_type::t_dev_const XMV;
2301 const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2302
2303 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
2304
2305 const size_t numVecs = this->getNumVectors ();
2306 if (numVecs == 0) {
2307 return; // nothing to do
2308 }
2309 const size_t lclNumRows = this->getLocalLength ();
2310 const size_t numDots = static_cast<size_t> (dots.extent (0));
2311 const bool debug = Behavior::debug ();
2312
2313 if (debug) {
2314 const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
2315 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2316 (! compat, std::invalid_argument, "'*this' MultiVector is not "
2317 "compatible with the input MultiVector A. We only test for this "
2318 "in debug mode.");
2319 }
2320
2321 // FIXME (mfh 11 Jul 2014) These exception tests may not
2322 // necessarily be thrown on all processes consistently. We should
2323 // instead pass along error state with the inner product. We
2324 // could do this by setting an extra slot to
2325 // Kokkos::ArithTraits<dot_type>::one() on error. The
2326 // final sum should be
2327 // Kokkos::ArithTraits<dot_type>::zero() if not error.
2328 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2329 lclNumRows != A.getLocalLength (), std::runtime_error,
2330 "MultiVectors do not have the same local length. "
2331 "this->getLocalLength() = " << lclNumRows << " != "
2332 "A.getLocalLength() = " << A.getLocalLength () << ".");
2333 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2334 numVecs != A.getNumVectors (), std::runtime_error,
2335 "MultiVectors must have the same number of columns (vectors). "
2336 "this->getNumVectors() = " << numVecs << " != "
2337 "A.getNumVectors() = " << A.getNumVectors () << ".");
2338 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2339 numDots != numVecs, std::runtime_error,
2340 "The output array 'dots' must have the same number of entries as the "
2341 "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2342 numDots << " != this->getNumVectors() = " << numVecs << ".");
2343
2344 const std::pair<size_t, size_t> colRng (0, numVecs);
2345 RV dotsOut = subview (dots, colRng);
2346 RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2347 this->getMap ()->getComm ();
2349 auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2350 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2351
2352 ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
2353 this->whichVectors_.getRawPtr (),
2354 A.whichVectors_.getRawPtr (),
2355 this->isConstantStride (), A.isConstantStride ());
2356
2357 // lbv 15 mar 2023: Kokkos Kernels provides non-blocking BLAS
2358 // functions unless they explicitely return a value to Host.
2359 // Here while the lclDot are on host, they are not a return
2360 // value, therefore they might be avaible to us immediately.
2361 // Adding a frnce here guarantees that we will have the lclDot
2362 // ahead of the MPI reduction.
2363 execution_space exec_space_instance = execution_space();
2364 exec_space_instance.fence();
2365
2366 gblDotImpl (dotsOut, comm, this->isDistributed ());
2367 }
2368
2369 namespace { // (anonymous)
2370 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2372 multiVectorSingleColumnDot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
2374 {
2377 using dot_type = typename MV::dot_type;
2378 ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
2379
2380 auto map = x.getMap ();
2381 Teuchos::RCP<const Teuchos::Comm<int> > comm =
2382 map.is_null () ? Teuchos::null : map->getComm ();
2383 if (comm.is_null ()) {
2384 return Kokkos::ArithTraits<dot_type>::zero ();
2385 }
2386 else {
2387 using LO = LocalOrdinal;
2388 // The min just ensures that we don't overwrite memory that
2389 // doesn't belong to us, in the erroneous input case where x
2390 // and y have different numbers of rows.
2391 const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
2392 y.getLocalLength ()));
2393 const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
2394 dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
2395 dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
2396
2397 // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2398 //const_cast<MV&>(x).sync_device ();
2399 //const_cast<MV&>(y).sync_device ();
2400
2401 auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2402 auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
2403 auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2404 auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
2405 lclDot = KokkosBlas::dot (x_1d, y_1d);
2406
2407 if (x.isDistributed ()) {
2408 using Teuchos::outArg;
2409 using Teuchos::REDUCE_SUM;
2410 using Teuchos::reduceAll;
2411 reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
2412 }
2413 else {
2414 gblDot = lclDot;
2415 }
2416 return gblDot;
2417 }
2418 }
2419 } // namespace (anonymous)
2420
2421
2422
2423 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2424 void
2427 const Teuchos::ArrayView<dot_type>& dots) const
2428 {
2429 const char tfecfFuncName[] = "dot: ";
2430 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
2431
2432 const size_t numVecs = this->getNumVectors ();
2433 const size_t lclNumRows = this->getLocalLength ();
2434 const size_t numDots = static_cast<size_t> (dots.size ());
2435
2436 // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2437 // not necessarily be thrown on all processes consistently. We
2438 // keep them for now, because MultiVector's unit tests insist on
2439 // them. In the future, we should instead pass along error state
2440 // with the inner product. We could do this by setting an extra
2441 // slot to Kokkos::ArithTraits<dot_type>::one() on error.
2442 // The final sum should be
2443 // Kokkos::ArithTraits<dot_type>::zero() if not error.
2444 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2445 (lclNumRows != A.getLocalLength (), std::runtime_error,
2446 "MultiVectors do not have the same local length. "
2447 "this->getLocalLength() = " << lclNumRows << " != "
2448 "A.getLocalLength() = " << A.getLocalLength () << ".");
2449 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2450 (numVecs != A.getNumVectors (), std::runtime_error,
2451 "MultiVectors must have the same number of columns (vectors). "
2452 "this->getNumVectors() = " << numVecs << " != "
2453 "A.getNumVectors() = " << A.getNumVectors () << ".");
2454 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2455 (numDots != numVecs, std::runtime_error,
2456 "The output array 'dots' must have the same number of entries as the "
2457 "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2458 numDots << " != this->getNumVectors() = " << numVecs << ".");
2459
2460 if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
2461 const dot_type gblDot = multiVectorSingleColumnDot (*this, A);
2462 dots[0] = gblDot;
2463 }
2464 else {
2465 this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
2466 }
2467 }
2468
2469 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2470 void
2472 norm2 (const Teuchos::ArrayView<mag_type>& norms) const
2473 {
2475 using ::Tpetra::Details::NORM_TWO;
2477 ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
2478
2479 // The function needs to be able to sync X.
2480 MV& X = const_cast<MV&> (*this);
2481 multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
2482 }
2483
2484 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2485 void
2487 norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2488 {
2489 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2490 this->norm2 (norms_av);
2491 }
2492
2493
2494 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2495 void
2497 norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2498 {
2500 using ::Tpetra::Details::NORM_ONE;
2502 ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2503
2504 // The function needs to be able to sync X.
2505 MV& X = const_cast<MV&> (*this);
2506 multiVectorNormImpl (norms.data (), X, NORM_ONE);
2507 }
2508
2509 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2510 void
2512 norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2513 {
2514 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2515 this->norm1 (norms_av);
2516 }
2517
2518 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2519 void
2521 normInf (const Teuchos::ArrayView<mag_type>& norms) const
2522 {
2524 using ::Tpetra::Details::NORM_INF;
2526 ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2527
2528 // The function needs to be able to sync X.
2529 MV& X = const_cast<MV&> (*this);
2530 multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2531 }
2532
2533 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2534 void
2536 normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2537 {
2538 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2539 this->normInf (norms_av);
2540 }
2541
2542 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2543 void
2545 meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2546 {
2547 // KR FIXME Overload this method to take a View.
2548 using Kokkos::ALL;
2549 using Kokkos::subview;
2550 using Teuchos::Comm;
2551 using Teuchos::RCP;
2552 using Teuchos::reduceAll;
2553 using Teuchos::REDUCE_SUM;
2554 typedef Kokkos::ArithTraits<impl_scalar_type> ATS;
2555
2556 const size_t lclNumRows = this->getLocalLength ();
2557 const size_t numVecs = this->getNumVectors ();
2558 const size_t numMeans = static_cast<size_t> (means.size ());
2559
2560 TEUCHOS_TEST_FOR_EXCEPTION(
2561 numMeans != numVecs, std::runtime_error,
2562 "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2563 << " != this->getNumVectors() = " << numVecs << ".");
2564
2565 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2566 const std::pair<size_t, size_t> colRng (0, numVecs);
2567
2568 // Make sure that the final output view has the same layout as the
2569 // temporary view's HostMirror. Left or Right doesn't matter for
2570 // a 1-D array anyway; this is just to placate the compiler.
2571 typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2572 typedef Kokkos::View<impl_scalar_type*,
2573 typename local_view_type::HostMirror::array_layout,
2574 Kokkos::HostSpace,
2575 Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2576 host_local_view_type meansOut (means.getRawPtr (), numMeans);
2577
2578 RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2579 this->getMap ()->getComm ();
2580
2581 // If we need sync to device, then host has the most recent version.
2582 const bool useHostVersion = this->need_sync_device ();
2583 if (useHostVersion) {
2584 // DualView was last modified on host, so run the local kernel there.
2585 auto X_lcl = subview (getLocalViewHost(Access::ReadOnly),
2586 rowRng, Kokkos::ALL ());
2587 // Compute the local sum of each column.
2588 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2589 if (isConstantStride ()) {
2590 KokkosBlas::sum (lclSums, X_lcl);
2591 }
2592 else {
2593 for (size_t j = 0; j < numVecs; ++j) {
2594 const size_t col = whichVectors_[j];
2595 KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2596 }
2597 }
2598
2599 // If there are multiple MPI processes, the all-reduce reads
2600 // from lclSums, and writes to meansOut. Otherwise, we just
2601 // copy lclSums into meansOut.
2602 if (! comm.is_null () && this->isDistributed ()) {
2603 reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2604 lclSums.data (), meansOut.data ());
2605 }
2606 else {
2607 // DEEP_COPY REVIEW - NOT TESTED
2608 Kokkos::deep_copy (meansOut, lclSums);
2609 }
2610 }
2611 else {
2612 // DualView was last modified on device, so run the local kernel there.
2613 auto X_lcl = subview (this->getLocalViewDevice(Access::ReadOnly),
2614 rowRng, Kokkos::ALL ());
2615
2616 // Compute the local sum of each column.
2617 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2618 if (isConstantStride ()) {
2619 KokkosBlas::sum (lclSums, X_lcl);
2620 }
2621 else {
2622 for (size_t j = 0; j < numVecs; ++j) {
2623 const size_t col = whichVectors_[j];
2624 KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2625 }
2626 }
2627 // lbv 10 mar 2023: Kokkos Kernels provides non-blocking BLAS
2628 // functions unless they explicitly return a value to Host.
2629 // Here while the lclSums are on the host, they are not a return
2630 // value, therefore they might be available to us immediately.
2631 // Adding a fence here guarantees that we will have the lclSums
2632 // ahead of the MPI reduction.
2633 execution_space exec_space_instance = execution_space();
2634 exec_space_instance.fence();
2635
2636 // If there are multiple MPI processes, the all-reduce reads
2637 // from lclSums, and writes to meansOut. (We assume that MPI
2638 // can read device memory.) Otherwise, we just copy lclSums
2639 // into meansOut.
2640 if (! comm.is_null () && this->isDistributed ()) {
2641 reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2642 lclSums.data (), meansOut.data ());
2643 }
2644 else {
2645 // DEEP_COPY REVIEW - HOST-TO-HOST - NOT TESTED FOR MPI BUILD
2646 Kokkos::deep_copy (meansOut, lclSums);
2647 }
2648 }
2649
2650 // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2651 // to the magnitude type, since operator/ (std::complex<T>, int)
2652 // isn't necessarily defined.
2653 const impl_scalar_type OneOverN =
2654 ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2655 for (size_t k = 0; k < numMeans; ++k) {
2656 meansOut(k) = meansOut(k) * OneOverN;
2657 }
2658 }
2659
2660
2661 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2662 void
2664 randomize ()
2665 {
2666 typedef impl_scalar_type IST;
2667 typedef Kokkos::ArithTraits<IST> ATS;
2668 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2669 typedef typename pool_type::generator_type generator_type;
2670
2671 const IST max = Kokkos::rand<generator_type, IST>::max ();
2672 const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2673
2674 this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2675 }
2676
2677
2678 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2679 void
2681 randomize (const Scalar& minVal, const Scalar& maxVal)
2682 {
2683 typedef impl_scalar_type IST;
2684 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2685
2686 // Seed the pseudorandom number generator using the calling
2687 // process' rank. This helps decorrelate different process'
2688 // pseudorandom streams. It's not perfect but it's effective and
2689 // doesn't require MPI communication. The seed also includes bits
2690 // from the standard library's rand().
2691 //
2692 // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
2693 // The code below just makes a new seed each time.
2694
2695 const uint64_t myRank =
2696 static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
2697 uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
2698 unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
2699
2700 pool_type rand_pool (seed);
2701 const IST max = static_cast<IST> (maxVal);
2702 const IST min = static_cast<IST> (minVal);
2703
2704 auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2705
2706 if (isConstantStride ()) {
2707 Kokkos::fill_random (thisView, rand_pool, min, max);
2708 }
2709 else {
2710 const size_t numVecs = getNumVectors ();
2711 for (size_t k = 0; k < numVecs; ++k) {
2712 const size_t col = whichVectors_[k];
2713 auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2714 Kokkos::fill_random (X_k, rand_pool, min, max);
2715 }
2716 }
2717 }
2718
2719 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2720 void
2722 putScalar (const Scalar& alpha)
2723 {
2726 using DES = typename dual_view_type::t_dev::execution_space;
2727 using HES = typename dual_view_type::t_host::execution_space;
2728 using LO = LocalOrdinal;
2729 ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2730
2731 // We need this cast for cases like Scalar = std::complex<T> but
2732 // impl_scalar_type = Kokkos::complex<T>.
2733 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2734 const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2735 const LO numVecs = static_cast<LO> (this->getNumVectors ());
2736
2737 // Modify the most recently updated version of the data. This
2738 // avoids sync'ing, which could violate users' expectations.
2739 //
2740 // If we need sync to device, then host has the most recent version.
2741 const bool runOnHost = runKernelOnHost(*this);
2742 if (! runOnHost) {
2743 auto X = this->getLocalViewDevice(Access::OverwriteAll);
2744 if (this->isConstantStride ()) {
2745 fill (DES (), X, theAlpha, lclNumRows, numVecs);
2746 }
2747 else {
2748 fill (DES (), X, theAlpha, lclNumRows, numVecs,
2749 this->whichVectors_.getRawPtr ());
2750 }
2751 }
2752 else { // last modified in host memory, so modify data there.
2753 auto X = this->getLocalViewHost(Access::OverwriteAll);
2754 if (this->isConstantStride ()) {
2755 fill (HES (), X, theAlpha, lclNumRows, numVecs);
2756 }
2757 else {
2758 fill (HES (), X, theAlpha, lclNumRows, numVecs,
2759 this->whichVectors_.getRawPtr ());
2760 }
2761 }
2762 }
2763
2764
2765 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2766 void
2768 replaceMap (const Teuchos::RCP<const map_type>& newMap)
2769 {
2770 using Teuchos::ArrayRCP;
2771 using Teuchos::Comm;
2772 using Teuchos::RCP;
2773 using ST = Scalar;
2774 using LO = LocalOrdinal;
2775 using GO = GlobalOrdinal;
2776
2777 // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2778 // it might work if the MV is a column view of another MV.
2779 // However, things might go wrong when restoring the original
2780 // Map, so we don't allow this case for now.
2781 TEUCHOS_TEST_FOR_EXCEPTION(
2782 ! this->isConstantStride (), std::logic_error,
2783 "Tpetra::MultiVector::replaceMap: This method does not currently work "
2784 "if the MultiVector is a column view of another MultiVector (that is, if "
2785 "isConstantStride() == false).");
2786
2787 // Case 1: current Map and new Map are both nonnull on this process.
2788 // Case 2: current Map is nonnull, new Map is null.
2789 // Case 3: current Map is null, new Map is nonnull.
2790 // Case 4: both Maps are null: forbidden.
2791 //
2792 // Case 1 means that we don't have to do anything on this process,
2793 // other than assign the new Map. (We always have to do that.)
2794 // It's an error for the user to supply a Map that requires
2795 // resizing in this case.
2796 //
2797 // Case 2 means that the calling process is in the current Map's
2798 // communicator, but will be excluded from the new Map's
2799 // communicator. We don't have to do anything on the calling
2800 // process; just leave whatever data it may have alone.
2801 //
2802 // Case 3 means that the calling process is excluded from the
2803 // current Map's communicator, but will be included in the new
2804 // Map's communicator. This means we need to (re)allocate the
2805 // local DualView if it does not have the right number of rows.
2806 // If the new number of rows is nonzero, we'll fill the newly
2807 // allocated local data with zeros, as befits a projection
2808 // operation.
2809 //
2810 // The typical use case for Case 3 is that the MultiVector was
2811 // first created with the Map with more processes, then that Map
2812 // was replaced with a Map with fewer processes, and finally the
2813 // original Map was restored on this call to replaceMap.
2814
2815 if (this->getMap ().is_null ()) { // current Map is null
2816 // If this->getMap() is null, that means that this MultiVector
2817 // has already had replaceMap happen to it. In that case, just
2818 // reallocate the DualView with the right size.
2819
2820 TEUCHOS_TEST_FOR_EXCEPTION(
2821 newMap.is_null (), std::invalid_argument,
2822 "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2823 "This probably means that the input Map is incorrect.");
2824
2825 // Case 3: current Map is null, new Map is nonnull.
2826 // Reallocate the DualView with the right dimensions.
2827 const size_t newNumRows = newMap->getLocalNumElements ();
2828 const size_t origNumRows = view_.extent (0);
2829 const size_t numCols = this->getNumVectors ();
2830
2831 if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2832 view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2833 }
2834 }
2835 else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2836 // I am an excluded process. Reinitialize my data so that I
2837 // have 0 rows. Keep the number of columns as before.
2838 const size_t newNumRows = static_cast<size_t> (0);
2839 const size_t numCols = this->getNumVectors ();
2840 view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2841 }
2842
2843 this->map_ = newMap;
2844 }
2845
2846 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2847 void
2849 scale (const Scalar& alpha)
2850 {
2851 using Kokkos::ALL;
2852 using IST = impl_scalar_type;
2853
2854 const IST theAlpha = static_cast<IST> (alpha);
2855 if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2856 return; // do nothing
2857 }
2858 const size_t lclNumRows = getLocalLength ();
2859 const size_t numVecs = getNumVectors ();
2860 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2861 const std::pair<size_t, size_t> colRng (0, numVecs);
2862
2863 // We can't substitute putScalar(0.0) for scale(0.0), because the
2864 // former will overwrite NaNs present in the MultiVector. The
2865 // semantics of this call require multiplying them by 0, which
2866 // IEEE 754 requires to be NaN.
2867
2868 // If we need sync to device, then host has the most recent version.
2869 const bool useHostVersion = need_sync_device ();
2870 if (useHostVersion) {
2871 auto Y_lcl = Kokkos::subview (getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2872 if (isConstantStride ()) {
2873 KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2874 }
2875 else {
2876 for (size_t k = 0; k < numVecs; ++k) {
2877 const size_t Y_col = whichVectors_[k];
2878 auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2879 KokkosBlas::scal (Y_k, theAlpha, Y_k);
2880 }
2881 }
2882 }
2883 else { // work on device
2884 auto Y_lcl = Kokkos::subview (getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2885 if (isConstantStride ()) {
2886 KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2887 }
2888 else {
2889 for (size_t k = 0; k < numVecs; ++k) {
2890 const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2891 auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2892 KokkosBlas::scal (Y_k, theAlpha, Y_k);
2893 }
2894 }
2895 }
2896 }
2897
2898
2899 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2900 void
2902 scale (const Teuchos::ArrayView<const Scalar>& alphas)
2903 {
2904 const size_t numVecs = this->getNumVectors ();
2905 const size_t numAlphas = static_cast<size_t> (alphas.size ());
2906 TEUCHOS_TEST_FOR_EXCEPTION(
2907 numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2908 "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2909 << numVecs << ".");
2910
2911 // Use a DualView to copy the scaling constants onto the device.
2912 using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2913 k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2914 k_alphas.modify_host ();
2915 for (size_t i = 0; i < numAlphas; ++i) {
2916 k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2917 }
2918 k_alphas.sync_device ();
2919 // Invoke the scale() overload that takes a device View of coefficients.
2920 this->scale (k_alphas.view_device ());
2921 }
2922
2923 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2924 void
2926 scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2927 {
2928 using Kokkos::ALL;
2929 using Kokkos::subview;
2930
2931 const size_t lclNumRows = this->getLocalLength ();
2932 const size_t numVecs = this->getNumVectors ();
2933 TEUCHOS_TEST_FOR_EXCEPTION(
2934 static_cast<size_t> (alphas.extent (0)) != numVecs,
2935 std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2936 "alphas.extent(0) = " << alphas.extent (0)
2937 << " != this->getNumVectors () = " << numVecs << ".");
2938 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2939 const std::pair<size_t, size_t> colRng (0, numVecs);
2940
2941 // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2942 // type of the return value of subview. This is because if we
2943 // switch the array layout from LayoutLeft to LayoutRight
2944 // (preferred for performance of block operations), the types
2945 // below won't be valid. (A view of a column of a LayoutRight
2946 // multivector has LayoutStride, not LayoutLeft.)
2947
2948 // If we need sync to device, then host has the most recent version.
2949 const bool useHostVersion = this->need_sync_device ();
2950 if (useHostVersion) {
2951 // Work in host memory. This means we need to create a host
2952 // mirror of the input View of coefficients.
2953 auto alphas_h = Kokkos::create_mirror_view (alphas);
2954 // DEEP_COPY REVIEW - NOT TESTED
2955 Kokkos::deep_copy (alphas_h, alphas);
2956
2957 auto Y_lcl = subview (this->getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2958 if (isConstantStride ()) {
2959 KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2960 }
2961 else {
2962 for (size_t k = 0; k < numVecs; ++k) {
2963 const size_t Y_col = this->isConstantStride () ? k :
2964 this->whichVectors_[k];
2965 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2966 // We don't have to use the entire 1-D View here; we can use
2967 // the version that takes a scalar coefficient.
2968 KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2969 }
2970 }
2971 }
2972 else { // Work in device memory, using the input View 'alphas' directly.
2973 auto Y_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2974 if (isConstantStride ()) {
2975 KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2976 }
2977 else {
2978 // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2979 // as values on host, so copy them to host. Another approach
2980 // would be to fix scal() so that it takes a 0-D View as the
2981 // second argument.
2982 auto alphas_h = Kokkos::create_mirror_view (alphas);
2983 // DEEP_COPY REVIEW - NOT TESTED
2984 Kokkos::deep_copy (alphas_h, alphas);
2985
2986 for (size_t k = 0; k < numVecs; ++k) {
2987 const size_t Y_col = this->isConstantStride () ? k :
2988 this->whichVectors_[k];
2989 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2990 KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2991 }
2992 }
2993 }
2994 }
2995
2996 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2997 void
2999 scale (const Scalar& alpha,
3001 {
3002 using Kokkos::ALL;
3003 using Kokkos::subview;
3004 const char tfecfFuncName[] = "scale: ";
3005
3006 const size_t lclNumRows = getLocalLength ();
3007 const size_t numVecs = getNumVectors ();
3008
3009 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3010 lclNumRows != A.getLocalLength (), std::invalid_argument,
3011 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
3012 << A.getLocalLength () << ".");
3013 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3014 numVecs != A.getNumVectors (), std::invalid_argument,
3015 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
3016 << A.getNumVectors () << ".");
3017
3018 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3019 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3020 const std::pair<size_t, size_t> colRng (0, numVecs);
3021
3022 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
3023 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
3024 auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
3025 auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
3026
3027 if (isConstantStride () && A.isConstantStride ()) {
3028 KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
3029 }
3030 else {
3031 // Make sure that Kokkos only uses the local length for add.
3032 for (size_t k = 0; k < numVecs; ++k) {
3033 const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3034 const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3035 auto Y_k = subview (Y_lcl, ALL (), Y_col);
3036 auto X_k = subview (X_lcl, ALL (), X_col);
3037
3038 KokkosBlas::scal (Y_k, theAlpha, X_k);
3039 }
3040 }
3041 }
3042
3043
3044
3045 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3046 void
3049 {
3050 const char tfecfFuncName[] = "reciprocal: ";
3051
3052 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3053 getLocalLength () != A.getLocalLength (), std::runtime_error,
3054 "MultiVectors do not have the same local length. "
3055 "this->getLocalLength() = " << getLocalLength ()
3056 << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3057 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3058 A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3059 ": MultiVectors do not have the same number of columns (vectors). "
3060 "this->getNumVectors() = " << getNumVectors ()
3061 << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3062
3063 const size_t numVecs = getNumVectors ();
3064
3065 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3066 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3067
3068 if (isConstantStride () && A.isConstantStride ()) {
3069 KokkosBlas::reciprocal (this_view_dev, A_view_dev);
3070 }
3071 else {
3072 using Kokkos::ALL;
3073 using Kokkos::subview;
3074 for (size_t k = 0; k < numVecs; ++k) {
3075 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3076 auto vector_k = subview (this_view_dev, ALL (), this_col);
3077 const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3078 auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3079 KokkosBlas::reciprocal (vector_k, vector_Ak);
3080 }
3081 }
3082 }
3083
3084 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3085 void
3088 {
3089 const char tfecfFuncName[] = "abs";
3090
3091 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3092 getLocalLength () != A.getLocalLength (), std::runtime_error,
3093 ": MultiVectors do not have the same local length. "
3094 "this->getLocalLength() = " << getLocalLength ()
3095 << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3096 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3097 A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3098 ": MultiVectors do not have the same number of columns (vectors). "
3099 "this->getNumVectors() = " << getNumVectors ()
3100 << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3101 const size_t numVecs = getNumVectors ();
3102
3103 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3104 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3105
3106 if (isConstantStride () && A.isConstantStride ()) {
3107 KokkosBlas::abs (this_view_dev, A_view_dev);
3108 }
3109 else {
3110 using Kokkos::ALL;
3111 using Kokkos::subview;
3112
3113 for (size_t k=0; k < numVecs; ++k) {
3114 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3115 auto vector_k = subview (this_view_dev, ALL (), this_col);
3116 const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3117 auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3118 KokkosBlas::abs (vector_k, vector_Ak);
3119 }
3120 }
3121 }
3122
3123 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3124 void
3126 update (const Scalar& alpha,
3128 const Scalar& beta)
3129 {
3130 const char tfecfFuncName[] = "update: ";
3131 using Kokkos::subview;
3132 using Kokkos::ALL;
3133
3134 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
3135
3136 const size_t lclNumRows = getLocalLength ();
3137 const size_t numVecs = getNumVectors ();
3138
3140 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3141 lclNumRows != A.getLocalLength (), std::invalid_argument,
3142 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
3143 << A.getLocalLength () << ".");
3144 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3145 numVecs != A.getNumVectors (), std::invalid_argument,
3146 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
3147 << A.getNumVectors () << ".");
3148 }
3149
3150 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3151 const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3152 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3153 const std::pair<size_t, size_t> colRng (0, numVecs);
3154
3155 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
3156 auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
3157 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
3158 auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
3159
3160 // The device memory of *this is about to be modified
3161 if (isConstantStride () && A.isConstantStride ()) {
3162 KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
3163 }
3164 else {
3165 // Make sure that Kokkos only uses the local length for add.
3166 for (size_t k = 0; k < numVecs; ++k) {
3167 const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3168 const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3169 auto Y_k = subview (Y_lcl, ALL (), Y_col);
3170 auto X_k = subview (X_lcl, ALL (), X_col);
3171
3172 KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
3173 }
3174 }
3175 }
3176
3177 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3178 void
3180 update (const Scalar& alpha,
3182 const Scalar& beta,
3184 const Scalar& gamma)
3185 {
3186 using Kokkos::ALL;
3187 using Kokkos::subview;
3188
3189 const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3190
3191 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3192
3193 const size_t lclNumRows = this->getLocalLength ();
3194 const size_t numVecs = getNumVectors ();
3195
3197 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3198 lclNumRows != A.getLocalLength (), std::invalid_argument,
3199 "The input MultiVector A has " << A.getLocalLength () << " local "
3200 "row(s), but this MultiVector has " << lclNumRows << " local "
3201 "row(s).");
3202 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3203 lclNumRows != B.getLocalLength (), std::invalid_argument,
3204 "The input MultiVector B has " << B.getLocalLength () << " local "
3205 "row(s), but this MultiVector has " << lclNumRows << " local "
3206 "row(s).");
3207 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3208 A.getNumVectors () != numVecs, std::invalid_argument,
3209 "The input MultiVector A has " << A.getNumVectors () << " column(s), "
3210 "but this MultiVector has " << numVecs << " column(s).");
3211 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3212 B.getNumVectors () != numVecs, std::invalid_argument,
3213 "The input MultiVector B has " << B.getNumVectors () << " column(s), "
3214 "but this MultiVector has " << numVecs << " column(s).");
3215 }
3216
3217 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3218 const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3219 const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
3220
3221 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3222 const std::pair<size_t, size_t> colRng (0, numVecs);
3223
3224 // Prefer 'auto' over specifying the type explicitly. This avoids
3225 // issues with a subview possibly having a different type than the
3226 // original view.
3227 auto C_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
3228 auto A_lcl = subview (A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3229 auto B_lcl = subview (B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3230
3231 if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
3232 KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3233 }
3234 else {
3235 // Some input (or *this) is not constant stride,
3236 // so perform the update one column at a time.
3237 for (size_t k = 0; k < numVecs; ++k) {
3238 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3239 const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
3240 const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
3241 KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
3242 theBeta, subview (B_lcl, rowRng, B_col),
3243 theGamma, subview (C_lcl, rowRng, this_col));
3244 }
3245 }
3246 }
3247
3248 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3249 Teuchos::ArrayRCP<const Scalar>
3251 getData (size_t j) const
3252 {
3253 using Kokkos::ALL;
3254 using IST = impl_scalar_type;
3255 const char tfecfFuncName[] = "getData: ";
3256
3257 // Any MultiVector method that called the (classic) Kokkos Node's
3258 // viewBuffer or viewBufferNonConst methods always implied a
3259 // device->host synchronization. Thus, we synchronize here as
3260 // well.
3261
3262 auto hostView = getLocalViewHost(Access::ReadOnly);
3263 const size_t col = isConstantStride () ? j : whichVectors_[j];
3264 auto hostView_j = Kokkos::subview (hostView, ALL (), col);
3265 Teuchos::ArrayRCP<const IST> dataAsArcp =
3266 Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3267
3268 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3269 (static_cast<size_t> (hostView_j.extent (0)) <
3270 static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3271 "hostView_j.extent(0) = " << hostView_j.extent (0)
3272 << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3273 "Please report this bug to the Tpetra developers.");
3274
3275 return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3276 }
3277
3278 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3279 Teuchos::ArrayRCP<Scalar>
3281 getDataNonConst (size_t j)
3282 {
3283 using Kokkos::ALL;
3284 using Kokkos::subview;
3285 using IST = impl_scalar_type;
3286 const char tfecfFuncName[] = "getDataNonConst: ";
3287
3288 auto hostView = getLocalViewHost(Access::ReadWrite);
3289 const size_t col = isConstantStride () ? j : whichVectors_[j];
3290 auto hostView_j = subview (hostView, ALL (), col);
3291 Teuchos::ArrayRCP<IST> dataAsArcp =
3292 Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3293
3294 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3295 (static_cast<size_t> (hostView_j.extent (0)) <
3296 static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3297 "hostView_j.extent(0) = " << hostView_j.extent (0)
3298 << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3299 "Please report this bug to the Tpetra developers.");
3300
3301 return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3302 }
3303
3304 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3305 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3307 subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3308 {
3309 using Teuchos::RCP;
3311
3312 // Check whether the index set in cols is contiguous. If it is,
3313 // use the more efficient Range1D version of subCopy.
3314 bool contiguous = true;
3315 const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3316 for (size_t j = 1; j < numCopyVecs; ++j) {
3317 if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3318 contiguous = false;
3319 break;
3320 }
3321 }
3322 if (contiguous && numCopyVecs > 0) {
3323 return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3324 }
3325 else {
3326 RCP<const MV> X_sub = this->subView (cols);
3327 RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3328 Y->assign (*X_sub);
3329 return Y;
3330 }
3331 }
3332
3333 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3334 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3336 subCopy (const Teuchos::Range1D &colRng) const
3337 {
3338 using Teuchos::RCP;
3340
3341 RCP<const MV> X_sub = this->subView (colRng);
3342 RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3343 Y->assign (*X_sub);
3344 return Y;
3345 }
3346
3347 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3348 size_t
3353
3354 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3355 size_t
3360
3361 template <class Scalar, class LO, class GO, class Node>
3364 const Teuchos::RCP<const map_type>& subMap,
3365 const local_ordinal_type rowOffset) :
3366 base_type (subMap)
3367 {
3368 using Kokkos::ALL;
3369 using Kokkos::subview;
3370 using Teuchos::outArg;
3371 using Teuchos::RCP;
3372 using Teuchos::rcp;
3373 using Teuchos::reduceAll;
3374 using Teuchos::REDUCE_MIN;
3375 using std::endl;
3377 const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3378 const char suffix[] = "Please report this bug to the Tpetra developers.";
3379 int lclGood = 1;
3380 int gblGood = 1;
3381 std::unique_ptr<std::ostringstream> errStrm;
3382 const bool debug = ::Tpetra::Details::Behavior::debug ();
3383 const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3384
3385 // Be careful to use the input Map's communicator, not X's. The
3386 // idea is that, on return, *this is a subview of X, using the
3387 // input Map.
3388 const auto comm = subMap->getComm ();
3389 TEUCHOS_ASSERT( ! comm.is_null () );
3390 const int myRank = comm->getRank ();
3391
3392 const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3393 const LO numCols = static_cast<LO> (X.getNumVectors ());
3394 const LO newNumRows = static_cast<LO> (subMap->getLocalNumElements ());
3395 if (verbose) {
3396 std::ostringstream os;
3397 os << "Proc " << myRank << ": " << prefix
3398 << "X: {lclNumRows: " << lclNumRowsBefore
3399 << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3400 << ", numCols: " << numCols << "}, "
3401 << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3402 std::cerr << os.str ();
3403 }
3404 // We ask for the _original_ number of rows in X, because X could
3405 // be a shorter (fewer rows) view of a longer MV. (For example, X
3406 // could be a domain Map view of a column Map MV.)
3407 const bool tooManyElts =
3408 newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3409 if (tooManyElts) {
3410 errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3411 *errStrm << " Proc " << myRank << ": subMap->getLocalNumElements() (="
3412 << newNumRows << ") + rowOffset (=" << rowOffset
3413 << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3414 << ")." << endl;
3415 lclGood = 0;
3416 TEUCHOS_TEST_FOR_EXCEPTION
3417 (! debug && tooManyElts, std::invalid_argument,
3418 prefix << errStrm->str () << suffix);
3419 }
3420
3421 if (debug) {
3422 reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3423 if (gblGood != 1) {
3424 std::ostringstream gblErrStrm;
3425 const std::string myErrStr =
3426 errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3427 ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3428 TEUCHOS_TEST_FOR_EXCEPTION
3429 (true, std::invalid_argument, gblErrStrm.str ());
3430 }
3431 }
3432
3433 using range_type = std::pair<LO, LO>;
3434 const range_type origRowRng
3435 (rowOffset, static_cast<LO> (X.view_.origExtent (0)));
3436 const range_type rowRng
3437 (rowOffset, rowOffset + newNumRows);
3438
3439 wrapped_dual_view_type newView = takeSubview (X.view_, rowRng, ALL ());
3440
3441 // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3442 // handling subviews of degenerate Views quite so well. For some
3443 // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3444 // 0. We work around by creating a new empty DualView of the
3445 // desired (degenerate) dimensions.
3446 if (newView.extent (0) == 0 &&
3447 newView.extent (1) != X.view_.extent (1)) {
3448 newView =
3449 allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3450 }
3451
3452 MV subViewMV = X.isConstantStride () ?
3453 MV (subMap, newView) :
3454 MV (subMap, newView, X.whichVectors_ ());
3455
3456 if (debug) {
3457 const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3458 const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3459 if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3460 lclGood = 0;
3461 if (errStrm.get () == nullptr) {
3462 errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3463 }
3464 *errStrm << " Proc " << myRank <<
3465 ": subMap.getLocalNumElements(): " << newNumRows <<
3466 ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3467 ", X.getNumVectors(): " << numCols <<
3468 ", subViewMV.getNumVectors(): " << numColsRet << endl;
3469 }
3470 reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3471 if (gblGood != 1) {
3472 std::ostringstream gblErrStrm;
3473 if (myRank == 0) {
3474 gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3475 "dimensions on one or more processes:" << endl;
3476 }
3477 const std::string myErrStr =
3478 errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3479 ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3480 gblErrStrm << suffix << endl;
3481 TEUCHOS_TEST_FOR_EXCEPTION
3482 (true, std::invalid_argument, gblErrStrm.str ());
3483 }
3484 }
3485
3486 if (verbose) {
3487 std::ostringstream os;
3488 os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3489 std::cerr << os.str ();
3490 }
3491
3492 *this = subViewMV; // shallow copy
3493
3494 if (verbose) {
3495 std::ostringstream os;
3496 os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3497 std::cerr << os.str ();
3498 }
3499 }
3500
3501 template <class Scalar, class LO, class GO, class Node>
3504 const map_type& subMap,
3505 const size_t rowOffset) :
3506 MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3507 static_cast<local_ordinal_type> (rowOffset))
3508 {}
3509
3510 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3511 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3513 offsetView (const Teuchos::RCP<const map_type>& subMap,
3514 const size_t offset) const
3515 {
3517 return Teuchos::rcp (new MV (*this, *subMap, offset));
3518 }
3519
3520 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3521 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3523 offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3524 const size_t offset)
3525 {
3527 return Teuchos::rcp (new MV (*this, *subMap, offset));
3528 }
3529
3530 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3531 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3533 subView (const Teuchos::ArrayView<const size_t>& cols) const
3534 {
3535 using Teuchos::Array;
3536 using Teuchos::rcp;
3538
3539 const size_t numViewCols = static_cast<size_t> (cols.size ());
3540 TEUCHOS_TEST_FOR_EXCEPTION(
3541 numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3542 "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3543 "contain at least one entry, but cols.size() = " << cols.size ()
3544 << " == 0.");
3545
3546 // Check whether the index set in cols is contiguous. If it is,
3547 // use the more efficient Range1D version of subView.
3548 bool contiguous = true;
3549 for (size_t j = 1; j < numViewCols; ++j) {
3550 if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3551 contiguous = false;
3552 break;
3553 }
3554 }
3555 if (contiguous) {
3556 if (numViewCols == 0) {
3557 // The output MV has no columns, so there is nothing to view.
3558 return rcp (new MV (this->getMap (), numViewCols));
3559 } else {
3560 // Use the more efficient contiguous-index-range version.
3561 return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3562 }
3563 }
3564
3565 if (isConstantStride ()) {
3566 return rcp (new MV (this->getMap (), view_, cols));
3567 }
3568 else {
3569 Array<size_t> newcols (cols.size ());
3570 for (size_t j = 0; j < numViewCols; ++j) {
3571 newcols[j] = whichVectors_[cols[j]];
3572 }
3573 return rcp (new MV (this->getMap (), view_, newcols ()));
3574 }
3575 }
3576
3577
3578 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3579 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3581 subView (const Teuchos::Range1D& colRng) const
3582 {
3584 using Kokkos::ALL;
3585 using Kokkos::subview;
3586 using Teuchos::Array;
3587 using Teuchos::RCP;
3588 using Teuchos::rcp;
3590 const char tfecfFuncName[] = "subView(Range1D): ";
3591
3592 const size_t lclNumRows = this->getLocalLength ();
3593 const size_t numVecs = this->getNumVectors ();
3594 // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3595 // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3596 // "at least one vector.");
3597 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3598 static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3599 "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3600 << numVecs << ".");
3601 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3602 numVecs != 0 && colRng.size () != 0 &&
3603 (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3604 static_cast<size_t> (colRng.ubound ()) >= numVecs),
3605 std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3606 "," << colRng.ubound () << "] exceeds the valid range of column indices "
3607 "[0, " << numVecs << "].");
3608
3609 RCP<const MV> X_ret; // the MultiVector subview to return
3610
3611 // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3612 // broken for the case of views with zero rows. I will brutally
3613 // enforce that the subview has the correct dimensions. In
3614 // particular, in the case of zero rows, I will, if necessary,
3615 // create a new dual_view_type with zero rows and the correct
3616 // number of columns. In a debug build, I will use an all-reduce
3617 // to ensure that it has the correct dimensions on all processes.
3618
3619 const std::pair<size_t, size_t> rows (0, lclNumRows);
3620 if (colRng.size () == 0) {
3621 const std::pair<size_t, size_t> cols (0, 0); // empty range
3622 wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3623 X_ret = rcp (new MV (this->getMap (), X_sub));
3624 }
3625 else {
3626 // Returned MultiVector is constant stride only if *this is.
3627 if (isConstantStride ()) {
3628 const std::pair<size_t, size_t> cols (colRng.lbound (),
3629 colRng.ubound () + 1);
3630 wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3631 X_ret = rcp (new MV (this->getMap (), X_sub));
3632 }
3633 else {
3634 if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3635 // We're only asking for one column, so the result does have
3636 // constant stride, even though this MultiVector does not.
3637 const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3638 whichVectors_[0] + colRng.ubound () + 1);
3639 wrapped_dual_view_type X_sub = takeSubview (view_, ALL (), col);
3640 X_ret = rcp (new MV (this->getMap (), X_sub));
3641 }
3642 else {
3643 Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3644 whichVectors_.begin () + colRng.ubound () + 1);
3645 X_ret = rcp (new MV (this->getMap (), view_, which));
3646 }
3647 }
3648 }
3649
3650 const bool debug = Behavior::debug ();
3651 if (debug) {
3652 using Teuchos::Comm;
3653 using Teuchos::outArg;
3654 using Teuchos::REDUCE_MIN;
3655 using Teuchos::reduceAll;
3656
3657 RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3658 Teuchos::null : this->getMap ()->getComm ();
3659 if (! comm.is_null ()) {
3660 int lclSuccess = 1;
3661 int gblSuccess = 1;
3662
3663 if (X_ret.is_null ()) {
3664 lclSuccess = 0;
3665 }
3666 reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3667 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3668 (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3669 "MultiVector; the return value of this method) is null on some MPI "
3670 "process in this MultiVector's communicator. This should never "
3671 "happen. Please report this bug to the Tpetra developers.");
3672 if (! X_ret.is_null () &&
3673 X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3674 lclSuccess = 0;
3675 }
3676 reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3677 outArg (gblSuccess));
3678 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3679 (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3680 "colRng.size(), on at least one MPI process in this MultiVector's "
3681 "communicator. This should never happen. "
3682 "Please report this bug to the Tpetra developers.");
3683 }
3684 }
3685 return X_ret;
3686 }
3687
3688
3689 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3690 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3692 subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3693 {
3695 return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3696 }
3697
3698
3699 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3700 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3702 subViewNonConst (const Teuchos::Range1D &colRng)
3703 {
3705 return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3706 }
3707
3708
3709 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3712 const size_t j)
3713 : base_type (X.getMap ())
3714 {
3715 using Kokkos::subview;
3716 typedef std::pair<size_t, size_t> range_type;
3717 const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3718
3719 const size_t numCols = X.getNumVectors ();
3720 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3721 (j >= numCols, std::invalid_argument, "Input index j (== " << j
3722 << ") exceeds valid column index range [0, " << numCols << " - 1].");
3723 const size_t jj = X.isConstantStride () ?
3724 static_cast<size_t> (j) :
3725 static_cast<size_t> (X.whichVectors_[j]);
3726 this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3727
3728 // mfh 31 Jul 2017: It would be unwise to execute concurrent
3729 // Export or Import operations with different subviews of a
3730 // MultiVector. Thus, it is safe to reuse communication buffers.
3731 // See #1560 discussion.
3732 //
3733 // We only need one column's worth of buffer for imports_ and
3734 // exports_. Taking subviews now ensures that their lengths will
3735 // be exactly what we need, so we won't have to resize them later.
3736 {
3737 const size_t newSize = X.imports_.extent (0) / numCols;
3738 const size_t offset = jj*newSize;
3739 auto newImports = X.imports_;
3740 newImports.d_view = subview (X.imports_.d_view,
3741 range_type (offset, offset+newSize));
3742 newImports.h_view = subview (X.imports_.h_view,
3743 range_type (offset, offset+newSize));
3744 this->imports_ = newImports;
3745 }
3746 {
3747 const size_t newSize = X.exports_.extent (0) / numCols;
3748 const size_t offset = jj*newSize;
3749 auto newExports = X.exports_;
3750 newExports.d_view = subview (X.exports_.d_view,
3751 range_type (offset, offset+newSize));
3752 newExports.h_view = subview (X.exports_.h_view,
3753 range_type (offset, offset+newSize));
3754 this->exports_ = newExports;
3755 }
3756 // These two DualViews already either have the right number of
3757 // entries, or zero entries. This means that we don't need to
3758 // resize them.
3761 }
3762
3763
3764 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3765 Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3767 getVector (const size_t j) const
3768 {
3770 return Teuchos::rcp (new V (*this, j));
3771 }
3772
3773
3774 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3775 Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3777 getVectorNonConst (const size_t j)
3778 {
3780 return Teuchos::rcp (new V (*this, j));
3781 }
3782
3783
3784 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3785 void
3787 get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3788 {
3789 using IST = impl_scalar_type;
3790 using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3791 Kokkos::HostSpace,
3792 Kokkos::MemoryUnmanaged>;
3793 const char tfecfFuncName[] = "get1dCopy: ";
3794
3795 const size_t numRows = this->getLocalLength ();
3796 const size_t numCols = this->getNumVectors ();
3797
3798 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3799 (LDA < numRows, std::runtime_error,
3800 "LDA = " << LDA << " < numRows = " << numRows << ".");
3801 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3802 (numRows > size_t (0) && numCols > size_t (0) &&
3803 size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3804 std::runtime_error,
3805 "A.size() = " << A.size () << ", but its size must be at least "
3806 << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3807
3808 const std::pair<size_t, size_t> rowRange (0, numRows);
3809 const std::pair<size_t, size_t> colRange (0, numCols);
3810
3811 input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3812 LDA, numCols);
3813 auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3814
3826 if (this->need_sync_host() && this->need_sync_device()) {
3829 throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside");
3830 }
3831 else {
3832 const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count();
3833 if (this->isConstantStride ()) {
3834 if (useHostView) {
3835 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3836 // DEEP_COPY REVIEW - NOT TESTED
3837 Kokkos::deep_copy (A_view, srcView_host);
3838 } else {
3839 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3840 // DEEP_COPY REVIEW - NOT TESTED
3841 Kokkos::deep_copy (A_view, srcView_device);
3842 }
3843 }
3844 else {
3845 for (size_t j = 0; j < numCols; ++j) {
3846 const size_t srcCol = this->whichVectors_[j];
3847 auto dstColView = Kokkos::subview (A_view, rowRange, j);
3848
3849 if (useHostView) {
3850 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3851 auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3852 // DEEP_COPY REVIEW - NOT TESTED
3853 Kokkos::deep_copy (dstColView, srcColView_host);
3854 } else {
3855 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3856 auto srcColView_device = Kokkos::subview (srcView_device, rowRange, srcCol);
3857 // DEEP_COPY REVIEW - NOT TESTED
3858 Kokkos::deep_copy (dstColView, srcColView_device);
3859 }
3860 }
3861 }
3862 }
3863 }
3864
3865
3866 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3867 void
3869 get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3870 {
3872 const char tfecfFuncName[] = "get2dCopy: ";
3873 const size_t numRows = this->getLocalLength ();
3874 const size_t numCols = this->getNumVectors ();
3875
3876 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3877 static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3878 std::runtime_error, "Input array of pointers must contain as many "
3879 "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3880 << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3881
3882 if (numRows != 0 && numCols != 0) {
3883 // No side effects until we've validated the input.
3884 for (size_t j = 0; j < numCols; ++j) {
3885 const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3886 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3887 dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3888 "the input array of arrays is not long enough to fit all entries in "
3889 "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3890 << " < getLocalLength() = " << numRows << ".");
3891 }
3892
3893 // We've validated the input, so it's safe to start copying.
3894 for (size_t j = 0; j < numCols; ++j) {
3895 Teuchos::RCP<const V> X_j = this->getVector (j);
3896 const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3897 X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3898 }
3899 }
3900 }
3901
3902
3903 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3904 Teuchos::ArrayRCP<const Scalar>
3906 get1dView () const
3907 {
3908 if (getLocalLength () == 0 || getNumVectors () == 0) {
3909 return Teuchos::null;
3910 } else {
3911 TEUCHOS_TEST_FOR_EXCEPTION(
3912 ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3913 "get1dView: This MultiVector does not have constant stride, so it is "
3914 "not possible to view its data as a single array. You may check "
3915 "whether a MultiVector has constant stride by calling "
3916 "isConstantStride().");
3917 // Since get1dView() is and was always marked const, I have to
3918 // cast away const here in order not to break backwards
3919 // compatibility.
3920 auto X_lcl = getLocalViewHost(Access::ReadOnly);
3921 Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3922 Kokkos::Compat::persistingView (X_lcl);
3923 return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3924 }
3925 }
3926
3927 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3928 Teuchos::ArrayRCP<Scalar>
3931 {
3932 if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3933 return Teuchos::null;
3934 }
3935 else {
3936 TEUCHOS_TEST_FOR_EXCEPTION
3937 (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3938 "get1dViewNonConst: This MultiVector does not have constant stride, "
3939 "so it is not possible to view its data as a single array. You may "
3940 "check whether a MultiVector has constant stride by calling "
3941 "isConstantStride().");
3942 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3943 Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3944 Kokkos::Compat::persistingView (X_lcl);
3945 return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3946 }
3947 }
3948
3949 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3950 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3953 {
3954 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3955
3956 // Don't use the row range here on the outside, in order to avoid
3957 // a strided return type (in case Kokkos::subview is conservative
3958 // about that). Instead, use the row range for the column views
3959 // in the loop.
3960 const size_t myNumRows = this->getLocalLength ();
3961 const size_t numCols = this->getNumVectors ();
3962 const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3963
3964 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3965 for (size_t j = 0; j < numCols; ++j) {
3966 const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3967 auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3968 Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3969 Kokkos::Compat::persistingView (X_lcl_j);
3970 views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3971 }
3972 return views;
3973 }
3974
3975 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3978 getLocalViewHost(Access::ReadOnlyStruct s) const
3979 {
3980 return view_.getHostView(s);
3981 }
3982
3983 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3986 getLocalViewHost(Access::ReadWriteStruct s)
3987 {
3988 return view_.getHostView(s);
3989 }
3990
3991 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3994 getLocalViewHost(Access::OverwriteAllStruct s)
3995 {
3996 return view_.getHostView(s);
3997 }
3998
3999 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4002 getLocalViewDevice(Access::ReadOnlyStruct s) const
4003 {
4004 return view_.getDeviceView(s);
4005 }
4006
4007 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4010 getLocalViewDevice(Access::ReadWriteStruct s)
4011 {
4012 return view_.getDeviceView(s);
4013 }
4014
4015 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4018 getLocalViewDevice(Access::OverwriteAllStruct s)
4019 {
4020 return view_.getDeviceView(s);
4021 }
4022
4023 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4024 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
4026 get2dView () const
4027 {
4028 // Since get2dView() is and was always marked const, I have to
4029 // cast away const here in order not to break backwards
4030 // compatibility.
4031 auto X_lcl = getLocalViewHost(Access::ReadOnly);
4032
4033 // Don't use the row range here on the outside, in order to avoid
4034 // a strided return type (in case Kokkos::subview is conservative
4035 // about that). Instead, use the row range for the column views
4036 // in the loop.
4037 const size_t myNumRows = this->getLocalLength ();
4038 const size_t numCols = this->getNumVectors ();
4039 const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
4040
4041 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
4042 for (size_t j = 0; j < numCols; ++j) {
4043 const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
4044 auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
4045 Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
4046 Kokkos::Compat::persistingView (X_lcl_j);
4047 views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
4048 }
4049 return views;
4050 }
4051
4052 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4053 void
4055 multiply (Teuchos::ETransp transA,
4056 Teuchos::ETransp transB,
4057 const Scalar& alpha,
4060 const Scalar& beta)
4061 {
4063 using Teuchos::CONJ_TRANS;
4064 using Teuchos::NO_TRANS;
4065 using Teuchos::TRANS;
4066 using Teuchos::RCP;
4067 using Teuchos::rcp;
4068 using Teuchos::rcpFromRef;
4069 using std::endl;
4070 using ATS = Kokkos::ArithTraits<impl_scalar_type>;
4071 using LO = local_ordinal_type;
4072 using STS = Teuchos::ScalarTraits<Scalar>;
4074 const char tfecfFuncName[] = "multiply: ";
4075 ProfilingRegion region ("Tpetra::MV::multiply");
4076
4077 // This routine performs a variety of matrix-matrix multiply
4078 // operations, interpreting the MultiVector (this-aka C , A and B)
4079 // as 2D matrices. Variations are due to the fact that A, B and C
4080 // can be locally replicated or globally distributed MultiVectors
4081 // and that we may or may not operate with the transpose of A and
4082 // B. Possible cases are:
4083 //
4084 // Operations # Cases Notes
4085 // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
4086 // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
4087 // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
4088 //
4089 // The following operations are not meaningful for 1-D
4090 // distributions:
4091 //
4092 // u1) C(local) = A^T(distr) * B^T(distr) 1
4093 // u2) C(local) = A (distr) * B^X(distr) 2
4094 // u3) C(distr) = A^X(local) * B^X(local) 4
4095 // u4) C(distr) = A^X(local) * B^X(distr) 4
4096 // u5) C(distr) = A^T(distr) * B^X(local) 2
4097 // u6) C(local) = A^X(distr) * B^X(local) 4
4098 // u7) C(distr) = A^X(distr) * B^X(local) 4
4099 // u8) C(local) = A^X(local) * B^X(distr) 4
4100 //
4101 // Total number of cases: 32 (= 2^5).
4102
4103 impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
4104
4105 const bool A_is_local = ! A.isDistributed ();
4106 const bool B_is_local = ! B.isDistributed ();
4107 const bool C_is_local = ! this->isDistributed ();
4108
4109 // In debug mode, check compatibility of local dimensions. We
4110 // only do this in debug mode, since it requires an all-reduce
4111 // to ensure correctness on all processses. It's entirely
4112 // possible that only some processes may have incompatible local
4113 // dimensions. Throwing an exception only on those processes
4114 // could cause this method to hang.
4115 const bool debug = ::Tpetra::Details::Behavior::debug ();
4116 if (debug) {
4117 auto myMap = this->getMap ();
4118 if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
4119 using Teuchos::REDUCE_MIN;
4120 using Teuchos::reduceAll;
4121 using Teuchos::outArg;
4122
4123 auto comm = myMap->getComm ();
4124 const size_t A_nrows =
4125 (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
4126 const size_t A_ncols =
4127 (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
4128 const size_t B_nrows =
4129 (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
4130 const size_t B_ncols =
4131 (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
4132
4133 const bool lclBad = this->getLocalLength () != A_nrows ||
4134 this->getNumVectors () != B_ncols || A_ncols != B_nrows;
4135
4136 const int myRank = comm->getRank ();
4137 std::ostringstream errStrm;
4138 if (this->getLocalLength () != A_nrows) {
4139 errStrm << "Proc " << myRank << ": this->getLocalLength()="
4140 << this->getLocalLength () << " != A_nrows=" << A_nrows
4141 << "." << std::endl;
4142 }
4143 if (this->getNumVectors () != B_ncols) {
4144 errStrm << "Proc " << myRank << ": this->getNumVectors()="
4145 << this->getNumVectors () << " != B_ncols=" << B_ncols
4146 << "." << std::endl;
4147 }
4148 if (A_ncols != B_nrows) {
4149 errStrm << "Proc " << myRank << ": A_ncols="
4150 << A_ncols << " != B_nrows=" << B_nrows
4151 << "." << std::endl;
4152 }
4153
4154 const int lclGood = lclBad ? 0 : 1;
4155 int gblGood = 0;
4156 reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
4157 if (gblGood != 1) {
4158 std::ostringstream os;
4159 ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
4160
4161 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4162 (true, std::runtime_error, "Inconsistent local dimensions on at "
4163 "least one process in this object's communicator." << std::endl
4164 << "Operation: "
4165 << "C(" << (C_is_local ? "local" : "distr") << ") = "
4166 << alpha << "*A"
4167 << (transA == Teuchos::TRANS ? "^T" :
4168 (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4169 << "(" << (A_is_local ? "local" : "distr") << ") * "
4170 << beta << "*B"
4171 << (transA == Teuchos::TRANS ? "^T" :
4172 (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4173 << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
4174 << "Global dimensions: C(" << this->getGlobalLength () << ", "
4175 << this->getNumVectors () << "), A(" << A.getGlobalLength ()
4176 << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
4177 << ", " << B.getNumVectors () << ")." << std::endl
4178 << os.str ());
4179 }
4180 }
4181 }
4182
4183 // Case 1: C(local) = A^X(local) * B^X(local)
4184 const bool Case1 = C_is_local && A_is_local && B_is_local;
4185 // Case 2: C(local) = A^T(distr) * B (distr)
4186 const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
4187 transA != NO_TRANS &&
4188 transB == NO_TRANS;
4189 // Case 3: C(distr) = A (distr) * B^X(local)
4190 const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
4191 transA == NO_TRANS;
4192
4193 // Test that we are considering a meaningful case
4194 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4195 (! Case1 && ! Case2 && ! Case3, std::runtime_error,
4196 "Multiplication of op(A) and op(B) into *this is not a "
4197 "supported use case.");
4198
4199 if (beta != STS::zero () && Case2) {
4200 // If Case2, then C is local and contributions must be summed
4201 // across all processes. However, if beta != 0, then accumulate
4202 // beta*C into the sum. When summing across all processes, we
4203 // only want to accumulate this once, so set beta == 0 on all
4204 // processes except Process 0.
4205 const int myRank = this->getMap ()->getComm ()->getRank ();
4206 if (myRank != 0) {
4207 beta_local = ATS::zero ();
4208 }
4209 }
4210
4211 // We only know how to do matrix-matrix multiplies if all the
4212 // MultiVectors have constant stride. If not, we have to make
4213 // temporary copies of those MultiVectors (including possibly
4214 // *this) that don't have constant stride.
4215 RCP<MV> C_tmp;
4216 if (! isConstantStride ()) {
4217 C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
4218 } else {
4219 C_tmp = rcp (this, false);
4220 }
4221
4222 RCP<const MV> A_tmp;
4223 if (! A.isConstantStride ()) {
4224 A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
4225 } else {
4226 A_tmp = rcpFromRef (A);
4227 }
4228
4229 RCP<const MV> B_tmp;
4230 if (! B.isConstantStride ()) {
4231 B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
4232 } else {
4233 B_tmp = rcpFromRef (B);
4234 }
4235
4236 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4237 (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
4238 ! A_tmp->isConstantStride (), std::logic_error,
4239 "Failed to make temporary constant-stride copies of MultiVectors.");
4240
4241 {
4242 const LO A_lclNumRows = A_tmp->getLocalLength ();
4243 const LO A_numVecs = A_tmp->getNumVectors ();
4244 auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
4245 auto A_sub = Kokkos::subview (A_lcl,
4246 std::make_pair (LO (0), A_lclNumRows),
4247 std::make_pair (LO (0), A_numVecs));
4248
4249
4250 const LO B_lclNumRows = B_tmp->getLocalLength ();
4251 const LO B_numVecs = B_tmp->getNumVectors ();
4252 auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
4253 auto B_sub = Kokkos::subview (B_lcl,
4254 std::make_pair (LO (0), B_lclNumRows),
4255 std::make_pair (LO (0), B_numVecs));
4256
4257 const LO C_lclNumRows = C_tmp->getLocalLength ();
4258 const LO C_numVecs = C_tmp->getNumVectors ();
4259
4260 auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
4261 auto C_sub = Kokkos::subview (C_lcl,
4262 std::make_pair (LO (0), C_lclNumRows),
4263 std::make_pair (LO (0), C_numVecs));
4264 const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4265 (transA == Teuchos::TRANS ? 'T' : 'C'));
4266 const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4267 (transB == Teuchos::TRANS ? 'T' : 'C'));
4268 const impl_scalar_type alpha_IST (alpha);
4269
4270 ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4271
4272 KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4273 beta_local, C_sub);
4274 }
4275
4276 if (! isConstantStride ()) {
4277 ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
4278 }
4279
4280 // Dispose of (possibly) extra copies of A and B.
4281 A_tmp = Teuchos::null;
4282 B_tmp = Teuchos::null;
4283
4284 // If Case 2 then sum up *this and distribute it to all processes.
4285 if (Case2) {
4286 this->reduce ();
4287 }
4288 }
4289
4290 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4291 void
4293 elementWiseMultiply (Scalar scalarAB,
4296 Scalar scalarThis)
4297 {
4298 using Kokkos::ALL;
4299 using Kokkos::subview;
4300 const char tfecfFuncName[] = "elementWiseMultiply: ";
4301
4302 const size_t lclNumRows = this->getLocalLength ();
4303 const size_t numVecs = this->getNumVectors ();
4304
4305 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4306 (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4307 std::runtime_error, "MultiVectors do not have the same local length.");
4308 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4309 numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4310 "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4311 << ".");
4312
4313 auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4314 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4315 auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4316
4317 if (isConstantStride () && B.isConstantStride ()) {
4318 // A is just a Vector; it only has one column, so it always has
4319 // constant stride.
4320 //
4321 // If both *this and B have constant stride, we can do an
4322 // element-wise multiply on all columns at once.
4323 KokkosBlas::mult (scalarThis,
4324 this_view,
4325 scalarAB,
4326 subview (A_view, ALL (), 0),
4327 B_view);
4328 }
4329 else {
4330 for (size_t j = 0; j < numVecs; ++j) {
4331 const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4332 const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4333 KokkosBlas::mult (scalarThis,
4334 subview (this_view, ALL (), C_col),
4335 scalarAB,
4336 subview (A_view, ALL (), 0),
4337 subview (B_view, ALL (), B_col));
4338 }
4339 }
4340 }
4341
4342 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4343 void
4345 reduce ()
4346 {
4349 ProfilingRegion region ("Tpetra::MV::reduce");
4350
4351 const auto map = this->getMap ();
4352 if (map.get () == nullptr) {
4353 return;
4354 }
4355 const auto comm = map->getComm ();
4356 if (comm.get () == nullptr) {
4357 return;
4358 }
4359
4360 // Avoid giving device buffers to MPI. Even if MPI handles them
4361 // correctly, doing so may not perform well.
4362 const bool changed_on_device = this->need_sync_host ();
4363 if (changed_on_device) {
4364 // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4365 // and host will be separate allocations. In that case, it may
4366 // pay to do the all-reduce from device to host.
4367 Kokkos::fence(); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4368 auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4369 allReduceView (X_lcl, X_lcl, *comm);
4370 }
4371 else {
4372 auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4373 allReduceView (X_lcl, X_lcl, *comm);
4374 }
4375 }
4376
4377 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4378 void
4380 replaceLocalValue (const LocalOrdinal lclRow,
4381 const size_t col,
4382 const impl_scalar_type& ScalarValue)
4383 {
4385 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4386 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4387 TEUCHOS_TEST_FOR_EXCEPTION(
4388 lclRow < minLocalIndex || lclRow > maxLocalIndex,
4389 std::runtime_error,
4390 "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4391 << " is invalid. The range of valid row indices on this process "
4392 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4393 << ", " << maxLocalIndex << "].");
4394 TEUCHOS_TEST_FOR_EXCEPTION(
4395 vectorIndexOutOfRange(col),
4396 std::runtime_error,
4397 "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4398 << " of the multivector is invalid.");
4399 }
4400
4401 auto view = this->getLocalViewHost(Access::ReadWrite);
4402 const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4403 view (lclRow, colInd) = ScalarValue;
4404 }
4405
4406
4407 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4408 void
4410 sumIntoLocalValue (const LocalOrdinal lclRow,
4411 const size_t col,
4412 const impl_scalar_type& value,
4413 const bool atomic)
4414 {
4416 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4417 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4418 TEUCHOS_TEST_FOR_EXCEPTION(
4419 lclRow < minLocalIndex || lclRow > maxLocalIndex,
4420 std::runtime_error,
4421 "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4422 << " is invalid. The range of valid row indices on this process "
4423 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4424 << ", " << maxLocalIndex << "].");
4425 TEUCHOS_TEST_FOR_EXCEPTION(
4426 vectorIndexOutOfRange(col),
4427 std::runtime_error,
4428 "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4429 << " of the multivector is invalid.");
4430 }
4431
4432 const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4433
4434 auto view = this->getLocalViewHost(Access::ReadWrite);
4435 if (atomic) {
4436 Kokkos::atomic_add (& (view(lclRow, colInd)), value);
4437 }
4438 else {
4439 view(lclRow, colInd) += value;
4440 }
4441 }
4442
4443
4444 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4445 void
4447 replaceGlobalValue (const GlobalOrdinal gblRow,
4448 const size_t col,
4449 const impl_scalar_type& ScalarValue)
4450 {
4451 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4452 // touches the RCP's reference count, which isn't thread safe.
4453 const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4454
4456 const char tfecfFuncName[] = "replaceGlobalValue: ";
4457 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4458 (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4459 std::runtime_error,
4460 "Global row index " << gblRow << " is not present on this process "
4461 << this->getMap ()->getComm ()->getRank () << ".");
4462 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4463 (this->vectorIndexOutOfRange (col), std::runtime_error,
4464 "Vector index " << col << " of the MultiVector is invalid.");
4465 }
4466
4467 this->replaceLocalValue (lclRow, col, ScalarValue);
4468 }
4469
4470 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4471 void
4473 sumIntoGlobalValue (const GlobalOrdinal globalRow,
4474 const size_t col,
4475 const impl_scalar_type& value,
4476 const bool atomic)
4477 {
4478 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4479 // touches the RCP's reference count, which isn't thread safe.
4480 const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4481
4483 TEUCHOS_TEST_FOR_EXCEPTION(
4484 lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4485 std::runtime_error,
4486 "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4487 << " is not present on this process "
4488 << this->getMap ()->getComm ()->getRank () << ".");
4489 TEUCHOS_TEST_FOR_EXCEPTION(
4490 vectorIndexOutOfRange(col),
4491 std::runtime_error,
4492 "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4493 << " of the multivector is invalid.");
4494 }
4495
4496 this->sumIntoLocalValue (lclRow, col, value, atomic);
4497 }
4498
4499 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4500 template <class T>
4501 Teuchos::ArrayRCP<T>
4503 getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4504 size_t j) const
4505 {
4506 typedef Kokkos::DualView<impl_scalar_type*,
4507 typename dual_view_type::array_layout,
4508 execution_space> col_dual_view_type;
4509 const size_t col = isConstantStride () ? j : whichVectors_[j];
4510 col_dual_view_type X_col =
4511 Kokkos::subview (view_, Kokkos::ALL (), col);
4512 return Kokkos::Compat::persistingView (X_col.d_view);
4513 }
4514
4515 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4516 bool
4518 need_sync_host () const {
4519 return view_.need_sync_host ();
4520 }
4521
4522 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4523 bool
4525 need_sync_device () const {
4526 return view_.need_sync_device ();
4527 }
4528
4529 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4530 std::string
4532 descriptionImpl (const std::string& className) const
4533 {
4534 using Teuchos::TypeNameTraits;
4535
4536 std::ostringstream out;
4537 out << "\"" << className << "\": {";
4538 out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4539 << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4540 << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4541 << ", Node" << Node::name ()
4542 << "}, ";
4543 if (this->getObjectLabel () != "") {
4544 out << "Label: \"" << this->getObjectLabel () << "\", ";
4545 }
4546 out << ", numRows: " << this->getGlobalLength ();
4547 if (className != "Tpetra::Vector") {
4548 out << ", numCols: " << this->getNumVectors ()
4549 << ", isConstantStride: " << this->isConstantStride ();
4550 }
4551 if (this->isConstantStride ()) {
4552 out << ", columnStride: " << this->getStride ();
4553 }
4554 out << "}";
4555
4556 return out.str ();
4557 }
4558
4559 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4560 std::string
4562 description () const
4563 {
4564 return this->descriptionImpl ("Tpetra::MultiVector");
4565 }
4566
4567 namespace Details
4568 {
4569 template<typename ViewType>
4570 void print_vector(Teuchos::FancyOStream & out, const char* prefix, const ViewType& v)
4571 {
4572 using std::endl;
4573 static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4574 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4575 static_assert(ViewType::rank == 2,
4576 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4577 // The square braces [] and their contents are in Matlab
4578 // format, so users may copy and paste directly into Matlab.
4579 out << "Values("<<prefix<<"): " << std::endl
4580 << "[";
4581 const size_t numRows = v.extent(0);
4582 const size_t numCols = v.extent(1);
4583 if (numCols == 1) {
4584 for (size_t i = 0; i < numRows; ++i) {
4585 out << v(i,0);
4586 if (i + 1 < numRows) {
4587 out << "; ";
4588 }
4589 }
4590 }
4591 else {
4592 for (size_t i = 0; i < numRows; ++i) {
4593 for (size_t j = 0; j < numCols; ++j) {
4594 out << v(i,j);
4595 if (j + 1 < numCols) {
4596 out << ", ";
4597 }
4598 }
4599 if (i + 1 < numRows) {
4600 out << "; ";
4601 }
4602 }
4603 }
4604 out << "]" << endl;
4605 }
4606 }
4607
4608 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4609 std::string
4611 localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4612 {
4613 typedef LocalOrdinal LO;
4614 using std::endl;
4615
4616 if (vl <= Teuchos::VERB_LOW) {
4617 return std::string ();
4618 }
4619 auto map = this->getMap ();
4620 if (map.is_null ()) {
4621 return std::string ();
4622 }
4623 auto outStringP = Teuchos::rcp (new std::ostringstream ());
4624 auto outp = Teuchos::getFancyOStream (outStringP);
4625 Teuchos::FancyOStream& out = *outp;
4626 auto comm = map->getComm ();
4627 const int myRank = comm->getRank ();
4628 const int numProcs = comm->getSize ();
4629
4630 out << "Process " << myRank << " of " << numProcs << ":" << endl;
4631 Teuchos::OSTab tab1 (out);
4632
4633 // VERB_MEDIUM and higher prints getLocalLength()
4634 const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4635 out << "Local number of rows: " << lclNumRows << endl;
4636
4637 if (vl > Teuchos::VERB_MEDIUM) {
4638 // VERB_HIGH and higher prints isConstantStride() and getStride().
4639 // The first is only relevant if the Vector has multiple columns.
4640 if (this->getNumVectors () != static_cast<size_t> (1)) {
4641 out << "isConstantStride: " << this->isConstantStride () << endl;
4642 }
4643 if (this->isConstantStride ()) {
4644 out << "Column stride: " << this->getStride () << endl;
4645 }
4646
4647 if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4648 // VERB_EXTREME prints values. If we're not doing univied memory, we'll
4650 // so we can't use our regular accessor functins
4651
4652 // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4653 // to trigger (since this function is conceptually const). Thus, we
4654 // get *copies* of the view's data instead.
4655 auto X_dev = view_.getDeviceCopy();
4656 auto X_host = view_.getHostCopy();
4657
4658 if(X_dev.data() == X_host.data()) {
4659 // One single allocation
4660 Details::print_vector(out,"unified",X_host);
4661 }
4662 else {
4663 Details::print_vector(out,"host",X_host);
4664 Details::print_vector(out,"dev",X_dev);
4665 }
4666 }
4667 }
4668 out.flush (); // make sure the ostringstream got everything
4669 return outStringP->str ();
4670 }
4671
4672 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4673 void
4675 describeImpl (Teuchos::FancyOStream& out,
4676 const std::string& className,
4677 const Teuchos::EVerbosityLevel verbLevel) const
4678 {
4679 using Teuchos::TypeNameTraits;
4680 using Teuchos::VERB_DEFAULT;
4681 using Teuchos::VERB_NONE;
4682 using Teuchos::VERB_LOW;
4683 using std::endl;
4684 const Teuchos::EVerbosityLevel vl =
4685 (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4686
4687 if (vl == VERB_NONE) {
4688 return; // don't print anything
4689 }
4690 // If this Vector's Comm is null, then the Vector does not
4691 // participate in collective operations with the other processes.
4692 // In that case, it is not even legal to call this method. The
4693 // reasonable thing to do in that case is nothing.
4694 auto map = this->getMap ();
4695 if (map.is_null ()) {
4696 return;
4697 }
4698 auto comm = map->getComm ();
4699 if (comm.is_null ()) {
4700 return;
4701 }
4702
4703 const int myRank = comm->getRank ();
4704
4705 // Only Process 0 should touch the output stream, but this method
4706 // in general may need to do communication. Thus, we may need to
4707 // preserve the current tab level across multiple "if (myRank ==
4708 // 0) { ... }" inner scopes. This is why we sometimes create
4709 // OSTab instances by pointer, instead of by value. We only need
4710 // to create them by pointer if the tab level must persist through
4711 // multiple inner scopes.
4712 Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4713
4714 // VERB_LOW and higher prints the equivalent of description().
4715 if (myRank == 0) {
4716 tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4717 out << "\"" << className << "\":" << endl;
4718 tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4719 {
4720 out << "Template parameters:" << endl;
4721 Teuchos::OSTab tab2 (out);
4722 out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4723 << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4724 << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4725 << "Node: " << Node::name () << endl;
4726 }
4727 if (this->getObjectLabel () != "") {
4728 out << "Label: \"" << this->getObjectLabel () << "\", ";
4729 }
4730 out << "Global number of rows: " << this->getGlobalLength () << endl;
4731 if (className != "Tpetra::Vector") {
4732 out << "Number of columns: " << this->getNumVectors () << endl;
4733 }
4734 // getStride() may differ on different processes, so it (and
4735 // isConstantStride()) properly belong to per-process data.
4736 }
4737
4738 // This is collective over the Map's communicator.
4739 if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4740 const std::string lclStr = this->localDescribeToString (vl);
4741 ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4742 }
4743 }
4744
4745 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4746 void
4748 describe (Teuchos::FancyOStream &out,
4749 const Teuchos::EVerbosityLevel verbLevel) const
4750 {
4751 this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4752 }
4753
4754 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4755 void
4757 removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4758 {
4759 replaceMap (newMap);
4760 }
4761
4762 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4763 void
4766 {
4768 const char prefix[] = "Tpetra::MultiVector::assign: ";
4769
4770 TEUCHOS_TEST_FOR_EXCEPTION
4771 (this->getGlobalLength () != src.getGlobalLength () ||
4772 this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4773 prefix << "Global dimensions of the two Tpetra::MultiVector "
4774 "objects do not match. src has dimensions [" << src.getGlobalLength ()
4775 << "," << src.getNumVectors () << "], and *this has dimensions ["
4776 << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4777 // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4778 TEUCHOS_TEST_FOR_EXCEPTION
4779 (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4780 prefix << "The local row counts of the two Tpetra::MultiVector "
4781 "objects do not match. src has " << src.getLocalLength () << " row(s) "
4782 << " and *this has " << this->getLocalLength () << " row(s).");
4783
4784
4785 // See #1510. We're writing to *this, so we don't care about its
4786 // contents in either memory space, and we don't want
4787 // DualView::modify to complain about "concurrent modification" of
4788 // host and device Views.
4789
4793
4794 // If need sync to device, then host has most recent version.
4795 const bool src_last_updated_on_host = src.need_sync_device ();
4796
4797 if (src_last_updated_on_host) {
4798 localDeepCopy (this->getLocalViewHost(Access::ReadWrite),
4799 src.getLocalViewHost(Access::ReadOnly),
4800 this->isConstantStride (),
4801 src.isConstantStride (),
4802 this->whichVectors_ (),
4803 src.whichVectors_ ());
4804 }
4805 else {
4806 localDeepCopy (this->getLocalViewDevice(Access::ReadWrite),
4807 src.getLocalViewDevice(Access::ReadOnly),
4808 this->isConstantStride (),
4809 src.isConstantStride (),
4810 this->whichVectors_ (),
4811 src.whichVectors_ ());
4812 }
4813 }
4814
4815 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4816 template<class T>
4817 Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4819 convert () const
4820 {
4821 using Teuchos::RCP;
4822
4823 auto newMV = Teuchos::rcp(new MultiVector<T,LocalOrdinal,GlobalOrdinal,Node>(this->getMap(),
4824 this->getNumVectors(),
4825 /*zeroOut=*/false));
4826 Tpetra::deep_copy(*newMV, *this);
4827 return newMV;
4828 }
4829
4830 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4831 bool
4834 {
4836
4837 const size_t l1 = this->getLocalLength();
4838 const size_t l2 = vec.getLocalLength();
4839 if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4840 return false;
4841 }
4842
4843 return true;
4844 }
4845
4846 template <class ST, class LO, class GO, class NT>
4849 std::swap(mv.map_, this->map_);
4850 std::swap(mv.view_, this->view_);
4851 std::swap(mv.whichVectors_, this->whichVectors_);
4852 }
4853
4854#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4855 template <class ST, class LO, class GO, class NT>
4856 void
4858 const Teuchos::SerialDenseMatrix<int, ST>& src)
4859 {
4861 using MV = MultiVector<ST, LO, GO, NT>;
4862 using IST = typename MV::impl_scalar_type;
4863 using input_view_type =
4864 Kokkos::View<const IST**, Kokkos::LayoutLeft,
4865 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4866 using pair_type = std::pair<LO, LO>;
4867
4868 const LO numRows = static_cast<LO> (src.numRows ());
4869 const LO numCols = static_cast<LO> (src.numCols ());
4870
4871 TEUCHOS_TEST_FOR_EXCEPTION
4872 (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4873 numCols != static_cast<LO> (dst.getNumVectors ()),
4874 std::invalid_argument, "Tpetra::deep_copy: On Process "
4875 << dst.getMap ()->getComm ()->getRank () << ", dst is "
4876 << dst.getLocalLength () << " x " << dst.getNumVectors ()
4877 << ", but src is " << numRows << " x " << numCols << ".");
4878
4879 const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4880 input_view_type src_orig (src_raw, src.stride (), numCols);
4881 input_view_type src_view =
4882 Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4883
4884 constexpr bool src_isConstantStride = true;
4885 Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4886 localDeepCopy (dst.getLocalViewHost(Access::ReadWrite),
4887 src_view,
4888 dst.isConstantStride (),
4889 src_isConstantStride,
4890 getMultiVectorWhichVectors (dst),
4891 srcWhichVectors);
4892 }
4893
4894 template <class ST, class LO, class GO, class NT>
4895 void
4896 deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4897 const MultiVector<ST, LO, GO, NT>& src)
4898 {
4900 using MV = MultiVector<ST, LO, GO, NT>;
4901 using IST = typename MV::impl_scalar_type;
4902 using output_view_type =
4903 Kokkos::View<IST**, Kokkos::LayoutLeft,
4904 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4905 using pair_type = std::pair<LO, LO>;
4906
4907 const LO numRows = static_cast<LO> (dst.numRows ());
4908 const LO numCols = static_cast<LO> (dst.numCols ());
4909
4910 TEUCHOS_TEST_FOR_EXCEPTION
4911 (numRows != static_cast<LO> (src.getLocalLength ()) ||
4912 numCols != static_cast<LO> (src.getNumVectors ()),
4913 std::invalid_argument, "Tpetra::deep_copy: On Process "
4914 << src.getMap ()->getComm ()->getRank () << ", src is "
4915 << src.getLocalLength () << " x " << src.getNumVectors ()
4916 << ", but dst is " << numRows << " x " << numCols << ".");
4917
4918 IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4919 output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4920 auto dst_view =
4921 Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4922
4923 constexpr bool dst_isConstantStride = true;
4924 Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4925
4926 // Prefer the host version of src's data.
4927 localDeepCopy (dst_view,
4928 src.getLocalViewHost(Access::ReadOnly),
4929 dst_isConstantStride,
4930 src.isConstantStride (),
4931 dstWhichVectors,
4932 getMultiVectorWhichVectors (src));
4933 }
4934#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4935
4936 template <class Scalar, class LO, class GO, class NT>
4937 Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4938 createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4939 size_t numVectors)
4940 {
4942 return Teuchos::rcp (new MV (map, numVectors));
4943 }
4944
4945 template <class ST, class LO, class GO, class NT>
4948 {
4949 typedef MultiVector<ST, LO, GO, NT> MV;
4950 MV cpy (src.getMap (), src.getNumVectors (), false);
4951 cpy.assign (src);
4952 return cpy;
4953 }
4954
4955} // namespace Tpetra
4956
4957//
4958// Explicit instantiation macro
4959//
4960// Must be expanded from within the Tpetra namespace!
4961//
4962
4963#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4964# define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4965 template class MultiVector< SCALAR , LO , GO , NODE >; \
4966 template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4967 template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4968 template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4969 template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4970
4971#else
4972# define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4973 template class MultiVector< SCALAR , LO , GO , NODE >; \
4974 template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4975
4976#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4977
4978
4979#define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
4980 \
4981 template Teuchos::RCP< MultiVector< SO , LO , GO , NODE > > \
4982 MultiVector< SI , LO , GO , NODE >::convert< SO > () const;
4983
4984
4985#endif // TPETRA_MULTIVECTOR_DEF_HPP
Functions for initializing and finalizing Tpetra.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration of functions for checking whether a given pointer is accessible from a given Kokkos execu...
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
void fill(const ExecutionSpace &execSpace, const ViewType &X, const ValueType &alpha, const IndexType numRows, const IndexType numCols)
Fill the entries of the given 1-D or 2-D Kokkos::View with the given scalar value alpha.
Declaration of a function that prints strings from each process.
Declaration of Tpetra::Details::isInterComm.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Stand-alone utility functions and macros.
Description of Tpetra's behavior.
static bool assumeMpiIsGPUAware()
Whether to assume that MPI is CUDA aware.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t multivectorKernelLocationThreshold()
the threshold for transitioning from device to host
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool remoteLIDsContiguous=false, const CombineMode CM=INSERT)
Reallocate imports_ if needed.
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
bool isDistributed() const
Whether this is a globally distributed object.
A parallel distribution of indices over processes.
One or more distributed dense vectors.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,...
void normInf(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a host View.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector's local values.
size_t getStride() const
Stride between columns in the multivector.
typename Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool areRemoteLIDsContiguous=false, const CombineMode CM=INSERT) override
Reallocate imports_ if needed.
void reduce()
Sum values of a locally replicated multivector across all processes.
virtual bool checkSizes(const SrcDistObject &sourceObj) override
Whether data redistribution between sourceObj and this object is legal.
void randomize()
Set all values in the multivector to pseudorandom numbers.
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
void describeImpl(Teuchos::FancyOStream &out, const std::string &className, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Implementation of describe() for this class, and its subclass Vector.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Map and their communicator.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector's local values.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
std::string descriptionImpl(const std::string &className) const
Implementation of description() for this class, and its subclass Vector.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
wrapped_dual_view_type view_
The Kokkos::DualView containing the MultiVector's data.
void replaceLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using local (row) index.
void swap(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &mv)
Swap contents of mv with contents of *this.
size_t getLocalLength() const
Local number of rows on the calling process.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, device_type > dual_view_type
Kokkos::DualView specialization used by this class.
void norm2(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the two-norm of each vector (column), storing the result in a host View.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector's local values.
typename Kokkos::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
void replaceGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using global row index.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
size_t getNumVectors() const
Number of columns in the multivector.
Map< LocalOrdinal, GlobalOrdinal, Node > map_type
The type of the Map specialization used by this class.
virtual size_t constantNumberOfPackets() const override
Number of packets to send per LID.
void sumIntoLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &val, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using local row index.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
Teuchos::RCP< MultiVector< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another MultiVector with the same entries, but converted to a different Scalar type T.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this).
void norm1(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the one-norm of each vector (column), storing the result in a host view.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print the object with the given verbosity level to a FancyOStream.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector's local values.
void sumIntoGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using global row index.
bool aliases(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &other) const
Whether this multivector's memory might alias other. This is conservative: if either this or other is...
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
size_t getOrigNumLocalCols() const
"Original" number of columns in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
virtual std::string description() const override
A simple one-line description of this object.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
std::string localDescribeToString(const Teuchos::EVerbosityLevel vl) const
Print the calling process' verbose describe() information to the returned string.
bool isSameSize(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &vec) const
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
Abstract base class for objects that can be the source of an Import or Export operation.
A distributed dense vector.
base_type::impl_scalar_type impl_scalar_type
Implementation details of Tpetra.
Nonmember function that computes a residual Computes R = B - A * X.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
static void allReduceView(const OutputViewType &output, const InputViewType &input, const Teuchos::Comm< int > &comm)
All-reduce from input Kokkos::View to output Kokkos::View.
bool isInterComm(const Teuchos::Comm< int > &)
Return true if and only if the input communicator wraps an MPI intercommunicator.
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
void localDeepCopy(const DstViewType &dst, const SrcViewType &src, const bool dstConstStride, const bool srcConstStride, const DstWhichVecsType &dstWhichVecs, const SrcWhichVecsType &srcWhichVecs)
Implementation of Tpetra::MultiVector deep copy of local data.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes).
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.