MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// MueLu: A package for multigrid based preconditioning
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact
39// Jonathan Hu (jhu@sandia.gov)
40// Andrey Prokopenko (aprokop@sandia.gov)
41// Ray Tuminaro (rstumin@sandia.gov)
42//
43// ***********************************************************************
44//
45// @HEADER
46#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
48
49#include "Kokkos_UnorderedMap.hpp"
50#include "Xpetra_CrsGraphFactory.hpp"
51
53
54#include "MueLu_Aggregates_kokkos.hpp"
55#include "MueLu_AmalgamationFactory_kokkos.hpp"
56#include "MueLu_AmalgamationInfo_kokkos.hpp"
57#include "MueLu_CoarseMapFactory_kokkos.hpp"
58
59#include "MueLu_MasterList.hpp"
60#include "MueLu_NullspaceFactory_kokkos.hpp"
61#include "MueLu_PerfUtils.hpp"
62#include "MueLu_Monitor.hpp"
63#include "MueLu_Utilities_kokkos.hpp"
64
65#include "Xpetra_IO.hpp"
66
67namespace MueLu {
68
69 namespace { // anonymous
70
71 template<class LocalOrdinal, class View>
72 class ReduceMaxFunctor{
73 public:
74 ReduceMaxFunctor(View view) : view_(view) { }
75
76 KOKKOS_INLINE_FUNCTION
77 void operator()(const LocalOrdinal &i, LocalOrdinal& vmax) const {
78 if (vmax < view_(i))
79 vmax = view_(i);
80 }
81
82 KOKKOS_INLINE_FUNCTION
83 void join (LocalOrdinal& dst, const LocalOrdinal& src) const {
84 if (dst < src) {
85 dst = src;
86 }
87 }
88
89 KOKKOS_INLINE_FUNCTION
90 void init (LocalOrdinal& dst) const {
91 dst = 0;
92 }
93 private:
94 View view_;
95 };
96
97 // local QR decomposition
98 template<class LOType, class GOType, class SCType,class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
99 class LocalQRDecompFunctor {
100 private:
101 typedef LOType LO;
102 typedef GOType GO;
103 typedef SCType SC;
104
105 typedef typename DeviceType::execution_space execution_space;
106 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
107 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
108 typedef typename impl_ATS::magnitudeType Magnitude;
109
110 typedef Kokkos::View<impl_SC**,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
111 typedef Kokkos::View<impl_SC* ,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
112
113 private:
114
115 NspType fineNS;
116 NspType coarseNS;
117 aggRowsType aggRows;
118 maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
119 agg2RowMapLOType agg2RowMapLO;
120 statusType statusAtomic;
121 rowsType rows;
122 rowsAuxType rowsAux;
123 colsAuxType colsAux;
124 valsAuxType valsAux;
125 bool doQRStep;
126 public:
127 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_) :
128 fineNS(fineNS_),
129 coarseNS(coarseNS_),
130 aggRows(aggRows_),
131 maxAggDofSize(maxAggDofSize_),
132 agg2RowMapLO(agg2RowMapLO_),
133 statusAtomic(statusAtomic_),
134 rows(rows_),
135 rowsAux(rowsAux_),
136 colsAux(colsAux_),
137 valsAux(valsAux_),
138 doQRStep(doQRStep_)
139 { }
140
141 KOKKOS_INLINE_FUNCTION
142 void operator() ( const typename Kokkos::TeamPolicy<execution_space>::member_type & thread, size_t& nnz) const {
143 auto agg = thread.league_rank();
144
145 // size of aggregate: number of DOFs in aggregate
146 auto aggSize = aggRows(agg+1) - aggRows(agg);
147
148 const impl_SC one = impl_ATS::one();
149 const impl_SC two = one + one;
150 const impl_SC zero = impl_ATS::zero();
151 const auto zeroM = impl_ATS::magnitude(zero);
152
153 int m = aggSize;
154 int n = fineNS.extent(1);
155
156 // calculate row offset for coarse nullspace
157 Xpetra::global_size_t offset = agg * n;
158
159 if (doQRStep) {
160
161 // Extract the piece of the nullspace corresponding to the aggregate
162 shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
163 for (int j = 0; j < n; j++)
164 for (int k = 0; k < m; k++)
165 r(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
166#if 0
167 printf("A\n");
168 for (int i = 0; i < m; i++) {
169 for (int j = 0; j < n; j++)
170 printf(" %5.3lf ", r(i,j));
171 printf("\n");
172 }
173#endif
174
175 // Calculate QR decomposition (standard)
176 shared_matrix q(thread.team_shmem(), m, m); // Q
177 if (m >= n) {
178 bool isSingular = false;
179
180 // Initialize Q^T
181 auto qt = q;
182 for (int i = 0; i < m; i++) {
183 for (int j = 0; j < m; j++)
184 qt(i,j) = zero;
185 qt(i,i) = one;
186 }
187
188 for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
189 // FIXME_KOKKOS: use team
190 Magnitude s = zeroM, norm, norm_x;
191 for (int i = k+1; i < m; i++)
192 s += pow(impl_ATS::magnitude(r(i,k)), 2);
193 norm = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
194
195 if (norm == zero) {
196 isSingular = true;
197 break;
198 }
199
200 r(k,k) -= norm*one;
201
202 norm_x = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
203 if (norm_x == zeroM) {
204 // We have a single diagonal element in the column.
205 // No reflections required. Just need to restor r(k,k).
206 r(k,k) = norm*one;
207 continue;
208 }
209
210 // FIXME_KOKKOS: use team
211 for (int i = k; i < m; i++)
212 r(i,k) /= norm_x;
213
214 // Update R(k:m,k+1:n)
215 for (int j = k+1; j < n; j++) {
216 // FIXME_KOKKOS: use team in the loops
217 impl_SC si = zero;
218 for (int i = k; i < m; i++)
219 si += r(i,k) * r(i,j);
220 for (int i = k; i < m; i++)
221 r(i,j) -= two*si * r(i,k);
222 }
223
224 // Update Q^T (k:m,k:m)
225 for (int j = k; j < m; j++) {
226 // FIXME_KOKKOS: use team in the loops
227 impl_SC si = zero;
228 for (int i = k; i < m; i++)
229 si += r(i,k) * qt(i,j);
230 for (int i = k; i < m; i++)
231 qt(i,j) -= two*si * r(i,k);
232 }
233
234 // Fix R(k:m,k)
235 r(k,k) = norm*one;
236 for (int i = k+1; i < m; i++)
237 r(i,k) = zero;
238 }
239
240#if 0
241 // Q = (Q^T)^T
242 for (int i = 0; i < m; i++)
243 for (int j = 0; j < i; j++) {
244 impl_SC tmp = qt(i,j);
245 qt(i,j) = qt(j,i);
246 qt(j,i) = tmp;
247 }
248#endif
249
250 // Build coarse nullspace using the upper triangular part of R
251 for (int j = 0; j < n; j++)
252 for (int k = 0; k <= j; k++)
253 coarseNS(offset+k,j) = r(k,j);
254
255 if (isSingular) {
256 statusAtomic(1) = true;
257 return;
258 }
259
260 } else {
261 // Special handling for m < n (i.e. single node aggregates in structural mechanics)
262
263 // The local QR decomposition is not possible in the "overconstrained"
264 // case (i.e. number of columns in qr > number of rowsAux), which
265 // corresponds to #DOFs in Aggregate < n. For usual problems this
266 // is only possible for single node aggregates in structural mechanics.
267 // (Similar problems may arise in discontinuous Galerkin problems...)
268 // We bypass the QR decomposition and use an identity block in the
269 // tentative prolongator for the single node aggregate and transfer the
270 // corresponding fine level null space information 1-to-1 to the coarse
271 // level null space part.
272
273 // NOTE: The resulting tentative prolongation operator has
274 // (m*DofsPerNode-n) zero columns leading to a singular
275 // coarse level operator A. To deal with that one has the following
276 // options:
277 // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
278 // false) to add some identity block to the diagonal of the zero rowsAux
279 // in the coarse level operator A, such that standard level smoothers
280 // can be used again.
281 // - Use special (projection-based) level smoothers, which can deal
282 // with singular matrices (very application specific)
283 // - Adapt the code below to avoid zero columns. However, we do not
284 // support a variable number of DOFs per node in MueLu/Xpetra which
285 // makes the implementation really hard.
286 //
287 // FIXME: do we need to check for singularity here somehow? Zero
288 // columns would be easy but linear dependency would require proper QR.
289
290 // R = extended (by adding identity rowsAux) qr
291 for (int j = 0; j < n; j++)
292 for (int k = 0; k < n; k++)
293 if (k < m)
294 coarseNS(offset+k,j) = r(k,j);
295 else
296 coarseNS(offset+k,j) = (k == j ? one : zero);
297
298 // Q = I (rectangular)
299 for (int i = 0; i < m; i++)
300 for (int j = 0; j < n; j++)
301 q(i,j) = (j == i ? one : zero);
302 }
303
304 // Process each row in the local Q factor and fill helper arrays to assemble P
305 for (int j = 0; j < m; j++) {
306 LO localRow = agg2RowMapLO(aggRows(agg)+j);
307 size_t rowStart = rowsAux(localRow);
308 size_t lnnz = 0;
309 for (int k = 0; k < n; k++) {
310 // skip zeros
311 if (q(j,k) != zero) {
312 colsAux(rowStart+lnnz) = offset + k;
313 valsAux(rowStart+lnnz) = q(j,k);
314 lnnz++;
315 }
316 }
317 rows(localRow+1) = lnnz;
318 nnz += lnnz;
319 }
320
321#if 0
322 printf("R\n");
323 for (int i = 0; i < m; i++) {
324 for (int j = 0; j < n; j++)
325 printf(" %5.3lf ", coarseNS(i,j));
326 printf("\n");
327 }
328
329 printf("Q\n");
330 for (int i = 0; i < aggSize; i++) {
331 for (int j = 0; j < aggSize; j++)
332 printf(" %5.3lf ", q(i,j));
333 printf("\n");
334 }
335#endif
336 } else {
338 // "no-QR" option //
340 // Local Q factor is just the fine nullspace support over the current aggregate.
341 // Local R factor is the identity.
342 // TODO I have not implemented any special handling for aggregates that are too
343 // TODO small to locally support the nullspace, as is done in the standard QR
344 // TODO case above.
345
346 for (int j = 0; j < m; j++) {
347 LO localRow = agg2RowMapLO(aggRows(agg)+j);
348 size_t rowStart = rowsAux(localRow);
349 size_t lnnz = 0;
350 for (int k = 0; k < n; k++) {
351 const impl_SC qr_jk = fineNS(localRow,k);
352 // skip zeros
353 if (qr_jk != zero) {
354 colsAux(rowStart+lnnz) = offset + k;
355 valsAux(rowStart+lnnz) = qr_jk;
356 lnnz++;
357 }
358 }
359 rows(localRow+1) = lnnz;
360 nnz += lnnz;
361 }
362
363 for (int j = 0; j < n; j++)
364 coarseNS(offset+j,j) = one;
365
366 }
367
368 }
369
370 // amount of shared memory
371 size_t team_shmem_size( int /* team_size */ ) const {
372 if (doQRStep) {
373 int m = maxAggDofSize;
374 int n = fineNS.extent(1);
375 return shared_matrix::shmem_size(m, n) + // r
376 shared_matrix::shmem_size(m, m); // q
377 } else
378 return 0;
379 }
380 };
381
382 } // namespace anonymous
383
384 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
386 RCP<ParameterList> validParamList = rcp(new ParameterList());
387
388#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
389 SET_VALID_ENTRY("tentative: calculate qr");
390 SET_VALID_ENTRY("tentative: build coarse coordinates");
391#undef SET_VALID_ENTRY
392
393 validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
394 validParamList->set< RCP<const FactoryBase> >("Aggregates", Teuchos::null, "Generating factory of the aggregates");
395 validParamList->set< RCP<const FactoryBase> >("Nullspace", Teuchos::null, "Generating factory of the nullspace");
396 validParamList->set< RCP<const FactoryBase> >("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
397 validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
398 validParamList->set< RCP<const FactoryBase> >("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
399 validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory of the coordinates");
400
401 // Make sure we don't recursively validate options for the matrixmatrix kernels
402 ParameterList norecurse;
403 norecurse.disableRecursiveValidation();
404 validParamList->set<ParameterList> ("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
405
406 return validParamList;
407 }
408
409 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
411
412 const ParameterList& pL = GetParameterList();
413 // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
414 std::string nspName = "Nullspace";
415 if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
416
417 Input(fineLevel, "A");
418 Input(fineLevel, "Aggregates");
419 Input(fineLevel, nspName);
420 Input(fineLevel, "UnAmalgamationInfo");
421 Input(fineLevel, "CoarseMap");
422 if( fineLevel.GetLevelID() == 0 &&
423 fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
424 pL.get<bool>("tentative: build coarse coordinates") ) { // and we want coordinates on other levels
425 bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
426 Input(fineLevel, "Coordinates");
427 } else if (bTransferCoordinates_) {
428 Input(fineLevel, "Coordinates");
429 }
430 }
431
432 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
436
437 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
439 FactoryMonitor m(*this, "Build", coarseLevel);
440
441 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
442 typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
443 const ParameterList& pL = GetParameterList();
444 std::string nspName = "Nullspace";
445 if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
446
447 auto A = Get< RCP<Matrix> > (fineLevel, "A");
448 auto aggregates = Get< RCP<Aggregates_kokkos> > (fineLevel, "Aggregates");
449 auto amalgInfo = Get< RCP<AmalgamationInfo_kokkos> > (fineLevel, "UnAmalgamationInfo");
450 auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, nspName);
451 auto coarseMap = Get< RCP<const Map> > (fineLevel, "CoarseMap");
452 RCP<RealValuedMultiVector> fineCoords;
454 fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel, "Coordinates");
455 }
456
457 RCP<Matrix> Ptentative;
458 // No coarse DoFs so we need to bail by setting Ptentattive to null and returning
459 // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
460 if ( aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
461 Ptentative = Teuchos::null;
462 Set(coarseLevel, "P", Ptentative);
463 return;
464 }
465 RCP<MultiVector> coarseNullspace;
466 RCP<RealValuedMultiVector> coarseCoords;
467
469 ArrayView<const GO> elementAList = coarseMap->getLocalElementList();
470 GO indexBase = coarseMap->getIndexBase();
471
472 LO blkSize = 1;
473 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
474 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
475
476 Array<GO> elementList;
477 ArrayView<const GO> elementListView;
478 if (blkSize == 1) {
479 // Scalar system
480 // No amalgamation required
481 elementListView = elementAList;
482
483 } else {
484 auto numElements = elementAList.size() / blkSize;
485
486 elementList.resize(numElements);
487
488 // Amalgamate the map
489 for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
490 elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
491
492 elementListView = elementList;
493 }
494
495 auto uniqueMap = fineCoords->getMap();
496 auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
497 elementListView, indexBase, coarseMap->getComm());
498 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
499
500 // Create overlapped fine coordinates to reduce global communication
501 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
502 if (aggregates->AggregatesCrossProcessors()) {
503 auto nonUniqueMap = aggregates->GetMap();
504 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
505
506 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
507 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
508 }
509
510 // The good new is that his graph has already been constructed for the
511 // TentativePFactory and was cached in Aggregates. So this is a no-op.
512 auto aggGraph = aggregates->GetGraph();
513 auto numAggs = aggGraph.numRows();
514
515 auto fineCoordsView = fineCoords ->getDeviceLocalView(Xpetra::Access::ReadOnly);
516 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
517
518 // Fill in coarse coordinates
519 {
520 SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
521
522 const auto dim = fineCoords->getNumVectors();
523
524 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
525 for (size_t j = 0; j < dim; j++) {
526 Kokkos::parallel_for("MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
527 KOKKOS_LAMBDA(const LO i) {
528 // A row in this graph represents all node ids in the aggregate
529 // Therefore, averaging is very easy
530
531 auto aggregate = aggGraph.rowConst(i);
532
533 coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
534 for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
535 sum += fineCoordsRandomView(aggregate(colID),j);
536
537 coarseCoordsView(i,j) = sum / aggregate.length;
538 });
539 }
540 }
541 }
542
543 if (!aggregates->AggregatesCrossProcessors()) {
544 if(Xpetra::Helpers<SC,LO,GO,NO>::isTpetraBlockCrs(A)) {
545 BuildPuncoupledBlockCrs(coarseLevel,A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
546 coarseLevel.GetLevelID());
547 }
548 else {
549 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
550 }
551 }
552 else
553 BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
554
555 // If available, use striding information of fine level matrix A for range
556 // map and coarseMap as domain map; otherwise use plain range map of
557 // Ptent = plain range map of A for range map and coarseMap as domain map.
558 // NOTE:
559 // The latter is not really safe, since there is no striding information
560 // for the range map. This is not really a problem, since striding
561 // information is always available on the intermedium levels and the
562 // coarsest levels.
563 if (A->IsView("stridedMaps") == true)
564 Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
565 else
566 Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
567
569 Set(coarseLevel, "Coordinates", coarseCoords);
570 }
571 Set(coarseLevel, "Nullspace", coarseNullspace);
572 Set(coarseLevel, "P", Ptentative);
573
574 if (IsPrint(Statistics2)) {
575 RCP<ParameterList> params = rcp(new ParameterList());
576 params->set("printLoadBalancingInfo", true);
577 GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
578 }
579 }
580
581 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
583 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
584 RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
585 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
586 RCP<MultiVector>& coarseNullspace, const int levelID) const {
587 auto rowMap = A->getRowMap();
588 auto colMap = A->getColMap();
589
590 const size_t numRows = rowMap->getLocalNumElements();
591 const size_t NSDim = fineNullspace->getNumVectors();
592
593 typedef Kokkos::ArithTraits<SC> ATS;
594 using impl_SC = typename ATS::val_type;
595 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
596 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
597
598 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
599
600 typename Aggregates_kokkos::local_graph_type aggGraph;
601 {
602 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
603 aggGraph = aggregates->GetGraph();
604 }
605 auto aggRows = aggGraph.row_map;
606 auto aggCols = aggGraph.entries;
607
608 // Aggregates map is based on the amalgamated column map
609 // We can skip global-to-local conversion if LIDs in row map are
610 // same as LIDs in column map
611 bool goodMap;
612 {
613 SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
614 goodMap = isGoodMap(*rowMap, *colMap);
615 }
616 // FIXME_KOKKOS: need to proofread later code for bad maps
617 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
618 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
619 "(i.e. \"matching\" row and column maps)");
620
621 // STEP 1: do unamalgamation
622 // The non-kokkos version uses member functions from the AmalgamationInfo
623 // container class to unamalgamate the data. In contrast, the kokkos
624 // version of TentativePFactory does the unamalgamation here and only uses
625 // the data of the AmalgamationInfo container class
626
627 // Extract information for unamalgamation
628 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
629 GO indexBase;
630 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
631 GO globalOffset = amalgInfo->GlobalOffset();
632
633 // Extract aggregation info (already in Kokkos host views)
634 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
635 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
636 const size_t numAggregates = aggregates->GetNumAggregates();
637
638 int myPID = aggregates->GetMap()->getComm()->getRank();
639
640 // Create Kokkos::View (on the device) to store the aggreate dof sizes
641 // Later used to get aggregate dof offsets
642 // NOTE: This zeros itself on construction
643 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
644 AggSizeType aggDofSizes;
645
646 if (stridedBlockSize == 1) {
647 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
648
649 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
650 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
651
652 auto sizesConst = aggregates->ComputeAggregateSizes();
653 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), sizesConst);
654
655 } else {
656 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
657
658 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
659 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
660
661 auto nodeMap = aggregates->GetMap()->getLocalMap();
662 auto dofMap = colMap->getLocalMap();
663
664 Kokkos::parallel_for("MueLu:TentativePF:Build:compute_agg_sizes", range_type(0,numAggregates),
665 KOKKOS_LAMBDA(const LO agg) {
666 auto aggRowView = aggGraph.rowConst(agg);
667
668 size_t size = 0;
669 for (LO colID = 0; colID < aggRowView.length; colID++) {
670 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
671
672 for (LO k = 0; k < stridedBlockSize; k++) {
673 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
674
675 if (dofMap.getLocalElement(dofGID) != INVALID)
676 size++;
677 }
678 }
679 aggDofSizes(agg+1) = size;
680 });
681 }
682
683 // Find maximum dof size for aggregates
684 // Later used to reserve enough scratch space for local QR decompositions
685 LO maxAggSize = 0;
686 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
687 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
688
689 // parallel_scan (exclusive)
690 // The aggDofSizes View then contains the aggregate dof offsets
691 Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
692 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
693 update += aggDofSizes(i);
694 if (final_pass)
695 aggDofSizes(i) = update;
696 });
697
698 // Create Kokkos::View on the device to store mapping
699 // between (local) aggregate id and row map ids (LIDs)
700 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
701 {
702 SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
703
704 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
705 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
706
707 Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
708 KOKKOS_LAMBDA(const LO lnode) {
709 if (procWinner(lnode, 0) == myPID) {
710 // No need for atomics, it's one-to-one
711 auto aggID = vertex2AggId(lnode,0);
712
713 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
714 // FIXME: I think this may be wrong
715 // We unconditionally add the whole block here. When we calculated
716 // aggDofSizes, we did the isLocalElement check. Something's fishy.
717 for (LO k = 0; k < stridedBlockSize; k++)
718 agg2RowMapLO(offset + k) = lnode*stridedBlockSize + k;
719 }
720 });
721 }
722
723 // STEP 2: prepare local QR decomposition
724 // Reserve memory for tentative prolongation operator
725 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
726
727 // Pull out the nullspace vectors so that we can have random access (on the device)
728 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
729 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
730
731 size_t nnz = 0; // actual number of nnz
732
733 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
734 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
735 typedef typename local_matrix_type::index_type::non_const_type cols_type;
736 typedef typename local_matrix_type::values_type::non_const_type vals_type;
737
738
739 // Device View for status (error messages...)
740 typedef Kokkos::View<int[10], DeviceType> status_type;
741 status_type status("status");
742
743 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
744 typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
745
746 const ParameterList& pL = GetParameterList();
747 const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
748 if (!doQRStep) {
749 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
750 if (NSDim>1)
751 GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
752 }
753
754 size_t nnzEstimate = numRows * NSDim;
755 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows+1);
756 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
757 vals_type valsAux("Ptent_aux_vals", nnzEstimate);
758 rows_type rows("Ptent_rows", numRows+1);
759 {
760 // Stage 0: fill in views.
761 SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
762
763 // The main thing to notice is initialization of vals with INVALID. These
764 // values will later be used to compress the arrays
765 Kokkos::parallel_for("MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows+1),
766 KOKKOS_LAMBDA(const LO row) {
767 rowsAux(row) = row*NSDim;
768 });
769 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
770 KOKKOS_LAMBDA(const LO j) {
771 colsAux(j) = INVALID;
772 });
773 }
774
775 if (NSDim == 1) {
776 // 1D is special, as it is the easiest. We don't even need to the QR,
777 // just normalize an array. Plus, no worries abot small aggregates. In
778 // addition, we do not worry about compression. It is unlikely that
779 // nullspace will have zeros. If it does, a prolongator row would be
780 // zero and we'll get singularity anyway.
781 SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
782
783 // Set up team policy with numAggregates teams and one thread per team.
784 // Each team handles a slice of the data associated with one aggregate
785 // and performs a local QR decomposition (in this case real QR is
786 // unnecessary).
787 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
788
789 if (doQRStep) {
790 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop", policy,
791 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
792 auto agg = thread.league_rank();
793
794 // size of the aggregate (number of DOFs in aggregate)
795 LO aggSize = aggRows(agg+1) - aggRows(agg);
796
797 // Extract the piece of the nullspace corresponding to the aggregate, and
798 // put it in the flat array, "localQR" (in column major format) for the
799 // QR routine. Trivial in 1D.
800 auto norm = impl_ATS::magnitude(zero);
801
802 // Calculate QR by hand
803 // FIXME: shouldn't there be stridedblock here?
804 // FIXME_KOKKOS: shouldn't there be stridedblock here?
805 for (decltype(aggSize) k = 0; k < aggSize; k++) {
806 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
807 norm += dnorm*dnorm;
808 }
809 norm = sqrt(norm);
810
811 if (norm == zero) {
812 // zero column; terminate the execution
813 statusAtomic(1) = true;
814 return;
815 }
816
817 // R = norm
818 coarseNS(agg, 0) = norm;
819
820 // Q = localQR(:,0)/norm
821 for (decltype(aggSize) k = 0; k < aggSize; k++) {
822 LO localRow = agg2RowMapLO(aggRows(agg)+k);
823 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
824
825 rows(localRow+1) = 1;
826 colsAux(localRow) = agg;
827 valsAux(localRow) = localVal;
828
829 }
830 });
831
832 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
833 Kokkos::deep_copy(statusHost, status);
834 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
835 if (statusHost(i)) {
836 std::ostringstream oss;
837 oss << "MueLu::TentativePFactory::MakeTentative: ";
838 switch (i) {
839 case 0: oss << "!goodMap is not implemented"; break;
840 case 1: oss << "fine level NS part has a zero column"; break;
841 }
842 throw Exceptions::RuntimeError(oss.str());
843 }
844
845 } else {
846 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
847 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
848 auto agg = thread.league_rank();
849
850 // size of the aggregate (number of DOFs in aggregate)
851 LO aggSize = aggRows(agg+1) - aggRows(agg);
852
853 // R = norm
854 coarseNS(agg, 0) = one;
855
856 // Q = localQR(:,0)/norm
857 for (decltype(aggSize) k = 0; k < aggSize; k++) {
858 LO localRow = agg2RowMapLO(aggRows(agg)+k);
859 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0);
860
861 rows(localRow+1) = 1;
862 colsAux(localRow) = agg;
863 valsAux(localRow) = localVal;
864
865 }
866 });
867 }
868
869 Kokkos::parallel_reduce("MueLu:TentativeP:CountNNZ", range_type(0, numRows+1),
870 KOKKOS_LAMBDA(const LO i, size_t &nnz_count) {
871 nnz_count += rows(i);
872 }, nnz);
873
874 } else { // NSdim > 1
875 // FIXME_KOKKOS: This code branch is completely unoptimized.
876 // Work to do:
877 // - Optimize QR decomposition
878 // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
879 // packing new values in the beginning of each row
880 // We do use auxilary view in this case, so keep a second rows view for
881 // counting nonzeros in rows
882
883 {
884 SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
885 // Set up team policy with numAggregates teams and one thread per team.
886 // Each team handles a slice of the data associated with one aggregate
887 // and performs a local QR decomposition
888 const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1); // numAggregates teams a 1 thread
889 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
890 decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
891 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
892 decltype(valsAux)>
893 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
894 rows, rowsAux, colsAux, valsAux, doQRStep);
895 Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
896 }
897
898 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
899 Kokkos::deep_copy(statusHost, status);
900 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
901 if (statusHost(i)) {
902 std::ostringstream oss;
903 oss << "MueLu::TentativePFactory::MakeTentative: ";
904 switch(i) {
905 case 0: oss << "!goodMap is not implemented"; break;
906 case 1: oss << "fine level NS part has a zero column"; break;
907 }
908 throw Exceptions::RuntimeError(oss.str());
909 }
910 }
911
912 // Compress the cols and vals by ignoring INVALID column entries that correspond
913 // to 0 in QR.
914
915 // The real cols and vals are constructed using calculated (not estimated) nnz
916 cols_type cols;
917 vals_type vals;
918
919 if (nnz != nnzEstimate) {
920 {
921 // Stage 2: compress the arrays
922 SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
923
924 Kokkos::parallel_scan("MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1),
925 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
926 upd += rows(i);
927 if (final)
928 rows(i) = upd;
929 });
930 }
931
932 {
933 SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
934
935 cols = cols_type("Ptent_cols", nnz);
936 vals = vals_type("Ptent_vals", nnz);
937
938 // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
939 // to the beginning of rows. See CoalesceDropFactory_kokkos for
940 // example.
941 Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows),
942 KOKKOS_LAMBDA(const LO i) {
943 LO rowStart = rows(i);
944
945 size_t lnnz = 0;
946 for (auto j = rowsAux(i); j < rowsAux(i+1); j++)
947 if (colsAux(j) != INVALID) {
948 cols(rowStart+lnnz) = colsAux(j);
949 vals(rowStart+lnnz) = valsAux(j);
950 lnnz++;
951 }
952 });
953 }
954
955 } else {
956 rows = rowsAux;
957 cols = colsAux;
958 vals = valsAux;
959 }
960
961 GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
962
963 {
964 // Stage 3: construct Xpetra::Matrix
965 SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
966
967 local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
968
969 // Managing labels & constants for ESFC
970 RCP<ParameterList> FCparams;
971 if (pL.isSublist("matrixmatrix: kernel params"))
972 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
973 else
974 FCparams = rcp(new ParameterList);
975
976 // By default, we don't need global constants for TentativeP
977 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
978 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
979
980 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
981 Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
982 }
983 }
984
985
986 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
988 BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
989 RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
990 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
991 RCP<MultiVector>& coarseNullspace, const int levelID) const {
992#ifdef HAVE_MUELU_TPETRA
993 /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
994 be generalized later, if we ever need to do so:
995 1) Null space dimension === block size of matrix: So no elasticity right now
996 2) QR is not supported: Under assumption #1, this shouldn't cause problems.
997 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
998
999 These assumptions keep our code way simpler and still support the use cases we actually care about.
1000 */
1001
1002 RCP<const Map> rowMap = A->getRowMap();
1003 RCP<const Map> rangeMap = A->getRangeMap();
1004 RCP<const Map> colMap = A->getColMap();
1005 // const size_t numFinePointRows = rangeMap->getLocalNumElements();
1006 const size_t numFineBlockRows = rowMap->getLocalNumElements();
1007
1008 // typedef Teuchos::ScalarTraits<SC> STS;
1009 // typedef typename STS::magnitudeType Magnitude;
1010 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1011
1012 typedef Kokkos::ArithTraits<SC> ATS;
1013 using impl_SC = typename ATS::val_type;
1014 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
1015 const impl_SC one = impl_ATS::one();
1016
1017 // const GO numAggs = aggregates->GetNumAggregates();
1018 const size_t NSDim = fineNullspace->getNumVectors();
1019 auto aggSizes = aggregates->ComputeAggregateSizes();
1020
1021
1022 typename Aggregates_kokkos::local_graph_type aggGraph;
1023 {
1024 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
1025 aggGraph = aggregates->GetGraph();
1026 }
1027 auto aggRows = aggGraph.row_map;
1028 auto aggCols = aggGraph.entries;
1029
1030
1031 // Need to generate the coarse block map
1032 // NOTE: We assume NSDim == block size here
1033 // NOTE: We also assume that coarseMap has contiguous GIDs
1034 //const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1035 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1036 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1037 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1038 numCoarseBlockRows,
1039 coarsePointMap->getIndexBase(),
1040 coarsePointMap->getComm());
1041 // Sanity checking
1042 const ParameterList& pL = GetParameterList();
1043 // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1044
1045
1046 // The aggregates use the amalgamated column map, which in this case is what we want
1047
1048 // Aggregates map is based on the amalgamated column map
1049 // We can skip global-to-local conversion if LIDs in row map are
1050 // same as LIDs in column map
1051 bool goodMap = MueLu::Utilities<SC,LO,GO,NO>::MapsAreNested(*rowMap, *colMap);
1052 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
1053 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1054 "(i.e. \"matching\" row and column maps)");
1055
1056 // STEP 1: do unamalgamation
1057 // The non-kokkos version uses member functions from the AmalgamationInfo
1058 // container class to unamalgamate the data. In contrast, the kokkos
1059 // version of TentativePFactory does the unamalgamation here and only uses
1060 // the data of the AmalgamationInfo container class
1061
1062 // Extract information for unamalgamation
1063 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1064 GO indexBase;
1065 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1066 //GO globalOffset = amalgInfo->GlobalOffset();
1067
1068 // Extract aggregation info (already in Kokkos host views)
1069 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
1070 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1071 const size_t numAggregates = aggregates->GetNumAggregates();
1072
1073 int myPID = aggregates->GetMap()->getComm()->getRank();
1074
1075 // Create Kokkos::View (on the device) to store the aggreate dof sizes
1076 // Later used to get aggregate dof offsets
1077 // NOTE: This zeros itself on construction
1078 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
1079 AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1080
1081 {
1082 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1083
1084 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1085 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
1086
1087 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), aggSizes);
1088 }
1089
1090 // Find maximum dof size for aggregates
1091 // Later used to reserve enough scratch space for local QR decompositions
1092 LO maxAggSize = 0;
1093 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
1094 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1095
1096 // parallel_scan (exclusive)
1097 // The aggDofSizes View then contains the aggregate dof offsets
1098 Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
1099 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1100 update += aggDofSizes(i);
1101 if (final_pass)
1102 aggDofSizes(i) = update;
1103 });
1104
1105 // Create Kokkos::View on the device to store mapping
1106 // between (local) aggregate id and row map ids (LIDs)
1107 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1108 {
1109 SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1110
1111 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1112 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1113
1114 Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1115 KOKKOS_LAMBDA(const LO lnode) {
1116 if (procWinner(lnode, 0) == myPID) {
1117 // No need for atomics, it's one-to-one
1118 auto aggID = vertex2AggId(lnode,0);
1119
1120 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
1121 // FIXME: I think this may be wrong
1122 // We unconditionally add the whole block here. When we calculated
1123 // aggDofSizes, we did the isLocalElement check. Something's fishy.
1124 for (LO k = 0; k < stridedBlockSize; k++)
1125 aggToRowMapLO(offset + k) = lnode*stridedBlockSize + k;
1126 }
1127 });
1128 }
1129
1130 // STEP 2: prepare local QR decomposition
1131 // Reserve memory for tentative prolongation operator
1132 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1133
1134 // Pull out the nullspace vectors so that we can have random access (on the device)
1135 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
1136 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1137
1138 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
1139 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1140 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1141 // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1142
1143
1144 // Device View for status (error messages...)
1145 typedef Kokkos::View<int[10], DeviceType> status_type;
1146 status_type status("status");
1147
1148 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
1149 typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
1150
1151 // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1152 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1153
1154 // BlockCrs requires that we build the (block) graph first, so let's do that...
1155
1156 // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1157 // block non-zero per row in the matrix;
1158 rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1);
1159 cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1160
1161 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1162 KOKKOS_LAMBDA(const LO j) {
1163 ia[j] = j;
1164 ja[j] = INVALID;
1165
1166 if(j==(LO)numFineBlockRows-1)
1167 ia[numFineBlockRows] = numFineBlockRows;
1168 });
1169
1170 // Fill Graph
1171 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1172 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:fillGraph", policy,
1173 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1174 auto agg = thread.league_rank();
1175 Xpetra::global_size_t offset = agg;
1176
1177 // size of the aggregate (number of DOFs in aggregate)
1178 LO aggSize = aggRows(agg+1) - aggRows(agg);
1179
1180 for (LO j = 0; j < aggSize; j++) {
1181 // FIXME: Allow for bad maps
1182 const LO localRow = aggToRowMapLO[aggDofSizes[agg]+j];
1183 const size_t rowStart = ia[localRow];
1184 ja[rowStart] = offset;
1185 }
1186 });
1187
1188 // Compress storage (remove all INVALID, which happen when we skip zeros)
1189 // We do that in-place
1190 {
1191 // Stage 2: compress the arrays
1192 SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1193 // Fill i_temp with the correct row starts
1194 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1);
1195 LO nnz=0;
1196 Kokkos::parallel_scan("MueLu:TentativePF:BlockCrs:compress_rows", range_type(0,numFineBlockRows),
1197 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1198 if(final)
1199 i_temp[i] = upd;
1200 for (auto j = ia[i]; j < ia[i+1]; j++)
1201 if (ja[j] != INVALID)
1202 upd++;
1203 if(final && i == (LO) numFineBlockRows-1)
1204 i_temp[numFineBlockRows] = upd;
1205 },nnz);
1206
1207 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1208
1209
1210 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:compress_cols", range_type(0,numFineBlockRows),
1211 KOKKOS_LAMBDA(const LO i) {
1212 size_t rowStart = i_temp[i];
1213 size_t lnnz = 0;
1214 for (auto j = ia[i]; j < ia[i+1]; j++)
1215 if (ja[j] != INVALID) {
1216 j_temp[rowStart+lnnz] = ja[j];
1217 lnnz++;
1218 }
1219 });
1220
1221 ia = i_temp;
1222 ja = j_temp;
1223 }
1224
1225 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap,coarseBlockMap,ia,ja);
1226
1227
1228 // Managing labels & constants for ESFC
1229 {
1230 RCP<ParameterList> FCparams;
1231 if(pL.isSublist("matrixmatrix: kernel params"))
1232 FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1233 else
1234 FCparams= rcp(new ParameterList);
1235 // By default, we don't need global constants for TentativeP
1236 FCparams->set("compute global constants",FCparams->get("compute global constants",false));
1237 std::string levelIDs = toString(levelID);
1238 FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs);
1239 RCP<const Export> dummy_e;
1240 RCP<const Import> dummy_i;
1241 BlockGraph->expertStaticFillComplete(coarseBlockMap,rowMap,dummy_i,dummy_e,FCparams);
1242 }
1243
1244 // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1245 // we clear them here
1246 ia = rows_type();
1247 ja = cols_type();
1248
1249
1250 // Now let's make a BlockCrs Matrix
1251 // NOTE: Assumes block size== NSDim
1252 RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > P_xpetra = Xpetra::CrsMatrixFactory<SC,LO,GO,NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap,NSDim);
1253 RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> >(P_xpetra);
1254 if(P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1255 RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1256
1257 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1258 const LO stride = NSDim*NSDim;
1259
1260 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1261 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1262 auto agg = thread.league_rank();
1263
1264 // size of the aggregate (number of DOFs in aggregate)
1265 LO aggSize = aggRows(agg+1) - aggRows(agg);
1266 Xpetra::global_size_t offset = agg*NSDim;
1267
1268 // Q = localQR(:,0)/norm
1269 for (LO j = 0; j < aggSize; j++) {
1270 LO localBlockRow = aggToRowMapLO(aggRows(agg)+j);
1271 LO rowStart = localBlockRow * stride;
1272 for (LO r = 0; r < (LO)NSDim; r++) {
1273 LO localPointRow = localBlockRow*NSDim + r;
1274 for (LO c = 0; c < (LO)NSDim; c++) {
1275 values[rowStart + r*NSDim + c] = fineNSRandom(localPointRow,c);
1276 }
1277 }
1278 }
1279
1280 // R = norm
1281 for(LO j=0; j<(LO)NSDim; j++)
1282 coarseNS(offset+j,j) = one;
1283 });
1284
1285 Ptentative = P_wrap;
1286
1287#else
1288 throw std::runtime_error("TentativePFactory::BuildPuncoupledBlockCrs: Requires Tpetra");
1289#endif
1290 }
1291
1292 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
1294 BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates_kokkos> /* aggregates */,
1295 RCP<AmalgamationInfo_kokkos> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1296 RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1297 RCP<MultiVector>& /* coarseNullspace */) const {
1298 throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1299 }
1300
1301 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
1303 isGoodMap(const Map& rowMap, const Map& colMap) const {
1304 auto rowLocalMap = rowMap.getLocalMap();
1305 auto colLocalMap = colMap.getLocalMap();
1306
1307 const size_t numRows = rowLocalMap.getLocalNumElements();
1308 const size_t numCols = colLocalMap.getLocalNumElements();
1309
1310 if (numCols < numRows)
1311 return false;
1312
1313 size_t numDiff = 0;
1314 Kokkos::parallel_reduce("MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1315 KOKKOS_LAMBDA(const LO i, size_t &diff) {
1316 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1317 }, numDiff);
1318
1319 return (numDiff == 0);
1320 }
1321
1322} //namespace MueLu
1323
1324#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1325#endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
View
#define SET_VALID_ENTRY(name)
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Exception throws to report errors in the internal logical of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
void Input(Level &level, const std::string &varName) const
T Get(Level &level, const std::string &varName) const
void Set(Level &level, const std::string &varName, const T &data) const
Class that holds all level-specific information.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.
int GetLevelID() const
Return level number.
static const NoFactory * get()
virtual const Teuchos::ParameterList & GetParameterList() const
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates_kokkos > aggregates, RCP< AmalgamationInfo_kokkos > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates_kokkos > aggregates, RCP< AmalgamationInfo_kokkos > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates_kokkos > aggregates, RCP< AmalgamationInfo_kokkos > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
static bool MapsAreNested(const Xpetra::Map< DefaultLocalOrdinal, DefaultGlobalOrdinal, DefaultNode > &rowMap, const Xpetra::Map< DefaultLocalOrdinal, DefaultGlobalOrdinal, DefaultNode > &colMap)
Teuchos::FancyOStream & GetOStream(MsgType type, int thisProcRankOnly=0) const
Get an output stream for outputting the input message type.
bool IsPrint(MsgType type, int thisProcRankOnly=-1) const
Find out whether we need to print out information for a specific message type.
Namespace for MueLu classes and methods.
@ Warnings0
Important warning messages (one line)
@ Statistics2
Print even more statistics.
@ Runtime1
Description of what is happening (more verbose)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.