MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// MueLu: A package for multigrid based preconditioning
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact
39// Jonathan Hu (jhu@sandia.gov)
40// Andrey Prokopenko (aprokop@sandia.gov)
41// Ray Tuminaro (rstumin@sandia.gov)
42//
43// ***********************************************************************
44//
45// @HEADER
46#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
48
49#include "Kokkos_UnorderedMap.hpp"
50#include "Xpetra_CrsGraphFactory.hpp"
51
53
54#include "MueLu_Aggregates.hpp"
55#include "MueLu_AmalgamationInfo.hpp"
56
57#include "MueLu_MasterList.hpp"
58#include "MueLu_PerfUtils.hpp"
59#include "MueLu_Monitor.hpp"
60
61#include "Xpetra_IO.hpp"
62
63namespace MueLu {
64
65 namespace { // anonymous
66
67 template<class LocalOrdinal, class View>
68 class ReduceMaxFunctor{
69 public:
70 ReduceMaxFunctor(View view) : view_(view) { }
71
72 KOKKOS_INLINE_FUNCTION
73 void operator()(const LocalOrdinal &i, LocalOrdinal& vmax) const {
74 if (vmax < view_(i))
75 vmax = view_(i);
76 }
77
78 KOKKOS_INLINE_FUNCTION
79 void join (LocalOrdinal& dst, const LocalOrdinal& src) const {
80 if (dst < src) {
81 dst = src;
82 }
83 }
84
85 KOKKOS_INLINE_FUNCTION
86 void init (LocalOrdinal& dst) const {
87 dst = 0;
88 }
89 private:
90 View view_;
91 };
92
93 // local QR decomposition
//
// LocalQRDecompFunctor: one Kokkos team per aggregate.  For each aggregate it
// gathers the rows of the fine nullspace ("fineNS") belonging to the aggregate
// into team scratch memory, computes an in-place Householder QR factorization
// of that local m x n block (when doQRStep is true and m >= n), writes the
// upper triangle R into the coarse nullspace ("coarseNS"), and scatters the
// rows of Q into the auxiliary CRS arrays (colsAux/valsAux/rows) used later to
// assemble the tentative prolongator P.  The parallel_reduce result ("nnz")
// accumulates the number of nonzeros written across all teams.
94 template<class LOType, class GOType, class SCType,class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
95 class LocalQRDecompFunctor {
96 private:
97 typedef LOType LO;
98 typedef GOType GO;
99 typedef SCType SC;
100
101 typedef typename DeviceType::execution_space execution_space;
102 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
103 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
104 typedef typename impl_ATS::magnitudeType Magnitude;
105
// Unmanaged views over team scratch memory (sized by team_shmem_size below).
106 typedef Kokkos::View<impl_SC**,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
107 typedef Kokkos::View<impl_SC* ,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
108
109 private:
110
111 NspType fineNS;
112 NspType coarseNS;
113 aggRowsType aggRows;
114 maxAggDofSize maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
115 agg2RowMapLOType agg2RowMapLO;
116 statusType statusAtomic;
117 rowsType rows;
118 rowsAuxType rowsAux;
119 colsAuxType colsAux;
120 valsAuxType valsAux;
121 bool doQRStep;
122 public:
// All views are captured by value (shallow Kokkos copies); statusAtomic is an
// atomic view used to report errors (index 1 = singular local nullspace block).
123 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_) :
124 fineNS(fineNS_),
125 coarseNS(coarseNS_),
126 aggRows(aggRows_),
127 maxAggDofSize(maxAggDofSize_),
128 agg2RowMapLO(agg2RowMapLO_),
129 statusAtomic(statusAtomic_),
130 rows(rows_),
131 rowsAux(rowsAux_),
132 colsAux(colsAux_),
133 valsAux(valsAux_),
134 doQRStep(doQRStep_)
135 { }
136
// Process the aggregate owned by this team; "nnz" is this team's contribution
// to the total nonzero count of the tentative prolongator.
137 KOKKOS_INLINE_FUNCTION
138 void operator() ( const typename Kokkos::TeamPolicy<execution_space>::member_type & thread, size_t& nnz) const {
139 auto agg = thread.league_rank();
140
141 // size of aggregate: number of DOFs in aggregate
142 auto aggSize = aggRows(agg+1) - aggRows(agg);
143
144 const impl_SC one = impl_ATS::one();
145 const impl_SC two = one + one;
146 const impl_SC zero = impl_ATS::zero();
147 const auto zeroM = impl_ATS::magnitude(zero);
148
149 int m = aggSize;
150 int n = fineNS.extent(1);
151
152 // calculate row offset for coarse nullspace
153 Xpetra::global_size_t offset = agg * n;
154
155 if (doQRStep) {
156
157 // Extract the piece of the nullspace corresponding to the aggregate
158 shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
159 for (int j = 0; j < n; j++)
160 for (int k = 0; k < m; k++)
161 r(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
162#if 0
163 printf("A\n");
164 for (int i = 0; i < m; i++) {
165 for (int j = 0; j < n; j++)
166 printf(" %5.3lf ", r(i,j));
167 printf("\n");
168 }
169#endif
170
171 // Calculate QR decomposition (standard)
172 shared_matrix q(thread.team_shmem(), m, m); // Q
173 if (m >= n) {
174 bool isSingular = false;
175
176 // Initialize Q^T
177 auto qt = q;
178 for (int i = 0; i < m; i++) {
179 for (int j = 0; j < m; j++)
180 qt(i,j) = zero;
181 qt(i,i) = one;
182 }
183
// Householder QR: for each column k, the Householder vector v is built in
// place in column k of r, reflections are applied to the remaining columns
// and to Q^T, and finally column k is overwritten with the R values.
184 for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
185 // FIXME_KOKKOS: use team
186 Magnitude s = zeroM, norm, norm_x;
187 for (int i = k+1; i < m; i++)
188 s += pow(impl_ATS::magnitude(r(i,k)), 2);
189 norm = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
190
191 if (norm == zero) {
192 isSingular = true;
193 break;
194 }
195
196 r(k,k) -= norm*one;
197
198 norm_x = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
199 if (norm_x == zeroM) {
200 // We have a single diagonal element in the column.
201 // No reflections required. Just need to restore r(k,k).
202 r(k,k) = norm*one;
203 continue;
204 }
205
// Normalize the Householder vector (stored in r(k:m,k)).
206 // FIXME_KOKKOS: use team
207 for (int i = k; i < m; i++)
208 r(i,k) /= norm_x;
209
210 // Update R(k:m,k+1:n)
211 for (int j = k+1; j < n; j++) {
212 // FIXME_KOKKOS: use team in the loops
213 impl_SC si = zero;
214 for (int i = k; i < m; i++)
215 si += r(i,k) * r(i,j);
216 for (int i = k; i < m; i++)
217 r(i,j) -= two*si * r(i,k);
218 }
219
220 // Update Q^T (k:m,k:m)
221 for (int j = k; j < m; j++) {
222 // FIXME_KOKKOS: use team in the loops
223 impl_SC si = zero;
224 for (int i = k; i < m; i++)
225 si += r(i,k) * qt(i,j);
226 for (int i = k; i < m; i++)
227 qt(i,j) -= two*si * r(i,k);
228 }
229
230 // Fix R(k:m,k)
231 r(k,k) = norm*one;
232 for (int i = k+1; i < m; i++)
233 r(i,k) = zero;
234 }
235
236#if 0
237 // Q = (Q^T)^T
238 for (int i = 0; i < m; i++)
239 for (int j = 0; j < i; j++) {
240 impl_SC tmp = qt(i,j);
241 qt(i,j) = qt(j,i);
242 qt(j,i) = tmp;
243 }
244#endif
245
246 // Build coarse nullspace using the upper triangular part of R
247 for (int j = 0; j < n; j++)
248 for (int k = 0; k <= j; k++)
249 coarseNS(offset+k,j) = r(k,j);
250
251 if (isSingular) {
252 statusAtomic(1) = true;
253 return;
254 }
255
256 } else {
257 // Special handling for m < n (i.e. single node aggregates in structural mechanics)
258
259 // The local QR decomposition is not possible in the "overconstrained"
260 // case (i.e. number of columns in qr > number of rowsAux), which
261 // corresponds to #DOFs in Aggregate < n. For usual problems this
262 // is only possible for single node aggregates in structural mechanics.
263 // (Similar problems may arise in discontinuous Galerkin problems...)
264 // We bypass the QR decomposition and use an identity block in the
265 // tentative prolongator for the single node aggregate and transfer the
266 // corresponding fine level null space information 1-to-1 to the coarse
267 // level null space part.
268
269 // NOTE: The resulting tentative prolongation operator has
270 // (m*DofsPerNode-n) zero columns leading to a singular
271 // coarse level operator A. To deal with that one has the following
272 // options:
273 // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
274 // false) to add some identity block to the diagonal of the zero rowsAux
275 // in the coarse level operator A, such that standard level smoothers
276 // can be used again.
277 // - Use special (projection-based) level smoothers, which can deal
278 // with singular matrices (very application specific)
279 // - Adapt the code below to avoid zero columns. However, we do not
280 // support a variable number of DOFs per node in MueLu/Xpetra which
281 // makes the implementation really hard.
282 //
283 // FIXME: do we need to check for singularity here somehow? Zero
284 // columns would be easy but linear dependency would require proper QR.
285
// NOTE(review): in this m < n branch, q was allocated m x m above but is
// written/read below with column indices up to n-1 > m-1 (here and in the
// "Process each row" loop).  That looks like an out-of-bounds access on an
// unmanaged scratch view -- confirm against the upstream MueLu sources.
286 // R = extended (by adding identity rowsAux) qr
287 for (int j = 0; j < n; j++)
288 for (int k = 0; k < n; k++)
289 if (k < m)
290 coarseNS(offset+k,j) = r(k,j);
291 else
292 coarseNS(offset+k,j) = (k == j ? one : zero);
293
294 // Q = I (rectangular)
295 for (int i = 0; i < m; i++)
296 for (int j = 0; j < n; j++)
297 q(i,j) = (j == i ? one : zero);
298 }
299
300 // Process each row in the local Q factor and fill helper arrays to assemble P
301 for (int j = 0; j < m; j++) {
302 LO localRow = agg2RowMapLO(aggRows(agg)+j);
303 size_t rowStart = rowsAux(localRow);
304 size_t lnnz = 0;
305 for (int k = 0; k < n; k++) {
306 // skip zeros
307 if (q(j,k) != zero) {
308 colsAux(rowStart+lnnz) = offset + k;
309 valsAux(rowStart+lnnz) = q(j,k);
310 lnnz++;
311 }
312 }
313 rows(localRow+1) = lnnz;
314 nnz += lnnz;
315 }
316
317#if 0
318 printf("R\n");
319 for (int i = 0; i < m; i++) {
320 for (int j = 0; j < n; j++)
321 printf(" %5.3lf ", coarseNS(i,j));
322 printf("\n");
323 }
324
325 printf("Q\n");
326 for (int i = 0; i < aggSize; i++) {
327 for (int j = 0; j < aggSize; j++)
328 printf(" %5.3lf ", q(i,j));
329 printf("\n");
330 }
331#endif
332 } else {
334 // "no-QR" option //
336 // Local Q factor is just the fine nullspace support over the current aggregate.
337 // Local R factor is the identity.
338 // TODO I have not implemented any special handling for aggregates that are too
339 // TODO small to locally support the nullspace, as is done in the standard QR
340 // TODO case above.
341
342 for (int j = 0; j < m; j++) {
343 LO localRow = agg2RowMapLO(aggRows(agg)+j);
344 size_t rowStart = rowsAux(localRow);
345 size_t lnnz = 0;
346 for (int k = 0; k < n; k++) {
347 const impl_SC qr_jk = fineNS(localRow,k);
348 // skip zeros
349 if (qr_jk != zero) {
350 colsAux(rowStart+lnnz) = offset + k;
351 valsAux(rowStart+lnnz) = qr_jk;
352 lnnz++;
353 }
354 }
355 rows(localRow+1) = lnnz;
356 nnz += lnnz;
357 }
358
// Coarse nullspace is the identity block for this aggregate.
359 for (int j = 0; j < n; j++)
360 coarseNS(offset+j,j) = one;
361
362 }
363
364 }
365
366 // amount of shared memory
// Scratch needed per team: r (m x n) and q (m x m), sized by the largest
// aggregate; no scratch is needed when the QR step is bypassed.
367 size_t team_shmem_size( int /* team_size */ ) const {
368 if (doQRStep) {
369 int m = maxAggDofSize;
370 int n = fineNS.extent(1);
371 return shared_matrix::shmem_size(m, n) + // r
372 shared_matrix::shmem_size(m, m); // q
373 } else
374 return 0;
375 }
376 };
377
378 } // namespace anonymous
379
// Declares the parameters this factory accepts (validated against MasterList)
// and the upstream factories it may consume data from.
// NOTE(review): the function signature line (original source line 381,
// presumably GetValidParameterList() const) is missing from this extraction --
// confirm against the repository before editing.
383 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
382 RCP<ParameterList> validParamList = rcp(new ParameterList());
383
384#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
385 SET_VALID_ENTRY("tentative: calculate qr");
386 SET_VALID_ENTRY("tentative: build coarse coordinates");
387#undef SET_VALID_ENTRY
388
// Factories generating the inputs consumed in DeclareInput()/Build().
389 validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
390 validParamList->set< RCP<const FactoryBase> >("Aggregates", Teuchos::null, "Generating factory of the aggregates");
391 validParamList->set< RCP<const FactoryBase> >("Nullspace", Teuchos::null, "Generating factory of the nullspace");
392 validParamList->set< RCP<const FactoryBase> >("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
393 validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
394 validParamList->set< RCP<const FactoryBase> >("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
395 validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory of the coordinates");
396
397 // Make sure we don't recursively validate options for the matrixmatrix kernels
398 ParameterList norecurse;
399 norecurse.disableRecursiveValidation();
400 validParamList->set<ParameterList> ("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
401
402 return validParamList;
403 }
404
// Requests the data this factory needs from the fine level before Build runs.
// NOTE(review): the function signature line (original source line 406,
// presumably DeclareInput(Level& fineLevel, Level& coarseLevel) const) is
// missing from this extraction -- confirm against the repository.
405 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
407
408 const ParameterList& pL = GetParameterList();
409 // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
410 std::string nspName = "Nullspace";
411 if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
412
413 Input(fineLevel, "A");
414 Input(fineLevel, "Aggregates");
415 Input(fineLevel, nspName);
416 Input(fineLevel, "UnAmalgamationInfo");
417 Input(fineLevel, "CoarseMap");
// Coordinates are requested on level 0 only when the user supplied them and
// asked for coarse coordinates; once bTransferCoordinates_ is set, coarser
// levels keep requesting Coordinates so the transfer continues down the
// hierarchy.
418 if( fineLevel.GetLevelID() == 0 &&
419 fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
420 pL.get<bool>("tentative: build coarse coordinates") ) { // and we want coordinates on other levels
421 bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
422 Input(fineLevel, "Coordinates");
423 } else if (bTransferCoordinates_) {
424 Input(fineLevel, "Coordinates");
425 }
426 }
427
428 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
432
// Builds the tentative prolongator P (and optionally coarse coordinates and
// the coarse nullspace) from the fine-level aggregates and nullspace, then
// publishes them on the coarse level.
// NOTE(review): the function signature line (original source line 434,
// presumably Build(Level& fineLevel, Level& coarseLevel) const) is missing
// from this extraction -- confirm against the repository.
433 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
435 FactoryMonitor m(*this, "Build", coarseLevel);
436
437 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
438 typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
439 const ParameterList& pL = GetParameterList();
440 std::string nspName = "Nullspace";
441 if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
442
// Pull the declared inputs off the fine level.
443 auto A = Get< RCP<Matrix> > (fineLevel, "A");
444 auto aggregates = Get< RCP<Aggregates> > (fineLevel, "Aggregates");
445 auto amalgInfo = Get< RCP<AmalgamationInfo> > (fineLevel, "UnAmalgamationInfo");
446 auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, nspName);
447 auto coarseMap = Get< RCP<const Map> > (fineLevel, "CoarseMap");
448 RCP<RealValuedMultiVector> fineCoords;
// NOTE(review): the guard opening this block (original source line 449,
// presumably "if (bTransferCoordinates_) {") is missing from this extraction.
450 fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel, "Coordinates");
451 }
452
453 RCP<Matrix> Ptentative;
454 // No coarse DoFs so we need to bail by setting Ptentative to null and returning
455 // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
456 if ( aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
457 Ptentative = Teuchos::null;
458 Set(coarseLevel, "P", Ptentative);
459 return;
460 }
461 RCP<MultiVector> coarseNullspace;
462 RCP<RealValuedMultiVector> coarseCoords;
463
// NOTE(review): the guard opening the coarse-coordinates block (original
// source line 464, presumably "if (bTransferCoordinates_) {") is missing
// from this extraction; the matching "}" is at original line 538 below.
465 ArrayView<const GO> elementAList = coarseMap->getLocalElementList();
466 GO indexBase = coarseMap->getIndexBase();
467
// The coarse map may be strided (multiple DOFs per node); amalgamate it to a
// node-based map before building the coarse coordinate map.
468 LO blkSize = 1;
469 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
470 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
471
472 Array<GO> elementList;
473 ArrayView<const GO> elementListView;
474 if (blkSize == 1) {
475 // Scalar system
476 // No amalgamation required
477 elementListView = elementAList;
478
479 } else {
480 auto numElements = elementAList.size() / blkSize;
481
482 elementList.resize(numElements);
483
484 // Amalgamate the map
485 for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
486 elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
487
488 elementListView = elementList;
489 }
490
491 auto uniqueMap = fineCoords->getMap();
492 auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
493 elementListView, indexBase, coarseMap->getComm());
494 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
495
496 // Create overlapped fine coordinates to reduce global communication
497 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
498 if (aggregates->AggregatesCrossProcessors()) {
499 auto nonUniqueMap = aggregates->GetMap();
500 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
501
502 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
503 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
504 }
505
506 // The good news is that this graph has already been constructed for the
507 // TentativePFactory and was cached in Aggregates. So this is a no-op.
508 auto aggGraph = aggregates->GetGraph();
509 auto numAggs = aggGraph.numRows();
510
511 auto fineCoordsView = fineCoords ->getDeviceLocalView(Xpetra::Access::ReadOnly);
512 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
513
514 // Fill in coarse coordinates
// Each coarse coordinate is the arithmetic mean of the fine coordinates of
// the nodes in the corresponding aggregate.
515 {
516 SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
517
518 const auto dim = fineCoords->getNumVectors();
519
520 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
521 for (size_t j = 0; j < dim; j++) {
522 Kokkos::parallel_for("MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
523 KOKKOS_LAMBDA(const LO i) {
524 // A row in this graph represents all node ids in the aggregate
525 // Therefore, averaging is very easy
526
527 auto aggregate = aggGraph.rowConst(i);
528
529 coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
530 for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
531 sum += fineCoordsRandomView(aggregate(colID),j);
532
533 coarseCoordsView(i,j) = sum / aggregate.length;
534 });
535 }
536 }
537 }
538
// Dispatch to the appropriate prolongator builder: uncoupled variants when no
// aggregate spans processors (with a BlockCrs specialization), coupled
// otherwise.
539 if (!aggregates->AggregatesCrossProcessors()) {
540 if(Xpetra::Helpers<SC,LO,GO,NO>::isTpetraBlockCrs(A)) {
541 BuildPuncoupledBlockCrs(coarseLevel,A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
542 coarseLevel.GetLevelID());
543 }
544 else {
545 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
546 }
547 }
548 else
549 BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
550
551 // If available, use striding information of fine level matrix A for range
552 // map and coarseMap as domain map; otherwise use plain range map of
553 // Ptent = plain range map of A for range map and coarseMap as domain map.
554 // NOTE:
555 // The latter is not really safe, since there is no striding information
556 // for the range map. This is not really a problem, since striding
557 // information is always available on the intermediate levels and the
558 // coarsest levels.
559 if (A->IsView("stridedMaps") == true)
560 Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
561 else
562 Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
563
// NOTE(review): the guard opening this block (original source line 564,
// presumably "if (bTransferCoordinates_) {") is missing from this extraction.
565 Set(coarseLevel, "Coordinates", coarseCoords);
566 }
567 Set(coarseLevel, "Nullspace", coarseNullspace);
568 Set(coarseLevel, "P", Ptentative);
569
570 if (IsPrint(Statistics2)) {
571 RCP<ParameterList> params = rcp(new ParameterList());
572 params->set("printLoadBalancingInfo", true);
573 GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
574 }
575 }
576
577 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
579 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
580 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
581 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
582 RCP<MultiVector>& coarseNullspace, const int levelID) const {
583 auto rowMap = A->getRowMap();
584 auto colMap = A->getColMap();
585
586 const size_t numRows = rowMap->getLocalNumElements();
587 const size_t NSDim = fineNullspace->getNumVectors();
588
589 typedef Kokkos::ArithTraits<SC> ATS;
590 using impl_SC = typename ATS::val_type;
591 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
592 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
593
594 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
595
596 typename Aggregates::local_graph_type aggGraph;
597 {
598 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
599 aggGraph = aggregates->GetGraph();
600 }
601 auto aggRows = aggGraph.row_map;
602 auto aggCols = aggGraph.entries;
603
604 // Aggregates map is based on the amalgamated column map
605 // We can skip global-to-local conversion if LIDs in row map are
606 // same as LIDs in column map
607 bool goodMap;
608 {
609 SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
610 goodMap = isGoodMap(*rowMap, *colMap);
611 }
612 // FIXME_KOKKOS: need to proofread later code for bad maps
613 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
614 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
615 "(i.e. \"matching\" row and column maps)");
616
617 // STEP 1: do unamalgamation
618 // The non-kokkos version uses member functions from the AmalgamationInfo
619 // container class to unamalgamate the data. In contrast, the kokkos
620 // version of TentativePFactory does the unamalgamation here and only uses
621 // the data of the AmalgamationInfo container class
622
623 // Extract information for unamalgamation
624 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
625 GO indexBase;
626 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
627 GO globalOffset = amalgInfo->GlobalOffset();
628
629 // Extract aggregation info (already in Kokkos host views)
630 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
631 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
632 const size_t numAggregates = aggregates->GetNumAggregates();
633
634 int myPID = aggregates->GetMap()->getComm()->getRank();
635
636 // Create Kokkos::View (on the device) to store the aggreate dof sizes
637 // Later used to get aggregate dof offsets
638 // NOTE: This zeros itself on construction
639 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
640 AggSizeType aggDofSizes;
641
642 if (stridedBlockSize == 1) {
643 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
644
645 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
646 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
647
648 auto sizesConst = aggregates->ComputeAggregateSizes();
649 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), sizesConst);
650
651 } else {
652 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
653
654 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
655 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
656
657 auto nodeMap = aggregates->GetMap()->getLocalMap();
658 auto dofMap = colMap->getLocalMap();
659
660 Kokkos::parallel_for("MueLu:TentativePF:Build:compute_agg_sizes", range_type(0,numAggregates),
661 KOKKOS_LAMBDA(const LO agg) {
662 auto aggRowView = aggGraph.rowConst(agg);
663
664 size_t size = 0;
665 for (LO colID = 0; colID < aggRowView.length; colID++) {
666 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
667
668 for (LO k = 0; k < stridedBlockSize; k++) {
669 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
670
671 if (dofMap.getLocalElement(dofGID) != INVALID)
672 size++;
673 }
674 }
675 aggDofSizes(agg+1) = size;
676 });
677 }
678
679 // Find maximum dof size for aggregates
680 // Later used to reserve enough scratch space for local QR decompositions
681 LO maxAggSize = 0;
682 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
683 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
684
685 // parallel_scan (exclusive)
686 // The aggDofSizes View then contains the aggregate dof offsets
687 Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
688 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
689 update += aggDofSizes(i);
690 if (final_pass)
691 aggDofSizes(i) = update;
692 });
693
694 // Create Kokkos::View on the device to store mapping
695 // between (local) aggregate id and row map ids (LIDs)
696 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
697 {
698 SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
699
700 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
701 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
702
703 Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
704 KOKKOS_LAMBDA(const LO lnode) {
705 if (procWinner(lnode, 0) == myPID) {
706 // No need for atomics, it's one-to-one
707 auto aggID = vertex2AggId(lnode,0);
708
709 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
710 // FIXME: I think this may be wrong
711 // We unconditionally add the whole block here. When we calculated
712 // aggDofSizes, we did the isLocalElement check. Something's fishy.
713 for (LO k = 0; k < stridedBlockSize; k++)
714 agg2RowMapLO(offset + k) = lnode*stridedBlockSize + k;
715 }
716 });
717 }
718
719 // STEP 2: prepare local QR decomposition
720 // Reserve memory for tentative prolongation operator
721 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
722
723 // Pull out the nullspace vectors so that we can have random access (on the device)
724 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
725 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
726
727 size_t nnz = 0; // actual number of nnz
728
729 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
730 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
731 typedef typename local_matrix_type::index_type::non_const_type cols_type;
732 typedef typename local_matrix_type::values_type::non_const_type vals_type;
733
734
735 // Device View for status (error messages...)
736 typedef Kokkos::View<int[10], DeviceType> status_type;
737 status_type status("status");
738
739 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
740 typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
741
742 const ParameterList& pL = GetParameterList();
743 const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
744 if (!doQRStep) {
745 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
746 if (NSDim>1)
747 GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
748 }
749
750 size_t nnzEstimate = numRows * NSDim;
751 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows+1);
752 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
753 vals_type valsAux("Ptent_aux_vals", nnzEstimate);
754 rows_type rows("Ptent_rows", numRows+1);
755 {
756 // Stage 0: fill in views.
757 SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
758
759 // The main thing to notice is initialization of vals with INVALID. These
760 // values will later be used to compress the arrays
761 Kokkos::parallel_for("MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows+1),
762 KOKKOS_LAMBDA(const LO row) {
763 rowsAux(row) = row*NSDim;
764 });
765 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
766 KOKKOS_LAMBDA(const LO j) {
767 colsAux(j) = INVALID;
768 });
769 }
770
771 if (NSDim == 1) {
772 // 1D is special, as it is the easiest. We don't even need to the QR,
773 // just normalize an array. Plus, no worries abot small aggregates. In
774 // addition, we do not worry about compression. It is unlikely that
775 // nullspace will have zeros. If it does, a prolongator row would be
776 // zero and we'll get singularity anyway.
777 SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
778
779 // Set up team policy with numAggregates teams and one thread per team.
780 // Each team handles a slice of the data associated with one aggregate
781 // and performs a local QR decomposition (in this case real QR is
782 // unnecessary).
783 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
784
785 if (doQRStep) {
786 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop", policy,
787 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
788 auto agg = thread.league_rank();
789
790 // size of the aggregate (number of DOFs in aggregate)
791 LO aggSize = aggRows(agg+1) - aggRows(agg);
792
793 // Extract the piece of the nullspace corresponding to the aggregate, and
794 // put it in the flat array, "localQR" (in column major format) for the
795 // QR routine. Trivial in 1D.
796 auto norm = impl_ATS::magnitude(zero);
797
798 // Calculate QR by hand
799 // FIXME: shouldn't there be stridedblock here?
800 // FIXME_KOKKOS: shouldn't there be stridedblock here?
801 for (decltype(aggSize) k = 0; k < aggSize; k++) {
802 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
803 norm += dnorm*dnorm;
804 }
805 norm = sqrt(norm);
806
807 if (norm == zero) {
808 // zero column; terminate the execution
809 statusAtomic(1) = true;
810 return;
811 }
812
813 // R = norm
814 coarseNS(agg, 0) = norm;
815
816 // Q = localQR(:,0)/norm
817 for (decltype(aggSize) k = 0; k < aggSize; k++) {
818 LO localRow = agg2RowMapLO(aggRows(agg)+k);
819 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
820
821 rows(localRow+1) = 1;
822 colsAux(localRow) = agg;
823 valsAux(localRow) = localVal;
824
825 }
826 });
827
828 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
829 Kokkos::deep_copy(statusHost, status);
830 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
831 if (statusHost(i)) {
832 std::ostringstream oss;
833 oss << "MueLu::TentativePFactory::MakeTentative: ";
834 switch (i) {
835 case 0: oss << "!goodMap is not implemented"; break;
836 case 1: oss << "fine level NS part has a zero column"; break;
837 }
838 throw Exceptions::RuntimeError(oss.str());
839 }
840
841 } else {
842 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
843 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
844 auto agg = thread.league_rank();
845
846 // size of the aggregate (number of DOFs in aggregate)
847 LO aggSize = aggRows(agg+1) - aggRows(agg);
848
849 // R = norm
850 coarseNS(agg, 0) = one;
851
852 // Q = localQR(:,0)/norm
853 for (decltype(aggSize) k = 0; k < aggSize; k++) {
854 LO localRow = agg2RowMapLO(aggRows(agg)+k);
855 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0);
856
857 rows(localRow+1) = 1;
858 colsAux(localRow) = agg;
859 valsAux(localRow) = localVal;
860
861 }
862 });
863 }
864
865 Kokkos::parallel_reduce("MueLu:TentativeP:CountNNZ", range_type(0, numRows+1),
866 KOKKOS_LAMBDA(const LO i, size_t &nnz_count) {
867 nnz_count += rows(i);
868 }, nnz);
869
870 } else { // NSdim > 1
871 // FIXME_KOKKOS: This code branch is completely unoptimized.
872 // Work to do:
873 // - Optimize QR decomposition
874 // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
875 // packing new values in the beginning of each row
876 // We do use auxilary view in this case, so keep a second rows view for
877 // counting nonzeros in rows
878
879 {
880 SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
881 // Set up team policy with numAggregates teams and one thread per team.
882 // Each team handles a slice of the data associated with one aggregate
883 // and performs a local QR decomposition
884 const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1); // numAggregates teams a 1 thread
885 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
886 decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
887 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
888 decltype(valsAux)>
889 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
890 rows, rowsAux, colsAux, valsAux, doQRStep);
891 Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
892 }
893
894 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
895 Kokkos::deep_copy(statusHost, status);
896 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
897 if (statusHost(i)) {
898 std::ostringstream oss;
899 oss << "MueLu::TentativePFactory::MakeTentative: ";
900 switch(i) {
901 case 0: oss << "!goodMap is not implemented"; break;
902 case 1: oss << "fine level NS part has a zero column"; break;
903 }
904 throw Exceptions::RuntimeError(oss.str());
905 }
906 }
907
908 // Compress the cols and vals by ignoring INVALID column entries that correspond
909 // to 0 in QR.
910
911 // The real cols and vals are constructed using calculated (not estimated) nnz
912 cols_type cols;
913 vals_type vals;
914
915 if (nnz != nnzEstimate) {
916 {
917 // Stage 2: compress the arrays
918 SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
919
920 Kokkos::parallel_scan("MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1),
921 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
922 upd += rows(i);
923 if (final)
924 rows(i) = upd;
925 });
926 }
927
928 {
929 SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
930
931 cols = cols_type("Ptent_cols", nnz);
932 vals = vals_type("Ptent_vals", nnz);
933
934 // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
935 // to the beginning of rows. See CoalesceDropFactory_kokkos for
936 // example.
937 Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows),
938 KOKKOS_LAMBDA(const LO i) {
939 LO rowStart = rows(i);
940
941 size_t lnnz = 0;
942 for (auto j = rowsAux(i); j < rowsAux(i+1); j++)
943 if (colsAux(j) != INVALID) {
944 cols(rowStart+lnnz) = colsAux(j);
945 vals(rowStart+lnnz) = valsAux(j);
946 lnnz++;
947 }
948 });
949 }
950
951 } else {
952 rows = rowsAux;
953 cols = colsAux;
954 vals = valsAux;
955 }
956
957 GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
958
959 {
960 // Stage 3: construct Xpetra::Matrix
961 SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
962
963 local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
964
965 // Managing labels & constants for ESFC
966 RCP<ParameterList> FCparams;
967 if (pL.isSublist("matrixmatrix: kernel params"))
968 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
969 else
970 FCparams = rcp(new ParameterList);
971
972 // By default, we don't need global constants for TentativeP
973 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
974 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
975
976 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
977 Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
978 }
979 }
980
981
982 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
// NOTE(review): the doxygen extraction dropped the line (983) carrying the
// return type and the enclosing class qualifier
// (TentativePFactory_kokkos<...>::) of this definition — confirm against the
// original MueLu_TentativePFactory_kokkos_def.hpp before editing.
//
// Builds the tentative prolongator P as a BlockCrs matrix for a BlockCrs A.
// The local QR step is always bypassed here (see the Runtime1 message below);
// the fine nullspace values are copied verbatim into the blocks of P and the
// coarse nullspace is set to identity blocks.
984 BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
985 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
986 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
987 RCP<MultiVector>& coarseNullspace, const int levelID) const {
988 /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
989 be generalized later, if we ever need to do so:
990 1) Null space dimension === block size of matrix: So no elasticity right now
991 2) QR is not supported: Under assumption #1, this shouldn't cause problems.
992 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
993
994 These assumptions keep our code way simpler and still support the use cases we actually care about.
995 */
996
997 RCP<const Map> rowMap = A->getRowMap();
998 RCP<const Map> rangeMap = A->getRangeMap();
999 RCP<const Map> colMap = A->getColMap();
1000 // const size_t numFinePointRows = rangeMap->getLocalNumElements();
1001 const size_t numFineBlockRows = rowMap->getLocalNumElements();
1002
1003 // typedef Teuchos::ScalarTraits<SC> STS;
1004 // typedef typename STS::magnitudeType Magnitude;
1005 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1006
1007 typedef Kokkos::ArithTraits<SC> ATS;
1008 using impl_SC = typename ATS::val_type;
1009 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
1010 const impl_SC one = impl_ATS::one();
1011
1012 // const GO numAggs = aggregates->GetNumAggregates();
1013 const size_t NSDim = fineNullspace->getNumVectors();
1014 auto aggSizes = aggregates->ComputeAggregateSizes();
1015
1016
1017 typename Aggregates::local_graph_type aggGraph;
1018 {
1019 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
1020 aggGraph = aggregates->GetGraph();
1021 }
1022 auto aggRows = aggGraph.row_map;
1023 auto aggCols = aggGraph.entries;
1024
1025
1026 // Need to generate the coarse block map
1027 // NOTE: We assume NSDim == block size here
1028 // NOTE: We also assume that coarseMap has contiguous GIDs
1029 //const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1030 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1031 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1032 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1033 numCoarseBlockRows,
1034 coarsePointMap->getIndexBase(),
1035 coarsePointMap->getComm());
1036 // Sanity checking
1037 const ParameterList& pL = GetParameterList();
1038 // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1039
1040
1041 // The aggregates use the amalgamated column map, which in this case is what we want
1042
1043 // Aggregates map is based on the amalgamated column map
1044 // We can skip global-to-local conversion if LIDs in row map are
1045 // same as LIDs in column map
1046 bool goodMap = MueLu::Utilities<SC,LO,GO,NO>::MapsAreNested(*rowMap, *colMap);
// "Bad" (non-nested) maps are rejected outright — the fill code below
// addresses rows by local ID without any global-to-local translation.
1047 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
1048 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1049 "(i.e. \"matching\" row and column maps)");
1050
1051 // STEP 1: do unamalgamation
1052 // The non-kokkos version uses member functions from the AmalgamationInfo
1053 // container class to unamalgamate the data. In contrast, the kokkos
1054 // version of TentativePFactory does the unamalgamation here and only uses
1055 // the data of the AmalgamationInfo container class
1056
1057 // Extract information for unamalgamation
1058 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1059 GO indexBase;
1060 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1061 //GO globalOffset = amalgInfo->GlobalOffset();
1062
1063 // Extract aggregation info (already in Kokkos host views)
1064 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
1065 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1066 const size_t numAggregates = aggregates->GetNumAggregates();
1067
1068 int myPID = aggregates->GetMap()->getComm()->getRank();
1069
1070 // Create Kokkos::View (on the device) to store the aggreate dof sizes
1071 // Later used to get aggregate dof offsets
1072 // NOTE: This zeros itself on construction
1073 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1074 AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1075
1076 {
1077 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1078
1079 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1080 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
1081
// Shift the sizes by one slot (entry 0 stays zero) so the exclusive scan
// below turns this view directly into offsets.
1082 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), aggSizes);
1083 }
1084
1085 // Find maximum dof size for aggregates
1086 // Later used to reserve enough scratch space for local QR decompositions
1087 LO maxAggSize = 0;
1088 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
1089 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1090
1091 // parallel_scan (exclusive)
1092 // The aggDofSizes View then contains the aggregate dof offsets
1093 Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
1094 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1095 update += aggDofSizes(i);
1096 if (final_pass)
1097 aggDofSizes(i) = update;
1098 });
1099
1100 // Create Kokkos::View on the device to store mapping
1101 // between (local) aggregate id and row map ids (LIDs)
1102 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1103 {
1104 SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1105
1106 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1107 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1108
1109 Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1110 KOKKOS_LAMBDA(const LO lnode) {
1111 if (procWinner(lnode, 0) == myPID) {
1112 // No need for atomics, it's one-to-one
1113 auto aggID = vertex2AggId(lnode,0);
1114
// The atomic fetch-add claims a contiguous slot of stridedBlockSize
// entries in aggToRowMapLO for this node's DOFs.
1115 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
1116 // FIXME: I think this may be wrong
1117 // We unconditionally add the whole block here. When we calculated
1118 // aggDofSizes, we did the isLocalElement check. Something's fishy.
1119 for (LO k = 0; k < stridedBlockSize; k++)
1120 aggToRowMapLO(offset + k) = lnode*stridedBlockSize + k;
1121 }
1122 });
1123 }
1124
1125 // STEP 2: prepare local QR decomposition
1126 // Reserve memory for tentative prolongation operator
1127 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1128
1129 // Pull out the nullspace vectors so that we can have random access (on the device)
1130 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
1131 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1132
1133 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
1134 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1135 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1136 // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1137
1138
1139 // Device View for status (error messages...)
1140 typedef Kokkos::View<int[10], DeviceType> status_type;
1141 status_type status("status");
1142
1143 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
1144 typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
1145
1146 // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1147 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1148
1149 // BlockCrs requires that we build the (block) graph first, so let's do that...
1150
1151 // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1152 // block non-zero per row in the matrix;
1153 rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1);
1154 cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1155
// Initialize rowptr to [0,1,2,...] (one slot per block row) and all column
// indices to INVALID; the thread handling the last row also writes the final
// rowptr entry.
1156 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1157 KOKKOS_LAMBDA(const LO j) {
1158 ia[j] = j;
1159 ja[j] = INVALID;
1160
1161 if(j==(LO)numFineBlockRows-1)
1162 ia[numFineBlockRows] = numFineBlockRows;
1163 });
1164
1165 // Fill Graph
1166 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1167 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:fillGraph", policy,
1168 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1169 auto agg = thread.league_rank();
// Each block row owned by aggregate 'agg' gets exactly one block column:
// the aggregate's own coarse block column.
1170 Xpetra::global_size_t offset = agg;
1171
1172 // size of the aggregate (number of DOFs in aggregate)
1173 LO aggSize = aggRows(agg+1) - aggRows(agg);
1174
1175 for (LO j = 0; j < aggSize; j++) {
1176 // FIXME: Allow for bad maps
1177 const LO localRow = aggToRowMapLO[aggDofSizes[agg]+j];
1178 const size_t rowStart = ia[localRow];
1179 ja[rowStart] = offset;
1180 }
1181 });
1182
1183 // Compress storage (remove all INVALID, which happen when we skip zeros)
1184 // We do that in-place
1185 {
1186 // Stage 2: compress the arrays
1187 SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1188 // Fill i_temp with the correct row starts
1189 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1);
1190 LO nnz=0;
// Exclusive scan over per-row valid-entry counts: i_temp becomes the
// compressed rowptr, and the reduction result 'nnz' is the total number
// of kept (non-INVALID) entries.
1191 Kokkos::parallel_scan("MueLu:TentativePF:BlockCrs:compress_rows", range_type(0,numFineBlockRows),
1192 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1193 if(final)
1194 i_temp[i] = upd;
1195 for (auto j = ia[i]; j < ia[i+1]; j++)
1196 if (ja[j] != INVALID)
1197 upd++;
1198 if(final && i == (LO) numFineBlockRows-1)
1199 i_temp[numFineBlockRows] = upd;
1200 },nnz);
1201
1202 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1203
1204
1205 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:compress_cols", range_type(0,numFineBlockRows),
1206 KOKKOS_LAMBDA(const LO i) {
1207 size_t rowStart = i_temp[i];
1208 size_t lnnz = 0;
1209 for (auto j = ia[i]; j < ia[i+1]; j++)
1210 if (ja[j] != INVALID) {
1211 j_temp[rowStart+lnnz] = ja[j];
1212 lnnz++;
1213 }
1214 });
1215
1216 ia = i_temp;
1217 ja = j_temp;
1218 }
1219
1220 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap,coarseBlockMap,ia,ja);
1221
1222
1223 // Managing labels & constants for ESFC
1224 {
1225 RCP<ParameterList> FCparams;
1226 if(pL.isSublist("matrixmatrix: kernel params"))
1227 FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1228 else
1229 FCparams= rcp(new ParameterList);
1230 // By default, we don't need global constants for TentativeP
1231 FCparams->set("compute global constants",FCparams->get("compute global constants",false));
1232 std::string levelIDs = toString(levelID);
1233 FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs);
1234 RCP<const Export> dummy_e;
1235 RCP<const Import> dummy_i;
1236 BlockGraph->expertStaticFillComplete(coarseBlockMap,rowMap,dummy_i,dummy_e,FCparams);
1237 }
1238
1239 // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1240 // we clear them here
1241 ia = rows_type();
1242 ja = cols_type();
1243
1244
1245 // Now let's make a BlockCrs Matrix
1246 // NOTE: Assumes block size== NSDim
1247 RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > P_xpetra = Xpetra::CrsMatrixFactory<SC,LO,GO,NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap,NSDim);
1248 RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> >(P_xpetra);
1249 if(P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1250 RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1251
// Raw device view of the block values; each block row stores one NSDim x
// NSDim dense block, hence the stride below.
1252 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1253 const LO stride = NSDim*NSDim;
1254
// No QR: copy the fine nullspace entries straight into P's blocks, and set
// the coarse nullspace to identity blocks (one per aggregate).
1255 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1256 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1257 auto agg = thread.league_rank();
1258
1259 // size of the aggregate (number of DOFs in aggregate)
1260 LO aggSize = aggRows(agg+1) - aggRows(agg);
1261 Xpetra::global_size_t offset = agg*NSDim;
1262
1263 // Q = localQR(:,0)/norm
1264 for (LO j = 0; j < aggSize; j++) {
1265 LO localBlockRow = aggToRowMapLO(aggRows(agg)+j);
1266 LO rowStart = localBlockRow * stride;
1267 for (LO r = 0; r < (LO)NSDim; r++) {
1268 LO localPointRow = localBlockRow*NSDim + r;
1269 for (LO c = 0; c < (LO)NSDim; c++) {
1270 values[rowStart + r*NSDim + c] = fineNSRandom(localPointRow,c);
1271 }
1272 }
1273 }
1274
1275 // R = norm
1276 for(LO j=0; j<(LO)NSDim; j++)
1277 coarseNS(offset+j,j) = one;
1278 });
1279
1280 Ptentative = P_wrap;
1281 }
1282
283 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
// Coupled tentative prolongator construction is not supported in the Kokkos
// refactor: this stub unconditionally throws. All parameters are deliberately
// unnamed/commented out since they are never used.
// NOTE(review): the doxygen extraction dropped the line (1284) carrying the
// return type and enclosing class qualifier — confirm against the original
// source file.
1285 BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1286 RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1287 RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1288 RCP<MultiVector>& /* coarseNullspace */) const {
1289 throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1290 }
1291
1292 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
// Checks whether the column map "matches" the row map: the column map must
// have at least as many local elements as the row map, and for every local
// row index i the two local maps must return the same global element. The
// comparison runs on device as a parallel_reduce counting mismatches.
// Returns true iff no mismatch was found.
// NOTE(review): the doxygen extraction dropped the line (1293) carrying the
// return type and enclosing class qualifier — confirm against the original
// source file.
1294 isGoodMap(const Map& rowMap, const Map& colMap) const {
1295 auto rowLocalMap = rowMap.getLocalMap();
1296 auto colLocalMap = colMap.getLocalMap();
1297
1298 const size_t numRows = rowLocalMap.getLocalNumElements();
1299 const size_t numCols = colLocalMap.getLocalNumElements();
1300
// A column map shorter than the row map cannot contain the row map as a
// prefix; bail out early.
1301 if (numCols < numRows)
1302 return false;
1303
1304 size_t numDiff = 0;
1305 Kokkos::parallel_reduce("MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1306 KOKKOS_LAMBDA(const LO i, size_t &diff) {
// Accumulate 1 for every local index whose global IDs disagree.
1307 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1308 }, numDiff);
1309
1310 return (numDiff == 0);
1311 }
1312
1313} //namespace MueLu
1314
1315#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1316#endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
View
#define SET_VALID_ENTRY(name)
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Exception throws to report errors in the internal logical of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
void Input(Level &level, const std::string &varName) const
T Get(Level &level, const std::string &varName) const
void Set(Level &level, const std::string &varName, const T &data) const
Class that holds all level-specific information.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.
int GetLevelID() const
Return level number.
static const NoFactory * get()
virtual const Teuchos::ParameterList & GetParameterList() const
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
static bool MapsAreNested(const Xpetra::Map< DefaultLocalOrdinal, DefaultGlobalOrdinal, DefaultNode > &rowMap, const Xpetra::Map< DefaultLocalOrdinal, DefaultGlobalOrdinal, DefaultNode > &colMap)
Teuchos::FancyOStream & GetOStream(MsgType type, int thisProcRankOnly=0) const
Get an output stream for outputting the input message type.
bool IsPrint(MsgType type, int thisProcRankOnly=-1) const
Find out whether we need to print out information for a specific message type.
Namespace for MueLu classes and methods.
@ Warnings0
Important warning messages (one line).
@ Statistics2
Print even more statistics.
@ Runtime1
Description of what is happening (more verbose).
std::string toString(const T &what)
Little helper function to convert non-string types to strings.