Intrepid2_IntegrationToolsDef.hpp
// @HEADER
// ************************************************************************
//
//                           Intrepid2 Package
//                 Copyright (2007) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Kyungjoo Kim (kyukim@sandia.gov),
//                    Mauro Perego (mperego@sandia.gov), or
//                    Nate Roberts (nvrober@sandia.gov)
//
// ************************************************************************
// @HEADER

#ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
#define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__

#include "Teuchos_TimeMonitor.hpp"

namespace Intrepid2 {

  namespace Impl
  {
    /** Worker functor for IntegrationTools::integrate(): performs sum-factorized integration
        of left basis fields against right basis fields, weighted by the composed transform
        and the cell measures.
    */
    template<class Scalar, class DeviceType, int integralViewRank>
    class F_Integrate
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // the number of dimensions spanned by the left component
      int rightComponentSpan_; // the number of dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;
      bool forceNonSpecialized_; // if true, don't use the specialized (more manual) implementation(s) for particular component counts; primary use case is testing

      size_t fad_size_output_ = 0; // 0 if Scalar is not a Fad type

      Kokkos::Array<int, 7> offsetsForComponentOrdinal_;
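      // offsetsForComponentOrdinal_[r] is the offset, within each thread's scratch buffer, of
      // the partial-integral ("G") values for tensor component r; offsetsForComponentOrdinal_[0]
      // doubles as the total per-thread scratch size, since component 0 accumulates directly
      // into the output view (see the constructor below). Illustrative example, with assumed
      // values not taken from the source: for 3 components with 2 left and 2 right fields each,
      // component 2 needs 1 entry and component 1 needs 2*2 = 4, so the offsets are {5,1,0}
      // and each thread uses 5 scratch entries.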

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_;  // total number of enumeration indices with arguments prior to the startingComponent fixed
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_Integrate(Data<Scalar,DeviceType> integralData,
                  TensorData<Scalar,DeviceType> leftComponent,
                  Data<Scalar,DeviceType> composedTransform,
                  TensorData<Scalar,DeviceType> rightComponent,
                  TensorData<Scalar,DeviceType> cellMeasures,
                  int a_offset,
                  int b_offset,
                  int leftFieldOrdinalOffset,
                  int rightFieldOrdinalOffset,
                  bool forceNonSpecialized)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
      forceNonSpecialized_(forceNonSpecialized)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        // set up bounds containers
        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // set up relative enumeration spans: total number of enumeration indices with arguments prior to the startingComponent fixed.
        // These are for *truncated* iterators; hence the -2 rather than -1 for the first startingComponent value.
        int leftRelativeEnumerationSpan  = 1;
        int rightRelativeEnumerationSpan = 1;
        for (int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
        {
          leftRelativeEnumerationSpan  *= leftFieldBounds_[startingComponent];
          rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
          leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
          rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
        }

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }

        const int R = numTensorComponents_ - 1;

        int num_ij = 1; // this counts how many entries there are corresponding to components from r to R-1
        int allocationSoFar = 0;
        offsetsForComponentOrdinal_[R] = allocationSoFar;
        allocationSoFar++; // we store one entry corresponding to R, the final component

        for (int r=R-1; r>0; r--) // last component is innermost in the for loops (requires least storage)
        {
          const int leftFields  = leftComponent.getTensorComponent(r).extent_int(0);
          const int rightFields = rightComponent.getTensorComponent(r).extent_int(0);

          num_ij *= leftFields * rightFields;

          offsetsForComponentOrdinal_[r] = allocationSoFar;
          allocationSoFar += num_ij;
        }
        offsetsForComponentOrdinal_[0] = allocationSoFar; // first component stores directly to the final integralView
      }
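      // Illustrative example of the relative enumeration spans (assumed values, not from the
      // source): with numTensorComponents_ = 3 and left field bounds {2,3,4}, the loop above
      // produces leftFieldRelativeEnumerationSpans_ = {6, 3}: components 0..1 admit 2*3 = 6
      // truncated enumeration indices, component 1 alone admits 3, and the final component
      // (R = 2) is excluded by design.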

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            arguments[r] = 0; // reset
            r--;
            if (r < 0) break;
          }
          if (r >= 0) ++arguments[r];
          return r;
        }
      }
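      // Illustrative example (assumed values): with bounds = {2,3,5} and arguments = {0,2,4},
      // incrementArgument() resets components 2 and 1 (each at its maximum), increments
      // component 0 to give arguments = {1,0,0}, and returns 0. Once arguments = {1,2,4},
      // it returns -1: the iteration is complete.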

      //! runtime-sized variant of incrementArgument(); used by approximateFlopCountPerCell()
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            r--;
            if (r < 0) break;
          }
          return r;
        }
      }
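      // Note: nextIncrementResult() is the non-mutating companion to incrementArgument(): it
      // reports which component *would* be incremented next. run() uses it to decide how far
      // back the partial integrals must be rebuilt after each point increment (see r_min below).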

      //! runtime-sized variant of nextIncrementResult(); used by approximateFlopCountPerCell()
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        else
        {
          int enumerationIndex = 0;
          for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
          {
            enumerationIndex += arguments[r];
            enumerationIndex *= bounds[r-1];
          }
          enumerationIndex += arguments[startIndex];
          return enumerationIndex;
        }
      }
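      // Illustrative example (assumed values): for bounds = {B0,B1,B2}, arguments = {a0,a1,a2},
      // and startIndex = 0, the loop computes a0 + B0*(a1 + B1*a2): the first component is the
      // fastest-moving index, matching the enumeration convention used by TensorData.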

      //! Hand-coded variant of run() for the three-component case.
      // nvcc refuses to compile the below with the error "explicit specialization is not allowed in the current scope"; Clang is OK with it. We just provide a non-templated version below.
//      template<size_t numTensorComponents>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized( const TeamMember & teamMember ) const;

//      template<>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized<3>( const TeamMember & teamMember ) const
      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
        constexpr int numTensorComponents = 3;

        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size(); // num threads
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread

        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0], fad_size_output_);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1], fad_size_output_);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2], fad_size_output_);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
        }
        else {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0]);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1]);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2]);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        const int composedTransformRank = composedTransform_.rank();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            const Data<Scalar,DeviceType> & leftTensorComponent_x  = leftComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & leftTensorComponent_y  = leftComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & leftTensorComponent_z  = leftComponent_.getTensorComponent(2);
            const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);

            const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (const int& fieldOrdinalPointOrdinal) {
              const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
              const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
              if ((fieldOrdinal < leftTensorComponent_x.extent_int(0)) && (pointOrdinal < leftTensorComponent_x.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_x.rank();
                leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_y.extent_int(0)) && (pointOrdinal < leftTensorComponent_y.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_y.rank();
                leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_z.extent_int(0)) && (pointOrdinal < leftTensorComponent_z.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_z.rank();
                leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < rightTensorComponent_x.extent_int(0)) && (pointOrdinal < rightTensorComponent_x.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_x.rank();
                rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_y.extent_int(0)) && (pointOrdinal < rightTensorComponent_y.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_y.rank();
                rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_z.extent_int(0)) && (pointOrdinal < rightTensorComponent_z.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_z.rank();
                rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
              }
            });

            if (composedTransform_.underlyingMatchesLogical())
            {
              if (composedTransformRank == 4) // (C,P,D,D)
              {
                const auto & composedTransformView = composedTransform_.getUnderlyingView4();
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
              else // rank 2, then: (C,P)
              {
                const auto & composedTransformView = composedTransform_.getUnderlyingView2();
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
            }
            else
            {
              if (composedTransformRank == 4) // (C,P,D,D)
              {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
              else // rank 2, then: (C,P)
              {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
            }

//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            // Setting scratchView to 0 here is not necessary from an algorithmic point of view, but *might* help with performance (due to a first-touch policy)
            const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
            for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
            {
              scratchView(i) = 0.0;
            }

            // TODO: consider adding an innerLoopRange that is sized to be the maximum of the size of the inner loops we'd like to parallelize over. (note that this means we do the work in the outer loop redundantly that many times…)
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R;
              // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = jz;
              leftFieldArguments[R]  = iz;

              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (int i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              for (int lx=0; lx<pointBounds[0]; lx++)
              {
                pointArguments[0] = lx;

                // clear Gy scratch:
                // in scratch, Gz (1 entry) comes first, then Gy entries
                const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
                const int Gy_end_index   = scratchOffsetForThread + offsetsForComponentOrdinal_[0];

                for (int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
                {
                  scratchView(Gy_index) = 0;
                }

                for (int ly=0; ly<pointBounds[1]; ly++)
                {
                  pointArguments[1] = ly;

                  Scalar* Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  *Gz = 0;

                  for (int lz=0; lz < pointBounds[2]; lz++)
                  {
                    const Scalar & leftValue  = leftFields_z(iz,lz);
                    const Scalar & rightValue = rightFields_z(jz,lz);

                    pointArguments[2] = lz;
                    int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                    *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                    approximateFlopCount += 3; // 2 multiplies, 1 sum
                  } // lz

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;
                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    const Scalar & leftValue = leftFields_y(iy,ly);

                    for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                    {
                      rightFieldArguments[1] = jy;

                      const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
                      const Scalar & rightValue = rightFields_y(jy,ly);

                      const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
                      const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;

                      const int Gz_index = 0;
                      const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);

                      Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                      Gy += leftValue * Gz * rightValue;
//                      approximateFlopCount += 3; // 2 multiplies, 1 sum
                    }
                  }
                } // ly
                for (int ix=0; ix<leftFieldBounds_[0]; ix++)
                {
                  leftFieldArguments[0] = ix;
                  const Scalar & leftValue = leftFields_x(ix,lx);

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;

                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    for (int jx=0; jx<rightFieldBounds_[0]; jx++)
                    {
                      rightFieldArguments[0] = jx;
                      const Scalar & rightValue = rightFields_x(jx,lx);

                      for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                      {
                        rightFieldArguments[1] = jy;
                        const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);

                        const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];

                        const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
                        Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                        // compute enumeration indices to get the field indices into the output view
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
//                          if ((leftFieldIndex==0) && (rightFieldIndex==2))
//                          {
//                            using std::cout;
//                            using std::endl;
//                            cout << "***** Contribution to (0,0,2) *****\n";
//                            cout << "lx = " << lx << endl;
//                            cout << "ix = " << ix << endl;
//                            cout << "iy = " << iy << endl;
//                            cout << "jx = " << jx << endl;
//                            cout << "jy = " << jy << endl;
//                            cout << "iz = " << iz << endl;
//                            cout << "jz = " << jz << endl;
//                            cout << " leftValue = " <<  leftValue << endl;
//                            cout << "rightValue = " << rightValue << endl;
//                            cout << "Gy = " << Gy << endl;
//
//                            cout << endl;
//                          }

                          // shape (C,F1,F2)
                          integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
                        }
                        else
                        {
                          // shape (F1,F2)
                          integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
                        }
//                        approximateFlopCount += 3; // 2 multiplies, 1 sum
                      } // jy
                    } // jx
                  } // iy
                } // ix
              } // lx
            }); // TeamThreadRange parallel_for - (iz,jz) loop
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }
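      // In effect, runSpecialized3() computes, for each field pair (i,j) with i = (ix,iy,iz) and j = (jx,jy,jz),
      //   integral(i,j) = sum_lx Lx(ix,lx) Rx(jx,lx) * [ sum_ly Ly(iy,ly) Ry(jy,ly) * [ sum_lz Lz(iz,lz) w(lx,ly,lz) Rz(jz,lz) ] ]
      // where w is the cached pointWeights value. The Gz scratch entry holds the innermost sum
      // and the Gy entries hold the middle sums, so that these are reused across all (ix,jx) pairs.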

      template<size_t numTensorComponents>
      KOKKOS_INLINE_FUNCTION
      void run( const TeamMember & teamMember ) const
      {
        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size(); // num threads
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        const int composedTransformRank = composedTransform_.rank();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (const int& r) {
              const Data<Scalar,DeviceType> & leftTensorComponent  = leftComponent_.getTensorComponent(r);
              const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
              const int leftFieldCount  = leftTensorComponent.extent_int(0);
              const int pointCount      = leftTensorComponent.extent_int(1);
              const int leftRank        = leftTensorComponent.rank();
              const int rightFieldCount = rightTensorComponent.extent_int(0);
              const int rightRank       = rightTensorComponent.rank();
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
              {
                // slightly weird logic here in effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
                }
              }
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
              {
                // slightly weird logic here in effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
                }
              }
            });

            if (composedTransformRank == 4)
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
            else
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
              const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R;
              // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = j_R;
              leftFieldArguments[R]  = i_R;

              // TODO: I believe that this can be moved outside the thread parallel_for
              for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
              {
                scratchView(i) = 0.0;
              }
              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (unsigned i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              int r = R;
              while (r >= 0)
              {
                // integrate in final component dimension; this is where we need the M weight, as well as the weighted measure
                const int pointBounds_R = pointBounds[R];
                int & pointArgument_R   = pointArguments[R];
                for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
                {
                  Scalar* G_R;
                  if (R != 0)
                  {
                    G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  }
                  else
                  {
                    const int leftFieldIndex  = i_R + leftFieldOrdinalOffset_;
                    const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;

                    if (integralViewRank == 3)
                    {
                      // shape (C,F1,F2)
                      G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                    }
                    else
                    {
                      // shape (F1,F2)
                      G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                    }
                  }

                  const int & pointIndex_R = pointArguments[R];

                  const Scalar & leftValue  = leftFields(R,i_R,pointIndex_R);
                  const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);

                  int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                  *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                  approximateFlopCount += 3; // 2 multiplies, 1 sum
                } // pointArgument_R

                const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the for loop above
                const int r_min  = (r_next >= 0) ? r_next : 0;

                for (int s=R-1; s>=r_min; s--)
                {
                  const int & pointIndex_s = pointArguments[s];

                  // we want to cover all the multi-indices from s to R-1
                  for (int s1=s; s1<R; s1++)
                  {
                    leftFieldArguments[s1] = 0;
                  }

                  // i_s, j_s are the indices into the "current" tensor component; these are references, so they actually vary as the arguments are incremented
                  const int & i_s = leftFieldArguments[s];
                  const int & j_s = rightFieldArguments[s];

                  int sLeft = s; // hereafter, sLeft is the return from the left-field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indices from rank s through R-1 (inclusive).
                  const int & rightEnumerationSpan_s  = rightFieldRelativeEnumerationSpans_[s];
                  const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];

                  while (sLeft >= s)
                  {
                    const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);

                    // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                    const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);

                    const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);

                    for (int s1=s; s1<R; s1++)
                    {
                      rightFieldArguments[s1] = 0;
                    }
                    int sRight = s; // hereafter, sRight is the return from the right-field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indices from rank s through R-1 (inclusive).
                    while (sRight >= s)
                    {
                      const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);

                      // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                      const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);

                      const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);

                      const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;

                      Scalar* G_s;

                      // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage
                      // (above, we set leftEnumerationIndex_sp and rightEnumerationIndex_sp to 0 in this case, so G_sp_index will then also be 0)
                      const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;

                      const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);

                      if (s != 0)
                      {
                        G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
                      }
                      else
                      {
                        // compute enumeration indices
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
                          // shape (C,F1,F2)
                          G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                        }
                        else
                        {
                          // shape (F1,F2)
                          G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                        }
                      }

                      *G_s += leftValue * G_sp * rightValue;

//                      approximateFlopCount += 3; // 2 multiplies, 1 sum

                      // increment rightField
                      sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
                    }

                    // increment leftField
                    sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
                  }
                }

                // clear tempStorage for r_next+1 to R
                if (r_min+1 <= R)
                {
                  const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
                  for (int i=scratchOffsetForThread; i<endIndex; i++)
                  {
                    scratchView(i) = 0.0;
                  }
//                  auto tempStorageSubview = Kokkos::subview(scratchView, Kokkos::pair<int,int>{0,offsetsForComponentOrdinal_[r_min]});
//                  Kokkos::deep_copy(tempStorageSubview, 0.0);
                }

                // proceed to next point
                r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the G_R integration loop above
              }
            }); // TeamThreadRange parallel_for
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }

      KOKKOS_INLINE_FUNCTION
      void operator()( const TeamMember & teamMember ) const
      {
        switch (numTensorComponents_)
        {
          case 1: run<1>(teamMember); break;
          case 2: run<2>(teamMember); break;
          case 3:
            if (forceNonSpecialized_)
              run<3>(teamMember);
            else
              runSpecialized3(teamMember);
            break;
          case 4: run<4>(teamMember); break;
          case 5: run<5>(teamMember); break;
          case 6: run<6>(teamMember); break;
          case 7: run<7>(teamMember); break;
#ifdef INTREPID2_HAVE_DEBUG
          default:
            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
#endif
        }
      }
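      // The switch above converts the runtime component count into a compile-time template
      // argument for run(); the cases stop at 7, matching the Kokkos::Array<int, 7>
      // offsetsForComponentOrdinal_ member declared above.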

      //! returns an estimate of the number of floating point operations per cell (counting each sum and each multiply as one operation)
      long approximateFlopCountPerCell() const
      {
        // compute flop count on a single cell, then multiply by the number of cells
        int flopCount = 0;

        const int R = numTensorComponents_ - 1;

        // we cache the value of M_ab * cellMeasure at each point.
        // access to cellMeasures involves cellMeasures.numTensorComponents() - 1 multiplies, so the total is the below:
        const int flopsPerPoint_ab = cellMeasures_.numTensorComponents(); // the access involves multiplying all the components together

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);

            int flopsPer_i_R_j_R = 0;

            // as an optimization, we are using the same argument arrays for both the truncated fields that span s to R-1 and the full ones that go to R;
            // the "R" argument here is ignored by the methods that treat the truncated fields (it's beyond their bounds)
            Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
            leftFieldArguments[R] = 0;

            Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
            for (int i=0; i<numTensorComponents_; i++)
            {
              pointArguments[i] = 0;
            }

            Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
            rightFieldArguments[R] = 0;

            int r = R;
            while (r >= 0)
            {
              // integrate in final component dimension
              for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
              {
                flopsPer_i_R_j_R += 4;
              }
              // TODO: figure out why the below is not the same thing as the above -- the below overcounts, somehow
//              if (0 < pointBounds_[R])
//              {
//                flopsPer_i_R_j_R += pointBounds_[R] * 4;
//              }

              const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
              const int r_min  = (r_next >= 0) ? r_next : 0;

              for (int s=R-1; s>=r_min; s--)
              {
                // we want to cover all the multi-indices from s to R-1: for each we have 2 multiplies and one add (3 flops)
                int numLeftIterations  = leftFieldRelativeEnumerationSpans_[s];
                int numRightIterations = rightFieldRelativeEnumerationSpans_[s];

                flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
              }

              // proceed to next point
              r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
            }

            flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
          }
        }
//        std::cout << "flop count per cell: " << flopCount << std::endl;
        return flopCount;
      }

      //! returns the team size to request of Kokkos, given the maximum team size that Kokkos reports
      int teamSize(const int &maxTeamSizeFromKokkos) const
      {
        const int R = numTensorComponents_ - 1;
        const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
        return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
      }
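      // Rationale: the thread-level parallel_for in run() and runSpecialized3() ranges over the
      // (i_R, j_R) field pairs of the final tensor component, so a team larger than
      // leftFieldBounds_[R] * rightFieldBounds_[R] would leave some threads idle.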

      //! provides the shared memory capacity that each team requires
      size_t team_shmem_size (int team_size) const
      {
        // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current cell's field values
        size_t shmem_size = 0;

        if (fad_size_output_ > 0)
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1), fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1));
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
        }

        return shmem_size;
      }
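      // Illustrative launch sketch (hypothetical variable names; the actual launch logic lives in
      // IntegrationTools::integrate(), outside this excerpt):
      //
      //   F_Integrate<Scalar,DeviceType,3> functor(integralData, leftComponent, composedTransform,
      //                                            rightComponent, cellMeasures, a_offset, b_offset,
      //                                            leftFieldOrdinalOffset, rightFieldOrdinalOffset,
      //                                            /*forceNonSpecialized*/ false);
      //   Kokkos::TeamPolicy<typename DeviceType::execution_space> policy(numCells, Kokkos::AUTO());
      //   const int teamSize = functor.teamSize(policy.team_size_max(functor, Kokkos::ParallelForTag()));
      //   Kokkos::parallel_for(Kokkos::TeamPolicy<typename DeviceType::execution_space>(numCells, teamSize), functor);
      //
      // Kokkos consults team_shmem_size() above to size each team's shared-memory arena.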
    };

    /** Worker functor for IntegrationTools::integrate(); variant of F_Integrate that caches
        the pointwise M_ab * cellMeasure products and the component field values in shared memory.
    */
    template<class Scalar, class DeviceType, int integralViewRank>
    class F_IntegratePointValueCache
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // the number of dimensions spanned by the left component
      int rightComponentSpan_; // the number of dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;

      size_t fad_size_output_ = 0; // 0 if Scalar is not a Fad type

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_IntegratePointValueCache(Data<Scalar,DeviceType> integralData,
                                 TensorData<Scalar,DeviceType> leftComponent,
                                 Data<Scalar,DeviceType> composedTransform,
                                 TensorData<Scalar,DeviceType> rightComponent,
                                 TensorData<Scalar,DeviceType> cellMeasures,
                                 int a_offset,
                                 int b_offset,
                                 int leftFieldOrdinalOffset,
                                 int rightFieldOrdinalOffset)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      //! runtime-sized variant of incrementArgument()
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      //! runtime-sized variant of nextIncrementResult()
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        int enumerationIndex = 0;
        for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
        {
          enumerationIndex += arguments[r];
          enumerationIndex *= bounds[r-1];
        }
        enumerationIndex += arguments[startIndex];
        return enumerationIndex;
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(cellDataOrdinal,i,j);
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(i,j);
      }
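      // The two integralViewEntry() overloads use enable_if_t to select, at compile time, the
      // indexing that matches integralViewRank: rank 3 for (C,F1,F2) output (one integral per
      // cell), rank 2 for (F1,F2) output (a single integral shared by all cells).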
1209
1211 KOKKOS_INLINE_FUNCTION
1212 void runSpecialized3( const TeamMember & teamMember ) const
1213 {
1214 constexpr int numTensorComponents = 3;
1215
1216 const int pointBounds_x = pointBounds_[0];
1217 const int pointBounds_y = pointBounds_[1];
1218 const int pointBounds_z = pointBounds_[2];
1219 const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1220
1221 const int leftFieldBounds_x = leftFieldBounds_[0];
1222 const int rightFieldBounds_x = rightFieldBounds_[0];
1223 const int leftFieldBounds_y = leftFieldBounds_[1];
1224 const int rightFieldBounds_y = rightFieldBounds_[1];
1225 const int leftFieldBounds_z = leftFieldBounds_[2];
1226 const int rightFieldBounds_z = rightFieldBounds_[2];
1227
1228 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1229 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1230 for (unsigned r=0; r<numTensorComponents; r++)
1231 {
1232 leftFieldBounds[r] = leftFieldBounds_[r];
1233 rightFieldBounds[r] = rightFieldBounds_[r];
1234 }
1235
1236 const auto integralView = integralView_;
1237 const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1238 const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1239
1240 const int cellDataOrdinal = teamMember.league_rank();
1241 const int threadNumber = teamMember.team_rank();
1242
1243 const int numThreads = teamMember.team_size(); // num threads
1244 const int GyEntryCount = pointBounds_z; // for each thread: store one Gy value per z coordinate
1245 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals; // for caching Gx values: we integrate out the first component dimension for each coordinate in the remaining dimensios
1246 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals; // for caching Gy values (each thread gets a stack, of the same height as tensorComponents - 1)
1247 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GzIntegral; // for one Gz value that we sum into before summing into the destination matrix
1248 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1249
1250 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1251 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1252 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1253 if (fad_size_output_ > 0) {
1254 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1255 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1256 GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads, fad_size_output_);
1257 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1258
1259 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1260 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1261 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1262 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1263 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1264 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1265 }
1266 else {
1267 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1268 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1269 GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads);
1270 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1271
1272 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1273 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1274 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1275 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1276 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1277 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1278 }
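 // Note on the two branches above: when Scalar is a Sacado Fad type, each scalar carries an
 // array of derivative components, so the unmanaged scratch views take one extra runtime
 // dimension giving the derivative-array length (fad_size_output_); for plain arithmetic types
 // the ordinary constructors suffice.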
1279
1280// int approximateFlopCount = 0;
1281// int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
1282
1283 // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1284
1285 const int composedTransformRank = composedTransform_.rank();
1286
1287 // synchronize threads
1288 teamMember.team_barrier();
1289
1290 for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1291 {
1292 const int a = a_offset_ + a_component;
1293 for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1294 {
1295 const int b = b_offset_ + b_component;
1296
1297 const Data<Scalar,DeviceType> & leftTensorComponent_x = leftComponent_.getTensorComponent(0);
1298 const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
1299 const Data<Scalar,DeviceType> & leftTensorComponent_y = leftComponent_.getTensorComponent(1);
1300 const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
1301 const Data<Scalar,DeviceType> & leftTensorComponent_z = leftComponent_.getTensorComponent(2);
1302 const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);
1303
1304 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1305 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (const int& fieldOrdinal) {
1306 if (fieldOrdinal < leftTensorComponent_x.extent_int(0))
1307 {
1308 const int pointCount = leftTensorComponent_x.extent_int(1);
1309 const int leftRank = leftTensorComponent_x.rank();
1310 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1311 {
1312 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1313 }
1314 }
1315 if (fieldOrdinal < leftTensorComponent_y.extent_int(0))
1316 {
1317 const int pointCount = leftTensorComponent_y.extent_int(1);
1318 const int leftRank = leftTensorComponent_y.rank();
1319 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1320 {
1321 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1322 }
1323 }
1324 if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
1325 {
1326 const int pointCount = leftTensorComponent_z.extent_int(1);
1327 const int leftRank = leftTensorComponent_z.rank();
1328 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1329 {
1330 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1331 }
1332 }
1333 if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1334 {
1335 const int pointCount = rightTensorComponent_x.extent_int(1);
1336 const int rightRank = rightTensorComponent_x.rank();
1337 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1338 {
1339 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
1340 }
1341 }
1342 if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1343 {
1344 const int pointCount = rightTensorComponent_y.extent_int(1);
1345 const int rightRank = rightTensorComponent_y.rank();
1346 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1347 {
1348 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
1349 }
1350 }
1351 if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1352 {
1353 const int pointCount = rightTensorComponent_z.extent_int(1);
1354 const int rightRank = rightTensorComponent_z.rank();
1355 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1356 {
1357 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
1358 }
1359 }
1360 });
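 // The parallel_for above stages this (a,b) component's basis values from the TensorData
 // containers into team scratch memory; the Gx/Gy/Gz accumulations below then read these
 // values repeatedly from fast scratch rather than from the global tensor containers.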
1361
1362 if (composedTransformRank == 4) // (C,P,D,D)
1363 {
1364 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1365 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1366 });
1367 }
1368 else // (C,P)
1369 {
1370 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1371 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1372 });
1373 }
1374
1375 // synchronize threads
1376 teamMember.team_barrier();
1377
1378 for (int i0=0; i0<leftFieldBounds_x; i0++)
1379 {
1380 for (int j0=0; j0<rightFieldBounds_x; j0++)
1381 {
1382 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1383 // first component is fastest-moving; we can get a tensorPointEnumerationOffset just by multiplying by the pointBounds in x
1384 const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions; // compute offset for pointWeights container, for which x is the fastest-moving
1385
1386 Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1387
1388 Gx = 0;
1389 if (fad_size_output_ == 0)
1390 {
1391 // not a Fad type; we're allowed to use a vector range
1392 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1393 {
1394 integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1395 }, Gx);
1396 }
1397 else
1398 {
1399 for (int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1400 {
1401 Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1402 }
1403 }
1404 });
1405
1406 // synchronize threads
1407 teamMember.team_barrier();
1408
1409 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (const int& i1j1) {
1410 const int i1 = i1j1 % leftFieldBounds_y;
1411 const int j1 = i1j1 / leftFieldBounds_y;
1412
1413 int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1414
1415 int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1416 for (int lz=0; lz<pointBounds_z; lz++)
1417 {
1418 Scalar & Gy = GyIntegrals(Gy_index);
1419 Gy = 0.0;
1420
1421 for (int ly=0; ly<pointBounds_y; ly++)
1422 {
1423 const Scalar & leftValue = leftFields_y(i1,ly);
1424 const Scalar & rightValue = rightFields_y(j1,ly);
1425
1426 Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1427
1428 pointEnumerationIndex++;
1429 }
1430 Gy_index++;
1431 }
1432
1433 Scalar & Gz = GzIntegral(threadNumber); // one entry per thread
1434 for (int i2=0; i2<leftFieldBounds_z; i2++)
1435 {
1436 for (int j2=0; j2<rightFieldBounds_z; j2++)
1437 {
1438 Gz = 0.0;
1439
1440 int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1441
1442 for (int lz=0; lz<pointBounds_z; lz++)
1443 {
1444 const Scalar & leftValue = leftFields_z(i2,lz);
1445 const Scalar & rightValue = rightFields_z(j2,lz);
1446
1447 Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1448
1449 Gy_index++;
1450 }
1451
1452 const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1453 const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1454 // the lines above are an optimization of the two commented-out lines below, adopted to work around a strange Intel compiler segfault (possibly a compiler bug):
1455// const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset;
1456// const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset;
1457
1458 integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1459 }
1460 }
1461 });
1462 // synchronize threads
1463 teamMember.team_barrier();
1464 }
1465 }
1466 }
1467 }
1468 }
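 // For reference, a minimal serial sketch of the sum factorization implemented above, for a
 // single (i0,i1,i2) x (j0,j1,j2) entry (illustrative only -- not part of Intrepid2; the team
 // implementation above additionally shares Gx across all (i1,j1,i2,j2) and caches Gy per thread):
 //
 //   // w is indexed with x fastest-moving: w[lx + nx*(ly + ny*lz)] stores M_ab * cell measure
 //   double Gz = 0.0;
 //   for (int lz=0; lz<nz; lz++)
 //   {
 //     double Gy = 0.0;
 //     for (int ly=0; ly<ny; ly++)
 //     {
 //       double Gx = 0.0;
 //       for (int lx=0; lx<nx; lx++)
 //       {
 //         Gx += fx[i0][lx] * gx[j0][lx] * w[lx + nx*(ly + ny*lz)];
 //       }
 //       Gy += fy[i1][ly] * gy[j1][ly] * Gx;
 //     }
 //     Gz += fz[i2][lz] * gz[j2][lz] * Gy;
 //   }
 //   integral(c,i,j) += Gz;
 //
 // Hoisting the Gx sums out of the later field loops is what turns the naive O((p+1)^9)
 // contraction into the O((p+1)^7) algorithm referenced in integrate() below.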
1469
1470 template<size_t numTensorComponents>
1471 KOKKOS_INLINE_FUNCTION
1472 void run( const TeamMember & teamMember ) const
1473 {
1474 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "implementation incomplete");
1475// Kokkos::Array<int,numTensorComponents> pointBounds;
1476// Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1477// Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1478//
1479// int pointsInNonzeroComponentDimensions = 1;
1480// for (unsigned r=0; r<numTensorComponents; r++)
1481// {
1482// pointBounds[r] = pointBounds_[r];
1483// if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1484// leftFieldBounds[r] = leftFieldBounds_[r];
1485// rightFieldBounds[r] = rightFieldBounds_[r];
1486// }
1487//
1488// const int cellDataOrdinal = teamMember.league_rank();
1489// const int numThreads = teamMember.team_size(); // num threads
1490// const int G_k_StackHeight = numTensorComponents - 1; // per thread
1491// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_0_IntegralsView; // for caching G0 values: we integrate out the first component dimension for each coordinate in the remaining dimensions
1492// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_k_StackView; // for caching G_k values (each thread gets a stack, of the same height as tensorComponents - 1)
1493// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1494// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values at each level for faster access
1495// if (fad_size_output_ > 0) {
1496// G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads, fad_size_output_);
1497// G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1498// pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1499// leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1500// rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1501// }
1502// else {
1503// G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads);
1504// G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1505// pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1506// leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1507// rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1508// }
1509//
1512//
1513// constexpr int R = numTensorComponents - 1;
1514//
1515// // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1516//
1517// // synchronize threads
1518// teamMember.team_barrier();
1519//
1520// for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1521// {
1522// const int a = a_offset_ + a_component;
1523// for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1524// {
1525// const int b = b_offset_ + b_component;
1526//
1527// const Data<Scalar,DeviceType> & leftFirstComponent = leftComponent_.getTensorComponent(0);
1528// const Data<Scalar,DeviceType> & rightFirstComponent = rightComponent_.getTensorComponent(0);
1529//
1530// const int numLeftFieldsFirst = leftFirstComponent.extent_int(0); // shape (F,P[,D])
1531// const int numRightFieldsFirst = rightFirstComponent.extent_int(0); // shape (F,P[,D])
1532//
1533// const int numPointsFirst = leftFirstComponent.extent_int(1);
1534//
1535// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1536// pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1537// });
1538//
1539// // synchronize threads
1540// teamMember.team_barrier();
1541//
1542// for (int i0=0; i0<numLeftFieldsFirst; i0++)
1543// {
1544// for (int j0=0; j0<numRightFieldsFirst; j0++)
1545// {
1546// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1547// const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1548// const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1549// leftFields(pointOrdinal) = leftFirstComponent(fieldOrdinal,pointOrdinal);
1550// });
1551//
1552// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numRightFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1553// const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1554// const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1555// rightFields(pointOrdinal) = rightFirstComponent(fieldOrdinal,pointOrdinal);
1556// });
1557//
1558// // synchronize threads
1559// teamMember.team_barrier();
1560//
1561// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1562// Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1563// int remainingIndex = pointEnumerationIndexLaterDimensions;
1564//
1565// for (int d=R-1; d>0; d--) // last component (z in 3D hypercube) is fastest-moving // TODO: consider doing first component as fastest-moving. That would make indexing into pointWeights simpler
1566// {
1567// pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1];
1568// remainingIndex /= pointBounds[d+1];
1569// }
1570// pointArgumentsInLaterDimensions[0] = remainingIndex;
1571//
1572// int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1573// for (int d=R; d>0; d--)
1574// {
1575// tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1576// tensorPointEnumerationOffset *= pointBounds[d-1];
1577// }
1578//
1579// Scalar integralValue = 0;
1580// if (fad_size_output_ == 0)
1581// {
1582// // not a Fad type; we're allowed to use a vector range
1583// Kokkos::parallel_reduce("first component integral", Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1584// {
1585// integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1586// }, integralValue);
1587// }
1588// else
1589// {
1590// for (int pointOrdinal=0; pointOrdinal<numPointsFirst; pointOrdinal++)
1591// {
1592// integralValue += leftFields(pointOrdinal) * rightFields(pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1593// }
1594// }
1595//
1596// G_0_IntegralsView(pointEnumerationIndexLaterDimensions) = integralValue;
1597// });
1598//
1599// // synchronize threads
1600// teamMember.team_barrier();
1601//
1602// // TODO: finish this, probably after having written up the algorithm for arbitrary component count. (I have it written down for 3D.)
1603// }
1604// }
1605// }
1606// }
1608 }
1609
1610 KOKKOS_INLINE_FUNCTION
1611 void operator()( const TeamMember & teamMember ) const
1612 {
1613 switch (numTensorComponents_)
1614 {
1615 case 1: run<1>(teamMember); break;
1616 case 2: run<2>(teamMember); break;
1617 case 3: runSpecialized3(teamMember); break;
1618 case 4: run<4>(teamMember); break;
1619 case 5: run<5>(teamMember); break;
1620 case 6: run<6>(teamMember); break;
1621 case 7: run<7>(teamMember); break;
1622#ifdef INTREPID2_HAVE_DEBUG
1623 default:
1624 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
1625#endif
1626 }
1627 }
1628
1629 //! returns an approximate count of the floating point operations performed per cell
1630 int approximateFlopCountPerCell() const
1631 {
1632 // compute flop count on a single cell
1633 int flopCount = 0;
1634
1635 constexpr int numTensorComponents = 3;
1636 Kokkos::Array<int,numTensorComponents> pointBounds;
1637 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1638 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1639
1640 int pointsInNonzeroComponentDimensions = 1;
1641 for (unsigned r=0; r<numTensorComponents; r++)
1642 {
1643 pointBounds[r] = pointBounds_[r];
1644 if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1645 leftFieldBounds[r] = leftFieldBounds_[r];
1646 rightFieldBounds[r] = rightFieldBounds_[r];
1647 }
1648
1649 for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1650 {
1651 for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1652 {
1653// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1654// pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1655// });
1656 flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1657
1658 for (int i0=0; i0<leftFieldBounds[0]; i0++)
1659 {
1660 for (int j0=0; j0<rightFieldBounds[0]; j0++)
1661 {
1662 flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3; // 3 flops per integration point in the loop commented out below
1663// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1664// Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1665// int remainingIndex = pointEnumerationIndexLaterDimensions;
1666//
1667// for (int d=0; d<R-1; d++) // first component is fastest-moving; this is determined by order of access in the lz/ly loop to compute Gy (integrals in y dimension)
1668// {
1669// pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1]; // d+1 because x dimension is being integrated away
1670// remainingIndex /= pointBounds[d+1];
1671// }
1672// pointArgumentsInLaterDimensions[R-1] = remainingIndex;
1673//
1674// int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1675// for (int d=R; d>0; d--)
1676// {
1677// tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1678// tensorPointEnumerationOffset *= pointBounds[d-1];
1679// }
1680//
1681// Scalar integralValue = 0;
1682// if (fad_size_output_ == 0)
1683// {
1684// // not a Fad type; we're allowed to use a vector range
1685// Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1686// {
1687// integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1688// }, integralValue);
1689// }
1690// else
1691// {
1692// for (int x_pointOrdinal=0; x_pointOrdinal<numPointsFirst; x_pointOrdinal++)
1693// {
1694// integralValue += leftFields_x(x_pointOrdinal) * rightFields_x(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1695// }
1696// }
1697//
1698// GxIntegrals(pointEnumerationIndexLaterDimensions) = integralValue;
1699// });
1700
1701
1702 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3; // 3 flops for each Gy += line in the loop below
1703 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3; // 3 flops for each Gz += line in the loop below
1704 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1; // 1 flop for the integralView += line below
1705
1706// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds[1] * rightFieldBounds[1]), [&] (const int& i1j1) {
1707// const int i1 = i1j1 % leftFieldBounds[1];
1708// const int j1 = i1j1 / leftFieldBounds[1];
1709//
1711//
1712// int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1713// for (int lz=0; lz<pointBounds[2]; lz++)
1714// {
1715// Scalar & Gy = GyIntegrals(Gy_index);
1716// Gy = 0.0;
1717//
1718// const bool leftRankIs3 = ( leftFields_y.rank() == 3);
1719// const bool rightRankIs3 = (rightFields_y.rank() == 3);
1720// for (int ly=0; ly<pointBounds[1]; ly++)
1721// {
1722// const Scalar & leftValue = leftRankIs3 ? leftFields_y(i1,ly,a_component) : leftFields_y(i1,ly);
1723// const Scalar & rightValue = rightRankIs3 ? rightFields_y(j1,ly,b_component) : rightFields_y(j1,ly);
1724//
1725// Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1726//
1727// pointEnumerationIndex++;
1728// }
1729// Gy_index++;
1730// }
1731//
1732// for (int i2=0; i2<leftFieldBounds[2]; i2++)
1733// {
1734// for (int j2=0; j2<rightFieldBounds[2]; j2++)
1735// {
1736// Scalar Gz = 0.0;
1737//
1738// int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1739//
1740// const bool leftRankIs3 = ( leftFields_z.rank() == 3);
1741// const bool rightRankIs3 = (rightFields_z.rank() == 3);
1742// for (int lz=0; lz<pointBounds[2]; lz++)
1743// {
1744// const Scalar & leftValue = leftRankIs3 ? leftFields_z(i2,lz,a_component) : leftFields_z(i2,lz);
1745// const Scalar & rightValue = rightRankIs3 ? rightFields_z(j2,lz,b_component) : rightFields_z(j2,lz);
1746//
1747// Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1748//
1749// Gy_index++;
1750// }
1751//
1752// Kokkos::Array<int,3> leftArguments {i0,i1,i2};
1753// Kokkos::Array<int,3> rightArguments {j0,j1,j2};
1754//
1755// const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset_;
1756// const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset_;
1757//
1758// if (integralViewRank == 2)
1759// {
1760// integralView_.access(i,j,0) += Gz;
1761// }
1762// else
1763// {
1764// integralView_.access(cellDataOrdinal,i,j) += Gz;
1765// }
1766// }
1767// }
1768// });
1769 }
1770 }
1771 }
1772 }
1773 return flopCount;
1774 }
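 // A rough isotropic-degree sanity check on the count above (illustrative, assuming (p+1)
 // fields and (p+1) points per tensor component): the Gy term contributes
 // (p+1)^2 [i0,j0] * (p+1)^2 [i1,j1] * (p+1)^2 [ly,lz] * 3 = 3(p+1)^6 flops, and the Gz term
 // (p+1)^2 [i0,j0] * (p+1)^4 [i1,j1,i2,j2] * (p+1) [lz] * 3 = 3(p+1)^7 flops, consistent with
 // the O((p+1)^7) scaling cited for the sum-factorized algorithm in integrate() below.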
1775
1777 int teamSize(const int &maxTeamSizeFromKokkos) const
1778 {
1779 // TODO: fix this to match the actual parallelism expressed
1780 const int R = numTensorComponents_ - 1;
1781 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1782 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1783 }
1784
1786 size_t team_shmem_size (int numThreads) const
1787 {
1788 // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
1789 size_t shmem_size = 0;
1790
1791 const int GyEntryCount = pointBounds_[2]; // for each thread: store one Gy value per z coordinate
1792
1793 int pointsInNonzeroComponentDimensions = 1;
1794 for (int d=1; d<numTensorComponents_; d++)
1795 {
1796 pointsInNonzeroComponentDimensions *= pointBounds_[d];
1797 }
1798
1799 if (fad_size_output_ > 0)
1800 {
1801 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_); // GxIntegrals: entries with x integrated away
1802 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_); // GyIntegrals: entries with x,y integrated away
1803 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads, fad_size_output_); // GzIntegral: entry with x,y,z integrated away
1804 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1), fad_size_output_); // pointWeights
1805
1806 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_); // leftFields_x
1807 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_); // rightFields_x
1808 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_); // leftFields_y
1809 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_); // rightFields_y
1810 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_); // leftFields_z
1811 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_); // rightFields_z
1812 }
1813 else
1814 {
1815 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions); // GxIntegrals: entries with x integrated away
1816 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads); // GyIntegrals: entries with x,y integrated away
1817 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads); // GzIntegral: entry with x,y,z integrated away
1818 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1)); // pointWeights
1819
1820 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]); // leftFields_x
1821 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]); // rightFields_x
1822 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]); // leftFields_y
1823 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]); // rightFields_y
1824 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]); // leftFields_z
1825 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]); // rightFields_z
1826 }
1827
1828 return shmem_size;
1829 }
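 // Kokkos calls a functor's team_shmem_size(teamSize) when a TeamPolicy parallel_for is
 // launched, so the byte counts accumulated above are what back the unmanaged scratch views
 // constructed in runSpecialized3(). A sketch of the launch pattern used in integrate() below:
 //
 //   Kokkos::TeamPolicy<ExecutionSpace> policy(cellDataExtent, teamSize, vectorSize);
 //   Kokkos::parallel_for("F_Integrate", policy, functor); // scratch sized via functor.team_shmem_size()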
1830 };
1831
1832 template<class Scalar, class DeviceType>
1833 class F_RefSpaceIntegral
1834 {
1835 ScalarView<Scalar,DeviceType> integral_;
1836 Data<Scalar,DeviceType> left_;
1837 Data<Scalar,DeviceType> right_;
1838 Data<Scalar,DeviceType> weights_;
1839 ordinal_type dimSpan_;
1840 ordinal_type leftRank_;
1841 ordinal_type rightRank_;
1842 ordinal_type numPoints_;
1843 public:
1844 F_RefSpaceIntegral(ScalarView<Scalar,DeviceType> integralView,
1845 Data<Scalar,DeviceType> left, Data<Scalar,DeviceType> right, Data<Scalar,DeviceType> weights,
1846 ordinal_type dimSpan)
1847 :
1848 integral_(integralView),
1849 left_(left),
1850 right_(right),
1851 weights_(weights),
1852 dimSpan_(dimSpan)
1853 {
1854 leftRank_ = left.rank();
1855 rightRank_ = right.rank();
1856 numPoints_ = weights.extent_int(0);
1857 }
1858
1859 KOKKOS_INLINE_FUNCTION
1860 void operator()( const ordinal_type & i, const ordinal_type & j ) const
1861 {
1862 Scalar refSpaceIntegral = 0.0;
1863 for (int ptOrdinal=0; ptOrdinal<numPoints_; ptOrdinal++)
1864 {
1865 const Scalar & weight = weights_(ptOrdinal);
1866 for (int a=0; a<dimSpan_; a++)
1867 {
1868 const Scalar & leftValue = ( leftRank_ == 2) ? left_(i,ptOrdinal) : left_(i,ptOrdinal,a);
1869 const Scalar & rightValue = (rightRank_ == 2) ? right_(j,ptOrdinal) : right_(j,ptOrdinal,a);
1870 refSpaceIntegral += leftValue * rightValue * weight;
1871 }
1872 }
1873 integral_(i,j) = refSpaceIntegral;
1874 }
1875 };
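 // In symbols, F_RefSpaceIntegral computes, for each field pair (i,j):
 //   integral(i,j) = sum_p w(p) * sum_a left(i,p,a) * right(j,p,a),
 // i.e., the reference-space inner product of two basis-component containers, with no
 // cell-dependent transform applied.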
1876 }
1877
1878template<typename DeviceType>
1879template<class Scalar>
1880Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const TransformedBasisValues<Scalar,DeviceType> vectorDataLeft,
1881 const TensorData<Scalar,DeviceType> cellMeasures,
1882 const TransformedBasisValues<Scalar,DeviceType> vectorDataRight)
1883{
1884 // allocates a (C,F,F) container for storing integral data
1885
1886 // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1887 // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1888 const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1889 const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1890 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1891
1892 // determine cellDataExtent and variation type. We currently support CONSTANT, MODULAR, and GENERAL as possible output variation types, depending on the inputs.
1893 // If cellMeasures has non-trivial tensor structure, the rank-1 cell Data object is the first component.
1894 // If cellMeasures has trivial tensor structure, then the first and only component has the cell index in its first dimension.
1895 // I.e., either way the relevant Data object is cellMeasures.getTensorComponent(0)
1896 const int CELL_DIM = 0;
1897 const auto cellMeasureData = cellMeasures.getTensorComponent(0);
1898 const auto leftTransform = vectorDataLeft.transform();
1899
1900 DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1901 // transforms may be invalid, indicating an identity transform. If so, it will not constrain the output at all.
1902 if (vectorDataLeft.transform().isValid())
1903 {
1904 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.transform().getDimensionInfo(CELL_DIM));
1905 }
1906 if (vectorDataRight.transform().isValid())
1907 {
1908 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.transform().getDimensionInfo(CELL_DIM));
1909 }
1910
1911 DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1912 int cellDataExtent = combinedCellDimInfo.dataExtent;
1913
1914 const int numCells = vectorDataLeft.numCells();
1915 const int numFieldsLeft = vectorDataLeft.numFields();
1916 const int numFieldsRight = vectorDataRight.numFields();
1917
1918 Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1919 Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1920
1921 if (cellVariationType != CONSTANT)
1922 {
1923 Kokkos::View<Scalar***,DeviceType> data("Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1924 return Data<Scalar,DeviceType>(data, extents, variationTypes);
1925 }
1926 else
1927 {
1928 Kokkos::View<Scalar**,DeviceType> data("Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1929 return Data<Scalar,DeviceType>(data, extents, variationTypes);
1930 }
1931}
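// Typical usage of allocateIntegralData() together with integrate() below (a sketch; the
// construction of the transformed basis values and cell measures is assumed to happen
// elsewhere, and transformedValues/cellMeasures are illustrative names):
//
//   auto integrals = IntegrationTools<DeviceType>::allocateIntegralData(transformedValues, cellMeasures, transformedValues);
//   IntegrationTools<DeviceType>::integrate(integrals, transformedValues, cellMeasures, transformedValues, /*sumInto*/ false, /*approximateFlops*/ NULL);
//   // integrals now holds (C,F,F) values, stored compactly according to its variation types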
1932
1936template<typename DeviceType>
1937template<class Scalar>
1938void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals, const TransformedBasisValues<Scalar,DeviceType> & basisValuesLeft,
1939 const TensorData<Scalar,DeviceType> & cellMeasures,
1940 const TransformedBasisValues<Scalar,DeviceType> & basisValuesRight, const bool sumInto,
1941 double* approximateFlops)
1942{
1943 using ExecutionSpace = typename DeviceType::execution_space;
1944
1945 // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1946 // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1947 const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1948 const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1949 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1950
1951 if (approximateFlops != NULL)
1952 {
1953 *approximateFlops = 0;
1954 }
1955
1956 // integral data may have shape (C,F1,F2) or (if the variation type is CONSTANT in the cell dimension) shape (F1,F2)
1957 const int integralViewRank = integrals.getUnderlyingViewRank();
1958
1959 if (!sumInto)
1960 {
1961 integrals.clear();
1962 }
1963
1964 const int cellDataExtent = integrals.getDataExtent(0);
1965 const int numFieldsLeft = basisValuesLeft.numFields();
1966 const int numFieldsRight = basisValuesRight.numFields();
1967 const int spaceDim = basisValuesLeft.spaceDim();
1968
1969 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.spaceDim() != basisValuesRight.spaceDim(), std::invalid_argument, "vectorDataLeft and vectorDataRight must agree on the space dimension");
1970
1971 const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
1972 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
1973
1974 // we require that the number of tensor components in the vectors be the same for each vector entry
1975 // this is not strictly necessary, but it makes the implementation easier, and we don't at present anticipate other use cases
1976 int numTensorComponentsLeft = -1;
1977 const bool isVectorValued = basisValuesLeft.vectorData().isValid();
1978 if (isVectorValued)
1979 {
1980 const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
1981 INTREPID2_TEST_FOR_EXCEPTION(!rightIsVectorValued, std::invalid_argument, "left and right must either both be vector-valued, or both scalar-valued");
1982 const auto &refVectorLeft = basisValuesLeft.vectorData();
1983 int numFamiliesLeft = refVectorLeft.numFamilies();
1984 int numVectorComponentsLeft = refVectorLeft.numComponents();
1985 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1986 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
1987 for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1988 {
1989 for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1990 {
1991 const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
1992 if (tensorData.numTensorComponents() > 0)
1993 {
1994 if (numTensorComponentsLeft == -1)
1995 {
1996 numTensorComponentsLeft = tensorData.numTensorComponents();
1997 }
1998 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1999 for (int r=0; r<numTensorComponentsLeft; r++)
2000 {
2001 maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
2002 }
2003 }
2004 }
2005 }
2006 int numTensorComponentsRight = -1;
2007 const auto &refVectorRight = basisValuesRight.vectorData();
2008 int numFamiliesRight = refVectorRight.numFamilies();
2009 int numVectorComponentsRight = refVectorRight.numComponents();
2010 for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
2011 {
2012 for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
2013 {
2014 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
2015 if (tensorData.numTensorComponents() > 0)
2016 {
2017 if (numTensorComponentsRight == -1)
2018 {
2019 numTensorComponentsRight = tensorData.numTensorComponents();
2020 }
2021 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
2022 for (int r=0; r<numTensorComponentsRight; r++)
2023 {
2024 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
2025 }
2026 }
2027 }
2028 }
2029 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument, "Left and right vector entries must have the same number of tensorial components");
2030 }
2031 else
2032 {
2033 numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
2034 for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
2035 {
2036 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
2037 }
2038
2039 // check that right tensor component count also agrees
2040 for (int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
2041 {
2042 INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
2043 }
2044 }
2045 const int numPointTensorComponents = cellMeasures.numTensorComponents() - 1;
2046
2047 if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.axisAligned() && basisValuesRight.axisAligned())
2048 {
2049 // cellMeasures is a non-trivial tensor product, and the pullbacks are all diagonals.
2050
2051 // in this case, the integrals in each tensorial direction are entirely separable
2052 // allocate some temporary storage for the integrals in each tensorial direction
2053
2054 // if cellMeasures is a nontrivial tensor product, that means that all cells have the same shape, up to scaling.
2055
2056 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
2057 for (int r=0; r<numPointTensorComponents; r++)
2058 {
2059 // first tensorial component of cell measures is the cell dimension; after that we have (P1,P2,…)
2060 pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0);
2061 }
2062
2063 // only one of these will be a valid container:
2064 Kokkos::View<Scalar**, DeviceType> integralView2;
2065 Kokkos::View<Scalar***, DeviceType> integralView3;
2066 if (integralViewRank == 3)
2067 {
2068 integralView3 = integrals.getUnderlyingView3();
2069 }
2070 else
2071 {
2072 integralView2 = integrals.getUnderlyingView2();
2073 }
2074 for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2075 {
2076 int a_offset = 0; // left vector component offset
2077 int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
2078
2079 const int leftVectorComponentCount = isVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
2080 for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2081 {
2082 TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
2083 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2084 if (!leftComponent.isValid())
2085 {
2086 a_offset++; // empty components are understood to take up one dimension
2087 continue;
2088 }
2089 const int leftDimSpan = leftComponent.extent_int(2);
2090
2091 const int leftComponentFieldCount = leftComponent.extent_int(0);
2092
2093 for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2094 {
2095 int b_offset = 0; // right vector component offset
2096 int rightFieldOffset = basisValuesRight.basisValues().familyFieldOrdinalOffset(rightFamilyOrdinal);
2097
2098 const int rightVectorComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2099 for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2100 {
2101 TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
2102 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2103 if (!rightComponent.isValid())
2104 {
2105 b_offset++; // empty components are understood to take up one dimension
2106 continue;
2107 }
2108 const int rightDimSpan = rightComponent.extent_int(2);
2109
2110 const int rightComponentFieldCount = rightComponent.extent_int(0);
2111
2112 // we only accumulate for a == b (since this is the axis-aligned case). Do the a, b spans intersect?
2113 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset)) // no intersection
2114 {
2115 b_offset += rightDimSpan;
2116
2117 continue;
2118 }
2119
2120 // if the a, b spans intersect, by assumption they should align exactly
2121 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error, "left and right dimension offsets should match.");
2122 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2123
2124 const int d_start = a_offset;
2125 const int d_end = d_start + leftDimSpan;
2126
2127 using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2128 ComponentIntegralsArray componentIntegrals;
2129 for (int r=0; r<numPointTensorComponents; r++)
2130 {
2131 /*
2132 Four vector data cases to consider:
2133 1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
2134 2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
2135 3. First container is axial, second arbitrary.
2136 4. First is arbitrary, second axial.
2137
2138 But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed. The actual difference between
2139 the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
2140
2141 The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
2142 while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
2143 */
2144
2145 Data<Scalar,DeviceType> quadratureWeights = cellMeasures.getTensorComponent(r+1);
2146 const int numPoints = pointDimensions[r];
2147
2148 // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.)
2149
2150 Data<Scalar,DeviceType> leftTensorComponent = leftComponent.getTensorComponent(r);
2151 Data<Scalar,DeviceType> rightTensorComponent = rightComponent.getTensorComponent(r);
2152
2153 const int leftTensorComponentDimSpan = leftTensorComponent.extent_int(2);
2154 const int leftTensorComponentFields = leftTensorComponent.extent_int(0);
2155 const int rightTensorComponentDimSpan = rightTensorComponent.extent_int(2);
2156 const int rightTensorComponentFields = rightTensorComponent.extent_int(0);
2157
2158 INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2159
2160 for (int d=d_start; d<d_end; d++)
2161 {
2162 ScalarView<Scalar,DeviceType> componentIntegralView;
2163
2164 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
2165 if (allocateFadStorage)
2166 {
2167 auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
2168 componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2169 }
2170 else
2171 {
2172 componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2173 }
2174
2175 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2176
2177 Impl::F_RefSpaceIntegral<Scalar, DeviceType> refSpaceIntegralFunctor(componentIntegralView, leftTensorComponent, rightTensorComponent, quadratureWeights,
2178 leftTensorComponentDimSpan);
2179 Kokkos::parallel_for("compute componentIntegrals", policy, refSpaceIntegralFunctor);
2180
2181 componentIntegrals[r][d] = componentIntegralView;
2182
2183 if (approximateFlops != NULL)
2184 {
2185 *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3); // two multiplies, one add in innermost loop
2186 }
2187 } // d
2188 } // r
2189
2190 ExecutionSpace().fence();
2191
2192 Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in an effort to get around Intel 17.0.1 compiler weirdness.
2193 Kokkos::Array<int,3> lowerBounds {0,0,0};
2194 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2195 // TODO: note that for best performance, especially with Fad types, we should replace this parallel for with a Functor and use hierarchical parallelism
2196 Kokkos::parallel_for("compute field integrals", policy,
2197 KOKKOS_LAMBDA (const int &cellDataOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
2198 const Scalar & cellMeasureWeight = cellMeasures.getTensorComponent(0)(cellDataOrdinal);
2199
2200 TensorArgumentIterator leftTensorIterator(leftComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2201 leftTensorIterator.setEnumerationIndex(leftFieldOrdinal);
2202
2203 TensorArgumentIterator rightTensorIterator(rightComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2204 rightTensorIterator.setEnumerationIndex(rightFieldOrdinal);
2205
2206 Scalar integralSum = 0.0;
2207 for (int d=d_start; d<d_end; d++)
2208 {
2209 const Scalar & transformLeft_d = basisValuesLeft.transformWeight(cellDataOrdinal,0,d,d);
2210 const Scalar & transformRight_d = basisValuesRight.transformWeight(cellDataOrdinal,0,d,d);
2211
2212 const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2213 // approximateFlopCount++;
2214
2215 Scalar integral_d = 1.0;
2216
2217 for (int r=0; r<numPointTensorComponents; r++)
2218 {
2219 integral_d *= componentIntegrals[r][d](leftTensorIterator.argument(r),rightTensorIterator.argument(r));
2220 // approximateFlopCount++; // product
2221 }
2222 integralSum += leftRightTransform_d * integral_d;
2223 // approximateFlopCount += 2; // multiply and sum
2224 }
2225
2226 const int i = leftFieldOrdinal + leftFieldOffset;
2227 const int j = rightFieldOrdinal + rightFieldOffset;
2228
2229 if (integralViewRank == 3)
2230 {
2231 integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2232 }
2233 else
2234 {
2235 integralView2(i,j) += cellMeasureWeight * integralSum;
2236 }
2237 });
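 // In symbols, the lambda above accumulates, for each (cell c, left field i, right field j):
 //   integral(c,i,j) += w_c * sum_d [ T_L(c,d,d) * T_R(c,d,d) * prod_r componentIntegrals[r][d](i_r, j_r) ],
 // where w_c is the cell-measure scale, T_L and T_R are the diagonal (axis-aligned) transform
 // weights, and (i_r, j_r) are the tensor-component arguments of field ordinals i and j.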
2238 b_offset += rightDimSpan;
2239 } // rightVectorComponentOrdinal
2240 } // rightFamilyOrdinal
2241 a_offset += leftDimSpan;
2242 } // leftVectorComponentOrdinal
2243 } // leftFamilyOrdinal
2244
2245 if (approximateFlops != NULL)
2246 {
2247 // TODO: check the accuracy of this
2248 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2249 }
2250 }
2251 else // general case (not axis-aligned + affine tensor-product structure)
2252 {
2253 // prepare composed transformation matrices
2254 const Data<Scalar,DeviceType> & leftTransform = basisValuesLeft.transform();
2255 const Data<Scalar,DeviceType> & rightTransform = basisValuesRight.transform();
2256 const bool transposeLeft = true;
2257 const bool transposeRight = false;
2258// auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
2259// timer->start();
2260 // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2
2261 const bool matrixTransform = (leftTransform.rank() == 4) || (rightTransform.rank() == 4);
2262 Data<Scalar,DeviceType> composedTransform;
2263 // invalid/empty transforms are used when the identity is intended.
2264 if (leftTransform.isValid() && rightTransform.isValid())
2265 {
2266 if (matrixTransform)
2267 {
2268 composedTransform = leftTransform.allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2269 composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2270
2271 // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount.
2272 if (approximateFlops != NULL)
2273 {
2274 *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
2275 }
2276 }
2277 else
2278 {
2279 composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform);
2280 composedTransform.storeInPlaceProduct(leftTransform, rightTransform);
2281
2282 // re-cast composedTransform as a rank 4 (C,P,D,D) object -- a 1 x 1 matrix at each (C,P).
2283 const int newRank = 4;
2284 auto extents = composedTransform.getExtents();
2285 auto variationTypes = composedTransform.getVariationTypes();
2286 composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2287 if (approximateFlops != NULL)
2288 {
2289 *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2290 }
2291 }
2292 }
2293 else if (leftTransform.isValid())
2294 {
2295 // rightTransform is the identity
2296 composedTransform = leftTransform;
2297 }
2298 else if (rightTransform.isValid())
2299 {
2300 // leftTransform is the identity
2301 composedTransform = rightTransform;
2302 }
2303 else
2304 {
2305 // both left and right transforms are identity
2306 Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.numCells(),basisValuesLeft.numPoints(),spaceDim,spaceDim};
2307 Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
2308
2309 Kokkos::View<Scalar*,DeviceType> identityUnderlyingView("Intrepid2::FST::integrate() - identity view",spaceDim);
2310 Kokkos::deep_copy(identityUnderlyingView, 1.0);
2311 composedTransform = Data<Scalar,DeviceType>(identityUnderlyingView,extents,variationTypes);
2312 }
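 // The rank-1 view of ones above, combined with the CONSTANT and BLOCK_PLUS_DIAGONAL variation
 // types, is a compact logical encoding of a (C,P,D,D) identity matrix: only spaceDim entries
 // are stored, and the Data object expands the diagonal and the constant cell and point
 // dimensions on access.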
2313
2314// timer->stop();
2315// std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n";
2316// timer->reset();
2317
2318 const int leftFamilyCount = basisValuesLeft. basisValues().numFamilies();
2319 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
2320 const int leftComponentCount = isVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2321 const int rightComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2322
2323 int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2324 for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2325 {
2326 // "a" keeps track of the spatial dimension over which we are integrating in the left vector
2327 // components are allowed to span several dimensions; we keep track of the offset for the component in a_offset
2328 int a_offset = 0;
2329 bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2330 for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2331 {
2332 TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
2333 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2334 if (!leftComponent.isValid())
2335 {
2336 // represents zero
2337 a_offset += basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
2338 continue;
2339 }
2340
2341 int rightFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2342 for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2343 {
2344 // "b" keeps track of the spatial dimension over which we are integrating in the right vector
2345 // components are allowed to span several dimensions; we keep track of the offset for the component in b_offset
2346 bool haveLaunchedContributionToCurrentFamilyRight = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2347 int b_offset = 0;
2348 for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2349 {
2350 TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
2351 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2352 if (!rightComponent.isValid())
2353 {
2354 // represents zero
2355 b_offset += basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal);
2356 continue;
2357 }
2358
2359 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.numTensorComponents() != rightComponent.numTensorComponents(), std::invalid_argument, "left TensorData and right TensorData have different number of tensor components. This is not supported.");
2360
2361 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2362 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2363
2364 // TODO: expose the options for forceNonSpecialized and usePointCacheForRank3Tensor through an IntegrationAlgorithm enumeration:
2365 // AUTOMATIC: let Intrepid2 choose an algorithm based on the inputs (and, perhaps, the execution space)
2366 // STANDARD: no sum factorization or axis alignment -- just the simple contraction, a (p+1)^9 algorithm in 3D
2367 // SUM_FACTORIZATION: (p+1)^7 algorithm in 3D
2368 // SUM_FACTORIZATION_AXIS_ALIGNED: (p+1)^6 algorithm in 3D
2369 // SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION: mainly intended for testing (the specialized implementations perform better where they are provided)
2370 // SUM_FACTORIZATION_WITH_POINT_CACHE: a novel (p+1)^7 algorithm (in 3D) in Intrepid2; as yet unclear when and whether this is a superior approach
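          // Editor's illustration (not in the original source): rough per-cell operation counts
          // implied by the scalings above, for polynomial degree p = 3 in 3D, ignoring constant factors:
          //   STANDARD:                       (p+1)^9 = 4^9 = 262144
          //   SUM_FACTORIZATION:              (p+1)^7 = 4^7 =  16384
          //   SUM_FACTORIZATION_AXIS_ALIGNED: (p+1)^6 = 4^6 =   4096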
2371 bool forceNonSpecialized = false; // We may expose this through integrate()'s arguments in the future; false should remain the default.
2372 bool usePointCacheForRank3Tensor = true; // EXPERIMENTAL: performs better under CUDA, but slightly worse than the standard approach on serial CPU.
2373
2374 // In one branch or another below, we will launch a parallel kernel that contributes to the (leftFamily, rightFamily) block of field ordinal pairs.
2375 // If we have already launched a kernel that contributes to that part of the integral container, we need a Kokkos fence() to ensure that the two kernels do not interfere with each other.
2376 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2377 {
2378 ExecutionSpace().fence();
2379 }
2380 haveLaunchedContributionToCurrentFamilyLeft = true;
2381 haveLaunchedContributionToCurrentFamilyRight = true;
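          // Editor's note (illustrative): if, say, the left family has two valid vector components,
          // the kernel launched for component 1 accumulates into the same (leftFieldOrdinalOffset,
          // rightFieldOrdinalOffset) block of the integrals container as the kernel for component 0,
          // so the fence above makes the second launch wait for the first to complete.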
2382
2383 if (integralViewRank == 2)
2384 {
2385 if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2386 {
2387 auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2388
2389 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2390 const int teamSize = functor.teamSize(recommendedTeamSize);
2391
2392 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2393
2394 Kokkos::parallel_for("F_IntegratePointValueCache rank 2", policy, functor);
2395
2396 if (approximateFlops != NULL)
2397 {
2398 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2399 }
2400 }
2401 else
2402 {
2403 auto functor = Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2404
2405 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2406 const int teamSize = functor.teamSize(recommendedTeamSize);
2407
2408 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2409
2410 Kokkos::parallel_for("F_Integrate rank 2", policy, functor);
2411
2412 if (approximateFlops != NULL)
2413 {
2414 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2415 }
2416 }
2417 }
2418 else if (integralViewRank == 3)
2419 {
2420 if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2421 {
2422 auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2423
2424 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2425 const int teamSize = functor.teamSize(recommendedTeamSize);
2426
2427 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2428
2429 Kokkos::parallel_for("F_IntegratePointValueCache rank 3", policy, functor);
2430
2431 if (approximateFlops != NULL)
2432 {
2433 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2434 }
2435 }
2436 else
2437 {
2438 auto functor = Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2439
2440 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2441 const int teamSize = functor.teamSize(recommendedTeamSize);
2442
2443 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2444
2445 Kokkos::parallel_for("F_Integrate rank 3", policy, functor);
2446
2447 if (approximateFlops != NULL)
2448 {
2449 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2450 }
2451 }
2452 }
2453 b_offset += isVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2454 }
2455 rightFieldOrdinalOffset += isVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2456 }
2457 a_offset += isVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2458 }
2459 leftFieldOrdinalOffset += isVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2460 }
2461 }
2462// if (approximateFlops != NULL)
2463// {
2464// std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl;
2465// }
2466 ExecutionSpace().fence(); // make sure we've finished writing to integrals container before we return
2467}
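// Editor's sketch (not in the original source): a typical call sequence for integrate(),
// assuming transformedGradValues (a TransformedBasisValues container) and cellMeasures (a
// TensorData of cell measures) have already been computed; the variable names are illustrative.
//
//   auto integrals = IntegrationTools<DeviceType>::allocateIntegralData(transformedGradValues, cellMeasures, transformedGradValues);
//   double approximateFlops = 0.0;
//   IntegrationTools<DeviceType>::integrate(integrals, transformedGradValues, cellMeasures, transformedGradValues,
//                                           /*sumInto=*/false, &approximateFlops);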
2468
2469} // end namespace Intrepid2
2470
2471#endif