Zoltan2
Loading...
Searching...
No Matches
Zoltan2_AlgMultiJagged.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// Zoltan2: A package of combinatorial algorithms for scientific computing
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact Karen Devine (kddevin@sandia.gov)
39// Erik Boman (egboman@sandia.gov)
40// Siva Rajamanickam (srajama@sandia.gov)
41//
42// ***********************************************************************
43//
44// @HEADER
48
49#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50#define _ZOLTAN2_ALGMultiJagged_HPP_
51
55#include <Zoltan2_Algorithm.hpp>
58#include <Zoltan2_Util.hpp>
59#include <Tpetra_Distributor.hpp>
60#include <Teuchos_StandardParameterEntryValidators.hpp>
61#include <Teuchos_ParameterList.hpp>
62#include <Kokkos_Sort.hpp>
63
64#include <algorithm> // std::sort
65#include <vector>
66#include <unordered_map>
67
68#ifdef ZOLTAN2_USEZOLTANCOMM
69#ifdef HAVE_ZOLTAN2_MPI
70#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71#include "zoltan_comm_cpp.h"
72#include "zoltan_types.h" // for error codes
73#endif
74#endif
75
76namespace Teuchos{
77
81template <typename Ordinal, typename T>
82class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
83{
84private:
85 Ordinal size;
86 T epsilon;
87
88public:
92 epsilon(std::numeric_limits<T>::epsilon()) {}
93
98 size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}
99
105 void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const {
106 for(Ordinal i = 0; i < count; i++) {
107 if(Z2_ABS(inBuffer[i]) > epsilon) {
108 inoutBuffer[i] = inBuffer[i];
109 }
110 }
111 }
112};
113
114} // namespace Teuchos
115
116namespace Zoltan2{
117
/*! \brief Item for sorting coordinates by multiple key values.
 *
 *  Compares lexicographically on val[0..count-1] (within an epsilon
 *  tolerance per component) and breaks full ties with the index.
 *  \tparam IT index type, \tparam CT count type, \tparam WT value type.
 */
template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
  // TODO: Why volatile?
  // no idea, another intel compiler failure.
  volatile IT index;    // tie-breaker id of the item
  volatile CT count;    // number of key values in val
  volatile WT *val;     // key values (not owned; caller manages lifetime)
  volatile WT epsilon;  // tolerance under which two key values compare equal

  /*! \brief Default constructor: empty item with no key values. */
  uMultiSortItem() {
    this->index = 0;
    this->count = 0;
    this->val = NULL;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  /*! \brief Constructor.
   *  \param index_ tie-breaker id.
   *  \param count_ number of key values.
   *  \param vals_ pointer to the key values (not copied).
   */
  uMultiSortItem(IT index_ ,CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  ~uMultiSortItem() {}

  /*! \brief Reassign the item's id, key count, and key pointer. */
  void set(IT index_ ,CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
  }

  /*! \brief Lexicographic less-than over the key values; index breaks ties.
   *  Both items must have the same count.
   */
  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const {
    assert(this->count == other.count);
    for(CT i = 0; i < this->count; ++i) {
      // if the values are equal go to next one.
      if(std::abs(this->val[i] - other.val[i]) < this->epsilon) {
        continue;
      }
      // if next value is smaller return true;
      if(this->val[i] < other.val[i]) {
        return true;
      }
      // if next value is bigger return false;
      else {
        return false;
      }
    }
    // if they are totally equal.
    return this->index < other.index;
  }
};
179
/*! \brief Sort item for the quick sort function below. */
template <class IT, class WT>
struct uSortItem
{
  IT id;    // id carried along with the sort key
  WT val;   // sort key
};

/*! \brief Quick sort function. Sorts arr ascending with respect to val.
 *
 *  Non-recursive median-of-three quicksort: subarrays shorter than M are
 *  finished with insertion sort and pending subarray bounds are kept on an
 *  explicit stack (istack) instead of using recursion.
 *  \param n the size of the array arr.
 *  \param arr the array to be sorted.
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr) {
  const int NSTACK = 50;
  int M = 7;                  // insertion-sort cutoff
  IT i, ir=n, j, k, l=1;
  IT jstack=0, istack[NSTACK];
  WT aval;
  uSortItem<IT,WT> a;         // holds current pivot / insertion element

  --arr;                      // make the array 1-indexed for the code below
  for(;;) {
    if(ir-l < M) {
      // small subarray: insertion sort
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        aval = a.val;
        for(i=j-1;i>=1;i--) {
          if(arr[i].val <= aval)
            break;
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0)
        break;
      // pop the next pending subarray
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      // median-of-three pivot selection among arr[l], arr[k], arr[ir]
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[l+1].val > arr[ir].val) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[l].val > arr[ir].val) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l+1].val > arr[l].val) {
        std::swap(arr[l+1],arr[l]);
      }
      // partition around pivot value aval
      i=l+1;
      j=ir;
      a=arr[l];
      aval = a.val;
      for(;;) {
        do i++; while (arr[i].val < aval);
        do j--; while (arr[j].val > aval);
        if(j < i) break;
        std::swap(arr[i],arr[j]);
      }
      arr[l]=arr[j];
      arr[j]=a;
      // push the larger side on the stack; continue with the smaller side
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      if(ir-i+1 >= j-l) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
262
/*! \brief Sort item carrying a sign: val stores a magnitude and signbit
 *  stores its sign (1 = positive, 0 = negative). The ordering places
 *  negatives (by descending magnitude) before positives (by ascending
 *  magnitude), i.e. ascending signed value.
 */
template <class IT, class WT, class SIGN>
struct uSignedSortItem
{
  IT id;
  WT val;
  SIGN signbit; // 1 means positive, 0 means negative.

  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    /*if I am negative, the other is positive*/
    if(this->signbit < rhs.signbit) {
      return true;
    }
    /*if both has the same sign*/
    else if(this->signbit == rhs.signbit) {
      if(this->val < rhs.val) {//if my value is smaller,
        return this->signbit;//then if we both are positive return true.
                             //if we both are negative, return false.
      }
      else if(this->val > rhs.val) {//if my value is larger,
        return !this->signbit; //then if we both are positive return false.
                               //if we both are negative, return true.
      }
      else { //if both are equal.
        return false;
      }
    }
    else {
      /*if I am positive, the other is negative*/
      return false;
    }
  }

  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    return (this->val == rhs.val && this->signbit == rhs.signbit) || (*this < rhs);
  }
};

/*! \brief Quick sort function for uSignedSortItems: sorts arr ascending
 *  with respect to the signed ordering defined by operator< above.
 *
 *  Same non-recursive median-of-three quicksort structure as uqsort,
 *  but comparing whole items instead of bare values.
 *  \param n the size of the array arr.
 *  \param arr the array to be sorted.
 */
template <class IT, class WT, class SIGN>
void uqSignsort(IT n, uSignedSortItem<IT, WT, SIGN> * arr) {
  const IT NSTACK = 50;
  IT M = 7;                        // insertion-sort cutoff
  IT i, ir=n, j, k, l=1;
  IT jstack=0, istack[NSTACK];
  uSignedSortItem<IT,WT,SIGN> a;   // holds current pivot / insertion element

  --arr;                           // make the array 1-indexed below
  for(;;) {
    if(ir < M + l) {
      // small subarray: insertion sort
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        for(i=j-1;i>=1;i--) {
          if(arr[i] <= a) {
            break;
          }
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0) {
        break;
      }
      // pop the next pending subarray
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      // median-of-three pivot selection among arr[l], arr[k], arr[ir]
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[ir] < arr[l+1]) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[ir] < arr[l] ) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l] < arr[l+1]) {
        std::swap(arr[l+1],arr[l]);
      }
      // partition around pivot item a
      i=l+1;
      j=ir;
      a=arr[l];
      for(;;) {
        do i++; while (arr[i] < a);
        do j--; while (a < arr[j]);
        if(j < i) break;
        std::swap(arr[i],arr[j]);
      }
      arr[l]=arr[j];
      arr[j]=a;
      // push the larger side on the stack; continue with the smaller side
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      if(ir+l+1 >= j+i) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
370
// This exists only so we can track how many times the MJ algorithm is
// called and put each of those into different timer names.
// Currently the MultiJaggedTest.cpp will actually call it twice.
// First time with data from a Tpetra MultiVector and then a second time using
// a BasicVectorAdapter which allows us to turn UVM off for some tests. The
// results of the two runs are compared which helps to catch a lot of bugs. For
// profiling I'm mostly just interested in the UVM off case and need it to be
// in separate timers. Passing a value through would mess up the API. Possibly
// we could check the Adapter and use that. The statics have to be outside the
// templated class as the two called instances will be different template
// parameters. Another complication is that MultiJagged.cpp will call through
// the Zoltan2_AlgMJ class and we want to time things in both classes. However
// TaskMapper will directly call AlgMJ so I made two counters for the two
// classes to make sure it was always correct. This does not impact any
// behavior and has the sole purpose of generating unique timer names. If you
// run an MJ test you'll see MJ(0) and MJ(1) in the names to distinguish the
// 1st and 2nd run. Right now only MultijaggedTest.cpp cares about this.
struct Zoltan2_AlgMJ_TrackCallsCounter {
  /*! \brief Returns 0, 1, 2, ... on successive calls; used to make unique
   *  timer names for direct AlgMJ invocations. */
  static int get_counter_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
  /*! \brief Same as above, tracked separately for calls made through the
   *  Zoltan2_AlgMJ wrapper class. */
  static int get_counter_Zoltan2_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
};
398
401template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
402 typename mj_part_t, typename mj_node_t>
403class AlgMJ
404{
405private:
406 typedef typename mj_node_t::device_type device_t; // for views
407 typedef coordinateModelPartBox mj_partBox_t;
408 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
409
410 //if the (last dimension reduce all count) x the mpi world size
411 //estimated to be bigger than this number then migration will be forced
412 //in earlier iterations.
413 static constexpr size_t future_reduceall_cutoff = 1500000;
414
415 //if parts right before last dimension are estimated to have less than
416 //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
417 static constexpr mj_lno_t min_work_last_dim = 1000;
418
419 static constexpr mj_scalar_t least_signifiance = 0.0001;
420 static constexpr int significance_mul = 1000;
421
422 std::string mj_timer_base_string; // for convenience making timer names
423
424 RCP<const Environment> mj_env; // the environment object
425 RCP<const Comm<int> > mj_problemComm; // initial comm object
426 RCP<Comm<int> > comm; // comm object than can be altered during execution
427 double imbalance_tolerance; // input imbalance tolerance.
428 int recursion_depth; // number of steps that partitioning will be solved in.
429 int coord_dim; // coordinate dim
430 int num_weights_per_coord; // # of weights per coord
431 size_t initial_num_loc_coords; // initial num local coords.
432 global_size_t initial_num_glob_coords; // initial num global coords.
433 mj_lno_t num_local_coords; // number of local coords.
434 mj_gno_t num_global_coords; // number of global coords.
435 mj_scalar_t sEpsilon; // epsilon for mj_scalar_t
436
 437 // can distribute points on same coordinate to different parts.
438 bool distribute_points_on_cut_lines;
439
440 // how many parts we can calculate concurrently.
441 mj_part_t max_concurrent_part_calculation;
442
443 bool mj_run_as_rcb; // means recursion depth is adjusted to maximum value.
444 int mj_user_recursion_depth; // the recursion depth value provided by user.
445 bool mj_keep_part_boxes; // if the boxes need to be kept.
446
447 // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
448 int check_migrate_avoid_migration_option;
449
450 // when doing the migration, 0 will aim for perfect load-imbalance, 1 - will
451 // aim for minimized number of messages with possibly bad load-imbalance
452 int migration_type;
453
454 // when MJ decides whether to migrate, the minimum imbalance for migration.
455 double minimum_migration_imbalance;
456
457 // Nonuniform first level partitioning
458 // (Currently available only for sequential_task_partitioning):
459 // Used for Dragonfly task mapping by partitioning Dragonfly RCA
460 // machine coordinates and application coordinates.
461 // An optimization that completely partitions the most important machine dimension
462 // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
463 // MJ alg follows after the nonuniform first level partitioning.
464 //
465 // Ex. (first level partitioning): If we have 120 elements,
466 // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
467 // part sizes after first level will be [24, 60, 36]. Standard uniform MJ
468 // continues for all subsequent levels.
469
470 // If used, number of parts requested for a nonuniform
471 // first level partitioning
472 mj_part_t num_first_level_parts;
473
474 // If used, the requested distribution of parts for the
475 // nonuniform first level partitioning
476 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
477
478 mj_part_t total_num_cut ; // how many cuts will be totally
479 mj_part_t total_num_part; // how many parts will be totally
480
481 mj_part_t max_num_part_along_dim ; // maximum part count along a dimension.
482 mj_part_t max_num_cut_along_dim; // maximum cut count along a dimension.
483
484 // maximum part+cut count along a dimension.
485 size_t max_num_total_part_along_dim;
486
487 mj_part_t total_dim_num_reduce_all; // estimate on #reduceAlls can be done.
488
489 // max no of parts that might occur during the partition before the last
490 // partitioning dimension.
491 mj_part_t last_dim_num_part;
492
493 // input part array specifying num part to divide along each dim.
494 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
495
496 // two dimension coordinate array
497 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
498 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
499 mj_coordinates;
500
501 // two dimension weight array
502 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
503
504 // if the target parts are uniform
505 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
506
507 // if the coordinates have uniform weights
508 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
509
510 int mj_num_teams; // the number of teams
511
512 size_t num_global_parts; // the targeted number of parts
513
514 // vector of all boxes for all parts, constructed if mj_keep_part_boxes true
515 RCP<mj_partBoxVector_t> kept_boxes;
516
517 RCP<mj_partBox_t> global_box;
518
519 int myRank; // processor rank
520 int myActualRank; // initial rank
521
522 bool divide_to_prime_first;
523
524 // initial global ids of the coordinates.
525 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
526
527 // current global ids of the coordinates, might change during migration.
528 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
529
530 // the actual processor owner of the coordinate, to track after migrations.
531 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
532
533 // permutation of coordinates, for partitioning.
534 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
535
536 // permutation work array.
537 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
538
539 // the part ids assigned to coordinates.
540 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
541
542 // beginning and end of each part.
543 Kokkos::View<mj_lno_t *, device_t> part_xadj;
544
545 // work array for beginning and end of each part.
546 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
547
548 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
549
550 // how much weight should a MPI put left side of the each cutline
551 Kokkos::View<mj_scalar_t *, device_t>
552 process_cut_line_weight_to_put_left;
553
554 // weight percentage each thread in MPI puts left side of the each outline
555 Kokkos::View<mj_scalar_t *, device_t>
556 thread_cut_line_weight_to_put_left;
557
558 // work array to manipulate coordinate of cutlines in different iterations.
559 // necessary because previous cut line information is used for determining
560 // the next cutline information. therefore, cannot update the cut work array
561 // until all cutlines are determined.
562 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
563
564 // Used for swapping above cut_coordinates_work_array
565 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
566
567 // cumulative part weight array.
568 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
569
570 // upper bound coordinate of a cut line
571 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
572
573 // lower bound coordinate of a cut line
574 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
575
576 // lower bound weight of a cut line
577 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
578
579 // upper bound weight of a cut line
580 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
581
582 // combined array to exchange the min and max coordinate, and total
583 // weight of part.
584 Kokkos::View<mj_scalar_t *, device_t>
585 process_local_min_max_coord_total_weight;
586
587 // global combined array with the results for min, max and total weight.
588 Kokkos::View<mj_scalar_t *, device_t>
589 global_min_max_coord_total_weight;
590
591 // isDone is used to determine if a cutline is determined already. If a cut
592 // line is already determined, the next iterations will skip this cut line.
593 Kokkos::View<bool *, device_t> is_cut_line_determined;
594
595 // incomplete_cut_count count holds the number of cutlines that have not
596 // been finalized for each part when concurrentPartCount>1, using this
597 // information, if incomplete_cut_count[x]==0, then no work is done
598 // for this part.
599 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600 typename decltype(device_incomplete_cut_count)::HostMirror
601 incomplete_cut_count;
602
603 // Need a quick accessor for this on host
604 typename decltype (part_xadj)::HostMirror host_part_xadj;
605
606 // local part weights of each thread.
607 Kokkos::View<double *, device_t>
608 thread_part_weights;
609
 610 // the work manipulation array for partweights.
611 Kokkos::View<double *, device_t>
612 thread_part_weight_work;
613
614 // thread_cut_left_closest_point to hold the closest coordinate
615 // to a cutline from left (for each thread).
616 Kokkos::View<mj_scalar_t *, device_t>
617 thread_cut_left_closest_point;
618
619 // thread_cut_right_closest_point to hold the closest coordinate
620 // to a cutline from right (for each thread)
621 Kokkos::View<mj_scalar_t *, device_t>
622 thread_cut_right_closest_point;
623
624 // to store how many points in each part a thread has.
625 Kokkos::View<mj_lno_t *, device_t>
626 thread_point_counts;
627
628 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
630
 631 // for faster communication, concatenation of
632 // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
633 // leftClosest distances sized P-1, since P-1 cut lines
634 // rightClosest distances size P-1, since P-1 cut lines.
635 Kokkos::View<mj_scalar_t *, device_t>
636 total_part_weight_left_right_closests;
637 Kokkos::View<mj_scalar_t *, device_t>
638 global_total_part_weight_left_right_closests;
639
640 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641 typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642 host_num_partitioning_in_current_dim; // for quick access on host
643
644 /* \brief helper functio to calculate imbalance.
645 * \param achieved balance we achieved.
646 * \param expected balance expected.
647 */
648 static
649 KOKKOS_INLINE_FUNCTION
650 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651 return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
652 }
653
654 /* \brief Either the mj array (part_no_array) or num_global_parts should be
655 * provided in the input. part_no_array takes precedence if both are
656 * provided. Depending on these parameters, total cut/part number, maximum
657 * part/cut number along a dimension, estimated number of reduceAlls,
658 * and the number of parts before the last dimension is calculated.
659 * */
660 void set_part_specifications();
661
662 /* \brief Tries to determine the part number for current dimension,
663 * by trying to make the partitioning as square as possible.
664 * \param num_total_future how many more partitionings are required.
665 * \param root how many more recursion depth is left.
666 */
667 inline mj_part_t get_part_count(
668 mj_part_t num_total_future,
669 double root);
670
671 /* \brief for part communication we keep track of the box boundaries.
672 * This is performed when either asked specifically, or when geometric
673 * mapping is performed afterwards. This function initializes a single box
674 * with all global min and max coordinates.
675 * \param initial_partitioning_boxes the input and output vector for boxes.
676 */
677 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
678
679 /* \brief Function returns how many parts that will be obtained after this
680 * dimension partitioning. It sets how many parts each current part will be
681 * partitioned into in this dimension to device_num_partitioning_in_current_dim
682 * vector, sets how many total future parts each obtained part will be
683 * partitioned into in next_future_num_parts_in_parts vector, If part boxes
684 * are kept, then sets initializes the output_part_boxes as its ancestor.
685 * \param future_num_part_in_parts: input, how many future parts each
686 * current part will be partitioned into.
687 * \param next_future_num_parts_in_parts: output, how many future parts
688 * each obtained part will be partitioned into.
689 * \param future_num_parts: output, max number of future parts that will be
690 * obtained from a single
691 * \param current_num_parts: input, how many parts are there currently.
692 * \param current_iteration: input, current dimension iteration number.
693 * \param input_part_boxes: input, if boxes are kept, current boxes.
694 * \param output_part_boxes: output, if boxes are kept, the initial box
695 * boundaries for obtained parts.
696 * \param atomic_part_count // DOCWORK: Documentation
697 */
698 mj_part_t update_part_num_arrays(
699 std::vector<mj_part_t> *future_num_part_in_parts,
700 std::vector<mj_part_t> *next_future_num_parts_in_parts,
701 mj_part_t &future_num_parts,
702 mj_part_t current_num_parts,
703 int current_iteration,
704 RCP<mj_partBoxVector_t> input_part_boxes,
705 RCP<mj_partBoxVector_t> output_part_boxes,
706 mj_part_t atomic_part_count);
707
719 static
720 KOKKOS_INLINE_FUNCTION
721 void mj_calculate_new_cut_position (
722 mj_scalar_t cut_upper_bound,
723 mj_scalar_t cut_lower_bound,
724 mj_scalar_t cut_upper_weight,
725 mj_scalar_t cut_lower_weight,
726 mj_scalar_t expected_weight,
727 mj_scalar_t &new_cut_position,
728 mj_scalar_t sEpsilon);
729
754 bool mj_perform_migration(
755 mj_part_t in_num_parts, //current number of parts
756 mj_part_t &out_num_parts, //output number of parts.
757 std::vector<mj_part_t> *next_future_num_parts_in_parts,
758 mj_part_t &output_part_begin_index,
759 size_t migration_reduce_all_population,
760 mj_lno_t num_coords_for_last_dim_part,
761 std::string iteration,
762 RCP<mj_partBoxVector_t> &input_part_boxes,
763 RCP<mj_partBoxVector_t> &output_part_boxes);
764
782 bool mj_check_to_migrate(
783 size_t migration_reduce_all_population,
784 mj_lno_t num_coords_for_last_dim_part,
785 mj_part_t num_procs,
786 mj_part_t num_parts,
787 mj_gno_t *num_points_in_all_processor_parts);
788
813 void mj_migration_part_proc_assignment(
814 mj_gno_t * num_points_in_all_processor_parts,
815 mj_part_t num_parts,
816 mj_part_t num_procs,
817 mj_lno_t *send_count_to_each_proc,
818 std::vector<mj_part_t> &processor_ranks_for_subcomm,
819 std::vector<mj_part_t> *next_future_num_parts_in_parts,
820 mj_part_t &out_num_part,
821 std::vector<mj_part_t> &out_part_indices,
822 mj_part_t &output_part_numbering_begin_index,
823 int *coordinate_destinations);
824
850 void mj_assign_proc_to_parts(
851 mj_gno_t * num_points_in_all_processor_parts,
852 mj_part_t num_parts,
853 mj_part_t num_procs,
854 mj_lno_t *send_count_to_each_proc,
855 std::vector<mj_part_t> &processor_ranks_for_subcomm,
856 std::vector<mj_part_t> *next_future_num_parts_in_parts,
857 mj_part_t &out_part_index,
858 mj_part_t &output_part_numbering_begin_index,
859 int *coordinate_destinations);
860
876 void assign_send_destinations(
877 mj_part_t num_parts,
878 mj_part_t *part_assignment_proc_begin_indices,
879 mj_part_t *processor_chains_in_parts,
880 mj_lno_t *send_count_to_each_proc,
881 int *coordinate_destinations);
882
897 void assign_send_destinations2(
898 mj_part_t num_parts,
899 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
900 int *coordinate_destinations,
901 mj_part_t &output_part_numbering_begin_index,
902 std::vector<mj_part_t> *next_future_num_parts_in_parts);
903
926 void mj_assign_parts_to_procs(
927 mj_gno_t * num_points_in_all_processor_parts,
928 mj_part_t num_parts,
929 mj_part_t num_procs,
930 mj_lno_t *send_count_to_each_proc,
931 std::vector<mj_part_t> *next_future_num_parts_in_parts,
932 mj_part_t &out_num_part,
933 std::vector<mj_part_t> &out_part_indices,
934 mj_part_t &output_part_numbering_begin_index,
935 int *coordinate_destinations);
936
950 void mj_migrate_coords(
951 mj_part_t num_procs,
952 mj_lno_t &num_new_local_points,
953 std::string iteration,
954 int *coordinate_destinations,
955 mj_part_t num_parts);
956
962 void create_sub_communicator(
963 std::vector<mj_part_t> &processor_ranks_for_subcomm);
964
969 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970 mj_part_t largest_factor = 1;
971 mj_part_t n = num_parts;
972 mj_part_t divisor = 2;
973 while (n > 1) {
974 while (n % divisor == 0) {
975 n = n / divisor;
976 largest_factor = divisor;
977 }
978 ++divisor;
979 if(divisor * divisor > n) {
980 if(n > 1) {
981 largest_factor = n;
982 }
983 break;
984 }
985 }
986 return largest_factor;
987 }
988
989public:
990 AlgMJ();
991
992 // DOCWORK: Make param documentation use : consistently
1018 void multi_jagged_part(
1019 const RCP<const Environment> &env,
1020 RCP<const Comm<int> > &problemComm,
1021 double imbalance_tolerance,
1022 int num_teams,
1023 size_t num_global_parts,
1024 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025 int recursion_depth,
1026 int coord_dim,
1027 mj_lno_t num_local_coords,
1028 mj_gno_t num_global_coords,
1029 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1030 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1031 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032 int num_weights_per_coord,
1033 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1038
1052 bool distribute_points_on_cut_lines_,
1053 int max_concurrent_part_calculation_,
1054 int check_migrate_avoid_migration_option_,
1055 double minimum_migration_imbalance_,
1056 int migration_type_ = 0);
1057
1061
1064 RCP<mj_partBox_t> get_global_box() const;
1065
1068 RCP<mj_partBoxVector_t> get_kept_boxes() const;
1069
1072 RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1073 RCP<mj_partBoxVector_t> &localPartBoxes) const;
1074
1114 const RCP<const Environment> &env,
1115 mj_lno_t num_total_coords,
1116 mj_lno_t num_selected_coords,
1117 size_t num_target_part,
1118 int coord_dim,
1119 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1120 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121 Kokkos::View<mj_lno_t *, device_t> &
1122 initial_selected_coords_output_permutation,
1123 mj_lno_t *output_xadj,
1124 int recursion_depth_,
1125 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126 bool partition_along_longest_dim,
1127 int num_ranks_per_node,
1128 bool divide_to_prime_first_,
1129 mj_part_t num_first_level_parts_ = 1,
1130 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1132
1133#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1134 public:
1135#else
1136 private:
1137#endif
1138
1139 /* \brief Allocates all required memory for the mj partitioning algorithm.
1140 */
1141 void allocate_set_work_memory();
1142
1143 /* \brief compute global bounding box: min/max coords of global domain */
1144 void compute_global_box();
1145
1146 // DOCWORK: Inconsisent use of ! for descriptive/brief commenting - decide.
1153 void mj_get_local_min_max_coord_totW(
1154 mj_part_t current_work_part,
1155 mj_part_t current_concurrent_num_parts,
1156 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1157
1170 void mj_get_global_min_max_coord_totW(
1171 mj_part_t current_concurrent_num_parts,
1172 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1174
1205 void mj_get_initial_cut_coords_target_weights(
1206 mj_scalar_t min_coord,
1207 mj_scalar_t max_coord,
1208 mj_part_t num_cuts/*p-1*/ ,
1209 mj_scalar_t global_weight,
1210 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212 std::vector <mj_part_t> *future_num_part_in_parts,
1213 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214 mj_part_t concurrent_current_part,
1215 mj_part_t obtained_part_index,
1216 mj_part_t num_target_first_level_parts = 1,
1217 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1219
  /*! \brief Assigns initial (estimated) part ids to the coordinates in
   * [coordinate_begin_index, coordinate_end_index), distributing them over
   * [min_coordinate, max_coordinate] for partition_count parts.
   * NOTE(review): original doxygen block lost in extraction; confirm details
   * against the definition elsewhere in this file.
   */
  void set_initial_coordinate_parts(
    mj_scalar_t &max_coordinate,
    mj_scalar_t &min_coordinate,
    mj_lno_t coordinate_begin_index,
    mj_lno_t coordinate_end_index,
    Kokkos::View<mj_lno_t *, device_t> &
      mj_current_coordinate_permutations,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
    mj_part_t &partition_count);

  /*! \brief Core 1D partitioning: iteratively moves the cut lines for the
   * concurrent parts until all cuts are placed within imbalanceTolerance.
   * total_incomplete_cut_count is the number of cuts still to be determined
   * on entry; view_rectilinear_cut_count / view_total_reduction_size are
   * preallocated device scratch counters.
   */
  void mj_1D_part(
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    double imbalanceTolerance,
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
    mj_part_t total_incomplete_cut_count,
    Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
    Kokkos::View<size_t*, device_t> & view_total_reduction_size);

  /*! \brief Computes the per-part weights for one iteration (loop_count)
   * of the 1D partitioning of the concurrent parts.
   */
  void mj_1D_part_get_part_weights(
    mj_part_t current_concurrent_num_parts,
    mj_part_t current_work_part,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    int loop_count);

  /*! \brief Combines the left/right closest-point and weight arrays of the
   * concurrent parts (reduction step of the 1D partitioning).
   */
  void mj_combine_rightleft_and_weights(
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts);

  /*! \brief Given final cut coordinates, writes the new part boundaries
   * (out_part_xadj) and reorders coordinates into their new parts.
   */
  void mj_create_new_partitions(
    mj_part_t num_parts,
    mj_part_t current_concurrent_work_part,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
    Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
    Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);

  /*! \brief One step of the cut-line search for part kk: moves each
   * undetermined cut using global part weights, target weights and the
   * current lower/upper bound brackets, writing the proposed positions to
   * new_current_cut_coordinates.
   * NOTE(review): original doxygen block lost in extraction.
   */
  void mj_get_new_cut_coordinates(
    mj_part_t current_concurrent_num_parts,
    mj_part_t kk,
    const mj_part_t &num_cuts,
    const double &used_imbalance_tolerance,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
    Kokkos::View<bool *, device_t> & current_cut_line_determined,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
    Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
    Kokkos::View<mj_scalar_t *, device_t> &
      current_part_cut_line_weight_to_put_left,
    Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);

  /*! \brief Fills num_points_in_all_processor_parts (allocated by callee via
   * the reference-to-pointer) with the number of points each processor has
   * in each part.
   */
  void get_processor_num_points_in_parts(
    mj_part_t num_procs,
    mj_part_t num_parts,
    mj_gno_t *&num_points_in_all_processor_parts);

  /*! \brief Rebuilds the coordinate permutation array after migration /
   * repartitioning into output_num_parts parts.
   */
  void fill_permutation_array(
    mj_part_t output_num_parts,
    mj_part_t num_parts);

  /*! \brief Serial variant of mj_create_new_partitions used by the task
   * mapper: writes deterministic ("consistent") chunks so results do not
   * depend on concurrency. When longest_dim_part is true, coordInd and
   * p_coord_dimension_range_sorted describe the dimension actually cut.
   */
  void create_consistent_chunks(
    mj_part_t num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
    mj_lno_t coordinate_begin,
    mj_lno_t coordinate_end,
    Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
    Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
    int coordInd,
    bool longest_dim_part,
    uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);

  /*! \brief Produces the final part numbering (shifted by
   * output_part_begin_index) and, if kept, the final part boxes; handles
   * the migrated-data case.
   */
  void set_final_parts(
    mj_part_t current_num_parts,
    mj_part_t output_part_begin_index,
    RCP<mj_partBoxVector_t> &output_part_boxes,
    bool is_data_ever_migrated);
};
1440
/*! \brief Default constructor: puts every scalar member into a defined
 * "unconfigured" state; real setup happens later (multi_jagged_part /
 * sequential_task_partitioning).
 * NOTE(review): the qualified constructor signature line
 * (AlgMJ<...>::AlgMJ() :) between the template header and the initializer
 * list was lost in extraction (original file line 1445).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
  mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
  recursion_depth(0), coord_dim(0),
  num_weights_per_coord(0), initial_num_loc_coords(0),
  initial_num_glob_coords(0),
  num_local_coords(0), num_global_coords(0),
  // scaled machine epsilon used as a robust floating-point tolerance
  sEpsilon(std::numeric_limits<mj_scalar_t>::epsilon() * 100),
  distribute_points_on_cut_lines(true),
  max_concurrent_part_calculation(1),
  mj_run_as_rcb(false), mj_user_recursion_depth(0),
  mj_keep_part_boxes(false),
  check_migrate_avoid_migration_option(0), migration_type(0),
  // default imbalance threshold below which migration is skipped
  minimum_migration_imbalance(0.30),
  num_first_level_parts(1),
  total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
  max_num_cut_along_dim(0),
  max_num_total_part_along_dim(0),
  total_dim_num_reduce_all(0),
  last_dim_num_part(0),
  mj_num_teams(0),
  num_global_parts(1),
  kept_boxes(), global_box(),
  myRank(0), myActualRank(0),
  divide_to_prime_first(false)
{
}
1471
/*! \brief Serial, in-node MJ partitioning driver (used e.g. by the task
 * mapper). Partitions num_total_coords coordinates into num_target_part
 * parts; the result is returned as a permutation written back into
 * initial_adjList_output_adjlist plus CSR-style offsets in output_xadj
 * (output_xadj must have room for num_global_parts + 1 entries).
 * Runs on a self-created serial communicator; weights are uniform.
 *
 * NOTE(review): the qualified function signature line(s) between the
 * template header and the first parameter were lost in extraction
 * (original file lines 1517-1518); presumably
 * void AlgMJ<...>::sequential_task_partitioning( -- confirm upstream.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
  const RCP<const Environment> &env,
  mj_lno_t num_total_coords,
  mj_lno_t num_selected_coords,
  size_t num_target_part,
  int coord_dim_,
  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
    mj_coordinates_,
  Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
  mj_lno_t *output_xadj,
  int recursion_depth_,
  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
  bool partition_along_longest_dim,
  int num_ranks_per_node,
  bool divide_to_prime_first_,
  mj_part_t num_first_level_parts_,
  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
{
  // --- Phase 1: configure this AlgMJ instance for serial operation. ---
  this->mj_env = env;
  const RCP<Comm<int> > commN;
  this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
  this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
  this->myActualRank = this->myRank = 1;

  this->divide_to_prime_first = divide_to_prime_first_;
  //weights are uniform for task mapping

  //parts are uniform for task mapping
  //as input indices.
  this->imbalance_tolerance = 0;
  this->num_global_parts = num_target_part;
  this->part_no_array = part_no_array_;
  this->recursion_depth = recursion_depth_;

  // If nonuniform first level partitioning, the requested num of parts and the
  // requested distribution of elements for each part
  this->num_first_level_parts = num_first_level_parts_;

  this->first_level_distribution = first_level_distribution_;

  this->coord_dim = coord_dim_;
  this->num_local_coords = num_total_coords;

  this->num_global_coords = num_total_coords;
  this->mj_coordinates = mj_coordinates_;


  this->initial_mj_gnos =
    Kokkos::View<mj_gno_t*, device_t>("gids", this->num_local_coords);

  this->num_weights_per_coord = 0;

  this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
    "uniform weights", 1);
  this->mj_uniform_weights(0) = true;

  this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
    ("weights", 1, 1);

  this->mj_uniform_parts =
    Kokkos::View<bool*, Kokkos::HostSpace>("uniform parts", 1);
  this->mj_uniform_parts(0) = true;

  this->set_part_specifications();

  this->allocate_set_work_memory();

  // Do single init: the one initial "part" contains the first
  // num_selected_coords entries of the permutation.
  auto local_part_xadj = this->part_xadj;
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
    KOKKOS_LAMBDA (int dummy) {
    local_part_xadj(0) = static_cast<mj_lno_t>(num_selected_coords);
  });

  Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);

  // --- Phase 2: recursive-bisection-style loop state. ---
  mj_part_t current_num_parts = 1;

  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
    this->all_cut_coordinates;

  mj_part_t future_num_parts = this->total_num_part;

  std::vector<mj_part_t> *future_num_part_in_parts =
    new std::vector<mj_part_t>();
  std::vector<mj_part_t> *next_future_num_parts_in_parts =
    new std::vector<mj_part_t>();
  next_future_num_parts_in_parts->push_back(this->num_global_parts);
  RCP<mj_partBoxVector_t> t1;
  RCP<mj_partBoxVector_t> t2;

  // Scratch used only when partition_along_longest_dim: per-dimension
  // coordinate ranges, sorted to pick the widest dimension.
  std::vector <uSignedSortItem<int, mj_scalar_t, char>>
    coord_dimension_range_sorted(this->coord_dim);
  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted =
    &(coord_dimension_range_sorted[0]);
  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);

  // Need a device counter - how best to allocate?
  // Putting this allocation in the loops is very costly so moved out here.
  Kokkos::View<mj_part_t*, device_t>
    view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
  Kokkos::View<size_t*, device_t>
    view_total_reduction_size("view_total_reduction_size", 1);

  // --- Phase 3: one pass per recursion level (one dimension cut per pass).
  for(int rd = 0; rd < this->recursion_depth; ++rd) {
    // next_future_num_parts_in_parts will be as the size of outnumParts,
    // and this will hold how many more parts that each output part
    // should be divided. this array will also be used to determine the weight
    // ratios of the parts.
    // swap the arrays to use iteratively..
    std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
    future_num_part_in_parts = next_future_num_parts_in_parts;
    next_future_num_parts_in_parts = tmpPartVect;

    // clear next_future_num_parts_in_parts array as
    // getPartitionArrays expects it to be empty.
    next_future_num_parts_in_parts->clear();

    // returns the total number of output parts for this dimension partitioning.
    mj_part_t output_part_count_in_dimension =
      this->update_part_num_arrays(
        future_num_part_in_parts,
        next_future_num_parts_in_parts,
        future_num_parts,
        current_num_parts,
        rd,
        t1,
        t2, num_ranks_per_node);

    // if the number of obtained parts equal to current number of parts,
    // skip this dimension. For example, this happens when 1 is given in
    // the input part array is given. P=4,5,1,2
    if(output_part_count_in_dimension == current_num_parts) {
      tmpPartVect = future_num_part_in_parts;
      future_num_part_in_parts = next_future_num_parts_in_parts;
      next_future_num_parts_in_parts = tmpPartVect;
      continue;
    }

    //convert i to string to be used for debugging purposes.
    std::string istring = std::to_string(rd);

    // alloc Memory to point the indices
    // of the parts in the permutation array.
    this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
      "new part xadj", output_part_count_in_dimension);

    // the index where in the outtotalCounts will be written.

    mj_part_t output_part_index = 0;

    // whatever is written to outTotalCounts will be added with previousEnd
    // so that the points will be shifted.
    mj_part_t output_coordinate_end_index = 0;

    mj_part_t current_work_part = 0;
    mj_part_t current_concurrent_num_parts = 1;

    mj_part_t obtained_part_index = 0;

    // get the coordinate axis along which the partitioning will be done.
    int coordInd = rd % this->coord_dim;

    Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
      Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);

    auto host_process_local_min_max_coord_total_weight =
      Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
    auto host_global_min_max_coord_total_weight =
      Kokkos::create_mirror_view(global_min_max_coord_total_weight);

    // run for all available parts.
    for(; current_work_part < current_num_parts;
      current_work_part += current_concurrent_num_parts) {

      mj_part_t actual_work_part_count = 0;

      // initialization for 1D partitioning.
      // get the min and max coordinates of each part
      // together with the part weights of each part.
      for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
        mj_part_t current_work_part_in_concurrent_parts =
          current_work_part + kk;

        // if this part wont be partitioned any further
        // dont do any work for this part.
        mj_part_t partition_count = host_num_partitioning_in_current_dim(
          current_work_part_in_concurrent_parts);
        if(partition_count == 1) {
          continue;
        }
        ++actual_work_part_count;
        if(partition_along_longest_dim) {
          // Scan every dimension, record each range, then cut along the
          // widest one instead of the round-robin coordInd.
          auto local_process_local_min_max_coord_total_weight =
            this->process_local_min_max_coord_total_weight;
          for(int coord_traverse_ind = 0;
            coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {

            Kokkos::View<mj_scalar_t *, device_t> coords =
              Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);

            this->mj_get_local_min_max_coord_totW(
              current_work_part,
              current_concurrent_num_parts,
              coords);

            coord_dimension_range_sorted[coord_traverse_ind].id =
              coord_traverse_ind;
            coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;

            Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
              process_local_min_max_coord_total_weight);

            coord_dim_mins[coord_traverse_ind] =
              host_process_local_min_max_coord_total_weight(kk);
            coord_dim_maxs[coord_traverse_ind] =
              host_process_local_min_max_coord_total_weight(
                kk + current_concurrent_num_parts);
            coord_dimension_range_sorted[coord_traverse_ind].val =
              host_process_local_min_max_coord_total_weight(
                kk + current_concurrent_num_parts) -
              host_process_local_min_max_coord_total_weight(kk);
          }

          // Sort ranges ascending; the last entry is the widest dimension.
          uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
          coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
          auto set_min = coord_dim_mins[coordInd];
          auto set_max = coord_dim_maxs[coordInd];
          // Restore the chosen dimension's min/max on the device view
          // (it was overwritten by the per-dimension scan above).
          Kokkos::parallel_for(
            Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
              (0, 1), KOKKOS_LAMBDA (int dummy) {
            local_process_local_min_max_coord_total_weight(kk) = set_min;
            local_process_local_min_max_coord_total_weight(
              kk + current_concurrent_num_parts) = set_max;
          });

          mj_current_dim_coords =
            Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
        }
        else {
          Kokkos::View<mj_scalar_t *, device_t> coords =
            Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
          this->mj_get_local_min_max_coord_totW(
            current_work_part,
            current_concurrent_num_parts,
            coords);
        }
      }

      // 1D partitioning
      if(actual_work_part_count > 0) {
        // obtain global Min max of the part.
        this->mj_get_global_min_max_coord_totW(
          current_concurrent_num_parts,
          this->process_local_min_max_coord_total_weight,
          this->global_min_max_coord_total_weight);

        // update host copy
        Kokkos::deep_copy(host_global_min_max_coord_total_weight,
          global_min_max_coord_total_weight);

        // represents the total number of cutlines
        // whose coordinate should be determined.
        mj_part_t total_incomplete_cut_count = 0;

        //Compute weight ratios for parts & cuts:
        //e.g., 0.25  0.25  0.5   0.5  0.75 0.75  1.0
        //      part0 cut0  part1 cut1 part2 cut2 part3
        mj_part_t concurrent_part_cut_shift = 0;
        mj_part_t concurrent_part_part_shift = 0;
        for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
          // host layout of global_min_max_coord_total_weight:
          // [0, n) mins, [n, 2n) maxs, [2n, 3n) total weights.
          mj_scalar_t min_coordinate =
            host_global_min_max_coord_total_weight(kk);
          mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
            kk + current_concurrent_num_parts);
          mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
            kk + 2*current_concurrent_num_parts);

          mj_part_t concurrent_current_part_index = current_work_part + kk;

          mj_part_t partition_count = host_num_partitioning_in_current_dim(
            concurrent_current_part_index);

          Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
            Kokkos::subview(current_cut_coordinates,
              std::pair<mj_lno_t, mj_lno_t>(
                concurrent_part_cut_shift,
                current_cut_coordinates.size()));
          Kokkos::View<mj_scalar_t *, device_t>
            current_target_part_weights =
            Kokkos::subview(target_part_weights,
              std::pair<mj_lno_t, mj_lno_t>(
                concurrent_part_part_shift,
                target_part_weights.size()));

          // shift the usedCutCoordinate array as noCuts.
          concurrent_part_cut_shift += partition_count - 1;
          // shift the partRatio array as noParts.
          concurrent_part_part_shift += partition_count;
          // calculate only if part is not empty,
          // and part will be further partitioend.
          if(partition_count > 1 && min_coordinate <= max_coordinate) {
            // increase allDone by the number of cuts of the current
            // part's cut line number.
            total_incomplete_cut_count += partition_count - 1;

            this->incomplete_cut_count(kk) = partition_count - 1;

            // When num_first_level_parts != 1 we have
            // nonuniform partitioning on the first level, providing
            // requested number of parts (num_first_level_parts) and
            // requested distribution in parts (first_level_distribution)

            // Get the target part weights given a desired distribution
            this->mj_get_initial_cut_coords_target_weights(
              min_coordinate,
              max_coordinate,
              partition_count - 1,
              global_total_weight,
              usedCutCoordinate,
              current_target_part_weights,
              future_num_part_in_parts,
              next_future_num_parts_in_parts,
              concurrent_current_part_index,
              obtained_part_index,
              rd == 0 ? this->num_first_level_parts : 1,
              this->first_level_distribution);

            mj_lno_t coordinate_end_index =
              host_part_xadj(concurrent_current_part_index);
            mj_lno_t coordinate_begin_index =
              (concurrent_current_part_index==0) ? 0 :
                host_part_xadj[concurrent_current_part_index - 1];

            // get the initial estimated part assignments of the coordinates.
            this->set_initial_coordinate_parts(
              max_coordinate,
              min_coordinate,
              coordinate_begin_index, coordinate_end_index,
              this->coordinate_permutations,
              mj_current_dim_coords,
              this->assigned_part_ids,
              partition_count);
          }
          else {
            // e.g., if have fewer coordinates than parts, don't need to do
            // next dim.
            this->incomplete_cut_count(kk) = 0;
          }
          obtained_part_index += partition_count;
        }

        // used imbalance, it is always 0, as it is difficult
        // to estimate a range.
        double used_imbalance = 0;

        // Determine cut lines for k parts here.
        this->mj_env->timerStart(MACRO_TIMERS,
          mj_timer_base_string + "mj_1D_part()");

        this->mj_1D_part(
          mj_current_dim_coords,
          used_imbalance,
          current_work_part,
          current_concurrent_num_parts,
          current_cut_coordinates,
          total_incomplete_cut_count,
          view_rectilinear_cut_count,
          view_total_reduction_size);

        this->mj_env->timerStop(MACRO_TIMERS,
          mj_timer_base_string + "mj_1D_part()");
      }
      else {
        obtained_part_index += current_concurrent_num_parts;
      }
      // create part chunks
      {
        mj_part_t output_array_shift = 0;
        mj_part_t cut_shift = 0;
        size_t tlr_shift = 0;
        size_t partweight_array_shift = 0;

        for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_part_t current_concurrent_work_part = current_work_part + kk;

          mj_part_t num_parts = host_num_partitioning_in_current_dim(
            current_concurrent_work_part);

          // if the part is empty, skip the part.
          // (min > max on the host mirror marks an empty part)
          int coordinateA_bigger_than_coordinateB =
            host_global_min_max_coord_total_weight(kk) >
            host_global_min_max_coord_total_weight(
              kk + current_concurrent_num_parts);

          if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
            // we still need to write the begin and end point of the empty part.
            // simply set it zero, the array indices will be shifted later
            auto local_new_part_xadj = this->new_part_xadj;
            Kokkos::parallel_for(
              Kokkos::RangePolicy<typename mj_node_t::execution_space,
                mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
              local_new_part_xadj(
                output_part_index + output_array_shift + jj) = 0;
            });

            cut_shift += num_parts - 1;
            tlr_shift += (4 *(num_parts - 1) + 1);
            output_array_shift += num_parts;
            partweight_array_shift += (2 * (num_parts - 1) + 1);
            continue;
          }
          mj_lno_t coordinate_end =
            host_part_xadj(current_concurrent_work_part);
          mj_lno_t coordinate_begin =
            current_concurrent_work_part==0 ? 0 :
              host_part_xadj(current_concurrent_work_part-1);

          Kokkos::View<mj_scalar_t *, device_t>
            current_concurrent_cut_coordinate =
            Kokkos::subview(current_cut_coordinates,
              std::pair<mj_lno_t, mj_lno_t>(
                cut_shift,
                current_cut_coordinates.size()));
          Kokkos::View<mj_scalar_t *, device_t>
            used_local_cut_line_weight_to_left =
            Kokkos::subview(process_cut_line_weight_to_put_left,
              std::pair<mj_lno_t, mj_lno_t>(
                cut_shift,
                process_cut_line_weight_to_put_left.size()));

          this->thread_part_weight_work =
            Kokkos::subview(
              this->thread_part_weights,
              std::pair<mj_lno_t, mj_lno_t>(
                partweight_array_shift,
                this->thread_part_weights.size()));

          if(num_parts > 1) {
            // Rewrite the indices based on the computed cuts.
            Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
              Kokkos::subview(this->new_part_xadj,
                std::pair<mj_lno_t, mj_lno_t>(
                  output_part_index + output_array_shift,
                  this->new_part_xadj.size()));

            this->create_consistent_chunks(
              num_parts,
              mj_current_dim_coords,
              current_concurrent_cut_coordinate,
              coordinate_begin,
              coordinate_end,
              used_local_cut_line_weight_to_left,
              subview_new_part_xadj,
              coordInd,
              partition_along_longest_dim,
              p_coord_dimension_range_sorted);
          }
          else {
            // if this part is partitioned into 1 then just copy
            // the old values.
            mj_lno_t part_size = coordinate_end - coordinate_begin;

            auto local_new_part_xadj = this->new_part_xadj;
            Kokkos::parallel_for(
              Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
                (0, 1), KOKKOS_LAMBDA (int dummy) {
              local_new_part_xadj(output_part_index + output_array_shift)
                = part_size;
            });

            auto subview_new_coordinate_permutations =
              Kokkos::subview(this->new_coordinate_permutations,
                std::pair<mj_lno_t, mj_lno_t>(
                  coordinate_begin,
                  coordinate_begin + part_size));
            auto subview_coordinate_permutations =
              Kokkos::subview(this->coordinate_permutations,
                std::pair<mj_lno_t, mj_lno_t>(
                  coordinate_begin,
                  coordinate_begin + part_size));
            Kokkos::deep_copy(subview_new_coordinate_permutations,
              subview_coordinate_permutations);
          }

          cut_shift += num_parts - 1;
          tlr_shift += (4 *(num_parts - 1) + 1);
          output_array_shift += num_parts;
          partweight_array_shift += (2 * (num_parts - 1) + 1);
        }

        // shift cut coordinates so that all cut coordinates are stored.
        // current_cut_coordinates += cutShift;

        // getChunks from coordinates partitioned the parts and
        // wrote the indices as if there were a single part.
        // now we need to shift the beginning indices.
        for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_part_t num_parts =
            host_num_partitioning_in_current_dim(current_work_part + kk);
          auto local_new_part_xadj = this->new_part_xadj;
          auto local_mj_current_dim_coords = mj_current_dim_coords;
          auto local_new_coordinate_permutations =
            new_coordinate_permutations;
          Kokkos::parallel_for(
            Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
              0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
            //shift it by previousCount
            local_new_part_xadj(output_part_index+ii) +=
              output_coordinate_end_index;

            if(ii % 2 == 1) {
              mj_lno_t coordinate_end =
                local_new_part_xadj(output_part_index+ii);
              mj_lno_t coordinate_begin =
                local_new_part_xadj(output_part_index);

              for(mj_lno_t task_traverse = coordinate_begin;
                task_traverse < coordinate_end; ++task_traverse) {
                mj_lno_t l = local_new_coordinate_permutations(task_traverse);
                // Negate this dimension's coordinates for every other part
                // so the traversal order snakes back on the next dimension.
                // NOTE(review): replaced a placeholder "MARKER: FLIPPED
                // ZORDER" comment; the sign flip itself is original code --
                // confirm intent against upstream Zoltan2.
                local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
              }
            }
          });

          // increase the previous count by current end.
          mj_part_t get_single;
          Kokkos::parallel_reduce("Read new_part_xadj",
            Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
            KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
            set_single = local_new_part_xadj(output_part_index + num_parts - 1);
          }, get_single);;

          output_coordinate_end_index = get_single;
          // increase the current out.
          output_part_index += num_parts;
        }
      }
    }

    // end of this partitioning dimension
    // set the current num parts for next dim partitioning
    current_num_parts = output_part_count_in_dimension;

    //swap the coordinate permutations for the next dimension.
    Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
    this->coordinate_permutations = this->new_coordinate_permutations;
    this->new_coordinate_permutations = tmp;

    this->part_xadj = this->new_part_xadj;
    this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
    Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
    this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
  }

  // --- Phase 4: write results back to the caller's arrays. ---
  Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);

  // Return output_xadj in CSR format
  output_xadj[0] = 0;
  for(size_t i = 0; i < this->num_global_parts ; ++i) {
    output_xadj[i+1] = host_part_xadj(i);
  }

  delete future_num_part_in_parts;
  delete next_future_num_parts_in_parts;
}
2088
/*! \brief Accessor: returns the (RCP-held) bounding box of the global domain.
 * NOTE(review): the qualified-name line (AlgMJ<...>::) between the return
 * type and get_global_box() was lost in extraction (original file line 2096).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
RCP<typename AlgMJ
  <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
  get_global_box() const
{
  return this->global_box;
}
2101
/*! \brief Enables retention of part bounding boxes during partitioning.
 * NOTE(review): the remainder of the qualified signature
 * (mj_node_t>::set_to_keep_part_boxes()) was lost in extraction
 * (original file line 2107).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
{
  this->mj_keep_part_boxes = true;
}
2111
/*! \brief Either the mj array (part_no_array) or num_global_parts should be
 * provided in the input. part_no_array takes
 * precedence if both are provided.
 * Depending on these parameters, the total cut/part counts, the maximum
 * part/cut count along a dimension, the estimated number of reduceAlls,
 * and the number of parts before the last dimension are calculated.
 * */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
// NOTE(review): the qualified signature lines
// (void AlgMJ<...>::set_part_specifications()) were lost in extraction
// (original file lines 2121-2122).
{
  this->total_num_cut = 0; //how many cuts will be totally
  this->total_num_part = 1;    //how many parts will be totally
  this->max_num_part_along_dim = 0; // maximum part count along a dimension.
  this->total_dim_num_reduce_all = 0; // estimate on #reduceAlls can be done.
  this->last_dim_num_part = 1; //max no of parts that might occur
                               //during the partition before the
                               //last partitioning dimension.
  this->max_num_cut_along_dim = 0;
  this->max_num_total_part_along_dim = 0;

  // Case 1: explicit per-dimension part counts were provided.
  if(this->part_no_array.size()) {
    auto local_recursion_depth = this->recursion_depth;

    // NOTE(review): total_num_part is still 1 at this point (reset above),
    // so this evaluates to recursion_depth; possibly intended to use the
    // product computed just below -- confirm against upstream Zoltan2.
    this->total_dim_num_reduce_all =
      this->total_num_part * this->recursion_depth;

    this->total_num_part = 1;
    for(int i = 0; i < local_recursion_depth; ++i) {
      this->total_num_part *= this->part_no_array(i);
    }

    // Largest single entry of part_no_array.
    mj_part_t track_max = 0;
    for(int i = 0; i < local_recursion_depth; ++i) {
      if(part_no_array(i) > track_max) {
        track_max = this->part_no_array(i);
      };
    }

    // Parts existing before the last partitioning dimension.
    this->last_dim_num_part = this->total_num_part /
      this->part_no_array(local_recursion_depth-1);

    this->max_num_part_along_dim = track_max;
    this->num_global_parts = this->total_num_part;
  } else {
    // Case 2: derive per-dimension part counts from num_global_parts.
    mj_part_t future_num_parts = this->num_global_parts;

    // If using nonuniform first level partitioning.
    // initial value max_num_part_along_dim == num_first_level_parts
    if (this->first_level_distribution.size() != 0 &&
        this->num_first_level_parts > 1) {
      this->max_num_part_along_dim = this->num_first_level_parts;
    }

    // we need to calculate the part numbers now, to determine
    // the maximum along the dimensions.
    for(int rd = 0; rd < this->recursion_depth; ++rd) {
      mj_part_t maxNoPartAlongI = 0;
      mj_part_t nfutureNumParts = 0;

      // Nonuniform first level partitioning sets part specificiations for
      // rd == 0 only, given requested num of parts and distribution in parts
      // for the first level.
      if (rd == 0 &&
          this->first_level_distribution.size() != 0 &&
          this->num_first_level_parts > 1) {

        maxNoPartAlongI = this->num_first_level_parts;
        this->max_num_part_along_dim = this->num_first_level_parts;

        mj_part_t sum_first_level_dist = 0;
        mj_part_t max_part = 0;

        // Cumulative sum of distribution of parts and size of largest part
        for (int i = 0; i < this->num_first_level_parts; ++i) {
          sum_first_level_dist += this->first_level_distribution(i);
          if (this->first_level_distribution(i) > max_part)
            max_part = this->first_level_distribution(i);
        }

        // Total parts in largest nonuniform superpart from
        // first level partitioning
        nfutureNumParts =
          this->num_global_parts * max_part / sum_first_level_dist;
      }
      // Standard uniform partitioning this level
      else {
        // Aim for as-square-as-possible partitioning: take the
        // (remaining-depth)-th root of the remaining part count.
        maxNoPartAlongI = this->get_part_count(future_num_parts,
          1.0f / (this->recursion_depth - rd));
        if (maxNoPartAlongI > this->max_num_part_along_dim)
          this->max_num_part_along_dim = maxNoPartAlongI;
        nfutureNumParts = future_num_parts / maxNoPartAlongI;
        if (future_num_parts % maxNoPartAlongI) {
          ++nfutureNumParts; // round up on non-exact division
        }
      }
      future_num_parts = nfutureNumParts;
    }
    this->total_num_part = this->num_global_parts;

    if(this->divide_to_prime_first) {
      this->total_dim_num_reduce_all = this->num_global_parts * 2;
      this->last_dim_num_part = this->num_global_parts;
    }
    else {
      //this is the lower bound.
      //estimate reduceAll Count here.
      //we find the upperbound instead.
      size_t p = 1;
      for(int i = 0; i < this->recursion_depth; ++i) {
        this->total_dim_num_reduce_all += p;
        p *= this->max_num_part_along_dim;
      }

      // Cap last_dim_num_part at num_global_parts.
      if(p / this->max_num_part_along_dim > this->num_global_parts) {
        this->last_dim_num_part = this->num_global_parts;
      }
      else {
        this->last_dim_num_part = p / this->max_num_part_along_dim;
      }
    }
  }

  this->total_num_cut = this->total_num_part - 1;
  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
  this->max_num_total_part_along_dim = this->max_num_part_along_dim +
    size_t(this->max_num_cut_along_dim);
  // maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1

  // refine the concurrent part count, if it is given bigger than the maximum
  // possible part count.
  if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
    if(this->mj_problemComm->getRank() == 0) {
      std::cerr << "Warning: Concurrent part count (" <<
        this->max_concurrent_part_calculation <<
        ") has been set bigger than maximum amount that can be used." <<
        " Setting to:" << this->last_dim_num_part << "." << std::endl;
    }
    this->max_concurrent_part_calculation = this->last_dim_num_part;
  }
}
2254
/*! \brief Tries to determine the part number for the current dimension,
 * by trying to make the partitioning as square as possible.
 * \param num_total_future how many more parts are still to be created.
 * \param root the exponent applied to num_total_future, i.e.
 * 1.0 / (number of remaining recursion levels).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
// NOTE(review): the return-type/qualified-name line
// (mj_part_t AlgMJ<...>::) was lost in extraction (original file line 2262).
  get_part_count(mj_part_t num_total_future, double root)
{
  // Ceiling of num_total_future^root, but treat results within a scaled
  // float-epsilon of an integer as exact so e.g. 8^(1/3) yields 2, not 3.
  double fp = pow(num_total_future, root);
  mj_part_t ip = mj_part_t(fp); // truncation toward zero
  if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
    return ip;
  }
  else {
    return ip + 1;
  }
}
2274
/*! \brief Returns how many parts will be obtained after this
 * dimension partitioning. It records how many parts each current part will
 * be partitioned into in this dimension in the
 * device_num_partitioning_in_current_dim view, and records how many total
 * future parts each obtained part will be partitioned into in the
 * next_future_num_parts_in_parts vector. If part boxes are
 * kept, it initializes output_part_boxes from the corresponding
 * ancestor boxes in input_part_boxes.
 * \param future_num_part_in_parts: input, how many future parts each current
 * part will be partitioned into.
 * \param next_future_num_parts_in_parts: output, how many future parts each
 * obtained part will be partitioned into.
 * \param future_num_parts: output, max number of future parts that will be
 * obtained from a single part.
 * \param current_num_parts: input, how many parts are there currently.
 * \param current_iteration: input, current dimension iteration number.
 * \param input_part_boxes: input, if boxes are kept, current boxes.
 * \param output_part_boxes: output, if boxes are kept, the initial box
 * boundaries for obtained parts.
 * \param atomic_part_count DOCWORK: Documentation
 */
2294template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2295 typename mj_part_t, typename mj_node_t>
2298 std::vector<mj_part_t> *future_num_part_in_parts,
2299 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300 mj_part_t &future_num_parts,
2301 mj_part_t current_num_parts,
2302 int current_iteration,
2303 RCP<mj_partBoxVector_t> input_part_boxes,
2304 RCP<mj_partBoxVector_t> output_part_boxes,
2305 mj_part_t atomic_part_count)
2306{
2307 std::vector<mj_part_t> num_partitioning_in_current_dim;
2308
2309 // how many parts that will be obtained after this dimension.
2310 mj_part_t output_num_parts = 0;
2311 if(this->part_no_array.size()) {
2312 // when the partNo array is provided as input,
2313 // each current partition will be partition to the same number of parts.
2314 // we dont need to use the future_num_part_in_parts vector in this case.
2315 mj_part_t current_part_no_array =
2316 this->part_no_array(current_iteration);
2317
2318 if(current_part_no_array < 1) {
2319 std::cout << "Current recursive iteration: " << current_iteration <<
2320 " part_no_array[" << current_iteration << "] is given as:" <<
2321 current_part_no_array << std::endl;
2322 std::terminate();
2323 }
2324 if(current_part_no_array == 1) {
2325 return current_num_parts;
2326 }
2327
2328 // If using part_no_array, ensure compatibility with num_first_level_parts.
2329 if (this->first_level_distribution.size() != 0 &&
2330 current_iteration == 0 &&
2331 current_part_no_array != this->num_first_level_parts) {
2332 std::cout << "Current recursive iteration: " << current_iteration
2333 << " part_no_array[" << current_iteration << "] is given as: " <<
2334 current_part_no_array << " and contradicts num_first_level_parts: " <<
2335 this->num_first_level_parts << std::endl;
2336 std::terminate();
2337 }
2338
2339 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340 num_partitioning_in_current_dim.push_back(current_part_no_array);
2341 }
2342
2343/*
2344 std::cout << "\n\nme: " << this->myRank << " current_iteration: " <<
2345 current_iteration << " current_num_parts: " <<
2346 current_num_parts << "\n\n";
2347
2348 std::cout << "\n\nnum_partitioning_in_current_dim[0]: " <<
2349 num_partitioning_in_current_dim[0] << "\n\n";
2350
2351 std::cout << "\n\nfuture_num_parts: " << future_num_parts
2352 << " num_partitioning_in_current_dim[0]: " <<
2353 num_partitioning_in_current_dim[0] << " " <<
2354 future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
2355*/
2356
2357 future_num_parts /= num_partitioning_in_current_dim[0];
2358 output_num_parts = current_num_parts *
2359 num_partitioning_in_current_dim[0];
2360 if(this->mj_keep_part_boxes) {
2361 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2362 //initialized the output boxes as its ancestor.
2363 for(mj_part_t j = 0; j <
2364 num_partitioning_in_current_dim[0]; ++j) {
2365 output_part_boxes->push_back((*input_part_boxes)[k]);
2366 }
2367 }
2368 }
2369
2370 // set the how many more parts each part will be divided.
2371 // this is obvious when partNo array is provided as input.
2372 // however, fill this so weights will be calculated according to this array.
2373 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374 next_future_num_parts_in_parts->push_back(future_num_parts);
2375 }
2376 }
2377 else {
2378 // if partNo array is not provided as input, future_num_part_in_parts
2379 // holds how many parts each part should be divided. Initially it holds a
2380 // single number equal to the total number of global parts.
2381
2382 // calculate the future_num_parts from beginning,
2383 // since each part might be divided into different number of parts.
2384 future_num_parts = 1;
2385
2386 // cout << "i:" << i << std::endl;
2387 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2388 // get how many parts a part should be divided.
2389 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2390
2391 // get the ideal number of parts that is close to the
2392 // (recursion_depth - i) root of the future_num_parts_of_part_ii.
2393 mj_part_t num_partitions_in_current_dim =
2394 this->get_part_count(future_num_parts_of_part_ii,
2395 1.0 / (this->recursion_depth - current_iteration)
2396 );
2397 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398 std::cerr << "ERROR: maxPartNo calculation is wrong."
2399 " num_partitions_in_current_dim: "
2400 << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
2401 << this->max_num_part_along_dim <<
2402 " this->recursion_depth: " << this->recursion_depth <<
2403 " current_iteration:" << current_iteration <<
2404 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405 " might need to fix max part no calculation for "
2406 "largest_prime_first partitioning." <<
2407 std::endl;
2408 std::terminate();
2409 }
2410 // add this number to vector_num_partitioning_in_current_dim vector.
2411 // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2412 // mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2413
2414 // Update part num arrays when on current_iteration == 0 and
2415 // using nonuniform first level partitioning
2416 // with requested num parts (num_first_level_parts) and
2417 // a requested distribution in parts (first_level_distribution).
2418 if (current_iteration == 0 &&
2419 this->first_level_distribution.size() != 0 &&
2420 this->num_first_level_parts > 1) {
2421 // Only 1 current part to begin and partitions into
2422 // num_first_level_parts many parts
2423 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2424
2425 // The output number of parts from first level partitioning
2426 output_num_parts = this->num_first_level_parts;
2427
2428 // Remaining parts left to partition for all future levels
2429 future_num_parts /= this->num_first_level_parts;
2430
2431 mj_part_t max_part = 0;
2432 mj_part_t sum_first_level_dist = 0;
2433
2434 // Cumulative sum of distribution of first level parts
2435 // and size of largest first level part
2436 for (int i = 0; i < this->num_first_level_parts; ++i) {
2437 sum_first_level_dist += this->first_level_distribution(i);
2438
2439 if (this->first_level_distribution(i) > max_part)
2440 max_part = this->first_level_distribution(i);
2441 }
2442
2443 // Maximum # of remaining parts left to partition for all future levels
2444 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2445
2446 // Number of parts remaining left to partition for each future_part
2447 // The sum must exactly equal global_num_parts
2448 for (int i = 0; i < this->num_first_level_parts; ++i) {
2449 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450 this->num_global_parts / sum_first_level_dist);
2451 }
2452 }
2453 else if (this->divide_to_prime_first) {
2454 // Add this number to num_partitioning_in_current_dim vector.
2455 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2456
2457 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2458
2459 //increase the output number of parts.
2460 output_num_parts += num_partitions_in_current_dim;
2461
2462 if (future_num_parts_of_part_ii == atomic_part_count ||
2463 future_num_parts_of_part_ii % atomic_part_count != 0) {
2464 atomic_part_count = 1;
2465 }
2466
2467 largest_prime_factor =
2468 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2469
2470 // We divide to num_partitions_in_current_dim. But we adjust the weights
2471 // based on largest prime/ if num_partitions_in_current_dim = 2,
2472 // largest prime = 5 --> we divide to 2 parts with weights 3x and 2x.
2473 // if the largest prime is less than part count, we use the part count
2474 // so that we divide uniformly.
2475 if (largest_prime_factor < num_partitions_in_current_dim) {
2476 largest_prime_factor = num_partitions_in_current_dim;
2477 }
2478 //ideal number of future partitions for each part.
2479 mj_part_t ideal_num_future_parts_in_part =
2480 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2481 //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2482 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2483
2484/*
2485 std::cout << "\ncurrent num part: " << ii
2486 << " largest_prime_factor: " << largest_prime_factor
2487 << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
2488*/
2489
2490 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2491 //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2492 mj_part_t my_ideal_primescale = ideal_prime_scale;
2493 //left over weighs. Left side is adjusted to be 3x, right side stays as 2x
2494 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495 ++my_ideal_primescale;
2496 }
2497 //scale with 'x';
2498 mj_part_t num_future_parts_for_part_iii =
2499 ideal_num_future_parts_in_part * my_ideal_primescale;
2500
2501 //if there is a remainder in the part increase the part weight.
2502 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2503 //if not uniform, add 1 for the extra parts.
2504 ++num_future_parts_for_part_iii;
2505 }
2506
2507 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2508
2509 //if part boxes are stored, initialize the box of the parts as the ancestor.
2510 if (this->mj_keep_part_boxes) {
2511 output_part_boxes->push_back((*input_part_boxes)[ii]);
2512 }
2513
2514 //set num future_num_parts to maximum in this part.
2515 if (num_future_parts_for_part_iii > future_num_parts)
2516 future_num_parts = num_future_parts_for_part_iii;
2517
2518 }
2519 }
2520 else {
2521 // Add this number to num_partitioning_in_current_dim vector.
2522 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2523
2524 //increase the output number of parts.
2525 output_num_parts += num_partitions_in_current_dim;
2526
2527 if((future_num_parts_of_part_ii == atomic_part_count) ||
2528 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529 atomic_part_count = 1;
2530 }
2531 //ideal number of future partitions for each part.
2532 mj_part_t ideal_num_future_parts_in_part =
2533 (future_num_parts_of_part_ii / atomic_part_count) /
2534 num_partitions_in_current_dim;
2535 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536 mj_part_t num_future_parts_for_part_iii =
2537 ideal_num_future_parts_in_part;
2538
2539 //if there is a remainder in the part increase the part weight.
2540 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541 num_partitions_in_current_dim) {
2542 // if not uniform, add 1 for the extra parts.
2543 ++num_future_parts_for_part_iii;
2544 }
2545
2546 next_future_num_parts_in_parts->push_back(
2547 num_future_parts_for_part_iii * atomic_part_count);
2548
2549 // if part boxes are stored, initialize the box of the parts as
2550 // the ancestor.
2551 if(this->mj_keep_part_boxes) {
2552 output_part_boxes->push_back((*input_part_boxes)[ii]);
2553 }
2554 //set num future_num_parts to maximum in this part.
2555 if(num_future_parts_for_part_iii > future_num_parts)
2556 future_num_parts = num_future_parts_for_part_iii;
2557 }
2558 }
2559 }
2560 }
2561 // move temp std::vector to host view
2562 device_num_partitioning_in_current_dim = Kokkos::View<
2563 mj_part_t*, device_t>("test", num_partitioning_in_current_dim.size());
2564 host_num_partitioning_in_current_dim =
2565 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566 for(size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567 host_num_partitioning_in_current_dim(n) =
2568 num_partitioning_in_current_dim[n];
2569 }
2570 // setup device equivalent - this data is used on host and device and it's
2571 // more efficient to just setup array on both sides now rather than copy
2572 // values as needed later.
2573 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574 host_num_partitioning_in_current_dim);
2575 return output_num_parts;
2576}
2577
2578/* \brief Allocates and initializes the work memory that will be used by MJ.
2579 * */
2580template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2581 typename mj_part_t, typename mj_node_t>
2584{
2585 // Throughout the partitioning execution,
2586 // instead of the moving the coordinates, hold a permutation array for parts.
2587 // coordinate_permutations holds the current permutation.
2588 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589 Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
2590 this->num_local_coords);
2591 auto local_coordinate_permutations = coordinate_permutations;
2592 Kokkos::parallel_for(
2593 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595 local_coordinate_permutations(i) = i;
2596 });
2597
2598 // new_coordinate_permutations holds the current permutation.
2599 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600 Kokkos::ViewAllocateWithoutInitializing("num_local_coords"),
2601 this->num_local_coords);
2602
2603 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604 Kokkos::ViewAllocateWithoutInitializing("assigned parts"), 0);
2605 if(this->num_local_coords > 0) {
2606 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607 Kokkos::ViewAllocateWithoutInitializing("assigned part ids"),
2608 this->num_local_coords);
2609 }
2610
2611 // single partition starts at index-0, and ends at numLocalCoords
2612 // inTotalCounts array holds the end points in coordinate_permutations array
2613 // for each partition. Initially sized 1, and single element is set to
2614 // numLocalCoords.
2615 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616 Kokkos::ViewAllocateWithoutInitializing("part xadj"), 1);
2617 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618 host_part_xadj(0) = num_local_coords;
2619 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2620
2621 // the ends points of the output, this is allocated later.
2622 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623 Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2624
2625 // only store this much if cuts are needed to be stored.
2626 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627 Kokkos::ViewAllocateWithoutInitializing("all cut coordinates"),
2628 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2629
2630 // how much weight percentage should a MPI put left side of the each cutline
2631 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632 device_t>(Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2633
2634 // how much weight percentage should each thread in MPI put left side of
2635 // each outline
2636 this->thread_cut_line_weight_to_put_left =
2637 Kokkos::View<mj_scalar_t*, device_t>(
2638 Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2639
2640 if(this->distribute_points_on_cut_lines) {
2641 this->process_cut_line_weight_to_put_left =
2642 Kokkos::View<mj_scalar_t *, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
2644 "process_cut_line_weight_to_put_left"),
2645 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646 this->thread_cut_line_weight_to_put_left =
2647 Kokkos::View<mj_scalar_t *, device_t>(
2648 Kokkos::ViewAllocateWithoutInitializing(
2649 "thread_cut_line_weight_to_put_left"),
2650 this->max_num_cut_along_dim);
2651 this->process_rectilinear_cut_weight =
2652 Kokkos::View<mj_scalar_t *, device_t>(
2653 Kokkos::ViewAllocateWithoutInitializing("process_rectilinear_cut_weight"),
2654 this->max_num_cut_along_dim);
2655 this->global_rectilinear_cut_weight =
2656 Kokkos::View<mj_scalar_t *, device_t>(
2657 Kokkos::ViewAllocateWithoutInitializing("global_rectilinear_cut_weight"),
2658 this->max_num_cut_along_dim);
2659 }
2660
2661 // work array to manipulate coordinate of cutlines in different iterations.
2662 // necessary because previous cut line information is used for determining
2663 // the next cutline information. therefore, cannot update the cut work array
2664 // until all cutlines are determined.
2665 this->cut_coordinates_work_array =
2666 Kokkos::View<mj_scalar_t *, device_t>(
2667 Kokkos::ViewAllocateWithoutInitializing("cut_coordinates_work_array"),
2668 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2669
2670 // cumulative part weight array.
2671 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672 Kokkos::ViewAllocateWithoutInitializing("target_part_weights"),
2673 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2674
2675 // upper bound coordinate of a cut line
2676 this->cut_upper_bound_coordinates =
2677 Kokkos::View<mj_scalar_t*, device_t>(
2678 Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_coordinates"),
2679 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2680
2681 // lower bound coordinate of a cut line
2682 this->cut_lower_bound_coordinates =
2683 Kokkos::View<mj_scalar_t*, device_t>(
2684 Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_coordinates"),
2685 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2686
2687 // lower bound weight of a cut line
2688 this->cut_lower_bound_weights =
2689 Kokkos::View<mj_scalar_t*, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_weights"),
2691 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2692
2693 //upper bound weight of a cut line
2694 this->cut_upper_bound_weights =
2695 Kokkos::View<mj_scalar_t*, device_t>(
2696 Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_weights"),
2697 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2698
2699 // combined array to exchange the min and max coordinate,
2700 // and total weight of part.
2701 this->process_local_min_max_coord_total_weight =
2702 Kokkos::View<mj_scalar_t*, device_t>(
2703 Kokkos::ViewAllocateWithoutInitializing(
2704 "process_local_min_max_coord_total_weight"),
2705 3 * this->max_concurrent_part_calculation);
2706
2707 // global combined array with the results for min, max and total weight.
2708 this->global_min_max_coord_total_weight =
2709 Kokkos::View<mj_scalar_t*, device_t>(
2710 Kokkos::ViewAllocateWithoutInitializing("global_min_max_coord_total_weight"),
2711 3 * this->max_concurrent_part_calculation);
2712
2713 // is_cut_line_determined is used to determine if a cutline is
2714 // determined already. If a cut line is already determined, the next
2715 // iterations will skip this cut line.
2716 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717 Kokkos::ViewAllocateWithoutInitializing("is_cut_line_determined"),
2718 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2719
2720 // incomplete_cut_count count holds the number of cutlines that have not
2721 // been finalized for each part when concurrentPartCount>1, using this
2722 // information, if incomplete_cut_count[x]==0, then no work is done for
2723 // this part.
2724 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725 Kokkos::ViewAllocateWithoutInitializing("device_incomplete_cut_count"),
2726 this->max_concurrent_part_calculation);
2727 this->incomplete_cut_count =
2728 Kokkos::create_mirror_view(device_incomplete_cut_count);
2729
2730 // local part weights of each thread.
2731 this->thread_part_weights = Kokkos::View<double *, device_t>(
2732 Kokkos::ViewAllocateWithoutInitializing("thread_part_weights"),
2733 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2734
2735 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736 Kokkos::ViewAllocateWithoutInitializing("thread_cut_left_closest_point"),
2737 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2738
2739 // thread_cut_right_closest_point to hold the closest coordinate to a
2740 // cutline from right (for each thread)
2741 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742 Kokkos::ViewAllocateWithoutInitializing("thread_cut_right_closest_point"),
2743 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2744
2745 // to store how many points in each part a thread has.
2746 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747 Kokkos::ViewAllocateWithoutInitializing("thread_point_counts"),
2748 this->max_num_part_along_dim);
2749
2750 // for faster communication, concatanation of
2751 // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2752 // leftClosest distances sized P-1, since P-1 cut lines
2753 // rightClosest distances size P-1, since P-1 cut lines.
2754 this->total_part_weight_left_right_closests =
2755 Kokkos::View<mj_scalar_t*, device_t>(
2756 Kokkos::ViewAllocateWithoutInitializing(
2757 "total_part_weight_left_right_closests"),
2758 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759 this->max_concurrent_part_calculation);
2760
2761 this->global_total_part_weight_left_right_closests =
2762 Kokkos::View<mj_scalar_t*, device_t>(
2763 Kokkos::ViewAllocateWithoutInitializing(
2764 "global_total_part_weight_left_right_closests"),
2765 (this->max_num_total_part_along_dim +
2766 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2767
2768 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769 Kokkos::ViewAllocateWithoutInitializing("gids"), num_local_coords);
2770
2771 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
2773 num_local_coords);
2774
2775 // changes owners back to host - so we don't run them on device
2776 // this improves migration code but means we have to serial init here.
2777 // Note we might allow this to be OpenMP when available even for CUDA.
2778 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2779
2780 auto local_current_mj_gnos = current_mj_gnos;
2781 auto local_initial_mj_gnos = initial_mj_gnos;
2782 Kokkos::parallel_for(
2783 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2786 });
2787}
2788
2789/* \brief compute the global bounding box
2790 */
2791template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2792 typename mj_part_t, typename mj_node_t>
2793void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794 mj_node_t>::compute_global_box()
2795{
2796 //local min coords
2797 mj_scalar_t *mins = new mj_scalar_t[this->coord_dim];
2798 //global min coords
2799 mj_scalar_t *gmins = new mj_scalar_t[this->coord_dim];
2800 //local max coords
2801 mj_scalar_t *maxs = new mj_scalar_t[this->coord_dim];
2802 //global max coords
2803 mj_scalar_t *gmaxs = new mj_scalar_t[this->coord_dim];
2804
2805 auto local_mj_coordinates = this->mj_coordinates;
2806
2807 // If we are only doing 2 parts then we don't need these values
2808 // for y and z. Init them all to 0 first
2809 for(int i = 0; i < this->coord_dim; ++i) {
2810 mins[i] = 0;
2811 maxs[i] = 0;
2812 }
2813
2814 for(int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815 Kokkos::parallel_reduce("MinReduce",
2816 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817 (0, this->num_local_coords),
2818 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819 if(local_mj_coordinates(j,i) < running_min) {
2820 running_min = local_mj_coordinates(j,i);
2821 }
2822 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823 Kokkos::parallel_reduce("MaxReduce",
2824 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825 (0, this->num_local_coords),
2826 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827 if(local_mj_coordinates(j,i) > running_max) {
2828 running_max = local_mj_coordinates(j,i);
2829 }
2830 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2831 }
2832
2833 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834 this->coord_dim, mins, gmins
2835 );
2836
2837 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838 this->coord_dim, maxs, gmaxs
2839 );
2840
2841 //create single box with all areas.
2842 global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2843 //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2844 delete [] mins;
2845 delete [] gmins;
2846 delete [] maxs;
2847 delete [] gmaxs;
2848}
2849
2850/* \brief for part communication we keep track of the box boundaries.
2851 * This is performed when either asked specifically, or when geometric mapping
2852 * is performed afterwards.
2853 * This function initializes a single box with all global min, max coordinates.
2854 * \param initial_partitioning_boxes the input and output vector for boxes.
2855 */
2856template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2857 typename mj_part_t, typename mj_node_t>
2858void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859 mj_node_t>::init_part_boxes(
2860 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2861{
2862 mj_partBox_t tmp_box(*global_box);
2863 initial_partitioning_boxes->push_back(tmp_box);
2864}
2865
2870template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2871 typename mj_part_t,
2872 typename mj_node_t>
2875 mj_part_t current_work_part,
2876 mj_part_t current_concurrent_num_parts,
2877 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2878{
2879 auto local_coordinate_permutations = this->coordinate_permutations;
2880 auto local_process_local_min_max_coord_total_weight =
2881 this->process_local_min_max_coord_total_weight;
2882 auto local_mj_weights = this->mj_weights;
2883
2884 bool bUniformWeights = mj_uniform_weights(0);
2885
2886 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2887
2888 mj_part_t concurrent_current_part = current_work_part + kk;
2889 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890 host_part_xadj(concurrent_current_part - 1);
2891 mj_lno_t coordinate_end_index =
2892 host_part_xadj(concurrent_current_part);
2893
2894 mj_scalar_t my_min_coord = 0;
2895 mj_scalar_t my_max_coord = 0;
2896 mj_scalar_t my_total_weight;
2897 //if the part is empty.
2898 //set the min and max coordinates as reverse.
2899 if(coordinate_begin_index >= coordinate_end_index)
2900 {
2901 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903 my_total_weight = 0;
2904 }
2905 else {
2906 // get min
2907 Kokkos::parallel_reduce("get min",
2908 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909 (coordinate_begin_index, coordinate_end_index),
2910 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911 int i = local_coordinate_permutations(j);
2912 if(mj_current_dim_coords(i) < running_min)
2913 running_min = mj_current_dim_coords(i);
2914 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2915 // get max
2916 Kokkos::parallel_reduce("get max",
2917 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918 (coordinate_begin_index, coordinate_end_index),
2919 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920 int i = local_coordinate_permutations(j);
2921 if(mj_current_dim_coords(i) > running_max)
2922 running_max = mj_current_dim_coords(i);
2923 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924 if(bUniformWeights) {
2925 my_total_weight = coordinate_end_index - coordinate_begin_index;
2926 }
2927 else {
2928 my_total_weight = 0;
2929 Kokkos::parallel_reduce("get weight",
2930 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931 (coordinate_begin_index, coordinate_end_index),
2932 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933 int i = local_coordinate_permutations(j);
2934 lsum += local_mj_weights(i,0);
2935 }, my_total_weight);
2936 }
2937 }
2938
2939 // single write
2940 Kokkos::parallel_for(
2941 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942 (0, 1), KOKKOS_LAMBDA (int dummy) {
2943 local_process_local_min_max_coord_total_weight(kk) =
2944 my_min_coord;
2945 local_process_local_min_max_coord_total_weight(
2946 kk + current_concurrent_num_parts) = my_max_coord;
2947 local_process_local_min_max_coord_total_weight(
2948 kk + 2*current_concurrent_num_parts) = my_total_weight;
2949 });
2950 }
2951}
2952
2965template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2966 typename mj_part_t, typename mj_node_t>
2967void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968 mj_node_t>::mj_get_global_min_max_coord_totW(
2969 mj_part_t current_concurrent_num_parts,
2970 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2972 // reduce min for first current_concurrent_num_parts elements, reduce
2973 // max for next concurrentPartCount elements, reduce sum for the last
2974 // concurrentPartCount elements.
2975 if(this->comm->getSize() > 1) {
2976 // We're using explicit host here as Spectrum MPI would fail
2977 // with the prior HostMirror UVMSpace to UVMSpace setup.
2978 auto host_local_min_max_total =
2979 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980 auto host_global_min_max_total =
2981 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2983 Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp<int, mj_scalar_t>
2984 reductionOp(current_concurrent_num_parts,
2985 current_concurrent_num_parts, current_concurrent_num_parts);
2986 try {
2987 reduceAll<int, mj_scalar_t>(
2988 *(this->comm),
2989 reductionOp,
2990 3 * current_concurrent_num_parts,
2991 host_local_min_max_total.data(),
2992 host_global_min_max_total.data());
2993 }
2994 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2995 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2996 }
2997 else {
2998 mj_part_t s = 3 * current_concurrent_num_parts;
2999 Kokkos::parallel_for(
3000 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002 global_min_max_total(i) = local_min_max_total(i);
3003 });
3004 }
3005}
3006
// Computes the initial coordinates of the num_cuts cut lines and the
// cumulative target weight that should fall to the left of each cut, for one
// of the concurrently-processed parts. Cuts are seeded assuming the weight is
// uniformly distributed over [min_coord, max_coord], so each cut's coordinate
// fraction of the range equals its cumulative-weight fraction of
// global_weight. Targets come either from the uniform-parts path (scaled by
// how many future parts each slice must eventually hold) or from a
// user-supplied first-level part distribution.
// NOTE(review): the defining signature line (file lines 3041-3042) was lost
// in this extraction — presumably this is
// AlgMJ<...>::mj_get_initial_cut_coords_target_weights; confirm against the
// repository source.
3039template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3040 typename mj_part_t, typename mj_node_t>
3043 mj_scalar_t min_coord,
3044 mj_scalar_t max_coord,
3045 mj_part_t num_cuts/*p-1*/ ,
3046 mj_scalar_t global_weight,
3047 /*p - 1 sized, coordinate of each cut line*/
3048 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3049 /*cumulative weights, at left side of each cut line. p-1 sized*/
3050 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051 std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
3052 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053 mj_part_t concurrent_current_part,
3054 mj_part_t obtained_part_index,
3055 mj_part_t num_target_first_level_parts,
3056 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3057{
3058 mj_scalar_t coord_range = max_coord - min_coord;
3059
3060 // We decided we could keep some std::vectors around for now. Eventually
3061 // it would be nice to have everything just as views with some being device
3062 // and some host. This particular case needs a bit of work to get setup
3063 // in a cleaner way so not going to mess with it at the moment.
3064
  // Uniform path: at most one first-level target distribution was given and
  // uniform part sizes were requested for this level.
3065 bool bUniformPartsCheck =
3066 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3067
3068 if(!bUniformPartsCheck) {
  // Non-uniform targets are only supported for the first partition level;
  // anything else is a hard error (terminate, not throw).
3069 bool bValidNonUniformTargetWeights =
3070 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071 if(!bValidNonUniformTargetWeights) {
3072 std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
3073 std::terminate();
3074 }
3075 }
3076
  // Cumulative target weights are accumulated on host, then copied to device
  // so the placement kernel below can read them.
3077 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078 "device_cumulative", num_cuts);
3079 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3080
3081 mj_scalar_t cumulative = 0;
3082
3083 if(bUniformPartsCheck) {
3084 // How many total future parts the part will be partitioned into.
3085 mj_scalar_t total_future_part_count_in_part =
3086 static_cast<mj_scalar_t>((*future_num_part_in_parts)[concurrent_current_part]);
3087
3088 // How much each part should weigh in ideal case.
3089 mj_scalar_t unit_part_weight =
3090 global_weight / total_future_part_count_in_part;
3091
  // Each slice's target scales with the number of future parts it will hold.
3092 for(mj_part_t i = 0; i < num_cuts; ++i) {
3093 cumulative += unit_part_weight * static_cast<mj_scalar_t>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094 host_cumulative(i) = cumulative;
3095 }
3096 }
3097 else {
3098 // Sum of entries in the first level partition distribution vector
3099 mj_scalar_t sum_target_first_level_dist = 0.0;
3100 for (int i = 0; i < num_target_first_level_parts; ++i) {
3101 sum_target_first_level_dist += target_first_level_dist(i);
3102 }
3103
  // Normalize the user distribution so the targets sum to global_weight.
3104 for(mj_part_t i = 0; i < num_cuts; ++i) {
3105 cumulative += global_weight * target_first_level_dist(i) /
3106 sum_target_first_level_dist;
3107 host_cumulative(i) = cumulative;
3108 }
3109 }
3110
3111 Kokkos::deep_copy(device_cumulative, host_cumulative);
3112
  // Place each cut so its coordinate offset within the range is proportional
  // to its cumulative target weight relative to global_weight.
3113 Kokkos::parallel_for("Write num in parts",
3114 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3116 // set target part weight.
3117 current_target_part_weights(cut) = device_cumulative(cut);
3118 initial_cut_coords(cut) = min_coord +
3119 (coord_range * device_cumulative(cut)) / global_weight;
3120 // set this multiple times but here for device handling
3121 current_target_part_weights(num_cuts) = global_weight;
3122 });
3123
3124 // round the target part weights.
3125 // Note need to discuss regarding DragonFly commits and determine if we
3126 // would not simply check mj_uniform_weights here.
3127 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
  // Round each target to the nearest integer (uniform unit weights make
  // integral targets exact).
3128 Kokkos::parallel_for(
3129 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3130 (0, num_cuts + 1),
3131 KOKKOS_LAMBDA (mj_part_t i) {
3132 current_target_part_weights(i) =
3133 long(current_target_part_weights(i) + 0.5);
3134 });
3135 }
3136}
3137
// Assigns an initial part guess to every coordinate in
// [coordinate_begin_index, coordinate_end_index) along the current dimension,
// assuming the points are uniformly distributed across the coordinate range.
// Degenerate ranges (all points effectively at one coordinate) get part 0.
// NOTE(review): the defining signature line (file lines 3156-3157) was lost
// in this extraction — presumably this is
// AlgMJ<...>::set_initial_coordinate_parts; confirm against the repository
// source.
3154template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3155 typename mj_part_t, typename mj_node_t>
3158 mj_scalar_t &max_coordinate,
3159 mj_scalar_t &min_coordinate,
3160 mj_lno_t coordinate_begin_index,
3161 mj_lno_t coordinate_end_index,
3162 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165 mj_part_t &partition_count)
3166{
3167 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3168
3169 // if there is single point, or if all points are along a line.
3170 // set initial part to 0 for all.
3171 if(std::abs(coordinate_range) < this->sEpsilon ) {
3172 Kokkos::parallel_for(
3173 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174 (coordinate_begin_index, coordinate_end_index),
3175 KOKKOS_LAMBDA (mj_lno_t ii) {
3176 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3177 });
3178 }
3179 else {
3180 // otherwise estimate an initial part for each coordinate.
3181 // assuming uniform distribution of points.
3182 mj_scalar_t slice = coordinate_range / partition_count;
3183 Kokkos::parallel_for(
3184 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185 (coordinate_begin_index, coordinate_end_index),
3186 KOKKOS_LAMBDA (mj_lno_t ii) {
3187 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3188 mj_part_t pp =
3189 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190 if(pp >= partition_count) {
3191 pp = partition_count - 1; // don't want last coord in an invalid part
3192 }
  // Part ids are stored doubled: even ids denote parts; odd ids are
  // presumably reserved for coordinates sitting on a cut line (the
  // part-weight functor later writes part*2 and part*2+1) — confirm.
3193 mj_part_ids[iii] = 2 * pp;
3194 });
3195 }
3196}
3197
// Performs the 1-D partitioning along the current dimension for all
// concurrently-processed parts: iteratively moves the cut lines until every
// cut's left-weight matches its target within used_imbalance_tolerance.
// Each iteration computes per-part weights and left/right closest points
// (mj_1D_part_get_part_weights), combines thread results, globally reduces
// across MPI ranks (custom Teuchos reduction op), and then refines the cut
// positions (mj_get_new_cut_coordinates) until total_incomplete_cut_count
// reaches zero. On exit the converged cuts are in current_cut_coordinates.
3212template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3213 typename mj_part_t, typename mj_node_t>
3214void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216 double used_imbalance_tolerance,
3217 mj_part_t current_work_part,
3218 mj_part_t current_concurrent_num_parts,
3219 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220 mj_part_t total_incomplete_cut_count,
3221 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3223{
3224 this->temp_cut_coords = current_cut_coordinates;
3225
3226 Teuchos::MultiJaggedCombinedReductionOp<mj_part_t, mj_scalar_t>
3227 *reductionOp = NULL;
3228
3229 bool bSingleProcess = (this->comm->getSize() == 1);
3230
  // The reduction op needs a host-side copy of the per-part partition counts;
  // it is only built (and later deleted) in the multi-rank case.
3231 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232 if(!bSingleProcess) {
3233 for(size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234 temp[n] = host_num_partitioning_in_current_dim(n);
3235 }
3236 reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
3237 <mj_part_t, mj_scalar_t>(
3238 &temp,
3239 current_work_part,
3240 current_concurrent_num_parts);
3241 }
3242
  // Local copies of member views so the device lambdas below capture the
  // views by value instead of dereferencing `this` on device.
3243 auto local_cut_lower_bound_coordinates =
3244 cut_lower_bound_coordinates;
3245 auto local_cut_upper_bound_coordinates =
3246 cut_upper_bound_coordinates;
3247 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250 auto local_process_cut_line_weight_to_put_left =
3251 process_cut_line_weight_to_put_left;
3252 auto local_temp_cut_coords = temp_cut_coords;
3253 auto local_global_total_part_weight_left_right_closests =
3254 global_total_part_weight_left_right_closests;
3255 auto local_cut_coordinates_work_array =
3256 cut_coordinates_work_array;
3257 auto local_part_xadj = part_xadj;
3258 auto local_global_min_max_coord_total_weight =
3259 global_min_max_coord_total_weight;
3260 auto local_target_part_weights =
3261 target_part_weights;
3262 auto local_global_rectilinear_cut_weight =
3263 global_rectilinear_cut_weight;
3264 auto local_process_rectilinear_cut_weight =
3265 process_rectilinear_cut_weight;
3266
3267 auto local_is_cut_line_determined = this->is_cut_line_determined;
3268 auto local_device_num_partitioning_in_current_dim =
3269 device_num_partitioning_in_current_dim;
3270
  // Single-iteration kernel: initialization happens on device so no host
  // mirrors/deep_copies of the bound arrays are needed.
3271 Kokkos::parallel_for(
3272 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273 KOKKOS_LAMBDA (int dummy) {
3274
3275 // these need to be initialized
3276 view_rectilinear_cut_count(0) = 0;
3277 view_total_reduction_size(0) = 0;
3278
3279 // initialize the lower and upper bounds of the cuts.
3280 mj_part_t next = 0;
3281 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282 mj_part_t num_part_in_dim =
3283 local_device_num_partitioning_in_current_dim(current_work_part + i);
3284 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3286
3287 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288 local_is_cut_line_determined(next) = false;
3289 // min coordinate
3290 local_cut_lower_bound_coordinates(next) =
3291 local_global_min_max_coord_total_weight(i);
3292 // max coordinate
3293 local_cut_upper_bound_coordinates(next) =
3294 local_global_min_max_coord_total_weight(
3295 i + current_concurrent_num_parts);
3296 // total weight
3297 local_cut_upper_bound_weights(next) =
3298 local_global_min_max_coord_total_weight(
3299 i + 2 * current_concurrent_num_parts);
3300 local_cut_lower_bound_weights(next) = 0;
3301 if(local_distribute_points_on_cut_lines) {
3302 local_process_cut_line_weight_to_put_left(next) = 0;
3303 }
3304 ++next;
3305 }
3306 }
3307 });
3308
3309 // loop_count allows the kernel to behave differently on the first loop
3310 // and subsequent loops. First loop we do a binary search and subsequent
3311 // loops we simply step towards our target.
3312 int loop_count = 0;
3313 while (total_incomplete_cut_count != 0) {
3314 this->mj_1D_part_get_part_weights(
3315 current_concurrent_num_parts,
3316 current_work_part,
3317 mj_current_dim_coords,
3318 loop_count);
3319 ++loop_count;
3320
3321 this->mj_combine_rightleft_and_weights(
3322 current_work_part,
3323 current_concurrent_num_parts);
3324
3325 // now sum up the results of mpi processors.
3326 if(!bSingleProcess) {
3327 // We're using explicit host here as Spectrum MPI would fail
3328 // with the prior HostMirror UVMSpace to UVMSpace setup.
3329 auto host_total_part_weight_left_right_closests =
3330 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331 total_part_weight_left_right_closests);
3332 auto host_global_total_part_weight_left_right_closests =
3333 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334 global_total_part_weight_left_right_closests);
3335
3336 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337 total_part_weight_left_right_closests);
3338
  // Pull the device-resident reduction size back to host as a scalar so
  // reduceAll can be given the element count.
3339 size_t host_view_total_reduction_size;
3340 Kokkos::parallel_reduce("Read single",
3341 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342 KOKKOS_LAMBDA(int dummy, size_t & set_single) {
3343 set_single = view_total_reduction_size(0);
3344 }, host_view_total_reduction_size);
3345
3346 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347 host_view_total_reduction_size,
3348 host_total_part_weight_left_right_closests.data(),
3349 host_global_total_part_weight_left_right_closests.data());
3350 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351 host_global_total_part_weight_left_right_closests);
3352 }
3353 else {
  // Single rank: local results ARE the global results; alias, don't copy.
3354 local_global_total_part_weight_left_right_closests =
3355 this->total_part_weight_left_right_closests;
3356 }
3357
3358 // how much cut will be shifted for the next part in the concurrent
3359 // part calculation.
3360 mj_part_t cut_shift = 0;
3361
3362 // how much the concantaneted array will be shifted for the next part
3363 // in concurrent part calculation.
3364 size_t tlr_shift = 0;
3365
  // Remember each part's incomplete-cut count before refinement so we can
  // tell how many cuts completed this iteration (see the loop after kk).
3366 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367 save_initial_incomplete_cut_count("save_initial_incomplete_cut_count",
3368 current_concurrent_num_parts);
3369
3370 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3371
3372 mj_part_t num_parts =
3373 host_num_partitioning_in_current_dim(current_work_part + kk);
3374
3375 mj_part_t num_cuts = num_parts - 1;
3376 size_t num_total_part = num_parts + size_t (num_cuts);
3377
3378 //if the cuts of this cut has already been completed.
3379 //nothing to do for this part.
3380 //just update the shift amount and proceed.
3381 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3382
3383 if(kk_incomplete_cut_count == 0) {
3384 cut_shift += num_cuts;
3385 tlr_shift += (num_total_part + 2 * num_cuts);
3386 continue;
3387 }
3388
  // Carve this part's windows out of the concatenated arrays. The tlr
  // layout per part is [part weights | left closest | right closest].
3389 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390 Kokkos::subview(this->total_part_weight_left_right_closests,
3391 std::pair<mj_lno_t, mj_lno_t>(
3392 tlr_shift,
3393 this->total_part_weight_left_right_closests.size()));
3394
3395 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3396 Kokkos::subview(
3397 local_global_total_part_weight_left_right_closests,
3398 std::pair<mj_lno_t, mj_lno_t>(
3399 tlr_shift,
3400 local_global_total_part_weight_left_right_closests.size()));
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_global_left_closest_points =
3403 Kokkos::subview(current_global_tlr,
3404 std::pair<mj_lno_t, mj_lno_t>(
3405 num_total_part,
3406 current_global_tlr.size()));
3407 Kokkos::View<mj_scalar_t *, device_t>
3408 current_global_right_closest_points =
3409 Kokkos::subview(current_global_tlr,
3410 std::pair<mj_lno_t, mj_lno_t>(
3411 num_total_part + num_cuts,
3412 current_global_tlr.size()));
3413 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3414 current_global_tlr;
3415
3416 Kokkos::View<bool *, device_t> current_cut_line_determined =
3417 Kokkos::subview(this->is_cut_line_determined,
3418 std::pair<mj_lno_t, mj_lno_t>(
3419 cut_shift,
3420 this->is_cut_line_determined.size()));
3421 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422 Kokkos::subview(local_target_part_weights,
3423 std::pair<mj_lno_t, mj_lno_t>(
3424 cut_shift + kk,
3425 local_target_part_weights.size()));
3426 Kokkos::View<mj_scalar_t *, device_t>
3427 current_part_cut_line_weight_to_put_left =
3428 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429 std::pair<mj_lno_t, mj_lno_t>(
3430 cut_shift,
3431 local_process_cut_line_weight_to_put_left.size()));
3432
3433 save_initial_incomplete_cut_count(kk) =
3434 kk_incomplete_cut_count;
3435
3436 Kokkos::View<mj_scalar_t *, device_t>
3437 current_cut_lower_bound_weights =
3438 Kokkos::subview(local_cut_lower_bound_weights,
3439 std::pair<mj_lno_t, mj_lno_t>(
3440 cut_shift,
3441 local_cut_lower_bound_weights.size()));
3442 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443 Kokkos::subview(local_cut_upper_bound_weights,
3444 std::pair<mj_lno_t, mj_lno_t>(
3445 cut_shift,
3446 local_cut_upper_bound_weights.size()));
3447 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448 Kokkos::subview(local_cut_upper_bound_coordinates,
3449 std::pair<mj_lno_t, mj_lno_t>(
3450 cut_shift,
3451 local_cut_upper_bound_coordinates.size()));
3452 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453 Kokkos::subview(local_cut_lower_bound_coordinates,
3454 std::pair<mj_lno_t, mj_lno_t>(
3455 cut_shift,
3456 local_cut_lower_bound_coordinates.size()));
3457
3458 // Now compute the new cut coordinates.
3459 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460 Kokkos::subview(this->temp_cut_coords,
3461 std::pair<mj_lno_t, mj_lno_t>(
3462 cut_shift, this->temp_cut_coords.size()));
3463 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464 Kokkos::subview(this->cut_coordinates_work_array,
3465 std::pair<mj_lno_t, mj_lno_t>(
3466 cut_shift, this->cut_coordinates_work_array.size()));
3467
3468 this->mj_get_new_cut_coordinates(
3469 current_concurrent_num_parts,
3470 kk,
3471 num_cuts,
3472 used_imbalance_tolerance,
3473 current_global_part_weights,
3474 current_local_part_weights,
3475 current_part_target_weights,
3476 current_cut_line_determined,
3477 sub_temp_cut_coords,
3478 current_cut_upper_bounds,
3479 current_cut_lower_bounds,
3480 current_global_left_closest_points,
3481 current_global_right_closest_points,
3482 current_cut_lower_bound_weights,
3483 current_cut_upper_weights,
3484 sub_cut_coordinates_work_array,
3485 current_part_cut_line_weight_to_put_left,
3486 view_rectilinear_cut_count);
3487
3488 cut_shift += num_cuts;
3489 tlr_shift += (num_total_part + 2 * num_cuts);
3490 } // end of kk loop
3491
  // Account for cuts that converged this iteration; the while loop exits
  // once every cut across all concurrent parts is determined.
3492 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493 mj_part_t iteration_complete_cut_count =
3494 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495 total_incomplete_cut_count -= iteration_complete_cut_count;
3496 }
3497
  // Swap the current and work cut arrays element-wise so the newly computed
  // coordinates become the input of the next iteration.
3498 Kokkos::parallel_for(
3499 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(int n) {
3501 auto t = local_temp_cut_coords(n);
3502 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503 local_cut_coordinates_work_array(n) = t;
3504 });
3505 } // end of the while loop
3506
3507 // Needed only if keep_cuts; otherwise can simply swap array pointers
3508 // cutCoordinates and cutCoordinatesWork.
3509 // (at first iteration, cutCoordinates == cutCoorindates_tmp).
3510 // computed cuts must be in cutCoordinates.
3511 if(current_cut_coordinates != local_temp_cut_coords) {
3512 Kokkos::parallel_for(
3513 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514 (0, 1), KOKKOS_LAMBDA(int dummy) {
3515 mj_part_t next = 0;
3516 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517 mj_part_t num_parts = -1;
3518 num_parts = local_device_num_partitioning_in_current_dim(
3519 current_work_part + i);
3520 mj_part_t num_cuts = num_parts - 1;
3521 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3523 }
3524 next += num_cuts;
3525 }
3526 for(int n = 0; n <
3527 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3529 }
3530 });
3531 }
3532
  // Safe when reductionOp is NULL (single-process case never allocates it).
3533 delete reductionOp;
3534}
3535
3536template<class scalar_t>
3539
3540 // With new kokkos setup parallel_reduce will call empty constructor and
3541 // we update the ptr in the init method.
3542 KOKKOS_INLINE_FUNCTION
3544
3545 KOKKOS_INLINE_FUNCTION
3546 Zoltan2_MJArrayType(scalar_t * pSetPtr) : ptr(pSetPtr) {};
3547
3549 ptr = zmj.ptr;
3550 return *this;
3551 }
3552};
3553
3554#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3555
3556template<class policy_t, class scalar_t, class part_t>
3558
3565
3566 KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(
3567 scalar_t mj_max_scalar,
3568 value_type &val,
3569 int mj_value_count_rightleft,
3570 int mj_value_count_weights) :
3571 max_scalar(mj_max_scalar),
3572 value(&val),
3573 value_count_rightleft(mj_value_count_rightleft),
3574 value_count_weights(mj_value_count_weights)
3575 {}
3576
3577 KOKKOS_INLINE_FUNCTION
3579 return *value;
3580 }
3581
3582 KOKKOS_INLINE_FUNCTION
3583 void join(value_type& dst, const value_type& src) const {
3584 for(int n = 0; n < value_count_weights; ++n) {
3585 dst.ptr[n] += src.ptr[n];
3586 }
3587
3588 for(int n = value_count_weights + 2;
3589 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3590 if(src.ptr[n] > dst.ptr[n]) {
3591 dst.ptr[n] = src.ptr[n];
3592 }
3593 if(src.ptr[n+1] < dst.ptr[n+1]) {
3594 dst.ptr[n+1] = src.ptr[n+1];
3595 }
3596 }
3597 }
3598
3599 KOKKOS_INLINE_FUNCTION
3600 void join (volatile value_type& dst, const volatile value_type& src) const {
3601 for(int n = 0; n < value_count_weights; ++n) {
3602 dst.ptr[n] += src.ptr[n];
3603 }
3604
3605 for(int n = value_count_weights + 2;
3606 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3607 if(src.ptr[n] > dst.ptr[n]) {
3608 dst.ptr[n] = src.ptr[n];
3609 }
3610 if(src.ptr[n+1] < dst.ptr[n+1]) {
3611 dst.ptr[n+1] = src.ptr[n+1];
3612 }
3613 }
3614 }
3615
3616 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
3617 dst.ptr = value->ptr; // must update ptr
3618
3619 for(int n = 0; n < value_count_weights; ++n) {
3620 dst.ptr[n] = 0;
3621 }
3622
3623 for(int n = value_count_weights;
3625 dst.ptr[n] = -max_scalar;
3626 dst.ptr[n+1] = max_scalar;
3627 }
3628 }
3629};
3630#endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP
3631
3632template<class policy_t, class scalar_t, class part_t, class index_t,
3633 class device_t, class array_t>
3635 typedef typename policy_t::member_type member_type;
3636 typedef Kokkos::View<scalar_t*> scalar_view_t;
3637
3638#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3639 typedef array_t value_type[];
3640#endif
3641
3643 array_t max_scalar;
3644
3652 Kokkos::View<index_t*, device_t> permutations;
3653 Kokkos::View<scalar_t *, device_t> coordinates;
3654 Kokkos::View<scalar_t**, device_t> weights;
3655 Kokkos::View<part_t*, device_t> parts;
3656 Kokkos::View<scalar_t *, device_t> cut_coordinates;
3657 Kokkos::View<index_t *, device_t> part_xadj;
3660
3661#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3662 Kokkos::View<double *, device_t> current_part_weights;
3663 Kokkos::View<scalar_t *, device_t> current_left_closest;
3664 Kokkos::View<scalar_t *, device_t> current_right_closest;
3665#endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP)
3666
3668 int mj_loop_count,
3669 array_t mj_max_scalar,
3670 part_t mj_concurrent_current_part,
3671 part_t mj_num_cuts,
3672 part_t mj_current_work_part,
3673 part_t mj_current_concurrent_num_parts,
3674 part_t mj_left_right_array_size,
3675 part_t mj_weight_array_size,
3676 Kokkos::View<index_t*, device_t> & mj_permutations,
3677 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678 Kokkos::View<scalar_t**, device_t> & mj_weights,
3679 Kokkos::View<part_t*, device_t> & mj_parts,
3680 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682 bool mj_uniform_weights0,
3683 scalar_t mj_sEpsilon
3684#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3688#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3689 ) :
3690 loop_count(mj_loop_count),
3691 max_scalar(mj_max_scalar),
3692 concurrent_current_part(mj_concurrent_current_part),
3693 num_cuts(mj_num_cuts),
3694 current_work_part(mj_current_work_part),
3695 current_concurrent_num_parts(mj_current_concurrent_num_parts),
3696 value_count_rightleft(mj_left_right_array_size),
3697 value_count_weights(mj_weight_array_size),
3698 value_count(mj_weight_array_size+mj_left_right_array_size),
3699 permutations(mj_permutations),
3700 coordinates(mj_coordinates),
3701 weights(mj_weights),
3702 parts(mj_parts),
3703 cut_coordinates(mj_cut_coordinates),
3704 part_xadj(mj_part_xadj),
3705 uniform_weights0(mj_uniform_weights0),
3706 sEpsilon(mj_sEpsilon)
3707#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708 ,current_part_weights(mj_current_part_weights),
3709 current_left_closest(mj_current_left_closest),
3710 current_right_closest(mj_current_right_closest)
3711#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3712 {
3713 }
3714
3715 size_t team_shmem_size (int team_size) const {
3716#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3717 int result = sizeof(array_t) *
3719#else
3720 int result = sizeof(array_t) *
3722#endif
3723
3724 // pad this to a multiple of 8 or it will run corrupt
3725 int remainder = result % 8;
3726 if(remainder != 0) {
3727 result += 8 - remainder;
3728 }
3729 return result;
3730 }
3731
3732 KOKKOS_INLINE_FUNCTION
3733#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3734 void operator() (const member_type & teamMember) const {
3735#else
3736 void operator() (const member_type & teamMember, value_type teamSum) const {
3737#endif
3738
3739 index_t all_begin = (concurrent_current_part == 0) ? 0 :
3741 index_t all_end = part_xadj(concurrent_current_part);
3742
3743 index_t num_working_points = all_end - all_begin;
3744 int num_teams = teamMember.league_size();
3745
3746 index_t stride = num_working_points / num_teams;
3747 if((num_working_points % num_teams) > 0) {
3748 stride += 1; // make sure we have coverage for the final points
3749 }
3750
3751 // the last team may have less work than the other teams
3752 // the last team can be empty (begin > end) if num_teams > stride
3753 // which is true for many teams and small numbers of coords (tests)
3754 index_t begin = all_begin + stride * teamMember.league_rank();
3755 index_t end = begin + stride;
3756 if(end > all_end) {
3757 end = all_end;
3758 }
3759
3760#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3761 size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3763
3764 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3765 sh_mem_size);
3766
3767 // init the shared array to 0
3768 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3769 for(int n = 0; n < value_count_weights; ++n) {
3770 shared_ptr[n] = 0;
3771 }
3772 for(int n = value_count_weights;
3774 shared_ptr[n] = -max_scalar;
3775 shared_ptr[n+1] = max_scalar;
3776 }
3777 });
3778 teamMember.team_barrier();
3779
3780 Kokkos::parallel_for(
3781 Kokkos::TeamThreadRange(teamMember, begin, end),
3782 [=] (index_t ii) {
3783#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3784 // create the team shared data - each thread gets one of the arrays
3785 size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3786 value_count_rightleft) * teamMember.team_size();
3787
3788 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3789 sh_mem_size);
3790
3791 // select the array for this thread
3792 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
3794
3795 // create reducer which handles the Zoltan2_MJArrayType class
3797 max_scalar, array,
3800
3801 Kokkos::parallel_reduce(
3802 Kokkos::TeamThreadRange(teamMember, begin, end),
3803#if (__cplusplus > 201703L)
3804 [=, this] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3805#else
3806 [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3807#endif
3808#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3809
3810 int i = permutations(ii);
3811 scalar_t coord = coordinates(i);
3812 array_t w = uniform_weights0 ? 1 : (array_t) weights(i,0);
3813
3814 // now check each part and it's right cut
3815 index_t part = parts(i)/2;
3816
3817 int upper = num_cuts;
3818 int lower = 0;
3819
3820 // binary search - find matching part
3821 while(true) {
3822 scalar_t a = (part == 0) ? -max_scalar : cut_coordinates(part-1);
3823 scalar_t b = (part == num_cuts) ? max_scalar : cut_coordinates(part);
3824
3825 if(coord >= a + sEpsilon && coord <= b - sEpsilon) {
3826#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3827 Kokkos::atomic_add(&shared_ptr[part*2], w);
3828#else
3829 threadSum.ptr[part*2] += w;
3830#endif
3831
3832 parts(i) = part*2;
3833
3834 // now handle the left/right closest part
3835#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3836 array_t new_value = (array_t) coord;
3837 array_t prev_value = shared_ptr[value_count_weights + part * 2 + 1];
3838 while(new_value < prev_value) {
3839 prev_value = Kokkos::atomic_compare_exchange(
3840 &shared_ptr[value_count_weights + part * 2 + 1],
3841 prev_value, new_value);
3842 }
3843 prev_value = shared_ptr[value_count_weights + part * 2 + 2];
3844 while(new_value > prev_value) {
3845 prev_value = Kokkos::atomic_compare_exchange(
3846 &shared_ptr[value_count_weights + part * 2 + 2],
3847 prev_value, new_value);
3848 }
3849#else
3850 // note cut to left needs to set right closest and cut to right needs
3851 // to set left closest. It's index +1 and +2 instead of -1 and +0
3852 // because right/left segment is padded with an extra pair at
3853 // begining and end to avoid branching with if checks.
3854 if(coord < threadSum.ptr[value_count_weights + part * 2 + 1]) {
3855 threadSum.ptr[value_count_weights + part * 2 + 1] = coord;
3856 }
3857 if(coord > threadSum.ptr[value_count_weights + part * 2 + 2]) {
3858 threadSum.ptr[value_count_weights + part * 2 + 2] = coord;
3859 }
3860#endif
3861
3862 break;
3863 }
3864 else if(part != num_cuts) {
3865 if(coord < b + sEpsilon && coord > b - sEpsilon) {
3866 // Note if on cut we set right/left closest to the cut itself
3867 // but we add +2 because we buffered the area with an extra slot
3868 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3869#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3870 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3871 shared_ptr[value_count_weights + part * 2 + 2] = b;
3872 shared_ptr[value_count_weights + part * 2 + 3] = b;
3873#else
3874 threadSum.ptr[part*2+1] += w;
3875 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3876 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3877#endif
3878
3879 parts(i) = part*2+1;
3880
3881 // Need to scan up for any other cuts of same coordinate
3882 // This is costly but it's only relevant for the fix4785 test
3883 // which loads a lot of coordinates on the same point, so without
3884 // this our cuts would all just sit at 0.
3885 part_t base_b = part;
3886 scalar_t base_coord = cut_coordinates(base_b);
3887 part += 1;
3888 while(part < num_cuts) {
3889 b = cut_coordinates(part);
3890 scalar_t delta = b - base_coord;
3891 if(delta < 0) delta = -delta;
3892 if(delta < sEpsilon) {
3893 // Note if on cut we set right/left closest to the cut itself
3894 // but we add +2 because we buffered the area with an extra slot
3895 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3896#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3897 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3898 shared_ptr[value_count_weights + part * 2 + 2] = b;
3899 shared_ptr[value_count_weights + part * 2 + 3] = b;
3900#else
3901 threadSum.ptr[part*2+1] += w;
3902 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3903 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3904#endif
3905 }
3906 else { break; }
3907 ++part;
3908 }
3909 part = base_b - 1;
3910 while(part >= 0) {
3911 b = cut_coordinates(part);
3912 scalar_t delta = b - base_coord;
3913 if(delta < 0) delta = -delta;
3914 if(delta < sEpsilon) {
3915 // Note if on cut we set right/left closest to the cut itself
3916 // but we add +2 because we buffered the area with an extra slot
3917 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3918#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3919 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3920 shared_ptr[value_count_weights + part * 2 + 2] = b;
3921 shared_ptr[value_count_weights + part * 2 + 3] = b;
3922#else
3923 threadSum.ptr[part*2+1] += w;
3924 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3925 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3926#endif
3927 }
3928 else { break; }
3929 --part;
3930 }
3931
3932 break;
3933 }
3934 }
3935
3936 if(loop_count != 0) {
3937 // subsequent loops can just step towards target
3938 if(coord < b) {
3939 part -= 1;
3940 }
3941 else {
3942 part += 1;
3943 }
3944 }
3945 else {
3946 // initial loop binary search
3947 if(coord < b) {
3948 if(part == lower + 1) {
3949 part = lower;
3950 }
3951 else {
3952 upper = part - 1;
3953 part -= (part - lower)/2;
3954 }
3955 }
3956 else if(part == upper - 1) {
3957 part = upper;
3958 }
3959 else {
3960 lower = part + 1;
3961 part += (upper - part)/2;
3962 }
3963 }
3964 }
3965#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3966 });
3967#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3968 }, arraySumReducer);
3969#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3970
3971 teamMember.team_barrier();
3972
3973 // collect all the team's results
3974#if (__cplusplus > 201703L)
3975 Kokkos::single(Kokkos::PerTeam(teamMember), [=, this] () {
3976#else
3977 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3978#endif
3979 for(int n = 0; n < value_count_weights; ++n) {
3980#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3981 Kokkos::atomic_add(&current_part_weights(n),
3982 static_cast<double>(shared_ptr[n]));
3983#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3984 teamSum[n] += array.ptr[n];
3985#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3986 }
3987
3988#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3989 int insert_left = 0;
3990 int insert_right = 0;
3991#endif
3992
3993 for(int n = 2 + value_count_weights;
3994 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3995#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3996 scalar_t new_value = shared_ptr[n+1];
3997 scalar_t prev_value = current_right_closest(insert_right);
3998 while(new_value < prev_value) {
3999 prev_value = Kokkos::atomic_compare_exchange(
4000 &current_right_closest(insert_right), prev_value, new_value);
4001 }
4002
4003 new_value = shared_ptr[n];
4004 prev_value = current_left_closest(insert_left);
4005 while(new_value > prev_value) {
4006 prev_value = Kokkos::atomic_compare_exchange(
4007 &current_left_closest(insert_left), prev_value, new_value);
4008 }
4009
4010 ++insert_left;
4011 ++insert_right;
4012#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4013 if(array.ptr[n] > teamSum[n]) {
4014 teamSum[n] = array.ptr[n];
4015 }
4016 if(array.ptr[n+1] < teamSum[n+1]) {
4017 teamSum[n+1] = array.ptr[n+1];
4018 }
4019#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4020 }
4021 });
4022
4023 teamMember.team_barrier();
4024 }
4025
4026#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4027 KOKKOS_INLINE_FUNCTION
4028 void join(value_type dst, const value_type src) const {
4029 for(int n = 0; n < value_count_weights; ++n) {
4030 dst[n] += src[n];
4031 }
4032
4033 for(int n = value_count_weights + 2;
4034 n < value_count_weights + value_count_rightleft - 2; n += 2) {
4035 if(src[n] > dst[n]) {
4036 dst[n] = src[n];
4037 }
4038 if(src[n+1] < dst[n+1]) {
4039 dst[n+1] = src[n+1];
4040 }
4041 }
4042 }
4043
4044 KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4045 for(int n = 0; n < value_count_weights; ++n) {
4046 dst[n] = 0;
4047 }
4048
4049 for(int n = value_count_weights;
4051 dst[n] = -max_scalar;
4052 dst[n+1] = max_scalar;
4053 }
4054 }
4055#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4056};
4057
// Computes, for each concurrent part still holding undetermined cuts, the
// weight falling in every part/cut slot plus the closest coordinate on each
// side of every cut, via a team-based reduction (ReduceWeightsFunctor), and
// finally prefix-sums the part weights on device.
// NOTE(review): this Doxygen rendering dropped some source lines; the
// parameter list and the functor-argument list below are missing entries
// (e.g. current_concurrent_num_parts / concurrent_current_part) - verify
// against the repository source before editing.
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
  mj_1D_part_get_part_weights(
  mj_part_t current_work_part,
  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
  int loop_count)
{
  auto local_is_cut_line_determined = is_cut_line_determined;
  auto local_thread_part_weights = thread_part_weights;
  auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
  auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;

  // Create some locals so we don't use this inside the kernels
  // which causes problems
  auto local_sEpsilon = this->sEpsilon;
  auto local_assigned_part_ids = this->assigned_part_ids;
  auto local_coordinate_permutations = this->coordinate_permutations;
  auto local_mj_weights = this->mj_weights;
  auto local_part_xadj = this->part_xadj;
  auto local_global_min_max_coord_total_weight =
    this->global_min_max_coord_total_weight;

  typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;

  auto local_device_num_partitioning_in_current_dim =
    device_num_partitioning_in_current_dim;

  // refresh the device copy of the incomplete-cut counters; the kernel
  // below reads it to decide which concurrent parts still need work
  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
  auto local_device_incomplete_cut_count = device_incomplete_cut_count;

  // running offset into the flat per-part weight array
  mj_part_t total_part_shift = 0;

  // running offset into the flat cut-coordinate array
  mj_part_t concurrent_cut_shifts = 0;
  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
    Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
      Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
        concurrent_cut_shifts, temp_cut_coords.size()));

    mj_part_t num_parts =
      host_num_partitioning_in_current_dim(current_work_part + kk);
    mj_part_t num_cuts = num_parts - 1;
    mj_part_t total_part_count = num_parts + num_cuts;
    mj_part_t weight_array_length = num_cuts + num_parts;

    // for right/left closest + buffer cut on either side
    mj_part_t right_left_array_length = (num_cuts + 2) * 2;

    // all cuts for this concurrent part already settled - just advance
    // the offsets and move on
    if(this->incomplete_cut_count(kk) == 0) {
      total_part_shift += total_part_count;
      concurrent_cut_shifts += num_cuts;
      continue;
    }

    // if not set use 60 - was initial testing amount but somewhat arbitrary
    auto policy_ReduceWeightsFunctor = policy_t(
      mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    int total_array_length =
      weight_array_length + right_left_array_length;
#endif

    // Using float here caused some numerical errors for coord on cut calculations.
    // Probably that can be fixed with proper epsilon adjustment but since cuda
    // doesn't reduce right now the shared memory pressure is no longer relevant.
    // Just use scalar_t to match the original algorithm.
    typedef mj_scalar_t array_t;

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", total_array_length);
#endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP

    // cuts owned by the concurrent parts before this one
    int offset_cuts = 0;
    for(int kk2 = 0; kk2 < kk; ++kk2) {
      offset_cuts +=
        host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
    }
    Kokkos::View<double *, device_t> my_current_part_weights =
      Kokkos::subview(local_thread_part_weights,
        std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
          total_part_shift + total_part_count));
    Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
      Kokkos::subview(local_thread_cut_left_closest_point,
        std::pair<mj_lno_t, mj_lno_t>(
          offset_cuts,
          local_thread_cut_left_closest_point.size()));
    Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
      Kokkos::subview(local_thread_cut_right_closest_point,
        std::pair<mj_lno_t, mj_lno_t>(
          offset_cuts,
          local_thread_cut_right_closest_point.size()));

    array_t max_scalar = std::numeric_limits<array_t>::max();

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    // initialize values
    // (on GPU builds the functor accumulates with atomics directly into
    // these views, so they must be reset to the reduction identity first)
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
      KOKKOS_LAMBDA (int dummy) {
      for(int n = 0; n < weight_array_length; ++n) {
        my_current_part_weights(n) = 0;
      }
      for(int n = 0; n < num_cuts; ++n) {
        my_current_left_closest(n) = -max_scalar;
        my_current_right_closest(n) = max_scalar;
      }
    });
#endif

    mj_part_t concurrent_current_part =
      current_work_part + kk;

    // NOTE(review): some argument lines are missing from this rendering
    // between max_scalar/num_cuts and before right_left_array_length
    // (likely concurrent_current_part and the permutation bounds) -
    // verify against the repository source.
    ReduceWeightsFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
      typename mj_node_t::device_type, array_t>
      teamFunctor(
        loop_count,
        max_scalar,
        num_cuts,
        right_left_array_length,
        weight_array_length,
        coordinate_permutations,
        mj_current_dim_coords,
        mj_weights,
        assigned_part_ids,
        local_temp_cut_coords,
        part_xadj,
        mj_uniform_weights(0), // host and currently only relevant to slot 0
        sEpsilon
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
        ,my_current_part_weights,
        my_current_left_closest,
        my_current_right_closest
#endif
      );

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    // GPU path: functor writes results itself via atomics
    Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
#else
    // host path: results come back through the reduction array
    Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
      teamFunctor, reduce_array);
    Kokkos::fence();
#endif

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    // copy the host-side reduction results into the device views
    auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);

    for(int i = 0; i < static_cast<int>(total_part_count); ++i) {
      hostArray(i) = reduce_array[i];
    }

    Kokkos::deep_copy(my_current_part_weights, hostArray);

    auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
    auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
    // (cut+1)*2 skips the buffer pair at the front of the rightleft range
    for(mj_part_t cut = 0; cut < num_cuts; ++cut) {
      hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
      hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
    }
    Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
    Kokkos::deep_copy(my_current_right_closest, hostRightArray);
#endif

    total_part_shift += total_part_count;
    concurrent_cut_shifts += num_cuts;
  }

  auto local_temp_cut_coords = temp_cut_coords;

  // prefix-sum the part weights on device so each slot holds the total
  // weight to the left of the corresponding part/cut boundary
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
      (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
    mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
      current_work_part + kk);
    mj_part_t num_cuts = num_parts - 1;
    mj_part_t total_part_count = num_parts + num_cuts;

    if(local_device_incomplete_cut_count(kk) > 0) {
      // get the prefix sum
      // This is an inefficiency but not sure if it matters much
      size_t offset = 0;
      size_t offset_cuts = 0;
      for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
        auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
          current_work_part + kk2);
        offset += num_parts_kk2 * 2 - 1;
        offset_cuts += num_parts_kk2 - 1;
      }

      for(mj_part_t i = 1; i < total_part_count; ++i) {
        // check for cuts sharing the same position; all cuts sharing a position
        // have the same weight == total weight for all cuts sharing the
        // position. Don't want to accumulate that total weight more than once.
        if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
          std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
            local_temp_cut_coords(offset_cuts + i /2 - 1))
            < local_sEpsilon) {
          // i % 2 = 0 when part i represents the cut coordinate.
          // if it is a cut, and if next cut also has the same coordinate, then
          // dont addup.
          local_thread_part_weights(offset + i)
            = local_thread_part_weights(offset + i-2);
          continue;
        }

        // otherwise do the prefix sum.
        local_thread_part_weights(offset + i) +=
          local_thread_part_weights(offset + i-1);
      }
    }
  });
}
4281
// Packs, for every concurrent part, the prefix-summed part weights and the
// per-cut left/right closest coordinates into the single flat output array
// total_part_weight_left_right_closests, laid out per part as:
// [part/cut weights][left closests][right closests]. Runs as a single
// device "thread" (RangePolicy over one index) since the work is small.
// NOTE(review): this rendering dropped the second parameter line of the
// signature (likely mj_part_t current_concurrent_num_parts) - verify
// against the repository source.
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_combine_rightleft_and_weights(
  mj_part_t current_work_part,
{
  auto local_thread_part_weights = this->thread_part_weights;
  auto local_is_cut_line_determined = this->is_cut_line_determined;
  auto local_thread_cut_left_closest_point =
    this->thread_cut_left_closest_point;
  auto local_thread_cut_right_closest_point =
    this->thread_cut_right_closest_point;
  auto local_total_part_weight_left_right_closests =
    this->total_part_weight_left_right_closests;
  auto local_device_num_partitioning_in_current_dim =
    device_num_partitioning_in_current_dim;
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
    KOKKOS_LAMBDA (int dummy) {

    size_t tlr_array_shift = 0;        // offset into the packed output array
    mj_part_t cut_shift = 0;           // offset into the flat cut arrays
    size_t total_part_array_shift = 0; // offset into the flat weight array

    // iterate for all concurrent parts to find the left and right closest
    // points in the process.
    for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {

      mj_part_t num_parts_in_part =
        local_device_num_partitioning_in_current_dim(current_work_part + i);
      mj_part_t num_cuts_in_part = num_parts_in_part - 1;
      size_t num_total_part_in_part =
        num_parts_in_part + size_t (num_cuts_in_part);

      // iterate for cuts in a single part.
      for(int ii = 0; ii < num_cuts_in_part; ++ii) {
        mj_part_t next = tlr_array_shift + ii;
        mj_part_t cut_index = cut_shift + ii;

        // only undetermined cuts still need their closest points published
        if(!local_is_cut_line_determined(cut_index)) {
          mj_scalar_t left_closest_in_process =
            local_thread_cut_left_closest_point(cut_index);
          mj_scalar_t right_closest_in_process =
            local_thread_cut_right_closest_point(cut_index);

          // store the left and right closes points.
          local_total_part_weight_left_right_closests(
            num_total_part_in_part + next) = left_closest_in_process;

          local_total_part_weight_left_right_closests(
            num_total_part_in_part + num_cuts_in_part + next) =
            right_closest_in_process;
        }
      }

      // copy the weights for slots whose cut is still undetermined
      // (plus the final slot, which has no associated cut)
      for(size_t j = 0; j < num_total_part_in_part; ++j) {
        mj_part_t cut_ind = j / 2 + cut_shift;

        // need to check j != num_total_part_in_part - 1
        // which is same as j/2 != num_cuts_in_part.
        // we cannot check it using cut_ind, because of the concurrent part
        // concantanetion.
        if(j == num_total_part_in_part - 1 ||
          !local_is_cut_line_determined(cut_ind)) {
          double pwj = local_thread_part_weights(total_part_array_shift + j);
          local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
        }
      }

      // set the shift position in the arrays
      cut_shift += num_cuts_in_part;
      tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
      total_part_array_shift += num_total_part_in_part;
    }
  });
}
4366
4379template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4380 typename mj_part_t, typename mj_node_t>
4381KOKKOS_INLINE_FUNCTION
4382void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4383 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4384 mj_scalar_t cut_lower_bound,
4385 mj_scalar_t cut_upper_weight,
4386 mj_scalar_t cut_lower_weight,
4387 mj_scalar_t expected_weight,
4388 mj_scalar_t &new_cut_position,
4389 mj_scalar_t sEpsilon) {
4390
4391 if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4392 new_cut_position = cut_upper_bound; //or lower bound does not matter.
4393 }
4394
4395 if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4396 new_cut_position = cut_lower_bound;
4397 }
4398
4399 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4400 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4401 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4402
4403 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4404 int scale_constant = 20;
4405 int shiftint= int (required_shift * scale_constant);
4406 if(shiftint == 0) shiftint = 1;
4407 required_shift = mj_scalar_t (shiftint) / scale_constant;
4408 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4409}
4410
4411#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4412
4413template<class policy_t, class scalar_t>
4415
4420
  // Construct a reducer over the caller-owned array wrapper `val` with
  // `mj_value_count` scalar slots to reduce. Only a pointer to `val` is
  // stored, so the wrapper must outlive the reduction.
  KOKKOS_INLINE_FUNCTION ArrayReducer(
    value_type &val,
    int mj_value_count) :
    value(&val),
    value_count(mj_value_count)
  {}
4427
4428 KOKKOS_INLINE_FUNCTION
4430 return *value;
4431 }
4432
4433 KOKKOS_INLINE_FUNCTION
4434 void join(value_type& dst, const value_type& src) const {
4435 for(int n = 0; n < value_count; ++n) {
4436 dst.ptr[n] += src.ptr[n];
4437 }
4438 }
4439
4440 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
4441 dst.ptr = value->ptr; // must update ptr
4442 for(int n = 0; n < value_count; ++n) {
4443 dst.ptr[n] = 0;
4444 }
4445 }
4446};
4447
4448#endif
4449
4450template<class policy_t, class scalar_t, class part_t, class index_t,
4451 class device_t, class array_t>
4453 typedef typename policy_t::member_type member_type;
4454 typedef Kokkos::View<scalar_t*> scalar_view_t;
4455
4456#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4457 typedef array_t value_type[];
4458#endif
4459
4462 Kokkos::View<index_t*, device_t> permutations;
4463 Kokkos::View<scalar_t *, device_t> coordinates;
4464 Kokkos::View<part_t*, device_t> parts;
4465 Kokkos::View<index_t *, device_t> part_xadj;
4466 Kokkos::View<index_t *, device_t> track_on_cuts;
4467
4468#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4469 Kokkos::View<int *, device_t> local_point_counts;
4470#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4471
4473 part_t mj_concurrent_current_part,
4474 part_t mj_weight_array_size,
4475 Kokkos::View<index_t*, device_t> & mj_permutations,
4476 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4477 Kokkos::View<part_t*, device_t> & mj_parts,
4478 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4479 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4480#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4481 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4482#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4483 ) :
4484 concurrent_current_part(mj_concurrent_current_part),
4485 value_count(mj_weight_array_size),
4486 permutations(mj_permutations),
4487 coordinates(mj_coordinates),
4488 parts(mj_parts),
4489 part_xadj(mj_part_xadj),
4490 track_on_cuts(mj_track_on_cuts)
4491#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4492 ,local_point_counts(mj_local_point_counts)
4493#endif
4494 {
4495 }
4496
4497 size_t team_shmem_size (int team_size) const {
4498#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4499 int result = sizeof(array_t) * (value_count);
4500#else
4501 int result = sizeof(array_t) * (value_count) * team_size;
4502#endif
4503
4504 // pad this to a multiple of 8 or it will run corrupt
4505 int remainder = result % 8;
4506 if(remainder != 0) {
4507 result += 8 - remainder;
4508 }
4509 return result;
4510 }
4511
  // Team kernel: each team takes a contiguous slice of the part's
  // coordinates; points not sitting on a cut are counted into their part
  // (shared-memory counters), points on a cut are appended to
  // track_on_cuts for a later serial pass. Results are merged into the
  // global counters (GPU: atomics; host: reduction into teamSum).
  KOKKOS_INLINE_FUNCTION
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
  void operator() (const member_type & teamMember) const {
#else
  void operator() (const member_type & teamMember, value_type teamSum) const {
#endif
    // NOTE(review): the continuation of this initializer (likely
    // part_xadj(concurrent_current_part - 1);) is missing from this
    // rendering - verify against the repository source.
    index_t all_begin = (concurrent_current_part == 0) ? 0 :
    index_t all_end = part_xadj(concurrent_current_part);

    index_t num_working_points = all_end - all_begin;
    int num_teams = teamMember.league_size();

    // split the coordinate range evenly across the teams
    index_t stride = num_working_points / num_teams;
    if((num_working_points % num_teams) > 0) {
      stride += 1; // make sure we have coverage for the final points
    }

    index_t begin = all_begin + stride * teamMember.league_rank();
    index_t end = begin + stride;
    if(end > all_end) {
      end = all_end; // the last team may have less work than the other teams
    }

    // the last slot of track_on_cuts is the shared insertion counter
    int track_on_cuts_insert_index = track_on_cuts.size() - 1;

    // create the team shared data - each thread gets one of the arrays
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    size_t sh_mem_size = sizeof(array_t) * (value_count);
#else
    size_t sh_mem_size =
      sizeof(array_t) * (value_count) * teamMember.team_size();
#endif

    array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
      sh_mem_size);

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    // init the shared array to 0
    Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
      for(int n = 0; n < value_count; ++n) {
        shared_ptr[n] = 0;
      }
    });
    teamMember.team_barrier();

    Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
      [=] (index_t ii) {
#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
    // select the array for this thread
    Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
      (value_count)]);

    // create reducer which handles the Zoltan2_MJArrayType class
    ArrayReducer<policy_t, array_t> arrayReducer(array, value_count);

    Kokkos::parallel_reduce(
      Kokkos::TeamThreadRange(teamMember, begin, end),
#if (__cplusplus > 201703L)
      [=, this] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
#else
      [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
#endif
#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP

      index_t coordinate_index = permutations(ii);
      // parts() encodes part*2 for a point inside a part, part*2+1 for a
      // point sitting on a cut
      part_t place = parts(coordinate_index);
      part_t part = place / 2;
      if(place % 2 == 0) {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
        Kokkos::atomic_add(&shared_ptr[part], 1);
#else
        threadSum.ptr[part] += 1;
#endif

        // resolve the encoding: store the final part id
        parts(coordinate_index) = part;
      }
      else {
        // fill a tracking array so we can process these slower points
        // in next cycle
        index_t set_index = Kokkos::atomic_fetch_add(
          &track_on_cuts(track_on_cuts_insert_index), 1);
        track_on_cuts(set_index) = ii;
      }
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    });
#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
    }, arrayReducer);
#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP

    teamMember.team_barrier();

    // collect all the team's results
#if (__cplusplus > 201703L)
    Kokkos::single(Kokkos::PerTeam(teamMember), [=, this] () {
#else
    Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
#endif
      for(int n = 0; n < value_count; ++n) {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
        Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
        teamSum[n] += array.ptr[n];
#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
      }
    });

    teamMember.team_barrier();
  }
4621
4622#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4623
4624 KOKKOS_INLINE_FUNCTION
4625 void join(value_type dst, const value_type src) const {
4626 for(int n = 0; n < value_count; ++n) {
4627 dst[n] += src[n];
4628 }
4629 }
4630
4631 KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4632 for(int n = 0; n < value_count; ++n) {
4633 dst[n] = 0;
4634 }
4635 }
4636#endif
4637};
4638
4654template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4655 typename mj_part_t, typename mj_node_t>
4656void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4657mj_create_new_partitions(
4658 mj_part_t num_parts,
4659 mj_part_t current_concurrent_work_part,
4660 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4661 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4662 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4663 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4664{
4665 // Get locals for cuda
4666 auto local_thread_part_weight_work = this->thread_part_weight_work;
4667 auto local_point_counts = this->thread_point_counts;
4668 auto local_distribute_points_on_cut_lines =
4669 this->distribute_points_on_cut_lines;
4670 auto local_thread_cut_line_weight_to_put_left =
4671 this->thread_cut_line_weight_to_put_left;
4672 auto local_sEpsilon = this->sEpsilon;
4673 auto local_coordinate_permutations = this->coordinate_permutations;
4674 auto local_mj_weights = this->mj_weights;
4675 auto local_assigned_part_ids = this->assigned_part_ids;
4676 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4677
4678 mj_part_t num_cuts = num_parts - 1;
4679
4680 Kokkos::parallel_for(
4681 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4682 KOKKOS_LAMBDA(int dummy) {
4683
4684 if(local_distribute_points_on_cut_lines) {
4685 for(int i = 0; i < num_cuts; ++i) {
4686 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4687 if(left_weight > local_sEpsilon) {
4688 // the weight of thread ii on cut.
4689 mj_scalar_t thread_ii_weight_on_cut =
4690 local_thread_part_weight_work(i * 2 + 1) -
4691 local_thread_part_weight_work(i * 2);
4692
4693 if(thread_ii_weight_on_cut < left_weight) {
4694 // if left weight is bigger than threads weight on cut.
4695 local_thread_cut_line_weight_to_put_left(i) =
4696 thread_ii_weight_on_cut;
4697 }
4698 else {
4699 // if thread's weight is bigger than space, then put only a portion.
4700 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4701 }
4702 left_weight -= thread_ii_weight_on_cut;
4703 }
4704 else {
4705 local_thread_cut_line_weight_to_put_left(i) = 0;
4706 }
4707 }
4708
4709 // this is a special case. If cutlines share the same coordinate,
4710 // their weights are equal. We need to adjust the ratio for that.
4711 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4712 if(std::abs(current_concurrent_cut_coordinate(i) -
4713 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4714 local_thread_cut_line_weight_to_put_left(i) -=
4715 local_thread_cut_line_weight_to_put_left(i - 1);
4716 }
4717 local_thread_cut_line_weight_to_put_left(i) =
4718 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4719 least_signifiance) * significance_mul) /
4720 static_cast<mj_scalar_t>(significance_mul);
4721 }
4722 }
4723
4724 for(mj_part_t i = 0; i < num_parts; ++i) {
4725 local_point_counts(i) = 0;
4726 }
4727 });
4728
4729 mj_lno_t coordinate_begin_index =
4730 current_concurrent_work_part == 0 ? 0 :
4731 host_part_xadj(current_concurrent_work_part - 1);
4732 mj_lno_t coordinate_end_index =
4733 host_part_xadj(current_concurrent_work_part);
4734
4735 mj_lno_t total_on_cut;
4736 Kokkos::parallel_reduce("Get total_on_cut",
4737 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4738 coordinate_begin_index, coordinate_end_index),
4739 KOKKOS_LAMBDA(int ii, mj_lno_t & val) {
4740 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4741 mj_part_t coordinate_assigned_place =
4742 local_assigned_part_ids(coordinate_index);
4743 if(coordinate_assigned_place % 2 == 1) {
4744 val += 1;
4745 }
4746 }, total_on_cut);
4747
4748 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4749 if(total_on_cut > 0) {
4750 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4751 "track_on_cuts", // would do WithoutInitialization but need last init to 0
4752 total_on_cut + 1); // extra index to use for tracking
4753 }
4754
4755 // here we need to parallel reduce an array to count coords in each part
4756 // atomically adding, especially for low part count would kill us
4757 // in the original setup we kept arrays allocated for each thread but for
4758 // the cuda version we'd like to avoid allocating N arrays for the number
4759 // of teams/threads which would be complicated based on running openmp or
4760 // cuda.
4761 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4762
4763 // if not set use 60 - somewhat arbitrary based on initial performance tests
4764 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4765
4766 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4767 typedef int array_t;
4768
4769 // just need parts - on the cuts will be handled in a separate serial
4770 // call after this.
4771#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4772 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", num_parts);
4773#endif
4774
4775 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4776 typename mj_node_t::device_type, array_t>teamFunctor(
4777 current_concurrent_work_part,
4778 num_parts,
4779 coordinate_permutations,
4780 mj_current_dim_coords,
4781 assigned_part_ids,
4782 part_xadj,
4783 track_on_cuts
4784#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4785 ,local_point_counts
4786#endif
4787 );
4788
4789#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4790 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4791#else
4792 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4793 Kokkos::fence();
4794#endif
4795
4796#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4797 for(mj_part_t part = 0; part < num_parts; ++part) {
4798 local_point_counts(part) = reduce_array[part];
4799 }
4800#endif
4801
4802 // the last member is utility used for atomically inserting the values.
4803 // Sorting here avoids potential indeterminancy in the partitioning results
4804 if(track_on_cuts.size() > 0) { // size 0 unused, or size is minimum of 2
4805 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4806 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1)); // do not sort last element
4807 Kokkos::sort(track_on_cuts_sort);
4808 }
4809
4810 bool uniform_weights0 = this->mj_uniform_weights(0);
4811 Kokkos::parallel_for(
4812 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4813 KOKKOS_LAMBDA (int dummy) {
4814
4815 for(int j = 0; j < total_on_cut; ++j) {
4816 int ii = track_on_cuts(j);
4817 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4818 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4819 local_mj_weights(coordinate_index,0);
4820 mj_part_t coordinate_assigned_place =
4821 local_assigned_part_ids(coordinate_index);
4822 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4823 // if it is on the cut.
4824 if(local_distribute_points_on_cut_lines &&
4825 local_thread_cut_line_weight_to_put_left(
4826 coordinate_assigned_part) > local_sEpsilon) {
4827 // if the rectilinear partitioning is allowed,
4828 // and the thread has still space to put on the left of the cut
4829 // then thread puts the vertex to left.
4830 local_thread_cut_line_weight_to_put_left(
4831 coordinate_assigned_part) -= coordinate_weight;
4832 // if putting the vertex to left increased the weight more
4833 // than expected, and if the next cut is on the same coordinate,
4834 // then we need to adjust how much weight next cut puts to its left as
4835 // well, in order to take care of the imbalance.
4836 if(local_thread_cut_line_weight_to_put_left(
4837 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4838 num_cuts - 1 &&
4839 std::abs(current_concurrent_cut_coordinate(
4840 coordinate_assigned_part+1) -
4841 current_concurrent_cut_coordinate(
4842 coordinate_assigned_part)) < local_sEpsilon)
4843 {
4844 local_thread_cut_line_weight_to_put_left(
4845 coordinate_assigned_part + 1) +=
4846 local_thread_cut_line_weight_to_put_left(
4847 coordinate_assigned_part);
4848 }
4849 ++local_point_counts(coordinate_assigned_part);
4850 local_assigned_part_ids(coordinate_index) =
4851 coordinate_assigned_part;
4852 }
4853 else {
4854 // if there is no more space on the left, put the coordinate to the
4855 // right of the cut.
4856 ++coordinate_assigned_part;
4857 // this while loop is necessary when a line is partitioned into more
4858 // than 2 parts.
4859 while(local_distribute_points_on_cut_lines &&
4860 coordinate_assigned_part < num_cuts)
4861 {
4862 // traverse all the cut lines having the same partitiong
4863 if(std::abs(current_concurrent_cut_coordinate(
4864 coordinate_assigned_part) -
4865 current_concurrent_cut_coordinate(
4866 coordinate_assigned_part - 1)) < local_sEpsilon)
4867 {
4868 // if line has enough space on left, put it there.
4869 if(local_thread_cut_line_weight_to_put_left(
4870 coordinate_assigned_part) > local_sEpsilon &&
4871 local_thread_cut_line_weight_to_put_left(
4872 coordinate_assigned_part) >=
4873 std::abs(local_thread_cut_line_weight_to_put_left(
4874 coordinate_assigned_part) - coordinate_weight))
4875 {
4876 local_thread_cut_line_weight_to_put_left(
4877 coordinate_assigned_part) -= coordinate_weight;
4878 // Again if it put too much on left of the cut,
4879 // update how much the next cut sharing the same coordinate will
4880 // put to its left.
4881 if(local_thread_cut_line_weight_to_put_left(
4882 coordinate_assigned_part) < 0 &&
4883 coordinate_assigned_part < num_cuts - 1 &&
4884 std::abs(current_concurrent_cut_coordinate(
4885 coordinate_assigned_part+1) -
4886 current_concurrent_cut_coordinate(
4887 coordinate_assigned_part)) < local_sEpsilon)
4888 {
4889 local_thread_cut_line_weight_to_put_left(
4890 coordinate_assigned_part + 1) +=
4891 local_thread_cut_line_weight_to_put_left(
4892 coordinate_assigned_part);
4893 }
4894 break;
4895 }
4896 }
4897 else {
4898 break;
4899 }
4900 ++coordinate_assigned_part;
4901 }
4902 local_point_counts(coordinate_assigned_part) += 1;
4903 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4904 }
4905 }
4906
4907 for(int j = 0; j < num_parts; ++j) {
4908 out_part_xadj(j) = local_point_counts(j);
4909 local_point_counts(j) = 0;
4910
4911 if(j != 0) {
4912 out_part_xadj(j) += out_part_xadj(j - 1);
4913 local_point_counts(j) += out_part_xadj(j - 1);
4914 }
4915 }
4916 });
4917
4918 // here we will determine insert indices for N teams
4919 // then all the teams can fill
4920
4921#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4922
4923 // This is the fastest so far - just straight atomic writes for CUDA
4924 // However this is not a deterministic result since it is atomic.
4925 // The final result will be deterministic.
4926 Kokkos::parallel_for(
4927 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4928 coordinate_begin_index, coordinate_end_index),
4929 KOKKOS_LAMBDA (mj_lno_t ii) {
4930 mj_lno_t i = local_coordinate_permutations(ii);
4931 mj_part_t p = local_assigned_part_ids(i);
4932 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4933 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4934 });
4935
4936#else
4937
4938#ifdef KOKKOS_ENABLE_OPENMP
4939 // will return and fix this - revert back to 1 for clear auto testing
4940 const int num_threads = 1; // Kokkos::OpenMP::impl_max_hardware_threads();
4941#else
4942 const int num_threads = 1;
4943#endif
4944
4945 const int num_teams = 1; // cuda is handled above using a different format
4946
4947 // allow init - we want all 0's first
4948 Kokkos::View<mj_lno_t*, device_t>
4949 point_counter("insert indices", num_teams * num_threads * num_parts);
4950
4951 // count how many coords per thread
4952 // then we will fill each independently
4953 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4954 block_policy(num_teams, num_threads);
4955 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4956 member_type member_type;
4957 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4958 mj_lno_t block_size = range / num_teams + 1;
4959 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4960 int team = team_member.league_rank();
4961 int team_offset = team * num_threads * num_parts;
4962 mj_lno_t begin = coordinate_begin_index + team * block_size;
4963 mj_lno_t end = begin + block_size;
4964 if(end > coordinate_end_index) {
4965 end = coordinate_end_index;
4966 }
4967
4968 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4969 [=] (mj_lno_t ii) {
4970 int thread = team_member.team_rank();
4971 mj_lno_t i = local_coordinate_permutations(ii);
4972 mj_part_t p = local_assigned_part_ids(i);
4973 int index = team_offset + thread * num_parts + p;
4974 ++point_counter(index);
4975 });
4976 });
4977
4978 // now prefix sum
4979 // we currently have the counts in the slots
4980 // we want the first counter for each part to be 0
4981 // then the rest should be the sum of all the priors
4982 Kokkos::parallel_for(
4983 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4984 KOKKOS_LAMBDA (int dummy) {
4985 int num_sets = point_counter.size() / num_parts;
4986 for(int set = num_sets - 1; set >= 1; set -=1) {
4987 int base = set * num_parts;
4988 for(int part = 0; part < num_parts; ++part) {
4989 point_counter(base + part) = point_counter(base + part - num_parts);
4990 }
4991 }
4992
4993 for(int part = 0; part < num_parts; ++part) {
4994 point_counter(part) = 0;
4995 }
4996
4997 for(int set = 1; set < num_sets; ++set) {
4998 int base = set * num_parts;
4999 for(int part = 0; part < num_parts; ++part) {
5000 point_counter(base + part) += point_counter(base + part - num_parts);
5001 }
5002 }
5003 });
5004
5005 // now permute
5006 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
5007 int team = team_member.league_rank();
5008 int team_offset = team * num_threads * num_parts;
5009 mj_lno_t begin = coordinate_begin_index + team * block_size;
5010 mj_lno_t end = begin + block_size;
5011 if(end > coordinate_end_index) {
5012 end = coordinate_end_index;
5013 }
5014 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5015 [=] (mj_lno_t ii) {
5016 int thread = team_member.team_rank();
5017 mj_lno_t i = local_coordinate_permutations(ii);
5018 mj_part_t p = local_assigned_part_ids(i);
5019 int index = team_offset + thread * num_parts + p;
5020 int set_counter = (point_counter(index)++) + local_point_counts(p);
5021 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5022 });
5023 });
5024#endif
5025}
5026
// Performs one step of the weighted-median (bisection) search for the cut
// coordinates of one concurrent part chain (index kk).  For each cut that is
// not yet determined it compares the global weight accumulated on the cut's
// left against the target weight and either:
//   - finalizes the cut (both side imbalances within tolerance),
//   - finalizes it by splitting the weight that sits exactly on the cut line
//     (rectilinear handling, when distribute_points_on_cut_lines is set), or
//   - tightens the cut's lower/upper bounds using neighboring cuts' weights
//     and computes a new trial position via mj_calculate_new_cut_position.
// Runs the per-cut work as a single team on device (see note below on why),
// then, if any cut needs rectilinear weight splitting, performs a cross-rank
// prefix scan of the on-line weights to decide how much each rank puts left.
// Side effects: updates this->incomplete_cut_count (via a device mirror) and
// all of the passed bound/weight/coordinate views.
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
  mj_node_t>::mj_get_new_cut_coordinates(
  mj_part_t current_concurrent_num_parts,
  mj_part_t kk,
  const mj_part_t &num_cuts,
  const double &used_imbalance_tolerance,
  Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
  Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
  Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
  Kokkos::View<bool *, device_t> & current_cut_line_determined,
  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
  Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
  Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
  Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
  Kokkos::View<mj_scalar_t *, device_t> &
    current_part_cut_line_weight_to_put_left,
  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
{
  // Push the host-side incomplete-cut counters to the device mirror; the
  // kernels below decrement them atomically as cuts get determined.
  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);

  // Local copies of members so the device lambdas do not capture `this`.
  auto local_device_incomplete_cut_count = device_incomplete_cut_count;
  auto local_sEpsilon = sEpsilon;
  auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
  auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
  auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
  auto local_global_min_max_coord_total_weight =
    global_min_max_coord_total_weight;

  const auto _sEpsilon = this->sEpsilon;
  // Note for a 22 part system I tried removing the outer loop
  // and doing each sub loop as a simple parallel_for over num_cuts.
  // But that was about twice as slow (10ms) as the current form (5ms)
  // so I think the overhead of launching the new global parallel kernels
  // is costly. This form is just running one team so effectively using
  // a single warp to process the cuts. I expect with a lot of parts this
  // might need changing.
  Kokkos::TeamPolicy<typename mj_node_t::execution_space>
    policy_one_team(1, Kokkos::AUTO());
  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
    member_type member_type;
  Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {

    // min/max coordinate and total weight for chain kk are packed into one
    // view with stride current_concurrent_num_parts.
    mj_scalar_t min_coordinate =
      local_global_min_max_coord_total_weight(kk);
    mj_scalar_t max_coordinate =
      local_global_min_max_coord_total_weight(
        kk + current_concurrent_num_parts);
    mj_scalar_t global_total_weight =
      local_global_min_max_coord_total_weight(
        kk + current_concurrent_num_parts * 2);

    Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
      [=] (mj_part_t i) {
      // if left and right closest points are not set yet,
      // set it to the cut itself.
      if(min_coordinate -
        current_global_left_closest_points(i) > local_sEpsilon) {
        current_global_left_closest_points(i) =
          current_cut_coordinates(i);
      }
      if(current_global_right_closest_points(i) -
        max_coordinate > local_sEpsilon) {
        current_global_right_closest_points(i) =
          current_cut_coordinates(i);
      }
    });
    team_member.team_barrier(); // for end of Kokkos::TeamThreadRange

    // Main per-cut update: decide whether each cut is done, or compute its
    // next trial position from tightened bounds.
    Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
      [=] (mj_part_t i) {
      using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
        mj_node_t>;
      // seen weight in the part
      mj_scalar_t seen_weight_in_part = 0;
      // expected weight for part.
      mj_scalar_t expected_weight_in_part = 0;
      // imbalance for the left and right side of the cut.
      double imbalance_on_left = 0, imbalance_on_right = 0;
      if(local_distribute_points_on_cut_lines) {
        // init the weight on the cut.
        local_global_rectilinear_cut_weight(i) = 0;
        local_process_rectilinear_cut_weight(i) = 0;
      }
      bool bContinue = false;
      // if already determined at previous iterations,
      // then just write the coordinate to new array, and proceed.
      if(current_cut_line_determined(i)) {
        new_current_cut_coordinates(i) =
          current_cut_coordinates(i);
        bContinue = true;
      }
      if(!bContinue) {
        //current weight of the part at the left of the cut line.
        // (part weights are stored interleaved: [2i] = weight strictly left,
        //  [2i+1] = weight left including coordinates on the cut line)
        seen_weight_in_part = current_global_part_weights(i * 2);

        //expected ratio
        expected_weight_in_part = current_part_target_weights(i);

        //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
        imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
          expected_weight_in_part);
        // rightImbalance = imbalanceOf(globalTotalWeight - seenW,
        // globalTotalWeight, 1 - expected);
        imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
          seen_weight_in_part, global_total_weight - expected_weight_in_part);
        bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
          used_imbalance_tolerance < local_sEpsilon ;
        bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
          used_imbalance_tolerance < local_sEpsilon;
        //if the cut line reaches to desired imbalance.
        if(is_left_imbalance_valid && is_right_imbalance_valid) {
          current_cut_line_determined(i) = true;
          Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
          new_current_cut_coordinates(i) = current_cut_coordinates(i);
        }
        else if(imbalance_on_left < 0) {
          //if left imbalance < 0 then we need to move the cut to right.
          if(local_distribute_points_on_cut_lines) {
            // if it is okay to distribute the coordinate on
            // the same coordinate to left and right.
            // then check if we can reach to the target weight by including the
            // coordinates in the part.
            if(current_global_part_weights(i * 2 + 1) ==
              expected_weight_in_part) {
              // if it is we are done.
              current_cut_line_determined(i) = true;
              Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);

              //then assign everything on the cut to the left of the cut.
              new_current_cut_coordinates(i) =
                current_cut_coordinates(i);
              //for this cut all the weight on cut will be put to left.
              current_part_cut_line_weight_to_put_left(i) =
                current_local_part_weights(i * 2 + 1) -
                current_local_part_weights(i * 2);
              bContinue = true;
            }
            else if(current_global_part_weights(i * 2 + 1) >
              expected_weight_in_part) {
              // if the weight is larger than the expected weight,
              // then we need to distribute some points to left, some to right.
              current_cut_line_determined(i) = true;
              Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);

              // increase the num cuts to be determined with rectilinear
              // partitioning.
              Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
              new_current_cut_coordinates(i) =
                current_cut_coordinates(i);
              // record this rank's weight sitting on the cut; the host-side
              // prefix scan below decides how much of it goes left.
              local_process_rectilinear_cut_weight[i] =
                current_local_part_weights(i * 2 + 1) -
                current_local_part_weights(i * 2);
              bContinue = true;
            }
          }

          if(!bContinue) {

            // we need to move further right,so set lower bound to current line,
            // and shift it to the closes point from right.
            current_cut_lower_bounds(i) =
              current_global_right_closest_points(i);

            //set the lower bound weight to the weight we have seen.
            current_cut_lower_bound_weights(i) = seen_weight_in_part;

            // compare the upper bound with what has been found in the
            // last iteration.
            // we try to make more strict bounds for the cut here.
            for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
              mj_scalar_t p_weight = current_global_part_weights(ii * 2);
              mj_scalar_t line_weight =
                current_global_part_weights(ii * 2 + 1);
              if(p_weight >= expected_weight_in_part) {
                // if a cut on the right has the expected weight, then we found
                // our cut position. Set up and low coordiantes to this
                // new cut coordinate, but we need one more iteration to
                // finalize the cut position, as wee need to update the part ids.
                if(p_weight == expected_weight_in_part) {
                  current_cut_upper_bounds(i) =
                    current_cut_coordinates(ii);
                  current_cut_upper_weights(i) = p_weight;
                  current_cut_lower_bounds(i) =
                    current_cut_coordinates(ii);
                  current_cut_lower_bound_weights(i) = p_weight;
                } else if(p_weight < current_cut_upper_weights(i)) {
                  // if a part weight is larger then my expected weight,
                  // but lower than my upper bound weight, update upper bound.
                  current_cut_upper_bounds(i) =
                    current_global_left_closest_points(ii);
                  current_cut_upper_weights(i) = p_weight;
                }
                break;
              }
              // if comes here then pw < ew
              // then compare the weight against line weight.
              if(line_weight >= expected_weight_in_part) {
                // if the line is larger than the expected weight, then we need
                // to reach to the balance by distributing coordinates on
                // this line.
                current_cut_upper_bounds(i) =
                  current_cut_coordinates(ii);
                current_cut_upper_weights(i) = line_weight;
                current_cut_lower_bounds(i) =
                  current_cut_coordinates(ii);
                current_cut_lower_bound_weights(i) = p_weight;
                break;
              }
              // if a stricter lower bound is found,
              // update the lower bound.
              if(p_weight <= expected_weight_in_part && p_weight >=
                current_cut_lower_bound_weights(i)) {
                current_cut_lower_bounds(i) =
                  current_global_right_closest_points(ii);
                current_cut_lower_bound_weights(i) = p_weight;
              }
            }

            // bisect between the (possibly tightened) bounds.
            mj_scalar_t new_cut_position = 0;
            algMJ_t::mj_calculate_new_cut_position(
              current_cut_upper_bounds(i),
              current_cut_lower_bounds(i),
              current_cut_upper_weights(i),
              current_cut_lower_bound_weights(i),
              expected_weight_in_part, new_cut_position,
              _sEpsilon);

            // if cut line does not move significantly.
            // then finalize the search.
            if(std::abs(current_cut_coordinates(i) -
              new_cut_position) < local_sEpsilon) {
              current_cut_line_determined(i) = true;
              Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);

              //set the cut coordinate and proceed.
              new_current_cut_coordinates(i) =
                current_cut_coordinates(i);
            } else {
              new_current_cut_coordinates(i) = new_cut_position;
            }
          } // bContinue
        } else {
          // need to move the cut line to left.
          // set upper bound to current line.
          current_cut_upper_bounds(i) =
            current_global_left_closest_points(i);
          current_cut_upper_weights(i) =
            seen_weight_in_part;
          // compare the current cut line weights with
          // previous upper and lower bounds.
          for(int ii = i - 1; ii >= 0; --ii) {
            mj_scalar_t p_weight =
              current_global_part_weights(ii * 2);
            mj_scalar_t line_weight =
              current_global_part_weights(ii * 2 + 1);
            if(p_weight <= expected_weight_in_part) {
              if(p_weight == expected_weight_in_part) {
                // if the weight of the part is my expected weight
                // then we find the solution.
                current_cut_upper_bounds(i) =
                  current_cut_coordinates(ii);
                current_cut_upper_weights(i) = p_weight;
                current_cut_lower_bounds(i) =
                  current_cut_coordinates(ii);
                current_cut_lower_bound_weights(i) = p_weight;
              }
              else if(p_weight > current_cut_lower_bound_weights(i)) {
                // if found weight is bigger than the lower bound
                // then update the lower bound.
                current_cut_lower_bounds(i) =
                  current_global_right_closest_points(ii);
                current_cut_lower_bound_weights(i) = p_weight;

                // at the same time, if weight of line is bigger than the
                // expected weight, then update the upper bound as well.
                // in this case the balance will be obtained by distributing
                // weights on this cut position.
                if(line_weight > expected_weight_in_part) {
                  current_cut_upper_bounds(i) =
                    current_global_right_closest_points(ii);
                  current_cut_upper_weights(i) = line_weight;
                }
              }
              break;
            }
            // if the weight of the cut on the left is still bigger than
            // my weight, and also if the weight is smaller than the current
            // upper weight, or if the weight is equal to current upper
            // weight, but on the left of the upper weight, then update
            // upper bound.
            if(p_weight >= expected_weight_in_part &&
              (p_weight < current_cut_upper_weights(i) ||
              (p_weight == current_cut_upper_weights(i) &&
                current_cut_upper_bounds(i) >
                current_global_left_closest_points(ii)))) {
              current_cut_upper_bounds(i) =
                current_global_left_closest_points(ii);
              current_cut_upper_weights(i) = p_weight;
            }
          }
          // bisect between the (possibly tightened) bounds.
          mj_scalar_t new_cut_position = 0;
          algMJ_t::mj_calculate_new_cut_position(
            current_cut_upper_bounds(i),
            current_cut_lower_bounds(i),
            current_cut_upper_weights(i),
            current_cut_lower_bound_weights(i),
            expected_weight_in_part,
            new_cut_position,
            _sEpsilon);

          // if cut line does not move significantly.
          if(std::abs(current_cut_coordinates(i) -
            new_cut_position) < local_sEpsilon) {
            current_cut_line_determined(i) = true;
            Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
            //set the cut coordinate and proceed.
            new_current_cut_coordinates(i) =
              current_cut_coordinates(i);
          } else {
            new_current_cut_coordinates(i) =
              new_cut_position;
          }
        }
      }; // bContinue
    });

    team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
  });

  // view_rectilinear_cut_count
  // Read back (single-element reduce) how many cuts need rectilinear
  // weight splitting; only then do we pay for the host round-trip below.
  mj_part_t rectilinear_cut_count;
  Kokkos::parallel_reduce("Read bDoingWork",
    Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
    KOKKOS_LAMBDA(int dummy, int & set_single) {
    set_single = view_rectilinear_cut_count(0);
  }, rectilinear_cut_count);

  if(rectilinear_cut_count > 0) {
    // Mirror the per-process on-cut weights to host for the Teuchos scan
    // (inclusive prefix sum across ranks), then copy the results back.
    auto host_local_process_rectilinear_cut_weight =
      Kokkos::create_mirror_view(Kokkos::HostSpace(),
        local_process_rectilinear_cut_weight);
    auto host_local_global_rectilinear_cut_weight =
      Kokkos::create_mirror_view(Kokkos::HostSpace(),
        local_global_rectilinear_cut_weight);
    Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
      local_process_rectilinear_cut_weight);
    Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
      local_global_rectilinear_cut_weight);
    Teuchos::scan<int,mj_scalar_t>(
      *comm, Teuchos::REDUCE_SUM,
      num_cuts,
      host_local_process_rectilinear_cut_weight.data(),
      host_local_global_rectilinear_cut_weight.data());
    Kokkos::deep_copy(local_process_rectilinear_cut_weight,
      host_local_process_rectilinear_cut_weight);
    Kokkos::deep_copy(local_global_rectilinear_cut_weight,
      host_local_global_rectilinear_cut_weight);

    // Decide, per cut, how much of this rank's on-cut weight goes to the
    // left part, giving earlier ranks (by scan order) priority.
    Kokkos::parallel_for("finish up mj_get_new_cut_coordinates",
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
      KOKKOS_LAMBDA(int dummy) {
      for(mj_part_t i = 0; i < num_cuts; ++i) {
        // if cut line weight to be distributed.
        if(local_global_rectilinear_cut_weight(i) > 0) {
          // expected weight to go to left of the cut.
          mj_scalar_t expected_part_weight = current_part_target_weights(i);
          // the weight that should be put to left of the cut.
          mj_scalar_t necessary_weight_on_line_for_left =
            expected_part_weight - current_global_part_weights(i * 2);

          // the weight of the cut in the process
          mj_scalar_t my_weight_on_line =
            local_process_rectilinear_cut_weight(i);

          // the sum of the cut weights upto this process,
          // including the weight of this process.
          mj_scalar_t weight_on_line_upto_process_inclusive =
            local_global_rectilinear_cut_weight(i);
          // the space on the left side of the cut after all processes
          // before this process (including this process)
          // puts their weights on cut to left.
          mj_scalar_t space_to_put_left =
            necessary_weight_on_line_for_left -
            weight_on_line_upto_process_inclusive;
          // add my weight to this space to find out how much space
          // is left to me.
          mj_scalar_t space_left_to_me =
            space_to_put_left + my_weight_on_line;

          /*
          cout << "expected_part_weight:" << expected_part_weight
            << " necessary_weight_on_line_for_left:"
            << necessary_weight_on_line_for_left
            << " my_weight_on_line" << my_weight_on_line
            << " weight_on_line_upto_process_inclusive:"
            << weight_on_line_upto_process_inclusive
            << " space_to_put_left:" << space_to_put_left
            << " space_left_to_me" << space_left_to_me << endl;
          */

          if(space_left_to_me < 0) {
            // space_left_to_me is negative and i dont need to put
            // anything to left.
            current_part_cut_line_weight_to_put_left(i) = 0;
          }
          else if(space_left_to_me >= my_weight_on_line) {
            // space left to me is bigger than the weight of the
            // processor on cut.
            // so put everything to left.
            current_part_cut_line_weight_to_put_left(i) =
              my_weight_on_line;
            // cout << "setting current_part_cut_line_weight_to_put_left
            // to my_weight_on_line:" << my_weight_on_line << endl;
          }
          else {
            // put only the weight as much as the space.
            current_part_cut_line_weight_to_put_left(i) =
              space_left_to_me;
            // cout << "setting current_part_cut_line_weight_to_put_left
            // to space_left_to_me:" << space_left_to_me << endl;
          }
        }
      }
      // reset the counter for the next invocation.
      view_rectilinear_cut_count(0) = 0;
    });
  }

  // Publish the updated incomplete-cut counters back to the host copy.
  Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
}
5505
5515template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5516 typename mj_part_t, typename mj_node_t>
5517void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5518 get_processor_num_points_in_parts(
5519 mj_part_t num_procs,
5520 mj_part_t num_parts,
5521 mj_gno_t *&num_points_in_all_processor_parts)
5522{
5523 // initially allocation_size is num_parts
5524 size_t allocation_size = num_parts * (num_procs + 1);
5525
5526 // this will be output
5527 // holds how many each processor has in each part.
5528 // last portion is the sum of all processor points in each part.
5529
5530 // allocate memory for the local num coordinates in each part.
5531 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5532 new mj_gno_t[allocation_size];
5533
5534 // this is the portion of the memory which will be used
5535 // at the summation to obtain total number of processors' points in each part.
5536 mj_gno_t *my_local_points_to_reduce_sum =
5537 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5538
5539 // this is the portion of the memory where each stores its local number.
5540 // this information is needed by other processors.
5541 mj_gno_t *my_local_point_counts_in_each_part =
5542 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5543
5544 // initialize the array with 0's.
5545 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5546 sizeof(mj_gno_t)*allocation_size);
5547
5548 auto local_new_part_xadj = this->new_part_xadj;
5549 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5550 Kokkos::ViewAllocateWithoutInitializing("points per part"), num_parts);
5551 Kokkos::parallel_for("get vals on device",
5552 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5553 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5554 points_per_part(i) =
5555 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5556 });
5557 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5558 Kokkos::deep_copy(host_points_per_part, points_per_part);
5559 for(int i = 0; i < num_parts; ++i) {
5560 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5561 }
5562
5563 // copy the local num parts to the last portion of array, so that this portion
5564 // will represent the global num points in each part after the reduction.
5565 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5566 sizeof(mj_gno_t) * (num_parts) );
5567
5568 // reduceAll operation.
5569 // the portion that belongs to a processor with index p
5570 // will start from myRank * num_parts.
5571 // the global number of points will be held at the index
5572 try{
5573 reduceAll<int, mj_gno_t>(
5574 *(this->comm),
5575 Teuchos::REDUCE_SUM,
5576 allocation_size,
5577 num_local_points_in_each_part_to_reduce_sum,
5578 num_points_in_all_processor_parts);
5579 }
5580 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
5581
5582 delete [] num_local_points_in_each_part_to_reduce_sum;
5583}
5584
5600template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5601 typename mj_part_t, typename mj_node_t>
5602bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5603 mj_check_to_migrate(
5604 size_t migration_reduce_all_population,
5605 mj_lno_t num_coords_for_last_dim_part,
5606 mj_part_t num_procs,
5607 mj_part_t num_parts,
5608 mj_gno_t *num_points_in_all_processor_parts)
5609{
5610 // if reduce all count and population in the last dim is too high
5611 if(migration_reduce_all_population > future_reduceall_cutoff) {
5612 return true;
5613 }
5614
5615 // if the work in a part per processor in the last dim is too low.
5616 if(num_coords_for_last_dim_part < min_work_last_dim) {
5617 return true;
5618 }
5619
5620 // if migration is to be checked and the imbalance is too high
5621 if(this->check_migrate_avoid_migration_option == 0) {
5622 double global_imbalance = 0;
5623 // global shift to reach the sum of coordiante count in each part.
5624 size_t global_shift = num_procs * num_parts;
5625
5626 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5627 for(mj_part_t i = 0; i < num_parts; ++i) {
5628 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5629 / double(num_procs);
5630
5631 global_imbalance += std::abs(ideal_num -
5632 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5633 }
5634 }
5635 global_imbalance /= num_parts;
5636 global_imbalance /= num_procs;
5637
5638 if(global_imbalance <= this->minimum_migration_imbalance) {
5639 return false;
5640 }
5641 else {
5642 return true;
5643 }
5644 }
5645 else {
5646 // if migration is forced
5647 return true;
5648 }
5649}
5650
5664template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5665 typename mj_part_t, typename mj_node_t>
5666void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5667 assign_send_destinations(
5668 mj_part_t num_parts,
5669 mj_part_t *part_assignment_proc_begin_indices,
5670 mj_part_t *processor_chains_in_parts,
5671 mj_lno_t *send_count_to_each_proc,
5672 int *coordinate_destinations) {
5673
5674 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5675 deep_copy(host_new_part_xadj, this->new_part_xadj);
5676
5677 auto host_new_coordinate_permutations =
5678 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5679 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5680
5681 for(mj_part_t p = 0; p < num_parts; ++p) {
5682 mj_lno_t part_begin = 0;
5683 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5684 mj_lno_t part_end = host_new_part_xadj(p);
5685 // get the first part that current processor will send its part-p.
5686 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5687 // initialize how many point I sent to this processor.
5688 mj_lno_t num_total_send = 0;
5689 for(mj_lno_t j=part_begin; j < part_end; j++) {
5690 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5691 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5692 // then get the next processor to send the points in part p.
5693 num_total_send = 0;
5694 // assign new processor to part_assign_begin[p]
5695 part_assignment_proc_begin_indices[p] =
5696 processor_chains_in_parts[proc_to_sent];
5697 // remove the previous processor
5698 processor_chains_in_parts[proc_to_sent] = -1;
5699 // choose the next processor as the next one to send.
5700 proc_to_sent = part_assignment_proc_begin_indices[p];
5701 }
5702 // write the gno index to corresponding position in sendBuf.
5703 coordinate_destinations[local_ind] = proc_to_sent;
5704 ++num_total_send;
5705 }
5706 }
5707}
5708
5729template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5730 typename mj_part_t, typename mj_node_t>
5731void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5732 mj_assign_proc_to_parts(
5733 mj_gno_t * num_points_in_all_processor_parts,
5734 mj_part_t num_parts,
5735 mj_part_t num_procs,
5736 mj_lno_t *send_count_to_each_proc,
5737 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5738 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5739 mj_part_t &out_part_index,
5740 mj_part_t &output_part_numbering_begin_index,
5741 int * coordinate_destinations) {
5742 mj_gno_t *global_num_points_in_parts =
5743 num_points_in_all_processor_parts + num_procs * num_parts;
5744 mj_part_t *num_procs_assigned_to_each_part = new mj_part_t[num_parts];
5745
5746 // boolean variable if the process finds its part to be assigned.
5747 bool did_i_find_my_group = false;
5748
5749 mj_part_t num_free_procs = num_procs;
5750 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5751
5752 double max_imbalance_difference = 0;
5753 mj_part_t max_differing_part = 0;
5754
5755 // find how many processor each part requires.
5756 for(mj_part_t i = 0; i < num_parts; i++) {
5757
5758 // scalar portion of the required processors
5759 double scalar_required_proc = num_procs *
5760 (double (global_num_points_in_parts[i]) /
5761 double (this->num_global_coords));
5762
5763 // round it to closest integer; make sure have at least one proc.
5764 mj_part_t required_proc =
5765 static_cast<mj_part_t> (0.5 + scalar_required_proc);
5766 if(required_proc == 0) required_proc = 1;
5767
5768 // if assigning the required num procs, creates problems for the rest
5769 // of the parts, then only assign {num_free_procs -
5770 // (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
5771 if(num_free_procs -
5772 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5773 required_proc = num_free_procs -
5774 (minimum_num_procs_required_for_rest_of_parts);
5775 }
5776
5777 // reduce the free processor count
5778 num_free_procs -= required_proc;
5779
5780 // reduce the free minimum processor count required for the rest of the
5781 // part by 1.
5782 --minimum_num_procs_required_for_rest_of_parts;
5783
5784 // part (i) is assigned to (required_proc) processors.
5785 num_procs_assigned_to_each_part[i] = required_proc;
5786
5787 // because of the roundings some processors might be left as unassigned.
5788 // we want to assign those processors to the part with most imbalance.
5789 // find the part with the maximum imbalance here.
5790 double imbalance_wrt_ideal =
5791 (scalar_required_proc - required_proc) / required_proc;
5792 if(imbalance_wrt_ideal > max_imbalance_difference) {
5793 max_imbalance_difference = imbalance_wrt_ideal;
5794 max_differing_part = i;
5795 }
5796 }
5797
5798 // assign extra processors to the part with maximum imbalance
5799 // than the ideal.
5800 if(num_free_procs > 0) {
5801 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5802 }
5803
5804 // now find what are the best processors with least migration for each part.
5805
5806 // part_assignment_proc_begin_indices ([i]) is the array that holds the
5807 // beginning index of a processor that processor sends its data for part - i
5808 mj_part_t *part_assignment_proc_begin_indices = new mj_part_t[num_parts];
5809
5810 // the next processor send is found in processor_chains_in_parts,
5811 // in linked list manner.
5812 mj_part_t *processor_chains_in_parts = new mj_part_t [num_procs];
5813 mj_part_t *processor_part_assignments = new mj_part_t[num_procs];
5814
5815 // initialize the assignment of each processor.
5816 // this has a linked list implementation.
5817 // the beginning of processors assigned
5818 // to each part is hold at part_assignment_proc_begin_indices[part].
5819 // then the next processor assigned to that part is located at
5820 // proc_part_assignments[part_assign_begins[part]], this is a chain
5821 // until the value of -1 is reached.
5822 for(int i = 0; i < num_procs; ++i ) {
5823 processor_part_assignments[i] = -1;
5824 processor_chains_in_parts[i] = -1;
5825 }
5826 for(int i = 0; i < num_parts; ++i ) {
5827 part_assignment_proc_begin_indices[i] = -1;
5828 }
5829
5830 // std::cout << "Before migration: mig type:" <<
5831 // this->migration_type << std::endl;
5832 // Allocate memory for sorting data structure.
5833 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5834 sort_item_num_part_points_in_procs =
5835 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5836
5837 for(mj_part_t i = 0; i < num_parts; ++i) {
5838 // the algorithm tries to minimize the cost of migration, by assigning the
5839 // processors with highest number of coordinates on that part.
5840 // here we might want to implement a maximum weighted bipartite matching
5841 // algorithm.
5842 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5843 sort_item_num_part_points_in_procs[ii].id = ii;
5844 // if processor is not assigned yet.
5845 // add its num points to the sort data structure.
5846 if(processor_part_assignments[ii] == -1) {
5847 sort_item_num_part_points_in_procs[ii].val =
5848 num_points_in_all_processor_parts[ii * num_parts + i];
5849 // indicate that the processor has positive weight.
5850 sort_item_num_part_points_in_procs[ii].signbit = 1;
5851 }
5852 else {
5853 // if processor is already assigned, insert -nLocal - 1 so that it
5854 // won't be selected again.
5855 // would be same if we simply set it to -1, but more information with
5856 // no extra cost (which is used later) is provided.
5857 // sort_item_num_part_points_in_procs[ii].val =
5858 // -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
5859
5860 // UPDATE: Since above gets warning when unsigned is used to
5861 // represent, we added extra bit to as sign bit to the sort item.
5862 // It is 1 for positives, 0 for negatives.
5863 sort_item_num_part_points_in_procs[ii].val =
5864 num_points_in_all_processor_parts[ii * num_parts + i];
5865 sort_item_num_part_points_in_procs[ii].signbit = 0;
5866 }
5867 }
5868
5869 // sort the processors in the part.
5870 uqSignsort<mj_part_t, mj_gno_t,char>
5871 (num_procs, sort_item_num_part_points_in_procs);
5872
5873 /*
5874 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5875 std::cout << "ii:" << ii << " " <<
5876 sort_item_num_part_points_in_procs[ii].id <<
5877 " " << sort_item_num_part_points_in_procs[ii].val <<
5878 " " << int(sort_item_num_part_points_in_procs[ii].signbit) <<
5879 std::endl;
5880 }
5881 */
5882
5883 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5884 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5885 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5886 ceil(total_num_points_in_part / double (required_proc_count)));
5887
5888 // starts sending to least heaviest part.
5889 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5890 mj_part_t next_proc_to_send_id =
5891 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5892 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5893 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5894
5895 // find the processors that will be assigned to this part, which are the
5896 // heaviest non assigned processors.
5897 for(mj_part_t ii = num_procs - 1;
5898 ii >= num_procs - required_proc_count; --ii) {
5899 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5900 // assign processor to part - i.
5901 processor_part_assignments[proc_id] = i;
5902 }
5903
5904 bool did_change_sign = false;
5905 // if processor has a minus count, reverse it.
5906 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5907 // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
5908 // TODO: SEE BUG 6194
5909 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5910 did_change_sign = true;
5911 sort_item_num_part_points_in_procs[ii].signbit = 1;
5912 }
5913 else {
5914 break;
5915 }
5916 }
5917
5918 if(did_change_sign) {
5919 // resort the processors in the part for the rest of the processors that
5920 // is not assigned.
5921 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5922 sort_item_num_part_points_in_procs);
5923 }
5924
5925 /*
5926 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5927 std::cout << "after resort ii:" << ii << " " <<
5928 sort_item_num_part_points_in_procs[ii].id <<
5929 " " << sort_item_num_part_points_in_procs[ii].val <<
5930 " " << int(sort_item_num_part_points_in_procs[ii].signbit ) <<
5931 std::endl;
5932 }
5933 */
5934
5935 // check if this processors is one of the procs assigned to this part.
5936 // if it is, then get the group.
5937 if(!did_i_find_my_group) {
5938 for(mj_part_t ii = num_procs - 1; ii >=
5939 num_procs - required_proc_count; --ii) {
5940
5941 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5942
5943 // add the proc to the group.
5944 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5945
5946 if(proc_id_to_assign == this->myRank) {
5947 // if the assigned process is me, then I find my group.
5948 did_i_find_my_group = true;
5949
5950 // set the beginning of part i to my rank.
5951 part_assignment_proc_begin_indices[i] = this->myRank;
5952 processor_chains_in_parts[this->myRank] = -1;
5953
5954 // set send count to myself to the number of points that I have
5955 // in part i.
5956 send_count_to_each_proc[this->myRank] =
5957 sort_item_num_part_points_in_procs[ii].val;
5958
5959 // calculate the shift required for the
5960 // output_part_numbering_begin_index
5961 for(mj_part_t in = 0; in < i; ++in) {
5962 output_part_numbering_begin_index +=
5963 (*next_future_num_parts_in_parts)[in];
5964 }
5965 out_part_index = i;
5966 }
5967 }
5968
      // if this was not my group,
      // clear the subcommunicator processor array.
5971 if(!did_i_find_my_group) {
5972 processor_ranks_for_subcomm.clear();
5973 }
5974 }
5975
5976 // send points of the nonassigned coordinates to the assigned coordinates.
5977 // starts from the heaviest nonassigned processor.
5978 // TODO we might want to play with this part, that allows more
5979 // computational imbalance but having better communication balance.
5980 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5981 mj_part_t nonassigned_proc_id =
5982 sort_item_num_part_points_in_procs[ii].id;
5983 mj_lno_t num_points_to_sent =
5984 sort_item_num_part_points_in_procs[ii].val;
5985
5986 // we set number of points to -to_sent - 1 for the assigned processors.
5987 // we reverse it here. This should not happen, as we have already
5988 // reversed them above.
5989#ifdef MJ_DEBUG
5990 if(num_points_to_sent < 0) {
5991 cout << "Migration - processor assignments - for part:" << i
5992 << "from proc:" << nonassigned_proc_id << " num_points_to_sent:"
5993 << num_points_to_sent << std::endl;
5994 std::terminate();
5995 }
5996#endif
5997
5998 switch (migration_type) {
5999 case 0:
6000 {
6001 // now sends the points to the assigned processors.
6002 while (num_points_to_sent > 0) {
6003 // if the processor has enough space.
6004 if(num_points_to_sent <= space_left_in_sent_proc) {
6005 // reduce the space left in the processor.
6006 space_left_in_sent_proc -= num_points_to_sent;
6007 // if my rank is the one that is sending the coordinates.
6008 if(this->myRank == nonassigned_proc_id) {
6009 // set my sent count to the sent processor.
6010 send_count_to_each_proc[next_proc_to_send_id] =
6011 num_points_to_sent;
6012 // save the processor in the list (processor_chains_in_parts
6013 // and part_assignment_proc_begin_indices)
6014 // that the processor will send its point in part-i.
6015 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6016 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6017 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6018 }
6019 num_points_to_sent = 0;
6020 }
6021 else {
6022 // there might be no space left in the processor.
6023 if(space_left_in_sent_proc > 0) {
6024 num_points_to_sent -= space_left_in_sent_proc;
6025
6026 //send as the space left in the processor.
6027 if(this->myRank == nonassigned_proc_id) {
6028 // send as much as the space in this case.
6029 send_count_to_each_proc[next_proc_to_send_id] =
6030 space_left_in_sent_proc;
6031 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6032 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6033 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6034 }
6035 }
6036 // change the sent part
6037 ++next_proc_to_send_index;
6038
6039#ifdef MJ_DEBUG
6040 if(next_part_to_send_index < nprocs - required_proc_count ) {
6041 cout << "Migration - processor assignments - for part:"
6042 << i
6043 << " next_part_to_send :" << next_part_to_send_index
6044 << " nprocs:" << nprocs
6045 << " required_proc_count:" << required_proc_count
6046 << " Error: next_part_to_send_index <" <<
6047 << " nprocs - required_proc_count" << std::endl;
6048 std::terminate();
6049 }
6050#endif
6051 // send the new id.
6052 next_proc_to_send_id =
6053 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6054 // set the new space in the processor.
6055 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6056 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6057 }
6058 }
6059 }
6060 break;
6061 default:
6062 {
6063 // to minimize messages, we want each processor to send its
6064 // coordinates to only a single point.
6065 // we do not respect imbalances here, we send all points to the
6066 // next processor.
6067 if(this->myRank == nonassigned_proc_id) {
6068 // set my sent count to the sent processor.
6069 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6070 // save the processor in the list (processor_chains_in_parts and
6071 // part_assignment_proc_begin_indices)
6072 // that the processor will send its point in part-i.
6073 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6074 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6075 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6076 }
6077 num_points_to_sent = 0;
6078 ++next_proc_to_send_index;
6079
6080 // if we made it to the heaviest processor we round robin and
6081 // go to beginning
6082 if(next_proc_to_send_index == num_procs) {
6083 next_proc_to_send_index = num_procs - required_proc_count;
6084 }
6085 // send the new id.
6086 next_proc_to_send_id =
6087 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6088 // set the new space in the processor.
6089 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6090 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6091 }
6092 }
6093 }
6094 }
6095
6096 /*
6097 for(int i = 0; i < num_procs;++i) {
6098 std::cout << "me:" << this->myRank << " to part:" << i << " sends:" <<
6099 send_count_to_each_proc[i] << std::endl;
6100 }
6101 */
6102
6103 this->assign_send_destinations(
6104 num_parts,
6105 part_assignment_proc_begin_indices,
6106 processor_chains_in_parts,
6107 send_count_to_each_proc,
6108 coordinate_destinations);
6109 delete [] part_assignment_proc_begin_indices;
6110 delete [] processor_chains_in_parts;
6111 delete [] processor_part_assignments;
6112 delete [] sort_item_num_part_points_in_procs;
6113 delete [] num_procs_assigned_to_each_part;
6114}
6115
6131template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6132 typename mj_part_t, typename mj_node_t>
6133void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6134 assign_send_destinations2(
6135 mj_part_t num_parts,
6136 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6137 int *coordinate_destinations,
6138 mj_part_t &output_part_numbering_begin_index,
6139 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6140{
6141 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6142 mj_part_t previous_processor = -1;
6143
6144 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6145 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6146
6147 auto local_new_coordinate_permutations =
6148 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6149 Kokkos::deep_copy(local_new_coordinate_permutations,
6150 this->new_coordinate_permutations);
6151
6152 for(mj_part_t i = 0; i < num_parts; ++i) {
6153 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6154
6155 // assigned processors are sorted.
6156 mj_lno_t part_begin_index = 0;
6157
6158 if(p > 0) {
6159 part_begin_index = local_new_part_xadj(p - 1);
6160 }
6161
6162 mj_lno_t part_end_index = local_new_part_xadj(p);
6163
6164 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6165 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6166 output_part_numbering_begin_index = part_shift_amount;
6167 }
6168 previous_processor = assigned_proc;
6169 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6170
6171 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6172 mj_lno_t localInd = local_new_coordinate_permutations(j);
6173 coordinate_destinations[localInd] = assigned_proc;
6174 }
6175 }
6176}
6177
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_assign_parts_to_procs(
  mj_gno_t * num_points_in_all_processor_parts,
  mj_part_t num_parts,
  mj_part_t num_procs,
  mj_lno_t *send_count_to_each_proc,
  std::vector<mj_part_t> *next_future_num_parts_in_parts,
  mj_part_t &out_num_part,
  std::vector<mj_part_t> &out_part_indices,
  mj_part_t &output_part_numbering_begin_index,
  int *coordinate_destinations) {

  // Used when parts outnumber processors: each processor receives one or
  // more whole parts. Greedily assigns parts (heaviest first) to the
  // processor that minimizes migration, fills send counts, and routes
  // every local coordinate to its destination rank.

  out_num_part = 0;

  // Global (summed over all ranks) per-part point counts are stored
  // right after the num_procs * num_parts per-rank block of the input.
  mj_gno_t *global_num_points_in_parts =
    num_points_in_all_processor_parts + num_procs * num_parts;
  out_part_indices.clear();

  // to sort the parts that is assigned to the processors.
  // id is the part number, sort value is the assigned processor id.
  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
    new uSortItem<mj_part_t, mj_part_t>[num_parts];
  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
    new uSortItem<mj_part_t, mj_gno_t>[num_procs];

  // calculate the optimal number of coordinates that should be assigned
  // to each processor.
  mj_lno_t work_each =
    mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);

  // to hold the left space as the number of coordinates to the optimal
  // number in each proc.
  mj_lno_t *space_in_each_processor = new mj_lno_t[num_procs];

  // initialize left space in each.
  for(mj_part_t i = 0; i < num_procs; ++i) {
    space_in_each_processor[i] = work_each;
  }

  // we keep track of how many parts each processor is assigned to.
  // because in some weird inputs, it might be possible that some
  // processors is not assigned to any part. Using these variables,
  // we force each processor to have at least one part.
  mj_part_t *num_parts_proc_assigned = new mj_part_t[num_procs];
  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
  int empty_proc_count = num_procs;

  // to sort the parts with decreasing order of their coordinates.
  // id are the part numbers, sort value is the number of points in each.
  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
    new uSortItem<mj_part_t, mj_gno_t>[num_parts];

  // initially we will sort the parts according to the number of coordinates
  // they have, so that we will start assigning with the part that has the most
  // number of coordinates.
  for(mj_part_t i = 0; i < num_parts; ++i) {
    sort_item_point_counts_in_parts[i].id = i;
    sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
  }

  // sort parts with increasing order of loads.
  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);

  // assigning parts to the processors
  // traverse the part with decreasing order of load.
  // first assign the heaviest part.
  for(mj_part_t j = 0; j < num_parts; ++j) {
    // sorted with increasing order, traverse inverse.
    mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;

    // load of the part
    mj_gno_t load = global_num_points_in_parts[i];

    // assigned processors
    mj_part_t assigned_proc = -1;

    // sort processors with increasing number of points in this part.
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      sort_item_num_points_of_proc_in_part_i[ii].id = ii;

      // if there are still enough parts to fill empty processors, then proceed
      // normally, but if empty processor count is equal to the number of part,
      // then we force to part assignments only to empty processors.
      if(empty_proc_count < num_parts - j ||
        num_parts_proc_assigned[ii] == 0) {
        // how many points processor ii has in part i?
        sort_item_num_points_of_proc_in_part_i[ii].val =
          num_points_in_all_processor_parts[ii * num_parts + i];
      }
      else {
        // processor already has a part and we must reserve the remaining
        // parts for still-empty processors; make it lose the sort.
        sort_item_num_points_of_proc_in_part_i[ii].val = -1;
      }
    }

    uqsort<mj_part_t, mj_gno_t>(num_procs,
      sort_item_num_points_of_proc_in_part_i);

    // traverse all processors with decreasing load.
    for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
      mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
      // prefer the candidate with the most remaining capacity.
      if(assigned_proc == -1 ||
        (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
        assigned_proc = ii;
      }
      else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
        if(ii < assigned_proc) {
          // ties go to lower proc
          // not necessary for a valid result but allows testing to compare
          // MPI results and have parts numbers assigned to the same boxes.
          // We don't break here because we may have more ties still to check.
          // The indeterminate state before this is due to Cuda using
          // atomics to refill the permutation array. So non-cuda runs don't
          // actualy need this since they will always have the same pattern.
          assigned_proc = ii;
        }
      }
      else {
        break; // now we can break - we have our part and no more ties.
      }
    }

    // first part given to this processor: one fewer empty processor left.
    if(num_parts_proc_assigned[assigned_proc]++ == 0) {
      --empty_proc_count;
    }

    // charge the part's load against the processor's remaining capacity
    // (may go negative; it is only a heuristic balance target).
    space_in_each_processor[assigned_proc] -= load;
    //to sort later, part-i is assigned to the processor - assignment.
    sort_item_part_to_proc_assignment[j].id = i; //part i

    // assigned to processor - assignment.
    sort_item_part_to_proc_assignment[j].val = assigned_proc;

    // if assigned processor is me, increase the number.
    if(assigned_proc == this->myRank) {
      out_num_part++;//assigned_part_count;
      out_part_indices.push_back(i);
    }

    // increase the send to that processor by the number of points in that
    // part, as everyone send their coordinates in this part to the
    // processor assigned to this part.
    send_count_to_each_proc[assigned_proc] +=
      num_points_in_all_processor_parts[this->myRank * num_parts + i];
  }

  // release the scratch arrays used during assignment.
  delete [] num_parts_proc_assigned;
  delete [] sort_item_num_points_of_proc_in_part_i;
  delete [] sort_item_point_counts_in_parts;
  delete [] space_in_each_processor;

  // sort assignments with respect to the assigned processors.
  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);

  // fill sendBuf.
  this->assign_send_destinations2(
    num_parts,
    sort_item_part_to_proc_assignment,
    coordinate_destinations,
    output_part_numbering_begin_index,
    next_future_num_parts_in_parts);

  delete [] sort_item_part_to_proc_assignment;
}
6363
6364
6388template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6389 typename mj_part_t, typename mj_node_t>
6390void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6391 mj_migration_part_proc_assignment(
6392 mj_gno_t * num_points_in_all_processor_parts,
6393 mj_part_t num_parts,
6394 mj_part_t num_procs,
6395 mj_lno_t *send_count_to_each_proc,
6396 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6397 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6398 mj_part_t &out_num_part,
6399 std::vector<mj_part_t> &out_part_indices,
6400 mj_part_t &output_part_numbering_begin_index,
6401 int *coordinate_destinations)
6402{
6403 processor_ranks_for_subcomm.clear();
6404 // if(this->num_local_coords > 0)
6405 if(num_procs > num_parts) {
6406 // if there are more processors than the number of current part
6407 // then processors share the existing parts.
6408 // at the end each processor will have a single part,
6409 // but a part will be shared by a group of processors.
6410 mj_part_t out_part_index = 0;
6411
6412 this->mj_assign_proc_to_parts(
6413 num_points_in_all_processor_parts,
6414 num_parts,
6415 num_procs,
6416 send_count_to_each_proc,
6417 processor_ranks_for_subcomm,
6418 next_future_num_parts_in_parts,
6419 out_part_index,
6420 output_part_numbering_begin_index,
6421 coordinate_destinations
6422 );
6423
6424 out_num_part = 1;
6425 out_part_indices.clear();
6426 out_part_indices.push_back(out_part_index);
6427 }
6428 else {
6429
6430 // there are more parts than the processors.
6431 // therefore a processor will be assigned multiple parts,
6432 // the subcommunicators will only have a single processor.
6433 processor_ranks_for_subcomm.push_back(this->myRank);
6434
6435 // since there are more parts then procs,
6436 // assign multiple parts to processors.
6437
6438 this->mj_assign_parts_to_procs(
6439 num_points_in_all_processor_parts,
6440 num_parts,
6441 num_procs,
6442 send_count_to_each_proc,
6443 next_future_num_parts_in_parts,
6444 out_num_part,
6445 out_part_indices,
6446 output_part_numbering_begin_index,
6447 coordinate_destinations);
6448 }
6449}
6450
6464template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6465 typename mj_part_t, typename mj_node_t>
6466void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6467 mj_migrate_coords(
6468 mj_part_t num_procs,
6469 mj_lno_t &num_new_local_points,
6470 std::string iteration,
6471 int *coordinate_destinations,
6472 mj_part_t num_parts)
6473{
6474
6475#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6476 if(sizeof(mj_lno_t) <= sizeof(int)) {
6477 // Cannot use Zoltan_Comm with local ordinals larger than ints.
6478 // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
6479 // may overflow.
6480 ZOLTAN_COMM_OBJ *plan = NULL;
6481 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6482 int num_incoming_gnos = 0;
6483 int message_tag = 7859;
6484
6485 this->mj_env->timerStart(MACRO_TIMERS,
6486 mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6487 int ierr = Zoltan_Comm_Create(
6488 &plan,
6489 int(this->num_local_coords),
6490 coordinate_destinations,
6491 mpi_comm,
6492 message_tag,
6493 &num_incoming_gnos);
6494
6495 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6496 this->mj_env->timerStop(MACRO_TIMERS,
6497 mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6498
6499 this->mj_env->timerStart(MACRO_TIMERS,
6500 mj_timer_base_string + "Migration Z1Migration-" + iteration);
6501
6502 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
6503 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6504 // view; need the explicit Host creation and deep_copy.
6505
6506 // migrate gnos.
6507 {
6508 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6509 Kokkos::HostSpace(), this->current_mj_gnos);
6510 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6511 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6512 Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), num_incoming_gnos);
6513 auto host_dst_gnos = Kokkos::create_mirror_view(
6514 Kokkos::HostSpace(), dst_gnos);
6515 message_tag++;
6516 ierr = Zoltan_Comm_Do(
6517 plan,
6518 message_tag,
6519 (char *) host_current_mj_gnos.data(),
6520 sizeof(mj_gno_t),
6521 (char *) host_dst_gnos.data());
6522 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6523 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6524 this->current_mj_gnos = dst_gnos;
6525 }
6526
6527 //migrate coordinates
6528 {
6529 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6530 auto host_src_coordinates = Kokkos::create_mirror_view(
6531 Kokkos::HostSpace(), this->mj_coordinates);
6532 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6533 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6534 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
6535 num_incoming_gnos, this->coord_dim);
6536 auto host_dst_coordinates = Kokkos::create_mirror_view(
6537 Kokkos::HostSpace(), dst_coordinates);
6538 for(int i = 0; i < this->coord_dim; ++i) {
6539 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6540 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6541 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6542 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6543 // Note Layout Left means we can do these in contiguous blocks
6544 message_tag++;
6545 ierr = Zoltan_Comm_Do(
6546 plan,
6547 message_tag,
6548 (char *) sub_host_src_coordinates.data(),
6549 sizeof(mj_scalar_t),
6550 (char *) sub_host_dst_coordinates.data());
6551 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6552 }
6553 deep_copy(dst_coordinates, host_dst_coordinates);
6554 this->mj_coordinates = dst_coordinates;
6555 }
6556
6557 // migrate weights.
6558 {
6559 auto host_src_weights = Kokkos::create_mirror_view(
6560 Kokkos::HostSpace(), this->mj_weights);
6561 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6562 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6563 Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
6564 num_incoming_gnos, this->num_weights_per_coord);
6565 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6566 for(int i = 0; i < this->num_weights_per_coord; ++i) {
6567 auto sub_host_src_weights
6568 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6569 auto sub_host_dst_weights
6570 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6571 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6572 // Copy because of layout
6573 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6574 sent_weight[n] = sub_host_src_weights(n);
6575 }
6576 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6577 message_tag++;
6578 ierr = Zoltan_Comm_Do(
6579 plan,
6580 message_tag,
6581 (char *) sent_weight.getRawPtr(),
6582 sizeof(mj_scalar_t),
6583 (char *) received_weight.getRawPtr());
6584 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6585 // Again we copy by index due to layout
6586 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6587 sub_host_dst_weights(n) = received_weight[n];
6588 }
6589 }
6590 deep_copy(dst_weights, host_dst_weights);
6591 this->mj_weights = dst_weights;
6592 }
6593
6594 // migrate owners.
6595 {
6596 // Note that owners we kept on Serial
6597 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6598 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6599 num_incoming_gnos);
6600 message_tag++;
6601 ierr = Zoltan_Comm_Do(
6602 plan,
6603 message_tag,
6604 (char *) owner_of_coordinate.data(),
6605 sizeof(int),
6606 (char *) dst_owners_of_coordinate.data());
6607 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6608 this->owner_of_coordinate = dst_owners_of_coordinate;
6609 }
6610
6611 // if num procs is less than num parts,
6612 // we need the part assigment arrays as well, since
6613 // there will be multiple parts in processor.
6614 {
6615 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6616 Kokkos::HostSpace(), this->assigned_part_ids);
6617 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6618 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6619 Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
6620 num_incoming_gnos);
6621 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6622 Kokkos::HostSpace(), dst_assigned_part_ids);
6623 mj_part_t *new_parts = new mj_part_t[num_incoming_gnos];
6624 if(num_procs < num_parts) {
6625 message_tag++;
6626 ierr = Zoltan_Comm_Do(
6627 plan,
6628 message_tag,
6629 (char *) host_src_assigned_part_ids.data(),
6630 sizeof(mj_part_t),
6631 (char *) host_dst_assigned_part_ids.data());
6632 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6633 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6634 }
6635 // In original code this would just assign to an uninitialized array
6636 // if num_procs < num_parts. We're doing the same here.
6637 this->assigned_part_ids = dst_assigned_part_ids;
6638 }
6639
6640 ierr = Zoltan_Comm_Destroy(&plan);
6641 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6642 num_new_local_points = num_incoming_gnos;
6643 this->mj_env->timerStop(MACRO_TIMERS,
6644 mj_timer_base_string + "Migration Z1Migration-" + iteration);
6645 }
6646 else
6647#endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6648 {
6649 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6650 "Migration DistributorPlanCreating-" + iteration);
6651
6652 Tpetra::Distributor distributor(this->comm);
6653 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6654 this->num_local_coords);
6655 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6656 this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6657 "Migration DistributorPlanCreating-" + iteration);
6658 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6659 "Migration DistributorMigration-" + iteration);
6660
6661 // note MPI buffers should all be on Kokkos::HostSpace and not
6662 // Kokkos::CudaUVMSpace.
6663 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6664 // view; need the explicit Host creation and deep_copy.
6665 // migrate gnos.
6666 {
6667 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6668 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
6669 num_incoming_gnos);
6670
6671 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6672 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
6673 this->current_mj_gnos.extent(0));
6674 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6675
6676 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6677
6678 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6679 Kokkos::ViewAllocateWithoutInitializing("gids"), num_incoming_gnos);
6680
6681 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6682 }
6683
6684 // migrate coordinates
6685 // coordinates in MJ are LayoutLeft since Tpetra Multivector is LayoutLeft
6686 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6687 dst_coordinates("mj_coordinates", num_incoming_gnos, this->coord_dim);
6688
6689 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6690 host_src_coordinates(
6691 Kokkos::ViewAllocateWithoutInitializing("host_coords"),
6692 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6693 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6694
6695 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6696 Kokkos::ViewAllocateWithoutInitializing("received_coord"),
6697 num_incoming_gnos);
6698
6699 for(int i = 0; i < this->coord_dim; ++i) {
6700
6701 // Note Layout Left means we can do these in contiguous blocks
6702
6703 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6704 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6705
6706 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6707
6708 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6709 received_coord);
6710
6711 // Kokkos::deep_copy will fence, I think, so it should be safe
6712 // to reuse received_coord in the next lop iteration
6713 }
6714 this->mj_coordinates = dst_coordinates;
6715
6716 // migrate weights.
6717 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6718 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6719 auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6720 dst_weights);
6721
6722 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6723 Kokkos::HostSpace(), this->mj_weights);
6724
6725 // contiguous buffers to gather potentially strided data
6726 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6727 Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
6728 this->num_local_coords);
6729
6730 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6731 Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
6732 num_incoming_gnos);
6733
6734 for(int i = 0; i < this->num_weights_per_coord; ++i) {
6735
6736 auto sub_host_src_weights
6737 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6738
6739 auto sub_host_dst_weights
6740 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6741
6742
6743 // Layout Right means the weights are not contiguous
6744 // However we don't have any systems setup with more than 1 weight so
6745 // really I have not tested any of this code with num weights > 1.
6746 // I think this is the right thing to do.
6747 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6748 sent_weight[n] = sub_host_src_weights(n);
6749 }
6750
6751 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6752
6753 // Again we copy by index due to layout
6754 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6755 sub_host_dst_weights(n) = received_weight[n];
6756 }
6757 }
6758 Kokkos::deep_copy(dst_weights, host_dst_weights);
6759 this->mj_weights = dst_weights;
6760
6761 // migrate owners
6762 {
6763 // Note owners we kept on Serial
6764 Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6765 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6766 num_incoming_gnos);
6767
6768 distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6769
6770 this->owner_of_coordinate = received_owners;
6771 }
6772
6773 // if num procs is less than num parts,
6774 // we need the part assigment arrays as well, since
6775 // there will be multiple parts in processor.
6776 if(num_procs < num_parts) {
6777 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6778 Kokkos::ViewAllocateWithoutInitializing("host_parts"),
6779 this->assigned_part_ids.extent(0));
6780 Kokkos::deep_copy(sent_partids, assigned_part_ids);
6781
6782 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6783 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
6784 num_incoming_gnos);
6785
6786 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6787
6788 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6789 ("assigned_part_ids", num_incoming_gnos);
6790 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6791 }
6792 else {
6793 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6794 ("assigned_part_ids", num_incoming_gnos);
6795 }
6796 this->mj_env->timerStop(MACRO_TIMERS, "" + mj_timer_base_string +
6797 "Migration DistributorMigration-" + iteration);
6798
6799 num_new_local_points = num_incoming_gnos;
6800 }
6801}
6802
6808template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6809 typename mj_part_t, typename mj_node_t>
6810void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6811 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6812{
6813 mj_part_t group_size = processor_ranks_for_subcomm.size();
6814 mj_part_t *ids = new mj_part_t[group_size];
6815 for(mj_part_t i = 0; i < group_size; ++i) {
6816 ids[i] = processor_ranks_for_subcomm[i];
6817 }
6818 ArrayView<const mj_part_t> idView(ids, group_size);
6819 this->comm = this->comm->createSubcommunicator(idView);
6820 delete [] ids;
6821}
6822
/*! \brief Fills new_coordinate_permutations and new_part_xadj after a
 * migration, grouping the local coordinates by their (compacted) part.
 *
 * \param output_num_parts number of non-empty parts this process ends
 * up with after migration.
 * \param num_parts number of part ids that may appear in
 * assigned_part_ids (part ids are in [0, num_parts), although only
 * output_num_parts of them are actually present locally).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  fill_permutation_array(
  mj_part_t output_num_parts,
  mj_part_t num_parts)
{
  // if there is single output part, then simply fill the permutation array.
  if(output_num_parts == 1) {
    // identity permutation: every local coordinate belongs to part 0.
    auto local_new_coordinate_permutations = this->new_coordinate_permutations;
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
        (0, this->num_local_coords),
      KOKKOS_LAMBDA(mj_lno_t i) {
      local_new_coordinate_permutations(i) = i;
    });
    auto local_new_part_xadj = this->new_part_xadj;
    auto local_num_local_coords = this->num_local_coords;
    // single-element write on device: part 0 ends at num_local_coords.
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
      KOKKOS_LAMBDA(int dummy) {
      local_new_part_xadj(0) = local_num_local_coords;
    });
  }
  else {
    auto local_num_local_coords = this->num_local_coords;
    auto local_assigned_part_ids = this->assigned_part_ids;
    auto local_new_part_xadj = this->new_part_xadj;
    auto local_new_coordinate_permutations = this->new_coordinate_permutations;

    // part shift holds the which part number an old part number corresponds to.
    Kokkos::View<mj_part_t*, device_t> part_shifts("part_shifts", num_parts);

    // otherwise we need to count how many points are there in each part.
    // we allocate here as num_parts, because the sent partids are up to
    // num_parts, although there are outout_num_parts different part.
    Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
      "num_points_in_parts", num_parts);

    // The counting, prefix sum, and permutation fill below are inherently
    // sequential (each step depends on the previous), so they run as a
    // single-iteration device kernel rather than a parallel loop.
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
      KOKKOS_LAMBDA(int dummy) {

      // histogram: how many local points fall in each (old) part id.
      for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
        mj_part_t ii = local_assigned_part_ids(i);
        ++num_points_in_parts(ii);
      }

      // write the end points of the parts.
      // Non-empty parts are compacted: old part i maps to new part
      // part_shifts(i), and local_new_part_xadj holds the running
      // (exclusive-end) prefix sum of their sizes.
      mj_part_t p = 0;
      mj_lno_t prev_index = 0;
      for(mj_part_t i = 0; i < num_parts; ++i) {
        if(num_points_in_parts(i) > 0) {
          local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
          prev_index += num_points_in_parts(i);
          part_shifts(i) = p++;
        }
      }

      // for the rest of the parts write the end index as end point.
      // NOTE(review): if no part were non-empty, p would be 0 here and
      // assigned_num_parts would be -1 (out-of-range read below);
      // presumably callers guarantee at least one local point — confirm.
      mj_part_t assigned_num_parts = p - 1;
      for(;p < num_parts; ++p) {
        local_new_part_xadj(p) =
          local_new_part_xadj(assigned_num_parts);
      }
      // reuse num_points_in_parts as the (end-offset) write cursors for
      // the permutation fill below.
      for(mj_part_t i = 0; i < output_num_parts; ++i) {
        num_points_in_parts(i) = local_new_part_xadj(i);
      }

      // write the permutation array here.
      // get the part of the coordinate i, shift it to obtain the new part number.
      // assign it to the end of the new part numbers pointer.
      // Iterating backwards with a pre-decremented cursor keeps the
      // original relative order of points within each part.
      for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
        mj_part_t part =
          part_shifts[mj_part_t(local_assigned_part_ids(i))];
        local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
      }
    });
  }
}
6908
6933template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6934 typename mj_part_t, typename mj_node_t>
6935bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6936 mj_perform_migration(
6937 mj_part_t input_num_parts,
6938 mj_part_t &output_num_parts,
6939 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6940 mj_part_t &output_part_begin_index,
6941 size_t migration_reduce_all_population,
6942 mj_lno_t num_coords_for_last_dim_part,
6943 std::string iteration,
6944 RCP<mj_partBoxVector_t> &input_part_boxes,
6945 RCP<mj_partBoxVector_t> &output_part_boxes)
6946{
6947 mj_part_t num_procs = this->comm->getSize();
6948 this->myRank = this->comm->getRank();
6949
6950 // this array holds how many points each processor has in each part.
6951 // to access how many points processor i has on part j,
6952 // num_points_in_all_processor_parts[i * num_parts + j]
6953 mj_gno_t *num_points_in_all_processor_parts =
6954 new mj_gno_t[input_num_parts * (num_procs + 1)];
6955
6956 // get the number of coordinates in each part in each processor.
6957 this->get_processor_num_points_in_parts(
6958 num_procs,
6959 input_num_parts,
6960 num_points_in_all_processor_parts);
6961
6962 // check if migration will be performed or not.
6963 if(!this->mj_check_to_migrate(
6964 migration_reduce_all_population,
6965 num_coords_for_last_dim_part,
6966 num_procs,
6967 input_num_parts,
6968 num_points_in_all_processor_parts)) {
6969 delete [] num_points_in_all_processor_parts;
6970 return false;
6971 }
6972
6973 mj_lno_t *send_count_to_each_proc = NULL;
6974 int *coordinate_destinations = new int[this->num_local_coords];
6975 send_count_to_each_proc = new mj_lno_t[num_procs];
6976
6977 for(int i = 0; i < num_procs; ++i) {
6978 send_count_to_each_proc[i] = 0;
6979 }
6980
6981 std::vector<mj_part_t> processor_ranks_for_subcomm;
6982 std::vector<mj_part_t> out_part_indices;
6983
6984 // determine which processors are assigned to which parts
6985 this->mj_migration_part_proc_assignment(
6986 num_points_in_all_processor_parts,
6987 input_num_parts,
6988 num_procs,
6989 send_count_to_each_proc,
6990 processor_ranks_for_subcomm,
6991 next_future_num_parts_in_parts,
6992 output_num_parts,
6993 out_part_indices,
6994 output_part_begin_index,
6995 coordinate_destinations);
6996
6997 delete [] send_count_to_each_proc;
6998 std::vector <mj_part_t> tmpv;
6999
7000 std::sort (out_part_indices.begin(), out_part_indices.end());
7001 mj_part_t outP = out_part_indices.size();
7002 mj_gno_t new_global_num_points = 0;
7003 mj_gno_t *global_num_points_in_parts =
7004 num_points_in_all_processor_parts + num_procs * input_num_parts;
7005
7006 if(this->mj_keep_part_boxes) {
7007 input_part_boxes->clear();
7008 }
7009
7010 // now we calculate the new values for next_future_num_parts_in_parts.
7011 // same for the part boxes.
7012 for(mj_part_t i = 0; i < outP; ++i) {
7013 mj_part_t ind = out_part_indices[i];
7014 new_global_num_points += global_num_points_in_parts[ind];
7015 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7016 if(this->mj_keep_part_boxes) {
7017 input_part_boxes->push_back((*output_part_boxes)[ind]);
7018 }
7019 }
7020
7021 // swap the input and output part boxes.
7022 if(this->mj_keep_part_boxes) {
7023 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7024 input_part_boxes = output_part_boxes;
7025 output_part_boxes = tmpPartBoxes;
7026 }
7027 next_future_num_parts_in_parts->clear();
7028 for(mj_part_t i = 0; i < outP; ++i) {
7029 mj_part_t p = tmpv[i];
7030 next_future_num_parts_in_parts->push_back(p);
7031 }
7032
7033 delete [] num_points_in_all_processor_parts;
7034
7035 mj_lno_t num_new_local_points = 0;
7036 //perform the actual migration operation here.
7037 this->mj_migrate_coords(
7038 num_procs,
7039 num_new_local_points,
7040 iteration,
7041 coordinate_destinations,
7042 input_num_parts);
7043
7044 delete [] coordinate_destinations;
7045 if(this->num_local_coords != num_new_local_points) {
7046 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7047 (Kokkos::ViewAllocateWithoutInitializing("new_coordinate_permutations"),
7048 num_new_local_points);
7049 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7050 (Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
7051 num_new_local_points);
7052 }
7053 this->num_local_coords = num_new_local_points;
7054 this->num_global_coords = new_global_num_points;
7055
7056 // create subcommunicator.
7057 this->create_sub_communicator(processor_ranks_for_subcomm);
7058
7059 processor_ranks_for_subcomm.clear();
7060
7061 // fill the new permutation arrays.
7062 this->fill_permutation_array(output_num_parts, input_num_parts);
7063
7064 return true;
7065}
7066
7085template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7086 typename mj_part_t, typename mj_node_t>
7087void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7088 create_consistent_chunks(
7089 mj_part_t num_parts,
7090 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7091 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7092 mj_lno_t coordinate_begin,
7093 mj_lno_t coordinate_end,
7094 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7095 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7096 int coordInd,
7097 bool longest_dim_part,
7098 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7099{
7100 // Note that this method is only used by task mapper
7101 // All code in this file has been verified to run with UVM off by running
7102 // mj tests and task mapper tests with UVM off. However for this particular
7103 // method I did not do much for UVM off. I heavily use device to host copies
7104 // and more or less preserve the original logic. Due to the handling of
7105 // arrays it will be a bit of work to convert this to as better form.
7106 // Since it's only relevant to task mapper and I wasn't sure how much priority
7107 // to give it, I put that on hold until further discussion.
7108 mj_part_t no_cuts = num_parts - 1;
7109
7110 // now if the rectilinear partitioning is allowed we decide how
7111 // much weight each thread should put to left and right.
7112 if(this->distribute_points_on_cut_lines) {
7113 auto local_thread_cut_line_weight_to_put_left =
7114 this->thread_cut_line_weight_to_put_left;
7115 auto local_thread_part_weight_work =
7116 this->thread_part_weight_work;
7117 auto local_sEpsilon = this->sEpsilon;
7118
7119 Kokkos::parallel_for(
7120 Kokkos::RangePolicy<typename mj_node_t::execution_space,
7121 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7122 // the left to be put on the left of the cut.
7123 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7124 if(left_weight > local_sEpsilon) {
7125 // the weight of thread ii on cut.
7126 mj_scalar_t thread_ii_weight_on_cut =
7127 local_thread_part_weight_work(i * 2 + 1) -
7128 local_thread_part_weight_work(i * 2);
7129 if(thread_ii_weight_on_cut < left_weight) {
7130 local_thread_cut_line_weight_to_put_left(i) =
7131 thread_ii_weight_on_cut;
7132 }
7133 else {
7134 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7135 }
7136 }
7137 else {
7138 local_thread_cut_line_weight_to_put_left(i) = 0;
7139 }
7140 });
7141
7142 if(no_cuts > 0) {
7143 auto local_least_signifiance = least_signifiance;
7144 auto local_significance_mul = significance_mul;
7145 Kokkos::parallel_for(
7146 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7147 (0, 1), KOKKOS_LAMBDA (int dummy) {
7148 // this is a special case. If cutlines share the same coordinate,
7149 // their weights are equal.
7150 // we need to adjust the ratio for that.
7151 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7152 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7153 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7154 mj_scalar_t delta = cut2 - cut1;
7155 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7156 if(abs_delta < local_sEpsilon) {
7157 local_thread_cut_line_weight_to_put_left(i) -=
7158 local_thread_cut_line_weight_to_put_left(i - 1);
7159 }
7160 local_thread_cut_line_weight_to_put_left(i) =
7161 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7162 local_least_signifiance) * local_significance_mul) /
7163 static_cast<mj_scalar_t>(local_significance_mul);
7164 }
7165 });
7166 }
7167 }
7168
7169 auto local_thread_point_counts = this->thread_point_counts;
7170 Kokkos::parallel_for(
7171 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7172 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7173 local_thread_point_counts(i) = 0;
7174 });
7175
7176 // for this specific case we dont want to distribute the points along the
7177 // cut position randomly, as we need a specific ordering of them. Instead,
7178 // we put the coordinates into a sort item, where we sort those
7179 // using the coordinates of points on other dimensions and the index.
7180
7181 // some of the cuts might share the same position.
7182 // in this case, if cut i and cut j share the same position
7183 // cut_map[i] = cut_map[j] = sort item index.
7184 mj_part_t *cut_map = new mj_part_t[no_cuts];
7185
7186 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7187 typedef std::vector< multiSItem > multiSVector;
7188 typedef std::vector<multiSVector> multiS2Vector;
7189
7190 // to keep track of the memory allocated.
7191 std::vector<mj_scalar_t *>allocated_memory;
7192
7193 // vector for which the coordinates will be sorted.
7194 multiS2Vector sort_vector_points_on_cut;
7195
7196 // the number of cuts that have different coordinates.
7197 mj_part_t different_cut_count = 1;
7198 cut_map[0] = 0;
7199
7200 // now we insert 1 sort vector for all cuts on the different
7201 // positins.if multiple cuts are on the same position,
7202 // they share sort vectors.
7203 multiSVector tmpMultiSVector;
7204 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7205
7206 auto local_current_concurrent_cut_coordinate =
7207 current_concurrent_cut_coordinate;
7208 auto host_current_concurrent_cut_coordinate =
7209 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7210 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7211 local_current_concurrent_cut_coordinate);
7212
7213 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7214 // if cuts share the same cut coordinates
7215 // set the cutmap accordingly.
7216 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7217 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7218 cut_map[i] = cut_map[i-1];
7219 }
7220 else {
7221 cut_map[i] = different_cut_count++;
7222 multiSVector tmp2MultiSVector;
7223 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7224 }
7225 }
7226 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7227 host_current_concurrent_cut_coordinate);
7228
7229 // now the actual part assigment.
7230 auto host_coordinate_permutations =
7231 Kokkos::create_mirror_view(coordinate_permutations);
7232 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7233
7234 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7235 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7236
7237 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7238 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7239
7240 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7241 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7242
7243 auto local_coord_dim = this->coord_dim;
7244
7245 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7246 mj_lno_t i = host_coordinate_permutations(ii);
7247 mj_part_t pp = host_assigned_part_ids(i);
7248 mj_part_t p = pp / 2;
7249 // if the coordinate is on a cut.
7250 if(pp % 2 == 1 ) {
7251 mj_scalar_t *vals = new mj_scalar_t[local_coord_dim -1];
7252 allocated_memory.push_back(vals);
7253
7254 // we insert the coordinates to the sort item here.
7255 int val_ind = 0;
7256
7257 if(longest_dim_part) {
7258 // std::cout << std::endl << std::endl;
7259 for(int dim = local_coord_dim - 2; dim >= 0; --dim) {
7260 // uSignedSortItem<int, mj_scalar_t, char>
7261 // *p_coord_dimension_range_sorted
7262 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7263 // std::cout << "next_largest_coord_dim: " <<
7264 // next_largest_coord_dim << " ";
7265 // Note refactor in progress
7266 vals[val_ind++] =
7267 host_mj_coordinates(i,next_largest_coord_dim);
7268 }
7269 }
7270 else {
7271 for(int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7272 vals[val_ind++] = host_mj_coordinates(i,dim);
7273 }
7274 for(int dim = 0; dim < coordInd; ++dim) {
7275 vals[val_ind++] = host_mj_coordinates(i,dim);
7276 }
7277 }
7278
7279 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7280 //insert the point to the sort vector pointed by the cut_map[p].
7281 mj_part_t cmap = cut_map[p];
7282 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7283 }
7284 else {
7285 //if it is not on the cut, simple sorting.
7286 ++host_thread_point_counts(p);
7287 host_assigned_part_ids(i) = p;
7288 }
7289 }
7290
7291 // sort all the sort vectors.
7292 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7293 std::sort (sort_vector_points_on_cut[i].begin(),
7294 sort_vector_points_on_cut[i].end());
7295 }
7296
7297 mj_part_t previous_cut_map = cut_map[0];
7298
7299 auto host_thread_cut_line_weight_to_put_left =
7300 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7301 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7302 thread_cut_line_weight_to_put_left);
7303
7304 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7305 Kokkos::deep_copy(host_mj_weights, mj_weights);
7306
7307 // this is how much previous part owns the weight of the current part.
7308 // when target part weight is 1.6, and the part on the left is given 2,
7309 // the left has an extra 0.4, while the right has missing 0.4 from the
7310 // previous cut.
7311 // This parameter is used to balance this issues.
7312 // in the above example weight_stolen_from_previous_part will be 0.4.
7313 // if the left part target is 2.2 but it is given 2,
7314 // then weight_stolen_from_previous_part will be -0.2.
7315 mj_scalar_t weight_stolen_from_previous_part = 0;
7316 for(mj_part_t p = 0; p < no_cuts; ++p) {
7317 mj_part_t mapped_cut = cut_map[p];
7318
7319 // if previous cut map is done, and it does not have the same index,
7320 // then assign all points left on that cut to its right.
7321 if(previous_cut_map != mapped_cut) {
7322 mj_lno_t sort_vector_end = (mj_lno_t)
7323 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7324 for(; sort_vector_end >= 0; --sort_vector_end) {
7325 multiSItem t =
7326 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7327 mj_lno_t i = t.index;
7328 ++host_thread_point_counts(p);
7329 host_assigned_part_ids(i) = p;
7330 }
7331 sort_vector_points_on_cut[previous_cut_map].clear();
7332 }
7333
7334 // TODO: MD: I dont remember why I have it reverse order here.
7335 mj_lno_t sort_vector_end = (mj_lno_t)
7336 sort_vector_points_on_cut[mapped_cut].size() - 1;
7337 // mj_lno_t sort_vector_begin= 0;
7338 // mj_lno_t sort_vector_size =
7339 // (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
7340
7341 // TODO commented for reverse order
7342 for(; sort_vector_end >= 0; --sort_vector_end) {
7343 // for(; sort_vector_begin < sort_vector_size; ++sort_vector_begin) {
7344 // TODO COMMENTED FOR REVERSE ORDER
7345 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7346 //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
7347 mj_lno_t i = t.index;
7348 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7349 this->mj_weights(i,0);
7350 // part p has enough space for point i, then put it to point i.
7351 if(host_thread_cut_line_weight_to_put_left(p) +
7352 weight_stolen_from_previous_part> this->sEpsilon &&
7353 host_thread_cut_line_weight_to_put_left(p) +
7354 weight_stolen_from_previous_part -
7355 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7356 weight_stolen_from_previous_part - w)> this->sEpsilon)
7357 {
7358 host_thread_cut_line_weight_to_put_left(p) -= w;
7359
7360 sort_vector_points_on_cut[mapped_cut].pop_back();
7361
7362 ++host_thread_point_counts(p);
7363 host_assigned_part_ids(i) = p;
7364 // if putting this weight to left overweights the left cut, then
7365 // increase the space for the next cut using
7366 // weight_stolen_from_previous_part.
7367 if(p < no_cuts - 1 &&
7368 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7369 if(mapped_cut == cut_map[p + 1] ) {
7370 // if the cut before the cut indexed at p was also at the same
7371 // position special case, as we handle the weight differently here.
7372 if(previous_cut_map != mapped_cut) {
7373 weight_stolen_from_previous_part =
7374 host_thread_cut_line_weight_to_put_left(p);
7375 }
7376 else {
7377 // if the cut before the cut indexed at p was also at the same
7378 // position we assign extra weights cumulatively in this case.
7379 weight_stolen_from_previous_part +=
7380 host_thread_cut_line_weight_to_put_left(p);
7381 }
7382 }
7383 else{
7384 weight_stolen_from_previous_part =
7385 -host_thread_cut_line_weight_to_put_left(p);
7386 }
7387 // end assignment for part p
7388 break;
7389 }
7390 } else {
7391 // if part p does not have enough space for this point
7392 // and if there is another cut sharing the same positon,
7393 // again increase the space for the next
7394 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7395 if(previous_cut_map != mapped_cut) {
7396 weight_stolen_from_previous_part =
7397 host_thread_cut_line_weight_to_put_left(p);
7398 }
7399 else {
7400 weight_stolen_from_previous_part +=
7401 host_thread_cut_line_weight_to_put_left(p);
7402 }
7403 }
7404 else{
7405 weight_stolen_from_previous_part =
7406 -host_thread_cut_line_weight_to_put_left(p);
7407 }
7408 // end assignment for part p
7409 break;
7410 }
7411 }
7412 previous_cut_map = mapped_cut;
7413 }
7414
7415 // TODO commented for reverse order
7416 // put everything left on the last cut to the last part.
7417 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7418 previous_cut_map].size() - 1;
7419
7420 // mj_lno_t sort_vector_begin= 0;
7421 // mj_lno_t sort_vector_size = (mj_lno_t)
7422 // sort_vector_points_on_cut[previous_cut_map].size();
7423 // TODO commented for reverse order
7424 for(; sort_vector_end >= 0; --sort_vector_end) {
7425 // TODO commented for reverse order
7426 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7427 // multiSItem t =
7428 // sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
7429 mj_lno_t i = t.index;
7430 ++host_thread_point_counts(no_cuts);
7431 host_assigned_part_ids(i) = no_cuts;
7432 }
7433
7434 sort_vector_points_on_cut[previous_cut_map].clear();
7435 delete [] cut_map;
7436
7437 //free the memory allocated for vertex sort items .
7438 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7439 for(mj_lno_t i = 0; i < vSize; ++i) {
7440 delete [] allocated_memory[i];
7441 }
7442
7443 auto local_out_part_xadj = out_part_xadj;
7444 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7445 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7446
7447 // creation of part_xadj as in usual case.
7448 for(mj_part_t j = 0; j < num_parts; ++j) {
7449 host_out_part_xadj(j) = host_thread_point_counts(j);
7450 host_thread_point_counts(j) = 0;
7451 }
7452
7453 // perform prefix sum for num_points in parts.
7454 for(mj_part_t j = 1; j < num_parts; ++j) {
7455 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7456 }
7457
7458 // shift the num points in threads thread to obtain the
7459 // beginning index of each thread's private space.
7460 for(mj_part_t j = 1; j < num_parts; ++j) {
7461 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7462 }
7463
7464 auto host_new_coordinate_permutations =
7465 Kokkos::create_mirror_view(new_coordinate_permutations);
7466 Kokkos::deep_copy(host_new_coordinate_permutations,
7467 new_coordinate_permutations);
7468
7469 // now thread gets the coordinate and writes the index of coordinate to
7470 // the permutation array using the part index we calculated.
7471 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7472 mj_lno_t i = host_coordinate_permutations(ii);
7473 mj_part_t p = host_assigned_part_ids(i);
7474 host_new_coordinate_permutations(coordinate_begin +
7475 host_thread_point_counts(p)++) = i;
7476 }
7477
7478 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7479 Kokkos::deep_copy(new_coordinate_permutations,
7480 host_new_coordinate_permutations);
7481 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7482}
7483
7493template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
// Write the final (global) part id of every local coordinate into
// this->assigned_part_ids, and -- if coordinates were ever migrated --
// ship the gnos and part ids back to the ranks that originally owned them.
//
// current_num_parts:       number of parts existing at the end of recursion
// output_part_begin_index: global offset added to local part indices
// output_part_boxes:       per-part bounding boxes to tag with global ids
//                          (only used when mj_keep_part_boxes is set)
// is_data_ever_migrated:   true if any migration happened during partitioning
7494 typename mj_part_t, typename mj_node_t>
7495void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7496 set_final_parts(
7497 mj_part_t current_num_parts,
7498 mj_part_t output_part_begin_index,
7499 RCP<mj_partBoxVector_t> &output_part_boxes,
7500 bool is_data_ever_migrated)
7501{
7502 this->mj_env->timerStart(MACRO_TIMERS,
7503 mj_timer_base_string + "Part_Assignment");
7504
 // Local copies of the members so the device lambdas below do not
 // capture `this` (which would dereference a host pointer on device).
7505 auto local_part_xadj = part_xadj;
7506 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7507 auto local_coordinate_permutations = coordinate_permutations;
7508 auto local_assigned_part_ids = assigned_part_ids;
7509
 // Tag each kept part box with its global part id.
7510 if(local_mj_keep_part_boxes) {
7511 for(int i = 0; i < current_num_parts; ++i) {
7512 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7513 }
7514 }
7515
 // One team per part: team i writes the global part id for every
 // coordinate in its permutation range [part_xadj(i-1), part_xadj(i)).
7516 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7517 current_num_parts, Kokkos::AUTO());
7518 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7519 member_type member_type;
7520 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7521 int i = team_member.league_rank();
7522 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7523 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7524 [=] (mj_lno_t ii) {
7525 mj_lno_t k = local_coordinate_permutations(ii);
7526 local_assigned_part_ids(k) = i + output_part_begin_index;
7527 });
7528 });
7529
7530 if(is_data_ever_migrated) {
7531#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7532 if(sizeof(mj_lno_t) <= sizeof(int)) {
7533
7534 // Cannot use Zoltan_Comm with local ordinals larger than ints.
7535 // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
7536 // may overflow.
7537
7538 // if data is migrated, then send part numbers to the original owners.
7539 ZOLTAN_COMM_OBJ *plan = NULL;
7540 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7541
7542 int incoming = 0;
7543 int message_tag = 7856;
7544
7545 this->mj_env->timerStart(MACRO_TIMERS,
7546 mj_timer_base_string + "Final Z1PlanCreating");
7547
7548 // setup incoming count
7549 int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
7550 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7551
7552 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7553 this->mj_env->timerStop(MACRO_TIMERS,
7554 mj_timer_base_string + "Final Z1PlanCreating" );
7555
7556 this->mj_env->timerStart(MACRO_TIMERS,
7557 mj_timer_base_string + "Final Z1PlanComm");
7558
7559 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
7560 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7561 // view; need the explicit Host creation and deep_copy.
7562
7563 // migrate gnos to actual owners.
7564 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7565 Kokkos::HostSpace(), this->current_mj_gnos);
7566 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7567 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7568 Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), incoming);
7569 auto host_dst_gnos = Kokkos::create_mirror_view(
7570 Kokkos::HostSpace(), dst_gnos);
7571 message_tag++;
7572 ierr = Zoltan_Comm_Do( plan, message_tag,
7573 (char *) host_current_mj_gnos.data(),
7574 sizeof(mj_gno_t), (char *) host_dst_gnos.data());
7575 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7576 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7577 this->current_mj_gnos = dst_gnos;
7578
7579 // migrate part ids to actual owners.
7580 auto host_src_part_ids = Kokkos::create_mirror_view(
7581 Kokkos::HostSpace(), this->assigned_part_ids);
7582 deep_copy(host_src_part_ids, this->assigned_part_ids);
7583 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7584 Kokkos::ViewAllocateWithoutInitializing("dst_part_ids"), incoming);
7585 auto host_dst_part_ids = Kokkos::create_mirror_view(
7586 Kokkos::HostSpace(), dst_part_ids);
7587 message_tag++;
7588 ierr = Zoltan_Comm_Do( plan, message_tag,
7589 (char *) host_src_part_ids.data(),
7590 sizeof(mj_part_t), (char *) host_dst_part_ids.data());
7591 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7592 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7593 this->assigned_part_ids = dst_part_ids;
7594
7595 ierr = Zoltan_Comm_Destroy(&plan);
7596 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7597
 // After the exchange this rank owns `incoming` coordinates again.
7598 this->num_local_coords = incoming;
7599
7600 this->mj_env->timerStop(MACRO_TIMERS,
7601 mj_timer_base_string + "Final Z1PlanComm");
7602 }
7603 else
7604#endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7605 {
 // General path: Tpetra::Distributor handles any mj_lno_t width.
7606 // setup incoming count
7607 this->mj_env->timerStart(MACRO_TIMERS,
7608 mj_timer_base_string + "Final DistributorPlanCreating");
7609 Tpetra::Distributor distributor(this->mj_problemComm);
7610 ArrayView<const mj_part_t> owners_of_coords(
7611 this->owner_of_coordinate.data(), this->num_local_coords);
7612 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7613 this->mj_env->timerStop(MACRO_TIMERS,
7614 mj_timer_base_string + "Final DistributorPlanCreating" );
7615
7616 this->mj_env->timerStart(MACRO_TIMERS,
7617 mj_timer_base_string + "Final DistributorPlanComm");
7618
7619 // migrate gnos to actual owners.
7620 // MPI buffers should be Kokkos::HostSpace, not Kokkos::CudaUVMSpace
7621 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7622 // view; need the explicit Host creation and deep_copy.
7623 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7624 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
7625 this->current_mj_gnos.extent(0));
7626 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7627
7628 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7629 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
7630 incoming);
7631
7632 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7633
7634 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7635 Kokkos::ViewAllocateWithoutInitializing("current_mj_gnos"), incoming);
7636
7637 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7638
7639 // migrate part ids to actual owners.
7640 Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7641 Kokkos::ViewAllocateWithoutInitializing("sent_partids"),
7642 this->assigned_part_ids.extent(0));
7643 Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7644
7645 Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7646 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
7647 incoming);
7648
7649 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7650
7651 this->assigned_part_ids =
7652 Kokkos::View<mj_part_t*, device_t>(
7653 Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
7654 incoming);
7655
7656 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
 // After the exchange this rank owns `incoming` coordinates again.
7657 this->num_local_coords = incoming;
7658
7659 this->mj_env->timerStop(MACRO_TIMERS,
7660 mj_timer_base_string + "Final DistributorPlanComm");
7661 }
7662 }
7663
7664 this->mj_env->timerStop(MACRO_TIMERS,
7665 mj_timer_base_string + "Part_Assignment");
7666
7667 this->mj_env->timerStart(MACRO_TIMERS,
7668 mj_timer_base_string + "Solution_Part_Assignment");
7669
7670 // ArrayRCP<mj_part_t> partId;
7671 // partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
7672
 // Reduce per-rank boxes into the global part boundaries kept for output.
7673 if(this->mj_keep_part_boxes) {
7674 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7675 }
7676
7677 this->mj_env->timerStop(MACRO_TIMERS,
7678 mj_timer_base_string + "Solution_Part_Assignment");
7679}
7680
// Store the user-tunable MJ partitioning knobs on the algorithm object.
//
// distribute_points_on_cut_lines_:        whether coordinates that fall
//   exactly on a cut line may be split between the two neighboring parts
// max_concurrent_part_calculation_:       how many parts are partitioned
//   concurrently in each recursion step
// check_migrate_avoid_migration_option_:  migration-check policy
//   (negative disables the migration check entirely; see callers)
// minimum_migration_imbalance_:           imbalance threshold below which
//   migration is considered not worthwhile
// migration_type_:                        migration strategy selector
//
// NOTE(review): the qualified signature line (original lines 7695-7696,
// presumably "void AlgMJ<...>::set_partitioning_parameters(") is missing
// from this extraction -- confirm against the repository source.
7693template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7694 typename mj_part_t, typename mj_node_t>
7697 bool distribute_points_on_cut_lines_,
7698 int max_concurrent_part_calculation_,
7699 int check_migrate_avoid_migration_option_,
7700 double minimum_migration_imbalance_,
7701 int migration_type_)
7702{
7703 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7704 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7705 this->check_migrate_avoid_migration_option =
7706 check_migrate_avoid_migration_option_;
7707 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7708 this->migration_type = migration_type_;
7709}
7710
// Main entry point of the multi-jagged (MJ) partitioning algorithm.
// Copies the problem description into the object, then performs
// `recursion_depth` rounds of 1-D partitioning (cycling through the
// coordinate dimensions), optionally migrating data between rounds, and
// finally writes the resulting part ids and gnos to the output views.
//
// Outputs: result_assigned_part_ids_ (part id per coordinate) and
// result_mj_gnos_ (the gnos those ids refer to, possibly migrated).
//
// NOTE(review): this extraction is missing original lines 7740-7741
// (presumably "void AlgMJ<...>::multi_jagged_part(") and 7764 (presumably
// the declaration of `execute_counter` used below) -- confirm against the
// repository source.
7738template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7739 typename mj_part_t, typename mj_node_t>
7742 const RCP<const Environment> &env,
7743 RCP<const Comm<int> > &problemComm,
7744 double imbalance_tolerance_,
7745 int num_teams_,
7746 size_t num_global_parts_,
7747 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7748 int recursion_depth_,
7749 int coord_dim_,
7750 mj_lno_t num_local_coords_,
7751 mj_gno_t num_global_coords_,
7752 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7753 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
7754 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7755 int num_weights_per_coord_,
7756 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7757 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7758 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7759 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7760 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7761{
7762
7763 // see comment above for Zoltan2_AlgMJ_TrackCallsCounter
7765 this->mj_timer_base_string = "MJ(" + std::to_string(execute_counter) + ") - ";
7766
 // Copy problem description into member state.
7767 this->mj_env = env;
7768 this->mj_problemComm = problemComm;
7769 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7770 this->mj_env->timerStart(MACRO_TIMERS,
7771 mj_timer_base_string + "Total");
7772 this->mj_env->debug(3, "In MultiJagged Jagged");
7773 this->imbalance_tolerance = imbalance_tolerance_;
7774 this->mj_num_teams = num_teams_;
7775 this->num_global_parts = num_global_parts_;
7776 this->part_no_array = part_no_array_;
7777 this->recursion_depth = recursion_depth_;
7778 this->coord_dim = coord_dim_;
7779 this->num_local_coords = num_local_coords_;
7780 this->num_global_coords = num_global_coords_;
7781 this->mj_coordinates = mj_coordinates_;
7782 this->initial_mj_gnos = initial_mj_gnos_;
7783 this->num_weights_per_coord = num_weights_per_coord_;
7784 this->mj_uniform_weights = mj_uniform_weights_;
7785 this->mj_weights = mj_weights_;
7786 this->mj_uniform_parts = mj_uniform_parts_;
7787
7788 // this->set_input_data();
7789
7790 this->set_part_specifications();
7791
7792 this->mj_env->timerStart(MACRO_TIMERS,
7793 mj_timer_base_string + "Allocate Views");
7794 this->allocate_set_work_memory();
7795 this->mj_env->timerStop(MACRO_TIMERS,
7796 mj_timer_base_string + "Allocate Views");
7797
7798 // We duplicate the comm as we create subcommunicators during migration.
7799 // We keep the problemComm as it is, while comm changes after each migration.
7800 this->comm = this->mj_problemComm->duplicate();
7801
7802#ifdef print_debug
7803 if(comm->getRank() == 0) {
7804 std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
7805 std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
7806 std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
7807 }
7808#endif
7809
7810 // initially there is a single partition
7811 mj_part_t current_num_parts = 1;
7812 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7813 this->all_cut_coordinates;
7814 this->mj_env->timerStart(MACRO_TIMERS,
7815 mj_timer_base_string + "Problem_Partitioning");
7816 mj_part_t output_part_begin_index = 0;
7817 mj_part_t future_num_parts = this->total_num_part;
7818 bool is_data_ever_migrated = false;
7819
 // Double-buffered vectors describing how many more parts each current
 // part must still be split into; swapped at every recursion step.
7820 std::vector<mj_part_t> *future_num_part_in_parts =
7821 new std::vector<mj_part_t> ();
7822 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7823 new std::vector<mj_part_t> ();
7824
7825 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7826
7827 RCP<mj_partBoxVector_t> input_part_boxes;
7828 RCP<mj_partBoxVector_t> output_part_boxes;
7829
7830 if(this->mj_keep_part_boxes) {
7831 input_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7832 output_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7833 compute_global_box();
7834 this->init_part_boxes(output_part_boxes);
7835 }
7836
7837 auto local_part_xadj = this->part_xadj;
7838
7839 // Need a device counter - how best to allocate?
7840 // Putting this allocation in the loops is very costly so moved out here.
7841 Kokkos::View<mj_part_t*, device_t>
7842 view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
7843 Kokkos::View<size_t*, device_t>
7844 view_total_reduction_size("view_total_reduction_size", 1);
7845
 // Main recursion: one 1-D partitioning pass per step, cycling through
 // the coordinate dimensions (coordInd = i % coord_dim below).
7846 for(int i = 0; i < this->recursion_depth; ++i) {
7847
7848 // convert i to string to be used for debugging purposes.
7849 std::string istring = std::to_string(i);
7850
7851 // next_future_num_parts_in_parts will be as the size of outnumParts,
7852 // and this will hold how many more parts that each output part
7853 // should be divided. this array will also be used to determine the weight
7854 // ratios of the parts. swap the arrays to use iteratively.
7855 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7856 future_num_part_in_parts = next_future_num_parts_in_parts;
7857 next_future_num_parts_in_parts = tmpPartVect;
7858
7859 // clear next_future_num_parts_in_parts array as
7860 // getPartitionArrays expects it to be empty.
7861 next_future_num_parts_in_parts->clear();
7862 if(this->mj_keep_part_boxes) {
7863 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7864 input_part_boxes = output_part_boxes;
7865 output_part_boxes = tmpPartBoxes;
7866 output_part_boxes->clear();
7867 }
7868
7869 // returns the total no. of output parts for this dimension partitioning.
7870 mj_part_t output_part_count_in_dimension =
7871 this->update_part_num_arrays(
7872 future_num_part_in_parts,
7873 next_future_num_parts_in_parts,
7874 future_num_parts,
7875 current_num_parts,
7876 i,
7877 input_part_boxes,
7878 output_part_boxes, 1);
7879
7880 // if the number of obtained parts equal to current number of parts,
7881 // skip this dimension. For example, this happens when 1 appears in
7882 // the input part array, e.g. P=4,5,1,2
7883 if(output_part_count_in_dimension == current_num_parts) {
7884 //still need to swap the input output arrays.
7885 tmpPartVect= future_num_part_in_parts;
7886 future_num_part_in_parts = next_future_num_parts_in_parts;
7887 next_future_num_parts_in_parts = tmpPartVect;
7888
7889 if(this->mj_keep_part_boxes) {
7890 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7891 input_part_boxes = output_part_boxes;
7892 output_part_boxes = tmpPartBoxes;
7893 }
7894 continue;
7895 }
7896
7897 // get the coordinate axis along which the partitioning will be done.
7898 int coordInd = i % this->coord_dim;
7899
7900 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7901 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7902
7903 this->mj_env->timerStart(MACRO_TIMERS,
7904 mj_timer_base_string + "Problem_Partitioning_" + istring);
7905
7906 // alloc Memory to point the indices
7907 // of the parts in the permutation array.
7908 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7909 "new part xadj", output_part_count_in_dimension);
7910
7911 // the index where in the new_part_xadj will be written.
7912 mj_part_t output_part_index = 0;
7913
7914 // whatever is written to output_part_index will be added with
7915 // output_coordinate_end_index so that the points will be shifted.
7916 mj_part_t output_coordinate_end_index = 0;
7917
7918 mj_part_t current_work_part = 0;
7919 mj_part_t current_concurrent_num_parts =
7920 std::min(current_num_parts - current_work_part,
7921 this->max_concurrent_part_calculation);
7922
7923 mj_part_t obtained_part_index = 0;
7924
 // Host mirrors used to read back min/max/weight results computed on device.
7925 auto host_process_local_min_max_coord_total_weight =
7926 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7927 auto host_global_min_max_coord_total_weight =
7928 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7929
7930 // run for all available parts.
7931 for(; current_work_part < current_num_parts;
7932 current_work_part += current_concurrent_num_parts) {
7933
7934 current_concurrent_num_parts =
7935 std::min(current_num_parts - current_work_part,
7936 this->max_concurrent_part_calculation);
7937
 // Determine (on device) whether any concurrent part actually needs
 // to be split in this dimension.
7938 int bDoingWork_int; // Can't reduce on bool so use int
7939 auto local_device_num_partitioning_in_current_dim =
7940 device_num_partitioning_in_current_dim;
7941 Kokkos::parallel_reduce("Read bDoingWork",
7942 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7943 KOKKOS_LAMBDA(int dummy, int & set_single) {
7944 set_single = 0;
7945 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7946 if(local_device_num_partitioning_in_current_dim(
7947 current_work_part + kk) != 1) {
7948 set_single = 1;
7949 break;
7950 }
7951 }
7952 }, bDoingWork_int);
7953 bool bDoingWork = (bDoingWork_int != 0) ? true : false;
7954
7955 this->mj_get_local_min_max_coord_totW(
7956 current_work_part,
7957 current_concurrent_num_parts,
7958 mj_current_dim_coords);
7959
7960 // 1D partitioning
7961 if(bDoingWork) {
7962 // obtain global Min max of the part.
7963 this->mj_get_global_min_max_coord_totW(
7964 current_concurrent_num_parts,
7965 this->process_local_min_max_coord_total_weight,
7966 this->global_min_max_coord_total_weight);
7967
7968 // represents the total number of cutlines
7969 // whose coordinate should be determined.
7970 mj_part_t total_incomplete_cut_count = 0;
7971
7972 // Compute weight ratios for parts & cuts:
7973 // e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
7974 // part0 cut0 part1 cut1 part2 cut2 part3
7975 mj_part_t concurrent_part_cut_shift = 0;
7976 mj_part_t concurrent_part_part_shift = 0;
7977
7978 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7979
7980 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7981 global_min_max_coord_total_weight);
7982
 // Layout of the min/max/weight view: [mins | maxs | total weights],
 // each section of length current_concurrent_num_parts.
7983 mj_scalar_t min_coordinate =
7984 host_global_min_max_coord_total_weight(kk);
7985 mj_scalar_t max_coordinate =
7986 host_global_min_max_coord_total_weight(
7987 kk + current_concurrent_num_parts);
7988
7989 mj_scalar_t global_total_weight =
7990 host_global_min_max_coord_total_weight(
7991 kk + 2 * current_concurrent_num_parts);
7992
7993 mj_part_t concurrent_current_part_index = current_work_part + kk;
7994
7995 mj_part_t partition_count = host_num_partitioning_in_current_dim(
7996 concurrent_current_part_index);
7997
7998 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7999 Kokkos::subview(current_cut_coordinates,
8000 std::pair<mj_lno_t, mj_lno_t>(
8001 concurrent_part_cut_shift, current_cut_coordinates.size()));
8002 Kokkos::View<mj_scalar_t *, device_t>
8003 current_target_part_weights =
8004 Kokkos::subview(target_part_weights,
8005 std::pair<mj_lno_t, mj_lno_t>(
8006 concurrent_part_part_shift, target_part_weights.size()));
8007
8008 // shift the usedCutCoordinate array as noCuts.
8009 concurrent_part_cut_shift += partition_count - 1;
8010 // shift the partRatio array as noParts.
8011 concurrent_part_part_shift += partition_count;
8012
8013 // calculate only if part is not empty,
8014 // and part will be further partitioned.
8015 if(partition_count > 1 && min_coordinate <= max_coordinate) {
8016
8017 // increase num_cuts_do_be_determined by the number of cuts of the
8018 // current part's cut line number.
8019 total_incomplete_cut_count += partition_count - 1;
8020
8021 this->incomplete_cut_count(kk) = partition_count - 1;
8022
8023 // get the target weights of the parts
8024 this->mj_get_initial_cut_coords_target_weights(
8025 min_coordinate,
8026 max_coordinate,
8027 partition_count - 1,
8028 global_total_weight,
8029 usedCutCoordinate,
8030 current_target_part_weights,
8031 future_num_part_in_parts,
8032 next_future_num_parts_in_parts,
8033 concurrent_current_part_index,
8034 obtained_part_index);
8035
8036 mj_lno_t coordinate_end_index =
8037 host_part_xadj(concurrent_current_part_index);
8038 mj_lno_t coordinate_begin_index =
8039 concurrent_current_part_index==0 ? 0 :
8040 host_part_xadj(concurrent_current_part_index - 1);
8041
8042 this->set_initial_coordinate_parts(
8043 max_coordinate,
8044 min_coordinate,
8045 coordinate_begin_index, coordinate_end_index,
8046 this->coordinate_permutations,
8047 mj_current_dim_coords,
8048 this->assigned_part_ids,
8049 partition_count);
8050 }
8051 else {
8052 // e.g., if have fewer coordinates than parts, don't need to do
8053 // next dim.
8054 this->incomplete_cut_count(kk) = 0;
8055 }
8056
8057 obtained_part_index += partition_count;
8058 }
8059
8060 // used imbalance, it is always 0, as it is difficult to
8061 // estimate a range.
8062 double used_imbalance = 0;
8063 // Determine cut lines for all concurrent parts parts here.
8064 this->mj_env->timerStart(MACRO_TIMERS,
8065 mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8066
8067 this->mj_1D_part(
8068 mj_current_dim_coords,
8069 used_imbalance,
8070 current_work_part,
8071 current_concurrent_num_parts,
8072 current_cut_coordinates,
8073 total_incomplete_cut_count,
8074 view_rectilinear_cut_count,
8075 view_total_reduction_size);
8076
8077 this->mj_env->timerStop(MACRO_TIMERS,
8078 mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8079 }
8080
8081 // create new part chunks
8082 {
8083 mj_part_t output_array_shift = 0;
8084 mj_part_t cut_shift = 0;
8085 size_t tlr_shift = 0;
8086 size_t partweight_array_shift = 0;
8087 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8088
8089 mj_part_t current_concurrent_work_part = current_work_part + kk;
8090
8091 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8092 current_concurrent_work_part);
8093
8094 // if the part is empty, skip the part.
 // (global min > global max means the part has no coordinates)
8095 int coordinateA_bigger_than_coordinateB =
8096 host_global_min_max_coord_total_weight(kk) >
8097 host_global_min_max_coord_total_weight(
8098 kk + current_concurrent_num_parts);
8099
8100 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8101 // we still need to write the begin and end point of the empty part.
8102 // simply set it zero, the array indices will be shifted later
8103 auto local_new_part_xadj = this->new_part_xadj;
8104 Kokkos::parallel_for(
8105 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8106 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8107 local_new_part_xadj(
8108 output_part_index + output_array_shift + jj) = 0;
8109 });
8110
8111 cut_shift += num_parts - 1;
8112 tlr_shift += (4 *(num_parts - 1) + 1);
8113 output_array_shift += num_parts;
8114 partweight_array_shift += (2 * (num_parts - 1) + 1);
8115 continue;
8116 }
8117
8118 Kokkos::View<mj_scalar_t *, device_t>
8119 current_concurrent_cut_coordinate =
8120 Kokkos::subview(current_cut_coordinates,
8121 std::pair<mj_lno_t, mj_lno_t>(
8122 cut_shift,
8123 current_cut_coordinates.size()));
8124 Kokkos::View<mj_scalar_t *, device_t>
8125 used_local_cut_line_weight_to_left =
8126 Kokkos::subview(process_cut_line_weight_to_put_left,
8127 std::pair<mj_lno_t, mj_lno_t>(
8128 cut_shift,
8129 process_cut_line_weight_to_put_left.size()));
8130
8131 this->thread_part_weight_work =
8132 Kokkos::subview(
8133 this->thread_part_weights,
8134 std::pair<mj_lno_t, mj_lno_t>(
8135 partweight_array_shift,
8136 this->thread_part_weights.extent(0)));
8137
8138 if(num_parts > 1) {
8139 if(this->mj_keep_part_boxes) {
8140 // if part boxes are to be stored update the boundaries.
8141 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8142 mj_scalar_t temp_get_val;
8143 Kokkos::parallel_reduce("Read single",
8144 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8145 KOKKOS_LAMBDA(int dummy, mj_scalar_t & set_single) {
8146 set_single = current_concurrent_cut_coordinate(j);
8147 }, temp_get_val);
 // The cut is the upper bound of part j ...
8148 (*output_part_boxes)
8149 [output_array_shift + output_part_index + j].
8150 updateMinMax(temp_get_val, 1 /*update max*/, coordInd);
 // ... and the lower bound of part j+1.
8151 (*output_part_boxes)
8152 [output_array_shift + output_part_index + j + 1].
8153 updateMinMax(temp_get_val, 0 /*update min*/, coordInd);
8154 }
8155 }
8156
8157 // Rewrite the indices based on the computed cuts.
8158 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8159 Kokkos::subview(this->new_part_xadj,
8160 std::pair<mj_lno_t, mj_lno_t>(
8161 output_part_index + output_array_shift,
8162 this->new_part_xadj.size()));
8163
8164 this->mj_create_new_partitions(
8165 num_parts,
8166 current_concurrent_work_part,
8167 mj_current_dim_coords,
8168 current_concurrent_cut_coordinate,
8169 used_local_cut_line_weight_to_left,
8170 sub_new_part_xadj);
8171 }
8172 else {
8173
8174 mj_lno_t coordinate_end = host_part_xadj(
8175 current_concurrent_work_part);
8176 mj_lno_t coordinate_begin =
8177 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8178 current_concurrent_work_part - 1);
8179
8180 // if this part is partitioned into 1 then just copy
8181 // the old values.
8182 mj_lno_t part_size = coordinate_end - coordinate_begin;
8183
8184 // Awkward here to set one value - need some broader
8185 // refactoring to improve this one.
8186 auto local_new_part_xadj = this->new_part_xadj;
8187 Kokkos::parallel_for(
8188 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8189 (0, 1), KOKKOS_LAMBDA (int dummy) {
8190 local_new_part_xadj(
8191 output_part_index + output_array_shift) = part_size;
8192 });
8193
8194 auto subview_new_coordinate_permutations =
8195 Kokkos::subview(this->new_coordinate_permutations,
8196 std::pair<mj_lno_t, mj_lno_t>(
8197 coordinate_begin,
8198 coordinate_begin + part_size));
8199 auto subview_coordinate_permutations =
8200 Kokkos::subview(this->coordinate_permutations,
8201 std::pair<mj_lno_t, mj_lno_t>(
8202 coordinate_begin,
8203 coordinate_begin + part_size));
8204 Kokkos::deep_copy(subview_new_coordinate_permutations,
8205 subview_coordinate_permutations);
8206 }
8207 cut_shift += num_parts - 1;
8208 output_array_shift += num_parts;
8209 partweight_array_shift += (2 * (num_parts - 1) + 1);
8210 }
8211
8212 // shift cut coordinates so that all cut coordinates are stored.
8213 // no shift now because we dont keep the cuts.
8214 // current_cut_coordinates += cut_shift;
8215 // mj_create_new_partitions from coordinates partitioned the parts
8216 // and write the indices as if there were a single part.
8217 // now we need to shift the beginning indices.
8218 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8219 mj_part_t num_parts =
8220 host_num_partitioning_in_current_dim(current_work_part + kk);
8221
8222 // These two kernels are a bit awkward but need broader redesign to
8223 // avoid this situation.
8224 auto local_new_part_xadj = this->new_part_xadj;
8225 Kokkos::parallel_for(
8226 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8227 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8228 local_new_part_xadj(output_part_index+ii) +=
8229 output_coordinate_end_index;
8230 });
8231
8232 // increase the previous count by current end.
8233 mj_part_t temp_get;
8234 Kokkos::parallel_reduce("Read single",
8235 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8236 KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
8237 set_single =
8238 local_new_part_xadj(output_part_index + num_parts - 1);
8239 }, temp_get);
8240 output_coordinate_end_index = temp_get;
8241 //increase the current out.
8242 output_part_index += num_parts;
8243 }
8244 }
8245 }
8246
8247 // end of this partitioning dimension
8248 int current_world_size = this->comm->getSize();
8249 long migration_reduce_all_population =
8250 this->total_dim_num_reduce_all * current_world_size;
8251 bool is_migrated_in_current_dimension = false;
8252
8253 // we migrate if there are more partitionings to be done after this step
8254 // and if the migration is not forced to be avoided.
8255 // and the operation is not sequential.
8256 if(future_num_parts > 1 &&
8257 this->check_migrate_avoid_migration_option >= 0 &&
8258 current_world_size > 1) {
8259 this->mj_env->timerStart(MACRO_TIMERS,
8260 mj_timer_base_string + "Problem_Migration-" + istring);
8261 mj_part_t num_parts = output_part_count_in_dimension;
8262
8263 if(this->mj_perform_migration(
8264 num_parts,
8265 current_num_parts, //output
8266 next_future_num_parts_in_parts, //output
8267 output_part_begin_index,
8268 migration_reduce_all_population,
8269 this->num_global_coords / (future_num_parts * current_num_parts),
8270 istring,
8271 input_part_boxes, output_part_boxes) )
8272 {
8273 is_migrated_in_current_dimension = true;
8274 is_data_ever_migrated = true;
8275 this->mj_env->timerStop(MACRO_TIMERS,
8276 mj_timer_base_string + "Problem_Migration-" + istring);
8277 // since data is migrated, we reduce the number of reduceAll
8278 // operations for the last part.
8279 this->total_dim_num_reduce_all /= num_parts;
8280 }
8281 else {
8282 is_migrated_in_current_dimension = false;
8283 this->mj_env->timerStop(MACRO_TIMERS,
8284 mj_timer_base_string + "Problem_Migration-" + istring);
8285 }
8286 }
8287
8288 // swap the coordinate permutations for the next dimension.
8289 Kokkos::View<mj_lno_t*, device_t> tmp =
8290 this->coordinate_permutations;
8291 this->coordinate_permutations =
8292 this->new_coordinate_permutations;
8293
8294 this->new_coordinate_permutations = tmp;
8295 if(!is_migrated_in_current_dimension) {
8296 this->total_dim_num_reduce_all -= current_num_parts;
8297 current_num_parts = output_part_count_in_dimension;
8298 }
8299
8300 {
 // Promote the freshly built part offsets for the next recursion step.
8301 this->part_xadj = this->new_part_xadj;
8302 local_part_xadj = this->new_part_xadj;
8303 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8304 Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
8305
8306 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
8307 this->mj_env->timerStop(MACRO_TIMERS,
8308 mj_timer_base_string + "Problem_Partitioning_" + istring);
8309 }
8310 }
8311
8312 // Partitioning is done
8313 delete future_num_part_in_parts;
8314 delete next_future_num_parts_in_parts;
8315 this->mj_env->timerStop(MACRO_TIMERS,
8316 mj_timer_base_string + "Problem_Partitioning");
8318
8319 //get the final parts of each initial coordinate
8320 //the results will be written to
8321 //this->assigned_part_ids for gnos given in this->current_mj_gnos
8322 this->set_final_parts(
8323 current_num_parts,
8324 output_part_begin_index,
8325 output_part_boxes,
8326 is_data_ever_migrated);
8327
8328 result_assigned_part_ids_ = this->assigned_part_ids;
8329 result_mj_gnos_ = this->current_mj_gnos;
8330 this->mj_env->timerStop(MACRO_TIMERS,
8331 mj_timer_base_string + "Total");
8332 this->mj_env->debug(3, "Out of MultiJagged");
8333}
8334
// Return the global part bounding boxes computed during partitioning.
// Throws std::logic_error if the algorithm was not configured to keep
// part boxes (mj_keep_part_boxes == false), since kept_boxes is only
// populated in that mode.
//
// NOTE(review): the qualifier line (original line 8339, presumably
// "AlgMJ<...>::") is missing from this extraction -- confirm against the
// repository source.
8335template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8336 typename mj_part_t, typename mj_node_t>
8337RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8338 mj_partBoxVector_t>
8340 get_kept_boxes() const
8341{
8342 if(this->mj_keep_part_boxes) {
8343 return this->kept_boxes;
8344 }
8345 else {
8346 throw std::logic_error("Error: part boxes are not stored.");
8347 }
8348}
8349
// Combine the per-rank part boxes into global part bounding boxes.
// Each rank scatters its local boxes' min/max coordinates into flat
// arrays indexed by global part id, then a single reduceAll (with a
// custom Zoltan2_BoxBoundaries reduction) produces the global mins and
// maxs, from which the returned box vector is assembled.
//
// NOTE(review): original lines 8354 (presumably the "AlgMJ<...>::"
// qualifier) and 8402 (presumably the construction of the per-part box,
// e.g. "coordinateModelPartBox tpb(i, dim,") are missing from this
// extraction -- confirm against the repository source.
// NOTE(review): the raw new[]/delete[] buffers leak if reduceAll throws;
// std::vector would be safer here.
8350template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8351 typename mj_part_t, typename mj_node_t>
8352RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8353 mj_partBoxVector_t>
8355 compute_global_box_boundaries(RCP<mj_partBoxVector_t> &localPartBoxes) const
8356{
8357 typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
8358 mj_part_t ntasks = this->num_global_parts;
8359 int dim = (*localPartBoxes)[0].getDim();
 // Flat scratch layout: [mins (ntasks*dim) | maxs (ntasks*dim)].
8360 coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
8361
8362 memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8363
8364 coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
8365 memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8366
8367 coord_t *localPartMins = localPartBoundaries;
8368 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8369
8370 coord_t *globalPartMins = globalPartBoundaries;
8371 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8372
 // Scatter this rank's boxes into the flat arrays by global part id.
8373 mj_part_t boxCount = localPartBoxes->size();
8374 for(mj_part_t i = 0; i < boxCount; ++i) {
8375 mj_part_t pId = (*localPartBoxes)[i].getpId();
8376
8377 // cout << "me:" << comm->getRank() << " has:" << pId << endl;
8378
8379 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8380 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8381
8382 for(int j = 0; j < dim; ++j) {
8383 localPartMins[dim * pId + j] = lmins[j];
8384 localPartMaxs[dim * pId + j] = lmaxs[j];
8385
8386 /*
8387 std::cout << "me:" << comm->getRank() <<
8388 " dim * pId + j:"<< dim * pId + j <<
8389 " localMin:" << localPartMins[dim * pId + j] <<
8390 " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
8391 */
8392 }
8393 }
8394
 // Custom element-wise min/max reduction over all ranks.
8395 Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
8396
8397 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8398 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8399
 // Assemble one box per global part from the reduced bounds.
8400 RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
8401 for(mj_part_t i = 0; i < ntasks; ++i) {
8403 globalPartMins + dim * i,
8404 globalPartMaxs + dim * i);
8405
8406 /*
8407 for(int j = 0; j < dim; ++j) {
8408 std::cout << "me:" << comm->getRank() <<
8409 " dim * pId + j:"<< dim * i + j <<
8410 " globalMin:" << globalPartMins[dim * i + j] <<
8411 " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
8412 }
8413 */
8414
8415 pB->push_back(tpb);
8416 }
8417 delete []localPartBoundaries;
8418 delete []globalPartBoundaries;
8419 //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
8420 return pB;
8421}
8422
// Zoltan2_AlgMJ: adapter-facing wrapper for the Multi-Jagged (MJ) coordinate
// partitioning algorithm. It implements the Zoltan2 Algorithm interface,
// pulls coordinates/weights/global ids from the input adapter, holds all
// user-settable parameters, and delegates the actual partitioning to the
// core MJ partitioner member.
8425template <typename Adapter>
8426class Zoltan2_AlgMJ : public Algorithm<Adapter>
8427{
8428
8429private:
8430
8431#ifndef DOXYGEN_SHOULD_SKIP_THIS
8432 // For coordinates and weights, MJ needs floats or doubles
8433 // But Adapter can provide other scalars, e.g., ints.
8434 // So have separate scalar_t for MJ and adapter.
8435 typedef typename Adapter::scalar_t adapter_scalar_t;
8436
8437 // Provide a default type for mj_scalar_t;
8438 typedef float default_mj_scalar_t;
8439
8440 // If Adapter provided float or double scalar_t, use it (prevents copies).
8441 // Otherwise, use the default type of mj_scalar_t;
8442 typedef typename
8443 std::conditional<
8444 (std::is_same<adapter_scalar_t, float>::value ||
8445 std::is_same<adapter_scalar_t, double>::value),
8446 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8447
8448 typedef typename Adapter::gno_t mj_gno_t;
8449 typedef typename Adapter::lno_t mj_lno_t;
8450 typedef typename Adapter::part_t mj_part_t;
8451 typedef typename Adapter::node_t mj_node_t;
8452 typedef coordinateModelPartBox mj_partBox_t;
8453 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8454 typedef typename mj_node_t::device_type device_t;
8455#endif
8456
8458
8459 RCP<const Environment> mj_env; // the environment object
8460 RCP<const Comm<int> > mj_problemComm; // initial comm object
8461 RCP<const typename Adapter::base_adapter_t> mj_adapter; // coordinate adapter
8462
8463 // PARAMETERS
8464 double imbalance_tolerance; // input imbalance tolerance.
8465
8466 int num_teams; // how many teams to run main loop with
8467
8468 size_t num_global_parts; // the targeted number of parts
8469
8470 // input part array specifying num part to divide along each dim.
8471 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8472
8473 // the number of steps that partitioning will be solved in.
8474 int recursion_depth;
8475
8476 int coord_dim; // coordinate dimension.
8477 mj_lno_t num_local_coords; //number of local coords.
8478 mj_gno_t num_global_coords; //number of global coords.
8479
8480 // initial global ids of the coordinates.
8481 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8482
8483 // two dimension coordinate array.
8484 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8485 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8486 mj_coordinates;
8487
8488 int num_weights_per_coord; // number of weights per coordinate
8489
8490 // per-criterion flags: true if the weights for that criterion are uniform
8491 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8492
8493 // two dimensional weight array.
8494 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8495
8496 // if the target parts are uniform
8497 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8498
8499 // Nonuniform first level partitioning
8500 // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
8501 // machine coordinates and application coordinates.
8502 // An optimization that completely partitions the most important machine
8503 // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
8504 // coordinate). The standard MJ alg follows after the nonuniform first level
8505 // partitioning.
8506 // If used, number of parts for the first level partitioning
8507 mj_part_t num_first_level_parts;
8508
8509 // If used, the distribution of parts for the nonuniform
8510 // first level partitioning
8511 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8512
8513 // if partitioning can distribute points on same coordinate to
8514 // different parts.
8515 bool distribute_points_on_cut_lines;
8516
8517 // how many parts we can calculate concurrently.
8518 mj_part_t max_concurrent_part_calculation;
8519
8520 // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
8521 int check_migrate_avoid_migration_option;
8522
8523 // when doing the migration, 0 will aim for perfect load balance,
8524 int migration_type;
8525
8526 // 1 for minimized messages
8527
8528 // when MJ decides whether to migrate, the minimum imbalance for migration.
8529 double minimum_migration_imbalance;
8530 bool mj_keep_part_boxes; //if the boxes need to be kept.
8531
8532 // if this is set, then recursion depth is adjusted to its maximum value.
8533 bool mj_run_as_rcb;
8534 int mj_premigration_option;
8535 int min_coord_per_rank_for_premigration;
8536
8537 // communication graph xadj
8538 ArrayRCP<mj_part_t> comXAdj_;
8539
8540 // communication graph adj.
8541 ArrayRCP<mj_part_t> comAdj_;
8542
8543 void copy(
8544 const RCP<PartitioningSolution<Adapter> >&solution);
8545
8546 // Reads MJ parameters (tolerance, part counts, migration options, ...)
8547 // from the Teuchos parameter list into the members above.
8546 void set_input_parameters(const Teuchos::ParameterList &p);
8547
8548 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
8549
8550 bool mj_premigrate_to_subset(
8551 int used_num_ranks,
8552 int migration_selection_option,
8553 RCP<const Environment> mj_env_,
8554 RCP<const Comm<int> > mj_problemComm_,
8555 int coord_dim_,
8556 mj_lno_t num_local_coords_,
8557 mj_gno_t num_global_coords_, size_t num_global_parts_,
8558 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8559 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8560 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8561 mj_coordinates_,
8562 int num_weights_per_coord_,
8563 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8564 //results
8565 RCP<const Comm<int> > &result_problemComm_,
8566 mj_lno_t & result_num_local_coords_,
8567 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8568 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8569 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8570 result_mj_coordinates_,
8571 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8572 int * &result_actual_owner_rank_);
8573
8574public:
8575
8576 // Constructor: stores env/comm/adapter handles and sets parameter
8577 // defaults. Actual input data is loaded later by
8578 // set_up_partitioning_data() when partition() runs.
8576 Zoltan2_AlgMJ(const RCP<const Environment> &env,
8577 RCP<const Comm<int> > &problemComm,
8578 const RCP<const typename Adapter::base_adapter_t> &adapter) :
8579 mj_partitioner(),
8580 mj_env(env),
8581 mj_problemComm(problemComm),
8582 mj_adapter(adapter),
8583 imbalance_tolerance(0),
8584 num_teams(0),
8585 num_global_parts(1),
8586 recursion_depth(0),
8587 coord_dim(0),
8588 num_local_coords(0),
8589 num_global_coords(0),
8590 num_weights_per_coord(0),
8591 num_first_level_parts(1),
8592 distribute_points_on_cut_lines(true),
8593 max_concurrent_part_calculation(1),
8594 check_migrate_avoid_migration_option(0),
8595 migration_type(0),
8596 minimum_migration_imbalance(0.30),
8597 mj_keep_part_boxes(false),
8598 mj_run_as_rcb(false),
8599 mj_premigration_option(0),
8600 min_coord_per_rank_for_premigration(32000),
8601 comXAdj_(),
8602 comAdj_()
8603 {
8604 }
8605
8607 {
8608 }
8609
8610 // Registers all MJ-specific parameters (with their validators) into pl.
8612 static void getValidParameters(ParameterList & pl)
8613 {
8614 const bool bUnsorted = true; // this clarifies the flag is for unsorted
8615 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8616 Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
8617 pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
8618 "algorithm. As many as the dimension count.", mj_parts_Validator);
8619
8620 pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
8621 "coordinates will be calculated concurently.",
8623
8624 pl.set("mj_minimum_migration_imbalance", 1.1,
8625 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8626 "processors to avoid migration",
8628
8629 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8630 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8631 pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
8632 "depending on the imbalance, 1 for forcing migration, 2 for "
8633 "avoiding migration", mj_migration_option_validator);
8634
8635 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8636 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8637 pl.set("mj_migration_type", 0,
8638 "Migration type, 0 for migration to minimize the imbalance "
8639 "1 for migration to minimize messages exchanged the migration.",
8640 mj_migration_option_validator);
      // NOTE(review): the line above passes mj_migration_option_validator
      // (range 0..2) although mj_migration_type_validator (range 0..1) was
      // just created and is otherwise unused — likely a copy-paste slip;
      // confirm which validator "mj_migration_type" should use.
8641
8642 // bool parameter
8643 pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
8644 "geometric partitioning.", Environment::getBoolValidator());
8645
8646 // bool parameter
8647 pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
8649
8650 pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
8651 "greater than 0.", Environment::getAnyIntValidator());
8652
8653 RCP<Teuchos::EnhancedNumberValidator<int>>
8654 mj_num_teams_validator =
8655 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(
8656 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8657 pl.set("mj_num_teams", 0,
8658 "How many teams for the main kernel loop"
8659 , mj_num_teams_validator);
8660
8661 RCP<Teuchos::EnhancedNumberValidator<int>>
8662 mj_premigration_option_validator =
8663 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8664
8665 pl.set("mj_premigration_option", 0,
8666 "Whether to do premigration or not. 0 for no migration "
8667 "x > 0 for migration to consecutive processors, "
8668 "the subset will be 0,x,2x,3x,...subset ranks."
8669 , mj_premigration_option_validator);
8670
8671 pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to "
8672 "assign each rank in multijagged after premigration"
8674 }
8675
8680 // Main entry point: partitions the adapter's coordinates and writes the
8681 // part assignment into solution.
8681 void partition(const RCP<PartitioningSolution<Adapter> > &solution);
8682
8683 // Returns the global part boxes computed by getGlobalBoxBoundaries().
8684 // Presumably only meaningful when mj_keep_part_boxes was set — confirm.
8683 mj_partBoxVector_t &getPartBoxesView() const
8684 {
8685 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8686 return *pBoxes;
8687 }
8688
8689 // Point-to-part and box-to-parts queries; definitions elsewhere in file.
8689 mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
8690
8691 void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8692 size_t &nPartsFound, mj_part_t **partsFound) const;
8693
8696 void getCommunicationGraph(
8697 const PartitioningSolution<Adapter> *solution,
8698 ArrayRCP<mj_part_t> &comXAdj,
8699 ArrayRCP<mj_part_t> &comAdj);
8700
8701 void set_up_partitioning_data( // public for CUDA
8702 const RCP<PartitioningSolution<Adapter> >&solution);
8703
8704 private:
8705 std::string timer_base_string; // used for making timers
8706
8707 // After loading views from coordinate adapter we may need to copy them
8708 // if mj type is different, but otherwise we just want to assign the view.
8709 // So purpose of this code is to make that assign only happen when the types
8710 // match. The empty case would otherwise not compile.
8711 // If they don't match the internal code handles allocating the new view
8712 // and copying the elements. See the test Zoltan2_mj_int_coordinates.
8713 template<class dst_t, class src_t> // version for same types
8714 typename std::enable_if<std::is_same<typename dst_t::value_type,
8715 typename src_t::value_type>::value>::type
8716 assign_if_same(dst_t & dst, const src_t & src) {
8717 dst = src;
8718 }
8719 template<class dst_t, class src_t> // version for different types
8720 typename std::enable_if<!std::is_same<typename dst_t::value_type,
8721 typename src_t::value_type>::value>::type
8722 assign_if_same(dst_t & dst, const src_t & src) {
8723 // do nothing - handled manually
8724 }
8725};
8726
// Premigration: before partitioning, gather all coordinates, weights, and
// global ids from every rank onto a subset of used_num_ranks ranks (the
// first rank of each contiguous group of ranks). Returns true iff the
// calling rank is one of the receivers (i.e. stays in the subset and will
// run the MJ core). The result_* out-parameters receive the migrated data;
// result_actual_owner_rank_ is a new[]-allocated array recording the
// original owner of each received coordinate so results can be sent back
// after partitioning (see the post-migration phase of partition()).
// NOTE(review): migration_selection_option is accepted but never read in
// this body — confirm whether it is intentionally unused here.
8727template <typename Adapter>
8728bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8729 int used_num_ranks,
8730 int migration_selection_option,
8731 RCP<const Environment> mj_env_,
8732 RCP<const Comm<int> > mj_problemComm_,
8733 int coord_dim_,
8734 mj_lno_t num_local_coords_,
8735 mj_gno_t num_global_coords_, size_t num_global_parts_,
8736 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8737 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8738 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8739 int num_weights_per_coord_,
8740 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8741 //results
8742 RCP<const Comm<int> > & result_problemComm_,
8743 mj_lno_t &result_num_local_coords_,
8744 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8745 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8746 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8747 result_mj_coordinates_,
8748 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8749 int * &result_actual_owner_rank_)
8750{
8751 mj_env_->timerStart(MACRO_TIMERS,
8752 timer_base_string + "PreMigration DistributorPlanCreating");
8753
8754 int myRank = mj_problemComm_->getRank();
8755 int worldSize = mj_problemComm_->getSize();
8756
     // Split worldSize ranks into used_num_ranks contiguous groups.
     // group_begins[i] is the first rank of group i; every rank in group i
     // sends its coordinates to that first rank.
8757 mj_part_t groupsize = worldSize / used_num_ranks;
8758
8759 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8760
8761 mj_part_t i_am_sending_to = 0;
8762 bool am_i_a_receiver = false;
8763
8764 for(int i = 0; i < used_num_ranks; ++i) {
       // Base group size, plus one extra rank for the first
       // (worldSize % used_num_ranks) groups so the sizes sum to worldSize.
8765 group_begins[i+ 1] = group_begins[i] + groupsize;
8766 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
       // NOTE(review): the condition below can never hold inside a loop
       // bounded by i < used_num_ranks, so this line is dead; the remainder
       // distribution above already guarantees the last entry equals
       // worldSize. Possibly meant i == used_num_ranks - 1.
8767 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8768 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8769 i_am_sending_to = group_begins[i];
8770 }
8771 if(myRank == group_begins[i]) {
8772 am_i_a_receiver = true;
8773 }
8774 }
8775
     // Subcommunicator over the receiving ranks only (the group leaders).
8776 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8777 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8778
8779 Tpetra::Distributor distributor(mj_problemComm_);
8780
     // Every local coordinate goes to this rank's group leader.
8781 std::vector<mj_part_t>
8782 coordinate_destinations(num_local_coords_, i_am_sending_to);
8783
8784 ArrayView<const mj_part_t>
8785 destinations(&(coordinate_destinations[0]), num_local_coords_);
8786 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8787 result_num_local_coords_ = num_incoming_gnos;
8788 mj_env_->timerStop(MACRO_TIMERS,
8789 timer_base_string + "PreMigration DistributorPlanCreating");
8790
8791 mj_env_->timerStart(MACRO_TIMERS,
8792 timer_base_string + "PreMigration DistributorMigration");
8793
8794
8795 // migrate gnos.
8796 // MPI buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
8797 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
8798 // view; need the explicit Host creation and deep_copy.
8799 {
8800 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8801 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
8802 initial_mj_gnos_.size()); // initial_mj_gnos_ is const mj_gno_t *
8803 Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8804
8805 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8806 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
8807 num_incoming_gnos);
8808
8809 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8810
8811 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8812 Kokkos::ViewAllocateWithoutInitializing("result_initial_mj_gnos_"),
8813 num_incoming_gnos);
8814 Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8815 }
8816
8817 // migrate coordinates
8818 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8819
8820 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8821 host_src_coordinates(
8822 Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8823 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8824
8825 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8826
8827 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8828 Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8829 num_incoming_gnos, this->coord_dim);
8830
8831 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8832 Kokkos::ViewAllocateWithoutInitializing("received_coord"),
8833 num_incoming_gnos);
8834
     // LayoutLeft makes each dimension's column contiguous on host, so we
     // can migrate the coordinates one dimension at a time.
8835 for(int i = 0; i < this->coord_dim; ++i) {
8836
8837 auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8838
8839 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8840
8841 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8842 received_coord);
8843 Kokkos::fence();
8844 }
8845 result_mj_coordinates_ = dst_coordinates;
8846
8847 // migrate weights.
8848
8849 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8850 Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
8851 num_incoming_gnos, this->num_weights_per_coord);
8852 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8853
8854 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8855 Kokkos::HostSpace(), this->mj_weights);
8856
8857 // contiguous buffers to gather potentially strided data
8858 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8859 Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
8860 this->num_local_coords);
8861
8862 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8863 Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
8864 num_incoming_gnos);
8865
8866 for(int i = 0; i < this->num_weights_per_coord; ++i) {
8867
8868 auto sub_host_src_weights
8869 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8870 auto sub_host_dst_weights
8871 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8872
8873 // Layout Right means these weights are not contiguous
8874 // However we don't have any systems setup with more than 1 weight so
8875 // really I have not tested any of this code with num weights > 1.
8876 // I think this is the right thing to do. Note that there are other
8877 // places in the code which don't handle the possibility of more weights.
8878 // So evaluating all that and adding tests would be another project.
8879 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8880 sent_weight[n] = sub_host_src_weights(n);
8881 }
8882
8883 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8884
8885 // Again we copy by index due to layout
8886 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8887 sub_host_dst_weights(n) = received_weight[n];
8888 }
8889 }
8890 Kokkos::deep_copy(dst_weights, host_dst_weights);
8891 result_mj_weights_ = dst_weights;
8892
8893 // migrate the owners of the coordinates
8894 {
     // Each sender ships its own rank id alongside every coordinate so the
     // receiver knows where to return the part assignments later.
8895 Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8896 Kokkos::ViewAllocateWithoutInitializing("sent_owners"),
8897 num_local_coords_);
8898 Kokkos::deep_copy(sent_owners, myRank);
8899
8900 Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8901 Kokkos::ViewAllocateWithoutInitializing("received_owners"),
8902 num_incoming_gnos);
8903
8904 distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8905
       // Caller takes ownership of this raw array (deleted in partition()).
8906 result_actual_owner_rank_ = new int[num_incoming_gnos];
8907 memcpy(
8908 result_actual_owner_rank_,
8909 received_owners.data(),
8910 num_incoming_gnos * sizeof(int));
8911 }
8912
8913 mj_env_->timerStop(MACRO_TIMERS,
8914 timer_base_string + "PreMigration DistributorMigration");
8915 return am_i_a_receiver;
8916}
8917
// Top-level driver for an MJ partitioning run. Phases:
//  1. setup: load data from the adapter and read parameters;
//  2. optional premigration: gather coordinates onto a rank subset when
//     there are few coordinates per rank (see mj_premigrate_to_subset);
//  3. core partitioning via mj_partitioner.multi_jagged_part (only ranks
//     in the subset participate);
//  4. cleanup: reorder results back to the input gno order, and if
//     premigration occurred, ship part ids back to the original owners;
//  5. store the assignment in solution via setParts.
8925template <typename Adapter>
8927 const RCP<PartitioningSolution<Adapter> > &solution)
8928{
8929 // purpose of this code is to validate node and UVM status for the tests
8930 // std::cout << "Memory Space: " << mj_node_t::memory_space::name() << " "
8931 // << "Execution Space: " << mj_node_t::execution_space::name()
8932 // << std::endl;
8933
     // Timers are labeled per invocation, e.g. "partition(0) - all".
8934 int execute_counter =
8936 timer_base_string = "partition(" + std::to_string(execute_counter) + ") - ";
8937
8938 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "all");
8939 {
8940 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "setup");
8941
8942 this->set_up_partitioning_data(solution);
8943
8944 this->set_input_parameters(this->mj_env->getParameters());
8945 if(this->mj_keep_part_boxes) {
8946 this->mj_partitioner.set_to_keep_part_boxes();
8947 }
8948
8949 this->mj_partitioner.set_partitioning_parameters(
8950 this->distribute_points_on_cut_lines,
8951 this->max_concurrent_part_calculation,
8952 this->check_migrate_avoid_migration_option,
8953 this->minimum_migration_imbalance, this->migration_type);
8954
       // Defaults if no premigration happens: partition the original data
       // on the original communicator.
8955 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8956 mj_lno_t result_num_local_coords = this->num_local_coords;
8957 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8958 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8959 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8960 result_mj_coordinates = this->mj_coordinates;
8961 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8962 this->mj_weights;
8963 int *result_actual_owner_rank = NULL;
8964
8965 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8966 this->initial_mj_gnos;
8967
8968 // TODO: MD 08/2017: Further discussion is required.
8969 // MueLu calls MJ when it has very few coordinates per processors,
8970 // such as 10. For example, it begins with 1K processor with 1K coordinate
8971 // in each. Then with coarsening this reduces to 10 coordinate per processor.
8972 // It calls MJ to repartition these to 10 coordinates.
8973 // MJ runs with 1K processor, 10 coordinate in each, and partitions to
8974 // 10 parts. As expected strong scaling is problem here, because
8975 // computation is almost 0, and communication cost of MJ linearly increases.
8976 // Premigration option gathers the coordinates to 10 parts before MJ starts
8977 // therefore MJ will run with a smaller subset of the problem.
8978 // Below, I am migrating the coordinates if mj_premigration_option is set,
8979 // and the result parts are less than the current part count, and the
8980 // average number of local coordinates is less than some threshold.
8981 // For example, premigration may not help if 1000 processors are
8982 // partitioning data to 10, but each of them already have 1M coordinate.
8983 // In that case, premigration would not help.
8984 int current_world_size = this->mj_problemComm->getSize();
8985 mj_lno_t threshold_num_local_coords =
8986 this->min_coord_per_rank_for_premigration;
8987 bool is_pre_migrated = false;
8988 bool am_i_in_subset = true;
8989
8990 // Note that we need to add testing for migration and should also cover the
8991 // zoltan case when ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION is defined.
8992 // Currently did a minimal test of this code by running mjTest with
8993 // PM=1, TB=0 then run again with C=3 instead of C=4 (numProcs is 4).
8994 if(mj_premigration_option > 0 &&
8995 size_t (current_world_size) > this->num_global_parts &&
8996 this->num_global_coords < mj_gno_t (
8997 current_world_size * threshold_num_local_coords))
8998 {
8999 if(this->mj_keep_part_boxes) {
9000 throw std::logic_error("Multijagged: mj_keep_part_boxes and "
9001 "mj_premigration_option are not supported together yet.");
9002 }
9003
9004 is_pre_migrated =true;
9005 int migration_selection_option = mj_premigration_option;
9006 if(migration_selection_option * this->num_global_parts >
9007 (size_t) (current_world_size)) {
9008 migration_selection_option =
9009 current_world_size / this->num_global_parts;
9010 }
9011
         // Size the subset so each receiving rank ends up near the
         // threshold coordinate count (rounded to nearest).
9012 int used_num_ranks = int (this->num_global_coords /
9013 float (threshold_num_local_coords) + 0.5);
9014
9015 if(used_num_ranks == 0) {
9016 used_num_ranks = 1;
9017 }
9018
9019 am_i_in_subset = this->mj_premigrate_to_subset(
9020 used_num_ranks,
9021 migration_selection_option,
9022 this->mj_env,
9023 this->mj_problemComm,
9024 this->coord_dim,
9025 this->num_local_coords,
9026 this->num_global_coords,
9027 this->num_global_parts,
9028 this->initial_mj_gnos,
9029 this->mj_coordinates,
9030 this->num_weights_per_coord,
9031 this->mj_weights,
9032 //results
9033 result_problemComm,
9034 result_num_local_coords,
9035 result_initial_mj_gnos,
9036 result_mj_coordinates,
9037 result_mj_weights,
9038 result_actual_owner_rank);
9039
9040 result_initial_mj_gnos_ = result_initial_mj_gnos;
9041 }
9042
9043 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9044 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9045
9046 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "setup");
9047
       // Core MJ run. Ranks outside the premigration subset skip it and
       // hold no coordinates (result_num_local_coords is 0 for them).
9048 if(am_i_in_subset) {
9049 this->mj_partitioner.multi_jagged_part(
9050 this->mj_env,
9051 result_problemComm, //this->mj_problemComm,
9052 this->imbalance_tolerance,
9053 this->num_teams,
9054 this->num_global_parts,
9055 this->part_no_array,
9056 this->recursion_depth,
9057 this->coord_dim,
9058 result_num_local_coords, //this->num_local_coords,
9059 this->num_global_coords,
9060 result_initial_mj_gnos_,
9061 result_mj_coordinates,
9062 this->num_weights_per_coord,
9063 this->mj_uniform_weights,
9064 result_mj_weights,
9065 this->mj_uniform_parts,
9066 result_assigned_part_ids,
9067 result_mj_gnos
9068 );
9069 }
9070
9071 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "cleanup");
9072
9073 // Reorder results so that they match the order of the input
       // (the partitioner may return gnos in a different order; map
       // gno -> local index of the input, then scatter part ids).
9074 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9075 localGidToLid.reserve(result_num_local_coords);
9076 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9077 Kokkos::ViewAllocateWithoutInitializing("host_result_initial_mj_gnos"),
9078 result_initial_mj_gnos_.size());
9079 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9080 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9081 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9082 }
9083
9084 ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
9085 0, result_num_local_coords, true);
9086 auto host_result_assigned_part_ids =
9087 Kokkos::create_mirror_view(result_assigned_part_ids);
9088 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9089 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9090 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9091 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9092 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9093 partId[origLID] = host_result_assigned_part_ids(i);
9094 }
9095
9096 //now the results are reordered. but if premigration occurred,
9097 //then we need to send these ids to actual owners again.
9098 if(is_pre_migrated) {
9099 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9100 "PostMigration DistributorPlanCreating");
9101 Tpetra::Distributor distributor(this->mj_problemComm);
9102
         // Destinations recorded during premigration: the original owner
         // rank of each coordinate this rank partitioned.
9103 ArrayView<const mj_part_t> actual_owner_destinations(
9104 result_actual_owner_rank , result_num_local_coords);
9105
9106 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9107 actual_owner_destinations);
9108
         // Every rank must get back exactly the coordinates it sent.
9109 if(num_incoming_gnos != this->num_local_coords) {
9110 throw std::logic_error("Zoltan2 - Multijagged Post Migration - "
9111 "num incoming is not equal to num local coords");
9112 }
9113
9114 mj_env->timerStop(MACRO_TIMERS, timer_base_string +
9115 "PostMigration DistributorPlanCreating");
9116 mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9117 "PostMigration DistributorMigration");
9118
9119 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9120 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
9121 num_incoming_gnos);
9122 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9123 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
9124 num_incoming_gnos);
9125
9126 distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9127 received_gnos);
9128 {
9129 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9130 if (partId.size() > 0) {
9131 sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9132 partId.getRawPtr(), partId.size()); //unmanaged
9133 }
9134 distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9135 }
9136
         // Re-allocate partId at the original (pre-migration) local size.
9137 partId = arcp(new mj_part_t[this->num_local_coords],
9138 0, this->num_local_coords, true);
9139
9140 {
           // Map the original input gnos back to their local indices and
           // scatter the received part ids into input order.
9141 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9142 localGidToLid2.reserve(this->num_local_coords);
9143 auto host_initial_mj_gnos =
9144 Kokkos::create_mirror_view(this->initial_mj_gnos);
9145 Kokkos::deep_copy(host_initial_mj_gnos,
9146 this->initial_mj_gnos);
9147 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9148 localGidToLid2[host_initial_mj_gnos(i)] = i;
9149 }
9150
9151 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9152 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9153 partId[origLID] = received_partids[i];
9154 }
9155 }
9156
9157 {
           // Owned raw array allocated in mj_premigrate_to_subset.
9158 delete [] result_actual_owner_rank;
9159 }
9160 mj_env->timerStop(MACRO_TIMERS,
9161 timer_base_string + "PostMigration DistributorMigration");
9162 }
9163 solution->setParts(partId);
9164 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "cleanup");
9165 }
9166
9167 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "all");
9168
9169 // reset the view (release the reference to device data)
9170 this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9171}
9172
9173/* \brief Sets the partitioning data for multijagged algorithm.
9174 * */
// Loads the input for partitioning: builds a CoordinateModel from the
// adapter, records sizes (coord_dim, local/global counts, weights per
// coordinate), fetches gno/coordinate/weight views, converts coordinates
// and weights to mj_scalar_t if the adapter scalar differs, and sets the
// per-criterion uniform-weight / uniform-part flags. Terminates the
// program if the solution requests non-uniform target part sizes, which
// MJ does not support.
9175template <typename Adapter>
9177 const RCP<PartitioningSolution<Adapter> > &solution
9178)
9179{
9180 modelFlag_t flags;
9181 CoordinateModel<Adapter> mj_coords(mj_adapter, mj_env, mj_problemComm, flags);
9182
9183 this->coord_dim = mj_coords.getCoordinateDim();
9184 this->num_weights_per_coord = mj_coords.getNumWeightsPerCoordinate();
9185 this->num_local_coords = mj_coords.getLocalNumCoordinates();
9186 this->num_global_coords = mj_coords.getGlobalNumCoordinates();
9187
     // At least one criterion even when no weights are supplied.
9188 int criteria_dim = (this->num_weights_per_coord ?
9189 this->num_weights_per_coord : 1);
9190 // From the Solution we get part information.
9191 // If the part sizes for a given criteria are not uniform,
9192 // then they are values that sum to 1.0.
9193 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9194 // allocate only two dimensional pointer.
9195 // raw pointer addresses will be obtained from multivector.
9196 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9197 "uniform parts", criteria_dim);
9198 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9199 "uniform weights", criteria_dim);
9200
9201 Kokkos::View<const mj_gno_t *, device_t> gnos;
9202 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9203 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9204 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9205 mj_coords.getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
9206 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9207 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9208 Kokkos::View<mj_scalar_t **, device_t> wgts;
9209
9210 // Now we must get the data from the adapter.
9211 // If the types match we point to the view but if not, we must copy.
9212 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9213 // we can just point the views but we must specialize because this code
9214 // only compiles in this case - for is_same false assign does nothing.
9215 assign_if_same(xyz, xyz_adapter);
9216 assign_if_same(wgts, wgts_adapter);
9217 }
9218 else {
9219 // we only allocate a new view if we are going to copy
9220 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9221 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9222 (Kokkos::ViewAllocateWithoutInitializing(
9223 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9224 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9225 Kokkos::ViewAllocateWithoutInitializing("wgts"),
9226 wgts_adapter.extent(0), wgts_adapter.extent(1));
9227
       // Element-wise cast from adapter_scalar_t to mj_scalar_t on device.
9228 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9229 Kokkos::parallel_for(
9230 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9231 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (int i) {
9232 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9233 xyz(i, n) = static_cast<mj_scalar_t>(xyz_adapter(i, n));
9234 }
9235 });
9236 Kokkos::parallel_for(
9237 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9238 (0, wgts.extent(0)), KOKKOS_LAMBDA (int i) {
9239 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9240 wgts(i, n) = static_cast<mj_scalar_t>(wgts_adapter(i, n));
9241 }
9242 });
9243 }
9244
9245 // obtain global ids.
9246 this->initial_mj_gnos = gnos;
9247 // extract coordinates from multivector.
9248 this->mj_coordinates = xyz;
9249 // if no weights are provided set uniform weight.
9250
9251 if(this->num_weights_per_coord == 0) {
9252 this->mj_uniform_weights(0) = true;
9253 Kokkos::resize(this->mj_weights, 0, 0);
9254 }
9255 else{
9256 this->mj_weights = wgts;
9257 for(int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9258 this->mj_uniform_weights(wdim) = false;
9259 }
9260 }
9261
     // MJ only supports uniform target part sizes; abort otherwise.
9262 for(int wdim = 0; wdim < criteria_dim; ++wdim) {
9263 if(solution->criteriaHasUniformPartSizes(wdim)) {
9264 this->mj_uniform_parts(wdim) = true;
9265 }
9266 else {
9267 printf("Error: MJ does not support non uniform target part weights\n");
9268 std::terminate();
9269 }
9270 }
9271}
9272
9273/* \brief Sets the partitioning parameters for multijagged algorithm.
9274 * \param pl: is the parameter list provided to zoltan2 call
9275 * */
9276template <typename Adapter>
9277void Zoltan2_AlgMJ<Adapter>::set_input_parameters(
9278 const Teuchos::ParameterList &pl)
9279{
9280 const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
9281 if(pe) {
9282 double tol;
9283 tol = pe->getValue(&tol);
9284 this->imbalance_tolerance = tol - 1.0;
9285 }
9286
9287 // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
9288 if(this->imbalance_tolerance <= 0) {
9289 this->imbalance_tolerance= 10e-4;
9290 }
9291
9292 // if an input partitioning array is provided.
9293 Kokkos::resize(this->part_no_array, 0);
9294
9295 // the length of the input partitioning array.
9296 this->recursion_depth = 0;
9297
9298 if(pl.getPtr<int>("mj_num_teams")) {
9299 this->num_teams = pl.get<int>("mj_num_teams");
9300 }
9301
9302 if(pl.getPtr<Array <mj_part_t> >("mj_parts")) {
9303 auto mj_parts = pl.get<Array <mj_part_t> >("mj_parts");
9304 int mj_parts_size = static_cast<int>(mj_parts.size());
9305
9306 // build the view we'll have data on and copy values from host
9307 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9308 "part_no_array", mj_parts_size);
9309 for(int i = 0; i < mj_parts_size; ++i) {
9310 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9311 }
9312
9313 this->recursion_depth = mj_parts_size - 1;
9314 this->mj_env->debug(2, "mj_parts provided by user");
9315 }
9316
9317 // get mj specific parameters.
9318 this->distribute_points_on_cut_lines = true;
9319 this->max_concurrent_part_calculation = 1;
9320
9321 this->mj_run_as_rcb = false;
9322 this->mj_premigration_option = 0;
9323 this->min_coord_per_rank_for_premigration = 32000;
9324
9325 int mj_user_recursion_depth = -1;
9326 this->mj_keep_part_boxes = false;
9327 this->check_migrate_avoid_migration_option = 0;
9328 this->migration_type = 0;
9329 this->minimum_migration_imbalance = 0.35;
9330
9331 pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
9332 if(pe) {
9333 double imb;
9334 imb = pe->getValue(&imb);
9335 this->minimum_migration_imbalance = imb - 1.0;
9336 }
9337
9338 pe = pl.getEntryPtr("mj_migration_option");
9339 if(pe) {
9340 this->check_migrate_avoid_migration_option =
9341 pe->getValue(&this->check_migrate_avoid_migration_option);
9342 } else {
9343 this->check_migrate_avoid_migration_option = 0;
9344 }
9345 if(this->check_migrate_avoid_migration_option > 1) {
9346 this->check_migrate_avoid_migration_option = -1;
9347 }
9348
9350 pe = pl.getEntryPtr("mj_migration_type");
9351 if(pe) {
9352 this->migration_type = pe->getValue(&this->migration_type);
9353 } else {
9354 this->migration_type = 0;
9355 }
9356
9357 //std::cout << " this->migration_type:" << this->migration_type << std::endl;
9359
9360 pe = pl.getEntryPtr("mj_concurrent_part_count");
9361 if(pe) {
9362 this->max_concurrent_part_calculation =
9363 pe->getValue(&this->max_concurrent_part_calculation);
9364 } else {
9365 this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
9366 }
9367
9368 pe = pl.getEntryPtr("mj_keep_part_boxes");
9369 if(pe) {
9370 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9371 } else {
9372 this->mj_keep_part_boxes = false; // Set to invalid value
9373 }
9374
9375 // For now, need keep_part_boxes to do pointAssign and boxAssign.
9376 // pe = pl.getEntryPtr("keep_cuts");
9377 // if(pe) {
9378 // int tmp = pe->getValue(&tmp);
9379 // if(tmp) this->mj_keep_part_boxes = true;
9380 // }
9381
9382 //need to keep part boxes if mapping type is geometric.
9383 if(this->mj_keep_part_boxes == false) {
9384 pe = pl.getEntryPtr("mapping_type");
9385 if(pe) {
9386 int mapping_type = -1;
9387 mapping_type = pe->getValue(&mapping_type);
9388 if(mapping_type == 0) {
9389 mj_keep_part_boxes = true;
9390 }
9391 }
9392 }
9393
9394 // need to keep part boxes if mapping type is geometric.
9395 pe = pl.getEntryPtr("mj_enable_rcb");
9396 if(pe) {
9397 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9398 } else {
9399 this->mj_run_as_rcb = false; // Set to invalid value
9400 }
9401
9402 pe = pl.getEntryPtr("mj_premigration_option");
9403 if(pe) {
9404 mj_premigration_option = pe->getValue(&mj_premigration_option);
9405 } else {
9406 mj_premigration_option = 0;
9407 }
9408
9409 pe = pl.getEntryPtr("mj_premigration_coordinate_count");
9410 if(pe) {
9411 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9412 } else {
9413 min_coord_per_rank_for_premigration = 32000;
9414 }
9415
9416 pe = pl.getEntryPtr("mj_recursion_depth");
9417 if(pe) {
9418 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9419 } else {
9420 mj_user_recursion_depth = -1; // Set to invalid value
9421 }
9422
9423 bool val = false;
9424 pe = pl.getEntryPtr("rectilinear");
9425 if(pe) {
9426 val = pe->getValue(&val);
9427 }
9428 if(val) {
9429 this->distribute_points_on_cut_lines = false;
9430 } else {
9431 this->distribute_points_on_cut_lines = true;
9432 }
9433
9434 if(this->mj_run_as_rcb) {
9435 mj_user_recursion_depth =
9436 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9437 }
9438 if(this->recursion_depth < 1) {
9439 if(mj_user_recursion_depth > 0) {
9440 this->recursion_depth = mj_user_recursion_depth;
9441 }
9442 else {
9443 this->recursion_depth = this->coord_dim;
9444 }
9445 }
9446}
9447
9449template <typename Adapter>
9451 int dim,
9452 adapter_scalar_t *lower,
9453 adapter_scalar_t *upper,
9454 size_t &nPartsFound,
9455 typename Adapter::part_t **partsFound) const
9456{
9457 // TODO: Implement with cuts rather than boxes to reduce algorithmic
9458 // TODO: complexity. Or at least do a search through the boxes, using
9459 // TODO: p x q x r x ... if possible.
9460
9461 nPartsFound = 0;
9462 *partsFound = NULL;
9463
9464 if(this->mj_keep_part_boxes) {
9465
9466 // Get vector of part boxes
9467 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9468
9469 size_t nBoxes = (*partBoxes).size();
9470 if(nBoxes == 0) {
9471 throw std::logic_error("no part boxes exist");
9472 }
9473
9474 // Determine whether the box overlaps the globalBox at all
9475 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9476
9477 if(globalBox->boxesOverlap(dim, lower, upper)) {
9478
9479 std::vector<typename Adapter::part_t> partlist;
9480
9481 // box overlaps the global box; find specific overlapping boxes
9482 for(size_t i = 0; i < nBoxes; i++) {
9483 try {
9484 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9485 nPartsFound++;
9486 partlist.push_back((*partBoxes)[i].getpId());
9487 /*
9488 std::cout << "Given box (";
9489 for(int j = 0; j < dim; j++)
9490 std::cout << lower[j] << " ";
9491 std::cout << ") x (";
9492 for(int j = 0; j < dim; j++)
9493 std::cout << upper[j] << " ";
9494 std::cout << ") overlaps PartBox "
9495 << (*partBoxes)[i].getpId() << " (";
9496 for(int j = 0; j < dim; j++)
9497 std::cout << (*partBoxes)[i].getlmins()[j] << " ";
9498 std::cout << ") x (";
9499 for(int j = 0; j < dim; j++)
9500 std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
9501 std::cout << ")" << std::endl;
9502 */
9503 }
9504 }
9506 }
9507 if(nPartsFound) {
9508 *partsFound = new mj_part_t[nPartsFound];
9509 for(size_t i = 0; i < nPartsFound; i++)
9510 (*partsFound)[i] = partlist[i];
9511 }
9512 }
9513 else {
9514 // Box does not overlap the domain at all. Find the closest part
9515 // Not sure how to perform this operation for MJ without having the
9516 // cuts. With the RCB cuts, the concept of a part extending to
9517 // infinity was natural. With the boxes, it is much more difficult.
9518 // TODO: For now, return information indicating NO OVERLAP.
9519 }
9520 }
9521 else {
9522 throw std::logic_error("need to use keep_cuts parameter for boxAssign");
9523 }
9524}
9525
9527template <typename Adapter>
9529 int dim,
9530 adapter_scalar_t *point) const
9531{
9532 // TODO: Implement with cuts rather than boxes to reduce algorithmic
9533 // TODO: complexity. Or at least do a search through the boxes, using
9534 // TODO: p x q x r x ... if possible.
9535
9536 if(this->mj_keep_part_boxes) {
9537 typename Adapter::part_t foundPart = -1;
9538
9539 // Get vector of part boxes
9540 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9541
9542 size_t nBoxes = (*partBoxes).size();
9543 if(nBoxes == 0) {
9544 throw std::logic_error("no part boxes exist");
9545 }
9546
9547 // Determine whether the point is within the global domain
9548 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9549
9550 if(globalBox->pointInBox(dim, point)) {
9551
9552 // point is in the global domain; determine in which part it is.
9553 size_t i;
9554 for(i = 0; i < nBoxes; i++) {
9555 try {
9556 if((*partBoxes)[i].pointInBox(dim, point)) {
9557 foundPart = (*partBoxes)[i].getpId();
9558 // std::cout << "Point (";
9559 // for(int j = 0; j < dim; j++) std::cout << point[j] << " ";
9560 // std::cout << ") found in box " << i << " part " << foundPart
9561 // << std::endl;
9562 // (*partBoxes)[i].print();
9563 break;
9564 }
9565 }
9567 }
9568
9569 if(i == nBoxes) {
9570 // This error should never occur
9571 std::ostringstream oss;
9572 oss << "Point (";
9573 for(int j = 0; j < dim; j++) oss << point[j] << " ";
9574 oss << ") not found in domain";
9575 throw std::logic_error(oss.str());
9576 }
9577 }
9578
9579 else {
9580 // Point is outside the global domain.
9581 // Determine to which part it is closest.
9582 // TODO: with cuts, would not need this special case
9583
9584 typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
9585 size_t closestBox = 0;
9586 coord_t minDistance = std::numeric_limits<coord_t>::max();
9587 coord_t *centroid = new coord_t[dim];
9588 for(size_t i = 0; i < nBoxes; i++) {
9589 (*partBoxes)[i].computeCentroid(centroid);
9590 coord_t sum = 0.;
9591 coord_t diff;
9592 for(int j = 0; j < dim; j++) {
9593 diff = centroid[j] - point[j];
9594 sum += diff * diff;
9595 }
9596 if(sum < minDistance) {
9597 minDistance = sum;
9598 closestBox = i;
9599 }
9600 }
9601 foundPart = (*partBoxes)[closestBox].getpId();
9602 delete [] centroid;
9603 }
9604
9605 return foundPart;
9606 }
9607 else {
9608 throw std::logic_error("need to use keep_cuts parameter for pointAssign");
9609 }
9610}
9611
9612template <typename Adapter>
9614 const PartitioningSolution<Adapter> *solution,
9615 ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
9616 ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
9617{
9618 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9619 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9620 mj_part_t ntasks = (*pBoxes).size();
9621 int dim = (*pBoxes)[0].getDim();
9622 GridHash grid(pBoxes, ntasks, dim);
9623 grid.getAdjArrays(comXAdj_, comAdj_);
9624 }
9625 comAdj = comAdj_;
9626 comXAdj = comXAdj_;
9627}
9628
9629template <typename Adapter>
9630RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9631Zoltan2_AlgMJ<Adapter>::getGlobalBoxBoundaries() const
9632{
9633 return this->mj_partitioner.get_kept_boxes();
9634}
9635} // namespace Zoltan2
9636
9637#endif
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos reduction operators for the Multi-jagged algorithm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
global_size_t getGlobalNumCoordinates() const
Returns the global number coordinates.
size_t getCoordinatesKokkos(Kokkos::View< const gno_t *, typename node_t::device_type > &Ids, Kokkos::View< scalar_t **, Kokkos::LayoutLeft, typename node_t::device_type > &xyz, Kokkos::View< scalar_t **, typename node_t::device_type > &wgts) const
Returns the coordinate ids, values and optional weights.
int getCoordinateDim() const
Returns the dimension of the coordinates.
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
int getNumWeightsPerCoordinate() const
Returns the number (0 or greater) of weights per coordinate.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const typename Adapter::base_adapter_t > &adapter)
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals....
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
#define epsilon
Definition nd.cpp:82
SparseMatrixAdapter_t::part_t part_t
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< part_t *, device_t > parts
Kokkos::View< scalar_t * > scalar_view_t
Kokkos::View< index_t *, device_t > part_xadj
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > track_on_cuts
Kokkos::View< scalar_t *, device_t > coordinates
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< scalar_t **, device_t > weights
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > coordinates
Kokkos::View< part_t *, device_t > parts
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > part_xadj
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
Zoltan2_MJArrayType< scalar_t > & operator=(const volatile Zoltan2_MJArrayType< scalar_t > &zmj)
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.