49#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50#define _ZOLTAN2_ALGMultiJagged_HPP_
59#include <Tpetra_Distributor.hpp>
60#include <Teuchos_StandardParameterEntryValidators.hpp>
61#include <Teuchos_ParameterList.hpp>
62#include <Kokkos_Sort.hpp>
66#include <unordered_map>
68#ifdef ZOLTAN2_USEZOLTANCOMM
69#ifdef HAVE_ZOLTAN2_MPI
70#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71#include "zoltan_comm_cpp.h"
72#include "zoltan_types.h"
81template <
typename Ordinal,
typename T>
92 epsilon(std::numeric_limits<T>::epsilon()) {}
98 size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}
105 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const {
106 for(Ordinal i = 0; i < count; i++) {
107 if(
Z2_ABS(inBuffer[i]) > epsilon) {
108 inoutBuffer[i] = inBuffer[i];
124template <
typename IT,
typename CT,
typename WT>
139 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
144 this->index = index_;
145 this->count = count_;
147 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
153 void set(IT index_ ,CT count_, WT *vals_) {
154 this->index = index_;
155 this->count = count_;
160 assert(this->count == other.
count);
161 for(CT i = 0; i < this->count; ++i) {
163 if(std::abs(this->val[i] - other.
val[i]) < this->epsilon) {
167 if(this->val[i] < other.
val[i]) {
176 return this->index < other.
index;
182template <
class IT,
class WT>
193template <
class IT,
class WT>
195 const int NSTACK = 50;
197 IT i, ir=n, j, k, l=1;
198 IT jstack=0, istack[NSTACK];
205 for(j=l+1;j<=ir;j++) {
208 for(i=j-1;i>=1;i--) {
209 if(arr[i].val <= aval)
222 std::swap(arr[k],arr[l+1]);
223 if(arr[l+1].val > arr[ir].val) {
224 std::swap(arr[l+1],arr[ir]);
226 if(arr[l].val > arr[ir].val) {
227 std::swap(arr[l],arr[ir]);
229 if(arr[l+1].val > arr[l].val) {
230 std::swap(arr[l+1],arr[l]);
237 do i++;
while (arr[i].val < aval);
238 do j--;
while (arr[j].val > aval);
240 std::swap(arr[i],arr[j]);
245 if(jstack > NSTACK) {
246 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
263template <
class IT,
class WT,
class SIGN>
271 if(this->signbit < rhs.
signbit) {
275 else if(this->signbit == rhs.
signbit) {
276 if(this->val < rhs.
val) {
277 return this->signbit;
280 else if(this->val > rhs.
val) {
281 return !this->signbit;
295 return (this->val == rhs.
val && this->signbit == rhs.
signbit) || (*
this < rhs);
302template <
class IT,
class WT,
class SIGN>
304 const IT NSTACK = 50;
306 IT i, ir=n, j, k, l=1;
307 IT jstack=0, istack[NSTACK];
313 for(j=l+1;j<=ir;j++) {
315 for(i=j-1;i>=1;i--) {
331 std::swap(arr[k],arr[l+1]);
332 if(arr[ir] < arr[l+1]) {
333 std::swap(arr[l+1],arr[ir]);
335 if(arr[ir] < arr[l] ) {
336 std::swap(arr[l],arr[ir]);
338 if(arr[l] < arr[l+1]) {
339 std::swap(arr[l+1],arr[l]);
345 do i++;
while (arr[i] < a);
346 do j--;
while (a < arr[j]);
348 std::swap(arr[i],arr[j]);
353 if(jstack > NSTACK) {
354 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
390 static int counter = 0;
394 static int counter = 0;
401template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
402 typename mj_part_t,
typename mj_node_t>
406 typedef typename mj_node_t::device_type device_t;
408 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
413 static constexpr size_t future_reduceall_cutoff = 1500000;
417 static constexpr mj_lno_t min_work_last_dim = 1000;
419 static constexpr mj_scalar_t least_signifiance = 0.0001;
420 static constexpr int significance_mul = 1000;
422 std::string mj_timer_base_string;
424 RCP<const Environment> mj_env;
425 RCP<const Comm<int> > mj_problemComm;
426 RCP<Comm<int> > comm;
427 double imbalance_tolerance;
430 int num_weights_per_coord;
431 size_t initial_num_loc_coords;
433 mj_lno_t num_local_coords;
434 mj_gno_t num_global_coords;
435 mj_scalar_t sEpsilon;
438 bool distribute_points_on_cut_lines;
441 mj_part_t max_concurrent_part_calculation;
444 int mj_user_recursion_depth;
445 bool mj_keep_part_boxes;
448 int check_migrate_avoid_migration_option;
455 double minimum_migration_imbalance;
472 mj_part_t num_first_level_parts;
476 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
478 mj_part_t total_num_cut ;
479 mj_part_t total_num_part;
481 mj_part_t max_num_part_along_dim ;
482 mj_part_t max_num_cut_along_dim;
485 size_t max_num_total_part_along_dim;
487 mj_part_t total_dim_num_reduce_all;
491 mj_part_t last_dim_num_part;
494 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
498 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
502 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
505 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
508 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
512 size_t num_global_parts;
515 RCP<mj_partBoxVector_t> kept_boxes;
517 RCP<mj_partBox_t> global_box;
522 bool divide_to_prime_first;
525 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
528 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
531 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
534 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
537 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
540 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
543 Kokkos::View<mj_lno_t *, device_t> part_xadj;
546 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
548 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
551 Kokkos::View<mj_scalar_t *, device_t>
552 process_cut_line_weight_to_put_left;
555 Kokkos::View<mj_scalar_t *, device_t>
556 thread_cut_line_weight_to_put_left;
562 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
565 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
568 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
571 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
574 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
577 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
580 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
584 Kokkos::View<mj_scalar_t *, device_t>
585 process_local_min_max_coord_total_weight;
588 Kokkos::View<mj_scalar_t *, device_t>
589 global_min_max_coord_total_weight;
593 Kokkos::View<bool *, device_t> is_cut_line_determined;
599 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600 typename decltype(device_incomplete_cut_count)::HostMirror
601 incomplete_cut_count;
604 typename decltype (part_xadj)::HostMirror host_part_xadj;
607 Kokkos::View<double *, device_t>
611 Kokkos::View<double *, device_t>
612 thread_part_weight_work;
616 Kokkos::View<mj_scalar_t *, device_t>
617 thread_cut_left_closest_point;
621 Kokkos::View<mj_scalar_t *, device_t>
622 thread_cut_right_closest_point;
625 Kokkos::View<mj_lno_t *, device_t>
628 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
635 Kokkos::View<mj_scalar_t *, device_t>
636 total_part_weight_left_right_closests;
637 Kokkos::View<mj_scalar_t *, device_t>
638 global_total_part_weight_left_right_closests;
640 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641 typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642 host_num_partitioning_in_current_dim;
649 KOKKOS_INLINE_FUNCTION
650 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651 return static_cast<double>(achieved) /
static_cast<double>(expected) - 1.0;
660 void set_part_specifications();
667 inline mj_part_t get_part_count(
668 mj_part_t num_total_future,
677 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
698 mj_part_t update_part_num_arrays(
699 std::vector<mj_part_t> *future_num_part_in_parts,
700 std::vector<mj_part_t> *next_future_num_parts_in_parts,
701 mj_part_t &future_num_parts,
702 mj_part_t current_num_parts,
703 int current_iteration,
704 RCP<mj_partBoxVector_t> input_part_boxes,
705 RCP<mj_partBoxVector_t> output_part_boxes,
706 mj_part_t atomic_part_count);
720 KOKKOS_INLINE_FUNCTION
721 void mj_calculate_new_cut_position (
722 mj_scalar_t cut_upper_bound,
723 mj_scalar_t cut_lower_bound,
724 mj_scalar_t cut_upper_weight,
725 mj_scalar_t cut_lower_weight,
726 mj_scalar_t expected_weight,
727 mj_scalar_t &new_cut_position,
728 mj_scalar_t sEpsilon);
754 bool mj_perform_migration(
755 mj_part_t in_num_parts,
756 mj_part_t &out_num_parts,
757 std::vector<mj_part_t> *next_future_num_parts_in_parts,
758 mj_part_t &output_part_begin_index,
759 size_t migration_reduce_all_population,
760 mj_lno_t num_coords_for_last_dim_part,
761 std::string iteration,
762 RCP<mj_partBoxVector_t> &input_part_boxes,
763 RCP<mj_partBoxVector_t> &output_part_boxes);
782 bool mj_check_to_migrate(
783 size_t migration_reduce_all_population,
784 mj_lno_t num_coords_for_last_dim_part,
787 mj_gno_t *num_points_in_all_processor_parts);
813 void mj_migration_part_proc_assignment(
814 mj_gno_t * num_points_in_all_processor_parts,
817 mj_lno_t *send_count_to_each_proc,
818 std::vector<mj_part_t> &processor_ranks_for_subcomm,
819 std::vector<mj_part_t> *next_future_num_parts_in_parts,
820 mj_part_t &out_num_part,
821 std::vector<mj_part_t> &out_part_indices,
822 mj_part_t &output_part_numbering_begin_index,
823 int *coordinate_destinations);
850 void mj_assign_proc_to_parts(
851 mj_gno_t * num_points_in_all_processor_parts,
854 mj_lno_t *send_count_to_each_proc,
855 std::vector<mj_part_t> &processor_ranks_for_subcomm,
856 std::vector<mj_part_t> *next_future_num_parts_in_parts,
857 mj_part_t &out_part_index,
858 mj_part_t &output_part_numbering_begin_index,
859 int *coordinate_destinations);
876 void assign_send_destinations(
878 mj_part_t *part_assignment_proc_begin_indices,
879 mj_part_t *processor_chains_in_parts,
880 mj_lno_t *send_count_to_each_proc,
881 int *coordinate_destinations);
897 void assign_send_destinations2(
900 int *coordinate_destinations,
901 mj_part_t &output_part_numbering_begin_index,
902 std::vector<mj_part_t> *next_future_num_parts_in_parts);
926 void mj_assign_parts_to_procs(
927 mj_gno_t * num_points_in_all_processor_parts,
930 mj_lno_t *send_count_to_each_proc,
931 std::vector<mj_part_t> *next_future_num_parts_in_parts,
932 mj_part_t &out_num_part,
933 std::vector<mj_part_t> &out_part_indices,
934 mj_part_t &output_part_numbering_begin_index,
935 int *coordinate_destinations);
950 void mj_migrate_coords(
952 mj_lno_t &num_new_local_points,
953 std::string iteration,
954 int *coordinate_destinations,
955 mj_part_t num_parts);
962 void create_sub_communicator(
963 std::vector<mj_part_t> &processor_ranks_for_subcomm);
969 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970 mj_part_t largest_factor = 1;
971 mj_part_t n = num_parts;
972 mj_part_t divisor = 2;
974 while (n % divisor == 0) {
976 largest_factor = divisor;
979 if(divisor * divisor > n) {
986 return largest_factor;
1019 const RCP<const Environment> &env,
1020 RCP<
const Comm<int> > &problemComm,
1021 double imbalance_tolerance,
1023 size_t num_global_parts,
1024 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025 int recursion_depth,
1027 mj_lno_t num_local_coords,
1028 mj_gno_t num_global_coords,
1029 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1031 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032 int num_weights_per_coord,
1033 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1052 bool distribute_points_on_cut_lines_,
1053 int max_concurrent_part_calculation_,
1054 int check_migrate_avoid_migration_option_,
1055 double minimum_migration_imbalance_,
1056 int migration_type_ = 0);
1073 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1114 const RCP<const Environment> &env,
1115 mj_lno_t num_total_coords,
1116 mj_lno_t num_selected_coords,
1117 size_t num_target_part,
1120 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121 Kokkos::View<mj_lno_t *, device_t> &
1122 initial_selected_coords_output_permutation,
1123 mj_lno_t *output_xadj,
1124 int recursion_depth_,
1125 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126 bool partition_along_longest_dim,
1127 int num_ranks_per_node,
1128 bool divide_to_prime_first_,
1129 mj_part_t num_first_level_parts_ = 1,
1130 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1133#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1141 void allocate_set_work_memory();
1144 void compute_global_box();
1153 void mj_get_local_min_max_coord_totW(
1154 mj_part_t current_work_part,
1155 mj_part_t current_concurrent_num_parts,
1156 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1170 void mj_get_global_min_max_coord_totW(
1171 mj_part_t current_concurrent_num_parts,
1172 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1205 void mj_get_initial_cut_coords_target_weights(
1206 mj_scalar_t min_coord,
1207 mj_scalar_t max_coord,
1208 mj_part_t num_cuts ,
1209 mj_scalar_t global_weight,
1210 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212 std::vector <mj_part_t> *future_num_part_in_parts,
1213 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214 mj_part_t concurrent_current_part,
1215 mj_part_t obtained_part_index,
1216 mj_part_t num_target_first_level_parts = 1,
1217 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1236 void set_initial_coordinate_parts(
1237 mj_scalar_t &max_coordinate,
1238 mj_scalar_t &min_coordinate,
1239 mj_lno_t coordinate_begin_index,
1240 mj_lno_t coordinate_end_index,
1241 Kokkos::View<mj_lno_t *, device_t> &
1242 mj_current_coordinate_permutations,
1243 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1244 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1245 mj_part_t &partition_count);
1264 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1265 double imbalanceTolerance,
1266 mj_part_t current_work_part,
1267 mj_part_t current_concurrent_num_parts,
1268 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1269 mj_part_t total_incomplete_cut_count,
1270 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1271 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1278 void mj_1D_part_get_part_weights(
1279 mj_part_t current_concurrent_num_parts,
1280 mj_part_t current_work_part,
1281 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1291 void mj_combine_rightleft_and_weights(
1292 mj_part_t current_work_part,
1293 mj_part_t current_concurrent_num_parts);
1307 void mj_create_new_partitions(
1308 mj_part_t num_parts,
1309 mj_part_t current_concurrent_work_part,
1310 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1311 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1312 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1313 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1350 void mj_get_new_cut_coordinates(
1351 mj_part_t current_concurrent_num_parts,
1353 const mj_part_t &num_cuts,
1354 const double &used_imbalance_tolerance,
1355 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1356 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1357 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1358 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1359 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1360 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1361 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1362 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1363 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1364 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1365 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1366 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1367 Kokkos::View<mj_scalar_t *, device_t> &
1368 current_part_cut_line_weight_to_put_left,
1369 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1380 void get_processor_num_points_in_parts(
1381 mj_part_t num_procs,
1382 mj_part_t num_parts,
1383 mj_gno_t *&num_points_in_all_processor_parts);
1389 void fill_permutation_array(
1390 mj_part_t output_num_parts,
1391 mj_part_t num_parts);
1414 void create_consistent_chunks(
1415 mj_part_t num_parts,
1416 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1417 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1418 mj_lno_t coordinate_begin,
1419 mj_lno_t coordinate_end,
1420 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1421 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1423 bool longest_dim_part,
1434 void set_final_parts(
1435 mj_part_t current_num_parts,
1436 mj_part_t output_part_begin_index,
1437 RCP<mj_partBoxVector_t> &output_part_boxes,
1438 bool is_data_ever_migrated);
1443template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1444 typename mj_part_t,
typename mj_node_t>
1446 mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1447 recursion_depth(0), coord_dim(0),
1448 num_weights_per_coord(0), initial_num_loc_coords(0),
1449 initial_num_glob_coords(0),
1450 num_local_coords(0), num_global_coords(0),
1451 sEpsilon(std::numeric_limits<mj_scalar_t>::
epsilon() * 100),
1452 distribute_points_on_cut_lines(true),
1453 max_concurrent_part_calculation(1),
1454 mj_run_as_rcb(false), mj_user_recursion_depth(0),
1455 mj_keep_part_boxes(false),
1456 check_migrate_avoid_migration_option(0), migration_type(0),
1457 minimum_migration_imbalance(0.30),
1458 num_first_level_parts(1),
1459 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1460 max_num_cut_along_dim(0),
1461 max_num_total_part_along_dim(0),
1462 total_dim_num_reduce_all(0),
1463 last_dim_num_part(0),
1465 num_global_parts(1),
1466 kept_boxes(), global_box(),
1467 myRank(0), myActualRank(0),
1468 divide_to_prime_first(false)
1515template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1516 typename mj_part_t,
typename mj_node_t>
1519 const RCP<const Environment> &env,
1520 mj_lno_t num_total_coords,
1521 mj_lno_t num_selected_coords,
1522 size_t num_target_part,
1525 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1527 Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1528 mj_lno_t *output_xadj,
1529 int recursion_depth_,
1530 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1531 bool partition_along_longest_dim,
1532 int num_ranks_per_node,
1533 bool divide_to_prime_first_,
1534 mj_part_t num_first_level_parts_,
1535 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1538 const RCP<Comm<int> > commN;
1539 this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1540 this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1541 this->myActualRank = this->myRank = 1;
1543 this->divide_to_prime_first = divide_to_prime_first_;
1548 this->imbalance_tolerance = 0;
1549 this->num_global_parts = num_target_part;
1550 this->part_no_array = part_no_array_;
1551 this->recursion_depth = recursion_depth_;
1555 this->num_first_level_parts = num_first_level_parts_;
1557 this->first_level_distribution = first_level_distribution_;
1559 this->coord_dim = coord_dim_;
1560 this->num_local_coords = num_total_coords;
1562 this->num_global_coords = num_total_coords;
1563 this->mj_coordinates = mj_coordinates_;
1566 this->initial_mj_gnos =
1567 Kokkos::View<mj_gno_t*, device_t>(
"gids", this->num_local_coords);
1569 this->num_weights_per_coord = 0;
1571 this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1572 "uniform weights", 1);
1573 this->mj_uniform_weights(0) =
true;
1575 this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1578 this->mj_uniform_parts =
1579 Kokkos::View<bool*, Kokkos::HostSpace>(
"uniform parts", 1);
1580 this->mj_uniform_parts(0) =
true;
1582 this->set_part_specifications();
1584 this->allocate_set_work_memory();
1587 auto local_part_xadj = this->part_xadj;
1588 Kokkos::parallel_for(
1589 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1590 KOKKOS_LAMBDA (
int dummy) {
1591 local_part_xadj(0) =
static_cast<mj_lno_t
>(num_selected_coords);
1594 Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1596 mj_part_t current_num_parts = 1;
1598 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1599 this->all_cut_coordinates;
1601 mj_part_t future_num_parts = this->total_num_part;
1603 std::vector<mj_part_t> *future_num_part_in_parts =
1604 new std::vector<mj_part_t>();
1605 std::vector<mj_part_t> *next_future_num_parts_in_parts =
1606 new std::vector<mj_part_t>();
1607 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1608 RCP<mj_partBoxVector_t> t1;
1609 RCP<mj_partBoxVector_t> t2;
1611 std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1612 coord_dimension_range_sorted(this->coord_dim);
1614 &(coord_dimension_range_sorted[0]);
1615 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1616 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1620 Kokkos::View<mj_part_t*, device_t>
1621 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
1622 Kokkos::View<size_t*, device_t>
1623 view_total_reduction_size(
"view_total_reduction_size", 1);
1625 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
1631 std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1632 future_num_part_in_parts = next_future_num_parts_in_parts;
1633 next_future_num_parts_in_parts = tmpPartVect;
1637 next_future_num_parts_in_parts->clear();
1640 mj_part_t output_part_count_in_dimension =
1641 this->update_part_num_arrays(
1642 future_num_part_in_parts,
1643 next_future_num_parts_in_parts,
1648 t2, num_ranks_per_node);
1653 if(output_part_count_in_dimension == current_num_parts) {
1654 tmpPartVect = future_num_part_in_parts;
1655 future_num_part_in_parts = next_future_num_parts_in_parts;
1656 next_future_num_parts_in_parts = tmpPartVect;
1661 std::string istring = std::to_string(rd);
1665 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1666 "new part xadj", output_part_count_in_dimension);
1670 mj_part_t output_part_index = 0;
1674 mj_part_t output_coordinate_end_index = 0;
1676 mj_part_t current_work_part = 0;
1677 mj_part_t current_concurrent_num_parts = 1;
1679 mj_part_t obtained_part_index = 0;
1682 int coordInd = rd % this->coord_dim;
1684 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1685 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1687 auto host_process_local_min_max_coord_total_weight =
1688 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1689 auto host_global_min_max_coord_total_weight =
1690 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1693 for(; current_work_part < current_num_parts;
1694 current_work_part += current_concurrent_num_parts) {
1696 mj_part_t actual_work_part_count = 0;
1701 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1702 mj_part_t current_work_part_in_concurrent_parts =
1703 current_work_part + kk;
1707 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1708 current_work_part_in_concurrent_parts);
1709 if(partition_count == 1) {
1712 ++actual_work_part_count;
1713 if(partition_along_longest_dim) {
1714 auto local_process_local_min_max_coord_total_weight =
1715 this->process_local_min_max_coord_total_weight;
1716 for(
int coord_traverse_ind = 0;
1717 coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1719 Kokkos::View<mj_scalar_t *, device_t> coords =
1720 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1722 this->mj_get_local_min_max_coord_totW(
1724 current_concurrent_num_parts,
1727 coord_dimension_range_sorted[coord_traverse_ind].id =
1729 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1731 Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1732 process_local_min_max_coord_total_weight);
1734 coord_dim_mins[coord_traverse_ind] =
1735 host_process_local_min_max_coord_total_weight(kk);
1736 coord_dim_maxs[coord_traverse_ind] =
1737 host_process_local_min_max_coord_total_weight(
1738 kk + current_concurrent_num_parts);
1739 coord_dimension_range_sorted[coord_traverse_ind].val =
1740 host_process_local_min_max_coord_total_weight(
1741 kk + current_concurrent_num_parts) -
1742 host_process_local_min_max_coord_total_weight(kk);
1745 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1746 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1747 auto set_min = coord_dim_mins[coordInd];
1748 auto set_max = coord_dim_maxs[coordInd];
1749 Kokkos::parallel_for(
1750 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1751 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1752 local_process_local_min_max_coord_total_weight(kk) = set_min;
1753 local_process_local_min_max_coord_total_weight(
1754 kk + current_concurrent_num_parts) = set_max;
1757 mj_current_dim_coords =
1758 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1761 Kokkos::View<mj_scalar_t *, device_t> coords =
1762 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1763 this->mj_get_local_min_max_coord_totW(
1765 current_concurrent_num_parts,
1771 if(actual_work_part_count > 0) {
1773 this->mj_get_global_min_max_coord_totW(
1774 current_concurrent_num_parts,
1775 this->process_local_min_max_coord_total_weight,
1776 this->global_min_max_coord_total_weight);
1779 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1780 global_min_max_coord_total_weight);
1784 mj_part_t total_incomplete_cut_count = 0;
1789 mj_part_t concurrent_part_cut_shift = 0;
1790 mj_part_t concurrent_part_part_shift = 0;
1791 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1792 mj_scalar_t min_coordinate =
1793 host_global_min_max_coord_total_weight(kk);
1794 mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1795 kk + current_concurrent_num_parts);
1796 mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1797 kk + 2*current_concurrent_num_parts);
1799 mj_part_t concurrent_current_part_index = current_work_part + kk;
1801 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1802 concurrent_current_part_index);
1804 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1805 Kokkos::subview(current_cut_coordinates,
1806 std::pair<mj_lno_t, mj_lno_t>(
1807 concurrent_part_cut_shift,
1808 current_cut_coordinates.size()));
1809 Kokkos::View<mj_scalar_t *, device_t>
1810 current_target_part_weights =
1811 Kokkos::subview(target_part_weights,
1812 std::pair<mj_lno_t, mj_lno_t>(
1813 concurrent_part_part_shift,
1814 target_part_weights.size()));
1817 concurrent_part_cut_shift += partition_count - 1;
1819 concurrent_part_part_shift += partition_count;
1822 if(partition_count > 1 && min_coordinate <= max_coordinate) {
1825 total_incomplete_cut_count += partition_count - 1;
1827 this->incomplete_cut_count(kk) = partition_count - 1;
1835 this->mj_get_initial_cut_coords_target_weights(
1838 partition_count - 1,
1839 global_total_weight,
1841 current_target_part_weights,
1842 future_num_part_in_parts,
1843 next_future_num_parts_in_parts,
1844 concurrent_current_part_index,
1845 obtained_part_index,
1846 rd == 0 ? this->num_first_level_parts : 1,
1847 this->first_level_distribution);
1849 mj_lno_t coordinate_end_index =
1850 host_part_xadj(concurrent_current_part_index);
1851 mj_lno_t coordinate_begin_index =
1852 (concurrent_current_part_index==0) ? 0 :
1853 host_part_xadj[concurrent_current_part_index - 1];
1856 this->set_initial_coordinate_parts(
1859 coordinate_begin_index, coordinate_end_index,
1860 this->coordinate_permutations,
1861 mj_current_dim_coords,
1862 this->assigned_part_ids,
1868 this->incomplete_cut_count(kk) = 0;
1870 obtained_part_index += partition_count;
1875 double used_imbalance = 0;
1879 mj_timer_base_string +
"mj_1D_part()");
1882 mj_current_dim_coords,
1885 current_concurrent_num_parts,
1886 current_cut_coordinates,
1887 total_incomplete_cut_count,
1888 view_rectilinear_cut_count,
1889 view_total_reduction_size);
1892 mj_timer_base_string +
"mj_1D_part()");
1895 obtained_part_index += current_concurrent_num_parts;
1899 mj_part_t output_array_shift = 0;
1900 mj_part_t cut_shift = 0;
1901 size_t tlr_shift = 0;
1902 size_t partweight_array_shift = 0;
1904 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1905 mj_part_t current_concurrent_work_part = current_work_part + kk;
1907 mj_part_t num_parts = host_num_partitioning_in_current_dim(
1908 current_concurrent_work_part);
1911 int coordinateA_bigger_than_coordinateB =
1912 host_global_min_max_coord_total_weight(kk) >
1913 host_global_min_max_coord_total_weight(
1914 kk + current_concurrent_num_parts);
1916 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1919 auto local_new_part_xadj = this->new_part_xadj;
1920 Kokkos::parallel_for(
1921 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
1922 mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1923 local_new_part_xadj(
1924 output_part_index + output_array_shift + jj) = 0;
1927 cut_shift += num_parts - 1;
1928 tlr_shift += (4 *(num_parts - 1) + 1);
1929 output_array_shift += num_parts;
1930 partweight_array_shift += (2 * (num_parts - 1) + 1);
1933 mj_lno_t coordinate_end =
1934 host_part_xadj(current_concurrent_work_part);
1935 mj_lno_t coordinate_begin =
1936 current_concurrent_work_part==0 ? 0 :
1937 host_part_xadj(current_concurrent_work_part-1);
1939 Kokkos::View<mj_scalar_t *, device_t>
1940 current_concurrent_cut_coordinate =
1941 Kokkos::subview(current_cut_coordinates,
1942 std::pair<mj_lno_t, mj_lno_t>(
1944 current_cut_coordinates.size()));
1945 Kokkos::View<mj_scalar_t *, device_t>
1946 used_local_cut_line_weight_to_left =
1947 Kokkos::subview(process_cut_line_weight_to_put_left,
1948 std::pair<mj_lno_t, mj_lno_t>(
1950 process_cut_line_weight_to_put_left.size()));
1952 this->thread_part_weight_work =
1954 this->thread_part_weights,
1955 std::pair<mj_lno_t, mj_lno_t>(
1956 partweight_array_shift,
1957 this->thread_part_weights.size()));
1961 Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1962 Kokkos::subview(this->new_part_xadj,
1963 std::pair<mj_lno_t, mj_lno_t>(
1964 output_part_index + output_array_shift,
1965 this->new_part_xadj.size()));
1967 this->create_consistent_chunks(
1969 mj_current_dim_coords,
1970 current_concurrent_cut_coordinate,
1973 used_local_cut_line_weight_to_left,
1974 subview_new_part_xadj,
1976 partition_along_longest_dim,
1977 p_coord_dimension_range_sorted);
1982 mj_lno_t part_size = coordinate_end - coordinate_begin;
1984 auto local_new_part_xadj = this->new_part_xadj;
1985 Kokkos::parallel_for(
1986 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1987 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1988 local_new_part_xadj(output_part_index + output_array_shift)
1992 auto subview_new_coordinate_permutations =
1993 Kokkos::subview(this->new_coordinate_permutations,
1994 std::pair<mj_lno_t, mj_lno_t>(
1996 coordinate_begin + part_size));
1997 auto subview_coordinate_permutations =
1998 Kokkos::subview(this->coordinate_permutations,
1999 std::pair<mj_lno_t, mj_lno_t>(
2001 coordinate_begin + part_size));
2002 Kokkos::deep_copy(subview_new_coordinate_permutations,
2003 subview_coordinate_permutations);
2006 cut_shift += num_parts - 1;
2007 tlr_shift += (4 *(num_parts - 1) + 1);
2008 output_array_shift += num_parts;
2009 partweight_array_shift += (2 * (num_parts - 1) + 1);
2018 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
2019 mj_part_t num_parts =
2020 host_num_partitioning_in_current_dim(current_work_part + kk);
2021 auto local_new_part_xadj = this->new_part_xadj;
2022 auto local_mj_current_dim_coords = mj_current_dim_coords;
2023 auto local_new_coordinate_permutations =
2024 new_coordinate_permutations;
2025 Kokkos::parallel_for(
2026 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
2027 0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
2029 local_new_part_xadj(output_part_index+ii) +=
2030 output_coordinate_end_index;
2033 mj_lno_t coordinate_end =
2034 local_new_part_xadj(output_part_index+ii);
2035 mj_lno_t coordinate_begin =
2036 local_new_part_xadj(output_part_index);
2038 for(mj_lno_t task_traverse = coordinate_begin;
2039 task_traverse < coordinate_end; ++task_traverse) {
2040 mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2042 local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2048 mj_part_t get_single;
2049 Kokkos::parallel_reduce(
"Read new_part_xadj",
2050 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2051 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
2052 set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2055 output_coordinate_end_index = get_single;
2057 output_part_index += num_parts;
2064 current_num_parts = output_part_count_in_dimension;
2067 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2068 this->coordinate_permutations = this->new_coordinate_permutations;
2069 this->new_coordinate_permutations = tmp;
2071 this->part_xadj = this->new_part_xadj;
2072 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2073 Kokkos::deep_copy(host_part_xadj, part_xadj);
2074 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
2077 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2081 for(
size_t i = 0; i < this->num_global_parts ; ++i) {
2082 output_xadj[i+1] = host_part_xadj(i);
2085 delete future_num_part_in_parts;
2086 delete next_future_num_parts_in_parts;
2092template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2093 typename mj_part_t,
typename mj_node_t>
2095 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2099 return this->global_box;
2104template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2105 typename mj_part_t,
typename mj_node_t>
2106void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2109 this->mj_keep_part_boxes =
true;
2119template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2120 typename mj_part_t,
typename mj_node_t>
2124 this->total_num_cut = 0;
2125 this->total_num_part = 1;
2126 this->max_num_part_along_dim = 0;
2127 this->total_dim_num_reduce_all = 0;
2128 this->last_dim_num_part = 1;
2131 this->max_num_cut_along_dim = 0;
2132 this->max_num_total_part_along_dim = 0;
2134 if(this->part_no_array.size()) {
2135 auto local_recursion_depth = this->recursion_depth;
2137 this->total_dim_num_reduce_all =
2138 this->total_num_part * this->recursion_depth;
2140 this->total_num_part = 1;
2141 for(
int i = 0; i < local_recursion_depth; ++i) {
2142 this->total_num_part *= this->part_no_array(i);
2145 mj_part_t track_max = 0;
2146 for(
int i = 0; i < local_recursion_depth; ++i) {
2147 if(part_no_array(i) > track_max) {
2148 track_max = this->part_no_array(i);
2152 this->last_dim_num_part = this->total_num_part /
2153 this->part_no_array(local_recursion_depth-1);
2155 this->max_num_part_along_dim = track_max;
2156 this->num_global_parts = this->total_num_part;
2158 mj_part_t future_num_parts = this->num_global_parts;
2162 if (this->first_level_distribution.size() != 0 &&
2163 this->num_first_level_parts > 1) {
2164 this->max_num_part_along_dim = this->num_first_level_parts;
2169 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
2170 mj_part_t maxNoPartAlongI = 0;
2171 mj_part_t nfutureNumParts = 0;
2177 this->first_level_distribution.size() != 0 &&
2178 this->num_first_level_parts > 1) {
2180 maxNoPartAlongI = this->num_first_level_parts;
2181 this->max_num_part_along_dim = this->num_first_level_parts;
2183 mj_part_t sum_first_level_dist = 0;
2184 mj_part_t max_part = 0;
2187 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2188 sum_first_level_dist += this->first_level_distribution(i);
2189 if (this->first_level_distribution(i) > max_part)
2190 max_part = this->first_level_distribution(i);
2196 this->num_global_parts * max_part / sum_first_level_dist;
2200 maxNoPartAlongI = this->get_part_count(future_num_parts,
2201 1.0f / (this->recursion_depth - rd));
2202 if (maxNoPartAlongI > this->max_num_part_along_dim)
2203 this->max_num_part_along_dim = maxNoPartAlongI;
2204 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2205 if (future_num_parts % maxNoPartAlongI) {
2209 future_num_parts = nfutureNumParts;
2211 this->total_num_part = this->num_global_parts;
2213 if(this->divide_to_prime_first) {
2214 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2215 this->last_dim_num_part = this->num_global_parts;
2222 for(
int i = 0; i < this->recursion_depth; ++i) {
2223 this->total_dim_num_reduce_all += p;
2224 p *= this->max_num_part_along_dim;
2227 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2228 this->last_dim_num_part = this->num_global_parts;
2231 this->last_dim_num_part = p / this->max_num_part_along_dim;
2236 this->total_num_cut = this->total_num_part - 1;
2237 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2238 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2239 size_t(this->max_num_cut_along_dim);
2244 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2245 if(this->mj_problemComm->getRank() == 0) {
2246 std::cerr <<
"Warning: Concurrent part count (" <<
2247 this->max_concurrent_part_calculation <<
2248 ") has been set bigger than maximum amount that can be used." <<
2249 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2251 this->max_concurrent_part_calculation = this->last_dim_num_part;
2260template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2261 typename mj_part_t,
typename mj_node_t>
2265 double fp = pow(num_total_future, root);
2266 mj_part_t ip = mj_part_t(fp);
2267 if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2294template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2295 typename mj_part_t,
typename mj_node_t>
2298 std::vector<mj_part_t> *future_num_part_in_parts,
2299 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300 mj_part_t &future_num_parts,
2301 mj_part_t current_num_parts,
2302 int current_iteration,
2303 RCP<mj_partBoxVector_t> input_part_boxes,
2304 RCP<mj_partBoxVector_t> output_part_boxes,
2305 mj_part_t atomic_part_count)
2307 std::vector<mj_part_t> num_partitioning_in_current_dim;
2310 mj_part_t output_num_parts = 0;
2311 if(this->part_no_array.size()) {
2315 mj_part_t current_part_no_array =
2316 this->part_no_array(current_iteration);
2318 if(current_part_no_array < 1) {
2319 std::cout <<
"Current recursive iteration: " << current_iteration <<
2320 " part_no_array[" << current_iteration <<
"] is given as:" <<
2321 current_part_no_array << std::endl;
2324 if(current_part_no_array == 1) {
2325 return current_num_parts;
2329 if (this->first_level_distribution.size() != 0 &&
2330 current_iteration == 0 &&
2331 current_part_no_array != this->num_first_level_parts) {
2332 std::cout <<
"Current recursive iteration: " << current_iteration
2333 <<
" part_no_array[" << current_iteration <<
"] is given as: " <<
2334 current_part_no_array <<
" and contradicts num_first_level_parts: " <<
2335 this->num_first_level_parts << std::endl;
2339 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340 num_partitioning_in_current_dim.push_back(current_part_no_array);
2357 future_num_parts /= num_partitioning_in_current_dim[0];
2358 output_num_parts = current_num_parts *
2359 num_partitioning_in_current_dim[0];
2360 if(this->mj_keep_part_boxes) {
2361 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2363 for(mj_part_t j = 0; j <
2364 num_partitioning_in_current_dim[0]; ++j) {
2365 output_part_boxes->push_back((*input_part_boxes)[k]);
2373 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374 next_future_num_parts_in_parts->push_back(future_num_parts);
2384 future_num_parts = 1;
2387 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2389 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2393 mj_part_t num_partitions_in_current_dim =
2394 this->get_part_count(future_num_parts_of_part_ii,
2395 1.0 / (this->recursion_depth - current_iteration)
2397 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398 std::cerr <<
"ERROR: maxPartNo calculation is wrong."
2399 " num_partitions_in_current_dim: "
2400 << num_partitions_in_current_dim <<
" this->max_num_part_along_dim: "
2401 << this->max_num_part_along_dim <<
2402 " this->recursion_depth: " << this->recursion_depth <<
2403 " current_iteration:" << current_iteration <<
2404 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405 " might need to fix max part no calculation for "
2406 "largest_prime_first partitioning." <<
2418 if (current_iteration == 0 &&
2419 this->first_level_distribution.size() != 0 &&
2420 this->num_first_level_parts > 1) {
2423 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2426 output_num_parts = this->num_first_level_parts;
2429 future_num_parts /= this->num_first_level_parts;
2431 mj_part_t max_part = 0;
2432 mj_part_t sum_first_level_dist = 0;
2436 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2437 sum_first_level_dist += this->first_level_distribution(i);
2439 if (this->first_level_distribution(i) > max_part)
2440 max_part = this->first_level_distribution(i);
2444 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2448 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2449 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450 this->num_global_parts / sum_first_level_dist);
2453 else if (this->divide_to_prime_first) {
2455 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2457 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2460 output_num_parts += num_partitions_in_current_dim;
2462 if (future_num_parts_of_part_ii == atomic_part_count ||
2463 future_num_parts_of_part_ii % atomic_part_count != 0) {
2464 atomic_part_count = 1;
2467 largest_prime_factor =
2468 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2475 if (largest_prime_factor < num_partitions_in_current_dim) {
2476 largest_prime_factor = num_partitions_in_current_dim;
2479 mj_part_t ideal_num_future_parts_in_part =
2480 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2482 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2490 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2492 mj_part_t my_ideal_primescale = ideal_prime_scale;
2494 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495 ++my_ideal_primescale;
2498 mj_part_t num_future_parts_for_part_iii =
2499 ideal_num_future_parts_in_part * my_ideal_primescale;
2502 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2504 ++num_future_parts_for_part_iii;
2507 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2510 if (this->mj_keep_part_boxes) {
2511 output_part_boxes->push_back((*input_part_boxes)[ii]);
2515 if (num_future_parts_for_part_iii > future_num_parts)
2516 future_num_parts = num_future_parts_for_part_iii;
2522 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2525 output_num_parts += num_partitions_in_current_dim;
2527 if((future_num_parts_of_part_ii == atomic_part_count) ||
2528 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529 atomic_part_count = 1;
2532 mj_part_t ideal_num_future_parts_in_part =
2533 (future_num_parts_of_part_ii / atomic_part_count) /
2534 num_partitions_in_current_dim;
2535 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536 mj_part_t num_future_parts_for_part_iii =
2537 ideal_num_future_parts_in_part;
2540 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541 num_partitions_in_current_dim) {
2543 ++num_future_parts_for_part_iii;
2546 next_future_num_parts_in_parts->push_back(
2547 num_future_parts_for_part_iii * atomic_part_count);
2551 if(this->mj_keep_part_boxes) {
2552 output_part_boxes->push_back((*input_part_boxes)[ii]);
2555 if(num_future_parts_for_part_iii > future_num_parts)
2556 future_num_parts = num_future_parts_for_part_iii;
2562 device_num_partitioning_in_current_dim = Kokkos::View<
2563 mj_part_t*, device_t>(
"test", num_partitioning_in_current_dim.size());
2564 host_num_partitioning_in_current_dim =
2565 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566 for(
size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567 host_num_partitioning_in_current_dim(n) =
2568 num_partitioning_in_current_dim[n];
2573 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574 host_num_partitioning_in_current_dim);
2575 return output_num_parts;
2580template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2581 typename mj_part_t,
typename mj_node_t>
2588 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589 Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
2590 this->num_local_coords);
2591 auto local_coordinate_permutations = coordinate_permutations;
2592 Kokkos::parallel_for(
2593 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595 local_coordinate_permutations(i) = i;
2599 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600 Kokkos::ViewAllocateWithoutInitializing(
"num_local_coords"),
2601 this->num_local_coords);
2603 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604 Kokkos::ViewAllocateWithoutInitializing(
"assigned parts"), 0);
2605 if(this->num_local_coords > 0) {
2606 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607 Kokkos::ViewAllocateWithoutInitializing(
"assigned part ids"),
2608 this->num_local_coords);
2615 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616 Kokkos::ViewAllocateWithoutInitializing(
"part xadj"), 1);
2617 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618 host_part_xadj(0) = num_local_coords;
2619 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2622 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2626 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627 Kokkos::ViewAllocateWithoutInitializing(
"all cut coordinates"),
2628 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2631 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632 device_t>(Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2636 this->thread_cut_line_weight_to_put_left =
2637 Kokkos::View<mj_scalar_t*, device_t>(
2638 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2640 if(this->distribute_points_on_cut_lines) {
2641 this->process_cut_line_weight_to_put_left =
2642 Kokkos::View<mj_scalar_t *, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
2644 "process_cut_line_weight_to_put_left"),
2645 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646 this->thread_cut_line_weight_to_put_left =
2647 Kokkos::View<mj_scalar_t *, device_t>(
2648 Kokkos::ViewAllocateWithoutInitializing(
2649 "thread_cut_line_weight_to_put_left"),
2650 this->max_num_cut_along_dim);
2651 this->process_rectilinear_cut_weight =
2652 Kokkos::View<mj_scalar_t *, device_t>(
2653 Kokkos::ViewAllocateWithoutInitializing(
"process_rectilinear_cut_weight"),
2654 this->max_num_cut_along_dim);
2655 this->global_rectilinear_cut_weight =
2656 Kokkos::View<mj_scalar_t *, device_t>(
2657 Kokkos::ViewAllocateWithoutInitializing(
"global_rectilinear_cut_weight"),
2658 this->max_num_cut_along_dim);
2665 this->cut_coordinates_work_array =
2666 Kokkos::View<mj_scalar_t *, device_t>(
2667 Kokkos::ViewAllocateWithoutInitializing(
"cut_coordinates_work_array"),
2668 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2671 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672 Kokkos::ViewAllocateWithoutInitializing(
"target_part_weights"),
2673 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2676 this->cut_upper_bound_coordinates =
2677 Kokkos::View<mj_scalar_t*, device_t>(
2678 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_coordinates"),
2679 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2682 this->cut_lower_bound_coordinates =
2683 Kokkos::View<mj_scalar_t*, device_t>(
2684 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_coordinates"),
2685 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2688 this->cut_lower_bound_weights =
2689 Kokkos::View<mj_scalar_t*, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_weights"),
2691 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2694 this->cut_upper_bound_weights =
2695 Kokkos::View<mj_scalar_t*, device_t>(
2696 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_weights"),
2697 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2701 this->process_local_min_max_coord_total_weight =
2702 Kokkos::View<mj_scalar_t*, device_t>(
2703 Kokkos::ViewAllocateWithoutInitializing(
2704 "process_local_min_max_coord_total_weight"),
2705 3 * this->max_concurrent_part_calculation);
2708 this->global_min_max_coord_total_weight =
2709 Kokkos::View<mj_scalar_t*, device_t>(
2710 Kokkos::ViewAllocateWithoutInitializing(
"global_min_max_coord_total_weight"),
2711 3 * this->max_concurrent_part_calculation);
2716 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717 Kokkos::ViewAllocateWithoutInitializing(
"is_cut_line_determined"),
2718 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2724 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725 Kokkos::ViewAllocateWithoutInitializing(
"device_incomplete_cut_count"),
2726 this->max_concurrent_part_calculation);
2727 this->incomplete_cut_count =
2728 Kokkos::create_mirror_view(device_incomplete_cut_count);
2731 this->thread_part_weights = Kokkos::View<double *, device_t>(
2732 Kokkos::ViewAllocateWithoutInitializing(
"thread_part_weights"),
2733 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2735 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_left_closest_point"),
2737 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2741 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_right_closest_point"),
2743 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2746 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747 Kokkos::ViewAllocateWithoutInitializing(
"thread_point_counts"),
2748 this->max_num_part_along_dim);
2754 this->total_part_weight_left_right_closests =
2755 Kokkos::View<mj_scalar_t*, device_t>(
2756 Kokkos::ViewAllocateWithoutInitializing(
2757 "total_part_weight_left_right_closests"),
2758 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759 this->max_concurrent_part_calculation);
2761 this->global_total_part_weight_left_right_closests =
2762 Kokkos::View<mj_scalar_t*, device_t>(
2763 Kokkos::ViewAllocateWithoutInitializing(
2764 "global_total_part_weight_left_right_closests"),
2765 (this->max_num_total_part_along_dim +
2766 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2768 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_local_coords);
2771 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
2778 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2780 auto local_current_mj_gnos = current_mj_gnos;
2781 auto local_initial_mj_gnos = initial_mj_gnos;
2782 Kokkos::parallel_for(
2783 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2791template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2792 typename mj_part_t,
typename mj_node_t>
2793void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794 mj_node_t>::compute_global_box()
2797 mj_scalar_t *mins =
new mj_scalar_t[this->coord_dim];
2799 mj_scalar_t *gmins =
new mj_scalar_t[this->coord_dim];
2801 mj_scalar_t *maxs =
new mj_scalar_t[this->coord_dim];
2803 mj_scalar_t *gmaxs =
new mj_scalar_t[this->coord_dim];
2805 auto local_mj_coordinates = this->mj_coordinates;
2809 for(
int i = 0; i < this->coord_dim; ++i) {
2814 for(
int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815 Kokkos::parallel_reduce(
"MinReduce",
2816 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817 (0, this->num_local_coords),
2818 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819 if(local_mj_coordinates(j,i) < running_min) {
2820 running_min = local_mj_coordinates(j,i);
2822 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823 Kokkos::parallel_reduce(
"MaxReduce",
2824 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825 (0, this->num_local_coords),
2826 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827 if(local_mj_coordinates(j,i) > running_max) {
2828 running_max = local_mj_coordinates(j,i);
2830 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2833 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834 this->coord_dim, mins, gmins
2837 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838 this->coord_dim, maxs, gmaxs
2842 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2856template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2857 typename mj_part_t,
typename mj_node_t>
2858void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859 mj_node_t>::init_part_boxes(
2860 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2862 mj_partBox_t tmp_box(*global_box);
2863 initial_partitioning_boxes->push_back(tmp_box);
2870template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2875 mj_part_t current_work_part,
2876 mj_part_t current_concurrent_num_parts,
2877 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2879 auto local_coordinate_permutations = this->coordinate_permutations;
2880 auto local_process_local_min_max_coord_total_weight =
2881 this->process_local_min_max_coord_total_weight;
2882 auto local_mj_weights = this->mj_weights;
2884 bool bUniformWeights = mj_uniform_weights(0);
2886 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2888 mj_part_t concurrent_current_part = current_work_part + kk;
2889 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890 host_part_xadj(concurrent_current_part - 1);
2891 mj_lno_t coordinate_end_index =
2892 host_part_xadj(concurrent_current_part);
2894 mj_scalar_t my_min_coord = 0;
2895 mj_scalar_t my_max_coord = 0;
2896 mj_scalar_t my_total_weight;
2899 if(coordinate_begin_index >= coordinate_end_index)
2901 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903 my_total_weight = 0;
2907 Kokkos::parallel_reduce(
"get min",
2908 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909 (coordinate_begin_index, coordinate_end_index),
2910 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911 int i = local_coordinate_permutations(j);
2912 if(mj_current_dim_coords(i) < running_min)
2913 running_min = mj_current_dim_coords(i);
2914 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2916 Kokkos::parallel_reduce(
"get max",
2917 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918 (coordinate_begin_index, coordinate_end_index),
2919 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920 int i = local_coordinate_permutations(j);
2921 if(mj_current_dim_coords(i) > running_max)
2922 running_max = mj_current_dim_coords(i);
2923 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924 if(bUniformWeights) {
2925 my_total_weight = coordinate_end_index - coordinate_begin_index;
2928 my_total_weight = 0;
2929 Kokkos::parallel_reduce(
"get weight",
2930 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931 (coordinate_begin_index, coordinate_end_index),
2932 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933 int i = local_coordinate_permutations(j);
2934 lsum += local_mj_weights(i,0);
2935 }, my_total_weight);
2940 Kokkos::parallel_for(
2941 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942 (0, 1), KOKKOS_LAMBDA (
int dummy) {
2943 local_process_local_min_max_coord_total_weight(kk) =
2945 local_process_local_min_max_coord_total_weight(
2946 kk + current_concurrent_num_parts) = my_max_coord;
2947 local_process_local_min_max_coord_total_weight(
2948 kk + 2*current_concurrent_num_parts) = my_total_weight;
2965template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2966 typename mj_part_t,
typename mj_node_t>
2967void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968 mj_node_t>::mj_get_global_min_max_coord_totW(
2969 mj_part_t current_concurrent_num_parts,
2970 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2975 if(this->comm->getSize() > 1) {
2978 auto host_local_min_max_total =
2979 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980 auto host_global_min_max_total =
2981 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2983 Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp<int, mj_scalar_t>
2984 reductionOp(current_concurrent_num_parts,
2985 current_concurrent_num_parts, current_concurrent_num_parts);
2987 reduceAll<int, mj_scalar_t>(
2990 3 * current_concurrent_num_parts,
2991 host_local_min_max_total.data(),
2992 host_global_min_max_total.data());
2995 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2998 mj_part_t s = 3 * current_concurrent_num_parts;
2999 Kokkos::parallel_for(
3000 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002 global_min_max_total(i) = local_min_max_total(i);
3039template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3040 typename mj_part_t,
typename mj_node_t>
3043 mj_scalar_t min_coord,
3044 mj_scalar_t max_coord,
3045 mj_part_t num_cuts ,
3046 mj_scalar_t global_weight,
3048 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3050 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051 std::vector <mj_part_t> *future_num_part_in_parts,
3052 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053 mj_part_t concurrent_current_part,
3054 mj_part_t obtained_part_index,
3055 mj_part_t num_target_first_level_parts,
3056 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3058 mj_scalar_t coord_range = max_coord - min_coord;
3065 bool bUniformPartsCheck =
3066 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3068 if(!bUniformPartsCheck) {
3069 bool bValidNonUniformTargetWeights =
3070 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071 if(!bValidNonUniformTargetWeights) {
3072 std::cerr <<
"MJ does not support non uniform part weights beyond the first partition" << std::endl;
3077 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078 "device_cumulative", num_cuts);
3079 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3081 mj_scalar_t cumulative = 0;
3083 if(bUniformPartsCheck) {
3085 mj_scalar_t total_future_part_count_in_part =
3086 static_cast<mj_scalar_t
>((*future_num_part_in_parts)[concurrent_current_part]);
3089 mj_scalar_t unit_part_weight =
3090 global_weight / total_future_part_count_in_part;
3092 for(mj_part_t i = 0; i < num_cuts; ++i) {
3093 cumulative += unit_part_weight *
static_cast<mj_scalar_t
>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094 host_cumulative(i) = cumulative;
3099 mj_scalar_t sum_target_first_level_dist = 0.0;
3100 for (
int i = 0; i < num_target_first_level_parts; ++i) {
3101 sum_target_first_level_dist += target_first_level_dist(i);
3104 for(mj_part_t i = 0; i < num_cuts; ++i) {
3105 cumulative += global_weight * target_first_level_dist(i) /
3106 sum_target_first_level_dist;
3107 host_cumulative(i) = cumulative;
3111 Kokkos::deep_copy(device_cumulative, host_cumulative);
3113 Kokkos::parallel_for(
"Write num in parts",
3114 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3117 current_target_part_weights(cut) = device_cumulative(cut);
3118 initial_cut_coords(cut) = min_coord +
3119 (coord_range * device_cumulative(cut)) / global_weight;
3121 current_target_part_weights(num_cuts) = global_weight;
3127 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128 Kokkos::parallel_for(
3129 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3131 KOKKOS_LAMBDA (mj_part_t i) {
3132 current_target_part_weights(i) =
3133 long(current_target_part_weights(i) + 0.5);
3154template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3155 typename mj_part_t,
typename mj_node_t>
3158 mj_scalar_t &max_coordinate,
3159 mj_scalar_t &min_coordinate,
3160 mj_lno_t coordinate_begin_index,
3161 mj_lno_t coordinate_end_index,
3162 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165 mj_part_t &partition_count)
3167 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3171 if(std::abs(coordinate_range) < this->sEpsilon ) {
3172 Kokkos::parallel_for(
3173 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174 (coordinate_begin_index, coordinate_end_index),
3175 KOKKOS_LAMBDA (mj_lno_t ii) {
3176 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3182 mj_scalar_t slice = coordinate_range / partition_count;
3183 Kokkos::parallel_for(
3184 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185 (coordinate_begin_index, coordinate_end_index),
3186 KOKKOS_LAMBDA (mj_lno_t ii) {
3187 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3189 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190 if(pp >= partition_count) {
3191 pp = partition_count - 1;
3193 mj_part_ids[iii] = 2 * pp;
3212template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3213 typename mj_part_t,
typename mj_node_t>
3214void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216 double used_imbalance_tolerance,
3217 mj_part_t current_work_part,
3218 mj_part_t current_concurrent_num_parts,
3219 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220 mj_part_t total_incomplete_cut_count,
3221 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3224 this->temp_cut_coords = current_cut_coordinates;
3226 Teuchos::MultiJaggedCombinedReductionOp<mj_part_t, mj_scalar_t>
3227 *reductionOp = NULL;
3229 bool bSingleProcess = (this->comm->getSize() == 1);
3231 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232 if(!bSingleProcess) {
3233 for(
size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234 temp[n] = host_num_partitioning_in_current_dim(n);
3236 reductionOp =
new Teuchos::MultiJaggedCombinedReductionOp
3237 <mj_part_t, mj_scalar_t>(
3240 current_concurrent_num_parts);
3243 auto local_cut_lower_bound_coordinates =
3244 cut_lower_bound_coordinates;
3245 auto local_cut_upper_bound_coordinates =
3246 cut_upper_bound_coordinates;
3247 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250 auto local_process_cut_line_weight_to_put_left =
3251 process_cut_line_weight_to_put_left;
3252 auto local_temp_cut_coords = temp_cut_coords;
3253 auto local_global_total_part_weight_left_right_closests =
3254 global_total_part_weight_left_right_closests;
3255 auto local_cut_coordinates_work_array =
3256 cut_coordinates_work_array;
3257 auto local_part_xadj = part_xadj;
3258 auto local_global_min_max_coord_total_weight =
3259 global_min_max_coord_total_weight;
3260 auto local_target_part_weights =
3261 target_part_weights;
3262 auto local_global_rectilinear_cut_weight =
3263 global_rectilinear_cut_weight;
3264 auto local_process_rectilinear_cut_weight =
3265 process_rectilinear_cut_weight;
3267 auto local_is_cut_line_determined = this->is_cut_line_determined;
3268 auto local_device_num_partitioning_in_current_dim =
3269 device_num_partitioning_in_current_dim;
3271 Kokkos::parallel_for(
3272 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273 KOKKOS_LAMBDA (
int dummy) {
3276 view_rectilinear_cut_count(0) = 0;
3277 view_total_reduction_size(0) = 0;
3281 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282 mj_part_t num_part_in_dim =
3283 local_device_num_partitioning_in_current_dim(current_work_part + i);
3284 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3287 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288 local_is_cut_line_determined(next) =
false;
3290 local_cut_lower_bound_coordinates(next) =
3291 local_global_min_max_coord_total_weight(i);
3293 local_cut_upper_bound_coordinates(next) =
3294 local_global_min_max_coord_total_weight(
3295 i + current_concurrent_num_parts);
3297 local_cut_upper_bound_weights(next) =
3298 local_global_min_max_coord_total_weight(
3299 i + 2 * current_concurrent_num_parts);
3300 local_cut_lower_bound_weights(next) = 0;
3301 if(local_distribute_points_on_cut_lines) {
3302 local_process_cut_line_weight_to_put_left(next) = 0;
3313 while (total_incomplete_cut_count != 0) {
3314 this->mj_1D_part_get_part_weights(
3315 current_concurrent_num_parts,
3317 mj_current_dim_coords,
3321 this->mj_combine_rightleft_and_weights(
3323 current_concurrent_num_parts);
3326 if(!bSingleProcess) {
3329 auto host_total_part_weight_left_right_closests =
3330 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331 total_part_weight_left_right_closests);
3332 auto host_global_total_part_weight_left_right_closests =
3333 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334 global_total_part_weight_left_right_closests);
3336 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337 total_part_weight_left_right_closests);
3339 size_t host_view_total_reduction_size;
3340 Kokkos::parallel_reduce(
"Read single",
3341 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342 KOKKOS_LAMBDA(
int dummy,
size_t & set_single) {
3343 set_single = view_total_reduction_size(0);
3344 }, host_view_total_reduction_size);
3346 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347 host_view_total_reduction_size,
3348 host_total_part_weight_left_right_closests.data(),
3349 host_global_total_part_weight_left_right_closests.data());
3350 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351 host_global_total_part_weight_left_right_closests);
3354 local_global_total_part_weight_left_right_closests =
3355 this->total_part_weight_left_right_closests;
3360 mj_part_t cut_shift = 0;
3364 size_t tlr_shift = 0;
3366 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367 save_initial_incomplete_cut_count(
"save_initial_incomplete_cut_count",
3368 current_concurrent_num_parts);
3370 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3372 mj_part_t num_parts =
3373 host_num_partitioning_in_current_dim(current_work_part + kk);
3375 mj_part_t num_cuts = num_parts - 1;
3376 size_t num_total_part = num_parts + size_t (num_cuts);
3381 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3383 if(kk_incomplete_cut_count == 0) {
3384 cut_shift += num_cuts;
3385 tlr_shift += (num_total_part + 2 * num_cuts);
3389 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390 Kokkos::subview(this->total_part_weight_left_right_closests,
3391 std::pair<mj_lno_t, mj_lno_t>(
3393 this->total_part_weight_left_right_closests.size()));
3395 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3397 local_global_total_part_weight_left_right_closests,
3398 std::pair<mj_lno_t, mj_lno_t>(
3400 local_global_total_part_weight_left_right_closests.size()));
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_global_left_closest_points =
3403 Kokkos::subview(current_global_tlr,
3404 std::pair<mj_lno_t, mj_lno_t>(
3406 current_global_tlr.size()));
3407 Kokkos::View<mj_scalar_t *, device_t>
3408 current_global_right_closest_points =
3409 Kokkos::subview(current_global_tlr,
3410 std::pair<mj_lno_t, mj_lno_t>(
3411 num_total_part + num_cuts,
3412 current_global_tlr.size()));
3413 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3416 Kokkos::View<bool *, device_t> current_cut_line_determined =
3417 Kokkos::subview(this->is_cut_line_determined,
3418 std::pair<mj_lno_t, mj_lno_t>(
3420 this->is_cut_line_determined.size()));
3421 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422 Kokkos::subview(local_target_part_weights,
3423 std::pair<mj_lno_t, mj_lno_t>(
3425 local_target_part_weights.size()));
3426 Kokkos::View<mj_scalar_t *, device_t>
3427 current_part_cut_line_weight_to_put_left =
3428 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429 std::pair<mj_lno_t, mj_lno_t>(
3431 local_process_cut_line_weight_to_put_left.size()));
3433 save_initial_incomplete_cut_count(kk) =
3434 kk_incomplete_cut_count;
3436 Kokkos::View<mj_scalar_t *, device_t>
3437 current_cut_lower_bound_weights =
3438 Kokkos::subview(local_cut_lower_bound_weights,
3439 std::pair<mj_lno_t, mj_lno_t>(
3441 local_cut_lower_bound_weights.size()));
3442 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443 Kokkos::subview(local_cut_upper_bound_weights,
3444 std::pair<mj_lno_t, mj_lno_t>(
3446 local_cut_upper_bound_weights.size()));
3447 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448 Kokkos::subview(local_cut_upper_bound_coordinates,
3449 std::pair<mj_lno_t, mj_lno_t>(
3451 local_cut_upper_bound_coordinates.size()));
3452 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453 Kokkos::subview(local_cut_lower_bound_coordinates,
3454 std::pair<mj_lno_t, mj_lno_t>(
3456 local_cut_lower_bound_coordinates.size()));
3459 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460 Kokkos::subview(this->temp_cut_coords,
3461 std::pair<mj_lno_t, mj_lno_t>(
3462 cut_shift, this->temp_cut_coords.size()));
3463 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464 Kokkos::subview(this->cut_coordinates_work_array,
3465 std::pair<mj_lno_t, mj_lno_t>(
3466 cut_shift, this->cut_coordinates_work_array.size()));
3468 this->mj_get_new_cut_coordinates(
3469 current_concurrent_num_parts,
3472 used_imbalance_tolerance,
3473 current_global_part_weights,
3474 current_local_part_weights,
3475 current_part_target_weights,
3476 current_cut_line_determined,
3477 sub_temp_cut_coords,
3478 current_cut_upper_bounds,
3479 current_cut_lower_bounds,
3480 current_global_left_closest_points,
3481 current_global_right_closest_points,
3482 current_cut_lower_bound_weights,
3483 current_cut_upper_weights,
3484 sub_cut_coordinates_work_array,
3485 current_part_cut_line_weight_to_put_left,
3486 view_rectilinear_cut_count);
3488 cut_shift += num_cuts;
3489 tlr_shift += (num_total_part + 2 * num_cuts);
3492 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493 mj_part_t iteration_complete_cut_count =
3494 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495 total_incomplete_cut_count -= iteration_complete_cut_count;
3498 Kokkos::parallel_for(
3499 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(
int n) {
3501 auto t = local_temp_cut_coords(n);
3502 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503 local_cut_coordinates_work_array(n) = t;
3511 if(current_cut_coordinates != local_temp_cut_coords) {
3512 Kokkos::parallel_for(
3513 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514 (0, 1), KOKKOS_LAMBDA(
int dummy) {
3516 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517 mj_part_t num_parts = -1;
3518 num_parts = local_device_num_partitioning_in_current_dim(
3519 current_work_part + i);
3520 mj_part_t num_cuts = num_parts - 1;
3521 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3527 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3536template<
class scalar_t>
3542 KOKKOS_INLINE_FUNCTION
3545 KOKKOS_INLINE_FUNCTION
3554#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3556template<
class policy_t,
class scalar_t,
class part_t>
3569 int mj_value_count_rightleft,
3570 int mj_value_count_weights) :
3577 KOKKOS_INLINE_FUNCTION
3582 KOKKOS_INLINE_FUNCTION
3585 dst.
ptr[n] += src.
ptr[n];
3590 if(src.
ptr[n] > dst.
ptr[n]) {
3593 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3594 dst.
ptr[n+1] = src.
ptr[n+1];
3599 KOKKOS_INLINE_FUNCTION
3602 dst.
ptr[n] += src.
ptr[n];
3607 if(src.
ptr[n] > dst.
ptr[n]) {
3610 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3611 dst.
ptr[n+1] = src.
ptr[n+1];
3632template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
3633 class device_t,
class array_t>
3638#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3661#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3662 Kokkos::View<double *, device_t> current_part_weights;
3663 Kokkos::View<scalar_t *, device_t> current_left_closest;
3664 Kokkos::View<scalar_t *, device_t> current_right_closest;
3669 array_t mj_max_scalar,
3670 part_t mj_concurrent_current_part,
3672 part_t mj_current_work_part,
3673 part_t mj_current_concurrent_num_parts,
3674 part_t mj_left_right_array_size,
3675 part_t mj_weight_array_size,
3676 Kokkos::View<index_t*, device_t> & mj_permutations,
3677 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678 Kokkos::View<scalar_t**, device_t> & mj_weights,
3679 Kokkos::View<part_t*, device_t> & mj_parts,
3680 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682 bool mj_uniform_weights0,
3684#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3698 value_count(mj_weight_array_size+mj_left_right_array_size),
3707#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708 ,current_part_weights(mj_current_part_weights),
3709 current_left_closest(mj_current_left_closest),
3710 current_right_closest(mj_current_right_closest)
3716#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3717 int result =
sizeof(array_t) *
3720 int result =
sizeof(array_t) *
3725 int remainder = result % 8;
3726 if(remainder != 0) {
3727 result += 8 - remainder;
3732 KOKKOS_INLINE_FUNCTION
3733#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3743 index_t num_working_points = all_end - all_begin;
3744 int num_teams = teamMember.league_size();
3746 index_t stride = num_working_points / num_teams;
3747 if((num_working_points % num_teams) > 0) {
3754 index_t begin = all_begin + stride * teamMember.league_rank();
3755 index_t end = begin + stride;
3760#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3764 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3768 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3778 teamMember.team_barrier();
3780 Kokkos::parallel_for(
3781 Kokkos::TeamThreadRange(teamMember, begin, end),
3788 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3801 Kokkos::parallel_reduce(
3802 Kokkos::TeamThreadRange(teamMember, begin, end),
3803#
if (__cplusplus > 201703L)
3815 index_t part =
parts(i)/2;
3826#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3827 Kokkos::atomic_add(&shared_ptr[part*2], w);
3829 threadSum.
ptr[part*2] += w;
3835#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3836 array_t new_value = (array_t) coord;
3838 while(new_value < prev_value) {
3839 prev_value = Kokkos::atomic_compare_exchange(
3841 prev_value, new_value);
3844 while(new_value > prev_value) {
3845 prev_value = Kokkos::atomic_compare_exchange(
3847 prev_value, new_value);
3865 if(coord < b + sEpsilon && coord > b -
sEpsilon) {
3869#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3870 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3874 threadSum.
ptr[part*2+1] += w;
3879 parts(i) = part*2+1;
3891 if(delta < 0) delta = -delta;
3896#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3897 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3901 threadSum.
ptr[part*2+1] += w;
3913 if(delta < 0) delta = -delta;
3918#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3919 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3923 threadSum.
ptr[part*2+1] += w;
3948 if(part == lower + 1) {
3953 part -= (part - lower)/2;
3956 else if(part == upper - 1) {
3961 part += (upper - part)/2;
3965#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3968 }, arraySumReducer);
3971 teamMember.team_barrier();
3974#if (__cplusplus > 201703L)
3975 Kokkos::single(Kokkos::PerTeam(teamMember), [=,
this] () {
3977 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3980#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3981 Kokkos::atomic_add(¤t_part_weights(n),
3982 static_cast<double>(shared_ptr[n]));
3984 teamSum[n] += array.
ptr[n];
3988#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3989 int insert_left = 0;
3990 int insert_right = 0;
3995#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3996 scalar_t new_value = shared_ptr[n+1];
3997 scalar_t prev_value = current_right_closest(insert_right);
3998 while(new_value < prev_value) {
3999 prev_value = Kokkos::atomic_compare_exchange(
4000 ¤t_right_closest(insert_right), prev_value, new_value);
4003 new_value = shared_ptr[n];
4004 prev_value = current_left_closest(insert_left);
4005 while(new_value > prev_value) {
4006 prev_value = Kokkos::atomic_compare_exchange(
4007 ¤t_left_closest(insert_left), prev_value, new_value);
4013 if(array.
ptr[n] > teamSum[n]) {
4014 teamSum[n] = array.
ptr[n];
4016 if(array.
ptr[n+1] < teamSum[n+1]) {
4017 teamSum[n+1] = array.
ptr[n+1];
4023 teamMember.team_barrier();
4026#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4027 KOKKOS_INLINE_FUNCTION
4035 if(src[n] > dst[n]) {
4038 if(src[n+1] < dst[n+1]) {
4039 dst[n+1] = src[n+1];
4065template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4066 typename mj_part_t,
typename mj_node_t>
4067void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4068 mj_1D_part_get_part_weights(
4071 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4074 auto local_is_cut_line_determined = is_cut_line_determined;
4075 auto local_thread_part_weights = thread_part_weights;
4076 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4077 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4081 auto local_sEpsilon = this->sEpsilon;
4082 auto local_assigned_part_ids = this->assigned_part_ids;
4083 auto local_coordinate_permutations = this->coordinate_permutations;
4084 auto local_mj_weights = this->mj_weights;
4085 auto local_part_xadj = this->part_xadj;
4086 auto local_global_min_max_coord_total_weight =
4087 this->global_min_max_coord_total_weight;
4089 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4091 auto local_device_num_partitioning_in_current_dim =
4092 device_num_partitioning_in_current_dim;
4094 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4095 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4097 mj_part_t total_part_shift = 0;
4099 mj_part_t concurrent_cut_shifts = 0;
4101 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4102 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4103 concurrent_cut_shifts, temp_cut_coords.size()));
4105 mj_part_t num_parts =
4107 mj_part_t
num_cuts = num_parts - 1;
4108 mj_part_t total_part_count = num_parts +
num_cuts;
4109 mj_part_t weight_array_length =
num_cuts + num_parts;
4112 mj_part_t right_left_array_length = (
num_cuts + 2) * 2;
4114 if(this->incomplete_cut_count(kk) == 0) {
4115 total_part_shift += total_part_count;
4121 auto policy_ReduceWeightsFunctor = policy_t(
4122 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4124#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4125 int total_array_length =
4126 weight_array_length + right_left_array_length;
4133 typedef mj_scalar_t array_t;
4135#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4136 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", total_array_length);
4139 int offset_cuts = 0;
4140 for(
int kk2 = 0; kk2 < kk; ++kk2) {
4144 Kokkos::View<double *, device_t> my_current_part_weights =
4145 Kokkos::subview(local_thread_part_weights,
4146 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4147 total_part_shift + total_part_count));
4148 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4149 Kokkos::subview(local_thread_cut_left_closest_point,
4150 std::pair<mj_lno_t, mj_lno_t>(
4152 local_thread_cut_left_closest_point.size()));
4153 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4154 Kokkos::subview(local_thread_cut_right_closest_point,
4155 std::pair<mj_lno_t, mj_lno_t>(
4157 local_thread_cut_right_closest_point.size()));
4159 array_t
max_scalar = std::numeric_limits<array_t>::max();
4161#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4163 Kokkos::parallel_for(
4164 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4165 KOKKOS_LAMBDA (
int dummy) {
4166 for(
int n = 0; n < weight_array_length; ++n) {
4167 my_current_part_weights(n) = 0;
4169 for(
int n = 0; n <
num_cuts; ++n) {
4180 typename mj_node_t::device_type, array_t>
4188 right_left_array_length,
4189 weight_array_length,
4190 coordinate_permutations,
4191 mj_current_dim_coords,
4194 local_temp_cut_coords,
4196 mj_uniform_weights(0),
4198#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4199 ,my_current_part_weights,
4200 my_current_left_closest,
4201 my_current_right_closest
4205#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4206 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4208 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4209 teamFunctor, reduce_array);
4213#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4214 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4216 for(
int i = 0; i < static_cast<int>(total_part_count); ++i) {
4217 hostArray(i) = reduce_array[i];
4220 Kokkos::deep_copy(my_current_part_weights, hostArray);
4222 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4223 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4224 for(mj_part_t cut = 0; cut <
num_cuts; ++cut) {
4225 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4226 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4228 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4229 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4232 total_part_shift += total_part_count;
4236 auto local_temp_cut_coords = temp_cut_coords;
4238 Kokkos::parallel_for(
4239 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4241 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4243 mj_part_t
num_cuts = num_parts - 1;
4244 mj_part_t total_part_count = num_parts +
num_cuts;
4246 if(local_device_incomplete_cut_count(kk) > 0) {
4250 size_t offset_cuts = 0;
4251 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4252 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4254 offset += num_parts_kk2 * 2 - 1;
4255 offset_cuts += num_parts_kk2 - 1;
4258 for(mj_part_t i = 1; i < total_part_count; ++i) {
4262 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4263 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4264 local_temp_cut_coords(offset_cuts + i /2 - 1))
4269 local_thread_part_weights(offset + i)
4270 = local_thread_part_weights(offset + i-2);
4275 local_thread_part_weights(offset + i) +=
4276 local_thread_part_weights(offset + i-1);
4289template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4290 typename mj_part_t,
typename mj_node_t>
4291void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4292 mj_combine_rightleft_and_weights(
4296 auto local_thread_part_weights = this->thread_part_weights;
4297 auto local_is_cut_line_determined = this->is_cut_line_determined;
4298 auto local_thread_cut_left_closest_point =
4299 this->thread_cut_left_closest_point;
4300 auto local_thread_cut_right_closest_point =
4301 this->thread_cut_right_closest_point;
4302 auto local_total_part_weight_left_right_closests =
4303 this->total_part_weight_left_right_closests;
4304 auto local_device_num_partitioning_in_current_dim =
4305 device_num_partitioning_in_current_dim;
4306 Kokkos::parallel_for(
4307 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4308 KOKKOS_LAMBDA (
int dummy) {
4310 size_t tlr_array_shift = 0;
4311 mj_part_t cut_shift = 0;
4312 size_t total_part_array_shift = 0;
4318 mj_part_t num_parts_in_part =
4320 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4321 size_t num_total_part_in_part =
4322 num_parts_in_part + size_t (num_cuts_in_part);
4325 for(
int ii = 0; ii < num_cuts_in_part; ++ii) {
4326 mj_part_t next = tlr_array_shift + ii;
4327 mj_part_t cut_index = cut_shift + ii;
4329 if(!local_is_cut_line_determined(cut_index)) {
4330 mj_scalar_t left_closest_in_process =
4331 local_thread_cut_left_closest_point(cut_index);
4332 mj_scalar_t right_closest_in_process =
4333 local_thread_cut_right_closest_point(cut_index);
4336 local_total_part_weight_left_right_closests(
4337 num_total_part_in_part + next) = left_closest_in_process;
4339 local_total_part_weight_left_right_closests(
4340 num_total_part_in_part + num_cuts_in_part + next) =
4341 right_closest_in_process;
4345 for(
size_t j = 0; j < num_total_part_in_part; ++j) {
4346 mj_part_t cut_ind = j / 2 + cut_shift;
4352 if(j == num_total_part_in_part - 1 ||
4353 !local_is_cut_line_determined(cut_ind)) {
4354 double pwj = local_thread_part_weights(total_part_array_shift + j);
4355 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4360 cut_shift += num_cuts_in_part;
4361 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4362 total_part_array_shift += num_total_part_in_part;
4379template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4380 typename mj_part_t,
typename mj_node_t>
4381KOKKOS_INLINE_FUNCTION
4382void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4383 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4384 mj_scalar_t cut_lower_bound,
4385 mj_scalar_t cut_upper_weight,
4386 mj_scalar_t cut_lower_weight,
4387 mj_scalar_t expected_weight,
4388 mj_scalar_t &new_cut_position,
4391 if(std::abs(cut_upper_bound - cut_lower_bound) <
sEpsilon) {
4392 new_cut_position = cut_upper_bound;
4395 if(std::abs(cut_upper_weight - cut_lower_weight) <
sEpsilon) {
4396 new_cut_position = cut_lower_bound;
4399 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4400 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4401 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4403 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4404 int scale_constant = 20;
4405 int shiftint= int (required_shift * scale_constant);
4406 if(shiftint == 0) shiftint = 1;
4407 required_shift = mj_scalar_t (shiftint) / scale_constant;
4408 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4411#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4413template<
class policy_t,
class scalar_t>
4423 int mj_value_count) :
4428 KOKKOS_INLINE_FUNCTION
4433 KOKKOS_INLINE_FUNCTION
4436 dst.
ptr[n] += src.
ptr[n];
4450template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
4451 class device_t,
class array_t>
4456#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4468#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4469 Kokkos::View<int *, device_t> local_point_counts;
4473 part_t mj_concurrent_current_part,
4474 part_t mj_weight_array_size,
4475 Kokkos::View<index_t*, device_t> & mj_permutations,
4476 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4477 Kokkos::View<part_t*, device_t> & mj_parts,
4478 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4479 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4480#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4481 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4491#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4492 ,local_point_counts(mj_local_point_counts)
4498#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4501 int result =
sizeof(array_t) * (
value_count) * team_size;
4505 int remainder = result % 8;
4506 if(remainder != 0) {
4507 result += 8 - remainder;
4512 KOKKOS_INLINE_FUNCTION
4513#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4514 void operator() (
const member_type & teamMember)
const {
4522 index_t num_working_points = all_end - all_begin;
4523 int num_teams = teamMember.league_size();
4525 index_t stride = num_working_points / num_teams;
4526 if((num_working_points % num_teams) > 0) {
4530 index_t begin = all_begin + stride * teamMember.league_rank();
4531 index_t end = begin + stride;
4539#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4540 size_t sh_mem_size =
sizeof(array_t) * (
value_count);
4542 size_t sh_mem_size =
4543 sizeof(array_t) * (
value_count) * teamMember.team_size();
4546 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4549#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4551 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4556 teamMember.team_barrier();
4558 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4568 Kokkos::parallel_reduce(
4569 Kokkos::TeamThreadRange(teamMember, begin, end),
4570#
if (__cplusplus > 201703L)
4580 if(place % 2 == 0) {
4581#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4582 Kokkos::atomic_add(&shared_ptr[part], 1);
4584 threadSum.
ptr[part] += 1;
4587 parts(coordinate_index) = part;
4592 index_t set_index = Kokkos::atomic_fetch_add(
4596#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4602 teamMember.team_barrier();
4605#if (__cplusplus > 201703L)
4606 Kokkos::single(Kokkos::PerTeam(teamMember), [=,
this] () {
4608 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4611#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4612 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4614 teamSum[n] += array.
ptr[n];
4619 teamMember.team_barrier();
4622#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4624 KOKKOS_INLINE_FUNCTION
4654template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4655 typename mj_part_t,
typename mj_node_t>
4656void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4657mj_create_new_partitions(
4658 mj_part_t num_parts,
4659 mj_part_t current_concurrent_work_part,
4660 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4661 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4662 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4663 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4666 auto local_thread_part_weight_work = this->thread_part_weight_work;
4667 auto local_point_counts = this->thread_point_counts;
4668 auto local_distribute_points_on_cut_lines =
4669 this->distribute_points_on_cut_lines;
4670 auto local_thread_cut_line_weight_to_put_left =
4671 this->thread_cut_line_weight_to_put_left;
4672 auto local_sEpsilon = this->sEpsilon;
4673 auto local_coordinate_permutations = this->coordinate_permutations;
4674 auto local_mj_weights = this->mj_weights;
4675 auto local_assigned_part_ids = this->assigned_part_ids;
4676 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4678 mj_part_t num_cuts = num_parts - 1;
4680 Kokkos::parallel_for(
4681 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4682 KOKKOS_LAMBDA(
int dummy) {
4684 if(local_distribute_points_on_cut_lines) {
4685 for(
int i = 0; i < num_cuts; ++i) {
4686 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4687 if(left_weight > local_sEpsilon) {
4689 mj_scalar_t thread_ii_weight_on_cut =
4690 local_thread_part_weight_work(i * 2 + 1) -
4691 local_thread_part_weight_work(i * 2);
4693 if(thread_ii_weight_on_cut < left_weight) {
4695 local_thread_cut_line_weight_to_put_left(i) =
4696 thread_ii_weight_on_cut;
4700 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4702 left_weight -= thread_ii_weight_on_cut;
4705 local_thread_cut_line_weight_to_put_left(i) = 0;
4711 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4712 if(std::abs(current_concurrent_cut_coordinate(i) -
4713 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4714 local_thread_cut_line_weight_to_put_left(i) -=
4715 local_thread_cut_line_weight_to_put_left(i - 1);
4717 local_thread_cut_line_weight_to_put_left(i) =
4718 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4719 least_signifiance) * significance_mul) /
4720 static_cast<mj_scalar_t
>(significance_mul);
4724 for(mj_part_t i = 0; i < num_parts; ++i) {
4725 local_point_counts(i) = 0;
4729 mj_lno_t coordinate_begin_index =
4730 current_concurrent_work_part == 0 ? 0 :
4731 host_part_xadj(current_concurrent_work_part - 1);
4732 mj_lno_t coordinate_end_index =
4733 host_part_xadj(current_concurrent_work_part);
4735 mj_lno_t total_on_cut;
4736 Kokkos::parallel_reduce(
"Get total_on_cut",
4737 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4738 coordinate_begin_index, coordinate_end_index),
4739 KOKKOS_LAMBDA(
int ii, mj_lno_t & val) {
4740 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4741 mj_part_t coordinate_assigned_place =
4742 local_assigned_part_ids(coordinate_index);
4743 if(coordinate_assigned_place % 2 == 1) {
4748 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4749 if(total_on_cut > 0) {
4750 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4761 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4764 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4766 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4767 typedef int array_t;
4771#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4772 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", num_parts);
4775 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4776 typename mj_node_t::device_type, array_t>teamFunctor(
4777 current_concurrent_work_part,
4779 coordinate_permutations,
4780 mj_current_dim_coords,
4784#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4789#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4790 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4792 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4796#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4797 for(mj_part_t part = 0; part < num_parts; ++part) {
4798 local_point_counts(part) = reduce_array[part];
4804 if(track_on_cuts.size() > 0) {
4805 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4806 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1));
4807 Kokkos::sort(track_on_cuts_sort);
4810 bool uniform_weights0 = this->mj_uniform_weights(0);
4811 Kokkos::parallel_for(
4812 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4813 KOKKOS_LAMBDA (
int dummy) {
4815 for(
int j = 0; j < total_on_cut; ++j) {
4816 int ii = track_on_cuts(j);
4817 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4818 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4819 local_mj_weights(coordinate_index,0);
4820 mj_part_t coordinate_assigned_place =
4821 local_assigned_part_ids(coordinate_index);
4822 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4824 if(local_distribute_points_on_cut_lines &&
4825 local_thread_cut_line_weight_to_put_left(
4826 coordinate_assigned_part) > local_sEpsilon) {
4830 local_thread_cut_line_weight_to_put_left(
4831 coordinate_assigned_part) -= coordinate_weight;
4836 if(local_thread_cut_line_weight_to_put_left(
4837 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4839 std::abs(current_concurrent_cut_coordinate(
4840 coordinate_assigned_part+1) -
4841 current_concurrent_cut_coordinate(
4842 coordinate_assigned_part)) < local_sEpsilon)
4844 local_thread_cut_line_weight_to_put_left(
4845 coordinate_assigned_part + 1) +=
4846 local_thread_cut_line_weight_to_put_left(
4847 coordinate_assigned_part);
4849 ++local_point_counts(coordinate_assigned_part);
4850 local_assigned_part_ids(coordinate_index) =
4851 coordinate_assigned_part;
4856 ++coordinate_assigned_part;
4859 while(local_distribute_points_on_cut_lines &&
4860 coordinate_assigned_part < num_cuts)
4863 if(std::abs(current_concurrent_cut_coordinate(
4864 coordinate_assigned_part) -
4865 current_concurrent_cut_coordinate(
4866 coordinate_assigned_part - 1)) < local_sEpsilon)
4869 if(local_thread_cut_line_weight_to_put_left(
4870 coordinate_assigned_part) > local_sEpsilon &&
4871 local_thread_cut_line_weight_to_put_left(
4872 coordinate_assigned_part) >=
4873 std::abs(local_thread_cut_line_weight_to_put_left(
4874 coordinate_assigned_part) - coordinate_weight))
4876 local_thread_cut_line_weight_to_put_left(
4877 coordinate_assigned_part) -= coordinate_weight;
4881 if(local_thread_cut_line_weight_to_put_left(
4882 coordinate_assigned_part) < 0 &&
4883 coordinate_assigned_part < num_cuts - 1 &&
4884 std::abs(current_concurrent_cut_coordinate(
4885 coordinate_assigned_part+1) -
4886 current_concurrent_cut_coordinate(
4887 coordinate_assigned_part)) < local_sEpsilon)
4889 local_thread_cut_line_weight_to_put_left(
4890 coordinate_assigned_part + 1) +=
4891 local_thread_cut_line_weight_to_put_left(
4892 coordinate_assigned_part);
4900 ++coordinate_assigned_part;
4902 local_point_counts(coordinate_assigned_part) += 1;
4903 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4907 for(
int j = 0; j < num_parts; ++j) {
4908 out_part_xadj(j) = local_point_counts(j);
4909 local_point_counts(j) = 0;
4912 out_part_xadj(j) += out_part_xadj(j - 1);
4913 local_point_counts(j) += out_part_xadj(j - 1);
4921#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4926 Kokkos::parallel_for(
4927 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4928 coordinate_begin_index, coordinate_end_index),
4929 KOKKOS_LAMBDA (mj_lno_t ii) {
4930 mj_lno_t i = local_coordinate_permutations(ii);
4931 mj_part_t p = local_assigned_part_ids(i);
4932 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4933 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4938#ifdef KOKKOS_ENABLE_OPENMP
4940 const int num_threads = 1;
4942 const int num_threads = 1;
4945 const int num_teams = 1;
4948 Kokkos::View<mj_lno_t*, device_t>
4949 point_counter(
"insert indices", num_teams * num_threads * num_parts);
4953 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4954 block_policy(num_teams, num_threads);
4955 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4956 member_type member_type;
4957 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4958 mj_lno_t block_size = range / num_teams + 1;
4959 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4960 int team = team_member.league_rank();
4961 int team_offset = team * num_threads * num_parts;
4962 mj_lno_t begin = coordinate_begin_index + team * block_size;
4963 mj_lno_t end = begin + block_size;
4964 if(end > coordinate_end_index) {
4965 end = coordinate_end_index;
4968 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4970 int thread = team_member.team_rank();
4971 mj_lno_t i = local_coordinate_permutations(ii);
4972 mj_part_t p = local_assigned_part_ids(i);
4973 int index = team_offset + thread * num_parts + p;
4974 ++point_counter(index);
4982 Kokkos::parallel_for(
4983 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4984 KOKKOS_LAMBDA (
int dummy) {
4985 int num_sets = point_counter.size() / num_parts;
4986 for(
int set = num_sets - 1; set >= 1; set -=1) {
4987 int base = set * num_parts;
4988 for(
int part = 0; part < num_parts; ++part) {
4989 point_counter(base + part) = point_counter(base + part - num_parts);
4993 for(
int part = 0; part < num_parts; ++part) {
4994 point_counter(part) = 0;
4997 for(
int set = 1; set < num_sets; ++set) {
4998 int base = set * num_parts;
4999 for(
int part = 0; part < num_parts; ++part) {
5000 point_counter(base + part) += point_counter(base + part - num_parts);
5006 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
5007 int team = team_member.league_rank();
5008 int team_offset = team * num_threads * num_parts;
5009 mj_lno_t begin = coordinate_begin_index + team * block_size;
5010 mj_lno_t end = begin + block_size;
5011 if(end > coordinate_end_index) {
5012 end = coordinate_end_index;
5014 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5016 int thread = team_member.team_rank();
5017 mj_lno_t i = local_coordinate_permutations(ii);
5018 mj_part_t p = local_assigned_part_ids(i);
5019 int index = team_offset + thread * num_parts + p;
5020 int set_counter = (point_counter(index)++) + local_point_counts(p);
5021 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5070template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5071 typename mj_part_t,
typename mj_node_t>
5072void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5073 mj_node_t>::mj_get_new_cut_coordinates(
5074 mj_part_t current_concurrent_num_parts,
5076 const mj_part_t &num_cuts,
5077 const double &used_imbalance_tolerance,
5078 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5079 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5080 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5081 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5082 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5083 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5084 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5085 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5086 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5087 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5088 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5089 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5090 Kokkos::View<mj_scalar_t *, device_t> &
5091 current_part_cut_line_weight_to_put_left,
5092 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5094 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5096 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5097 auto local_sEpsilon = sEpsilon;
5098 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5099 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5100 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5101 auto local_global_min_max_coord_total_weight =
5102 global_min_max_coord_total_weight;
5104 const auto _sEpsilon = this->sEpsilon;
5112 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5113 policy_one_team(1, Kokkos::AUTO());
5114 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5115 member_type member_type;
5116 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5118 mj_scalar_t min_coordinate =
5119 local_global_min_max_coord_total_weight(kk);
5120 mj_scalar_t max_coordinate =
5121 local_global_min_max_coord_total_weight(
5122 kk + current_concurrent_num_parts);
5123 mj_scalar_t global_total_weight =
5124 local_global_min_max_coord_total_weight(
5125 kk + current_concurrent_num_parts * 2);
5127 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5132 current_global_left_closest_points(i) > local_sEpsilon) {
5133 current_global_left_closest_points(i) =
5134 current_cut_coordinates(i);
5136 if(current_global_right_closest_points(i) -
5137 max_coordinate > local_sEpsilon) {
5138 current_global_right_closest_points(i) =
5139 current_cut_coordinates(i);
5142 team_member.team_barrier();
5144 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5146 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5149 mj_scalar_t seen_weight_in_part = 0;
5151 mj_scalar_t expected_weight_in_part = 0;
5153 double imbalance_on_left = 0, imbalance_on_right = 0;
5154 if(local_distribute_points_on_cut_lines) {
5156 local_global_rectilinear_cut_weight(i) = 0;
5157 local_process_rectilinear_cut_weight(i) = 0;
5159 bool bContinue =
false;
5162 if(current_cut_line_determined(i)) {
5163 new_current_cut_coordinates(i) =
5164 current_cut_coordinates(i);
5169 seen_weight_in_part = current_global_part_weights(i * 2);
5172 expected_weight_in_part = current_part_target_weights(i);
5175 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5176 expected_weight_in_part);
5179 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5180 seen_weight_in_part, global_total_weight - expected_weight_in_part);
5181 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5182 used_imbalance_tolerance < local_sEpsilon ;
5183 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5184 used_imbalance_tolerance < local_sEpsilon;
5186 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5187 current_cut_line_determined(i) =
true;
5188 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5189 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5191 else if(imbalance_on_left < 0) {
5193 if(local_distribute_points_on_cut_lines) {
5198 if(current_global_part_weights(i * 2 + 1) ==
5199 expected_weight_in_part) {
5201 current_cut_line_determined(i) =
true;
5202 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5205 new_current_cut_coordinates(i) =
5206 current_cut_coordinates(i);
5208 current_part_cut_line_weight_to_put_left(i) =
5209 current_local_part_weights(i * 2 + 1) -
5210 current_local_part_weights(i * 2);
5213 else if(current_global_part_weights(i * 2 + 1) >
5214 expected_weight_in_part) {
5217 current_cut_line_determined(i) =
true;
5218 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5222 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5223 new_current_cut_coordinates(i) =
5224 current_cut_coordinates(i);
5225 local_process_rectilinear_cut_weight[i] =
5226 current_local_part_weights(i * 2 + 1) -
5227 current_local_part_weights(i * 2);
5236 current_cut_lower_bounds(i) =
5237 current_global_right_closest_points(i);
5240 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5245 for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5246 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5247 mj_scalar_t line_weight =
5248 current_global_part_weights(ii * 2 + 1);
5249 if(p_weight >= expected_weight_in_part) {
5254 if(p_weight == expected_weight_in_part) {
5255 current_cut_upper_bounds(i) =
5256 current_cut_coordinates(ii);
5257 current_cut_upper_weights(i) = p_weight;
5258 current_cut_lower_bounds(i) =
5259 current_cut_coordinates(ii);
5260 current_cut_lower_bound_weights(i) = p_weight;
5261 }
else if(p_weight < current_cut_upper_weights(i)) {
5264 current_cut_upper_bounds(i) =
5265 current_global_left_closest_points(ii);
5266 current_cut_upper_weights(i) = p_weight;
5272 if(line_weight >= expected_weight_in_part) {
5276 current_cut_upper_bounds(i) =
5277 current_cut_coordinates(ii);
5278 current_cut_upper_weights(i) = line_weight;
5279 current_cut_lower_bounds(i) =
5280 current_cut_coordinates(ii);
5281 current_cut_lower_bound_weights(i) = p_weight;
5286 if(p_weight <= expected_weight_in_part && p_weight >=
5287 current_cut_lower_bound_weights(i)) {
5288 current_cut_lower_bounds(i) =
5289 current_global_right_closest_points(ii);
5290 current_cut_lower_bound_weights(i) = p_weight;
5294 mj_scalar_t new_cut_position = 0;
5295 algMJ_t::mj_calculate_new_cut_position(
5296 current_cut_upper_bounds(i),
5297 current_cut_lower_bounds(i),
5298 current_cut_upper_weights(i),
5299 current_cut_lower_bound_weights(i),
5300 expected_weight_in_part, new_cut_position,
5305 if(std::abs(current_cut_coordinates(i) -
5306 new_cut_position) < local_sEpsilon) {
5307 current_cut_line_determined(i) =
true;
5308 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5311 new_current_cut_coordinates(i) =
5312 current_cut_coordinates(i);
5314 new_current_cut_coordinates(i) = new_cut_position;
5320 current_cut_upper_bounds(i) =
5321 current_global_left_closest_points(i);
5322 current_cut_upper_weights(i) =
5323 seen_weight_in_part;
5326 for(
int ii = i - 1; ii >= 0; --ii) {
5327 mj_scalar_t p_weight =
5328 current_global_part_weights(ii * 2);
5329 mj_scalar_t line_weight =
5330 current_global_part_weights(ii * 2 + 1);
5331 if(p_weight <= expected_weight_in_part) {
5332 if(p_weight == expected_weight_in_part) {
5335 current_cut_upper_bounds(i) =
5336 current_cut_coordinates(ii);
5337 current_cut_upper_weights(i) = p_weight;
5338 current_cut_lower_bounds(i) =
5339 current_cut_coordinates(ii);
5340 current_cut_lower_bound_weights(i) = p_weight;
5342 else if(p_weight > current_cut_lower_bound_weights(i)) {
5345 current_cut_lower_bounds(i) =
5346 current_global_right_closest_points(ii);
5347 current_cut_lower_bound_weights(i) = p_weight;
5353 if(line_weight > expected_weight_in_part) {
5354 current_cut_upper_bounds(i) =
5355 current_global_right_closest_points(ii);
5356 current_cut_upper_weights(i) = line_weight;
5366 if(p_weight >= expected_weight_in_part &&
5367 (p_weight < current_cut_upper_weights(i) ||
5368 (p_weight == current_cut_upper_weights(i) &&
5369 current_cut_upper_bounds(i) >
5370 current_global_left_closest_points(ii)))) {
5371 current_cut_upper_bounds(i) =
5372 current_global_left_closest_points(ii);
5373 current_cut_upper_weights(i) = p_weight;
5376 mj_scalar_t new_cut_position = 0;
5377 algMJ_t::mj_calculate_new_cut_position(
5378 current_cut_upper_bounds(i),
5379 current_cut_lower_bounds(i),
5380 current_cut_upper_weights(i),
5381 current_cut_lower_bound_weights(i),
5382 expected_weight_in_part,
5387 if(std::abs(current_cut_coordinates(i) -
5388 new_cut_position) < local_sEpsilon) {
5389 current_cut_line_determined(i) =
true;
5390 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5392 new_current_cut_coordinates(i) =
5393 current_cut_coordinates(i);
5395 new_current_cut_coordinates(i) =
5402 team_member.team_barrier();
5406 mj_part_t rectilinear_cut_count;
5407 Kokkos::parallel_reduce(
"Read bDoingWork",
5408 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5409 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
5410 set_single = view_rectilinear_cut_count(0);
5411 }, rectilinear_cut_count);
5413 if(rectilinear_cut_count > 0) {
5414 auto host_local_process_rectilinear_cut_weight =
5415 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5416 local_process_rectilinear_cut_weight);
5417 auto host_local_global_rectilinear_cut_weight =
5418 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5419 local_global_rectilinear_cut_weight);
5420 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5421 local_process_rectilinear_cut_weight);
5422 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5423 local_global_rectilinear_cut_weight);
5424 Teuchos::scan<int,mj_scalar_t>(
5425 *comm, Teuchos::REDUCE_SUM,
5427 host_local_process_rectilinear_cut_weight.data(),
5428 host_local_global_rectilinear_cut_weight.data());
5429 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5430 host_local_process_rectilinear_cut_weight);
5431 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5432 host_local_global_rectilinear_cut_weight);
5434 Kokkos::parallel_for(
"finish up mj_get_new_cut_coordinates",
5435 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5436 KOKKOS_LAMBDA(
int dummy) {
5437 for(mj_part_t i = 0; i < num_cuts; ++i) {
5439 if(local_global_rectilinear_cut_weight(i) > 0) {
5441 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5443 mj_scalar_t necessary_weight_on_line_for_left =
5444 expected_part_weight - current_global_part_weights(i * 2);
5447 mj_scalar_t my_weight_on_line =
5448 local_process_rectilinear_cut_weight(i);
5452 mj_scalar_t weight_on_line_upto_process_inclusive =
5453 local_global_rectilinear_cut_weight(i);
5457 mj_scalar_t space_to_put_left =
5458 necessary_weight_on_line_for_left -
5459 weight_on_line_upto_process_inclusive;
5462 mj_scalar_t space_left_to_me =
5463 space_to_put_left + my_weight_on_line;
5476 if(space_left_to_me < 0) {
5479 current_part_cut_line_weight_to_put_left(i) = 0;
5481 else if(space_left_to_me >= my_weight_on_line) {
5485 current_part_cut_line_weight_to_put_left(i) =
5492 current_part_cut_line_weight_to_put_left(i) =
5499 view_rectilinear_cut_count(0) = 0;
5503 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5515template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5516 typename mj_part_t,
typename mj_node_t>
5517void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5518 get_processor_num_points_in_parts(
5519 mj_part_t num_procs,
5520 mj_part_t num_parts,
5521 mj_gno_t *&num_points_in_all_processor_parts)
5524 size_t allocation_size = num_parts * (num_procs + 1);
5531 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5532 new mj_gno_t[allocation_size];
5536 mj_gno_t *my_local_points_to_reduce_sum =
5537 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5541 mj_gno_t *my_local_point_counts_in_each_part =
5542 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5545 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5546 sizeof(mj_gno_t)*allocation_size);
5548 auto local_new_part_xadj = this->new_part_xadj;
5549 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5550 Kokkos::ViewAllocateWithoutInitializing(
"points per part"), num_parts);
5551 Kokkos::parallel_for(
"get vals on device",
5552 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5553 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5554 points_per_part(i) =
5555 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5557 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5558 Kokkos::deep_copy(host_points_per_part, points_per_part);
5559 for(
int i = 0; i < num_parts; ++i) {
5560 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5565 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5566 sizeof(mj_gno_t) * (num_parts) );
5573 reduceAll<int, mj_gno_t>(
5575 Teuchos::REDUCE_SUM,
5577 num_local_points_in_each_part_to_reduce_sum,
5578 num_points_in_all_processor_parts);
5582 delete [] num_local_points_in_each_part_to_reduce_sum;
5600template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5601 typename mj_part_t,
typename mj_node_t>
5602bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5603 mj_check_to_migrate(
5604 size_t migration_reduce_all_population,
5605 mj_lno_t num_coords_for_last_dim_part,
5606 mj_part_t num_procs,
5607 mj_part_t num_parts,
5608 mj_gno_t *num_points_in_all_processor_parts)
5611 if(migration_reduce_all_population > future_reduceall_cutoff) {
5616 if(num_coords_for_last_dim_part < min_work_last_dim) {
5621 if(this->check_migrate_avoid_migration_option == 0) {
5622 double global_imbalance = 0;
5624 size_t global_shift = num_procs * num_parts;
5626 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5627 for(mj_part_t i = 0; i < num_parts; ++i) {
5628 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5629 / double(num_procs);
5631 global_imbalance += std::abs(ideal_num -
5632 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5635 global_imbalance /= num_parts;
5636 global_imbalance /= num_procs;
5638 if(global_imbalance <= this->minimum_migration_imbalance) {
5664template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5665 typename mj_part_t,
typename mj_node_t>
5666void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5667 assign_send_destinations(
5668 mj_part_t num_parts,
5669 mj_part_t *part_assignment_proc_begin_indices,
5670 mj_part_t *processor_chains_in_parts,
5671 mj_lno_t *send_count_to_each_proc,
5672 int *coordinate_destinations) {
5674 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5675 deep_copy(host_new_part_xadj, this->new_part_xadj);
5677 auto host_new_coordinate_permutations =
5678 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5679 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5681 for(mj_part_t p = 0; p < num_parts; ++p) {
5682 mj_lno_t part_begin = 0;
5683 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5684 mj_lno_t part_end = host_new_part_xadj(p);
5686 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5688 mj_lno_t num_total_send = 0;
5689 for(mj_lno_t j=part_begin; j < part_end; j++) {
5690 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5691 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5695 part_assignment_proc_begin_indices[p] =
5696 processor_chains_in_parts[proc_to_sent];
5698 processor_chains_in_parts[proc_to_sent] = -1;
5700 proc_to_sent = part_assignment_proc_begin_indices[p];
5703 coordinate_destinations[local_ind] = proc_to_sent;
5729template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5730 typename mj_part_t,
typename mj_node_t>
5731void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5732 mj_assign_proc_to_parts(
5733 mj_gno_t * num_points_in_all_processor_parts,
5734 mj_part_t num_parts,
5735 mj_part_t num_procs,
5736 mj_lno_t *send_count_to_each_proc,
5737 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5738 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5739 mj_part_t &out_part_index,
5740 mj_part_t &output_part_numbering_begin_index,
5741 int * coordinate_destinations) {
5742 mj_gno_t *global_num_points_in_parts =
5743 num_points_in_all_processor_parts + num_procs * num_parts;
5744 mj_part_t *num_procs_assigned_to_each_part =
new mj_part_t[num_parts];
5747 bool did_i_find_my_group =
false;
5749 mj_part_t num_free_procs = num_procs;
5750 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5752 double max_imbalance_difference = 0;
5753 mj_part_t max_differing_part = 0;
5756 for(mj_part_t i = 0; i < num_parts; i++) {
5759 double scalar_required_proc = num_procs *
5760 (double (global_num_points_in_parts[i]) /
5761 double (this->num_global_coords));
5764 mj_part_t required_proc =
5765 static_cast<mj_part_t
> (0.5 + scalar_required_proc);
5766 if(required_proc == 0) required_proc = 1;
5772 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5773 required_proc = num_free_procs -
5774 (minimum_num_procs_required_for_rest_of_parts);
5778 num_free_procs -= required_proc;
5782 --minimum_num_procs_required_for_rest_of_parts;
5785 num_procs_assigned_to_each_part[i] = required_proc;
5790 double imbalance_wrt_ideal =
5791 (scalar_required_proc - required_proc) / required_proc;
5792 if(imbalance_wrt_ideal > max_imbalance_difference) {
5793 max_imbalance_difference = imbalance_wrt_ideal;
5794 max_differing_part = i;
5800 if(num_free_procs > 0) {
5801 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5808 mj_part_t *part_assignment_proc_begin_indices =
new mj_part_t[num_parts];
5812 mj_part_t *processor_chains_in_parts =
new mj_part_t [num_procs];
5813 mj_part_t *processor_part_assignments =
new mj_part_t[num_procs];
5822 for(
int i = 0; i < num_procs; ++i ) {
5823 processor_part_assignments[i] = -1;
5824 processor_chains_in_parts[i] = -1;
5826 for(
int i = 0; i < num_parts; ++i ) {
5827 part_assignment_proc_begin_indices[i] = -1;
5833 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5834 sort_item_num_part_points_in_procs =
5835 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5837 for(mj_part_t i = 0; i < num_parts; ++i) {
5842 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5843 sort_item_num_part_points_in_procs[ii].id = ii;
5846 if(processor_part_assignments[ii] == -1) {
5847 sort_item_num_part_points_in_procs[ii].val =
5848 num_points_in_all_processor_parts[ii * num_parts + i];
5850 sort_item_num_part_points_in_procs[ii].signbit = 1;
5863 sort_item_num_part_points_in_procs[ii].val =
5864 num_points_in_all_processor_parts[ii * num_parts + i];
5865 sort_item_num_part_points_in_procs[ii].signbit = 0;
5870 uqSignsort<mj_part_t, mj_gno_t,char>
5871 (num_procs, sort_item_num_part_points_in_procs);
5883 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5884 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5885 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5886 ceil(total_num_points_in_part /
double (required_proc_count)));
5889 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5890 mj_part_t next_proc_to_send_id =
5891 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5892 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5893 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5897 for(mj_part_t ii = num_procs - 1;
5898 ii >= num_procs - required_proc_count; --ii) {
5899 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5901 processor_part_assignments[proc_id] = i;
5904 bool did_change_sign =
false;
5906 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5909 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5910 did_change_sign =
true;
5911 sort_item_num_part_points_in_procs[ii].signbit = 1;
5918 if(did_change_sign) {
5921 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5922 sort_item_num_part_points_in_procs);
5937 if(!did_i_find_my_group) {
5938 for(mj_part_t ii = num_procs - 1; ii >=
5939 num_procs - required_proc_count; --ii) {
5941 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5944 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5946 if(proc_id_to_assign == this->myRank) {
5948 did_i_find_my_group =
true;
5951 part_assignment_proc_begin_indices[i] = this->myRank;
5952 processor_chains_in_parts[this->myRank] = -1;
5956 send_count_to_each_proc[this->myRank] =
5957 sort_item_num_part_points_in_procs[ii].val;
5961 for(mj_part_t in = 0; in < i; ++in) {
5962 output_part_numbering_begin_index +=
5963 (*next_future_num_parts_in_parts)[in];
5971 if(!did_i_find_my_group) {
5972 processor_ranks_for_subcomm.clear();
5980 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5981 mj_part_t nonassigned_proc_id =
5982 sort_item_num_part_points_in_procs[ii].id;
5983 mj_lno_t num_points_to_sent =
5984 sort_item_num_part_points_in_procs[ii].val;
5990 if(num_points_to_sent < 0) {
5991 cout <<
"Migration - processor assignments - for part:" << i
5992 <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:"
5993 << num_points_to_sent << std::endl;
5998 switch (migration_type) {
6002 while (num_points_to_sent > 0) {
6004 if(num_points_to_sent <= space_left_in_sent_proc) {
6006 space_left_in_sent_proc -= num_points_to_sent;
6008 if(this->myRank == nonassigned_proc_id) {
6010 send_count_to_each_proc[next_proc_to_send_id] =
6015 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6016 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6017 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6019 num_points_to_sent = 0;
6023 if(space_left_in_sent_proc > 0) {
6024 num_points_to_sent -= space_left_in_sent_proc;
6027 if(this->myRank == nonassigned_proc_id) {
6029 send_count_to_each_proc[next_proc_to_send_id] =
6030 space_left_in_sent_proc;
6031 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6032 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6033 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6037 ++next_proc_to_send_index;
6040 if(next_part_to_send_index < nprocs - required_proc_count ) {
6041 cout <<
"Migration - processor assignments - for part:"
6043 <<
" next_part_to_send :" << next_part_to_send_index
6044 <<
" nprocs:" << nprocs
6045 <<
" required_proc_count:" << required_proc_count
6046 <<
" Error: next_part_to_send_index <" <<
6047 <<
" nprocs - required_proc_count" << std::endl;
6052 next_proc_to_send_id =
6053 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6055 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6056 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6067 if(this->myRank == nonassigned_proc_id) {
6069 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6073 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6074 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6075 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6077 num_points_to_sent = 0;
6078 ++next_proc_to_send_index;
6082 if(next_proc_to_send_index == num_procs) {
6083 next_proc_to_send_index = num_procs - required_proc_count;
6086 next_proc_to_send_id =
6087 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6089 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6090 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6103 this->assign_send_destinations(
6105 part_assignment_proc_begin_indices,
6106 processor_chains_in_parts,
6107 send_count_to_each_proc,
6108 coordinate_destinations);
6109 delete [] part_assignment_proc_begin_indices;
6110 delete [] processor_chains_in_parts;
6111 delete [] processor_part_assignments;
6112 delete [] sort_item_num_part_points_in_procs;
6113 delete [] num_procs_assigned_to_each_part;
6131template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6132 typename mj_part_t,
typename mj_node_t>
6133void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6134 assign_send_destinations2(
6135 mj_part_t num_parts,
6136 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6137 int *coordinate_destinations,
6138 mj_part_t &output_part_numbering_begin_index,
6139 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6141 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6142 mj_part_t previous_processor = -1;
6144 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6145 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6147 auto local_new_coordinate_permutations =
6148 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6149 Kokkos::deep_copy(local_new_coordinate_permutations,
6150 this->new_coordinate_permutations);
6152 for(mj_part_t i = 0; i < num_parts; ++i) {
6153 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6156 mj_lno_t part_begin_index = 0;
6159 part_begin_index = local_new_part_xadj(p - 1);
6162 mj_lno_t part_end_index = local_new_part_xadj(p);
6164 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6165 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6166 output_part_numbering_begin_index = part_shift_amount;
6168 previous_processor = assigned_proc;
6169 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6171 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6172 mj_lno_t localInd = local_new_coordinate_permutations(j);
6173 coordinate_destinations[localInd] = assigned_proc;
6199template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6200 typename mj_part_t,
typename mj_node_t>
6201void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6202 mj_assign_parts_to_procs(
6203 mj_gno_t * num_points_in_all_processor_parts,
6204 mj_part_t num_parts,
6205 mj_part_t num_procs,
6206 mj_lno_t *send_count_to_each_proc,
6207 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6208 mj_part_t &out_num_part,
6209 std::vector<mj_part_t> &out_part_indices,
6210 mj_part_t &output_part_numbering_begin_index,
6211 int *coordinate_destinations) {
6214 mj_gno_t *global_num_points_in_parts =
6215 num_points_in_all_processor_parts + num_procs * num_parts;
6216 out_part_indices.clear();
6220 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6221 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6222 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6223 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6227 mj_lno_t work_each =
6228 mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
6232 mj_lno_t *space_in_each_processor =
new mj_lno_t[num_procs];
6235 for(mj_part_t i = 0; i < num_procs; ++i) {
6236 space_in_each_processor[i] = work_each;
6243 mj_part_t *num_parts_proc_assigned =
new mj_part_t[num_procs];
6244 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
6245 int empty_proc_count = num_procs;
6249 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6250 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6255 for(mj_part_t i = 0; i < num_parts; ++i) {
6256 sort_item_point_counts_in_parts[i].id = i;
6257 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6261 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6266 for(mj_part_t j = 0; j < num_parts; ++j) {
6268 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6271 mj_gno_t load = global_num_points_in_parts[i];
6274 mj_part_t assigned_proc = -1;
6277 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6278 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6283 if(empty_proc_count < num_parts - j ||
6284 num_parts_proc_assigned[ii] == 0) {
6286 sort_item_num_points_of_proc_in_part_i[ii].val =
6287 num_points_in_all_processor_parts[ii * num_parts + i];
6290 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6294 uqsort<mj_part_t, mj_gno_t>(num_procs,
6295 sort_item_num_points_of_proc_in_part_i);
6298 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6299 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6300 if(assigned_proc == -1 ||
6301 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6304 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6305 if(ii < assigned_proc) {
6321 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6325 space_in_each_processor[assigned_proc] -= load;
6327 sort_item_part_to_proc_assignment[j].id = i;
6330 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6333 if(assigned_proc == this->myRank) {
6335 out_part_indices.push_back(i);
6341 send_count_to_each_proc[assigned_proc] +=
6342 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6345 delete [] num_parts_proc_assigned;
6346 delete [] sort_item_num_points_of_proc_in_part_i;
6347 delete [] sort_item_point_counts_in_parts;
6348 delete [] space_in_each_processor;
6351 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6354 this->assign_send_destinations2(
6356 sort_item_part_to_proc_assignment,
6357 coordinate_destinations,
6358 output_part_numbering_begin_index,
6359 next_future_num_parts_in_parts);
6361 delete [] sort_item_part_to_proc_assignment;
6388template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6389 typename mj_part_t,
typename mj_node_t>
6390void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6391 mj_migration_part_proc_assignment(
6392 mj_gno_t * num_points_in_all_processor_parts,
6393 mj_part_t num_parts,
6394 mj_part_t num_procs,
6395 mj_lno_t *send_count_to_each_proc,
6396 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6397 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6398 mj_part_t &out_num_part,
6399 std::vector<mj_part_t> &out_part_indices,
6400 mj_part_t &output_part_numbering_begin_index,
6401 int *coordinate_destinations)
6403 processor_ranks_for_subcomm.clear();
6405 if(num_procs > num_parts) {
6410 mj_part_t out_part_index = 0;
6412 this->mj_assign_proc_to_parts(
6413 num_points_in_all_processor_parts,
6416 send_count_to_each_proc,
6417 processor_ranks_for_subcomm,
6418 next_future_num_parts_in_parts,
6420 output_part_numbering_begin_index,
6421 coordinate_destinations
6425 out_part_indices.clear();
6426 out_part_indices.push_back(out_part_index);
6433 processor_ranks_for_subcomm.push_back(this->myRank);
6438 this->mj_assign_parts_to_procs(
6439 num_points_in_all_processor_parts,
6442 send_count_to_each_proc,
6443 next_future_num_parts_in_parts,
6446 output_part_numbering_begin_index,
6447 coordinate_destinations);
6464template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6465 typename mj_part_t,
typename mj_node_t>
6466void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6468 mj_part_t num_procs,
6469 mj_lno_t &num_new_local_points,
6470 std::string iteration,
6471 int *coordinate_destinations,
6472 mj_part_t num_parts)
6475#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6476 if(
sizeof(mj_lno_t) <=
sizeof(
int)) {
6480 ZOLTAN_COMM_OBJ *plan = NULL;
6481 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6482 int num_incoming_gnos = 0;
6483 int message_tag = 7859;
6486 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6487 int ierr = Zoltan_Comm_Create(
6489 int(this->num_local_coords),
6490 coordinate_destinations,
6493 &num_incoming_gnos);
6497 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6500 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6508 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6509 Kokkos::HostSpace(), this->current_mj_gnos);
6510 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6511 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6512 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), num_incoming_gnos);
6513 auto host_dst_gnos = Kokkos::create_mirror_view(
6514 Kokkos::HostSpace(), dst_gnos);
6516 ierr = Zoltan_Comm_Do(
6519 (
char *) host_current_mj_gnos.data(),
6521 (
char *) host_dst_gnos.data());
6523 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6524 this->current_mj_gnos = dst_gnos;
6530 auto host_src_coordinates = Kokkos::create_mirror_view(
6531 Kokkos::HostSpace(), this->mj_coordinates);
6532 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6533 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6534 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
6535 num_incoming_gnos, this->coord_dim);
6536 auto host_dst_coordinates = Kokkos::create_mirror_view(
6537 Kokkos::HostSpace(), dst_coordinates);
6538 for(
int i = 0; i < this->coord_dim; ++i) {
6539 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6540 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6541 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6542 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6545 ierr = Zoltan_Comm_Do(
6548 (
char *) sub_host_src_coordinates.data(),
6549 sizeof(mj_scalar_t),
6550 (
char *) sub_host_dst_coordinates.data());
6553 deep_copy(dst_coordinates, host_dst_coordinates);
6554 this->mj_coordinates = dst_coordinates;
6559 auto host_src_weights = Kokkos::create_mirror_view(
6560 Kokkos::HostSpace(), this->mj_weights);
6561 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6562 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6563 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
6564 num_incoming_gnos, this->num_weights_per_coord);
6565 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6566 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6567 auto sub_host_src_weights
6568 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6569 auto sub_host_dst_weights
6570 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6571 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6573 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6574 sent_weight[n] = sub_host_src_weights(n);
6576 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6578 ierr = Zoltan_Comm_Do(
6581 (
char *) sent_weight.getRawPtr(),
6582 sizeof(mj_scalar_t),
6583 (
char *) received_weight.getRawPtr());
6586 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6587 sub_host_dst_weights(n) = received_weight[n];
6590 deep_copy(dst_weights, host_dst_weights);
6591 this->mj_weights = dst_weights;
6597 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6598 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6601 ierr = Zoltan_Comm_Do(
6604 (
char *) owner_of_coordinate.data(),
6606 (
char *) dst_owners_of_coordinate.data());
6608 this->owner_of_coordinate = dst_owners_of_coordinate;
6615 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6616 Kokkos::HostSpace(), this->assigned_part_ids);
6617 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6618 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6619 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
6621 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6622 Kokkos::HostSpace(), dst_assigned_part_ids);
6623 mj_part_t *new_parts =
new mj_part_t[num_incoming_gnos];
6624 if(num_procs < num_parts) {
6626 ierr = Zoltan_Comm_Do(
6629 (
char *) host_src_assigned_part_ids.data(),
6631 (
char *) host_dst_assigned_part_ids.data());
6633 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6637 this->assigned_part_ids = dst_assigned_part_ids;
6640 ierr = Zoltan_Comm_Destroy(&plan);
6642 num_new_local_points = num_incoming_gnos;
6644 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6649 this->mj_env->timerStart(
MACRO_TIMERS, mj_timer_base_string +
6650 "Migration DistributorPlanCreating-" + iteration);
6652 Tpetra::Distributor distributor(this->comm);
6653 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6654 this->num_local_coords);
6655 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6656 this->mj_env->timerStop(
MACRO_TIMERS, mj_timer_base_string +
6657 "Migration DistributorPlanCreating-" + iteration);
6658 this->mj_env->timerStart(
MACRO_TIMERS, mj_timer_base_string +
6659 "Migration DistributorMigration-" + iteration);
6667 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6668 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
6671 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6672 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
6673 this->current_mj_gnos.extent(0));
6674 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6676 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6678 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6679 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_incoming_gnos);
6681 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6686 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6687 dst_coordinates(
"mj_coordinates", num_incoming_gnos, this->coord_dim);
6689 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6690 host_src_coordinates(
6691 Kokkos::ViewAllocateWithoutInitializing(
"host_coords"),
6692 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6693 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6695 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6696 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
6699 for(
int i = 0; i < this->coord_dim; ++i) {
6703 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6704 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6706 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6708 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6714 this->mj_coordinates = dst_coordinates;
6717 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6718 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6719 auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6722 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6723 Kokkos::HostSpace(), this->mj_weights);
6726 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6727 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
6728 this->num_local_coords);
6730 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6731 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
6734 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6736 auto sub_host_src_weights
6737 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6739 auto sub_host_dst_weights
6740 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6747 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6748 sent_weight[n] = sub_host_src_weights(n);
6751 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6754 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6755 sub_host_dst_weights(n) = received_weight[n];
6758 Kokkos::deep_copy(dst_weights, host_dst_weights);
6759 this->mj_weights = dst_weights;
6764 Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6765 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6768 distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6770 this->owner_of_coordinate = received_owners;
6776 if(num_procs < num_parts) {
6777 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6778 Kokkos::ViewAllocateWithoutInitializing(
"host_parts"),
6779 this->assigned_part_ids.extent(0));
6780 Kokkos::deep_copy(sent_partids, assigned_part_ids);
6782 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6783 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
6786 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6788 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6789 (
"assigned_part_ids", num_incoming_gnos);
6790 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6793 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6794 (
"assigned_part_ids", num_incoming_gnos);
6796 this->mj_env->timerStop(
MACRO_TIMERS,
"" + mj_timer_base_string +
6797 "Migration DistributorMigration-" + iteration);
6799 num_new_local_points = num_incoming_gnos;
6808template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6809 typename mj_part_t,
typename mj_node_t>
6810void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6811 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6813 mj_part_t group_size = processor_ranks_for_subcomm.size();
6814 mj_part_t *ids =
new mj_part_t[group_size];
6815 for(mj_part_t i = 0; i < group_size; ++i) {
6816 ids[i] = processor_ranks_for_subcomm[i];
6818 ArrayView<const mj_part_t> idView(ids, group_size);
6819 this->comm = this->comm->createSubcommunicator(idView);
6828template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6829 typename mj_part_t,
typename mj_node_t>
6830void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6831 fill_permutation_array(
6832 mj_part_t output_num_parts,
6833 mj_part_t num_parts)
6836 if(output_num_parts == 1) {
6837 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6838 Kokkos::parallel_for(
6839 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6840 (0, this->num_local_coords),
6841 KOKKOS_LAMBDA(mj_lno_t i) {
6842 local_new_coordinate_permutations(i) = i;
6844 auto local_new_part_xadj = this->new_part_xadj;
6845 auto local_num_local_coords = this->num_local_coords;
6846 Kokkos::parallel_for(
6847 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6848 KOKKOS_LAMBDA(
int dummy) {
6849 local_new_part_xadj(0) = local_num_local_coords;
6853 auto local_num_local_coords = this->num_local_coords;
6854 auto local_assigned_part_ids = this->assigned_part_ids;
6855 auto local_new_part_xadj = this->new_part_xadj;
6856 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6859 Kokkos::View<mj_part_t*, device_t> part_shifts(
"part_shifts", num_parts);
6864 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6865 "num_points_in_parts", num_parts);
6867 Kokkos::parallel_for(
6868 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6869 KOKKOS_LAMBDA(
int dummy) {
6871 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6872 mj_part_t ii = local_assigned_part_ids(i);
6873 ++num_points_in_parts(ii);
6878 mj_lno_t prev_index = 0;
6879 for(mj_part_t i = 0; i < num_parts; ++i) {
6880 if(num_points_in_parts(i) > 0) {
6881 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6882 prev_index += num_points_in_parts(i);
6883 part_shifts(i) = p++;
6888 mj_part_t assigned_num_parts = p - 1;
6889 for(;p < num_parts; ++p) {
6890 local_new_part_xadj(p) =
6891 local_new_part_xadj(assigned_num_parts);
6893 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6894 num_points_in_parts(i) = local_new_part_xadj(i);
6900 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6902 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6903 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6933template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6934 typename mj_part_t,
typename mj_node_t>
6935bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6936 mj_perform_migration(
6937 mj_part_t input_num_parts,
6938 mj_part_t &output_num_parts,
6939 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6940 mj_part_t &output_part_begin_index,
6941 size_t migration_reduce_all_population,
6942 mj_lno_t num_coords_for_last_dim_part,
6943 std::string iteration,
6944 RCP<mj_partBoxVector_t> &input_part_boxes,
6945 RCP<mj_partBoxVector_t> &output_part_boxes)
6947 mj_part_t num_procs = this->comm->getSize();
6948 this->myRank = this->comm->getRank();
6953 mj_gno_t *num_points_in_all_processor_parts =
6954 new mj_gno_t[input_num_parts * (num_procs + 1)];
6957 this->get_processor_num_points_in_parts(
6960 num_points_in_all_processor_parts);
6963 if(!this->mj_check_to_migrate(
6964 migration_reduce_all_population,
6965 num_coords_for_last_dim_part,
6968 num_points_in_all_processor_parts)) {
6969 delete [] num_points_in_all_processor_parts;
6973 mj_lno_t *send_count_to_each_proc = NULL;
6974 int *coordinate_destinations =
new int[this->num_local_coords];
6975 send_count_to_each_proc =
new mj_lno_t[num_procs];
6977 for(
int i = 0; i < num_procs; ++i) {
6978 send_count_to_each_proc[i] = 0;
6981 std::vector<mj_part_t> processor_ranks_for_subcomm;
6982 std::vector<mj_part_t> out_part_indices;
6985 this->mj_migration_part_proc_assignment(
6986 num_points_in_all_processor_parts,
6989 send_count_to_each_proc,
6990 processor_ranks_for_subcomm,
6991 next_future_num_parts_in_parts,
6994 output_part_begin_index,
6995 coordinate_destinations);
6997 delete [] send_count_to_each_proc;
6998 std::vector <mj_part_t> tmpv;
7000 std::sort (out_part_indices.begin(), out_part_indices.end());
7001 mj_part_t outP = out_part_indices.size();
7002 mj_gno_t new_global_num_points = 0;
7003 mj_gno_t *global_num_points_in_parts =
7004 num_points_in_all_processor_parts + num_procs * input_num_parts;
7006 if(this->mj_keep_part_boxes) {
7007 input_part_boxes->clear();
7012 for(mj_part_t i = 0; i < outP; ++i) {
7013 mj_part_t ind = out_part_indices[i];
7014 new_global_num_points += global_num_points_in_parts[ind];
7015 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7016 if(this->mj_keep_part_boxes) {
7017 input_part_boxes->push_back((*output_part_boxes)[ind]);
7022 if(this->mj_keep_part_boxes) {
7023 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7024 input_part_boxes = output_part_boxes;
7025 output_part_boxes = tmpPartBoxes;
7027 next_future_num_parts_in_parts->clear();
7028 for(mj_part_t i = 0; i < outP; ++i) {
7029 mj_part_t p = tmpv[i];
7030 next_future_num_parts_in_parts->push_back(p);
7033 delete [] num_points_in_all_processor_parts;
7035 mj_lno_t num_new_local_points = 0;
7037 this->mj_migrate_coords(
7039 num_new_local_points,
7041 coordinate_destinations,
7044 delete [] coordinate_destinations;
7045 if(this->num_local_coords != num_new_local_points) {
7046 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7047 (Kokkos::ViewAllocateWithoutInitializing(
"new_coordinate_permutations"),
7048 num_new_local_points);
7049 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7050 (Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
7051 num_new_local_points);
7053 this->num_local_coords = num_new_local_points;
7054 this->num_global_coords = new_global_num_points;
7057 this->create_sub_communicator(processor_ranks_for_subcomm);
7059 processor_ranks_for_subcomm.clear();
7062 this->fill_permutation_array(output_num_parts, input_num_parts);
7085template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7086 typename mj_part_t,
typename mj_node_t>
7087void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7088 create_consistent_chunks(
7089 mj_part_t num_parts,
7090 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7091 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7092 mj_lno_t coordinate_begin,
7093 mj_lno_t coordinate_end,
7094 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7095 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7097 bool longest_dim_part,
7098 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7108 mj_part_t no_cuts = num_parts - 1;
7112 if(this->distribute_points_on_cut_lines) {
7113 auto local_thread_cut_line_weight_to_put_left =
7114 this->thread_cut_line_weight_to_put_left;
7115 auto local_thread_part_weight_work =
7116 this->thread_part_weight_work;
7117 auto local_sEpsilon = this->sEpsilon;
7119 Kokkos::parallel_for(
7120 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
7121 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7123 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7124 if(left_weight > local_sEpsilon) {
7126 mj_scalar_t thread_ii_weight_on_cut =
7127 local_thread_part_weight_work(i * 2 + 1) -
7128 local_thread_part_weight_work(i * 2);
7129 if(thread_ii_weight_on_cut < left_weight) {
7130 local_thread_cut_line_weight_to_put_left(i) =
7131 thread_ii_weight_on_cut;
7134 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7138 local_thread_cut_line_weight_to_put_left(i) = 0;
7143 auto local_least_signifiance = least_signifiance;
7144 auto local_significance_mul = significance_mul;
7145 Kokkos::parallel_for(
7146 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7147 (0, 1), KOKKOS_LAMBDA (
int dummy) {
7151 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7152 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7153 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7154 mj_scalar_t delta = cut2 - cut1;
7155 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7156 if(abs_delta < local_sEpsilon) {
7157 local_thread_cut_line_weight_to_put_left(i) -=
7158 local_thread_cut_line_weight_to_put_left(i - 1);
7160 local_thread_cut_line_weight_to_put_left(i) =
7161 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7162 local_least_signifiance) * local_significance_mul) /
7163 static_cast<mj_scalar_t
>(local_significance_mul);
7169 auto local_thread_point_counts = this->thread_point_counts;
7170 Kokkos::parallel_for(
7171 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7172 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7173 local_thread_point_counts(i) = 0;
7184 mj_part_t *cut_map =
new mj_part_t[no_cuts];
7186 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7187 typedef std::vector< multiSItem > multiSVector;
7188 typedef std::vector<multiSVector> multiS2Vector;
7191 std::vector<mj_scalar_t *>allocated_memory;
7194 multiS2Vector sort_vector_points_on_cut;
7197 mj_part_t different_cut_count = 1;
7203 multiSVector tmpMultiSVector;
7204 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7206 auto local_current_concurrent_cut_coordinate =
7207 current_concurrent_cut_coordinate;
7208 auto host_current_concurrent_cut_coordinate =
7209 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7210 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7211 local_current_concurrent_cut_coordinate);
7213 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7216 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7217 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7218 cut_map[i] = cut_map[i-1];
7221 cut_map[i] = different_cut_count++;
7222 multiSVector tmp2MultiSVector;
7223 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7226 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7227 host_current_concurrent_cut_coordinate);
7230 auto host_coordinate_permutations =
7231 Kokkos::create_mirror_view(coordinate_permutations);
7232 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7234 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7235 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7237 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7238 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7240 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7241 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7243 auto local_coord_dim = this->coord_dim;
7245 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7246 mj_lno_t i = host_coordinate_permutations(ii);
7247 mj_part_t pp = host_assigned_part_ids(i);
7248 mj_part_t p = pp / 2;
7251 mj_scalar_t *vals =
new mj_scalar_t[local_coord_dim -1];
7252 allocated_memory.push_back(vals);
7257 if(longest_dim_part) {
7259 for(
int dim = local_coord_dim - 2; dim >= 0; --dim) {
7262 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7267 host_mj_coordinates(i,next_largest_coord_dim);
7271 for(
int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7272 vals[val_ind++] = host_mj_coordinates(i,dim);
7274 for(
int dim = 0; dim < coordInd; ++dim) {
7275 vals[val_ind++] = host_mj_coordinates(i,dim);
7279 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7281 mj_part_t cmap = cut_map[p];
7282 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7286 ++host_thread_point_counts(p);
7287 host_assigned_part_ids(i) = p;
7292 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7293 std::sort (sort_vector_points_on_cut[i].begin(),
7294 sort_vector_points_on_cut[i].end());
7297 mj_part_t previous_cut_map = cut_map[0];
7299 auto host_thread_cut_line_weight_to_put_left =
7300 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7301 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7302 thread_cut_line_weight_to_put_left);
7304 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7305 Kokkos::deep_copy(host_mj_weights, mj_weights);
7315 mj_scalar_t weight_stolen_from_previous_part = 0;
7316 for(mj_part_t p = 0; p < no_cuts; ++p) {
7317 mj_part_t mapped_cut = cut_map[p];
7321 if(previous_cut_map != mapped_cut) {
7322 mj_lno_t sort_vector_end = (mj_lno_t)
7323 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7324 for(; sort_vector_end >= 0; --sort_vector_end) {
7326 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7327 mj_lno_t i = t.index;
7328 ++host_thread_point_counts(p);
7329 host_assigned_part_ids(i) = p;
7331 sort_vector_points_on_cut[previous_cut_map].clear();
7335 mj_lno_t sort_vector_end = (mj_lno_t)
7336 sort_vector_points_on_cut[mapped_cut].size() - 1;
7342 for(; sort_vector_end >= 0; --sort_vector_end) {
7345 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7347 mj_lno_t i = t.index;
7348 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7349 this->mj_weights(i,0);
7351 if(host_thread_cut_line_weight_to_put_left(p) +
7352 weight_stolen_from_previous_part> this->sEpsilon &&
7353 host_thread_cut_line_weight_to_put_left(p) +
7354 weight_stolen_from_previous_part -
7355 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7356 weight_stolen_from_previous_part - w)> this->sEpsilon)
7358 host_thread_cut_line_weight_to_put_left(p) -= w;
7360 sort_vector_points_on_cut[mapped_cut].pop_back();
7362 ++host_thread_point_counts(p);
7363 host_assigned_part_ids(i) = p;
7367 if(p < no_cuts - 1 &&
7368 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7369 if(mapped_cut == cut_map[p + 1] ) {
7372 if(previous_cut_map != mapped_cut) {
7373 weight_stolen_from_previous_part =
7374 host_thread_cut_line_weight_to_put_left(p);
7379 weight_stolen_from_previous_part +=
7380 host_thread_cut_line_weight_to_put_left(p);
7384 weight_stolen_from_previous_part =
7385 -host_thread_cut_line_weight_to_put_left(p);
7394 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7395 if(previous_cut_map != mapped_cut) {
7396 weight_stolen_from_previous_part =
7397 host_thread_cut_line_weight_to_put_left(p);
7400 weight_stolen_from_previous_part +=
7401 host_thread_cut_line_weight_to_put_left(p);
7405 weight_stolen_from_previous_part =
7406 -host_thread_cut_line_weight_to_put_left(p);
7412 previous_cut_map = mapped_cut;
7417 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7418 previous_cut_map].size() - 1;
7424 for(; sort_vector_end >= 0; --sort_vector_end) {
7426 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7429 mj_lno_t i = t.index;
7430 ++host_thread_point_counts(no_cuts);
7431 host_assigned_part_ids(i) = no_cuts;
7434 sort_vector_points_on_cut[previous_cut_map].clear();
7438 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7439 for(mj_lno_t i = 0; i < vSize; ++i) {
7440 delete [] allocated_memory[i];
7443 auto local_out_part_xadj = out_part_xadj;
7444 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7445 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7448 for(mj_part_t j = 0; j < num_parts; ++j) {
7449 host_out_part_xadj(j) = host_thread_point_counts(j);
7450 host_thread_point_counts(j) = 0;
7454 for(mj_part_t j = 1; j < num_parts; ++j) {
7455 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7460 for(mj_part_t j = 1; j < num_parts; ++j) {
7461 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7464 auto host_new_coordinate_permutations =
7465 Kokkos::create_mirror_view(new_coordinate_permutations);
7466 Kokkos::deep_copy(host_new_coordinate_permutations,
7467 new_coordinate_permutations);
7471 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7472 mj_lno_t i = host_coordinate_permutations(ii);
7473 mj_part_t p = host_assigned_part_ids(i);
7474 host_new_coordinate_permutations(coordinate_begin +
7475 host_thread_point_counts(p)++) = i;
7478 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7479 Kokkos::deep_copy(new_coordinate_permutations,
7480 host_new_coordinate_permutations);
7481 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7493template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7494 typename mj_part_t,
typename mj_node_t>
7495void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7497 mj_part_t current_num_parts,
7498 mj_part_t output_part_begin_index,
7499 RCP<mj_partBoxVector_t> &output_part_boxes,
7500 bool is_data_ever_migrated)
7503 mj_timer_base_string +
"Part_Assignment");
7505 auto local_part_xadj = part_xadj;
7506 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7507 auto local_coordinate_permutations = coordinate_permutations;
7508 auto local_assigned_part_ids = assigned_part_ids;
7510 if(local_mj_keep_part_boxes) {
7511 for(
int i = 0; i < current_num_parts; ++i) {
7512 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7516 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7517 current_num_parts, Kokkos::AUTO());
7518 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7519 member_type member_type;
7520 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7521 int i = team_member.league_rank();
7522 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7523 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7525 mj_lno_t k = local_coordinate_permutations(ii);
7526 local_assigned_part_ids(k) = i + output_part_begin_index;
7530 if(is_data_ever_migrated) {
7531#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7532 if(
sizeof(mj_lno_t) <=
sizeof(
int)) {
7539 ZOLTAN_COMM_OBJ *plan = NULL;
7540 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7543 int message_tag = 7856;
7546 mj_timer_base_string +
"Final Z1PlanCreating");
7549 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
7550 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7554 mj_timer_base_string +
"Final Z1PlanCreating" );
7557 mj_timer_base_string +
"Final Z1PlanComm");
7564 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7565 Kokkos::HostSpace(), this->current_mj_gnos);
7566 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7567 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7568 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), incoming);
7569 auto host_dst_gnos = Kokkos::create_mirror_view(
7570 Kokkos::HostSpace(), dst_gnos);
7572 ierr = Zoltan_Comm_Do( plan, message_tag,
7573 (
char *) host_current_mj_gnos.data(),
7574 sizeof(mj_gno_t), (
char *) host_dst_gnos.data());
7576 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7577 this->current_mj_gnos = dst_gnos;
7580 auto host_src_part_ids = Kokkos::create_mirror_view(
7581 Kokkos::HostSpace(), this->assigned_part_ids);
7582 deep_copy(host_src_part_ids, this->assigned_part_ids);
7583 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7584 Kokkos::ViewAllocateWithoutInitializing(
"dst_part_ids"), incoming);
7585 auto host_dst_part_ids = Kokkos::create_mirror_view(
7586 Kokkos::HostSpace(), dst_part_ids);
7588 ierr = Zoltan_Comm_Do( plan, message_tag,
7589 (
char *) host_src_part_ids.data(),
7590 sizeof(mj_part_t), (
char *) host_dst_part_ids.data());
7592 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7593 this->assigned_part_ids = dst_part_ids;
7595 ierr = Zoltan_Comm_Destroy(&plan);
7598 this->num_local_coords = incoming;
7601 mj_timer_base_string +
"Final Z1PlanComm");
7608 mj_timer_base_string +
"Final DistributorPlanCreating");
7609 Tpetra::Distributor distributor(this->mj_problemComm);
7610 ArrayView<const mj_part_t> owners_of_coords(
7611 this->owner_of_coordinate.data(), this->num_local_coords);
7612 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7614 mj_timer_base_string +
"Final DistributorPlanCreating" );
7617 mj_timer_base_string +
"Final DistributorPlanComm");
7623 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7624 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
7625 this->current_mj_gnos.extent(0));
7626 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7628 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7629 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
7632 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7634 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7635 Kokkos::ViewAllocateWithoutInitializing(
"current_mj_gnos"), incoming);
7637 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7640 Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7641 Kokkos::ViewAllocateWithoutInitializing(
"sent_partids"),
7642 this->assigned_part_ids.extent(0));
7643 Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7645 Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7646 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
7649 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7651 this->assigned_part_ids =
7652 Kokkos::View<mj_part_t*, device_t>(
7653 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
7656 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7657 this->num_local_coords = incoming;
7660 mj_timer_base_string +
"Final DistributorPlanComm");
7665 mj_timer_base_string +
"Part_Assignment");
7668 mj_timer_base_string +
"Solution_Part_Assignment");
7673 if(this->mj_keep_part_boxes) {
7674 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7678 mj_timer_base_string +
"Solution_Part_Assignment");
7693template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7694 typename mj_part_t,
typename mj_node_t>
7697 bool distribute_points_on_cut_lines_,
7698 int max_concurrent_part_calculation_,
7699 int check_migrate_avoid_migration_option_,
7700 double minimum_migration_imbalance_,
7701 int migration_type_)
7703 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7704 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7705 this->check_migrate_avoid_migration_option =
7706 check_migrate_avoid_migration_option_;
7707 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7708 this->migration_type = migration_type_;
7738template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7739 typename mj_part_t,
typename mj_node_t>
7742 const RCP<const Environment> &env,
7743 RCP<
const Comm<int> > &problemComm,
7744 double imbalance_tolerance_,
7746 size_t num_global_parts_,
7747 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7748 int recursion_depth_,
7750 mj_lno_t num_local_coords_,
7751 mj_gno_t num_global_coords_,
7752 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7754 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7755 int num_weights_per_coord_,
7756 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7757 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7758 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7759 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7760 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7765 this->mj_timer_base_string =
"MJ(" + std::to_string(execute_counter) +
") - ";
7768 this->mj_problemComm = problemComm;
7769 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7771 mj_timer_base_string +
"Total");
7772 this->mj_env->debug(3,
"In MultiJagged Jagged");
7773 this->imbalance_tolerance = imbalance_tolerance_;
7774 this->mj_num_teams = num_teams_;
7775 this->num_global_parts = num_global_parts_;
7776 this->part_no_array = part_no_array_;
7777 this->recursion_depth = recursion_depth_;
7778 this->coord_dim = coord_dim_;
7779 this->num_local_coords = num_local_coords_;
7780 this->num_global_coords = num_global_coords_;
7781 this->mj_coordinates = mj_coordinates_;
7782 this->initial_mj_gnos = initial_mj_gnos_;
7783 this->num_weights_per_coord = num_weights_per_coord_;
7784 this->mj_uniform_weights = mj_uniform_weights_;
7785 this->mj_weights = mj_weights_;
7786 this->mj_uniform_parts = mj_uniform_parts_;
7790 this->set_part_specifications();
7793 mj_timer_base_string +
"Allocate Views");
7794 this->allocate_set_work_memory();
7796 mj_timer_base_string +
"Allocate Views");
7800 this->comm = this->mj_problemComm->duplicate();
7803 if(comm->getRank() == 0) {
7804 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
7805 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
7806 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
7811 mj_part_t current_num_parts = 1;
7812 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7813 this->all_cut_coordinates;
7815 mj_timer_base_string +
"Problem_Partitioning");
7816 mj_part_t output_part_begin_index = 0;
7817 mj_part_t future_num_parts = this->total_num_part;
7818 bool is_data_ever_migrated =
false;
7820 std::vector<mj_part_t> *future_num_part_in_parts =
7821 new std::vector<mj_part_t> ();
7822 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7823 new std::vector<mj_part_t> ();
7825 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7827 RCP<mj_partBoxVector_t> input_part_boxes;
7828 RCP<mj_partBoxVector_t> output_part_boxes;
7830 if(this->mj_keep_part_boxes) {
7831 input_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7832 output_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7833 compute_global_box();
7834 this->init_part_boxes(output_part_boxes);
7837 auto local_part_xadj = this->part_xadj;
7841 Kokkos::View<mj_part_t*, device_t>
7842 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
7843 Kokkos::View<size_t*, device_t>
7844 view_total_reduction_size(
"view_total_reduction_size", 1);
7846 for(
int i = 0; i < this->recursion_depth; ++i) {
7849 std::string istring = std::to_string(i);
7855 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7856 future_num_part_in_parts = next_future_num_parts_in_parts;
7857 next_future_num_parts_in_parts = tmpPartVect;
7861 next_future_num_parts_in_parts->clear();
7862 if(this->mj_keep_part_boxes) {
7863 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7864 input_part_boxes = output_part_boxes;
7865 output_part_boxes = tmpPartBoxes;
7866 output_part_boxes->clear();
7870 mj_part_t output_part_count_in_dimension =
7871 this->update_part_num_arrays(
7872 future_num_part_in_parts,
7873 next_future_num_parts_in_parts,
7878 output_part_boxes, 1);
7883 if(output_part_count_in_dimension == current_num_parts) {
7885 tmpPartVect= future_num_part_in_parts;
7886 future_num_part_in_parts = next_future_num_parts_in_parts;
7887 next_future_num_parts_in_parts = tmpPartVect;
7889 if(this->mj_keep_part_boxes) {
7890 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7891 input_part_boxes = output_part_boxes;
7892 output_part_boxes = tmpPartBoxes;
7898 int coordInd = i % this->coord_dim;
7900 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7901 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7904 mj_timer_base_string +
"Problem_Partitioning_" + istring);
7908 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7909 "new part xadj", output_part_count_in_dimension);
7912 mj_part_t output_part_index = 0;
7916 mj_part_t output_coordinate_end_index = 0;
7918 mj_part_t current_work_part = 0;
7919 mj_part_t current_concurrent_num_parts =
7920 std::min(current_num_parts - current_work_part,
7921 this->max_concurrent_part_calculation);
7923 mj_part_t obtained_part_index = 0;
7925 auto host_process_local_min_max_coord_total_weight =
7926 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7927 auto host_global_min_max_coord_total_weight =
7928 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7931 for(; current_work_part < current_num_parts;
7932 current_work_part += current_concurrent_num_parts) {
7934 current_concurrent_num_parts =
7935 std::min(current_num_parts - current_work_part,
7936 this->max_concurrent_part_calculation);
7939 auto local_device_num_partitioning_in_current_dim =
7940 device_num_partitioning_in_current_dim;
7941 Kokkos::parallel_reduce(
"Read bDoingWork",
7942 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7943 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
7945 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7946 if(local_device_num_partitioning_in_current_dim(
7947 current_work_part + kk) != 1) {
7953 bool bDoingWork = (bDoingWork_int != 0) ?
true :
false;
7955 this->mj_get_local_min_max_coord_totW(
7957 current_concurrent_num_parts,
7958 mj_current_dim_coords);
7963 this->mj_get_global_min_max_coord_totW(
7964 current_concurrent_num_parts,
7965 this->process_local_min_max_coord_total_weight,
7966 this->global_min_max_coord_total_weight);
7970 mj_part_t total_incomplete_cut_count = 0;
7975 mj_part_t concurrent_part_cut_shift = 0;
7976 mj_part_t concurrent_part_part_shift = 0;
7978 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7980 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7981 global_min_max_coord_total_weight);
7983 mj_scalar_t min_coordinate =
7984 host_global_min_max_coord_total_weight(kk);
7985 mj_scalar_t max_coordinate =
7986 host_global_min_max_coord_total_weight(
7987 kk + current_concurrent_num_parts);
7989 mj_scalar_t global_total_weight =
7990 host_global_min_max_coord_total_weight(
7991 kk + 2 * current_concurrent_num_parts);
7993 mj_part_t concurrent_current_part_index = current_work_part + kk;
7995 mj_part_t partition_count = host_num_partitioning_in_current_dim(
7996 concurrent_current_part_index);
7998 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7999 Kokkos::subview(current_cut_coordinates,
8000 std::pair<mj_lno_t, mj_lno_t>(
8001 concurrent_part_cut_shift, current_cut_coordinates.size()));
8002 Kokkos::View<mj_scalar_t *, device_t>
8003 current_target_part_weights =
8004 Kokkos::subview(target_part_weights,
8005 std::pair<mj_lno_t, mj_lno_t>(
8006 concurrent_part_part_shift, target_part_weights.size()));
8009 concurrent_part_cut_shift += partition_count - 1;
8011 concurrent_part_part_shift += partition_count;
8015 if(partition_count > 1 && min_coordinate <= max_coordinate) {
8019 total_incomplete_cut_count += partition_count - 1;
8021 this->incomplete_cut_count(kk) = partition_count - 1;
8024 this->mj_get_initial_cut_coords_target_weights(
8027 partition_count - 1,
8028 global_total_weight,
8030 current_target_part_weights,
8031 future_num_part_in_parts,
8032 next_future_num_parts_in_parts,
8033 concurrent_current_part_index,
8034 obtained_part_index);
8036 mj_lno_t coordinate_end_index =
8037 host_part_xadj(concurrent_current_part_index);
8038 mj_lno_t coordinate_begin_index =
8039 concurrent_current_part_index==0 ? 0 :
8040 host_part_xadj(concurrent_current_part_index - 1);
8042 this->set_initial_coordinate_parts(
8045 coordinate_begin_index, coordinate_end_index,
8046 this->coordinate_permutations,
8047 mj_current_dim_coords,
8048 this->assigned_part_ids,
8054 this->incomplete_cut_count(kk) = 0;
8057 obtained_part_index += partition_count;
8062 double used_imbalance = 0;
8065 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8068 mj_current_dim_coords,
8071 current_concurrent_num_parts,
8072 current_cut_coordinates,
8073 total_incomplete_cut_count,
8074 view_rectilinear_cut_count,
8075 view_total_reduction_size);
8078 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8083 mj_part_t output_array_shift = 0;
8084 mj_part_t cut_shift = 0;
8085 size_t tlr_shift = 0;
8086 size_t partweight_array_shift = 0;
8087 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8089 mj_part_t current_concurrent_work_part = current_work_part + kk;
8091 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8092 current_concurrent_work_part);
8095 int coordinateA_bigger_than_coordinateB =
8096 host_global_min_max_coord_total_weight(kk) >
8097 host_global_min_max_coord_total_weight(
8098 kk + current_concurrent_num_parts);
8100 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8103 auto local_new_part_xadj = this->new_part_xadj;
8104 Kokkos::parallel_for(
8105 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8106 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8107 local_new_part_xadj(
8108 output_part_index + output_array_shift + jj) = 0;
8111 cut_shift += num_parts - 1;
8112 tlr_shift += (4 *(num_parts - 1) + 1);
8113 output_array_shift += num_parts;
8114 partweight_array_shift += (2 * (num_parts - 1) + 1);
8118 Kokkos::View<mj_scalar_t *, device_t>
8119 current_concurrent_cut_coordinate =
8120 Kokkos::subview(current_cut_coordinates,
8121 std::pair<mj_lno_t, mj_lno_t>(
8123 current_cut_coordinates.size()));
8124 Kokkos::View<mj_scalar_t *, device_t>
8125 used_local_cut_line_weight_to_left =
8126 Kokkos::subview(process_cut_line_weight_to_put_left,
8127 std::pair<mj_lno_t, mj_lno_t>(
8129 process_cut_line_weight_to_put_left.size()));
8131 this->thread_part_weight_work =
8133 this->thread_part_weights,
8134 std::pair<mj_lno_t, mj_lno_t>(
8135 partweight_array_shift,
8136 this->thread_part_weights.extent(0)));
8139 if(this->mj_keep_part_boxes) {
8141 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8142 mj_scalar_t temp_get_val;
8143 Kokkos::parallel_reduce(
"Read single",
8144 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8145 KOKKOS_LAMBDA(
int dummy, mj_scalar_t & set_single) {
8146 set_single = current_concurrent_cut_coordinate(j);
8148 (*output_part_boxes)
8149 [output_array_shift + output_part_index + j].
8150 updateMinMax(temp_get_val, 1 , coordInd);
8151 (*output_part_boxes)
8152 [output_array_shift + output_part_index + j + 1].
8153 updateMinMax(temp_get_val, 0 , coordInd);
8158 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8159 Kokkos::subview(this->new_part_xadj,
8160 std::pair<mj_lno_t, mj_lno_t>(
8161 output_part_index + output_array_shift,
8162 this->new_part_xadj.size()));
8164 this->mj_create_new_partitions(
8166 current_concurrent_work_part,
8167 mj_current_dim_coords,
8168 current_concurrent_cut_coordinate,
8169 used_local_cut_line_weight_to_left,
8174 mj_lno_t coordinate_end = host_part_xadj(
8175 current_concurrent_work_part);
8176 mj_lno_t coordinate_begin =
8177 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8178 current_concurrent_work_part - 1);
8182 mj_lno_t part_size = coordinate_end - coordinate_begin;
8186 auto local_new_part_xadj = this->new_part_xadj;
8187 Kokkos::parallel_for(
8188 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8189 (0, 1), KOKKOS_LAMBDA (
int dummy) {
8190 local_new_part_xadj(
8191 output_part_index + output_array_shift) = part_size;
8194 auto subview_new_coordinate_permutations =
8195 Kokkos::subview(this->new_coordinate_permutations,
8196 std::pair<mj_lno_t, mj_lno_t>(
8198 coordinate_begin + part_size));
8199 auto subview_coordinate_permutations =
8200 Kokkos::subview(this->coordinate_permutations,
8201 std::pair<mj_lno_t, mj_lno_t>(
8203 coordinate_begin + part_size));
8204 Kokkos::deep_copy(subview_new_coordinate_permutations,
8205 subview_coordinate_permutations);
8207 cut_shift += num_parts - 1;
8208 output_array_shift += num_parts;
8209 partweight_array_shift += (2 * (num_parts - 1) + 1);
8218 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8219 mj_part_t num_parts =
8220 host_num_partitioning_in_current_dim(current_work_part + kk);
8224 auto local_new_part_xadj = this->new_part_xadj;
8225 Kokkos::parallel_for(
8226 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8227 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8228 local_new_part_xadj(output_part_index+ii) +=
8229 output_coordinate_end_index;
8234 Kokkos::parallel_reduce(
"Read single",
8235 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8236 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
8238 local_new_part_xadj(output_part_index + num_parts - 1);
8240 output_coordinate_end_index = temp_get;
8242 output_part_index += num_parts;
8248 int current_world_size = this->comm->getSize();
8249 long migration_reduce_all_population =
8250 this->total_dim_num_reduce_all * current_world_size;
8251 bool is_migrated_in_current_dimension =
false;
8256 if(future_num_parts > 1 &&
8257 this->check_migrate_avoid_migration_option >= 0 &&
8258 current_world_size > 1) {
8260 mj_timer_base_string +
"Problem_Migration-" + istring);
8261 mj_part_t num_parts = output_part_count_in_dimension;
8263 if(this->mj_perform_migration(
8266 next_future_num_parts_in_parts,
8267 output_part_begin_index,
8268 migration_reduce_all_population,
8269 this->num_global_coords / (future_num_parts * current_num_parts),
8271 input_part_boxes, output_part_boxes) )
8273 is_migrated_in_current_dimension =
true;
8274 is_data_ever_migrated =
true;
8276 mj_timer_base_string +
"Problem_Migration-" + istring);
8279 this->total_dim_num_reduce_all /= num_parts;
8282 is_migrated_in_current_dimension =
false;
8284 mj_timer_base_string +
"Problem_Migration-" + istring);
8289 Kokkos::View<mj_lno_t*, device_t> tmp =
8290 this->coordinate_permutations;
8291 this->coordinate_permutations =
8292 this->new_coordinate_permutations;
8294 this->new_coordinate_permutations = tmp;
8295 if(!is_migrated_in_current_dimension) {
8296 this->total_dim_num_reduce_all -= current_num_parts;
8297 current_num_parts = output_part_count_in_dimension;
8301 this->part_xadj = this->new_part_xadj;
8302 local_part_xadj = this->new_part_xadj;
8303 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8304 Kokkos::deep_copy(host_part_xadj, part_xadj);
8306 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
8308 mj_timer_base_string +
"Problem_Partitioning_" + istring);
8313 delete future_num_part_in_parts;
8314 delete next_future_num_parts_in_parts;
8316 mj_timer_base_string +
"Problem_Partitioning");
8322 this->set_final_parts(
8324 output_part_begin_index,
8326 is_data_ever_migrated);
8328 result_assigned_part_ids_ = this->assigned_part_ids;
8329 result_mj_gnos_ = this->current_mj_gnos;
8331 mj_timer_base_string +
"Total");
8332 this->mj_env->debug(3,
"Out of MultiJagged");
8335template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8336 typename mj_part_t,
typename mj_node_t>
8337RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8342 if(this->mj_keep_part_boxes) {
8343 return this->kept_boxes;
8346 throw std::logic_error(
"Error: part boxes are not stored.");
8350template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8351 typename mj_part_t,
typename mj_node_t>
8352RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8358 mj_part_t ntasks = this->num_global_parts;
8359 int dim = (*localPartBoxes)[0].getDim();
8360 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
8362 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8364 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
8365 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8367 coord_t *localPartMins = localPartBoundaries;
8368 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8370 coord_t *globalPartMins = globalPartBoundaries;
8371 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8373 mj_part_t boxCount = localPartBoxes->size();
8374 for(mj_part_t i = 0; i < boxCount; ++i) {
8375 mj_part_t pId = (*localPartBoxes)[i].getpId();
8379 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8380 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8382 for(
int j = 0; j < dim; ++j) {
8383 localPartMins[dim * pId + j] = lmins[j];
8384 localPartMaxs[dim * pId + j] = lmaxs[j];
8397 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8398 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8400 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
8401 for(mj_part_t i = 0; i < ntasks; ++i) {
8403 globalPartMins + dim * i,
8404 globalPartMaxs + dim * i);
8417 delete []localPartBoundaries;
8418 delete []globalPartBoundaries;
8425template <
typename Adapter>
8431#ifndef DOXYGEN_SHOULD_SKIP_THIS
8435 typedef typename Adapter::scalar_t adapter_scalar_t;
8438 typedef float default_mj_scalar_t;
8444 (std::is_same<adapter_scalar_t, float>::value ||
8445 std::is_same<adapter_scalar_t, double>::value),
8446 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8448 typedef typename Adapter::gno_t mj_gno_t;
8449 typedef typename Adapter::lno_t mj_lno_t;
8450 typedef typename Adapter::part_t mj_part_t;
8451 typedef typename Adapter::node_t mj_node_t;
8453 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8454 typedef typename mj_node_t::device_type device_t;
8459 RCP<const Environment> mj_env;
8460 RCP<const Comm<int> > mj_problemComm;
8461 RCP<const typename Adapter::base_adapter_t> mj_adapter;
8464 double imbalance_tolerance;
8468 size_t num_global_parts;
8471 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8474 int recursion_depth;
8477 mj_lno_t num_local_coords;
8478 mj_gno_t num_global_coords;
8481 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8485 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8488 int num_weights_per_coord;
8491 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8494 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8497 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8507 mj_part_t num_first_level_parts;
8511 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8515 bool distribute_points_on_cut_lines;
8518 mj_part_t max_concurrent_part_calculation;
8521 int check_migrate_avoid_migration_option;
8529 double minimum_migration_imbalance;
8530 bool mj_keep_part_boxes;
8534 int mj_premigration_option;
8535 int min_coord_per_rank_for_premigration;
8538 ArrayRCP<mj_part_t> comXAdj_;
8541 ArrayRCP<mj_part_t> comAdj_;
8546 void set_input_parameters(
const Teuchos::ParameterList &p);
8548 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
8550 bool mj_premigrate_to_subset(
8552 int migration_selection_option,
8553 RCP<const Environment> mj_env_,
8554 RCP<
const Comm<int> > mj_problemComm_,
8556 mj_lno_t num_local_coords_,
8557 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8558 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8560 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8562 int num_weights_per_coord_,
8563 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8565 RCP<
const Comm<int> > &result_problemComm_,
8566 mj_lno_t & result_num_local_coords_,
8567 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8569 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8570 result_mj_coordinates_,
8571 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8572 int * &result_actual_owner_rank_);
8577 RCP<
const Comm<int> > &problemComm,
8578 const RCP<const typename Adapter::base_adapter_t> &adapter) :
8581 mj_problemComm(problemComm),
8582 mj_adapter(adapter),
8583 imbalance_tolerance(0),
8585 num_global_parts(1),
8588 num_local_coords(0),
8589 num_global_coords(0),
8590 num_weights_per_coord(0),
8591 num_first_level_parts(1),
8592 distribute_points_on_cut_lines(true),
8593 max_concurrent_part_calculation(1),
8594 check_migrate_avoid_migration_option(0),
8596 minimum_migration_imbalance(0.30),
8597 mj_keep_part_boxes(false),
8598 mj_run_as_rcb(false),
8599 mj_premigration_option(0),
8600 min_coord_per_rank_for_premigration(32000),
8614 const bool bUnsorted =
true;
8615 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8617 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
8618 "algorithm. As many as the dimension count.", mj_parts_Validator);
8620 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
8621 "coordinates will be calculated concurently.",
8624 pl.set(
"mj_minimum_migration_imbalance", 1.1,
8625 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8626 "processors to avoid migration",
8629 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8630 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8631 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
8632 "depending on the imbalance, 1 for forcing migration, 2 for "
8633 "avoiding migration", mj_migration_option_validator);
8635 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8636 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8637 pl.set(
"mj_migration_type", 0,
8638 "Migration type, 0 for migration to minimize the imbalance "
8639 "1 for migration to minimize messages exchanged the migration.",
8640 mj_migration_option_validator);
8643 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
8647 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
8650 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
8653 RCP<Teuchos::EnhancedNumberValidator<int>>
8654 mj_num_teams_validator =
8655 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(
8656 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8657 pl.set(
"mj_num_teams", 0,
8658 "How many teams for the main kernel loop"
8659 , mj_num_teams_validator);
8661 RCP<Teuchos::EnhancedNumberValidator<int>>
8662 mj_premigration_option_validator =
8663 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8665 pl.set(
"mj_premigration_option", 0,
8666 "Whether to do premigration or not. 0 for no migration "
8667 "x > 0 for migration to consecutive processors, "
8668 "the subset will be 0,x,2x,3x,...subset ranks."
8669 , mj_premigration_option_validator);
8671 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to "
8672 "assign each rank in multijagged after premigration"
8685 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8689 mj_part_t pointAssign(
int dim, adapter_scalar_t *point)
const;
8691 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8692 size_t &nPartsFound, mj_part_t **partsFound)
const;
8696 void getCommunicationGraph(
8698 ArrayRCP<mj_part_t> &comXAdj,
8699 ArrayRCP<mj_part_t> &comAdj);
8701 void set_up_partitioning_data(
8705 std::string timer_base_string;
8713 template<
class dst_t,
class src_t>
8714 typename std::enable_if<std::is_same<
typename dst_t::value_type,
8715 typename src_t::value_type>::value>::type
8716 assign_if_same(dst_t & dst,
const src_t & src) {
8719 template<
class dst_t,
class src_t>
8720 typename std::enable_if<!std::is_same<
typename dst_t::value_type,
8721 typename src_t::value_type>::value>::type
8722 assign_if_same(dst_t & dst,
const src_t & src) {
8727template <
typename Adapter>
8728bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8730 int migration_selection_option,
8731 RCP<const Environment> mj_env_,
8732 RCP<
const Comm<int> > mj_problemComm_,
8734 mj_lno_t num_local_coords_,
8735 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8736 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8738 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8739 int num_weights_per_coord_,
8740 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8742 RCP<
const Comm<int> > & result_problemComm_,
8743 mj_lno_t &result_num_local_coords_,
8744 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8746 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8747 result_mj_coordinates_,
8748 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8749 int * &result_actual_owner_rank_)
8752 timer_base_string +
"PreMigration DistributorPlanCreating");
8754 int myRank = mj_problemComm_->getRank();
8755 int worldSize = mj_problemComm_->getSize();
8757 mj_part_t groupsize = worldSize / used_num_ranks;
8759 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8761 mj_part_t i_am_sending_to = 0;
8762 bool am_i_a_receiver =
false;
8764 for(
int i = 0; i < used_num_ranks; ++i) {
8765 group_begins[i+ 1] = group_begins[i] + groupsize;
8766 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8767 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8768 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8769 i_am_sending_to = group_begins[i];
8771 if(myRank == group_begins[i]) {
8772 am_i_a_receiver =
true;
8776 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8777 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8779 Tpetra::Distributor distributor(mj_problemComm_);
8781 std::vector<mj_part_t>
8782 coordinate_destinations(num_local_coords_, i_am_sending_to);
8784 ArrayView<const mj_part_t>
8785 destinations(&(coordinate_destinations[0]), num_local_coords_);
8786 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8787 result_num_local_coords_ = num_incoming_gnos;
8789 timer_base_string +
"PreMigration DistributorPlanCreating");
8792 timer_base_string +
"PreMigration DistributorMigration");
8800 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8801 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
8802 initial_mj_gnos_.size());
8803 Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8805 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8806 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
8809 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8811 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8812 Kokkos::ViewAllocateWithoutInitializing(
"result_initial_mj_gnos_"),
8814 Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8820 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8821 host_src_coordinates(
8822 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8823 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8825 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8827 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8828 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8829 num_incoming_gnos, this->coord_dim);
8831 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8832 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
8835 for(
int i = 0; i < this->coord_dim; ++i) {
8837 auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8839 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8841 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8845 result_mj_coordinates_ = dst_coordinates;
8849 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8850 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
8851 num_incoming_gnos, this->num_weights_per_coord);
8852 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8854 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8855 Kokkos::HostSpace(), this->mj_weights);
8858 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8859 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
8860 this->num_local_coords);
8862 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8863 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
8866 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
8868 auto sub_host_src_weights
8869 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8870 auto sub_host_dst_weights
8871 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8879 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8880 sent_weight[n] = sub_host_src_weights(n);
8883 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8886 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8887 sub_host_dst_weights(n) = received_weight[n];
8890 Kokkos::deep_copy(dst_weights, host_dst_weights);
8891 result_mj_weights_ = dst_weights;
8895 Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8896 Kokkos::ViewAllocateWithoutInitializing(
"sent_owners"),
8898 Kokkos::deep_copy(sent_owners, myRank);
8900 Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8901 Kokkos::ViewAllocateWithoutInitializing(
"received_owners"),
8904 distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8906 result_actual_owner_rank_ =
new int[num_incoming_gnos];
8908 result_actual_owner_rank_,
8909 received_owners.data(),
8910 num_incoming_gnos *
sizeof(
int));
8914 timer_base_string +
"PreMigration DistributorMigration");
8915 return am_i_a_receiver;
8925template <
typename Adapter>
8934 int execute_counter =
8936 timer_base_string =
"partition(" + std::to_string(execute_counter) +
") - ";
8938 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"all");
8940 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"setup");
8944 this->set_input_parameters(this->mj_env->getParameters());
8945 if(this->mj_keep_part_boxes) {
8946 this->mj_partitioner.set_to_keep_part_boxes();
8949 this->mj_partitioner.set_partitioning_parameters(
8950 this->distribute_points_on_cut_lines,
8951 this->max_concurrent_part_calculation,
8952 this->check_migrate_avoid_migration_option,
8953 this->minimum_migration_imbalance, this->migration_type);
8955 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8956 mj_lno_t result_num_local_coords = this->num_local_coords;
8957 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8959 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8960 result_mj_coordinates = this->mj_coordinates;
8961 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8963 int *result_actual_owner_rank = NULL;
8965 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8966 this->initial_mj_gnos;
8984 int current_world_size = this->mj_problemComm->getSize();
8985 mj_lno_t threshold_num_local_coords =
8986 this->min_coord_per_rank_for_premigration;
8987 bool is_pre_migrated =
false;
8988 bool am_i_in_subset =
true;
8994 if(mj_premigration_option > 0 &&
8995 size_t (current_world_size) > this->num_global_parts &&
8996 this->num_global_coords < mj_gno_t (
8997 current_world_size * threshold_num_local_coords))
8999 if(this->mj_keep_part_boxes) {
9000 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and "
9001 "mj_premigration_option are not supported together yet.");
9004 is_pre_migrated =
true;
9005 int migration_selection_option = mj_premigration_option;
9006 if(migration_selection_option * this->num_global_parts >
9007 (
size_t) (current_world_size)) {
9008 migration_selection_option =
9009 current_world_size / this->num_global_parts;
9012 int used_num_ranks = int (this->num_global_coords /
9013 float (threshold_num_local_coords) + 0.5);
9015 if(used_num_ranks == 0) {
9019 am_i_in_subset = this->mj_premigrate_to_subset(
9021 migration_selection_option,
9023 this->mj_problemComm,
9025 this->num_local_coords,
9026 this->num_global_coords,
9027 this->num_global_parts,
9028 this->initial_mj_gnos,
9029 this->mj_coordinates,
9030 this->num_weights_per_coord,
9034 result_num_local_coords,
9035 result_initial_mj_gnos,
9036 result_mj_coordinates,
9038 result_actual_owner_rank);
9040 result_initial_mj_gnos_ = result_initial_mj_gnos;
9043 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9044 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9046 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"setup");
9048 if(am_i_in_subset) {
9049 this->mj_partitioner.multi_jagged_part(
9052 this->imbalance_tolerance,
9054 this->num_global_parts,
9055 this->part_no_array,
9056 this->recursion_depth,
9058 result_num_local_coords,
9059 this->num_global_coords,
9060 result_initial_mj_gnos_,
9061 result_mj_coordinates,
9062 this->num_weights_per_coord,
9063 this->mj_uniform_weights,
9065 this->mj_uniform_parts,
9066 result_assigned_part_ids,
9071 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"cleanup");
9074 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9075 localGidToLid.reserve(result_num_local_coords);
9076 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9077 Kokkos::ViewAllocateWithoutInitializing(
"host_result_initial_mj_gnos"),
9078 result_initial_mj_gnos_.size());
9079 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9080 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9081 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9084 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
9085 0, result_num_local_coords,
true);
9086 auto host_result_assigned_part_ids =
9087 Kokkos::create_mirror_view(result_assigned_part_ids);
9088 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9089 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9090 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9091 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9092 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9093 partId[origLID] = host_result_assigned_part_ids(i);
9098 if(is_pre_migrated) {
9099 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
9100 "PostMigration DistributorPlanCreating");
9101 Tpetra::Distributor distributor(this->mj_problemComm);
9103 ArrayView<const mj_part_t> actual_owner_destinations(
9104 result_actual_owner_rank , result_num_local_coords);
9106 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9107 actual_owner_destinations);
9109 if(num_incoming_gnos != this->num_local_coords) {
9110 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - "
9111 "num incoming is not equal to num local coords");
9115 "PostMigration DistributorPlanCreating");
9117 "PostMigration DistributorMigration");
9119 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9120 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
9122 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9123 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
9126 distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9129 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9130 if (partId.size() > 0) {
9131 sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9132 partId.getRawPtr(), partId.size());
9134 distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9137 partId = arcp(
new mj_part_t[this->num_local_coords],
9138 0, this->num_local_coords,
true);
9141 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9142 localGidToLid2.reserve(this->num_local_coords);
9143 auto host_initial_mj_gnos =
9144 Kokkos::create_mirror_view(this->initial_mj_gnos);
9145 Kokkos::deep_copy(host_initial_mj_gnos,
9146 this->initial_mj_gnos);
9147 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9148 localGidToLid2[host_initial_mj_gnos(i)] = i;
9151 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9152 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9153 partId[origLID] = received_partids[i];
9158 delete [] result_actual_owner_rank;
9161 timer_base_string +
"PostMigration DistributorMigration");
9163 solution->setParts(partId);
9164 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"cleanup");
9167 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"all");
9170 this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9175template <
typename Adapter>
9188 int criteria_dim = (this->num_weights_per_coord ?
9189 this->num_weights_per_coord : 1);
9193 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9196 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9197 "uniform parts", criteria_dim);
9198 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9199 "uniform weights", criteria_dim);
9201 Kokkos::View<const mj_gno_t *, device_t> gnos;
9202 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9204 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9207 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9208 Kokkos::View<mj_scalar_t **, device_t> wgts;
9212 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9215 assign_if_same(xyz, xyz_adapter);
9216 assign_if_same(wgts, wgts_adapter);
9221 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9222 (Kokkos::ViewAllocateWithoutInitializing(
9223 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9224 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9225 Kokkos::ViewAllocateWithoutInitializing(
"wgts"),
9226 wgts_adapter.extent(0), wgts_adapter.extent(1));
9228 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9229 Kokkos::parallel_for(
9230 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9231 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (
int i) {
9232 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9233 xyz(i, n) =
static_cast<mj_scalar_t
>(xyz_adapter(i, n));
9236 Kokkos::parallel_for(
9237 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9238 (0, wgts.extent(0)), KOKKOS_LAMBDA (
int i) {
9239 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9240 wgts(i, n) =
static_cast<mj_scalar_t
>(wgts_adapter(i, n));
9246 this->initial_mj_gnos = gnos;
9248 this->mj_coordinates = xyz;
9251 if(this->num_weights_per_coord == 0) {
9252 this->mj_uniform_weights(0) =
true;
9253 Kokkos::resize(this->mj_weights, 0, 0);
9256 this->mj_weights = wgts;
9257 for(
int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9258 this->mj_uniform_weights(wdim) =
false;
9262 for(
int wdim = 0; wdim < criteria_dim; ++wdim) {
9263 if(solution->criteriaHasUniformPartSizes(wdim)) {
9264 this->mj_uniform_parts(wdim) =
true;
9267 printf(
"Error: MJ does not support non uniform target part weights\n");
9276template <
typename Adapter>
9277void Zoltan2_AlgMJ<Adapter>::set_input_parameters(
9278 const Teuchos::ParameterList &pl)
9280 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
9283 tol = pe->getValue(&tol);
9284 this->imbalance_tolerance = tol - 1.0;
9288 if(this->imbalance_tolerance <= 0) {
9289 this->imbalance_tolerance= 10e-4;
9293 Kokkos::resize(this->part_no_array, 0);
9296 this->recursion_depth = 0;
9298 if(pl.getPtr<
int>(
"mj_num_teams")) {
9299 this->num_teams = pl.get<
int>(
"mj_num_teams");
9302 if(pl.getPtr<Array <mj_part_t> >(
"mj_parts")) {
9303 auto mj_parts = pl.get<Array <mj_part_t> >(
"mj_parts");
9304 int mj_parts_size =
static_cast<int>(mj_parts.size());
9307 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9308 "part_no_array", mj_parts_size);
9309 for(
int i = 0; i < mj_parts_size; ++i) {
9310 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9313 this->recursion_depth = mj_parts_size - 1;
9314 this->mj_env->debug(2,
"mj_parts provided by user");
9318 this->distribute_points_on_cut_lines =
true;
9319 this->max_concurrent_part_calculation = 1;
9321 this->mj_run_as_rcb =
false;
9322 this->mj_premigration_option = 0;
9323 this->min_coord_per_rank_for_premigration = 32000;
9325 int mj_user_recursion_depth = -1;
9326 this->mj_keep_part_boxes =
false;
9327 this->check_migrate_avoid_migration_option = 0;
9328 this->migration_type = 0;
9329 this->minimum_migration_imbalance = 0.35;
9331 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
9334 imb = pe->getValue(&imb);
9335 this->minimum_migration_imbalance = imb - 1.0;
9338 pe = pl.getEntryPtr(
"mj_migration_option");
9340 this->check_migrate_avoid_migration_option =
9341 pe->getValue(&this->check_migrate_avoid_migration_option);
9343 this->check_migrate_avoid_migration_option = 0;
9345 if(this->check_migrate_avoid_migration_option > 1) {
9346 this->check_migrate_avoid_migration_option = -1;
9350 pe = pl.getEntryPtr(
"mj_migration_type");
9352 this->migration_type = pe->getValue(&this->migration_type);
9354 this->migration_type = 0;
9360 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
9362 this->max_concurrent_part_calculation =
9363 pe->getValue(&this->max_concurrent_part_calculation);
9365 this->max_concurrent_part_calculation = 1;
9368 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
9370 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9372 this->mj_keep_part_boxes =
false;
9383 if(this->mj_keep_part_boxes ==
false) {
9384 pe = pl.getEntryPtr(
"mapping_type");
9386 int mapping_type = -1;
9387 mapping_type = pe->getValue(&mapping_type);
9388 if(mapping_type == 0) {
9389 mj_keep_part_boxes =
true;
9395 pe = pl.getEntryPtr(
"mj_enable_rcb");
9397 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9399 this->mj_run_as_rcb =
false;
9402 pe = pl.getEntryPtr(
"mj_premigration_option");
9404 mj_premigration_option = pe->getValue(&mj_premigration_option);
9406 mj_premigration_option = 0;
9409 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
9411 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9413 min_coord_per_rank_for_premigration = 32000;
9416 pe = pl.getEntryPtr(
"mj_recursion_depth");
9418 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9420 mj_user_recursion_depth = -1;
9424 pe = pl.getEntryPtr(
"rectilinear");
9426 val = pe->getValue(&val);
9429 this->distribute_points_on_cut_lines =
false;
9431 this->distribute_points_on_cut_lines =
true;
9434 if(this->mj_run_as_rcb) {
9435 mj_user_recursion_depth =
9436 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9438 if(this->recursion_depth < 1) {
9439 if(mj_user_recursion_depth > 0) {
9440 this->recursion_depth = mj_user_recursion_depth;
9443 this->recursion_depth = this->coord_dim;
9449template <
typename Adapter>
9452 adapter_scalar_t *lower,
9453 adapter_scalar_t *upper,
9454 size_t &nPartsFound,
9455 typename Adapter::part_t **partsFound)
const
9464 if(this->mj_keep_part_boxes) {
9467 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9469 size_t nBoxes = (*partBoxes).size();
9471 throw std::logic_error(
"no part boxes exist");
9475 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9477 if(globalBox->boxesOverlap(dim, lower, upper)) {
9479 std::vector<typename Adapter::part_t> partlist;
9482 for(
size_t i = 0; i < nBoxes; i++) {
9484 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9486 partlist.push_back((*partBoxes)[i].getpId());
9508 *partsFound =
new mj_part_t[nPartsFound];
9509 for(
size_t i = 0; i < nPartsFound; i++)
9510 (*partsFound)[i] = partlist[i];
9522 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
9527template <
typename Adapter>
9530 adapter_scalar_t *point)
const
9536 if(this->mj_keep_part_boxes) {
9537 typename Adapter::part_t foundPart = -1;
9540 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9542 size_t nBoxes = (*partBoxes).size();
9544 throw std::logic_error(
"no part boxes exist");
9548 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9550 if(globalBox->pointInBox(dim, point)) {
9554 for(i = 0; i < nBoxes; i++) {
9556 if((*partBoxes)[i].pointInBox(dim, point)) {
9557 foundPart = (*partBoxes)[i].getpId();
9571 std::ostringstream oss;
9573 for(
int j = 0; j < dim; j++) oss << point[j] <<
" ";
9574 oss <<
") not found in domain";
9575 throw std::logic_error(oss.str());
9585 size_t closestBox = 0;
9586 coord_t minDistance = std::numeric_limits<coord_t>::max();
9587 coord_t *centroid =
new coord_t[dim];
9588 for(
size_t i = 0; i < nBoxes; i++) {
9589 (*partBoxes)[i].computeCentroid(centroid);
9592 for(
int j = 0; j < dim; j++) {
9593 diff = centroid[j] - point[j];
9596 if(sum < minDistance) {
9601 foundPart = (*partBoxes)[closestBox].getpId();
9608 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
9612template <
typename Adapter>
9618 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9619 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9620 mj_part_t ntasks = (*pBoxes).size();
9621 int dim = (*pBoxes)[0].getDim();
9622 GridHash grid(pBoxes, ntasks, dim);
9629template <
typename Adapter>
9630RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9631Zoltan2_AlgMJ<Adapter>::getGlobalBoxBoundaries()
const
9633 return this->mj_partitioner.get_kept_boxes();
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries()
Default Constructor.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
global_size_t getGlobalNumCoordinates() const
Returns the global number coordinates.
size_t getCoordinatesKokkos(Kokkos::View< const gno_t *, typename node_t::device_type > &Ids, Kokkos::View< scalar_t **, Kokkos::LayoutLeft, typename node_t::device_type > &xyz, Kokkos::View< scalar_t **, typename node_t::device_type > &wgts) const
Returns the coordinate ids, values and optional weights.
int getCoordinateDim() const
Returns the dimension of the coordinates.
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
int getNumWeightsPerCoordinate() const
Returns the number (0 or greater) of weights per coordinate.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const typename Adapter::base_adapter_t > &adapter)
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals....
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
SparseMatrixAdapter_t::part_t part_t
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
int value_count_rightleft
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
ArrayCombinationReducer reducer
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< part_t *, device_t > parts
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
Kokkos::View< index_t *, device_t > part_xadj
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > track_on_cuts
Kokkos::View< scalar_t *, device_t > coordinates
part_t concurrent_current_part
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
part_t concurrent_current_part
Kokkos::View< scalar_t **, device_t > weights
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
part_t current_concurrent_num_parts
Kokkos::View< scalar_t *, device_t > coordinates
Kokkos::View< part_t *, device_t > parts
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > part_xadj
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
int value_count_rightleft
static int get_counter_Zoltan2_AlgMJ()
static int get_counter_AlgMJ()
Zoltan2_MJArrayType< scalar_t > & operator=(const volatile Zoltan2_MJArrayType< scalar_t > &zmj)
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.