52#include <Teuchos_ScalarTraits.hpp>
53#include <Kokkos_ArithTraits.hpp>
64 template<
class Scalar,
class Node>
67 using impl_scalar_type =
typename Kokkos::Details::ArithTraits<Scalar>::val_type;
69 using exec_space =
typename Node::execution_space;
70 using memory_space =
typename Node::memory_space;
71 using range_policy = Kokkos::RangePolicy<exec_space>;
73 Kokkos::View<impl_scalar_type*,memory_space> a(
"a", VECTOR_SIZE);
74 Kokkos::View<impl_scalar_type*,memory_space> b(
"b", VECTOR_SIZE);
75 Kokkos::View<impl_scalar_type*,memory_space> c(
"c", VECTOR_SIZE);
76 double total_test_time = 0.0;
78 impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
80 Kokkos::parallel_for(
"stream/fill",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (
const size_t i) {
81 a(i) = ONE * (double)i;
86 using clock = std::chrono::high_resolution_clock;
88 clock::time_point start, stop;
90 for(
int i = 0; i < KERNEL_REPEATS; i++) {
92 Kokkos::parallel_for(
"stream/add",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (
const size_t j) {
98 double my_test_time = std::chrono::duration<double>(stop - start).count();
99 total_test_time += my_test_time;
102 return total_test_time / KERNEL_REPEATS;
105 template<
class Scalar,
class Node>
108 using impl_scalar_type =
typename Kokkos::Details::ArithTraits<Scalar>::val_type;
110 using exec_space =
typename Node::execution_space;
111 using memory_space =
typename Node::memory_space;
112 using range_policy = Kokkos::RangePolicy<exec_space>;
114 Kokkos::View<impl_scalar_type*,memory_space> a(
"a", VECTOR_SIZE);
115 Kokkos::View<impl_scalar_type*,memory_space> b(
"b", VECTOR_SIZE);
116 double total_test_time = 0.0;
118 impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
120 Kokkos::parallel_for(
"stream/fill",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (
const size_t i) {
123 exec_space().fence();
125 using clock = std::chrono::high_resolution_clock;
126 clock::time_point start, stop;
128 for(
int i = 0; i < KERNEL_REPEATS; i++) {
129 start = clock::now();
130 Kokkos::parallel_for(
"stream/copy",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (
const size_t j) {
134 exec_space().fence();
136 double my_test_time = std::chrono::duration<double>(stop - start).count();
137 total_test_time += my_test_time;
140 return total_test_time / KERNEL_REPEATS;
145 double table_lookup(
const std::vector<int> & x,
const std::vector<double> & y,
int value) {
147 if(x.size() == 0)
return Teuchos::ScalarTraits<double>::nan();
150 int N = (int) x.size();
152 for( ; hi < N; hi++) {
166 int run = x[hi] - x[hi-1];
167 double rise = y[hi] - y[hi-1];
168 double slope = rise / run;
169 int diff = value - x[hi-1];
171 return y[hi-1] + slope * diff;
176 int run = x[hi] - x[hi-1];
177 double rise = y[hi] - y[hi-1];
178 double slope = rise / run;
179 int diff = value - x[hi-1];
181 return y[hi-1] + slope * diff;
186 const double GB = 1024.0 * 1024.0 * 1024.0;
188 double time_per_call = time / num_calls;
189 return memory_per_call_bytes /
GB / time_per_call;
193 template <
class exec_space,
class memory_space>
195 int rank = comm.getRank();
196 int nproc = comm.getSize();
198 if(nproc < 2)
return;
201 using range_policy = Kokkos::RangePolicy<exec_space>;
202 const int buff_size = (int) pow(2,MAX_SIZE);
204 sizes.resize(MAX_SIZE+1);
205 times.resize(MAX_SIZE+1);
208 Kokkos::View<char*,memory_space> r_buf(
"recv",buff_size), s_buf(
"send",buff_size);
209 Kokkos::deep_copy(s_buf,1);
214 int buddy = odd ? rank - 1 : rank + 1;
216 for(
int i = 0; i < MAX_SIZE + 1 ;i ++) {
217 int msg_size = (int) pow(2,i);
220 double t0 = MPI_Wtime();
221 for(
int j = 0; j < KERNEL_REPEATS; j++) {
224 comm.send(msg_size, (
char*)s_buf.data(), buddy);
225 comm.receive(buddy, msg_size, (
char*)r_buf.data());
228 comm.receive(buddy, msg_size,(
char*)r_buf.data());
229 comm.send(msg_size, (
char*)s_buf.data(), buddy);
234 double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
236 times[i] = time_per_call;
246 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
250 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
267 for(
int i=0; i<LOG_MAX_SIZE+1; i++) {
268 int size = (int) pow(2,i);
287 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
293 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
299 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
306 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
312 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
318 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
325 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
331 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
338 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
342 std::ios old_format(NULL);
343 old_format.copyfmt(out);
345 out << setw(20) <<
"Length in Scalars" << setw(1) <<
" "
346 << setw(20) <<
"COPY (us)" << setw(1) <<
" "
347 << setw(20) <<
"ADD (us)" << setw(1) <<
" "
348 << setw(20) <<
"COPY (GB/s)" << setw(1) <<
" "
349 << setw(20) <<
"ADD (GB/s)" << std::endl;
351 out << setw(20) <<
"-----------------" << setw(1) <<
" "
352 << setw(20) <<
"---------" << setw(1) <<
" "
353 << setw(20) <<
"--------" << setw(1) <<
" "
354 << setw(20) <<
"-----------" << setw(1) <<
" "
355 << setw(20) <<
"----------" << std::endl;
367 out << setw(20) << size << setw(1) <<
" "
368 << setw(20) << fixed << setprecision(4) << (c_time*1e6) << setw(1) <<
" "
369 << setw(20) << fixed << setprecision(4) << (a_time*1e6) << setw(1) <<
" "
370 << setw(20) << fixed << setprecision(4) << c_bw << setw(1) <<
" "
371 << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
374 out.copyfmt(old_format);
380 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
390 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
396 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
403 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
409 std::ios old_format(NULL);
410 old_format.copyfmt(out);
412 out << setw(20) <<
"Message Size" << setw(1) <<
" "
413 << setw(20) <<
"Host (us)" << setw(1) <<
" "
414 << setw(20) <<
"Device (us)" << std::endl;
416 out << setw(20) <<
"------------" << setw(1) <<
" "
417 << setw(20) <<
"---------" << setw(1) <<
" "
418 << setw(20) <<
"-----------" << std::endl;
427 out << setw(20) << size << setw(1) <<
" "
428 << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) <<
" "
429 << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
432 out.copyfmt(old_format);
435 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
438 using exec_space =
typename Node::execution_space;
439 using range_policy = Kokkos::RangePolicy<exec_space>;
440 using clock = std::chrono::high_resolution_clock;
442 double total_test_time = 0;
443 clock::time_point start, stop;
444 for(
int i = 0; i < KERNEL_REPEATS; i++) {
445 start = clock::now();
446 Kokkos::parallel_for(
"empty kernel",range_policy(0,1), KOKKOS_LAMBDA (
const size_t j) {
449 exec_space().fence();
451 double my_test_time = std::chrono::duration<double>(stop - start).count();
452 total_test_time += my_test_time;
458 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
465 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
469 std::ios old_format(NULL);
470 old_format.copyfmt(out);
472 out << setw(20) <<
"Launch+Wait Latency (us)" << setw(1) <<
" "
475 out.copyfmt(old_format);
MueLu::DefaultScalar Scalar
void print_pingpong_table(std::ostream &out)
double launch_and_wait_latency_
void print_latency_corrected_stream_vector_table(std::ostream &out)
std::vector< int > pingpong_sizes_
void print_stream_vector_table(std::ostream &out)
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction)
std::vector< int > stream_sizes_
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
double launch_latency_lookup()
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
std::vector< double > latency_corrected_stream_copy_times_
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)
std::vector< double > pingpong_device_times_
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
double pingpong_device_lookup(int SIZE_IN_BYTES)
std::vector< double > latency_corrected_stream_add_times_
double pingpong_host_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double stream_vector_add_lookup(int SIZE_IN_BYTES)
std::vector< double > stream_copy_times_
double stream_vector_lookup(int SIZE_IN_BYTES)
std::vector< double > stream_add_times_
std::vector< double > pingpong_host_times_
void launch_latency_make_table(int KERNEL_REPEATS)
void print_launch_latency_table(std::ostream &out)
double table_lookup(const std::vector< int > &x, const std::vector< double > &y, int value)
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE)
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm< int > &comm, std::vector< int > &sizes, std::vector< double > &times)
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes)
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE)
Namespace for MueLu classes and methods.