MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_PerfModels_def.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// MueLu: A package for multigrid based preconditioning
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact
39// Jonathan Hu (jhu@sandia.gov)
40// Andrey Prokopenko (aprokop@sandia.gov)
41// Ray Tuminaro (rstumin@sandia.gov)
42//
43// ***********************************************************************
44//
45// @HEADER
47
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <iomanip>
#include <numeric>
#include <utility>
54#include <Teuchos_ScalarTraits.hpp>
55#include <Kokkos_ArithTraits.hpp>
56#include <Xpetra_Import.hpp>
57#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
58#include <Xpetra_TpetraImport.hpp>
59#include <Tpetra_Import.hpp>
60#include <Tpetra_Distributor.hpp>
61#include <mpi.h>
62#endif
63
64
65
66#ifdef HAVE_MPI
67#include <mpi.h>
68#endif
69
70namespace MueLu {
71
72 namespace PerfDetails {
73 template<class Scalar,class Node>
74 double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE) {
75 // PerfDetails' STREAM routines need to be instantiatiated on impl_scalar_type, not Scalar
76 using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
77
78 using exec_space = typename Node::execution_space;
79 using memory_space = typename Node::memory_space;
80 using range_policy = Kokkos::RangePolicy<exec_space>;
81
82 Kokkos::View<impl_scalar_type*,memory_space> a("a", VECTOR_SIZE);
83 Kokkos::View<impl_scalar_type*,memory_space> b("b", VECTOR_SIZE);
84 Kokkos::View<impl_scalar_type*,memory_space> c("c", VECTOR_SIZE);
85 double total_test_time = 0.0;
86
87 impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
88
89 Kokkos::parallel_for("stream/fill",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t i) {
90 a(i) = ONE * (double)i;
91 b(i) = a(i);
92 });
93 exec_space().fence();
94
95 using clock = std::chrono::high_resolution_clock;
96
97 clock::time_point start, stop;
98
99 for(int i = 0; i < KERNEL_REPEATS; i++) {
100 start = clock::now();
101 Kokkos::parallel_for("stream/add",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t j) { //Vector Addition
102 c(j) = a(j) + b(j);
103 });
104
105 exec_space().fence();
106 stop = clock::now();
107 double my_test_time = std::chrono::duration<double>(stop - start).count();
108 total_test_time += my_test_time;
109 }
110
111 return total_test_time / KERNEL_REPEATS;
112 }
113
114 template<class Scalar,class Node>
115 double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE) {
116 // PerfDetails' STREAM routines need to be instantiatiated on impl_scalar_type, not Scalar
117 using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
118
119 using exec_space = typename Node::execution_space;
120 using memory_space = typename Node::memory_space;
121 using range_policy = Kokkos::RangePolicy<exec_space>;
122
123 Kokkos::View<impl_scalar_type*,memory_space> a("a", VECTOR_SIZE);
124 Kokkos::View<impl_scalar_type*,memory_space> b("b", VECTOR_SIZE);
125 double total_test_time = 0.0;
126
127 impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
128
129 Kokkos::parallel_for("stream/fill",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t i) {
130 a(i) = ONE;
131 });
132 exec_space().fence();
133
134 using clock = std::chrono::high_resolution_clock;
135 clock::time_point start, stop;
136
137 for(int i = 0; i < KERNEL_REPEATS; i++) {
138 start = clock::now();
139 Kokkos::parallel_for("stream/copy",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t j) { //Vector Addition
140 b(j) = a(j);
141 });
142
143 exec_space().fence();
144 stop = clock::now();
145 double my_test_time = std::chrono::duration<double>(stop - start).count();
146 total_test_time += my_test_time;
147 }
148
149 return total_test_time / KERNEL_REPEATS;
150 }
151
152
153
154 double table_lookup(const std::vector<int> & x, const std::vector<double> & y, int value) {
155 // If there's no table, nan
156 if(x.size() == 0) return Teuchos::ScalarTraits<double>::nan();
157
158 // NOTE: This should probably be a binary search, but this isn't performance sensitive, so we'll go simple
159 int N = (int) x.size();
160 int hi = 0;
161 for( ; hi < N; hi++) {
162 if (x[hi] > value)
163 break;
164 }
165
166 if(hi == 0) {
167 // Lower end (return the min time)
168 //printf("Lower end: %d < %d\n",value,x[0]);
169 return y[0];
170 }
171 else if (hi == N) {
172 // Higher end (extrapolate from the last two points)
173 //printf("Upper end: %d > %d\n",value,x[N-1]);
174 hi = N-1;
175 int run = x[hi] - x[hi-1];
176 double rise = y[hi] - y[hi-1];
177 double slope = rise / run;
178 int diff = value - x[hi-1];
179
180 return y[hi-1] + slope * diff;
181 }
182 else {
183 // Interpolate
184 //printf("Middle: %d < %d < %d\n",x[hi-1],value,x[hi]);
185 int run = x[hi] - x[hi-1];
186 double rise = y[hi] - y[hi-1];
187 double slope = rise / run;
188 int diff = value - x[hi-1];
189
190 return y[hi-1] + slope * diff;
191 }
192 }
193
194 // Report bandwidth in GB / sec
195 const double GB = 1024.0 * 1024.0 * 1024.0;
196 double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes) {
197 double time_per_call = time / num_calls;
198 return memory_per_call_bytes / GB / time_per_call;
199 }
200
201
    // Classic two-rank "ping-pong" benchmark over a Teuchos communicator.
    // Ranks are paired consecutively: (0,1), (2,3), ...  Each pair exchanges
    // round-trip messages of 2^0 .. 2^MAX_SIZE bytes, KERNEL_REPEATS round
    // trips per size.  On exit sizes[i] holds the message size in bytes and
    // times[i] the average one-way time in seconds.  The buffers live in
    // memory_space, so host vs. device transfer is selected by the template
    // argument.  No-op (outputs untouched) on one rank or when MPI is absent.
    template <class exec_space, class memory_space>
    void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE,const Teuchos::Comm<int> &comm, std::vector<int> & sizes, std::vector<double> & times) {
      int rank = comm.getRank();
      int nproc = comm.getSize();

      // Need at least one pair of ranks to exchange messages.
      if(nproc < 2) return;

#ifdef HAVE_MPI
      // Largest message is 2^MAX_SIZE bytes; one buffer serves every size.
      const int buff_size = (int) pow(2,MAX_SIZE);

      sizes.resize(MAX_SIZE+1);
      times.resize(MAX_SIZE+1);

      // Allocate memory for the buffers (and fill send)
      Kokkos::View<char*,memory_space> r_buf("recv",buff_size), s_buf("send",buff_size);
      Kokkos::deep_copy(s_buf,1);

      // Send and receive.
      // NOTE: Do consecutive pair buddies here for simplicity. We should be smart later
      int odd = rank % 2;
      int buddy = odd ? rank - 1 : rank + 1;

      for(int i = 0; i < MAX_SIZE + 1 ;i ++) {
        int msg_size = (int) pow(2,i);
        comm.barrier();

        double t0 = MPI_Wtime();
        for(int j = 0; j < KERNEL_REPEATS; j++) {
          // With an odd rank count the last rank has no buddy and sits out.
          if (buddy < nproc) {
            // Odd ranks send first, even ranks receive first: the blocking
            // calls in each pair are complementary, so no deadlock.
            if (odd) {
              comm.send(msg_size, (char*)s_buf.data(), buddy);
              comm.receive(buddy, msg_size, (char*)r_buf.data());
            }
            else {
              comm.receive(buddy, msg_size,(char*)r_buf.data());
              comm.send(msg_size, (char*)s_buf.data(), buddy);
            }
          }
        }

        // Each repeat is a round trip, hence the extra factor of 2 to report
        // one-way time.
        double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
        sizes[i] = msg_size;
        times[i] = time_per_call;
      }
#endif
    }
248
249
250 template <class exec_space, class memory_space, class LocalOrdinal, class GlobalOrdinal, class Node>
251 void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE,const RCP<const Xpetra::Import<LocalOrdinal,GlobalOrdinal,Node> > & import, std::vector<int> & sizes, std::vector<double> & times) {
252 int nproc = import->getSourceMap()->getComm()->getSize();
253 if(nproc < 2) return;
254#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
255 // NOTE: We need to get the distributer here, which means we need Tpetra, since Xpetra does
256 // not have a distributor interface
257 using x_import_type = Xpetra::TpetraImport<LocalOrdinal,GlobalOrdinal,Node>;
258 RCP<const x_import_type> Ximport = Teuchos::rcp_dynamic_cast<const x_import_type>(import);
259 RCP<const Teuchos::MpiComm<int> > mcomm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(import->getSourceMap()->getComm());
260 MPI_Comm communicator = *mcomm->getRawMpiComm();
261
262 if(Ximport.is_null() || mcomm.is_null()) return;
263 auto Timport = Ximport->getTpetra_Import();
264 auto distor = Timport->getDistributor();
265
266 // Distributor innards
267 Teuchos::ArrayView<const int> procsFrom = distor.getProcsFrom();
268 Teuchos::ArrayView<const int> procsTo = distor.getProcsTo();
269 int num_recvs = (int)distor.getNumReceives();
270 int num_sends = (int)distor.getNumSends();
271
272
273 const int buff_size_per_msg = (int) pow(2,MAX_SIZE);
274 sizes.resize(MAX_SIZE+1);
275 times.resize(MAX_SIZE+1);
276
277 // Allocate memory for the buffers (and fill send)
278 Kokkos::View<char*,memory_space> f_recv_buf("forward_recv",buff_size_per_msg*num_recvs), f_send_buf("forward_send",buff_size_per_msg*num_sends);
279 Kokkos::View<char*,memory_space> r_recv_buf("reverse_recv",buff_size_per_msg*num_sends), r_send_buf("reverse_send",buff_size_per_msg*num_recvs);
280 Kokkos::deep_copy(f_send_buf,1);
281 Kokkos::deep_copy(r_send_buf,1);
282
283 std::vector<MPI_Request> requests(num_sends+num_recvs);
284 std::vector<MPI_Status> status(num_sends+num_recvs);
285
286
287 for(int i = 0; i < MAX_SIZE + 1 ;i ++) {
288 int msg_size = (int) pow(2,i);
289
290 MPI_Barrier(communicator);
291
292 double t0 = MPI_Wtime();
293 for(int j = 0; j < KERNEL_REPEATS; j++) {
294 int ct=0;
295 // Recv/Send the forward messsages
296 for(int r=0; r<num_recvs;r++) {
297 const int tag = 1000+j;
298 MPI_Irecv(&f_recv_buf[msg_size*r],msg_size,MPI_CHAR,procsFrom[r],tag,communicator,&requests[ct]);
299 ct++;
300 }
301 for(int s=0; s<num_sends;s++) {
302 const int tag = 1000+j;
303 MPI_Isend(&f_send_buf[msg_size*s],msg_size,MPI_CHAR,procsTo[s],tag,communicator,&requests[ct]);
304 ct++;
305 }
306 // Wait for the forward messsages
307 MPI_Waitall(ct,requests.data(),status.data());
308
309 ct=0;
310 // Recv/Send the reverse messsages
311 for(int r=0; r<num_sends;r++) {
312 const int tag = 2000+j;
313 MPI_Irecv(&r_recv_buf[msg_size*r],msg_size,MPI_CHAR,procsTo[r],tag,communicator,&requests[ct]);
314 ct++;
315 }
316 for(int s=0; s<num_recvs;s++) {
317 const int tag = 2000+j;
318 MPI_Isend(&r_send_buf[msg_size*s],msg_size,MPI_CHAR,procsFrom[s],tag,communicator,&requests[ct]);
319 ct++;
320 }
321 // Wait for the reverse messsages
322 MPI_Waitall(ct,requests.data(),status.data());
323 }
324
325 double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
326 sizes[i] = msg_size;
327 times[i] = time_per_call;
328 }
329
330
331
332#endif
333
334 }
335
336
337 }// end namespace PerfDetails
338
339
340 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
342
343 /****************************************************************************************/
344 /****************************************************************************************/
345 /****************************************************************************************/
346
347
348 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
349 void
351
352 // We need launch/waits latency estimates for corrected stream
353 launch_latency_make_table(KERNEL_REPEATS);
354 double latency = launch_latency_lookup();
355
356 if(LOG_MAX_SIZE < 2)
357 LOG_MAX_SIZE=20;
358
359 stream_sizes_.resize(LOG_MAX_SIZE+1);
360 stream_copy_times_.resize(LOG_MAX_SIZE+1);
361 stream_add_times_.resize(LOG_MAX_SIZE+1);
362 latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE+1);
363 latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE+1);
364
365 for(int i=0; i<LOG_MAX_SIZE+1; i++) {
366 int size = (int) pow(2,i);
367 double c_time = PerfDetails::stream_vector_copy<Scalar,Node>(KERNEL_REPEATS,size);
368 double a_time = PerfDetails::stream_vector_add<Scalar,Node>(KERNEL_REPEATS,size);
369
370 stream_sizes_[i] = size;
371
372 // Correct for the difference in memory transactions per element
373 stream_copy_times_[i] = c_time / 2.0;
374 stream_add_times_[i] = a_time / 3.0;
375
376 // Correct for launch latency too. We'll note that sometimes the latency estimate
377 // is higher than the actual copy/add time estimate. If so, we don't correct
378 latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ( (c_time-latency)/2.0 );
379 latency_corrected_stream_add_times_[i] = (a_time - latency <= 0.0) ? a_time / 3.0 : ( (a_time-latency)/3.0 );
380
381
382 }
383 }
384
385 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
386 double
390
391 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
392 double
396
397 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
398 double
402
403
404 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
405 double
409
410 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
411 double
415
416 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
417 double
421
422
423 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
424 void
426 print_stream_vector_table_impl(out,false,prefix);
427 }
428
429 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
430 void
434
435
436 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
437 void
438 PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction, const std::string & prefix) {
439 using namespace std;
440 std::ios old_format(NULL);
441 old_format.copyfmt(out);
442
443 out << prefix
444 << setw(20) << "Length in Scalars" << setw(1) << " "
445 << setw(20) << "COPY (us)" << setw(1) << " "
446 << setw(20) << "ADD (us)" << setw(1) << " "
447 << setw(20) << "COPY (GB/s)" << setw(1) << " "
448 << setw(20) << "ADD (GB/s)" << std::endl;
449
450 out << prefix
451 << setw(20) << "-----------------" << setw(1) << " "
452 << setw(20) << "---------" << setw(1) << " "
453 << setw(20) << "--------" << setw(1) << " "
454 << setw(20) << "-----------" << setw(1) << " "
455 << setw(20) << "----------" << std::endl;
456
457
458 for(int i=0; i<(int)stream_sizes_.size(); i++) {
459 int size = stream_sizes_[i];
460 double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
461 double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
462 // We've already corrected for the transactions per element difference
463 double c_bw = PerfDetails::convert_time_to_bandwidth_gbs(c_time,1,size*sizeof(Scalar));
464 double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time,1,size*sizeof(Scalar));
465
466
467 out << prefix
468 << setw(20) << size << setw(1) << " "
469 << setw(20) << fixed << setprecision(4) << (c_time*1e6) << setw(1) << " "
470 << setw(20) << fixed << setprecision(4) << (a_time*1e6) << setw(1) << " "
471 << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
472 << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
473 }
474
475 out.copyfmt(old_format);
476 }
477
478
479 /****************************************************************************************/
480 /****************************************************************************************/
481 /****************************************************************************************/
482
483 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
484 void
492
493 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
494 double
498
499 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
500 double
504
505
506 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
507 void
509 if(pingpong_sizes_.size() == 0) return;
510
511 using namespace std;
512 std::ios old_format(NULL);
513 old_format.copyfmt(out);
514
515 out << prefix
516 << setw(20) << "Message Size" << setw(1) << " "
517 << setw(20) << "Host (us)" << setw(1) << " "
518 << setw(20) << "Device (us)" << std::endl;
519
520 out << prefix
521 << setw(20) << "------------" << setw(1) << " "
522 << setw(20) << "---------" << setw(1) << " "
523 << setw(20) << "-----------" << std::endl;
524
525
526 for(int i=0; i<(int)pingpong_sizes_.size(); i++) {
527 int size = pingpong_sizes_[i];
528 double h_time = pingpong_host_times_[i];
529 double d_time = pingpong_device_times_[i];
530
531
532 out << prefix
533 << setw(20) << size << setw(1) << " "
534 << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " "
535 << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
536 }
537
538 out.copyfmt(old_format);
539 }
540
541
542
543 /****************************************************************************************/
544 /****************************************************************************************/
545 /****************************************************************************************/
546 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
547 void
548 PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal,GlobalOrdinal,Node> > & import) {
549
551
553
554 }
555
556 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
557 double
561
562 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
563 double
567
568
569 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
570 void
572 if(halopong_sizes_.size() == 0) return;
573
574 using namespace std;
575 std::ios old_format(NULL);
576 old_format.copyfmt(out);
577
578 out << prefix
579 << setw(20) << "Message Size" << setw(1) << " "
580 << setw(20) << "Host (us)" << setw(1) << " "
581 << setw(20) << "Device (us)" << std::endl;
582
583 out << prefix
584 << setw(20) << "------------" << setw(1) << " "
585 << setw(20) << "---------" << setw(1) << " "
586 << setw(20) << "-----------" << std::endl;
587
588
589 for(int i=0; i<(int)halopong_sizes_.size(); i++) {
590 int size = halopong_sizes_[i];
591 double h_time = halopong_host_times_[i];
592 double d_time = halopong_device_times_[i];
593
594
595 out << prefix
596 << setw(20) << size << setw(1) << " "
597 << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " "
598 << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
599 }
600
601 out.copyfmt(old_format);
602 }
603
604 /****************************************************************************************/
605 /****************************************************************************************/
606 /****************************************************************************************/
607
608 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
609 void
611 using exec_space = typename Node::execution_space;
612 using range_policy = Kokkos::RangePolicy<exec_space>;
613 using clock = std::chrono::high_resolution_clock;
614
615 double total_test_time = 0;
616 clock::time_point start, stop;
617 for(int i = 0; i < KERNEL_REPEATS; i++) {
618 start = clock::now();
619 Kokkos::parallel_for("empty kernel",range_policy(0,1), KOKKOS_LAMBDA (const size_t j) {
620 ;
621 });
622 exec_space().fence();
623 stop = clock::now();
624 double my_test_time = std::chrono::duration<double>(stop - start).count();
625 total_test_time += my_test_time;
626 }
627
628 launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
629 }
630
631
632 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
633 double
637
638
639 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
640 void
642 using namespace std;
643 std::ios old_format(NULL);
644 old_format.copyfmt(out);
645
646 out << prefix
647 << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " "
648 << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_*1e6) << std::endl;
649
650 out.copyfmt(old_format);
651 }
652
653
654} //namespace MueLu
MueLu::DefaultScalar Scalar
std::vector< int > halopong_sizes_
void print_halopong_table(std::ostream &out, const std::string &prefix="")
std::vector< int > pingpong_sizes_
void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import)
void print_launch_latency_table(std::ostream &out, const std::string &prefix="")
std::vector< double > halopong_device_times_
void print_stream_vector_table(std::ostream &out, const std::string &prefix="")
std::vector< int > stream_sizes_
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix="")
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix)
std::vector< double > latency_corrected_stream_copy_times_
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)
std::vector< double > pingpong_device_times_
double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
double pingpong_device_lookup(int SIZE_IN_BYTES)
std::vector< double > latency_corrected_stream_add_times_
double pingpong_host_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double stream_vector_add_lookup(int SIZE_IN_BYTES)
void print_pingpong_table(std::ostream &out, const std::string &prefix="")
double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
std::vector< double > stream_copy_times_
double stream_vector_lookup(int SIZE_IN_BYTES)
std::vector< double > halopong_host_times_
std::vector< double > stream_add_times_
std::vector< double > pingpong_host_times_
void launch_latency_make_table(int KERNEL_REPEATS)
double table_lookup(const std::vector< int > &x, const std::vector< double > &y, int value)
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE)
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm< int > &comm, std::vector< int > &sizes, std::vector< double > &times)
void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import, std::vector< int > &sizes, std::vector< double > &times)
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes)
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE)
Namespace for MueLu classes and methods.