using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
using exec_space       = typename Node::execution_space;
using memory_space     = typename Node::memory_space;
using range_policy     = Kokkos::RangePolicy<exec_space>;

Kokkos::View<impl_scalar_type*,memory_space> a("a", VECTOR_SIZE);
Kokkos::View<impl_scalar_type*,memory_space> b("b", VECTOR_SIZE);
Kokkos::View<impl_scalar_type*,memory_space> c("c", VECTOR_SIZE);
double total_test_time = 0.0;

impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
// Fill the input vectors (the add kernel below reads both a and b).
Kokkos::parallel_for("stream/fill", range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t i) {
    a(i) = ONE * (double)i;
    b(i) = a(i);
  });
exec_space().fence();
using clock = std::chrono::high_resolution_clock;
clock::time_point start, stop;

// Time the add kernel (c = a + b) over KERNEL_REPEATS repetitions.
for(int i = 0; i < KERNEL_REPEATS; i++) {
  start = clock::now();
  Kokkos::parallel_for("stream/add", range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t j) {
      c(j) = a(j) + b(j);
    });
  exec_space().fence();
  stop = clock::now();

  double my_test_time = std::chrono::duration<double>(stop - start).count();
  total_test_time += my_test_time;
}

return total_test_time / KERNEL_REPEATS;
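
// Illustrative sketch, not part of the original listing: one way to convert the
// averaged add-kernel time returned above into an effective bandwidth figure.
// The add kernel streams three arrays of impl_scalar_type per repeat (reads a
// and b, writes c). The helper name and signature are hypothetical.
template <class impl_scalar_type>
double stream_add_bandwidth_GBs(double avg_seconds, int VECTOR_SIZE) {
  const double bytes_moved = 3.0 * (double)VECTOR_SIZE * (double)sizeof(impl_scalar_type);
  return bytes_moved / avg_seconds / 1.0e9;
}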
// Stream copy benchmark: identical setup, but the timed kernel copies a into b.
using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
using exec_space       = typename Node::execution_space;
using memory_space     = typename Node::memory_space;
using range_policy     = Kokkos::RangePolicy<exec_space>;

Kokkos::View<impl_scalar_type*,memory_space> a("a", VECTOR_SIZE);
Kokkos::View<impl_scalar_type*,memory_space> b("b", VECTOR_SIZE);
double total_test_time = 0.0;

impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
Kokkos::parallel_for("stream/fill", range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t i) {
    a(i) = ONE * (double)i;
  });
exec_space().fence();
using clock = std::chrono::high_resolution_clock;
clock::time_point start, stop;

// Time the copy kernel (b = a) over KERNEL_REPEATS repetitions.
for(int i = 0; i < KERNEL_REPEATS; i++) {
  start = clock::now();
  Kokkos::parallel_for("stream/copy", range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t j) {
      b(j) = a(j);
    });
  exec_space().fence();
  stop = clock::now();

  double my_test_time = std::chrono::duration<double>(stop - start).count();
  total_test_time += my_test_time;
}

return total_test_time / KERNEL_REPEATS;
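
// Illustrative note, not part of the original listing: the copy kernel streams
// two arrays per repeat (reads a, writes b), so an effective-bandwidth estimate
// analogous to the sketch after the add test would divide
//   2.0 * VECTOR_SIZE * sizeof(impl_scalar_type)
// bytes by the averaged time returned here.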
int rank  = comm.getRank();
int nproc = comm.getSize();
if(nproc < 2) return;

const int buff_size = (int) pow(2,MAX_SIZE);

sizes.resize(MAX_SIZE+1);
times.resize(MAX_SIZE+1);

// Allocate the receive/send buffers and fill the send buffer.
Kokkos::View<char*,memory_space> r_buf("recv",buff_size), s_buf("send",buff_size);
Kokkos::deep_copy(s_buf,1);

// Pair up neighboring ranks: odd ranks pair with rank-1, even ranks with rank+1.
int odd   = rank % 2;
int buddy = odd ? rank - 1 : rank + 1;
for(int i = 0; i < MAX_SIZE + 1; i++) {
  int msg_size = (int) pow(2,i);

  double t0 = MPI_Wtime();
  for(int j = 0; j < KERNEL_REPEATS; j++) {
    if(buddy < nproc) {
      if(odd) {
        // Odd ranks send first, then wait for the echo.
        comm.send(msg_size, (char*)s_buf.data(), buddy);
        comm.receive(buddy, msg_size, (char*)r_buf.data());
      }
      else {
        // Even ranks echo: receive first, then send back.
        comm.receive(buddy, msg_size, (char*)r_buf.data());
        comm.send(msg_size, (char*)s_buf.data(), buddy);
      }
    }
  }

  // Each repeat is a round trip, so halve the per-repeat time.
  double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
  sizes[i] = msg_size;
  times[i] = time_per_call;
}
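
// Illustrative sketch, not from the original source (assumes <vector> is
// available, as elsewhere in this listing): the (sizes, times) pairs filled in
// above can be summarized with a latency/bandwidth model time(n) ~= alpha + beta*n,
// fit here by ordinary least squares. alpha approximates the per-message latency
// and 1/beta the achievable point-to-point bandwidth. The helper is hypothetical.
inline void fit_latency_bandwidth(const std::vector<int> &sizes,
                                  const std::vector<double> &times,
                                  double &alpha, double &beta) {
  const double N = (double)sizes.size();
  double sx = 0.0, sy = 0.0, sxx = 0.0, sxy = 0.0;
  for(size_t k = 0; k < sizes.size(); k++) {
    const double x = (double)sizes[k];
    const double y = times[k];
    sx  += x;   sy  += y;
    sxx += x*x; sxy += x*y;
  }
  beta  = (N*sxy - sx*sy) / (N*sxx - sx*sx);
  alpha = (sy - beta*sx) / N;
}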
void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE,
                    const RCP<const Xpetra::Import<LocalOrdinal,GlobalOrdinal,Node> > &import,
                    std::vector<int> &sizes, std::vector<double> &times) {
  int nproc = import->getSourceMap()->getComm()->getSize();
  if(nproc < 2) return;
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
  using x_import_type = Xpetra::TpetraImport<LocalOrdinal,GlobalOrdinal,Node>;
  RCP<const x_import_type> Ximport = Teuchos::rcp_dynamic_cast<const x_import_type>(import);
  RCP<const Teuchos::MpiComm<int> > mcomm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(import->getSourceMap()->getComm());
  // Bail out before dereferencing if either cast failed.
  if(Ximport.is_null() || mcomm.is_null()) return;
  MPI_Comm communicator = *mcomm->getRawMpiComm();

  auto Timport = Ximport->getTpetra_Import();
  auto distor  = Timport->getDistributor();

  // Communication plan from the distributor: which ranks we receive from and send to.
  Teuchos::ArrayView<const int> procsFrom = distor.getProcsFrom();
  Teuchos::ArrayView<const int> procsTo   = distor.getProcsTo();
  int num_recvs = (int)distor.getNumReceives();
  int num_sends = (int)distor.getNumSends();
  const int buff_size_per_msg = (int) pow(2,MAX_SIZE);
  sizes.resize(MAX_SIZE+1);
  times.resize(MAX_SIZE+1);

  Kokkos::View<char*,memory_space> f_recv_buf("forward_recv",buff_size_per_msg*num_recvs), f_send_buf("forward_send",buff_size_per_msg*num_sends);
  Kokkos::View<char*,memory_space> r_recv_buf("reverse_recv",buff_size_per_msg*num_sends), r_send_buf("reverse_send",buff_size_per_msg*num_recvs);
  Kokkos::deep_copy(f_send_buf,1);
  Kokkos::deep_copy(r_send_buf,1);

  std::vector<MPI_Request> requests(num_sends+num_recvs);
  std::vector<MPI_Status>  status(num_sends+num_recvs);
  // Run the halo exchange pattern for message sizes 1, 2, 4, ..., 2^MAX_SIZE.
  for(int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int) pow(2,i);

    MPI_Barrier(communicator);

    double t0 = MPI_Wtime();
    for(int j = 0; j < KERNEL_REPEATS; j++) {
      int ct = 0;
      // Post the forward receives and sends.
      for(int r = 0; r < num_recvs; r++) {
        const int tag = 1000+j;
        MPI_Irecv(&f_recv_buf[msg_size*r], msg_size, MPI_CHAR, procsFrom[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for(int s = 0; s < num_sends; s++) {
        const int tag = 1000+j;
        MPI_Isend(&f_send_buf[msg_size*s], msg_size, MPI_CHAR, procsTo[s], tag, communicator, &requests[ct]);
        ct++;
      }
      // Wait for the forward exchange to complete.
      MPI_Waitall(ct, requests.data(), status.data());
      // Reverse exchange: the roles flip, so receives are posted to procsTo and sends go to procsFrom.
      ct = 0;
      for(int r = 0; r < num_sends; r++) {
        const int tag = 2000+j;
        MPI_Irecv(&r_recv_buf[msg_size*r], msg_size, MPI_CHAR, procsTo[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for(int s = 0; s < num_recvs; s++) {
        const int tag = 2000+j;
        MPI_Isend(&r_send_buf[msg_size*s], msg_size, MPI_CHAR, procsFrom[s], tag, communicator, &requests[ct]);
        ct++;
      }
      MPI_Waitall(ct, requests.data(), status.data());
    }

    // Each repeat covers a forward and a reverse exchange, so halve the per-repeat time.
    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i] = msg_size;
    times[i] = time_per_call;