MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_PerfModels_decl.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// MueLu: A package for multigrid based preconditioning
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact
39// Jonathan Hu (jhu@sandia.gov)
40// Andrey Prokopenko (aprokop@sandia.gov)
41// Ray Tuminaro (rstumin@sandia.gov)
42//
43// ***********************************************************************
44//
45// @HEADER
46#ifndef MUELU_PERFMODELS_HPP
47#define MUELU_PERFMODELS_HPP
48
49#include "MueLu_ConfigDefs.hpp"
50#include "Xpetra_Import_fwd.hpp"
51
52#include <vector>
53#include <ostream>
54#include <Teuchos_DefaultComm.hpp>
55
57
58namespace MueLu {
59
// PerfModels: declares table builders ("*_make_table"), lookups ("*_lookup")
// and printers ("print_*_table") for stream-vector, ping-pong, halo-pong and
// launch-latency measurements.
// NOTE(review): this listing jumps from source line 60 to 63, and LocalOrdinal /
// GlobalOrdinal are used below without appearing in the visible template
// parameter list -- confirm the full parameter list against the original header.
60 template <class Scalar,
63 class Node = DefaultNode>
64 class PerfModels {
65 public:
66 PerfModels();
67
68 /* Single Node tests based upon the STREAM benchmark for measuring memory
69 * bandwidth and computation rate. These processes compute either the addition
70 * of two vectors or the multiplication of dense matrices of any given size.
71 * Many iterations occur which then return a vector containing the individual
72 * lengths of time per iteration.
73 *
74 * See further here:
75 * - https://www.cs.virginia.edu/stream/ref.html
76 * - https://github.com/UoB-HPC/BabelStream
77 */
78
79 /* This version is for table interpolation and works on chars, so the LOG_MAX_SIZE is for bytes */
80 void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20);
// True when stream_sizes_ holds at least one entry.
81 bool has_stream_vector_table() const {return stream_sizes_.size() > 0;}
82
83 /* Lookup in the stream_vector table */
84 double stream_vector_copy_lookup(int SIZE_IN_BYTES);
85 double stream_vector_add_lookup(int SIZE_IN_BYTES);
86 double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES);
87 double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES);
88
89 // Uses the faster of the tables. The time is then divided by the number of memory transactions
90 // per element in the kernel (e.g. 2 for COPY and 3 for ADD).
91 double stream_vector_lookup(int SIZE_IN_BYTES);
92 double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES);
93
94 /* Print table */
95 void print_stream_vector_table(std::ostream & out, const std::string & prefix="");
96 void print_latency_corrected_stream_vector_table(std::ostream & out, const std::string & prefix="");
97
98 /* A latency test between two processes based upon the MVAPICH OSU Micro-Benchmarks.
99 * The sender process sends a message and then waits for confirmation of reception.
100 * Many iterations occur with various message sizes and the average latency values
101 * are returned within a map. Utilizes blocking send and receive.
102 *
103 * See further: https://mvapich.cse.ohio-state.edu/benchmarks/
104 */
105 void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm);
// True when pingpong_sizes_ holds at least one entry.
106 bool has_pingpong_table() const {return pingpong_sizes_.size() > 0;}
107
108 /* Lookup in the pingpong_vector table */
109 double pingpong_host_lookup(int SIZE_IN_BYTES);
110 double pingpong_device_lookup(int SIZE_IN_BYTES);
111
112 /* Print table */
113 void print_pingpong_table(std::ostream & out, const std::string & prefix="");
114
115 /* A halo-exchange based ping-pong, inspired by halo-mode in MPPTEST from ANL.
116 * Here we use exactly the communication pattern specified in the import object
117 * and send messages accordingly. We vary the size in bytes sent per message,
118 * which should capture max-rate effects to some degree.
119 *
120 * See further: https://www.mcs.anl.gov/research/projects/mpi/mpptest/
121 */
122 void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal,GlobalOrdinal,Node> > & import);
// True when halopong_sizes_ holds at least one entry.
123 bool has_halopong_table() const {return halopong_sizes_.size() > 0;}
124
125 /* Lookup in the halopong_vector table */
126 double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
127 double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
128
129 /* Print table */
130 void print_halopong_table(std::ostream & out, const std::string & prefix="");
131
132
133
134 /* Estimate launch latency based on the cost of submitting an empty Kokkos::parallel_for.
135 * This is necessary to correct the memory bandwidth costs for models on high latency platforms,
136 * e.g., GPUs.
137 */
138 void launch_latency_make_table(int KERNEL_REPEATS);
140
141 /* Lookup launch latency */
142 double launch_latency_lookup();
143
144 /* Print table */
145 void print_launch_latency_table(std::ostream & out, const std::string & prefix="");
146
147 private:
// Shared printing helper; presumably backs both public stream-table printers,
// selected by use_latency_correction -- confirm against the definition file.
148 void print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction, const std::string & prefix);
149
150
// Storage for the stream-vector table.
151 std::vector<int> stream_sizes_;
152 std::vector<double> stream_copy_times_;
153 std::vector<double> stream_add_times_;
156
// Storage for the ping-pong table.
157 std::vector<int> pingpong_sizes_;
158 std::vector<double> pingpong_host_times_;
159 std::vector<double> pingpong_device_times_;
160
// Storage for the halo-pong table.
161 std::vector<int> halopong_sizes_;
162 std::vector<double> halopong_host_times_;
163 std::vector<double> halopong_device_times_;
164
166
167
168 }; //class PerfModels
169
170} //namespace MueLu
171
172#endif //ifndef MUELU_PERFMODELS_HPP
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
MueLu::DefaultNode Node
std::vector< int > halopong_sizes_
void print_halopong_table(std::ostream &out, const std::string &prefix="")
std::vector< int > pingpong_sizes_
void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import)
void print_launch_latency_table(std::ostream &out, const std::string &prefix="")
std::vector< double > halopong_device_times_
void print_stream_vector_table(std::ostream &out, const std::string &prefix="")
std::vector< int > stream_sizes_
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
bool has_launch_latency_table() const
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix="")
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix)
std::vector< double > latency_corrected_stream_copy_times_
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)
std::vector< double > pingpong_device_times_
bool has_stream_vector_table() const
double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
double pingpong_device_lookup(int SIZE_IN_BYTES)
std::vector< double > latency_corrected_stream_add_times_
double pingpong_host_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double stream_vector_add_lookup(int SIZE_IN_BYTES)
void print_pingpong_table(std::ostream &out, const std::string &prefix="")
double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
std::vector< double > stream_copy_times_
double stream_vector_lookup(int SIZE_IN_BYTES)
std::vector< double > halopong_host_times_
std::vector< double > stream_add_times_
std::vector< double > pingpong_host_times_
void launch_latency_make_table(int KERNEL_REPEATS)
Namespace for MueLu classes and methods.
Tpetra::KokkosClassic::DefaultNode::DefaultNodeType DefaultNode