42 typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
43 typedef typename ViewTypeC::execution_space execution_space;
45#if defined (KOKKOS_ENABLE_CUDA)
46 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
48 const bool is_cuda =
false;
50 const unsigned vector_size = is_cuda ? 32 : 1;
51 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
53 const int m =
A.extent(0);
54 const int n =
A.extent(1);
55 const int range = (m+team_size-1)/team_size;
57 typedef Kokkos::TeamPolicy<execution_space> Policy;
59 Policy( range,team_size,vector_size ),
60 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
61 const int i = team.league_rank()*team.team_size() + team.team_rank();
66 for (
int j=0; j<n; ++j)
77 const double tol = 1.0e-14;
78 typedef typename ViewTypeC::value_type value_type;
79 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(
c);
80 Kokkos::deep_copy(h_c,
c);
81 const size_t m =
A.extent(0);
82 const size_t n =
A.extent(1);
83 const size_t p = Kokkos::dimension_scalar(
A);
84 for (
size_t i=0; i<m; ++i) {
85 for (
size_t j=0; j<p; ++j) {
86 value_type t = (j == p-1 ? n : 2*n);
88 std::cout <<
"Comparison failed! " << i <<
"," << j <<
" : "
89 << h_c(i).fastAccessDx(j) <<
" , " << t << std::endl;
98 const size_t nloop,
const bool check)
100 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
101 typedef Kokkos::View<
FadType*, ViewArgs...> ViewTypeB;
102 typedef Kokkos::View<
FadType*, ViewArgs...> ViewTypeC;
103 typedef typename ViewTypeA::execution_space execution_space;
105#if defined (KOKKOS_ENABLE_CUDA)
106 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
108 const bool is_cuda =
false;
111 const int FadStride = is_cuda ? 32 : 1;
112#if defined(SACADO_ALIGN_SFAD)
114 const int Nalign = ((
N+FadStride-1)/FadStride)*FadStride;
115 const size_t pa =
N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p;
116 typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
118 typedef FadType AlignedFadType;
126 typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
127 typedef Kokkos::View<AlignedFadType*, ConLayoutB, execution_space> ConViewTypeB;
128 typedef Kokkos::View<AlignedFadType*, ConLayoutC, execution_space> ConViewTypeC;
130 ConViewTypeA
A(
"A",m,n,pa+1);
131 ConViewTypeB b(
"B",n,pa+1);
132 ConViewTypeC
c(
"c",m,pa+1);
137 Kokkos::deep_copy(
typename ConViewTypeA::array_type(
A), 1.0);
138 Kokkos::deep_copy(
typename ConViewTypeB::array_type(b), 1.0);
140 Kokkos::Timer wall_clock;
145 execution_space().fence();
148 for (
size_t l=0; l<nloop; l++) {
151 execution_space().fence();
153 perf.
time = wall_clock.seconds() / nloop;
154 perf.
flops = m*n*(2+4*p);
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)