Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
Kokkos_Parallel_MP_Vector.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Stokhos Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38//
39// ***********************************************************************
40// @HEADER
41
42#ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP
43#define KOKKOS_PARALLEL_MP_VECTOR_HPP
44
45#include "Sacado_MP_Vector.hpp"
46#include "Kokkos_Core.hpp"
47
48//----------------------------------------------------------------------------
49// Kokkos execution policies useful for Sacado::MP::Vector scalar type
50//----------------------------------------------------------------------------
51
52namespace Kokkos {
53
/**
 * \brief Work configuration for parallel kernels over Sacado::MP::Vector
 * scalar types.
 *
 * Bundles a total work range, a team size (the number of vector entries
 * worked on together), and an optional per-team shared-memory size,
 * together with the execution-space instance the kernel should run on.
 *
 * NOTE(review): the struct header, the \c space_ member declaration, and
 * the first constructor's opening line were lost in extraction; they are
 * reconstructed here from the visible initializer list, \c space(), and
 * the trailing Doxygen member summaries -- confirm against the upstream
 * Stokhos source.
 */
template< class ExecSpace, class Tag = void >
struct MPVectorWorkConfig {

  typedef ExecSpace execution_space ;  // Kokkos execution space to run on
  typedef Tag work_tag ;               // optional functor dispatch tag

  ExecSpace space_;  // execution-space instance used for the launch
  size_t range;      // total number of outer work items
  size_t team;       // team size (the sacado/vector dimension)
  size_t shared;     // per-team shared memory size in bytes

  //! Construct a work configuration that runs in the provided execution
  //! space instance.
  MPVectorWorkConfig( const execution_space &space,
                      const size_t range_,
                      const size_t team_,
                      const size_t shared_ = 0 ) :
    space_(space), range(range_), team(team_), shared(shared_) {}

  //! Construct a work configuration that runs in the default execution
  //! space instance.
  MPVectorWorkConfig( const size_t range_,
                      const size_t team_,
                      const size_t shared_ = 0 ) :
    MPVectorWorkConfig(execution_space(), range_, team_, shared_) {}

  //! Return the execution-space instance to launch on.
  ExecSpace space() const { return space_; }
};
85
86namespace Impl {
87
#if defined( KOKKOS_ENABLE_THREADS )
// Specialization of ParallelFor<> for MPVectorWorkConfig and Threads.
//
// The team size is currently ignored: work is dispatched through the
// standard RangePolicy implementation over [0, range).  In the future the
// team size could be mapped onto hyperthreads in a useful way, which would
// require interpreting the team size as the sacado size, determining the
// vector size of the architecture, and laying out the threads so that
// hyperthreads span the sacado dimension.
template< class FunctorType, class Tag >
class ParallelFor< FunctorType , MPVectorWorkConfig< Threads, Tag > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Threads > > {
  typedef Kokkos::RangePolicy< Tag, Threads > Policy ;
public:
  // Forward to the base RangePolicy implementation over [0, range).
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Threads, Tag > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif
108
#if defined( KOKKOS_ENABLE_OPENMP )
// Specialization of ParallelFor<> for MPVectorWorkConfig and OpenMP.
//
// The team size is currently ignored: work is dispatched through the
// standard RangePolicy implementation over [0, range).  In the future the
// team size could be mapped onto hyperthreads in a useful way, which would
// require interpreting the team size as the sacado size, determining the
// vector size of the architecture, and laying out the threads so that
// hyperthreads span the sacado dimension.
template< class FunctorType, class Tag >
class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP, Tag > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, OpenMP > > {
  typedef Kokkos::RangePolicy< Tag, OpenMP > Policy ;
public:
  // Forward to the base RangePolicy implementation over [0, range).
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< OpenMP, Tag > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif
129
#if defined(KOKKOS_ENABLE_SERIAL)
// Specialization of ParallelFor<> for MPVectorWorkConfig and Serial.
//
// The team size is currently ignored: work is dispatched through the
// standard RangePolicy implementation over [0, range).  In the future the
// team size could be mapped onto hyperthreads in a useful way, which would
// require interpreting the team size as the sacado size, determining the
// vector size of the architecture, and laying out the threads so that
// hyperthreads span the sacado dimension.
template< class FunctorType, class Tag >
class ParallelFor< FunctorType , MPVectorWorkConfig< Serial, Tag > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Serial > > {
  typedef Kokkos::RangePolicy< Tag, Serial > Policy ;
public:
  // Forward to the base RangePolicy implementation over [0, range).
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Serial, Tag > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};
#endif // defined(KOKKOS_ENABLE_SERIAL)
150
#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )

// Specialization of ParallelFor<> for MPVectorWorkConfig on Cuda.
// threadIdx.x indexes the entries within a team (passed to the functor as
// its second argument), while threadIdx.y together with blockIdx.x strides
// over the outer work range.
template< class FunctorType, class Tag >
class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda, Tag > > {
public:

  typedef Kokkos::RangePolicy< Tag, Cuda > Policy;

  const FunctorType m_functor ;                    // user functor
  const MPVectorWorkConfig< Cuda, Tag > m_config;  // launch configuration
  const Cuda::size_type m_work ;                   // outer work-range extent
  const Policy m_policy;                           // default-constructed policy; supplies the space instance

  // Invoke the functor without a tag (untagged functor, Tag == void).
  template <class TagType>
  inline __device__
  typename std::enable_if<std::is_same<TagType, void>::value>::type
  exec_range(const Cuda::size_type i, Cuda::size_type j) const {
    m_functor(i, j);
  }

  // Invoke the functor with a default-constructed tag (Tag != void).
  template <class TagType>
  inline __device__
  typename std::enable_if<!std::is_same<TagType, void>::value>::type
  exec_range(const Cuda::size_type i, Cuda::size_type j) const {
    m_functor(TagType(), i, j);
  }

  Policy const& get_policy() const { return m_policy; }

  // Device entry point: each (threadIdx.y, blockIdx.x) pair strides over
  // the outer range [0, m_work); threadIdx.x supplies the team index.
  inline
  __device__
  void operator()(void) const
  {
    const Cuda::size_type work_stride = blockDim.y * gridDim.x ;

    for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
          iwork < m_work ;
          iwork += work_stride ) {
      this->template exec_range<Tag>(iwork, threadIdx.x);
    }
  }

  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Cuda, Tag > & work_config )
    : m_functor( functor ) ,
      m_config( work_config ) ,
      m_work( work_config.range ),
      m_policy()
  {
  }

  // Host-side launch: choose a block of shape (team, nwarp, 1) and enough
  // blocks to cover the outer range, then launch through
  // CudaParallelLaunch.
  inline
  void execute() const
  {
    // To do: query number of registers used by functor and adjust
    // nwarp accordingly to get maximum occupancy

    Cuda::size_type nwarp = 0;
    if (m_config.team > CudaTraits::WarpSize) {
      // Team larger than one warp: use whole warps per team (rounded up)
      // and correspondingly fewer teams per block.
      const Cuda::size_type warps_per_team =
        ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
      nwarp = cuda_internal_maximum_warp_count() / warps_per_team;
    }
    else {
      // Team fits inside one warp: pack several teams per warp.
      const Cuda::size_type teams_per_warp =
        CudaTraits::WarpSize / m_config.team ;
      nwarp = cuda_internal_maximum_warp_count() * teams_per_warp;
    }
    const dim3 block( m_config.team , nwarp , 1 );

    // Enough blocks to cover the range, capped by the device's grid limit.
    Cuda::size_type nblock =
      std::min( (m_work + block.y - 1 ) / block.y ,
                cuda_internal_maximum_grid_count()[0] );
    const dim3 grid( nblock , 1 , 1 );

    const Cuda::size_type shared = m_config.shared;
    CudaParallelLaunch< ParallelFor >( *this , grid , block , shared , m_policy.space().impl_internal_space_instance() );
  }
};

#endif
234
235} // namespace Impl
236
237} // namespace Kokkos
238
239//----------------------------------------------------------------------------
240//----------------------------------------------------------------------------
241
242#endif /* #ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP */
MPVectorWorkConfig(const size_t range_, const size_t team_, const size_t shared_=0)
Construct a work configuration that runs in the default execution space instance.
MPVectorWorkConfig(const execution_space &space, const size_t range_, const size_t team_, const size_t shared_=0)
Construct a work configuration that runs in the provided execution space instance.