42#ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP
43#define KOKKOS_PARALLEL_MP_VECTOR_HPP
46#include "Kokkos_Core.hpp"
// MPVectorWorkConfig: work configuration for multi-point (MP) vector
// kernels.  Carries an iteration range, a team size, and an optional
// shared-memory request (see constructor notes at end of file).
// ExecSpace selects the ParallelFor specialization below; Tag is an
// optional work tag (void = untagged functor).
57 template<
 class ExecSpace,
 class Tag =
 void >
// Constructor overload: shared-memory request defaults to 0 bytes.
 74 const size_t shared_ = 0 ) :
// Second constructor overload (presumably the one taking an explicit
// execution_space instance first -- confirm against the full header).
 80 const size_t shared_ = 0 ) :
88#if defined( KOKKOS_ENABLE_THREADS )
// Threads backend: no special team mapping is applied here, so a
// parallel_for over MPVectorWorkConfig< Threads > simply reuses the flat
// RangePolicy ParallelFor over the index range [0, work_config.range).
// The team/shared settings of the work configuration are not used by
// this specialization.
97template<
 class FunctorType,
 class Tag >
98class ParallelFor< FunctorType , MPVectorWorkConfig< Threads, Tag > > :
99 public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Threads > > {
100 typedef Kokkos::RangePolicy< Tag, Threads > Policy ;
 // Forward the functor and the range [0, work_config.range) to the
 // inherited RangePolicy implementation.
102 ParallelFor(
 const FunctorType & functor ,
103 const MPVectorWorkConfig< Threads, Tag > & work_config ) :
104 ParallelFor< FunctorType , Policy >( functor ,
105 Policy( 0, work_config.range ) ) {}
109#if defined( KOKKOS_ENABLE_OPENMP )
// OpenMP backend: identical strategy to the Threads specialization above
// -- delegate to the flat RangePolicy ParallelFor over [0, range); the
// team/shared settings of the work configuration are not used.
118template<
 class FunctorType,
 class Tag >
119class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP, Tag > > :
120 public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, OpenMP > > {
121 typedef Kokkos::RangePolicy< Tag, OpenMP > Policy ;
 // Forward the functor and the range [0, work_config.range) to the
 // inherited RangePolicy implementation.
123 ParallelFor(
 const FunctorType & functor ,
124 const MPVectorWorkConfig< OpenMP, Tag > & work_config ) :
125 ParallelFor< FunctorType , Policy >( functor ,
126 Policy( 0, work_config.range ) ) {}
130#if defined(KOKKOS_ENABLE_SERIAL)
// Serial backend: identical strategy to the host backends above --
// delegate to the flat RangePolicy ParallelFor over [0, range); the
// team/shared settings of the work configuration are not used.
139template<
 class FunctorType,
 class Tag >
140class ParallelFor< FunctorType , MPVectorWorkConfig< Serial, Tag > > :
141 public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Serial > > {
142 typedef Kokkos::RangePolicy< Tag, Serial > Policy ;
 // Forward the functor and the range [0, work_config.range) to the
 // inherited RangePolicy implementation.
144 ParallelFor(
 const FunctorType & functor ,
145 const MPVectorWorkConfig< Serial, Tag > & work_config ) :
146 ParallelFor< FunctorType , Policy >( functor ,
147 Policy( 0, work_config.range ) ) {}
151#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// Cuda backend: unlike the host backends above, the team size in the
// work configuration IS honored -- it becomes the x-dimension of the
// thread block, while work items are distributed over the y-dimension
// of the block and the x-dimension of the grid.
155template<
 class FunctorType,
 class Tag >
156class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda, Tag > > {
159 typedef Kokkos::RangePolicy< Tag, Cuda > Policy;
161 const FunctorType m_functor ;
162 const MPVectorWorkConfig< Cuda, Tag > m_config;
163 const Cuda::size_type m_work ;
164 const Policy m_policy;
 // Untagged case (TagType == void): enabled via enable_if; invokes the
 // functor as functor(i, j).  (Call body elided in this excerpt.)
166 template <
 class TagType>
168 typename std::enable_if<std::is_same<TagType, void>::value>::type
169 exec_range(
 const Cuda::size_type i, Cuda::size_type
 j)
 const {
 // Tagged case: invokes the functor as functor(TagType(), i, j).
173 template <
 class TagType>
175 typename std::enable_if<!std::is_same<TagType, void>::value>::type
176 exec_range(
 const Cuda::size_type i, Cuda::size_type
 j)
 const {
177 m_functor(TagType(), i,
 j);
180 Policy
 const& get_policy()
 const {
 return m_policy; }
 // Device-side kernel body.  Each (blockIdx.x, threadIdx.y) pair walks
 // the work range with stride blockDim.y * gridDim.x; threadIdx.x is
 // forwarded as the second functor argument (presumably the intra-team
 // vector/ensemble lane -- confirm against callers).
184 void operator()(
 void)
 const
186 const Cuda::size_type work_stride = blockDim.y * gridDim.x ;
188 for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
190 iwork += work_stride ) {
191 this->
 template exec_range<Tag>(iwork, threadIdx.x);
 // Host-side constructor: captures the functor/config, computes the
 // launch geometry below, and launches the kernel.
195 ParallelFor(
 const FunctorType & functor ,
196 const MPVectorWorkConfig< Cuda, Tag > & work_config )
197 : m_functor( functor ) ,
198 m_config( work_config ) ,
199 m_work( work_config.range ),
 // Choose the block y-extent (number of work rows per block):
 //  - team larger than a warp: each team occupies
 //    ceil(team / WarpSize) warps, so fewer rows fit per block;
210 Cuda::size_type nwarp = 0;
211 if (m_config.team > CudaTraits::WarpSize) {
212 const Cuda::size_type warps_per_team =
213 ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
214 nwarp = cuda_internal_maximum_warp_count() / warps_per_team;
 //  - team fits inside a warp: pack WarpSize / team teams per warp.
217 const Cuda::size_type teams_per_warp =
218 CudaTraits::WarpSize / m_config.team ;
219 nwarp = cuda_internal_maximum_warp_count() * teams_per_warp;
 // Block: x = team size, y = work rows per block.
221 const dim3 block( m_config.team , nwarp , 1 );
 // Grid: enough blocks to cover m_work rows (ceil division), clamped
 // to the device's maximum grid extent.
223 Cuda::size_type nblock =
224 std::min( (m_work + block.y - 1 ) / block.y ,
225 cuda_internal_maximum_grid_count()[0] );
226 const dim3 grid( nblock , 1 , 1 );
 // Dynamic shared memory: exactly what the work configuration requested.
228 const Cuda::size_type shared = m_config.shared;
229 CudaParallelLaunch< ParallelFor >( *
 this , grid , block , shared , m_policy.space().impl_internal_space_instance() );
typedef ExecSpace execution_space;
MPVectorWorkConfig(const size_t range_, const size_t team_, const size_t shared_ = 0)
  constructs a work configuration that runs in the default execution space instance.
typedef MPVectorWorkConfig execution_policy;
MPVectorWorkConfig(const execution_space& space, const size_t range_, const size_t team_, const size_t shared_ = 0)
  constructs a work configuration that runs in the provided execution space instance.