Hi, I packed two CUDA reduce kernel launches into a device function, cudaReduceSum(), and I call that function from inside another kernel. I find that cudaDeviceSynchronize() must be called after cudaReduceSum() to get the correct result.
The kernels are launched on stream 0 (the default), so my understanding is that the work should run serially within each thread, and no synchronization should be needed.
Could somebody tell me what the real issue is here? Thanks!
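For context, my mental model comes from the host side, where two kernels launched into the default stream execute in issue order, so a dependent kernel can consume the previous kernel's output without any synchronization between the launches. A minimal host-side sketch of that expectation (kernA/kernB are hypothetical kernels, not from my code):

#include <cuda_runtime.h>

// Hypothetical kernels, only to illustrate the ordering I expected.
__global__ void kernA(float *data)                   { data[threadIdx.x] += 1.0f; }
__global__ void kernB(const float *data, float *out) { out[threadIdx.x] = 2.0f * data[threadIdx.x]; }

int main()
{
    float *d_data, *d_out;
    cudaMalloc(&d_data, 32 * sizeof(float));
    cudaMalloc(&d_out,  32 * sizeof(float));
    cudaMemset(d_data, 0, 32 * sizeof(float));

    // Both launches go to the default (0) stream: kernB is guaranteed to
    // start only after kernA has finished, with no explicit sync in between.
    kernA<<<1, 32>>>(d_data);
    kernB<<<1, 32>>>(d_data, d_out);

    cudaDeviceSynchronize();   // the host only waits here, before using the results
    cudaFree(d_data);
    cudaFree(d_out);
    return 0;
}

I assumed the same ordering would hold for the device-side launches inside cudaReduceSum(). My actual code is below: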
__global__ void kernM(dim3 grid, dim3 block,
                      const unsigned int comps,
                      const unsigned int pts,
                      const unsigned int dim,
                      const float *X, float *loggamma,
                      float *working, float *Pi, float *Mu,
                      float *Sigma, const float weight,
                      const float tol_spt)
{
    const unsigned int tid = threadIdx.x;
    // each thread works on its own slice of loggamma / working
    float *loggammaK = &loggamma[tid * pts];
    float *workingK  = &working[tid * pts * dim * dim];

    cudaReduceSum(grid, block, pts, loggammaK, workingK, 0);
    cudaDeviceSynchronize();   // without this, maxArg below is wrong (why?)
    const float maxArg = workingK[0];

    cudaReduceSum(grid, block, pts, workingK, workingK, 0);
    cudaDeviceSynchronize();   // also needed here
}
__global__
void kernReduceSum(unsigned int num, const float *d_in, float *d_out)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    volatile __shared__ float sdata[1024];

    // each thread loads up to two input elements into shared memory
    sdata[tid] = 0;
    if (idx + blockDim.x < num) {
        sdata[tid] = d_in[idx] + d_in[idx + blockDim.x];
    } else if (idx < num) {
        sdata[tid] = d_in[idx];
    }
    __syncthreads();

    // tree reduction in shared memory
    if (blockDim.x >= 1024) { if (tid < 512) { sdata[tid] += sdata[tid + 512]; } __syncthreads(); }
    if (blockDim.x >= 512)  { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockDim.x >= 256)  { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockDim.x >= 128)  { if (tid < 64)  { sdata[tid] += sdata[tid + 64];  } __syncthreads(); }

    // final warp: warp-synchronous, relies on sdata being volatile
    if (tid < 32) {
        if (blockDim.x >= 64) sdata[tid] += sdata[tid + 32];
        if (blockDim.x >= 32) sdata[tid] += sdata[tid + 16];
        if (blockDim.x >= 16) sdata[tid] += sdata[tid + 8];
        if (blockDim.x >= 8)  sdata[tid] += sdata[tid + 4];
        if (blockDim.x >= 4)  sdata[tid] += sdata[tid + 2];
        if (blockDim.x >= 2)  sdata[tid] += sdata[tid + 1];
    }

    if (tid == 0) d_out[blockIdx.x] = sdata[0];
}
__host__ __device__
void cudaReduceSum(const dim3 grid, const dim3 block,
                   const unsigned int num, const float *dIn, float *dOut,
                   cudaStream_t stream)
{
    // first pass: each block reduces 2*blockDim.x input elements to one partial sum
    kernReduceSum<<<(grid.x + 1) / 2, block, 0, stream>>>(num, dIn, dOut);
    // second pass: a single block reduces the per-block partial sums
    if ((grid.x + 1) / 2 != 1) {
        const int n = (grid.x / 2) + (grid.x % 2);
        kernReduceSum<<<1, block, 0, stream>>>(n, dOut, dOut);
    }
}
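In case it helps, here is a simplified, self-contained sketch of the same pattern (placeholder names, not my real code) that I believe isolates what I am asking about: a parent kernel launches a child kernel into the default stream and then reads the child's output. It is built with relocatable device code enabled (-rdc=true) for dynamic parallelism.

#include <cstdio>

// Toy child "reduction": one thread sums everything, just so the parent has
// a result to read back.
__global__ void childSum(unsigned int n, const float *in, float *out)
{
    if (threadIdx.x == 0) {
        float s = 0.0f;
        for (unsigned int i = 0; i < n; ++i) s += in[i];
        out[0] = s;
    }
}

__global__ void parentKern(unsigned int n, const float *in, float *out)
{
    // Child launch into the default (0) stream, from device code.
    childSum<<<1, 1>>>(n, in, out);
    // This is the call I am asking about: in my real code, removing the
    // corresponding cudaDeviceSynchronize() gives wrong results.
    cudaDeviceSynchronize();
    printf("parent sees sum = %f\n", out[0]);
}

int main()
{
    const unsigned int n = 1024;
    float ones[n];
    for (unsigned int i = 0; i < n; ++i) ones[i] = 1.0f;

    float *d_in, *d_out;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, ones, n * sizeof(float), cudaMemcpyHostToDevice);

    parentKern<<<1, 1>>>(n, d_in, d_out);   // expected sum: 1024
    cudaDeviceSynchronize();

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}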
Another issue: I tried to create a stream in a host function, but cudaStreamCreate is very slow. Is there a likely reason for this? Thanks! Here is how I time it:
auto start = std::chrono::high_resolution_clock::now();
cudaStream_t streams[1];
cudaStreamCreate(&streams[0]);
auto end = std::chrono::high_resolution_clock::now();
auto duration =
std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
std::cout << "Time elapsed: " << duration.count() << "s" << std::endl;
// result: Time elapsed: 3.15592s
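One guess on my part (unverified) is that the first CUDA call in the process pays for context initialization, and that cost is being attributed to cudaStreamCreate. To separate the two, I intend to time the stream creation again after a warm-up call, along these lines:

#include <chrono>
#include <iostream>
#include <cuda_runtime.h>

int main()
{
    // Warm-up call so that context creation is not charged to the timed call
    // (just a guess on my part that this is where the time goes).
    cudaFree(0);

    auto start = std::chrono::high_resolution_clock::now();
    cudaStream_t s;
    cudaStreamCreate(&s);
    auto end = std::chrono::high_resolution_clock::now();

    auto duration =
        std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
    std::cout << "cudaStreamCreate after warm-up: " << duration.count() << "s" << std::endl;

    cudaStreamDestroy(s);
    return 0;
}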