Hi everyone!
I’ve encountered a strange bug(or feature:) ) with launching more than one same kernel sequentially - here is code:
__device__ void dotProdPartialSum(real *g_adata, real *g_bdata)
{
__shared__ real data[BLOCK_SIZE];
int tx = threadIdx.x;
int bx = blockIdx.x;
int idx = bx*BLOCK_SIZE + tx;
data[tx] = g_adata[idx]*g_bdata[idx];
if(tx<16)
data[tx]+=data[tx+16];
if(tx< 8)
data[tx]+=data[tx+ 8];
if(tx< 4)
data[tx]+=data[tx+ 4];
if(tx< 2)
data[tx]+=data[tx+ 2];
if(tx< 1)
data[tx]+=data[tx+ 1];
if(tx==0) partData[bx] = data[0];
}
__global__ void dotProduct(real *g_adata, real *g_bdata, real *result)
{
__shared__ bool isLast;
__shared__ real dataStore[GRID_SIZE];
int tx = threadIdx.x;
real sum = 0;
dotProdPartialSum(g_adata, g_bdata);
__threadfence();
if(tx==0)
{
int ticket = atomicInc(&retirementCount, gridDim.x);
isLast = (ticket==gridDim.x-1);
}
if(isLast)
{
for(int i = 0; i < GRID_SIZE; i+=BLOCK_SIZE)
dataStore[tx+i] = partData[tx+i];
for(int i = 0; i < GRID_SIZE; i+=2*BLOCK_SIZE)
{
if(tx<32)
dataStore[tx+i]+=dataStore[tx+32+i];
if(tx<16)
dataStore[tx+i]+=dataStore[tx+16+i];
if(tx< 8)
dataStore[tx+i]+=dataStore[tx+ 8+i];
if(tx< 4)
dataStore[tx+i]+=dataStore[tx+ 4+i];
if(tx< 2)
dataStore[tx+i]+=dataStore[tx+ 2+i];
if(tx< 1)
dataStore[tx+i]+=dataStore[tx+ 1+i];
sum+=dataStore[i];
}
}
*result = sum;
}
// some code - generating arrays and copying them to vram
for(int i = 0; i < 2; i++)
{
dotProduct<<<dimGrid, dimBlock>>>(r, r, rOut_d);
//mul<<<1,1>>>(r,r,rOut_d);
cudaMemcpy(rOut_h, rOut_d, sizeof(real), cudaMemcpyDeviceToHost);
std::cout << *rOut_h << " ";
}
And here I have a problem - after first launch I have some result, but all next results are zero (“0”). Could anyone tell me where am I wrong ?
CPU/RAM - i7 920/6Gb
Card - GTX470
OS - Win 7 sp1 x64
CUDA Toolkit ver - 4.0 RC2(cudatoolkit_4.0.13_win_64)
VS2008 sp1
Could you post complete code including testing?
what is partData?
#include <iostream>
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <device_launch_parameters.h>
#define real float
#define BLOCK_SIZE 32
#define N 8192
#define GRID_SIZE N/BLOCK_SIZE
__device__ unsigned int retirementCount = 0;
__device__ real partData[GRID_SIZE];
__device__ void dotProdPartialSum(real *g_adata, real *g_bdata)
{
__shared__ real data[BLOCK_SIZE];
int tx = threadIdx.x;
int bx = blockIdx.x;
int idx = bx*BLOCK_SIZE + tx;
data[tx] = g_adata[idx]*g_bdata[idx];
if(tx<16) data[tx]+=data[tx+16];
if(tx< 8) data[tx]+=data[tx+ 8];
if(tx< 4) data[tx]+=data[tx+ 4];
if(tx< 2) data[tx]+=data[tx+ 2];
if(tx< 1) data[tx]+=data[tx+ 1];
if(tx==0) partData[bx] = data[0];
}
__global__ void dotProduct(real *g_adata, real *g_bdata, real *result)
{
__shared__ bool isLast;
__shared__ real dataStore[GRID_SIZE];
int tx = threadIdx.x; real sum = 0;
dotProdPartialSum(g_adata, g_bdata);
__threadfence();
if(tx==0)
{
int ticket = atomicInc(&retirementCount, gridDim.x);
isLast = (ticket==gridDim.x-1);
}
if(isLast)
{
for(int i = 0; i < GRID_SIZE; i+=BLOCK_SIZE)
dataStore[tx+i] = partData[tx+i];
for(int i = 0; i < GRID_SIZE; i+=2*BLOCK_SIZE)
{
if(tx<32) dataStore[tx+i]+=dataStore[tx+32+i];
if(tx<16) dataStore[tx+i]+=dataStore[tx+16+i];
if(tx< 8) dataStore[tx+i]+=dataStore[tx+ 8+i];
if(tx< 4) dataStore[tx+i]+=dataStore[tx+ 4+i];
if(tx< 2) dataStore[tx+i]+=dataStore[tx+ 2+i];
if(tx< 1) dataStore[tx+i]+=dataStore[tx+ 1+i];
sum+=dataStore[i];
}
}
*result = sum;
}
int main ()
{
real *rOut_h = new real();
real *rOut_d = new real();
real* h_x = new real[N];
real* d_x = new real[N];
for(int i = 0; i < N; i++) h_x[i] = 1;
cudaSetDevice(0);
cudaEvent_t start, stop;
float gpu_time = 0;
float copy_time = 0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaMalloc((void**)&d_x, sizeC);
cudaMalloc((void**)&rOut_d, sizeof(real));
cudaMemcpy(d_x, h_x, sizeC, cudaMemcpyHostToDevice);
cudaEventRecord(start, 0);
dim3 dimBlock(BLOCK_SIZE,1,1);
dim3 dimGrid(GRID_SIZE,1,1);
for(int i = 0; i < 2; i++)
{
dotProduct<<<dimGrid, dimBlock>>>(d_x, d_x, rOut_d);
cudaMemcpy(rOut_h, rOut_d, sizeof(real), cudaMemcpyDeviceToHost);
std::cout << *rOut_h << " ";
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gpu_time, start, stop);
}
Does retirement count need to be reset to 0 between runs?
Wow! I knew I but couldn’t get where exactly. Thank you very much!