Sequential kernel launch - incorrect results

Hi everyone!

I’ve encountered a strange bug(or feature:) ) with launching more than one same kernel sequentially - here is code:

__device__ void dotProdPartialSum(real *g_adata, real *g_bdata)

{

	__shared__ real data[BLOCK_SIZE];

	int tx = threadIdx.x;

	int bx = blockIdx.x;

	int idx = bx*BLOCK_SIZE + tx;

	data[tx] = g_adata[idx]*g_bdata[idx];

	

	if(tx<16) 

		data[tx]+=data[tx+16];

	if(tx< 8) 

		data[tx]+=data[tx+ 8];

	if(tx< 4) 

		data[tx]+=data[tx+ 4];

	if(tx< 2) 

		data[tx]+=data[tx+ 2];

	if(tx< 1) 

		data[tx]+=data[tx+ 1];

	if(tx==0) partData[bx] = data[0];

}

__global__ void dotProduct(real *g_adata, real *g_bdata, real *result)

{		

	__shared__ bool isLast;

	__shared__ real dataStore[GRID_SIZE];

	int tx = threadIdx.x;

	real sum = 0;

	dotProdPartialSum(g_adata, g_bdata);

	

    __threadfence();

	if(tx==0)

	{

		int ticket = atomicInc(&retirementCount, gridDim.x);

        isLast = (ticket==gridDim.x-1);

	}

    if(isLast)

    {

		for(int i = 0; i < GRID_SIZE; i+=BLOCK_SIZE)

			dataStore[tx+i] = partData[tx+i];

		

		for(int i = 0; i < GRID_SIZE; i+=2*BLOCK_SIZE)

        {

			if(tx<32)

				dataStore[tx+i]+=dataStore[tx+32+i];

			if(tx<16)

				dataStore[tx+i]+=dataStore[tx+16+i];

			if(tx< 8)

				dataStore[tx+i]+=dataStore[tx+ 8+i];

			if(tx< 4)

				dataStore[tx+i]+=dataStore[tx+ 4+i];

			if(tx< 2)

				dataStore[tx+i]+=dataStore[tx+ 2+i];

			if(tx< 1)

				dataStore[tx+i]+=dataStore[tx+ 1+i];

			sum+=dataStore[i];

        }

    }	

	*result = sum;

}

// some code - generating arrays and copying them to vram

for(int i = 0; i <  2; i++)

	{

		dotProduct<<<dimGrid, dimBlock>>>(r, r, rOut_d);

		//mul<<<1,1>>>(r,r,rOut_d);

		cudaMemcpy(rOut_h, rOut_d, sizeof(real), cudaMemcpyDeviceToHost);

		std::cout << *rOut_h << " ";

        }

And here I have a problem - after first launch I have some result, but all next results are zero (“0”). Could anyone tell me where am I wrong ?

CPU/RAM - i7 920/6Gb

Card - GTX470

OS - Win 7 sp1 x64

CUDA Toolkit ver - 4.0 RC2(cudatoolkit_4.0.13_win_64)

VS2008 sp1

Could you post complete code including testing?
what is partData?

#include <iostream>

#include <stdio.h>

#include <cuda_runtime_api.h>

#include <cuda_runtime.h>

#include <device_functions.h>

#include <device_launch_parameters.h>

#define real float

#define BLOCK_SIZE 32

#define N 8192

#define GRID_SIZE N/BLOCK_SIZE

__device__ unsigned int retirementCount = 0;

__device__ real partData[GRID_SIZE];

__device__ void dotProdPartialSum(real *g_adata, real *g_bdata)

{

        __shared__ real data[BLOCK_SIZE];

        int tx = threadIdx.x;

        int bx = blockIdx.x;

        int idx = bx*BLOCK_SIZE + tx;

        data[tx] = g_adata[idx]*g_bdata[idx];        

        if(tx<16) data[tx]+=data[tx+16];

        if(tx< 8) data[tx]+=data[tx+ 8];

        if(tx< 4) data[tx]+=data[tx+ 4];

        if(tx< 2) data[tx]+=data[tx+ 2];

        if(tx< 1) data[tx]+=data[tx+ 1];

        if(tx==0) partData[bx] = data[0];

}

__global__ void dotProduct(real *g_adata, real *g_bdata, real *result)

{               

        __shared__ bool isLast;

        __shared__ real dataStore[GRID_SIZE];

        int tx = threadIdx.x;        real sum = 0;

        dotProdPartialSum(g_adata, g_bdata);        

    __threadfence();

        if(tx==0)

        {

                int ticket = atomicInc(&retirementCount, gridDim.x);

        isLast = (ticket==gridDim.x-1);

        }

    if(isLast)

    {

                for(int i = 0; i < GRID_SIZE; i+=BLOCK_SIZE)

                        dataStore[tx+i] = partData[tx+i];                

                for(int i = 0; i < GRID_SIZE; i+=2*BLOCK_SIZE)

        {

                        if(tx<32) dataStore[tx+i]+=dataStore[tx+32+i];

                        if(tx<16) dataStore[tx+i]+=dataStore[tx+16+i];

                        if(tx< 8) dataStore[tx+i]+=dataStore[tx+ 8+i];

                        if(tx< 4) dataStore[tx+i]+=dataStore[tx+ 4+i];

                        if(tx< 2) dataStore[tx+i]+=dataStore[tx+ 2+i];

                        if(tx< 1) dataStore[tx+i]+=dataStore[tx+ 1+i];

                        sum+=dataStore[i];

        }

    }   

        *result = sum;

}

int main ()

{

	real *rOut_h = new real();

	real *rOut_d = new real();

	real* h_x = new real[N];

	real* d_x = new real[N];

	for(int i = 0; i < N; i++) h_x[i] = 1;	

	cudaSetDevice(0);

	cudaEvent_t start, stop;

	float gpu_time = 0;

	float copy_time = 0;

	cudaEventCreate(&start);

	cudaEventCreate(&stop);

	cudaMalloc((void**)&d_x, sizeC);

	cudaMalloc((void**)&rOut_d, sizeof(real));

	cudaMemcpy(d_x, h_x, sizeC, cudaMemcpyHostToDevice);

	cudaEventRecord(start, 0);	

	dim3 dimBlock(BLOCK_SIZE,1,1);

	dim3 dimGrid(GRID_SIZE,1,1);

	for(int i = 0; i <  2; i++)

	{

		dotProduct<<<dimGrid, dimBlock>>>(d_x, d_x, rOut_d);		

		cudaMemcpy(rOut_h, rOut_d, sizeof(real), cudaMemcpyDeviceToHost);

		std::cout << *rOut_h << " ";		

	}

	cudaEventRecord(stop, 0);

	cudaEventSynchronize(stop);

	cudaEventElapsedTime(&gpu_time, start, stop);

}

Does retirement count need to be reset to 0 between runs?

Wow! I knew I but couldn’t get where exactly. Thank you very much!