Hi,
I am using CUDA1.0 and i want to do some atomic operation in a memory location at global area. My display card is 8800GTS. I changed the custom build setup to
$(CUDA_BIN_PATH)\nvcc.exe -arch sm_11 -ccbin “$(VCInstallDir)bin” -c -DWIN32 -D_CONSOLE -D_MBCS -Xcompiler /EHsc,/W3,/nologo,/Wp64,/O2,/Zi,/MT -I"$(CUDA_INC_PATH)" -I./ -I…/…/common/inc -o $(ConfigurationName)\template.obj template.cu
But the program is giving some different output than what i expect.
kernel
global void
testKernel( int* g_odata)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
if(tx==0&&bx==0)
{
g_odata[0]=0;
}
int nBlocksize = 16;
int nStart = bx * ceil((float)65536/nBlocksize) + tx * ceil((float)((65536/nBlocksize)/nBlocksize));
for( int i = nStart; i <= nStart+ceil((float)((65536/nBlocksize)/nBlocksize)); i=i+1 )
{
g_odata[0] = 1.0f;
//__syncthreads();
}
}
host
void
runTest( int argc, char** argv)
{
CUT_DEVICE_INIT();
int* pCpuOutData = (int*)malloc( 256256sizeof(float));
int* pOutData;
CUDA_SAFE_CALL( cudaMalloc( (void**) &pOutData, 256 * 256 * sizeof(int)));
dim3 grid(16,1);
dim3 thread(16,1);
testKernel<<<grid,thread>>>(pOutData);
CUDA_SAFE_CALL( cudaMemcpy( pCpuOutData, pOutData, 256 * 256 * sizeof(int),
cudaMemcpyDeviceToHost) );
printf("%d",pCpuOutData[0]);
CUDA_SAFE_CALL( cudaFree(pOutData));
free( pCpuOutData );
}
The output is some junk value like 11731320…
Please help me.