Ok, I did more testing. I wrote a simple program (as below).
If I use dimBlock(32,32), I got the correct result, A.width is changed to 10. If I use dimBlock(512,512) or dimBlock(48,48), I got wrong result, A.width is still 1 (not changed to 10).
But for both dimBlock(32,32) and dimBlock(512,512), I got “kernel launch good” message. Is that my error checking is not correct?
Thanks,
lightblue
#include <stdio.h>
#include <unistd.h>
#include <cuda.h>
typedef struct
{
int width;
int height;
} data;
global void sample_kernel(data *);
int main ( int argc, char *argv)
{
data A, *d_A;
A.width=1; A.height=2;
cudaMalloc(&d_A, sizeof(data));
cudaMemcpy(d_A, &A, sizeof(data), cudaMemcpyHostToDevice);
//invoke kernel
dim3 dimBlock(32, 32);
dim3 dimGrid(24, 24);
sample_kernel<<<dimGrid, dimBlock>>>(d_A);
{
cudaError_t cudaerr = cudaDeviceSynchronize();
if ( cudaerr!=CUDA_SUCCESS)
printf(“kernel launch failed with err "%s".\n”,cudaGetErrorString(cudaerr));
else
printf(“Kernel launch good\n”);
}
cudaMemcpy(&A, d_A, sizeof(data), cudaMemcpyDeviceToHost);
printf(“now A.width is %d\n”, A.width);
}
global void sample_kernel(data *A)
{
if ( blockIdx.x==1 && blockIdx.y==1 && threadIdx.x==1 && threadIdx.y==1 )
{
A->width=10;
A->height=20;
printf(“setting A width to 10\n”);
}
}