Hi, all. I’m new to CUDA. I have a question about thread block size versus data size.
Below is my code.
And my question is:
If I set my data size to 3 by 3, the thread block to 4 by 4, and the grid to 1 by 1 block, how many threads will actually run on my device? Is it 4 by 4 or 3 by 3?
And if the number of threads is 4 by 4, does it mean that I will get an out-of-range error when running on the GPU because the data size is only 3 by 3?
I did a little research in emulation mode, and the CPU gives me the error.
So if my data size is smaller than the number of threads, what should I do to avoid the out of range error? Thanks!
/**
 * Stores each element's own flat row-major index into a dim x dim float array.
 *
 * Expected launch: a 2D grid of 2D blocks covering at least dim x dim threads.
 * The launch may be larger than the data (for example one 4x4 block for 3x3
 * data). Every launched thread still executes the kernel, so the surplus
 * threads are filtered out by the bounds check below instead of writing past
 * the end of the buffer.
 *
 * @param d_out output buffer with room for at least dim*dim floats
 * @param dim   side length of the square array; passed as a float, an
 *              integral value is expected
 */
__global__ void assignNumber(float* d_out, float dim)
{
    // dim arrives as a float; do all index arithmetic on an integer copy.
    const int n = static_cast<int>(dim);

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the n x n region have nothing to do.
    // Without this guard, y >= n indexes past the end of d_out, and x >= n
    // aliases an element that belongs to the next row.
    if (x >= n || y >= n)
    {
        return;
    }

    const int pos = y * n + x;
    //printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\n",gridDim.x, gridDim.y, blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y, pos);
    d_out[pos] = static_cast<float>(pos);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

/**
 * Fills a dim x dim device array with assignNumber, copies it back to the
 * host and verifies that every element equals its own flat index.
 *
 * Prints "Success" or "Failed". Returns 0 when the check passes and 1 when
 * it fails or when any CUDA call reports an error.
 */
int main()
{
    const int dim = 3;
    const int count = dim * dim;

    float* d_out = 0;
    if (!cudaOk(cudaMalloc((void**)&d_out, count * sizeof(float)), "cudaMalloc"))
    {
        return 1;
    }

    // Round the grid up so the launch covers the data even when dim is not a
    // multiple of the block size; the kernel is expected to ignore the
    // threads that land outside the dim x dim region.
    dim3 thread(4, 4);
    dim3 grid((dim - 1) / thread.x + 1, (dim - 1) / thread.y + 1);
    assignNumber<<<grid, thread>>>(d_out, dim);

    float* h_out = new float[count];

    // A kernel launch returns no status of its own: launch errors are read
    // with cudaGetLastError(), and the copy back to the host reports any
    // fault raised while the kernel was running.
    bool rst = cudaOk(cudaGetLastError(), "assignNumber launch")
            && cudaOk(cudaMemcpy(h_out, d_out, count * sizeof(float), cudaMemcpyDeviceToHost),
                      "cudaMemcpy");

    // Check if pos equals the value; stop at the first mismatch.
    for (int pos = 0; rst && pos < count; ++pos)
    {
        if (h_out[pos] != pos)
        {
            rst = false;
        }
    }

    // Release both buffers on every path that reaches this point.
    cudaFree(d_out);
    delete[] h_out;

    if (rst)
    {
        std::cout << "Success" << std::endl;
    }
    else
    {
        std::cout << "Failed" << std::endl;
    }
    return rst ? 0 : 1;
}