I am generating 4D-array using Cuda. But When my kernel is
__global__ void getIndex(float* A, int w, int h, int d, int t)
{
int i = blockIdx.x;
int j = blockIdx.y;
int k = blockIdx.z;
A[t*h*w*d + h*w*i + w*j + k] = h*w*i+ w*j+ k;
printf("kernel value = %f\n",A[t*h*w*d + h*w*i + w*j + k]);
}
It is compiling and working but after some changes, it is not working, throwing bundles of errors.
__global__ void getIndex(float* A, int w, int h, int d, int t)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
int k = threadIdx.z + blockIdx.z * blockDim.z;
A[t*h*w*d + h*w*i + w*j + k] = h*w*i+ w*j+ k;
printf("kernel value = %f\n",A[t*h*w*d + h*w*i + w*j + k]);
}
My Main Program is as follows
int main()
{
unsigned w = 5, h = 6, d = 5, nt = 7 ;
float *d_A;
cudaMalloc((void **)&d_A, nt * w * h * d * sizeof(float) );
for (int t = 0 ; t < nt ; t++)
{
dim3 blockDim(8, 8, 1);
dim3 gridSize(((w+blockDim.x)/blockDim.x),((h+blockDim.y)/blockDim.y),((d+blockDim.z)/blockDim.z));
getIndex <<<gridSize, blockDim >>> (d_A, w, h, d, t);
}
float *h_data = (float *) malloc(nt * w * h * d * sizeof(float));
cudaMemcpy(h_data, d_A, nt * w * h * d * sizeof(float), cudaMemcpyDeviceToHost);
return 0 ;
}
And I want to use multi-node multi-GPU for this task. I am unable to make it generic.
I know to use multi-GPU for a 1D array, but I am not getting for the multi-dimensional array while using dim3
.
https://github.com/Newbie-Coder-1105/NoviceChannel/tree/master/Cuda-Practice/multi_GPU_practice
Is there any way to perform such task for 3d array while using
dim3
cudaMalloc3DArray
texture memory for the 3d array
Why am I getting errors while using
threadIdx.x + blockIdx.x * blockDim.x
, in global kernel?
How could I make it multi-node multi-GPU, without knowing the number of GPUs in the cluster system?