// Here's the universal global index calculation function I use.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
///////////////////////////////////////////////////////////////////////////////
//Universal Gid calculation on any Dimensional grid and any Dimensional Block//
///////////////////////////////////////////////////////////////////////////////
//Kernel code
// Computes a flat global thread index (gid) that is valid for any 1D/2D/3D
// grid of 1D/2D/3D blocks, then prints the block index, thread index, gid and
// input[gid] for the calling thread.
//
// Linearisation is x-fastest, then y, then z, both for threads inside a block
// and for blocks inside the grid:
//   gid = blockId * threads_per_block
//       + threadIdx.z * (blockDim.x * blockDim.y)
//       + threadIdx.y * blockDim.x
//       + threadIdx.x
//
// input: device pointer read at index gid by every launched thread. The
//        signature carries no length, so no bounds check is possible here;
//        the caller must make sure the buffer holds at least
//        (gridDim.x * gridDim.y * gridDim.z) *
//        (blockDim.x * blockDim.y * blockDim.z) ints.
__global__ void universalGidCalculation(int* input)
{
    // First section: locate the thread and compute its offset within its block.
    int column = threadIdx.x;
    int row    = threadIdx.y;
    int aisle  = threadIdx.z;

    int threads_per_row   = blockDim.x;                            // threads in x, i.e. one row
    int threads_per_aisle = blockDim.x * blockDim.y;               // threads in one x-y slice
    int threads_per_block = blockDim.x * blockDim.y * blockDim.z;

    int rowOffset   = row * threads_per_row;       // threads skipped by the rows before this one
    int aisleOffset = aisle * threads_per_aisle;   // threads skipped by the slices before this one

    // Second section: locate the block and compute its offset within the grid.
    int blockColumn = blockIdx.x;
    int blockRow    = blockIdx.y;
    int blockAisle  = blockIdx.z;

    int blocks_per_row   = gridDim.x;               // blocks in x, i.e. one row of blocks
    int blocks_per_aisle = gridDim.x * gridDim.y;   // blocks in one x-y slice of the grid

    int blockRowOffset   = blockRow * blocks_per_row;       // blocks skipped by earlier rows
    int blockAisleOffset = blockAisle * blocks_per_aisle;   // blocks skipped by earlier slices

    int blockId     = blockColumn + blockRowOffset + blockAisleOffset;
    int blockOffset = blockId * threads_per_block;   // threads in all blocks before this one

    int gid = blockOffset + aisleOffset + rowOffset + column;

    printf("blockIdx : (%d,%d,%d) ThreadIdx :(%d,%d,%d), gid : (%2d), input[gid] :(%2d) \n",
           blockIdx.x, blockIdx.y, blockIdx.z,
           threadIdx.x, threadIdx.y, threadIdx.z,
           gid, input[gid]);
}//end universalGidCalculation
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool reportCudaError(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills a 64-element array with 0..63, prints it, copies it to
// the device and launches universalGidCalculation so the gid printed by each
// thread can be compared against the value stored at that index.
// Returns 0 on success and 1 if the launch configuration is too large for the
// array or any CUDA call fails.
int main()
{
    const int arraySize = 64;
    const size_t arrayByteSize = sizeof(int) * arraySize;

    // hostData[i] == i, so input[gid] printed by the kernel should equal gid.
    int hostData[arraySize];
    for (int i = 0; i < arraySize; i++)
        hostData[i] = i;

    // Print the array before running the kernel, to compare against.
    for (int i = 0; i < arraySize; i++)
    {
        if (i == 33)
        {
            // Line break after the element at index 33.
            printf("%d\n", hostData[i]);
        }
        else
            printf("%d ", hostData[i]);
    }
    printf("\n\n");

    // Create blocks and grid.
    // Change these values to reflect different block and grid sizes to test
    // and verify the gid calculation in the kernel.
    dim3 block(2, 2, 2);
    dim3 grid(2, 2, 2);

    // The kernel reads input[gid] without a bounds check, so refuse any
    // configuration that launches more threads than there are elements.
    const unsigned long long totalThreads =
        (unsigned long long)grid.x * grid.y * grid.z * block.x * block.y * block.z;
    if (totalThreads > (unsigned long long)arraySize)
    {
        fprintf(stderr, "launch uses %llu threads but the array holds only %d elements\n",
                totalThreads, arraySize);
        return 1;
    }

    // Create the device pointer, allocate space on the GPU and copy the data over.
    int* deviceData = NULL;
    bool ok = reportCudaError(cudaMalloc((void**)&deviceData, arrayByteSize), "cudaMalloc");
    ok = ok && reportCudaError(
                   cudaMemcpy(deviceData, hostData, arrayByteSize, cudaMemcpyHostToDevice),
                   "cudaMemcpy");

    if (ok)
    {
        universalGidCalculation <<< grid, block >>> (deviceData);
        // Launch-configuration errors show up via cudaGetLastError(); faults
        // during execution show up at the synchronizing call.
        ok = reportCudaError(cudaGetLastError(), "kernel launch") &&
             reportCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    // cudaFree on a null pointer performs no operation, so this is safe on
    // every path.
    cudaFree(deviceData);
    cudaDeviceReset();
    return ok ? 0 : 1;
}//end main