For those of you tired of worrying about thread allocation on the 2D or 3D grid of CUDA devices, I wrote a little utility: Hypergrid lets you create a virtual N-dimensional grid wrapped onto a standard CUDA 2D grid. It calculates the threads per block and the blocks per grid necessary to implement the virtual grid. Please see the example below:
#include <stdio.h>
#include <stdarg.h>
#define MAXDIMS 8
// hypergrid: wraps a virtual N-dimensional index space (at most MAXDIMS
// dimensions) onto a 2D CUDA launch configuration.
//
// The host object computes `hgrid` / `hblock` (the <<<grid, block>>> launch
// arguments) and keeps a bitwise snapshot of itself in device memory
// (`cuda_hypergrid`) so kernels can call the __device__ helpers through that
// pointer. The snapshot is refreshed by copy(); call copy() again after
// changing any public field by hand.
//
// Do not copy instances by value: the implicit copy would share
// `cuda_hypergrid` and both destructors would call cudaFree on it.
class hypergrid {
public:
    int no_dims;                 // number of virtual dimensions in use
    int dims[MAXDIMS];           // extent of each virtual dimension
    int part_dims[MAXDIMS];      // product of the extents of all lower dimensions
    int *indexes[MAXDIMS];       // optional per-dimension output buffers used by access();
                                 // never allocated here, the caller must supply device pointers
    int no_items;                // total number of virtual items (product of dims)
    int gx, gy, horizgrid;       // logical 2D layout: gx == horizgrid columns, gy rows
    hypergrid *cuda_hypergrid;   // device-side snapshot of this object (NULL if allocation failed)
    dim3 hgrid;                  // blocks per grid for the kernel launch
    dim3 hblock;                 // threads per block for the kernel launch

    // Recomputes the launch geometry for a logical row width of `hg` items
    // and pushes the updated object to the device.
    // A width below 1 is treated as 1 so the row computation cannot divide by zero.
    void set_hgrid_size(int hg) {
        if (hg < 1) hg = 1;
        horizgrid = hg;
        gx = hg;
        gy = (no_items + horizgrid - 1) / horizgrid;  // ceil(no_items / horizgrid)

        const int tpbx = threads_for_extent(gx);
        const int tpby = threads_for_extent(gy);

        int blocksx = (gx + tpbx - 1) / tpbx;
        int blocksy = (gy + tpby - 1) / tpby;
        if (blocksx < 1) blocksx = 1;
        if (blocksy < 1) blocksy = 1;

        hgrid = dim3(blocksx, blocksy);
        hblock = dim3(tpbx, tpby);
        copy();
    }

    // hg  : logical row width (see set_hgrid_size).
    // n   : number of dimensions that follow; values above MAXDIMS are
    //       clamped (with a warning) and negative values are treated as 0.
    // ... : n int extents, lowest (fastest-varying) dimension first. Extents
    //       are expected to be positive; they are not validated here.
    hypergrid(int hg, int n, ...) {
        int i;
        va_list vl;

        cuda_hypergrid = NULL;
        if (n < 0) n = 0;
        if (n > MAXDIMS) {
            fprintf(stderr, "hypergrid: %d dimensions requested, clamping to %d\n", n, MAXDIMS);
            n = MAXDIMS;
        }
        no_dims = n;

        va_start(vl, n);
        for (i = 0; i < no_dims; i++) {
            dims[i] = va_arg(vl, int);
        }
        va_end(vl);

        no_items = 1;
        for (i = 0; i < no_dims; i++) {
            part_dims[i] = no_items;
            no_items *= dims[i];
        }
        // Give the unused slots harmless values so the device snapshot never
        // contains indeterminate data (get_idx on an unused slot yields 0).
        for (i = no_dims; i < MAXDIMS; i++) {
            dims[i] = 1;
            part_dims[i] = 1;
        }
        for (i = 0; i < MAXDIMS; i++) {
            indexes[i] = NULL;
        }

        cudaError_t err = cudaMalloc((void**)&cuda_hypergrid, sizeof(hypergrid));
        if (err != cudaSuccess) {
            fprintf(stderr, "hypergrid: cudaMalloc failed: %s\n", cudaGetErrorString(err));
            cuda_hypergrid = NULL;
        }

        set_hgrid_size(hg);
        printf("grid geometry: %u %u |||| %u %u\n", hgrid.x, hgrid.y, hblock.x, hblock.y);
        printf("Items=%d\n", no_items);
    }

    // Releases the device-side snapshot, if one was allocated.
    ~hypergrid() {
        if (cuda_hypergrid) cudaFree(cuda_hypergrid);
    }

    // Stores the coordinate of flat index `i` along dimension `j` into
    // indexes[j][i]. indexes[j] must already point to a device buffer with
    // room for at least i+1 ints; this class never allocates it.
    __device__ void access(int i, int j) {
        indexes[j][i] = (i / part_dims[j]) % dims[j];
    }

    // Copies the current host object over the device snapshot.
    // Does nothing when the device allocation failed.
    void copy() {
        if (!cuda_hypergrid) return;
        cudaError_t err = cudaMemcpy(cuda_hypergrid, this, sizeof(hypergrid), cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            fprintf(stderr, "hypergrid: cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        }
    }

    // Returns the flat virtual index of the calling thread.
    // Callers must still check the result against no_items: the launch grid
    // is rounded up to whole blocks, so some threads fall outside the virtual
    // grid. Threads in the horizontal padding (ix >= horizgrid) get no_items
    // so that this check rejects them instead of letting them alias the
    // index of a thread in the next row.
    __device__ int get_absidx() {
        int ix = blockDim.x * blockIdx.x + threadIdx.x;
        int iy = blockDim.y * blockIdx.y + threadIdx.y;
        if (ix >= horizgrid) return no_items;
        return ix + horizgrid * iy;
    }

    // Coordinate of flat index `i` along dimension `iz` (works for any extent).
    __device__ int get_idx(int iz, int i) {
        return (i / part_dims[iz]) % dims[iz];
    }

    // Same as get_idx but replaces the modulo with a bit mask; the result is
    // only correct when dims[iz] is a power of two.
    __device__ int get_idx2(int iz, int i) {
        return (i / part_dims[iz]) & (dims[iz] - 1);
    }

private:
    // Largest power of two in {16, 8, 4, 2, 1} that does not exceed `extent`;
    // returns 1 for any extent below 2 (including non-positive values).
    static int threads_for_extent(int extent) {
        int t = 16;
        while (t > 1 && extent < t) t >>= 1;
        return t;
    }
};
// Plain aggregate of fixed-size arrays; the use_idx kernel receives a
// pointer to one of these and writes into its arrays.
struct appdata {
    float  item1[16];   // 16 floats
    int    item2[32];   // 32 ints
    double item3[16];   // 16 doubles
    char   item4[4];    // 4 chars
};
// Demo kernel: each thread resolves its flat virtual index into four
// per-dimension coordinates and records them in *data.
//
// hg   : device pointer to a hypergrid snapshot (hypergrid::cuda_hypergrid).
// data : device pointer to the appdata instance to fill.
//
// Launch with <<<hg->hgrid, hg->hblock>>> taken from the host-side hypergrid.
// The coordinates are decoded with get_idx2, which masks instead of taking a
// modulo, so every extent of the hypergrid must be a power of two. Each
// extent must also not exceed the size of the array it indexes
// (item1[16], item2[32], item3[16], item4[4]).
// Many threads store the same value to the same element; the device printf
// is for demonstration only.
__global__ void use_idx(hypergrid *hg, appdata *data) {
    int i = hg->get_absidx();
    // The launch grid is rounded up to whole blocks, so skip threads that
    // fall outside the virtual grid.
    if (i < hg->no_items) {
        int i0 = hg->get_idx2(0, i);
        int i1 = hg->get_idx2(1, i);
        int i2 = hg->get_idx2(2, i);
        int i3 = hg->get_idx2(3, i);
        data->item1[i0] = i0;
        data->item2[i1] = i1;
        data->item3[i2] = i2;
        data->item4[i3] = i3;  // dimension 3 belongs to item4, not item3
        printf("%d %d %d %d\n", i0, i1, i2, i3);
    }
}
// Example driver: builds a 16 x 32 x 16 x 4 virtual grid laid out in rows of
// 16384 items, runs use_idx over it and tears everything down.
// Returns 0 on success and 1 if a CUDA call or the kernel reported an error.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    // The kernel dereferences `data` on the device, so it must be device
    // memory of the full struct size (not the size of a pointer).
    appdata *data = NULL;
    cudaError_t err = cudaMalloc((void**)&data, sizeof(appdata));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc(appdata) failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Horizontal grid size,number of dimensions,list of dimensions
    hypergrid *hg = new hypergrid(16384, 4, 16, 32, 16, 4);

    use_idx<<<hg->hgrid, hg->hblock>>>(hg->cuda_hypergrid, data);

    // A launch reports configuration errors through cudaGetLastError();
    // faults during execution only surface at the next synchronizing call.
    // Synchronizing here also makes sure the kernel has finished (and its
    // printf output has been flushed) before the device is reset.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "use_idx failed: %s\n", cudaGetErrorString(err));
    }

    delete hg;
    cudaFree(data);
    cudaDeviceReset();
    return err == cudaSuccess ? 0 : 1;
}