Hypergrid

For those of you tired of worrying about thread allocation on the 2D or 3D grid of CUDA devices, I wrote a little utility: Hypergrid lets you create a virtual N-dimensional grid wrapped onto a standard CUDA 2D grid. It calculates the threads per block and the blocks per grid necessary for the implementation of the virtual grid. Please see the example below:

#include <stdio.h>
#include <stdarg.h>

#define MAXDIMS 8

/*
 * hypergrid - a virtual N-dimensional index space laid over a 2D CUDA launch.
 *
 * The host constructs the object with a "horizontal" width and a list of
 * dimension extents. The constructor computes the total item count, derives a
 * 2D grid/block configuration (hgrid / hblock) and uploads a byte copy of the
 * object to device memory (cuda_hypergrid). Kernels receive cuda_hypergrid and
 * use get_absidx() + get_idx()/get_idx2() to turn their 2D thread position
 * into per-dimension indices.
 *
 * Notes:
 *  - The launch may contain more threads than no_items; kernels must guard
 *    with `absidx < no_items`.
 *  - All arithmetic is plain `int`; the product of the extents must fit.
 *  - The object owns cuda_hypergrid, so copying is disabled (a copy would
 *    free the same device pointer twice).
 */
class hypergrid {
public:
	int no_dims;              // number of dimensions in use (0..MAXDIMS)
	int dims[MAXDIMS];        // extent of each dimension
	int part_dims[MAXDIMS];   // product of the extents of all lower dimensions
	int *indexes[MAXDIMS];    // optional per-dimension output arrays used by access();
	                          // NULL after construction - set to device pointers and
	                          // call copy() before using access() in a kernel
	int no_items;             // product of all extents
	int gx,gy,horizgrid;      // virtual 2D extent (gx == horizgrid wide, gy rows)
	hypergrid *cuda_hypergrid;// device-side byte copy of this object (NULL if allocation failed)
	dim3 hgrid;               // blocks per grid for the launch
	dim3 hblock;              // threads per block for the launch

	/*
	 * Recompute the 2D launch geometry for a horizontal width of `hg` threads
	 * and refresh the device copy. Values of hg below 1 are treated as 1 so
	 * the row computation never divides by zero.
	 */
	void set_hgrid_size(int hg) {
		if (hg < 1) hg = 1;
		horizgrid = hg;
		gx = hg;
		// ceil(no_items / horizgrid) without the overflow risk of (a + b - 1) / b
		gy = no_items / horizgrid + ((no_items % horizgrid) != 0 ? 1 : 0);

		const int tpbx = pick_tpb(gx);
		const int tpby = pick_tpb(gy);

		int blocksx = (gx + tpbx - 1) / tpbx;
		int blocksy = (gy + tpby - 1) / tpby;
		if (blocksx < 1) blocksx = 1;
		if (blocksy < 1) blocksy = 1;

		hgrid = dim3(blocksx, blocksy);
		hblock = dim3(tpbx, tpby);
		copy();
	}

	/*
	 * hg  - horizontal width of the virtual 2D grid, in threads
	 * n   - number of dimensions that follow (clamped to 0..MAXDIMS)
	 * ... - n `int` extents, lowest (fastest varying) dimension first
	 */
	hypergrid(int hg,int n,...) {
		int i;

		// Put every member in a defined state before anything can fail.
		cuda_hypergrid = NULL;
		for (i = 0; i < MAXDIMS; i++) {
			dims[i] = 1;
			part_dims[i] = 1;
			indexes[i] = NULL;
		}

		if (n < 0) n = 0;
		if (n > MAXDIMS) {
			fprintf(stderr, "hypergrid: %d dimensions requested, only %d supported\n", n, MAXDIMS);
			n = MAXDIMS;
		}
		no_dims = n;

		va_list vl;
		va_start(vl, n);
		for (i = 0; i < no_dims; i++) {
			dims[i] = va_arg(vl, int);
		}
		va_end(vl);

		no_items = 1;
		for (i = 0; i < no_dims; i++) {
			part_dims[i] = no_items;
			no_items *= dims[i];
		}

		cudaError_t err = cudaMalloc((void**)&cuda_hypergrid, sizeof(hypergrid));
		if (err != cudaSuccess) {
			fprintf(stderr, "hypergrid: cudaMalloc failed: %s\n", cudaGetErrorString(err));
			cuda_hypergrid = NULL;
		}

		set_hgrid_size(hg);
		printf("grid geometry: %d %d |||| %d %d\n\n", (int)hgrid.x, (int)hgrid.y, (int)hblock.x, (int)hblock.y);
		printf("Items=%d\n", no_items);
	}

	~hypergrid() {
		if (cuda_hypergrid) {
			cudaFree(cuda_hypergrid);
			cuda_hypergrid = NULL;
		}
	}

	/*
	 * Store the index of absolute item `i` along dimension `j` into
	 * indexes[j][i]. Does nothing when indexes[j] has not been set.
	 */
	__device__ void access(int i,int j) {
		if (indexes[j]) indexes[j][i] = (i / part_dims[j]) % dims[j];
	}

	/*
	 * Upload the current host state to the device copy. Call again after
	 * changing any member on the host. No-op if the device allocation failed.
	 */
	void copy() {
		if (!cuda_hypergrid) return;
		cudaError_t err = cudaMemcpy(cuda_hypergrid, this, sizeof(hypergrid), cudaMemcpyHostToDevice);
		if (err != cudaSuccess) {
			fprintf(stderr, "hypergrid: cudaMemcpy failed: %s\n", cudaGetErrorString(err));
		}
	}

	/*
	 * Absolute (flattened) item index of the calling thread:
	 * global x + horizgrid * global y. May be >= no_items.
	 */
	__device__ int get_absidx() const {
		const int ix = blockDim.x * blockIdx.x + threadIdx.x;
		const int iy = blockDim.y * blockIdx.y + threadIdx.y;
		return ix + horizgrid * iy;
	}

	/* Index of absolute item `i` along dimension `iz` (any extent). */
	__host__ __device__ int get_idx(int iz,int i) const {
		return (i / part_dims[iz]) % dims[iz];
	}

	/*
	 * Same as get_idx() but uses a bit mask instead of a modulo; the result
	 * is only correct when dims[iz] is a power of two.
	 */
	__host__ __device__ int get_idx2(int iz,int i) const {
		return (i / part_dims[iz]) & (dims[iz]-1);
	}

private:
	// Not copyable: the destructor frees cuda_hypergrid (declared, never defined).
	hypergrid(const hypergrid &);
	hypergrid &operator=(const hypergrid &);

	// Largest power of two <= min(extent, 16), and never less than 1.
	static int pick_tpb(int extent) {
		int tpb = 16;
		while (tpb > 1 && tpb > extent) tpb >>= 1;
		return tpb;
	}
};

/*
 * Example payload written by the demo kernel: one array per virtual
 * dimension. Members of a struct are public by default.
 */
struct appdata {
	float  item1[16];   // 16 single-precision slots
	int    item2[32];   // 32 integer slots
	double item3[16];   // 16 double-precision slots
	char   item4[4];    // 4 byte-sized slots
};

/*
 * Demo kernel: decodes each thread's absolute index into four per-dimension
 * indices and records them in *data.
 *
 * Launch layout: 2D grid / 2D block as computed by the hypergrid
 * (hg->hgrid, hg->hblock on the host side). No shared memory is used.
 *
 * hg   - device pointer to the uploaded hypergrid (hypergrid::cuda_hypergrid)
 * data - device pointer to an appdata instance
 *
 * get_idx2() masks instead of taking a modulo, so every extent used here
 * must be a power of two. Many threads store the same value to the same
 * slot; the stores are redundant rather than conflicting. The device-side
 * printf is for demonstration only - it is slow and its output is buffered
 * until the next synchronization.
 */
__global__ void use_idx(hypergrid *hg,appdata *data) {
	const int i = hg->get_absidx();

	// The launch can be larger than the virtual grid; skip surplus threads.
	if (i < hg->no_items) {
		const int i0 = hg->get_idx2(0,i);
		const int i1 = hg->get_idx2(1,i);
		const int i2 = hg->get_idx2(2,i);
		const int i3 = hg->get_idx2(3,i);

		data->item1[i0] = i0;
		data->item2[i1] = i1;
		data->item3[i2] = i2;
		// The fourth index belongs in item4 (4 slots), not in item3 again.
		data->item4[i3] = (char)i3;

		printf("%d %d %d %d\n", i0, i1, i2, i3);
	}
}

/*
 * Example driver: builds a 4-dimensional hypergrid (16 x 32 x 16 x 4),
 * launches use_idx over it and tears everything down again.
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main(int argc,char **argv) {
	(void)argc;
	(void)argv;

	int status = 0;
	cudaError_t err;

	// Device-side payload for the kernel; it is only ever touched on the GPU.
	appdata *data = NULL;
	err = cudaMalloc((void**)&data, sizeof(appdata));
	if (err != cudaSuccess) {
		fprintf(stderr, "cudaMalloc(appdata) failed: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// Horizontal grid size,number of dimensions,list of dimensions
	hypergrid *hg = new hypergrid(16384,4,  16,32,16,4);

	if (hg->cuda_hypergrid == NULL) {
		fprintf(stderr, "hypergrid has no device copy; skipping kernel launch\n");
		status = 1;
	} else {
		use_idx<<<hg->hgrid,hg->hblock>>>(hg->cuda_hypergrid,data);

		// Launch-configuration errors are reported here ...
		err = cudaGetLastError();
		if (err != cudaSuccess) {
			fprintf(stderr, "use_idx launch failed: %s\n", cudaGetErrorString(err));
			status = 1;
		}
		// ... execution errors (and the buffered device printf output) at the sync.
		err = cudaDeviceSynchronize();
		if (err != cudaSuccess) {
			fprintf(stderr, "use_idx execution failed: %s\n", cudaGetErrorString(err));
			status = 1;
		}
	}

	delete hg;
	cudaFree(data);
	cudaDeviceReset();
	return status;
}