My ad-hoc macro processor for CUDA loop unroller

We’ve been complaining quite a lot on the lousy compilers. Since NV isn’t responding, maybe we should share a few ad-hoc solutions.

I wrote this thing to write unrolled loops in reasonably simple syntax. It also contains code to enable easier use of device API, but I can’t get that part to work outside of my own project now :/ .

It’s written in lex and should be mostly portable. The system lines are windows specific, though.

%{

/*

CUDA macro processor.......

*/

#include <stdio.h>

#include <ctype.h>

char big[65536];

char texcrap[65536],*crap0=texcrap;

char texcrap1[65536],*crap1=texcrap1;

char fname[260]="$.cu";

char *fnamein="<stdin>";

int did;

int first=1;

int lid=1;

int enablecu=0;

int cns[9999];

%}

%%

^"!+enablecu"\n {

	enablecu=1;

	lid++;

}

^"!+set$".*\n {

	int id,v;

	sscanf(yytext+6,"%d",&id);

	sscanf(strchr(yytext,'=')+1,"%d",&v);

	cns[id]=v;

	lid++;

}

^"!!".*\n {

	strcpy(fname,yytext+2);

	fname[strlen(fname)-1]=0;

	lid++;

}

[ \t\n]"rep"[ \t\n]+[0-9a-zA-Z]+"<"[0-9$]+ {

	char buf[128];

	int i,n,c,sn,l0;

	char *s;

	l0=lid;

	for(s=yytext;*s!='r';s++);

	for(s+=3;isspace(*s);s++);

	strcpy(buf,s);

	s=strchr(buf,'<');

	*s=0;

	if(sscanf(s+1,"%d",&n)<=0)

	{

  sscanf(s+2,"%d",&n);

  n=cns[n];

	}

	for(c=0,sn=0;;sn++)

	{

  int ch=input();

  big[sn]=ch;

  if(ch=='{')c++;else

  if(ch=='}'){if(!--c)break;}else

  if(ch==';'){if(c==0)break;}

  if(first&&ch=='\n')

  {

  	lid++;

  	sprintf(big+sn+1,"#line %d \"%s\"\n",lid,fnamein);

  	sn+=strlen(big+sn)-1;

  }

	}

	big[sn+1]=0;

	fprintf(yyout,"\n");

	for(i=0;i<n;i++)

  fprintf(yyout,"#define %s %d\n%s\n#undef %s\n",buf,i,big,buf);

	did=1;

	if(first)fprintf(yyout,"\n#line %d \"%s\"\n",lid,fnamein);

}

"@_@(" {

	/*

	f gx gy ...

	this shouldn't be put in rep -_-b

	*/

	char buf[1024];

	int bran=0;

	char *s=buf;

	int fst=1;

	for(;;)

	{

  int c=input();

  *s++=c;if(c=='\n')lid++;

  if(c=='(')bran++;

  if(c==')')

  {

  	bran--;

  	if(bran<0)goto separa;

  }

  if(c==','&&bran==0)

  {

  separa:

  	s[-1]=0;

  	fprintf(yyout,"\n#line %d \"%s\"\n%s(%s);\n",lid,fnamein,fst?"{_startcuparam":"_addcuparam",buf);

  	fst=0;

  	s=buf;

  	if(bran<0)break;

  }

	}

	fprintf(yyout,"\n#line %d \"%s\"\n_endcuparam();}\n",lid,fnamein);

	did=1;

}

^[ \t]*"texture"[ \t]*"<".+$ {

	if(first&&enablecu)

	{

  char buf[1024];

  char *s,*s1,*s2;

  strcpy(buf,yytext);

  s=strchr(buf,'<')+1;

  s1=strchr(s,',');*s1=0;

  s2=strchr(s1+1,'>')+1;while(isspace(*s2))s2++;

  s2[strlen(s2)-1]=0;

  fprintf(yyout,"CUtexref %s_drv;int %s_bound=0;",s2,s2);

  sprintf(crap0,"cuModuleGetTexRef(&%s_drv,mymod,\"%s\");\

cuTexRefSetFormat(%s_drv,CUM_FORMAT_%s,CUM_COMPO_%s);\

cuTexRefSetFlags(%s_drv,CU_TRSF_READ_AS_INTEGER);",s2,s2,s2,s,s,s2);

  sprintf(crap1,"if(%s_bound)cuParamSetTexRef(cupf,CU_PARAM_TR_DEFAULT,%s_drv);",s2,s2);

  crap0+=strlen(crap0);

  crap1+=strlen(crap1);

  did=1;

	}

	ECHO;

}

^[ \t]*"__constant__"[ \t]*.+$ {

	if(first&&enablecu)

	{

  char buf[1024];

  char *s,*s1;

  strcpy(buf,yytext);

  for(s=buf+strlen(buf)-1;!isspace(*s);s--);

  s++;

  for(s1=s;*s1!='['&&*s1!=';'&&*s1;s1++);

  *s1=0;

  fprintf(yyout,"CUdeviceptr %s_drv;",s);

  sprintf(crap0,"cuModuleGetGlobal(&%s_drv,&useless,mymod,\"%s\");",s,s);

  crap0+=strlen(crap0);

  did=1;

	}

	ECHO;

}

. ECHO;

"\"\n {lid++;ECHO;}

\n {

	if(first){lid++;fprintf(yyout,"\n#line %d \"%s\"",lid,fnamein);}

	ECHO;

}

%%

yywrap(){}

int main(int c,char**v)

{

	*crap0=0;*crap1=0;

	if(c>1){yyin=fopen(v[1],"r");fnamein=v[1];}

	yyout=fopen("$****$.tmp","w");

	for(;;){

  did=0;

  yylex();

  if(first&&enablecu)

  	fprintf(yyout,"void _texinit(){%s}void _texset(){%s}\n",texcrap,texcrap1);

  fclose(yyout);

  fclose(yyin);

  if(!did)break;

  system("del $****$.tm1>nul");

  system("move $****$.tmp $****$.tm1");

  yyin=fopen("$****$.tm1","r");

  yyout=fopen("$****$.tmp","w");

  first=0;

	}

	system("del $****$.tm1>nul");

	sprintf(big,"move $****$.tmp %s",fname);

	system(big);

	return 0;

}

Example code:

//set output file name and constant

!!a.cu

!+set$0=3

__global__ void devggmeshv(float4 *pv2,float4 *pv,int vn)

{

	int thid=threadIdx.x;

	int bid=blockIdx.x;

	int id=bid*thmax+thid;

	if(id>=vn)return;

	id*=2;

	float4 a0=pv[id],a1=pv[id+1];

	extern __shared__ float shf[];

	float volatile *b=shf+thid*3;

	//for(int i=0;i<3;i++)

	rep i<3{

  b[i]=

  	a0.x*devggtran.a[i]+

  	a0.y*devggtran.a[4+i]+

  	a0.z*devggtran.a[8+i]+

  	devggtran.a[12+i];

	}

	pv2[id]=make_float4(b[0],b[1],b[2],a0.w);

	float ilg=1.f;//sqrt(max(a1.y*a1.y+a1.z*a1.z+a1.w*a1.w,1e-8));

	//can use constants for loop upper bound

	rep i<$0{

  b[i]=ilg*(

  	a1.x*devggtran.a[16+i]+

  	a1.y*devggtran.a[20+i]+

  	a1.z*devggtran.a[24+i]);

	}

	a1.y=b[0];a1.z=b[1];a1.w=b[2];

	pv2[id+1]=a1;

}