We’ve been complaining quite a lot on the lousy compilers. Since NV isn’t responding, maybe we should share a few ad-hoc solutions.
I wrote this thing to write unrolled loops in reasonably simple syntax. It also contains code to enable easier use of device API, but I can’t get that part to work outside of my own project now External Image .
It’s written in lex and should be mostly portable. The system lines are windows specific, though.
%{
/*
CUDA macro processor.......
*/
#include <stdio.h>
#include <ctype.h>
char big[65536];
char texcrap[65536],*crap0=texcrap;
char texcrap1[65536],*crap1=texcrap1;
char fname[260]="$.cu";
char *fnamein="<stdin>";
int did;
int first=1;
int lid=1;
int enablecu=0;
int cns[9999];
%}
%%
^"!+enablecu"\n {
enablecu=1;
lid++;
}
^"!+set$".*\n {
int id,v;
sscanf(yytext+6,"%d",&id);
sscanf(strchr(yytext,'=')+1,"%d",&v);
cns[id]=v;
lid++;
}
^"!!".*\n {
strcpy(fname,yytext+2);
fname[strlen(fname)-1]=0;
lid++;
}
[ \t\n]"rep"[ \t\n]+[0-9a-zA-Z]+"<"[0-9$]+ {
char buf[128];
int i,n,c,sn,l0;
char *s;
l0=lid;
for(s=yytext;*s!='r';s++);
for(s+=3;isspace(*s);s++);
strcpy(buf,s);
s=strchr(buf,'<');
*s=0;
if(sscanf(s+1,"%d",&n)<=0)
{
sscanf(s+2,"%d",&n);
n=cns[n];
}
for(c=0,sn=0;;sn++)
{
int ch=input();
big[sn]=ch;
if(ch=='{')c++;else
if(ch=='}'){if(!--c)break;}else
if(ch==';'){if(c==0)break;}
if(first&&ch=='\n')
{
lid++;
sprintf(big+sn+1,"#line %d \"%s\"\n",lid,fnamein);
sn+=strlen(big+sn)-1;
}
}
big[sn+1]=0;
fprintf(yyout,"\n");
for(i=0;i<n;i++)
fprintf(yyout,"#define %s %d\n%s\n#undef %s\n",buf,i,big,buf);
did=1;
if(first)fprintf(yyout,"\n#line %d \"%s\"\n",lid,fnamein);
}
"@_@(" {
/*
f gx gy ...
this shouldn't be put in rep -_-b
*/
char buf[1024];
int bran=0;
char *s=buf;
int fst=1;
for(;;)
{
int c=input();
*s++=c;if(c=='\n')lid++;
if(c=='(')bran++;
if(c==')')
{
bran--;
if(bran<0)goto separa;
}
if(c==','&&bran==0)
{
separa:
s[-1]=0;
fprintf(yyout,"\n#line %d \"%s\"\n%s(%s);\n",lid,fnamein,fst?"{_startcuparam":"_addcuparam",buf);
fst=0;
s=buf;
if(bran<0)break;
}
}
fprintf(yyout,"\n#line %d \"%s\"\n_endcuparam();}\n",lid,fnamein);
did=1;
}
^[ \t]*"texture"[ \t]*"<".+$ {
if(first&&enablecu)
{
char buf[1024];
char *s,*s1,*s2;
strcpy(buf,yytext);
s=strchr(buf,'<')+1;
s1=strchr(s,',');*s1=0;
s2=strchr(s1+1,'>')+1;while(isspace(*s2))s2++;
s2[strlen(s2)-1]=0;
fprintf(yyout,"CUtexref %s_drv;int %s_bound=0;",s2,s2);
sprintf(crap0,"cuModuleGetTexRef(&%s_drv,mymod,\"%s\");\
cuTexRefSetFormat(%s_drv,CUM_FORMAT_%s,CUM_COMPO_%s);\
cuTexRefSetFlags(%s_drv,CU_TRSF_READ_AS_INTEGER);",s2,s2,s2,s,s,s2);
sprintf(crap1,"if(%s_bound)cuParamSetTexRef(cupf,CU_PARAM_TR_DEFAULT,%s_drv);",s2,s2);
crap0+=strlen(crap0);
crap1+=strlen(crap1);
did=1;
}
ECHO;
}
^[ \t]*"__constant__"[ \t]*.+$ {
if(first&&enablecu)
{
char buf[1024];
char *s,*s1;
strcpy(buf,yytext);
for(s=buf+strlen(buf)-1;!isspace(*s);s--);
s++;
for(s1=s;*s1!='['&&*s1!=';'&&*s1;s1++);
*s1=0;
fprintf(yyout,"CUdeviceptr %s_drv;",s);
sprintf(crap0,"cuModuleGetGlobal(&%s_drv,&useless,mymod,\"%s\");",s,s);
crap0+=strlen(crap0);
did=1;
}
ECHO;
}
. ECHO;
"\"\n {lid++;ECHO;}
\n {
if(first){lid++;fprintf(yyout,"\n#line %d \"%s\"",lid,fnamein);}
ECHO;
}
%%
yywrap(){}
int main(int c,char**v)
{
*crap0=0;*crap1=0;
if(c>1){yyin=fopen(v[1],"r");fnamein=v[1];}
yyout=fopen("$****$.tmp","w");
for(;;){
did=0;
yylex();
if(first&&enablecu)
fprintf(yyout,"void _texinit(){%s}void _texset(){%s}\n",texcrap,texcrap1);
fclose(yyout);
fclose(yyin);
if(!did)break;
system("del $****$.tm1>nul");
system("move $****$.tmp $****$.tm1");
yyin=fopen("$****$.tm1","r");
yyout=fopen("$****$.tmp","w");
first=0;
}
system("del $****$.tm1>nul");
sprintf(big,"move $****$.tmp %s",fname);
system(big);
return 0;
}
Example code:
//set output file name and constant
!!a.cu
!+set$0=3
__global__ void devggmeshv(float4 *pv2,float4 *pv,int vn)
{
int thid=threadIdx.x;
int bid=blockIdx.x;
int id=bid*thmax+thid;
if(id>=vn)return;
id*=2;
float4 a0=pv[id],a1=pv[id+1];
extern __shared__ float shf[];
float volatile *b=shf+thid*3;
//for(int i=0;i<3;i++)
rep i<3{
b[i]=
a0.x*devggtran.a[i]+
a0.y*devggtran.a[4+i]+
a0.z*devggtran.a[8+i]+
devggtran.a[12+i];
}
pv2[id]=make_float4(b[0],b[1],b[2],a0.w);
float ilg=1.f;//sqrt(max(a1.y*a1.y+a1.z*a1.z+a1.w*a1.w,1e-8));
//can use constants for loop upper bound
rep i<$0{
b[i]=ilg*(
a1.x*devggtran.a[16+i]+
a1.y*devggtran.a[20+i]+
a1.z*devggtran.a[24+i]);
}
a1.y=b[0];a1.z=b[1];a1.w=b[2];
pv2[id+1]=a1;
}