I’m using WinXP, CUDA 1.0, 8800GTX.
It transforms vertices like this:
struct vertex{
float position[3];
float texcoord0[3];
float normal[3];
};
using two matrices (one for positions and one for normals) packed into the __constant__ variable devggtran.
The transformation is done d3d-style.
Emulation mode runs fine, but when run on device, the second component of position is screwed up.
Also, if devggtran is passed as a parameter, nvcc starts using local memory.
The code is produced by my loop unroller, thus a bit messy. It should be readable, though.
Generated ptx code looks correct at first glance.
// Token-pasting helpers used to index float4 components by number:
// xyz(0) -> x, xyz(1) -> y, xyz(2) -> z, xyz(3) -> w, so b.xyz(i)
// selects a component when i is #defined to an integer literal.
#define JOIN(a,b) a##b
// NOTE(review): JOIN1/JOIN2 form the usual expand-then-paste idiom but are
// unused in the visible code; xyz() pastes via JOIN directly, which still
// works here because the argument is already macro-expanded when it is
// substituted into xyz()'s replacement list (d is not adjacent to ##).
#define JOIN1(a,b) JOIN2(a,b)
#define JOIN2(a,b) JOIN(a,b)
#define xyz(d) JOIN(__xyz,d)
#define __xyz0 x
#define __xyz1 y
#define __xyz2 z
#define __xyz3 w
// Threads per block assumed by devggmeshv's indexing (id = bid*thmax + thid).
const int thmax=256;

// Two 4x4 float matrices packed back to back:
//   a[0..15]  : position transform
//   a[16..31] : normal transform
struct c2mats{
float a[32];
};

// Packs the position matrix (m0) and the normal matrix (m1) into one c2mats.
// Both inputs must point to at least 16 floats (a full 4x4 matrix).
// The result is intended to be uploaded to the __constant__ symbol devggtran.
void make(c2mats &m,float *m0,float *m1)
{
// 16 floats per matrix; sizeof-based size instead of the magic byte count 64,
// so the copy stays correct even if the element type ever changes.
memcpy(m.a,m0,16*sizeof(*m.a));
memcpy(m.a+16,m1,16*sizeof(*m.a));
}
// Constant-memory copy of the two transform matrices (fill from the host with
// cudaMemcpyToSymbol using a c2mats built by make()).
//   a[0..15]  : position matrix (rows at 0, 4, 8, 12)
//   a[16..31] : normal matrix (rows at 16, 20, 24; translation row unused)
// Applied d3d-style: row vector * matrix.
__constant__ c2mats devggtran;

// Transforms vn vertices from pv into pv2. Each vertex occupies two float4s:
//   pv[2*i]   : (pos.x, pos.y, pos.z, w) -- w is passed through untouched
//   pv[2*i+1] : second half; components .y/.z/.w receive the transformed normal
// Launch configuration expected: blockDim.x == thmax, gridDim.x = ceil(vn/thmax).
// BUG FIX(review): the original had this __global__ signature line duplicated,
// which is a syntax error; the duplicate is removed here. The macro-unrolled
// component writes are expanded into plain code with identical indices.
__global__ void devggmeshv(float4 *pv2,float4 *pv,int vn)
{
int thid=threadIdx.x;
int bid=blockIdx.x;
int id=bid*thmax+thid;   // global vertex index; assumes blockDim.x == thmax
if(id>=vn)return;        // tail guard for the last, partially filled block
id*=2;                   // two float4s per vertex
float4 a0=pv[id],a1=pv[id+1],b;

// Position: (a0.x, a0.y, a0.z, 1) * 4x4 matrix, row-vector convention.
b.x=a0.x*devggtran.a[0]+a0.y*devggtran.a[4]+a0.z*devggtran.a[ 8]+devggtran.a[12];
b.y=a0.x*devggtran.a[1]+a0.y*devggtran.a[5]+a0.z*devggtran.a[ 9]+devggtran.a[13];
b.z=a0.x*devggtran.a[2]+a0.y*devggtran.a[6]+a0.z*devggtran.a[10]+devggtran.a[14];
b.w=a0.w;
pv2[id]=b;

// Inverse-length factor for renormalization; currently disabled (forced to 1).
float ilg=1.f;//sqrt(max(a1.y*a1.y+a1.z*a1.z+a1.w*a1.w,1e-8));
// Normal: 3x3 part of the second matrix, no translation term.
// NOTE(review): the disabled length computation above reads (a1.y, a1.z, a1.w),
// but the transform below reads (a1.x, a1.y, a1.z) and the result is written
// back to (a1.y, a1.z, a1.w). One of the two reads is likely off by one
// component -- confirm against the host-side vertex layout. Behavior is kept
// identical to the original here.
b.x=ilg*(a1.x*devggtran.a[16]+a1.y*devggtran.a[20]+a1.z*devggtran.a[24]);
b.y=ilg*(a1.x*devggtran.a[17]+a1.y*devggtran.a[21]+a1.z*devggtran.a[25]);
b.z=ilg*(a1.x*devggtran.a[18]+a1.y*devggtran.a[22]+a1.z*devggtran.a[26]);
a1.y=b.x;a1.z=b.y;a1.w=b.z;
pv2[id+1]=a1;
}