ptxas screws up a 'vertex shader' that just does ftransform

I’m using WinXP, CUDA 1.0, 8800GTX.

It transforms vertices like this:

struct vertex{
	float position[3];
	float texcoord0[3];
	float normal[3];
};

using two matrices (one for position and one for normal) stored in devggtran.

The transformation is done d3d-style.
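That is, row vectors times row-major matrices. Sketching the convention for the position part (this is exactly what the unrolled blocks below compute; a[0..15] holds the position matrix, a[16..31] the normal matrix):

	out.x = in.x*a[0] + in.y*a[4] + in.z*a[8]  + a[12]
	out.y = in.x*a[1] + in.y*a[5] + in.z*a[9]  + a[13]
	out.z = in.x*a[2] + in.y*a[6] + in.z*a[10] + a[14]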

Emulation mode runs fine, but on the device the second component of the position comes out wrong.

Also, if devggtran is passed as a parameter, nvcc starts using local memory.

The code is produced by my loop unroller, so it's a bit messy, but it should still be readable.

The generated PTX code looks correct at first glance.

#define JOIN(a,b) a##b
#define JOIN1(a,b) JOIN2(a,b)
#define JOIN2(a,b) JOIN(a,b)
#define xyz(d) JOIN(__xyz,d)
#define __xyz0 x
#define __xyz1 y
#define __xyz2 z
#define __xyz3 w
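To make the macro trickery readable: with, e.g., #define i 1 in effect, the expansion chain is

	b.xyz(i)  ->  b.JOIN(__xyz,1)  ->  b.__xyz1  ->  b.y

so each unrolled block assigns to b.x, b.y, or b.z in turn. (JOIN1/JOIN2 are the usual deferred-expansion helpers; they turn out not to be needed here, since the argument of xyz is expanded before JOIN pastes it.)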

const int thmax=256;

struct c2mats{
	float a[32];
};

// packs the two 4x4 matrices (16 floats = 64 bytes each) into one struct
void make(c2mats &m,float *m0,float *m1)
{
	memcpy(m.a,m0,64);
	memcpy(m.a+16,m1,64);
}

__constant__ c2mats devggtran;
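For context, the host side fills devggtran roughly like this (a sketch; posMat and nrmMat are hypothetical 4x4 row-major host matrices, 16 floats each):

	c2mats m2;
	make(m2,posMat,nrmMat);	// posMat/nrmMat: hypothetical host matrices
	cudaMemcpyToSymbol(devggtran,&m2,sizeof(m2));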

__global__ void devggmeshv(float4 *pv2,float4 *pv,int vn)
{
	int thid=threadIdx.x;
	int bid=blockIdx.x;
	int id=bid*thmax+thid;
	if(id>=vn)return;
	id*=2;
	float4 a0=pv[id],a1=pv[id+1],b;
#define i 0
	{
		b.xyz(i)=
			a0.x*devggtran.a[i]+
			a0.y*devggtran.a[4+i]+
			a0.z*devggtran.a[8+i]+
			devggtran.a[12+i];
	}
#undef i
#define i 1
	{
		b.xyz(i)=
			a0.x*devggtran.a[i]+
			a0.y*devggtran.a[4+i]+
			a0.z*devggtran.a[8+i]+
			devggtran.a[12+i];
	}
#undef i
#define i 2
	{
		b.xyz(i)=
			a0.x*devggtran.a[i]+
			a0.y*devggtran.a[4+i]+
			a0.z*devggtran.a[8+i]+
			devggtran.a[12+i];
	}
#undef i
	b.w=a0.w;
	pv2[id]=b;
	float ilg=1.f;//sqrt(max(a1.y*a1.y+a1.z*a1.z+a1.w*a1.w,1e-8));
#define i 0
	{
		b.xyz(i)=ilg*(
			a1.x*devggtran.a[16+i]+
			a1.y*devggtran.a[20+i]+
			a1.z*devggtran.a[24+i]);
	}
#undef i
#define i 1
	{
		b.xyz(i)=ilg*(
			a1.x*devggtran.a[16+i]+
			a1.y*devggtran.a[20+i]+
			a1.z*devggtran.a[24+i]);
	}
#undef i
#define i 2
	{
		b.xyz(i)=ilg*(
			a1.x*devggtran.a[16+i]+
			a1.y*devggtran.a[20+i]+
			a1.z*devggtran.a[24+i]);
	}
#undef i
	a1.y=b.x;a1.z=b.y;a1.w=b.z;
	pv2[id+1]=a1;
}

That's all. Could someone from NVIDIA kindly take a look?
Though I can work around this using volatile shared memory, I still think such a simple kernel deserves to work.

It’s hard to reproduce this without a complete project. Are you a registered developer? If so, can you file a bug?

I'm not a registered developer, though I did try to register.
I'll put together a reproducing project ASAP.

Here it is:

#include <stdio.h>

#define JOIN(a,b) a##b
#define JOIN1(a,b) JOIN2(a,b)
#define JOIN2(a,b) JOIN(a,b)
#define xyz(d) JOIN(__xyz,d)
#define __xyz0 x
#define __xyz1 y
#define __xyz2 z
#define __xyz3 w

const int thmax=256;

struct c2mats{
	float a[32];
};

__constant__ c2mats devggtran;

__global__ void devggmeshv(float4 *pv2,float4 *pv,int vn)
{
	int thid=threadIdx.x;
	int bid=blockIdx.x;
	int id=bid*thmax+thid;
	if(id>=vn)return;
	id*=2;
	float4 a0=pv[id],a1=pv[id+1],b;
#define i 0
	{
		b.xyz(i)=
			a0.x*devggtran.a[i]+
			a0.y*devggtran.a[4+i]+
			a0.z*devggtran.a[8+i]+
			devggtran.a[12+i];
	}
#undef i
#define i 1
	{
		b.xyz(i)=
			a0.x*devggtran.a[i]+
			a0.y*devggtran.a[4+i]+
			a0.z*devggtran.a[8+i]+
			devggtran.a[12+i];
	}
#undef i
#define i 2
	{
		b.xyz(i)=
			a0.x*devggtran.a[i]+
			a0.y*devggtran.a[4+i]+
			a0.z*devggtran.a[8+i]+
			devggtran.a[12+i];
	}
#undef i
	b.w=a0.w;
	pv2[id]=b;
	float ilg=1.f;//sqrt(max(a1.y*a1.y+a1.z*a1.z+a1.w*a1.w,1e-8));
#define i 0
	{
		b.xyz(i)=ilg*(
			a1.x*devggtran.a[16+i]+
			a1.y*devggtran.a[20+i]+
			a1.z*devggtran.a[24+i]);
	}
#undef i
#define i 1
	{
		b.xyz(i)=ilg*(
			a1.x*devggtran.a[16+i]+
			a1.y*devggtran.a[20+i]+
			a1.z*devggtran.a[24+i]);
	}
#undef i
#define i 2
	{
		b.xyz(i)=ilg*(
			a1.x*devggtran.a[16+i]+
			a1.y*devggtran.a[20+i]+
			a1.z*devggtran.a[24+i]);
	}
#undef i
	a1.y=b.x;a1.z=b.y;a1.w=b.z;
	pv2[id+1]=a1;
}

float m[32]={
	9,	1,	2,	0,
	1,	9,	3,	0,
	2,	3,	9,	0,
	0.1,0.2,0.3,1,

	9,	1,	2,	0,
	1,	9,	3,	0,
	2,	3,	9,	0,
	0.1,0.2,0.3,1,
};

float vb[]={
	1,2,4,0,
	0,0,0,1
};

int main()
{
	float4 *devvb,*devvb2;
	cudaMemcpyToSymbol(devggtran,m,128);
	cudaMalloc((void**)&devvb,sizeof(vb));
	cudaMalloc((void**)&devvb2,sizeof(vb));
	cudaMemcpy(devvb,vb,sizeof(vb),cudaMemcpyHostToDevice);
	int vn=sizeof(vb)/32;	// one vertex = two float4s = 32 bytes
	devggmeshv<<<(vn+thmax-1)/thmax,thmax,0>>>(devvb2,devvb,vn);
	cudaMemcpy(vb,devvb2,sizeof(vb),cudaMemcpyDeviceToHost);
	for(int i=0;i<sizeof(vb)/4;i++)
		printf("%f\n",vb[i]);
	return 0;
}
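For reference, the emulation numbers are easy to verify by hand. A minimal host-side check using the same row-vector convention (it would have to run before the final cudaMemcpy overwrites vb):

	// expected: 1*9+2*1+4*2+0.1 = 19.1
	//           1*1+2*9+4*3+0.2 = 31.2
	//           1*2+2*3+4*9+0.3 = 44.3
	for(int c=0;c<3;c++)
		printf("%f\n",vb[0]*m[c]+vb[1]*m[4+c]+vb[2]*m[8+c]+m[12+c]);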

“Screenshot”:

D:\f-qmhou\tcuda>nvcc -deviceemu vs.cu
tmpxft_000009dc_00000000-4.i

D:\f-qmhou\tcuda>a
19.100000
31.200001
44.299999
0.000000
0.000000
0.000000
0.000000
0.000000

D:\f-qmhou\tcuda>nvcc vs.cu
tmpxft_00000f44_00000000-8.i

D:\f-qmhou\tcuda>a
19.100000
49.299999
80.500000
0.000000
0.000000
0.000000
0.000000
0.000000

D:\f-qmhou\tcuda>

Only one thread passes the bounds check here (vn is 1), so emulation and device output should be identical. They aren't.

Thanks for reporting this; we will investigate.

Edit: It is a bug!!!

Forgot to post my work-around: remove float4 b and use

extern __shared__ float shf[];
float volatile *b=shf+thid*3;
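Fleshed out against the repro kernel, it might look like this (a sketch, not my exact code: devggmeshv2 is a hypothetical name, the ilg factor is dropped since it's 1.f here, and the launch now passes thmax*3*sizeof(float) bytes of dynamic shared memory where the original passed 0):

	extern __shared__ float shf[];

	__global__ void devggmeshv2(float4 *pv2,float4 *pv,int vn)
	{
		int thid=threadIdx.x;
		int id=blockIdx.x*thmax+thid;
		if(id>=vn)return;
		id*=2;
		float4 a0=pv[id],a1=pv[id+1];
		float volatile *b=shf+thid*3;	// per-thread scratch replaces the float4 b
		for(int c=0;c<3;c++)		// position: row vector * row-major 4x4
			b[c]=a0.x*devggtran.a[c]+a0.y*devggtran.a[4+c]
				+a0.z*devggtran.a[8+c]+devggtran.a[12+c];
		pv2[id]=make_float4(b[0],b[1],b[2],a0.w);
		for(int c=0;c<3;c++)		// normal: 3x3 part of the second matrix
			b[c]=a1.x*devggtran.a[16+c]+a1.y*devggtran.a[20+c]
				+a1.z*devggtran.a[24+c];
		pv2[id+1]=make_float4(a1.x,b[0],b[1],b[2]);
	}

	// launch, with dynamic shared memory this time:
	// devggmeshv2<<<(vn+thmax-1)/thmax,thmax,thmax*3*sizeof(float)>>>(devvb2,devvb,vn);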

I guess it's over-optimization in ptxas, right?

Will a patch be released anytime soon?

It should be fixed in the next release.
It looks like the compiler reuses a register's old value after that register has already been assigned a new one.
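In other words, the failure mode is roughly this (purely illustrative C-level pseudo-code, not the actual generated code):

	// intended order:
	float r;
	r = /* value for b.y */;
	b.y = r;
	r = /* value for b.z */;
	b.z = r;
	// miscompiled: the second write to r effectively lands before
	// the read "b.y = r", so b.y picks up the wrong value.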

Good to know it'll be fixed :)
By the way, when is the next release due?
I'm likely going to use similar programming patterns a lot, and for complicated kernels I'd run out of shared memory with the work-around :(
Thanks