ptxas compilation error: heavy use of device functions causes ACCESS_VIOLATION

Hi,

I’ve stumbled across a very interesting problem in my code (pared-down version below). When I attempt to compile it, I get ptxas error code 0xC0000005 (ACCESS_VIOLATION). I am using CUDA 1.1 on 32-bit Windows XP with a GeForce 8400 GS.

Calling any or all of the functions by themselves works fine. Splitting the long statements up across lines and replacing the references to b with a also makes it compile, though neither change fixes it on its own (and I can still declare b, just not use it). Declaring at least one of the device functions noinline also fixes the problem, which is consistent with some similar code of mine that simply calls the functions less often and compiles fine. Finally, using any kind of memory other than constant for logt and alog also works.
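For reference, the noinline fix is just a qualifier on one of the declarations; a minimal sketch (the qualifier is spelled __noinline__ in the toolkit):

// Sketch of the noinline workaround mentioned above: marking even one
// of the functions this way keeps the compiler from expanding it at
// every call site.
__device__ __noinline__ unsigned int times0e(unsigned int n);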

This reminds me a lot of an issue I saw in a different thread, where a goto at the end of a long loop caused the same ptxas error. The two may not be related, but I suspect they both point to some limitation in how ptxas (or the compiler that emits the PTX) handles code branches.

Also, before someone makes the obvious suggestion: simply not using constant memory isn’t a viable option, as I am comparing the kernel’s performance under different memory arrangements (a rough shared-memory variant is sketched after the code below). noinline is a possibility, but I would prefer to have a firm idea of why I am getting this error before resorting to it.

Thanks,

Andrew

__constant__ unsigned char logt[256], alog[256];

__device__ unsigned int times0e(unsigned int n);
__device__ unsigned int times0d(unsigned int n);
__device__ unsigned int times0b(unsigned int n);
__device__ unsigned int times09(unsigned int n);

__global__ void run(unsigned int* data_in, unsigned int* data_out)
{
	__shared__ unsigned int a[256];
	__shared__ unsigned int b[256];
	const unsigned int bid = 256 * blockIdx.x;

	a[threadIdx.x] = data_in[bid + threadIdx.x];

#pragma unroll
	for(int i = 9; i > 1; i -= 2)
	{
		// Each pass multiplies the four bytes of the word by 0x0e, 0x0b,
		// 0x0d and 0x09 in GF(2^8), XORs them together in a rotated
		// pattern, and ping-pongs the result between the two shared arrays.
		b[threadIdx.x] = ((times0e( a[threadIdx.x] >> 24)               ^ times0b((a[threadIdx.x] >> 16) & 0x000000ff) ^ times0d((a[threadIdx.x] >>  8) & 0x000000ff) ^ times09( a[threadIdx.x]        & 0x000000ff)) << 24)               ^
		                (((times0e((a[threadIdx.x] >> 16) & 0x000000ff) ^ times0b((a[threadIdx.x] >>  8) & 0x000000ff) ^ times0d( a[threadIdx.x]        & 0x000000ff) ^ times09( a[threadIdx.x] >> 24))               << 16) & 0x00ff0000) ^
		                (((times0e((a[threadIdx.x] >>  8) & 0x000000ff) ^ times0b( a[threadIdx.x]        & 0x000000ff) ^ times0d( a[threadIdx.x] >> 24)               ^ times09((a[threadIdx.x] >> 16) & 0x000000ff)) <<  8) & 0x0000ff00) ^
		                 ((times0e( a[threadIdx.x]        & 0x000000ff) ^ times0b( a[threadIdx.x] >> 24)               ^ times0d((a[threadIdx.x] >> 16) & 0x000000ff) ^ times09((a[threadIdx.x] >>  8) & 0x000000ff))        & 0x000000ff);
		__syncthreads();
		a[threadIdx.x] = ((times0e( b[threadIdx.x] >> 24)               ^ times0b((b[threadIdx.x] >> 16) & 0x000000ff) ^ times0d((b[threadIdx.x] >>  8) & 0x000000ff) ^ times09( b[threadIdx.x]        & 0x000000ff)) << 24)               ^
		                (((times0e((b[threadIdx.x] >> 16) & 0x000000ff) ^ times0b((b[threadIdx.x] >>  8) & 0x000000ff) ^ times0d( b[threadIdx.x]        & 0x000000ff) ^ times09( b[threadIdx.x] >> 24))               << 16) & 0x00ff0000) ^
		                (((times0e((b[threadIdx.x] >>  8) & 0x000000ff) ^ times0b( b[threadIdx.x]        & 0x000000ff) ^ times0d( b[threadIdx.x] >> 24)               ^ times09((b[threadIdx.x] >> 16) & 0x000000ff)) <<  8) & 0x0000ff00) ^
		                 ((times0e( b[threadIdx.x]        & 0x000000ff) ^ times0b( b[threadIdx.x] >> 24)               ^ times0d((b[threadIdx.x] >> 16) & 0x000000ff) ^ times09((b[threadIdx.x] >>  8) & 0x000000ff))        & 0x000000ff);
		__syncthreads();
	}

	// One final pass, then write the block back out.
	b[threadIdx.x] = ((times0e( a[threadIdx.x] >> 24)               ^ times0b((a[threadIdx.x] >> 16) & 0x000000ff) ^ times0d((a[threadIdx.x] >>  8) & 0x000000ff) ^ times09( a[threadIdx.x]        & 0x000000ff)) << 24)               ^
	                (((times0e((a[threadIdx.x] >> 16) & 0x000000ff) ^ times0b((a[threadIdx.x] >>  8) & 0x000000ff) ^ times0d( a[threadIdx.x]        & 0x000000ff) ^ times09( a[threadIdx.x] >> 24))               << 16) & 0x00ff0000) ^
	                (((times0e((a[threadIdx.x] >>  8) & 0x000000ff) ^ times0b( a[threadIdx.x]        & 0x000000ff) ^ times0d( a[threadIdx.x] >> 24)               ^ times09((a[threadIdx.x] >> 16) & 0x000000ff)) <<  8) & 0x0000ff00) ^
	                 ((times0e( a[threadIdx.x]        & 0x000000ff) ^ times0b( a[threadIdx.x] >> 24)               ^ times0d((a[threadIdx.x] >> 16) & 0x000000ff) ^ times09((a[threadIdx.x] >>  8) & 0x000000ff))        & 0x000000ff);
	__syncthreads();

	data_out[bid + threadIdx.x] = b[threadIdx.x];
}

__device__ unsigned int times0e(unsigned int n)
{
	if(n == 0)
		return 0;
	else
	{
		// GF(2^8) multiply by 0x0e via the log/antilog tables;
		// (s + (s >> 8)) & 0xff reduces the sum of the two logs
		// mod 255, the period of the antilog table.
		unsigned int s = logt[n] + logt[0x0e];
		return alog[(s + (s >> 8)) & 0x000000ff];
	}
}

__device__ unsigned int times0d(unsigned int n)
{
	// Identical to times0e except for the constant, as are the two below.
	if(n == 0)
		return 0;
	else
	{
		unsigned int s = logt[n] + logt[0x0d];
		return alog[(s + (s >> 8)) & 0x000000ff];
	}
}

__device__ unsigned int times0b(unsigned int n)
{
	if(n == 0)
		return 0;
	else
	{
		unsigned int s = logt[n] + logt[0x0b];
		return alog[(s + (s >> 8)) & 0x000000ff];
	}
}

__device__ unsigned int times09(unsigned int n)
{
	if(n == 0)
		return 0;
	else
	{
		unsigned int s = logt[n] + logt[0x09];
		return alog[(s + (s >> 8)) & 0x000000ff];
	}
}
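For completeness, a rough sketch of the shared-memory arrangement mentioned above (one of the "any kind of memory other than constant" variants that compiles). The names logt_g, alog_g, logt_s and alog_s are made up for illustration, not from the code above:

__device__ unsigned char logt_g[256], alog_g[256]; // global-memory copies, filled from the host

__global__ void run_shared(unsigned int* data_in, unsigned int* data_out)
{
	__shared__ unsigned char logt_s[256];
	__shared__ unsigned char alog_s[256];
	const unsigned int bid = 256 * blockIdx.x;

	// One table entry per thread; the kernel already runs 256 threads per block.
	logt_s[threadIdx.x] = logt_g[threadIdx.x];
	alog_s[threadIdx.x] = alog_g[threadIdx.x];
	__syncthreads();

	// ...same round computation as run(), with the times* functions
	// rewritten to index logt_s/alog_s instead of the constant tables...
	data_out[bid + threadIdx.x] = data_in[bid + threadIdx.x]; // placeholder
}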

I’ve been tinkering with my code since then, and I’ve found a solution: if I combine all of the device functions into a single device function, it both compiles and runs properly (noinline, I discovered, was using up too many registers). On top of that, it’s more computationally efficient; it may well be more register-efficient too, but I’ve yet to check.
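For anyone curious, the combined function is roughly along these lines (a simplified sketch, not my exact code; the multiplier becomes a parameter, and gf_times is just a name picked for illustration):

// One GF(2^8) multiply routine parameterised by the constant factor,
// replacing the four near-identical device functions.
__device__ unsigned int gf_times(unsigned int n, unsigned int c)
{
	if(n == 0)
		return 0;
	unsigned int s = logt[n] + logt[c];
	return alog[(s + (s >> 8)) & 0x000000ff];
}

Each call site in the kernel then becomes gf_times(x, 0x0e), gf_times(x, 0x0b), and so on.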

I’d still be interested to hear if anyone knows more about this problem.

The obvious suggestion would be to try v2.0. Either way, CUDA is still plenty bug-riddled.

Also, CUDA always inlines all device functions. What you did by hand is what the compiler was supposed to have done.