Hi,
I’ve stumbled across a very interesting problem in my code (pared down version below). When I attempt to compile this, I get ptxas error code 0xC0000005 (ACCESS_VIOLATION). I am using CUDA 1.1 on Windows XP 32-bit with a GF8400 GS.
Attempting to call any or all of the functions by themselves works fine. Splitting up the long commands by line and replacing references to b with a also causes it to work, though neither individual “fixes” it (and I can still declare b, just not use it). Declaring at least one of the device functions as noinline also fixes the problem (which is consistent with some similar code of mine which simply doesn’t call the functions as much and compiles fine). Finally, using any kind of memory other than constant for logt and alog also works.
This reminds me alot of an issue I saw in a different thread, where using a goto at the end of a long loop caused the same ptxas error. They might not be related, but I suspect that these both might point to some limitation in how ptxas (or the compiler to ptx code) handles code branches.
Also, before someone makes the obvious suggestion, simply not using constant memory isn’t a viable option, as I am comparing the performance of the kernel for different memory use arrangements. noinline is a possibility, but I would prefer to at least have a firm idea of why I am getting this error before resorting to that.
Thanks,
Andrew
__constant__ unsigned char logt[256], alog[256];
__device__ unsigned int times0e(unsigned int n);
__device__ unsigned int times0d(unsigned int n);
__device__ unsigned int times0b(unsigned int n);
__device__ unsigned int times09(unsigned int n);
__global__ void run(unsigned int* data_in,
unsigned int* data_out)
{
__shared__ unsigned int a[ 256 ];
__shared__ unsigned int b[ 256 ];
const unsigned int bid = 256 * blockIdx.x;
a[threadIdx.x] = data_in[bid + threadIdx.x];
#pragma unroll
for(int i = 9; i > 1; i -= 2)
{
b[threadIdx.x] = ((times0e( a[threadIdx.x] >> 24) ^ times0b((a[threadIdx.x] >> 16) & 0x000000ff) ^ times0d((a[threadIdx.x] >> 8 ) & 0x000000ff) ^ times09( a[threadIdx.x] & 0x000000ff)) << 24) ^
(((times0e((a[threadIdx.x] >> 16) & 0x000000ff) ^ times0b((a[threadIdx.x] >> 8 ) & 0x000000ff) ^ times0d( a[threadIdx.x] & 0x000000ff) ^ times09( a[threadIdx.x] >> 24) ) << 16) & 0x00ff0000) ^
(((times0e((a[threadIdx.x] >> 8 ) & 0x000000ff) ^ times0b( a[threadIdx.x] & 0x000000ff) ^ times0d( a[threadIdx.x] >> 24) ^ times09((a[threadIdx.x] >> 16) & 0x000000ff)) << 8 ) & 0x0000ff00) ^
((times0e( a[threadIdx.x] & 0x000000ff) ^ times0b( a[threadIdx.x] >> 24) ^ times0d((a[threadIdx.x] >> 16) & 0x000000ff) ^ times09((a[threadIdx.x] >> 8 ) & 0x000000ff)) & 0x000000ff);
__syncthreads();
a[threadIdx.x] = ((times0e( b[threadIdx.x] >> 24) ^ times0b((b[threadIdx.x] >> 16) & 0x000000ff) ^ times0d((b[threadIdx.x] >> 8 ) & 0x000000ff) ^ times09( b[threadIdx.x] & 0x000000ff)) << 24) ^
(((times0e((b[threadIdx.x] >> 16) & 0x000000ff) ^ times0b((b[threadIdx.x] >> 8 ) & 0x000000ff) ^ times0d( b[threadIdx.x] & 0x000000ff) ^ times09( b[threadIdx.x] >> 24) ) << 16) & 0x00ff0000) ^
(((times0e((b[threadIdx.x] >> 8 ) & 0x000000ff) ^ times0b( b[threadIdx.x] & 0x000000ff) ^ times0d( b[threadIdx.x] >> 24) ^ times09((b[threadIdx.x] >> 16) & 0x000000ff)) << 8 ) & 0x0000ff00) ^
((times0e( b[threadIdx.x] & 0x000000ff) ^ times0b( b[threadIdx.x] >> 24) ^ times0d((b[threadIdx.x] >> 16) & 0x000000ff) ^ times09((b[threadIdx.x] >> 8 ) & 0x000000ff)) & 0x000000ff);
__syncthreads();
}
b[threadIdx.x] = ((times0e( a[threadIdx.x] >> 24) ^ times0b((a[threadIdx.x] >> 16) & 0x000000ff) ^ times0d((a[threadIdx.x] >> 8 ) & 0x000000ff) ^ times09( a[threadIdx.x] & 0x000000ff)) << 24) ^
(((times0e((a[threadIdx.x] >> 16) & 0x000000ff) ^ times0b((a[threadIdx.x] >> 8 ) & 0x000000ff) ^ times0d( a[threadIdx.x] & 0x000000ff) ^ times09( a[threadIdx.x] >> 24) ) << 16) & 0x00ff0000) ^
(((times0e((a[threadIdx.x] >> 8 ) & 0x000000ff) ^ times0b( a[threadIdx.x] & 0x000000ff) ^ times0d( a[threadIdx.x] >> 24) ^ times09((a[threadIdx.x] >> 16) & 0x000000ff)) << 8 ) & 0x0000ff00) ^
((times0e( a[threadIdx.x] & 0x000000ff) ^ times0b( a[threadIdx.x] >> 24) ^ times0d((a[threadIdx.x] >> 16) & 0x000000ff) ^ times09((a[threadIdx.x] >> 8 ) & 0x000000ff)) & 0x000000ff);
__syncthreads();
data_out[bid + threadIdx.x] = b[threadIdx.x];
}
__device__ unsigned int times0e(unsigned int n)
{
if(n == 0)
return 0;
else
{
unsigned int s = logt[n] + logt[0x0e];
return alog[(s + (s >> 8)) & 0x000000ff];
}
}
__device__ unsigned int times0d(unsigned int n)
{
if(n == 0)
return 0;
else
{
unsigned int s = logt[n] + logt[0x0d];
return alog[(s + (s >> 8)) & 0x000000ff];
}
}
__device__ unsigned int times0b(unsigned int n)
{
if(n == 0)
return 0;
else
{
unsigned int s = logt[n] + logt[0x0b];
return alog[(s + (s >> 8)) & 0x000000ff];
}
}
__device__ unsigned int times09(unsigned int n)
{
if(n == 0)
return 0;
else
{
unsigned int s = logt[n] + logt[0x09];
return alog[(s + (s >> 8)) & 0x000000ff];
}
}