So I’m trying to write a device function that uses an explicit stack. This requires me to occasionally read/write pointers to global memory as in:
#include <iostream>
#include <cstdio>
class Stack
{
public:
unsigned int* in;
unsigned int size;
};
__device__ void myMemcpy( unsigned int* out, const unsigned int* in,
unsigned int size )
{
for( unsigned int i = 0; i < size; ++i )
{
out[i] = in[i];
}
}
__global__ void foo( Stack* stack, unsigned int* input, unsigned int size )
{
extern __shared__ unsigned int shared[];
unsigned int stackPointer = 1;
myMemcpy( shared, input, size );
stack[0].in = shared;
stack[0].size = size;
Stack entry;
unsigned int result = 0xFFFFFFFF;
do
{
--stackPointer;
entry = stack[ stackPointer ];
unsigned int value = entry.in[0];
if( value < 132 && entry.size > 1 )
{
stack[ stackPointer ].in = entry.in + entry.size / 2;
stack[ stackPointer ].size = entry.size / 2;
++stackPointer;
}
if( value > 132 && entry.size > 1 )
{
stack[ stackPointer ].in = entry.in - entry.size / 2;
stack[ stackPointer ].size = entry.size / 2;
++stackPointer;
}
if( value == 132 )
{
result = ( entry.in - shared );
}
}
while( stackPointer > 0 );
input[0] = result;
}
unsigned int fooHost( )
{
unsigned int input[1024];
for( unsigned int i = 0; i < 1024; ++i )
{
input[i] = (unsigned int) i;
}
unsigned int* deviceIn;
Stack* deviceStack;
cudaMalloc( (void**) &deviceIn, sizeof( unsigned int ) * 1024 );
cudaMalloc( (void**) &deviceStack, sizeof( Stack ) * 10 );
cudaMemcpy( deviceIn, input,
sizeof( unsigned int ) * 1024, cudaMemcpyHostToDevice );
foo<<< 1, 1, 1024*sizeof( unsigned int ) >>>( deviceStack, deviceIn, 1024 );
cudaMemcpy( input, deviceIn,
sizeof( unsigned int ), cudaMemcpyDeviceToHost );
cudaFree( deviceIn );
cudaFree( deviceStack );
return input[0];
}
int main()
{
unsigned int position = fooHost();
if( position == 0xFFFFFFFF )
{
std::cout << "Not found.\n";
}
else
{
std::cout << "Found item at location " << position << ".\n";
}
}
Note that this code doesn’t do anything really, it’s just meant to illustrate my point.
This code works fine when compiling in emulation mode. However, it generates the following warning when compiling for the device:
/tmp/tmpxft_00002c8e_00000000-7_testStack.cpp3.i(49): Advisory: Cannot tell what pointer points to, assuming global memory space
So it seems that reading pointers from memory invalidates some points -to-analysis that the compiler is doing and it can no longer tell which memory space a pointer refers to – that’s fine points-to-anaylsis is really hard. Looking at the PTX code, the statement:
unsigned int value = entry.in[0]
is compiled into
ld.global.u32 %r5, [%rd3+8]; // id:64
A load from global memory instead of a load from shared memory.
Can anyone think of a workaround for this that still allows me to use shared memory? The actual application (quicksort) requires a stack and sees a significant speedup using shared memory. A great long term solution that has probably been suggested before is to allow for a user supplied “memory space tag” to all pointers, so it would possible to manually tell the compiler which memory space you were referring to. But that doesn’t help me now.
My only recourse at this point is to manually edit the PTX, which works but is far from a clean and portable solution.