temp vs TEMP


The following kernel implements the prescan Algorthim as given in SDK. I am wondering how can the kernel work when TEMP is not defind anywhere. Or is it that i am missing any important concept here?

global void scan_best(float *g_odata, float *g_idata, int n)
// Dynamically allocated shared memory for scan kernels
extern shared float temp[];

int thid = threadIdx.x;

int ai = thid;
int bi = thid + (n/2);

// compute spacing to avoid bank conflicts
int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
int bankOffsetB = CONFLICT_FREE_OFFSET(bi);

// Cache the computational window in shared memory
TEMP(ai + bankOffsetA) = g_idata[ai];
TEMP(bi + bankOffsetB) = g_idata[bi];

int offset = 1;

// build the sum in place up the tree
for (int d = n/2; d > 0; d >>= 1)

    if (thid < d)
        int ai = offset*(2*thid+1)-1;
        int bi = offset*(2*thid+2)-1;

        ai += CONFLICT_FREE_OFFSET(ai);
        bi += CONFLICT_FREE_OFFSET(bi);

        TEMP(bi) += TEMP(ai);

    offset *= 2;

// scan back down the tree

// clear the last element
if (thid == 0)
    int index = n - 1;
    index += CONFLICT_FREE_OFFSET(index);
    TEMP(index) = 0;

// traverse down the tree building the scan in place
for (int d = 1; d < n; d *= 2)
    offset /= 2;


    if (thid < d)
        int ai = offset*(2*thid+1)-1;
        int bi = offset*(2*thid+2)-1;

        ai += CONFLICT_FREE_OFFSET(ai);
        bi += CONFLICT_FREE_OFFSET(bi);

        float t  = TEMP(ai);
        TEMP(ai) = TEMP(bi);
        TEMP(bi) += t;


// write results to global memory
g_odata[ai] = TEMP(ai + bankOffsetA);
g_odata[bi] = TEMP(bi + bankOffsetB);
#endif // #ifndef SCAN_BEST_KERNEL_H

Read the top 20 lines of the scan.cu file in the SDK.

Thank you so much for your reply.

I have pasted below some of the initial code from scan.cu file from sdk. But still I am not able to understand why TEMP has been used undefined in the kernel scan_best_kernel.cu. the kernel though has the following lines:

extern shared float temp; (it is temp not TEMP)

And one more thing do I use “extern” even when I am writing the code into one .cu file,( I mean when my kernel and main() is in the same file). I hope I would get valuable pointers from you here.


// declaration, forward

void runTest( int argc, char** argv);

// regression test functionality

extern “C”

unsigned int compare( const float* reference, const float* data,

                  const unsigned int len);

extern “C”

void computeGold( float* reference, float* idata, const unsigned int len);


// Program main



main( int argc, char** argv)


runTest( argc, argv);

cutilExit(argc, argv);



//! Run a scan test for CUDA



runTest( int argc, char** argv)


// use command-line specified CUDA device, otherwise use device with highest Gflops/s

if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

    cutilDeviceInit(argc, argv);


    cudaSetDevice( cutGetMaxGflopsDeviceId() );

unsigned int num_elements = 512;

cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int*)&num_elements);

unsigned int timer;

cutilCheckError( cutCreateTimer(&timer));

const unsigned int num_threads = num_elements / 2;

const unsigned int mem_size = sizeof( float) * num_elements;

// padding space is used to avoid shared memory bank conflicts

unsigned int extra_space = num_elements / NUM_BANKS;


extra_space += extra_space / NUM_BANKS;


const unsigned int shared_mem_size = sizeof(float) *

    (num_elements + extra_space);

// allocate host memory to store the input data

float* h_data = (float*) malloc( mem_size);

// initialize the input data on the host to be integer values

// between 0 and 1000

for( unsigned int i = 0; i < num_elements; ++i) 


    h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));


// compute reference solution

float* reference = (float*) malloc( mem_size);  

computeGold( reference, h_data, num_elements);

// allocate device memory input and output arrays

float* d_idata;

float* d_odata[3];

cutilSafeCall( cudaMalloc( (void**) &d_idata, mem_size));

cutilSafeCall( cudaMalloc( (void**) &(d_odata[0]), mem_size));

cutilSafeCall( cudaMalloc( (void**) &(d_odata[1]), mem_size));

cutilSafeCall( cudaMalloc( (void**) &(d_odata[2]), mem_size));

// copy host memory to device input array

cutilSafeCall( cudaMemcpy( d_idata, h_data, mem_size, cudaMemcpyHostToDevice) );


To correct myself, it is the top of scan_best_kernel.cu rather than scan.cu. This:


// Define this to more rigorously avoid bank conflicts, even at the lower (root) levels of the tree



#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS + (index) >> (2 * LOG_NUM_BANKS))


#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)



#define TEMP(index)   cutilBankChecker(temp, index)


#define TEMP(index)   temp[index]


Thanks Avidday. :) I got the point