Pointer arithmetic with shared memory

I ported some OpenCL code to Cuda recently and encountered the following issue. In OpenCL you can mark pointer variable inside kernel as ‘__local’ (which is the analogue of Cuda’s ‘shared’). When you do this, OpenCL’s compiler understands that this pointer is used to access shared memory and only it. So you can do something like:

__local float shared_mem[192];

__local float *ptr;

// ... later in code

ptr = shared_mem + 16;

float temp = ptr[0] * ptr[1];

In Cuda though, it does not always work for some reason, even if I explicitly mark ‘ptr’ as ‘shared’. Sometimes there are warnings that compiler cannot decide which memory to access, sometimes I get garbage in output. Same kernel with integer indexes instead of pointers is working fine.

The kernel I am using is large and has many cases of such pointer arithmetic, so I would prefer not to replace ‘__local float *ptr’ by some sort of ‘size_t ptr_index’ if Cuda allows it. So, is there some way to tell compiler that the pointer is totally definitely always points to shared memory?

Thanks in advance.

I managed to compose two small kernels, one with ptr arithmetic and the other with integer indexes, which produce different results (see code below).

‘test2’ kernel (indexes) gives right result: (1, 1), (garbage, 1), whether ‘test’ (pointers) gives (garbage, 1), (garbage, 1):

$ release/test

Pointer arithmetic: 1439.000000 + i1.000000, 1439.000000 + i1.000000,

Indexes: 1.000000 + i1.000000, 750.000000 + i1.000000,

#include <stdlib.h>

#include <stdio.h>

// includes, project

#include <cutil_inline.h>

// Pointer arithmetic

__global__ void test(float2 *in, float2 *out)

{

		__shared__ float sMem[192];

		__shared__ float *lMemStore, *lMemLoad;

		int lId = threadIdx.x; // 0, 1

	float2 a = in[lId]; // a = (1, 1), (1, 1)

	lMemStore = sMem + lId; // lMemStore = sMem, sMem + 1

		lMemLoad = sMem + lId * 3; // lMemLoad = sMem, sMem + 3

		lMemStore[0] = a.x; // sMem[0] = 1, sMem[1] = 1

		__syncthreads(); // sMem: 0: 1, 1: 1

		a.x = lMemLoad[0]; // a.x = sMem[0], sMem[3]

	out[lId] = a;

}

// Indexes

__global__ void test2(float2 *in, float2 *out)

{

		__shared__ float sMem[192];

		size_t lMemStore, lMemLoad;

		int lId = threadIdx.x; // 0, 1

	float2 a = in[lId]; // a = (1, 1), (1, 1)

	lMemStore = lId; // lMemStore = sMem, sMem + 1

		lMemLoad = lId * 3; // lMemLoad = sMem, sMem + 3

		sMem[lMemStore] = a.x; // sMem[0] = 1, sMem[1] = 1

		__syncthreads(); // sMem: 0: 1, 1: 1

		a.x = sMem[lMemLoad]; // a.x = sMem[0], sMem[3]

	out[lId] = a;

}

int main( int argc, char** argv)

{

	if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))

		cutilDeviceInit(argc, argv);

	else

		cudaSetDevice(cutGetMaxGflopsDeviceId());

	int N = 2;

	float2 *h_in_buf = (float2*)malloc(N * sizeof(float2));

	float2 *h_out_buf = (float2*)malloc(N * sizeof(float2));

	float2 *d_in_buf;

	float2 *d_out_buf;

	cutilSafeCall(cudaMalloc((void**)&d_in_buf, N * sizeof(float2)));

	cutilSafeCall(cudaMalloc((void**)&d_out_buf, N * sizeof(float2)));

	h_in_buf[0] = make_float2(1, 1);

	h_in_buf[1] = make_float2(1, 1);

	cutilSafeCall(cudaMemcpy(d_in_buf, h_in_buf, N * sizeof(float2), cudaMemcpyHostToDevice));

	// test pointer arithmetic

	test<<<1, N, 192 * sizeof(float)>>>(d_in_buf, d_out_buf);

	cutilCheckMsg("test");

	cutilSafeCall(cudaMemcpy(h_out_buf, d_out_buf, N * sizeof(float2), cudaMemcpyDeviceToHost));

	printf("Pointer arithmetic: ");

	for(int i = 0; i < N; i++)

		printf("%f + i%f, ", h_out_buf[i].x, h_out_buf[i].y);

	printf("\n");

	// test

	test2<<<1, N, 192 * sizeof(float)>>>(d_in_buf, d_out_buf);

	cutilCheckMsg("test2");

	cutilSafeCall(cudaMemcpy(h_out_buf, d_out_buf, N * sizeof(float2), cudaMemcpyDeviceToHost));

	printf("Indexes: ");

	for(int i = 0; i < N; i++)

		printf("%f + i%f, ", h_out_buf[i].x, h_out_buf[i].y);

	printf("\n");

	cutilSafeCall(cudaFree(d_in_buf));

	cutilSafeCall(cudaFree(d_out_buf));

	free(h_in_buf);

	free(h_out_buf);

	cudaThreadExit();

}