Pointer arithmetic with shared memory

Manticore · February 7, 2010, 5:14am

I ported some OpenCL code to Cuda recently and encountered the following issue. In OpenCL you can mark pointer variable inside kernel as ‘__local’ (which is the analogue of Cuda’s ‘shared’). When you do this, OpenCL’s compiler understands that this pointer is used to access shared memory and only it. So you can do something like:

__local float shared_mem[192];

__local float *ptr;

// ... later in code

ptr = shared_mem + 16;

float temp = ptr[0] * ptr[1];

In Cuda though, it does not always work for some reason, even if I explicitly mark ‘ptr’ as ‘shared’. Sometimes there are warnings that compiler cannot decide which memory to access, sometimes I get garbage in output. Same kernel with integer indexes instead of pointers is working fine.

The kernel I am using is large and has many cases of such pointer arithmetic, so I would prefer not to replace ‘__local float *ptr’ by some sort of ‘size_t ptr_index’ if Cuda allows it. So, is there some way to tell compiler that the pointer is totally definitely always points to shared memory?

Thanks in advance.

Manticore · February 7, 2010, 6:35am

I managed to compose two small kernels, one with ptr arithmetic and the other with integer indexes, which produce different results (see code below).

‘test2’ kernel (indexes) gives right result: (1, 1), (garbage, 1), whether ‘test’ (pointers) gives (garbage, 1), (garbage, 1):

$ release/test

Pointer arithmetic: 1439.000000 + i1.000000, 1439.000000 + i1.000000,

Indexes: 1.000000 + i1.000000, 750.000000 + i1.000000,

#include <stdlib.h>

#include <stdio.h>

// includes, project

#include <cutil_inline.h>

// Pointer arithmetic

__global__ void test(float2 *in, float2 *out)

{

		__shared__ float sMem[192];

		__shared__ float *lMemStore, *lMemLoad;

		int lId = threadIdx.x; // 0, 1

	float2 a = in[lId]; // a = (1, 1), (1, 1)

	lMemStore = sMem + lId; // lMemStore = sMem, sMem + 1

		lMemLoad = sMem + lId * 3; // lMemLoad = sMem, sMem + 3

		lMemStore[0] = a.x; // sMem[0] = 1, sMem[1] = 1

		__syncthreads(); // sMem: 0: 1, 1: 1

		a.x = lMemLoad[0]; // a.x = sMem[0], sMem[3]

	out[lId] = a;

}

// Indexes

__global__ void test2(float2 *in, float2 *out)

{

		__shared__ float sMem[192];

		size_t lMemStore, lMemLoad;

		int lId = threadIdx.x; // 0, 1

	float2 a = in[lId]; // a = (1, 1), (1, 1)

	lMemStore = lId; // lMemStore = sMem, sMem + 1

		lMemLoad = lId * 3; // lMemLoad = sMem, sMem + 3

		sMem[lMemStore] = a.x; // sMem[0] = 1, sMem[1] = 1

		__syncthreads(); // sMem: 0: 1, 1: 1

		a.x = sMem[lMemLoad]; // a.x = sMem[0], sMem[3]

	out[lId] = a;

}

int main( int argc, char** argv)

{

	if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))

		cutilDeviceInit(argc, argv);

	else

		cudaSetDevice(cutGetMaxGflopsDeviceId());

	int N = 2;

	float2 *h_in_buf = (float2*)malloc(N * sizeof(float2));

	float2 *h_out_buf = (float2*)malloc(N * sizeof(float2));

	float2 *d_in_buf;

	float2 *d_out_buf;

	cutilSafeCall(cudaMalloc((void**)&d_in_buf, N * sizeof(float2)));

	cutilSafeCall(cudaMalloc((void**)&d_out_buf, N * sizeof(float2)));

	h_in_buf[0] = make_float2(1, 1);

	h_in_buf[1] = make_float2(1, 1);

	cutilSafeCall(cudaMemcpy(d_in_buf, h_in_buf, N * sizeof(float2), cudaMemcpyHostToDevice));

	// test pointer arithmetic

	test<<<1, N, 192 * sizeof(float)>>>(d_in_buf, d_out_buf);

	cutilCheckMsg("test");

	cutilSafeCall(cudaMemcpy(h_out_buf, d_out_buf, N * sizeof(float2), cudaMemcpyDeviceToHost));

	printf("Pointer arithmetic: ");

	for(int i = 0; i < N; i++)

		printf("%f + i%f, ", h_out_buf[i].x, h_out_buf[i].y);

	printf("\n");

	// test

	test2<<<1, N, 192 * sizeof(float)>>>(d_in_buf, d_out_buf);

	cutilCheckMsg("test2");

	cutilSafeCall(cudaMemcpy(h_out_buf, d_out_buf, N * sizeof(float2), cudaMemcpyDeviceToHost));

	printf("Indexes: ");

	for(int i = 0; i < N; i++)

		printf("%f + i%f, ", h_out_buf[i].x, h_out_buf[i].y);

	printf("\n");

	cutilSafeCall(cudaFree(d_in_buf));

	cutilSafeCall(cudaFree(d_out_buf));

	free(h_in_buf);

	free(h_out_buf);

	cudaThreadExit();

}

Topic		Replies	Views
pointer to shared memory compiler problems CUDA Programming and Performance	19	14795	June 7, 2008
Warnings on array of pointers assigned to externed shared memory CUDA Programming and Performance	4	943	April 2, 2014
Pointers to pointers on shared memory CUDA Programming and Performance	4	6910	March 30, 2009
shared memory wrong allocation? CUDA Programming and Performance	2	878	July 29, 2009
Error: "unaligned memory accesses not supported"? CUDA Programming and Performance	6	11196	April 2, 2009
Declaring what pointer points to Advisory: Cannot tell what pointer point CUDA Programming and Performance	9	5367	October 4, 2010
CUDA arrays and pointers CUDA Programming and Performance	1	2137	January 28, 2009
Pointers from const to shared CUDA Programming and Performance	2	3813	November 7, 2007
Weird pointer arithmetic bug ? CUDA Programming and Performance	18	2550	July 12, 2011
Qualifying Pointers referring to const/shared data CUDA Programming and Performance	6	3863	December 17, 2007

Pointer arithmetic with shared memory

Related topics