I managed to compose two small kernels, one with ptr arithmetic and the other with integer indexes, which produce different results (see code below).
‘test2’ kernel (indexes) gives right result: (1, 1), (garbage, 1), whether ‘test’ (pointers) gives (garbage, 1), (garbage, 1):
$ release/test
Pointer arithmetic: 1439.000000 + i1.000000, 1439.000000 + i1.000000,
Indexes: 1.000000 + i1.000000, 750.000000 + i1.000000,
#include <stdlib.h>
#include <stdio.h>
// includes, project
#include <cutil_inline.h>
// Pointer arithmetic
__global__ void test(float2 *in, float2 *out)
{
__shared__ float sMem[192];
__shared__ float *lMemStore, *lMemLoad;
int lId = threadIdx.x; // 0, 1
float2 a = in[lId]; // a = (1, 1), (1, 1)
lMemStore = sMem + lId; // lMemStore = sMem, sMem + 1
lMemLoad = sMem + lId * 3; // lMemLoad = sMem, sMem + 3
lMemStore[0] = a.x; // sMem[0] = 1, sMem[1] = 1
__syncthreads(); // sMem: 0: 1, 1: 1
a.x = lMemLoad[0]; // a.x = sMem[0], sMem[3]
out[lId] = a;
}
// Indexes
__global__ void test2(float2 *in, float2 *out)
{
__shared__ float sMem[192];
size_t lMemStore, lMemLoad;
int lId = threadIdx.x; // 0, 1
float2 a = in[lId]; // a = (1, 1), (1, 1)
lMemStore = lId; // lMemStore = sMem, sMem + 1
lMemLoad = lId * 3; // lMemLoad = sMem, sMem + 3
sMem[lMemStore] = a.x; // sMem[0] = 1, sMem[1] = 1
__syncthreads(); // sMem: 0: 1, 1: 1
a.x = sMem[lMemLoad]; // a.x = sMem[0], sMem[3]
out[lId] = a;
}
int main( int argc, char** argv)
{
if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
cutilDeviceInit(argc, argv);
else
cudaSetDevice(cutGetMaxGflopsDeviceId());
int N = 2;
float2 *h_in_buf = (float2*)malloc(N * sizeof(float2));
float2 *h_out_buf = (float2*)malloc(N * sizeof(float2));
float2 *d_in_buf;
float2 *d_out_buf;
cutilSafeCall(cudaMalloc((void**)&d_in_buf, N * sizeof(float2)));
cutilSafeCall(cudaMalloc((void**)&d_out_buf, N * sizeof(float2)));
h_in_buf[0] = make_float2(1, 1);
h_in_buf[1] = make_float2(1, 1);
cutilSafeCall(cudaMemcpy(d_in_buf, h_in_buf, N * sizeof(float2), cudaMemcpyHostToDevice));
// test pointer arithmetic
test<<<1, N, 192 * sizeof(float)>>>(d_in_buf, d_out_buf);
cutilCheckMsg("test");
cutilSafeCall(cudaMemcpy(h_out_buf, d_out_buf, N * sizeof(float2), cudaMemcpyDeviceToHost));
printf("Pointer arithmetic: ");
for(int i = 0; i < N; i++)
printf("%f + i%f, ", h_out_buf[i].x, h_out_buf[i].y);
printf("\n");
// test
test2<<<1, N, 192 * sizeof(float)>>>(d_in_buf, d_out_buf);
cutilCheckMsg("test2");
cutilSafeCall(cudaMemcpy(h_out_buf, d_out_buf, N * sizeof(float2), cudaMemcpyDeviceToHost));
printf("Indexes: ");
for(int i = 0; i < N; i++)
printf("%f + i%f, ", h_out_buf[i].x, h_out_buf[i].y);
printf("\n");
cutilSafeCall(cudaFree(d_in_buf));
cutilSafeCall(cudaFree(d_out_buf));
free(h_in_buf);
free(h_out_buf);
cudaThreadExit();
}