Shared Memory: Is my program correct?

Hello,
I’m trying to create a shared-memory version of “incrementArray.cu” from this article: [url=http://www.ddj.com/cpp/207402986]CUDA, Supercomputing for the Masses: Part 2 | Dr Dobb's[/url]

Here’s my code:

[b]// incrementArray.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// Host-side reference implementation: adds 1, in place, to each of the
// first N elements of a. Does nothing when N <= 0.
void incrementArrayOnHost(int *a, int N)
{
    int k = 0;
    while (k < N) {
        a[k] += 1;
        ++k;
    }
}

// Adds 1 to every element of a, staging each value through dynamic shared
// memory before writing it back to global memory.
//
// Launch requirements:
//   - 1-D grid of 1-D blocks.
//   - Dynamic shared memory (third launch argument) of at least
//     blockDim.x * sizeof(int) bytes.
//   - gridDim.x * blockDim.x must equal the number of elements in a: the
//     kernel takes no length argument, so it cannot guard against
//     out-of-range threads.
__global__ void incrementArrayOnDevice(int *a)
{
    extern __shared__ int s_data[];

    // Global element handled by this thread.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Shared memory is private to each block and only blockDim.x ints long,
    // so it must be indexed with the block-local threadIdx.x. Indexing it
    // with the global idx runs past the end for every block after block 0.
    s_data[threadIdx.x] = a[idx] + 1;
    __syncthreads();

    a[idx] = s_data[threadIdx.x];
}
// Aborts with a diagnostic on stderr when a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Increments an N-element array once on the host and once on the device
// (through the shared-memory kernel), then prints element 514 of both
// results so they can be compared by eye.
int main(void)
{
    int *a_h, *b_h; // pointers to host memory
    int *a_d;       // pointer to device memory
    int i;
    int N = 512 * 4096;
    printf("%d\n", N);
    int numThreadsPerBlock = 512;
    // N is an exact multiple of numThreadsPerBlock, so the grid covers the
    // array exactly; the kernel has no bounds guard and relies on this.
    int numBlocks = N / numThreadsPerBlock;
    size_t size = N * sizeof(int);

    // allocate arrays on host
    a_h = (int *)malloc(size);
    b_h = (int *)malloc(size);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
        free(a_h);
        free(b_h);
        return EXIT_FAILURE;
    }

    // allocate array on device
    checkCuda(cudaMalloc((void **)&a_d, size), "cudaMalloc");

    // initialization of host data
    for (i = 0; i < N; i++) a_h[i] = i;

    // copy data from host to device
    checkCuda(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice), "host-to-device copy");

    // do calculation on host
    incrementArrayOnHost(a_h, N);

    // do calculation on device:
    // Part 1 of 2. Compute execution configuration

    //size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);

    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    //int blockSize = 4;
    // One int of dynamic shared memory per thread in a block.
    int sharedMemSize = numThreadsPerBlock * sizeof(int);
    //int nBlocks = N/numThreadsPerBlock + (N%numThreadsPerBlock == 0?0:1);

    // Part 2 of 2. Call incrementArrayOnDevice kernel
    incrementArrayOnDevice <<< dimGrid, dimBlock, sharedMemSize >>> (a_d);
    // A launch does not return a status; a bad configuration only shows up here.
    checkCuda(cudaGetLastError(), "kernel launch");

    // Retrieve result from device and store in b_h.
    // This blocking copy also surfaces any fault raised while the kernel ran.
    checkCuda(cudaMemcpy(b_h, a_d, size, cudaMemcpyDeviceToHost), "device-to-host copy");

    // check results
    //int toto;
    /* for (i=0; i<N; i++)
    {
    if(a_h[i]!=b_h[i])
    {
    toto=i;
    }
    i=2*N;
    }
    */
    printf("%d\n%d\n\n", a_h[514], b_h[514]);

    // cleanup
    int tr;

    // Wait for input so the console window stays open until the user responds.
    scanf("%d", &tr);
    free(a_h); free(b_h);
    checkCuda(cudaFree(a_d), "cudaFree");
    return 0;
}[/b]

Am I using shared memory properly? The profiler reports the same times for the global-memory version and the shared-memory version…

That’s because the device still has to load the data from global memory into shared memory before it can be used from shared memory.

In situations where the data from global memory is only used one time, there is no benefit to using shared memory.

Try this:

[b]extern __shared__ int s_data[];

// Adds 1 to every element of a, staging each value through dynamic shared
// memory before writing it back to global memory.
//
// Launch with a 1-D grid of 1-D blocks and at least
// blockDim.x * sizeof(int) bytes of dynamic shared memory.
// gridDim.x * blockDim.x must equal the number of elements in a, because
// the kernel has no length argument to bounds-check against.
__global__ void incrementArrayOnDevice(int *a)
{
    // Global element handled by this thread.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Shared memory is per-block, so it is indexed with the block-local
    // threadIdx.x rather than the global idx.
    s_data[threadIdx.x] = a[idx] + 1;
    __syncthreads();

    a[idx] = s_data[threadIdx.x];
}[/b]