Array and Shared memory Accessing element trough shared memory

Dear all,

I’m a beginner with CUDA (and parallel programming in general).

I tried to make a very simple code which from an array will give you an array of the same size.

Each element of the output array must be a sum of all elements of the input array.

I am trying to do that with an array of float4.

In the ouput, the x component is the sum of all input x component.

Same for y and z, w is not used.

With the code below, one should get in its terminal 16 times:

16

32

48

instead i’m obtaining 16 times:

8

16

24

I truly don’t understand why.

Please help!!!

[codebox]#include<stdio.h>

#include

#include<cuda.h>

using namespace std;

device float4 make_sum(float4 bi, float4 bj, float4 ai)

{

ai.x += bj.x;

ai.y += bj.y;

ai.z += bj.z;

return ai;

}

device float4 tile_calculation(float4 myPosition, float4 accel)

{

int i;

extern shared float4 shPosition;

for(i = 0; i < blockDim.x; i++) {

accel = make_sum(myPosition, shPosition[i], accel);

}

return accel;

}

global void calculate(float4* Input, float4* output)

{

int N=16;

// Calculation through shared memory

extern shared float4 shPosition;

float4* globalX = Input;

float4* globalA = output;

float4 myPosition;

int i, tile, p;

p=blockDim.x;

float4 acc = {0.0f, 0.0f, 0.0f};

int gtid = blockIdx.x * blockDim.x + threadIdx.x;

myPosition = globalX[gtid];

for(i = 0, tile = 0; i <= N; i += p, tile++) {

int idx = tile * blockDim.x + threadIdx.x;

shPosition[threadIdx.x] = globalX[idx];

__syncthreads();

 acc = tile_calculation(myPosition, acc); 

__syncthreads();

}

// Save the result in global memory for the integration step.

float4 acc3 = {acc.x, acc.y, acc.z, 0.0f};

globalA[gtid] = acc3;

//Direct Calculation

/int i = blockIdx.xblockDim.x+threadIdx.x;

for (int p=0;p<N;p++){

output[i] = bodyBodyInteraction(Input[i], Input[p], output[i]);

}*/

}

int main (int argc, char *argv)

{

int N=16;

cudaError_t cudaStat;

float4* d_input = 0;

float4* d_output = 0;

float4* h_input = (float4 ) malloc(Nsizeof(d_input[0]));

float4* h_output = (float4 ) malloc(Nsizeof(d_output[0]));

cudaStat = cudaMalloc ((void **)&d_input, N * sizeof(d_input[0]));

if( cudaStat )

printf(" value = %d : Memory Allocation on GPU Device failed\n", cudaStat);

cudaStat = cudaMalloc ((void **)&d_output, N * sizeof(d_output[0]));

if( cudaStat )

printf(" value = %d : Memory Allocation on GPU Device failed\n", cudaStat);

//

for (int i=0;i<N;i++){

h_input[i].x=1.;

h_input[i].y=2.;

h_input[i].z=3.;

h_output[i].x=0.;

h_output[i].y=0.;

h_output[i].z=0.;

}

//make initialisation in device

cudaStat = cudaMemcpy (d_input, h_input, N * sizeof(h_input[0]), cudaMemcpyHostToDevice);

if( cudaStat )

printf(" Memory Copy from Host to Device failed.\n", cudaStat);

cudaStat = cudaMemcpy (d_output, h_output, N * sizeof(h_output[0]), cudaMemcpyHostToDevice);

if( cudaStat )

printf(" Memory Copy from Host to Device failed.\n", cudaStat);

int threadsPerBlock = 4;

int threadsPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;



calculate<<<threadsPerGrid, threadsPerBlock>>>(d_input,d_output);



cudaStat = cudaMemcpy (h_input, d_input, N * sizeof(d_input[0]), cudaMemcpyDeviceToHost);

cudaStat = cudaMemcpy (h_output, d_output, N * sizeof(d_output[0]), cudaMemcpyDeviceToHost);

for (int i=0;i<N;i++) {

cout<<h_output[i].x<<endl;

cout<<h_output[i].y<<endl;

cout<<h_output[i].z<<endl;

}

}[/codebox]

I found the solution.

Just nee to allocate space for shared memory in device.

Below the corrected code:

[codebox]

int threadsPerBlock = 4;

int threadsPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

int sizemem = threadsPerBlock * sizeof(float4)

calculate<<<threadsPerGrid, threadsPerBlock,sizemem>>>(d_input,d_output);

[/codebox]