Dear all,
I’m a beginner with CUDA (and parallel programming in general).
I tried to make a very simple code which from an array will give you an array of the same size.
Each element of the output array must be a sum of all elements of the input array.
I am trying to do that with an array of float4.
In the ouput, the x component is the sum of all input x component.
Same for y and z, w is not used.
With the code below, one should get in its terminal 16 times:
16
32
48
instead i’m obtaining 16 times:
8
16
24
I truly don’t understand why.
Please help!!!
[codebox]#include<stdio.h>
#include<cuda.h>
using namespace std;
device float4 make_sum(float4 bi, float4 bj, float4 ai)
{
ai.x += bj.x;
ai.y += bj.y;
ai.z += bj.z;
return ai;
}
device float4 tile_calculation(float4 myPosition, float4 accel)
{
int i;
extern shared float4 shPosition;
for(i = 0; i < blockDim.x; i++) {
accel = make_sum(myPosition, shPosition[i], accel);
}
return accel;
}
global void calculate(float4* Input, float4* output)
{
int N=16;
// Calculation through shared memory
extern shared float4 shPosition;
float4* globalX = Input;
float4* globalA = output;
float4 myPosition;
int i, tile, p;
p=blockDim.x;
float4 acc = {0.0f, 0.0f, 0.0f};
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
myPosition = globalX[gtid];
for(i = 0, tile = 0; i <= N; i += p, tile++) {
int idx = tile * blockDim.x + threadIdx.x;
shPosition[threadIdx.x] = globalX[idx];
__syncthreads();
acc = tile_calculation(myPosition, acc);
__syncthreads();
}
// Save the result in global memory for the integration step.
float4 acc3 = {acc.x, acc.y, acc.z, 0.0f};
globalA[gtid] = acc3;
//Direct Calculation
/int i = blockIdx.xblockDim.x+threadIdx.x;
for (int p=0;p<N;p++){
output[i] = bodyBodyInteraction(Input[i], Input[p], output[i]);
}*/
}
int main (int argc, char *argv)
{
int N=16;
cudaError_t cudaStat;
float4* d_input = 0;
float4* d_output = 0;
float4* h_input = (float4 ) malloc(Nsizeof(d_input[0]));
float4* h_output = (float4 ) malloc(Nsizeof(d_output[0]));
cudaStat = cudaMalloc ((void **)&d_input, N * sizeof(d_input[0]));
if( cudaStat )
printf(" value = %d : Memory Allocation on GPU Device failed\n", cudaStat);
cudaStat = cudaMalloc ((void **)&d_output, N * sizeof(d_output[0]));
if( cudaStat )
printf(" value = %d : Memory Allocation on GPU Device failed\n", cudaStat);
//
for (int i=0;i<N;i++){
h_input[i].x=1.;
h_input[i].y=2.;
h_input[i].z=3.;
h_output[i].x=0.;
h_output[i].y=0.;
h_output[i].z=0.;
}
//make initialisation in device
cudaStat = cudaMemcpy (d_input, h_input, N * sizeof(h_input[0]), cudaMemcpyHostToDevice);
if( cudaStat )
printf(" Memory Copy from Host to Device failed.\n", cudaStat);
cudaStat = cudaMemcpy (d_output, h_output, N * sizeof(h_output[0]), cudaMemcpyHostToDevice);
if( cudaStat )
printf(" Memory Copy from Host to Device failed.\n", cudaStat);
int threadsPerBlock = 4;
int threadsPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
calculate<<<threadsPerGrid, threadsPerBlock>>>(d_input,d_output);
cudaStat = cudaMemcpy (h_input, d_input, N * sizeof(d_input[0]), cudaMemcpyDeviceToHost);
cudaStat = cudaMemcpy (h_output, d_output, N * sizeof(d_output[0]), cudaMemcpyDeviceToHost);
for (int i=0;i<N;i++) {
cout<<h_output[i].x<<endl;
cout<<h_output[i].y<<endl;
cout<<h_output[i].z<<endl;
}
}[/codebox]