Help! Sum of vectors

Hello everyone!

I am new to CUDA and I’m trying to do a simple sum of two vectors, but I’m having problems. This is my code:

#include <stdio.h>

/*
 * Element-wise sum of two float arrays.
 *
 * Each thread forms an index pos from the x components of blockIdx, blockDim
 * and threadIdx; if pos < ele it writes d_resul[pos] = Xcal[pos] + ycal[pos].
 *
 * Xcal, ycal : input arrays, read at index pos
 * ele        : element count; threads with pos >= ele perform no load/store
 * d_resul    : output array, written at index pos
 */
__global__ void kernelrmsep(float *Xcal, float *ycal, int ele, float *d_resul)

{

	/* Index built from the x dimension only; the y components are not used. */
	int pos = blockIdx.x * blockDim.x + threadIdx.x;

	

		  /* Bounds guard: only indices below ele are touched. */
		  if(pos < ele)

		  {

			d_resul[pos] = Xcal[pos] + ycal[pos];

		  }

		/* Block-wide barrier; nothing is read or written after it in this kernel. */
		__syncthreads();

	

}

/* Main */

/*
 * Host driver: allocates three 4-float host buffers and three 4-float device
 * buffers, fills the two inputs with 1,2,3,4, launches kernelrmsep with
 * ele = 4, copies the result back and prints h_resul[0..3].
 * Waits for a key press (getchar) before returning EXIT_SUCCESS.
 */
int main()

{    

	

    /* Host-side buffers (malloc). */
    float *h_Xcal;

    float *h_ycal;

    float *h_resul;

    /* Device-side buffers (cudaMalloc). */
    float *d_Xcal = 0;

    float *d_ycal = 0;

    float *d_resul = 0;

	

    int i;

	

    /* Launch configuration: a 5x5 grid of blocks, each with 5x5 threads. */
    dim3 blocks(5,5); 

    dim3 threads(5,5);

			

    h_Xcal = (float*)malloc(4*sizeof(float));

    h_ycal = (float*)malloc(4*sizeof(float));

    h_resul = (float*)malloc(4*sizeof(float));

h_Xcal[0] = 1; h_Xcal[1] = 2; h_Xcal[2] = 3; h_Xcal[3] = 4; 

    h_ycal[0] = 1; h_ycal[1] = 2; h_ycal[2] = 3; h_ycal[3] = 4; 

	

    cudaMalloc((void**)&d_Xcal, sizeof(float)*4); 

    cudaMalloc((void**)&d_ycal, sizeof(float)*4); 

    cudaMalloc((void**)&d_resul, sizeof(float)*4);

	

    /* NOTE(review): the byte count passed here is sizeof(float), i.e. one
       element, although the buffers were allocated for 4 floats. */
    cudaMemcpy( d_Xcal, h_Xcal, sizeof(float) , cudaMemcpyHostToDevice);

    cudaMemcpy( d_ycal, h_ycal, sizeof(float) , cudaMemcpyHostToDevice);

	

    kernelrmsep<<<blocks,threads>>>(d_Xcal, d_ycal, 4, d_resul);

	

    /* NOTE(review): again only sizeof(float) bytes (one element) are copied
       back, while the loop below prints 4 elements of h_resul. */
    cudaMemcpy( h_resul, d_resul, sizeof(float) , cudaMemcpyDeviceToHost);

for(i = 0; i < 4; i++)

    {

	printf("%f \n", h_resul[i]);

    }

	printf("\n\n\n");

	/* Keeps the console open until a key is pressed. */
	getchar();

	

    /* NOTE(review): none of the malloc/cudaMalloc buffers are released before returning. */
    return EXIT_SUCCESS;

}

It is adding only the first vector position. Why isn’t it adding the other elements?

Thank you!

Best Regards,

Gustav.

Hello everyone!

I am new to CUDA and I’m trying to do a simple sum of two vectors, but I’m having problems. This is my code:

#include <stdio.h>

/*
 * Element-wise sum of two float arrays.
 *
 * Each thread forms an index pos from the x components of blockIdx, blockDim
 * and threadIdx; if pos < ele it writes d_resul[pos] = Xcal[pos] + ycal[pos].
 *
 * Xcal, ycal : input arrays, read at index pos
 * ele        : element count; threads with pos >= ele perform no load/store
 * d_resul    : output array, written at index pos
 */
__global__ void kernelrmsep(float *Xcal, float *ycal, int ele, float *d_resul)

{

	/* Index built from the x dimension only; the y components are not used. */
	int pos = blockIdx.x * blockDim.x + threadIdx.x;

	

		  /* Bounds guard: only indices below ele are touched. */
		  if(pos < ele)

		  {

			d_resul[pos] = Xcal[pos] + ycal[pos];

		  }

		/* Block-wide barrier; nothing is read or written after it in this kernel. */
		__syncthreads();

	

}

/* Main */

/*
 * Host driver: allocates three 4-float host buffers and three 4-float device
 * buffers, fills the two inputs with 1,2,3,4, launches kernelrmsep with
 * ele = 4, copies the result back and prints h_resul[0..3].
 * Waits for a key press (getchar) before returning EXIT_SUCCESS.
 */
int main()

{    

	

    /* Host-side buffers (malloc). */
    float *h_Xcal;

    float *h_ycal;

    float *h_resul;

    /* Device-side buffers (cudaMalloc). */
    float *d_Xcal = 0;

    float *d_ycal = 0;

    float *d_resul = 0;

	

    int i;

	

    /* Launch configuration: a 5x5 grid of blocks, each with 5x5 threads. */
    dim3 blocks(5,5); 

    dim3 threads(5,5);

			

    h_Xcal = (float*)malloc(4*sizeof(float));

    h_ycal = (float*)malloc(4*sizeof(float));

    h_resul = (float*)malloc(4*sizeof(float));

h_Xcal[0] = 1; h_Xcal[1] = 2; h_Xcal[2] = 3; h_Xcal[3] = 4; 

    h_ycal[0] = 1; h_ycal[1] = 2; h_ycal[2] = 3; h_ycal[3] = 4; 

	

    cudaMalloc((void**)&d_Xcal, sizeof(float)*4); 

    cudaMalloc((void**)&d_ycal, sizeof(float)*4); 

    cudaMalloc((void**)&d_resul, sizeof(float)*4);

	

    /* NOTE(review): the byte count passed here is sizeof(float), i.e. one
       element, although the buffers were allocated for 4 floats. */
    cudaMemcpy( d_Xcal, h_Xcal, sizeof(float) , cudaMemcpyHostToDevice);

    cudaMemcpy( d_ycal, h_ycal, sizeof(float) , cudaMemcpyHostToDevice);

	

    kernelrmsep<<<blocks,threads>>>(d_Xcal, d_ycal, 4, d_resul);

	

    /* NOTE(review): again only sizeof(float) bytes (one element) are copied
       back, while the loop below prints 4 elements of h_resul. */
    cudaMemcpy( h_resul, d_resul, sizeof(float) , cudaMemcpyDeviceToHost);

for(i = 0; i < 4; i++)

    {

	printf("%f \n", h_resul[i]);

    }

	printf("\n\n\n");

	/* Keeps the console open until a key is pressed. */
	getchar();

	

    /* NOTE(review): none of the malloc/cudaMalloc buffers are released before returning. */
    return EXIT_SUCCESS;

}

It is adding only the first vector position. Why isn’t it adding the other elements?

Thank you!

Best Regards,

Gustav.

Hi,

You have a couple of mistakes.

You have 4 elements in each vector. There is no need to launch 5 threads in both x and y dimension for the threads and block.

dim3 blocks(1);

dim3 threads(4);

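When you copy data to and from the GPU, you need to specify the total number of bytes to transfer. Passing sizeof(float) copies only one element, which is why only the first position was summed; for 4 floats the size must be 4*sizeof(float).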

cudaMemcpy( d_Xcal, h_Xcal, 4*sizeof(float) , cudaMemcpyHostToDevice);

cudaMemcpy( d_ycal, h_ycal, 4*sizeof(float) , cudaMemcpyHostToDevice);

cudaMemcpy( h_resul, d_resul, 4*sizeof(float) , cudaMemcpyDeviceToHost);

You don’t need the __syncthreads() in the kernel because the threads are all working on different elements.

Don’t forget to free the memory when you are done.

cudaFree(d_Xcal);

free(h_resul); etc. (use cudaFree for the d_* device pointers and free for the h_* host pointers)

That should be it I think.

Feel free to ask if you want to have something explained.

Hi,

You have a couple of mistakes.

You have 4 elements in each vector. There is no need to launch 5 threads in both x and y dimension for the threads and block.

dim3 blocks(1);

dim3 threads(4);

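When you copy data to and from the GPU, you need to specify the total number of bytes to transfer. Passing sizeof(float) copies only one element, which is why only the first position was summed; for 4 floats the size must be 4*sizeof(float).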

cudaMemcpy( d_Xcal, h_Xcal, 4*sizeof(float) , cudaMemcpyHostToDevice);

cudaMemcpy( d_ycal, h_ycal, 4*sizeof(float) , cudaMemcpyHostToDevice);

cudaMemcpy( h_resul, d_resul, 4*sizeof(float) , cudaMemcpyDeviceToHost);

You don’t need the __syncthreads() in the kernel because the threads are all working on different elements.

Don’t forget to free the memory when you are done.

cudaFree(d_Xcal);

free(h_resul); etc. (use cudaFree for the d_* device pointers and free for the h_* host pointers)

That should be it I think.

Feel free to ask if you want to have something explained.

Hi Gusvinhal,

Here is the working code:

#include <stdio.h>

/*
 * Element-wise vector sum: d_resul[i] = Xcal[i] + ycal[i] for every i in [0, ele).
 *
 * Launch layout: 1D in x (only blockIdx.x / blockDim.x / threadIdx.x are used).
 * The grid-stride loop makes the result independent of how many threads are
 * launched, so any grid size covers all ele elements.
 *
 * Xcal, ycal : device pointers to the input vectors (at least ele floats each)
 * ele        : number of elements to add; ele <= 0 makes the kernel a no-op
 * d_resul    : device pointer to the output vector (at least ele floats)
 *
 * No shared memory is used and each iteration touches a distinct element, so
 * no barrier is required.
 */
__global__ void kernelrmsep(float *Xcal, float *ycal, int ele, float *d_resul)
{
    /* Total number of threads along x; this is the step between the elements
       handled by one thread. */
    const int stride = gridDim.x * blockDim.x;

    for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < ele; pos += stride)
    {
        d_resul[pos] = Xcal[pos] + ycal[pos];
    }
}

/* Main */

/* Report a failed CUDA runtime call on stderr and terminate the program. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two 4-element vectors (both 1,2,3,4) on the GPU with kernelrmsep and
 * prints the 4 sums, one per line, followed by three blank lines.
 *
 * Every CUDA call is checked, the kernel launch is verified with
 * cudaGetLastError(), and all host and device buffers are released before
 * returning. Returns EXIT_SUCCESS on success; exits with EXIT_FAILURE if an
 * allocation or a CUDA call fails.
 */
int main()
{
    const int n = 4;                          /* number of vector elements   */
    const size_t bytes = n * sizeof(float);   /* size of one vector in bytes */

    float *d_Xcal = 0;
    float *d_ycal = 0;
    float *d_resul = 0;
    int i;

    /* One thread per element; n is small enough to fit in a single block. */
    dim3 blocks(1, 1);
    dim3 threads(n, 1);

    float *h_Xcal = (float*)malloc(bytes);
    float *h_ycal = (float*)malloc(bytes);
    float *h_resul = (float*)malloc(bytes);

    if (h_Xcal == NULL || h_ycal == NULL || h_resul == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(h_Xcal);   /* free(NULL) is a no-op */
        free(h_ycal);
        free(h_resul);
        return EXIT_FAILURE;
    }

    /* Both inputs hold 1, 2, 3, 4. */
    for (i = 0; i < n; i++)
    {
        h_Xcal[i] = (float)(i + 1);
        h_ycal[i] = (float)(i + 1);
    }

    checkCuda(cudaMalloc((void**)&d_Xcal, bytes), "cudaMalloc(d_Xcal)");
    checkCuda(cudaMalloc((void**)&d_ycal, bytes), "cudaMalloc(d_ycal)");
    checkCuda(cudaMalloc((void**)&d_resul, bytes), "cudaMalloc(d_resul)");

    /* The size argument is the full vector in bytes, not sizeof(float). */
    checkCuda(cudaMemcpy(d_Xcal, h_Xcal, bytes, cudaMemcpyHostToDevice), "copy Xcal to device");
    checkCuda(cudaMemcpy(d_ycal, h_ycal, bytes, cudaMemcpyHostToDevice), "copy ycal to device");

    kernelrmsep<<<blocks,threads>>>(d_Xcal, d_ycal, n, d_resul);
    /* A launch does not return a status; configuration errors show up here. */
    checkCuda(cudaGetLastError(), "kernelrmsep launch");

    /* Blocking copy: also waits for the kernel to finish. */
    checkCuda(cudaMemcpy(h_resul, d_resul, bytes, cudaMemcpyDeviceToHost), "copy result to host");

    for (i = 0; i < n; i++)
    {
        printf("%f \n", h_resul[i]);
    }
    printf("\n\n\n");

    /* Device buffers go back with cudaFree, host buffers with free. */
    checkCuda(cudaFree(d_Xcal), "cudaFree(d_Xcal)");
    checkCuda(cudaFree(d_ycal), "cudaFree(d_ycal)");
    checkCuda(cudaFree(d_resul), "cudaFree(d_resul)");
    free(h_Xcal);
    free(h_ycal);
    free(h_resul);

    /* Keeps the console open until a key is pressed. */
    getchar();

    return EXIT_SUCCESS;
}

Hi Gusvinhal,

Here is the working code:

#include <stdio.h>

/*
 * Element-wise vector sum: d_resul[i] = Xcal[i] + ycal[i] for every i in [0, ele).
 *
 * Launch layout: 1D in x (only blockIdx.x / blockDim.x / threadIdx.x are used).
 * The grid-stride loop makes the result independent of how many threads are
 * launched, so any grid size covers all ele elements.
 *
 * Xcal, ycal : device pointers to the input vectors (at least ele floats each)
 * ele        : number of elements to add; ele <= 0 makes the kernel a no-op
 * d_resul    : device pointer to the output vector (at least ele floats)
 *
 * No shared memory is used and each iteration touches a distinct element, so
 * no barrier is required.
 */
__global__ void kernelrmsep(float *Xcal, float *ycal, int ele, float *d_resul)
{
    /* Total number of threads along x; this is the step between the elements
       handled by one thread. */
    const int stride = gridDim.x * blockDim.x;

    for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < ele; pos += stride)
    {
        d_resul[pos] = Xcal[pos] + ycal[pos];
    }
}

/* Main */

/* Report a failed CUDA runtime call on stderr and terminate the program. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two 4-element vectors (both 1,2,3,4) on the GPU with kernelrmsep and
 * prints the 4 sums, one per line, followed by three blank lines.
 *
 * Every CUDA call is checked, the kernel launch is verified with
 * cudaGetLastError(), and all host and device buffers are released before
 * returning. Returns EXIT_SUCCESS on success; exits with EXIT_FAILURE if an
 * allocation or a CUDA call fails.
 */
int main()
{
    const int n = 4;                          /* number of vector elements   */
    const size_t bytes = n * sizeof(float);   /* size of one vector in bytes */

    float *d_Xcal = 0;
    float *d_ycal = 0;
    float *d_resul = 0;
    int i;

    /* One thread per element; n is small enough to fit in a single block. */
    dim3 blocks(1, 1);
    dim3 threads(n, 1);

    float *h_Xcal = (float*)malloc(bytes);
    float *h_ycal = (float*)malloc(bytes);
    float *h_resul = (float*)malloc(bytes);

    if (h_Xcal == NULL || h_ycal == NULL || h_resul == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(h_Xcal);   /* free(NULL) is a no-op */
        free(h_ycal);
        free(h_resul);
        return EXIT_FAILURE;
    }

    /* Both inputs hold 1, 2, 3, 4. */
    for (i = 0; i < n; i++)
    {
        h_Xcal[i] = (float)(i + 1);
        h_ycal[i] = (float)(i + 1);
    }

    checkCuda(cudaMalloc((void**)&d_Xcal, bytes), "cudaMalloc(d_Xcal)");
    checkCuda(cudaMalloc((void**)&d_ycal, bytes), "cudaMalloc(d_ycal)");
    checkCuda(cudaMalloc((void**)&d_resul, bytes), "cudaMalloc(d_resul)");

    /* The size argument is the full vector in bytes, not sizeof(float). */
    checkCuda(cudaMemcpy(d_Xcal, h_Xcal, bytes, cudaMemcpyHostToDevice), "copy Xcal to device");
    checkCuda(cudaMemcpy(d_ycal, h_ycal, bytes, cudaMemcpyHostToDevice), "copy ycal to device");

    kernelrmsep<<<blocks,threads>>>(d_Xcal, d_ycal, n, d_resul);
    /* A launch does not return a status; configuration errors show up here. */
    checkCuda(cudaGetLastError(), "kernelrmsep launch");

    /* Blocking copy: also waits for the kernel to finish. */
    checkCuda(cudaMemcpy(h_resul, d_resul, bytes, cudaMemcpyDeviceToHost), "copy result to host");

    for (i = 0; i < n; i++)
    {
        printf("%f \n", h_resul[i]);
    }
    printf("\n\n\n");

    /* Device buffers go back with cudaFree, host buffers with free. */
    checkCuda(cudaFree(d_Xcal), "cudaFree(d_Xcal)");
    checkCuda(cudaFree(d_ycal), "cudaFree(d_ycal)");
    checkCuda(cudaFree(d_resul), "cudaFree(d_resul)");
    free(h_Xcal);
    free(h_ycal);
    free(h_resul);

    /* Keeps the console open until a key is pressed. */
    getchar();

    return EXIT_SUCCESS;
}

Thank you brano and Julianus!!!

Now it worked!

Thank you brano and Julianus!!!

Now it worked!