Help! Sum of vectors

Gusvinhal · June 16, 2011, 1:08pm

Hello everyone!

I am new to CUDA and I’m trying to do a simple sum of two vectors, but I’m having problems. This is my code:

#include <stdio.h>

/*
 * Element-wise vector sum: each thread that passes the bounds check writes
 * d_resul[pos] = Xcal[pos] + ycal[pos] for its own index pos.
 *
 * Xcal, ycal : input vectors
 * ele        : number of elements to process
 * d_resul    : output vector
 */
__global__ void kernelrmsep(float *Xcal, float *ycal, int ele, float *d_resul)

{

	/* Flat index built from the x components of the block and thread indices only. */
	int pos = blockIdx.x * blockDim.x + threadIdx.x;

	

		  /* Threads whose index is not below ele do not touch the arrays. */
		  if(pos < ele)

		  {

			d_resul[pos] = Xcal[pos] + ycal[pos];

		  }

		/* NOTE(review): this barrier sits outside the branch, so every thread
		   reaches it; the kernel uses no shared memory and no thread reads
		   another thread's result, so it looks unnecessary. */
		__syncthreads();

	

}

/* Main */

/*
 * Host driver for the vector-sum example: fills two 4-element host vectors,
 * allocates three device buffers, copies the inputs to the device, launches
 * kernelrmsep, copies the result back and prints four values.
 * Return values of the CUDA calls are not inspected anywhere in this function.
 */
int main()

{    

	

    /* Host-side buffers (h_ prefix) and device-side buffers (d_ prefix). */
    float *h_Xcal;

    float *h_ycal;

    float *h_resul;

    float *d_Xcal = 0;

    float *d_ycal = 0;

    float *d_resul = 0;

	

    int i;

	

    /* Launch shape: a 5x5 grid of blocks, each with 5x5 threads.
       NOTE(review): the kernel derives its index from the x components only. */
    dim3 blocks(5,5); 

    dim3 threads(5,5);

			

    /* Each host vector holds 4 floats. */
    h_Xcal = (float*)malloc(4*sizeof(float));

    h_ycal = (float*)malloc(4*sizeof(float));

    h_resul = (float*)malloc(4*sizeof(float));

h_Xcal[0] = 1; h_Xcal[1] = 2; h_Xcal[2] = 3; h_Xcal[3] = 4; 

    h_ycal[0] = 1; h_ycal[1] = 2; h_ycal[2] = 3; h_ycal[3] = 4; 

	

    /* Each device buffer is allocated with room for 4 floats. */
    cudaMalloc((void**)&d_Xcal, sizeof(float)*4); 

    cudaMalloc((void**)&d_ycal, sizeof(float)*4); 

    cudaMalloc((void**)&d_resul, sizeof(float)*4);

	

    /* NOTE(review): the size argument here is sizeof(float), i.e. the size of
       one float, although the buffers were allocated for 4 floats. */
    cudaMemcpy( d_Xcal, h_Xcal, sizeof(float) , cudaMemcpyHostToDevice);

    cudaMemcpy( d_ycal, h_ycal, sizeof(float) , cudaMemcpyHostToDevice);

	

    /* The element count passed to the kernel is 4. */
    kernelrmsep<<<blocks,threads>>>(d_Xcal, d_ycal, 4, d_resul);

	

    /* NOTE(review): same one-float size argument on the copy back to the host. */
    cudaMemcpy( h_resul, d_resul, sizeof(float) , cudaMemcpyDeviceToHost);

/* Prints all 4 entries of h_resul, one per line. */
for(i = 0; i < 4; i++)

    {

	printf("%f \n", h_resul[i]);

    }

	printf("\n\n\n");

	/* Waits for a character on stdin before returning (presumably to keep the console open). */
	getchar();

	

    /* NOTE(review): none of the malloc/cudaMalloc buffers are released before returning. */
    return EXIT_SUCCESS;

}

It is adding only the first position of the vectors; why isn’t it adding the other elements?

Thank you!

Best Regards,

Gustav.

Gusvinhal · June 16, 2011, 1:08pm

Hello everyone!

I am new to CUDA and I’m trying to do a simple sum of two vectors, but I’m having problems. This is my code:

#include <stdio.h>

/*
 * Element-wise vector sum: each thread that passes the bounds check writes
 * d_resul[pos] = Xcal[pos] + ycal[pos] for its own index pos.
 *
 * Xcal, ycal : input vectors
 * ele        : number of elements to process
 * d_resul    : output vector
 */
__global__ void kernelrmsep(float *Xcal, float *ycal, int ele, float *d_resul)

{

	/* Flat index built from the x components of the block and thread indices only. */
	int pos = blockIdx.x * blockDim.x + threadIdx.x;

	

		  /* Threads whose index is not below ele do not touch the arrays. */
		  if(pos < ele)

		  {

			d_resul[pos] = Xcal[pos] + ycal[pos];

		  }

		/* NOTE(review): this barrier sits outside the branch, so every thread
		   reaches it; the kernel uses no shared memory and no thread reads
		   another thread's result, so it looks unnecessary. */
		__syncthreads();

	

}

/* Main */

/*
 * Host driver for the vector-sum example: fills two 4-element host vectors,
 * allocates three device buffers, copies the inputs to the device, launches
 * kernelrmsep, copies the result back and prints four values.
 * Return values of the CUDA calls are not inspected anywhere in this function.
 */
int main()

{    

	

    /* Host-side buffers (h_ prefix) and device-side buffers (d_ prefix). */
    float *h_Xcal;

    float *h_ycal;

    float *h_resul;

    float *d_Xcal = 0;

    float *d_ycal = 0;

    float *d_resul = 0;

	

    int i;

	

    /* Launch shape: a 5x5 grid of blocks, each with 5x5 threads.
       NOTE(review): the kernel derives its index from the x components only. */
    dim3 blocks(5,5); 

    dim3 threads(5,5);

			

    /* Each host vector holds 4 floats. */
    h_Xcal = (float*)malloc(4*sizeof(float));

    h_ycal = (float*)malloc(4*sizeof(float));

    h_resul = (float*)malloc(4*sizeof(float));

h_Xcal[0] = 1; h_Xcal[1] = 2; h_Xcal[2] = 3; h_Xcal[3] = 4; 

    h_ycal[0] = 1; h_ycal[1] = 2; h_ycal[2] = 3; h_ycal[3] = 4; 

	

    /* Each device buffer is allocated with room for 4 floats. */
    cudaMalloc((void**)&d_Xcal, sizeof(float)*4); 

    cudaMalloc((void**)&d_ycal, sizeof(float)*4); 

    cudaMalloc((void**)&d_resul, sizeof(float)*4);

	

    /* NOTE(review): the size argument here is sizeof(float), i.e. the size of
       one float, although the buffers were allocated for 4 floats. */
    cudaMemcpy( d_Xcal, h_Xcal, sizeof(float) , cudaMemcpyHostToDevice);

    cudaMemcpy( d_ycal, h_ycal, sizeof(float) , cudaMemcpyHostToDevice);

	

    /* The element count passed to the kernel is 4. */
    kernelrmsep<<<blocks,threads>>>(d_Xcal, d_ycal, 4, d_resul);

	

    /* NOTE(review): same one-float size argument on the copy back to the host. */
    cudaMemcpy( h_resul, d_resul, sizeof(float) , cudaMemcpyDeviceToHost);

/* Prints all 4 entries of h_resul, one per line. */
for(i = 0; i < 4; i++)

    {

	printf("%f \n", h_resul[i]);

    }

	printf("\n\n\n");

	/* Waits for a character on stdin before returning (presumably to keep the console open). */
	getchar();

	

    /* NOTE(review): none of the malloc/cudaMalloc buffers are released before returning. */
    return EXIT_SUCCESS;

}

It is adding only the first position of the vectors; why isn’t it adding the other elements?

Thank you!

Best Regards,

Gustav.

brano · June 16, 2011, 1:54pm

Hello everyone!

I am new to CUDA and I’m trying to do a simple sum of two vectors, but I’m having problems. This is my code:

#include <stdio.h>

/*
 * Element-wise vector sum: each thread that passes the bounds check writes
 * d_resul[pos] = Xcal[pos] + ycal[pos] for its own index pos.
 *
 * Xcal, ycal : input vectors
 * ele        : number of elements to process
 * d_resul    : output vector
 */
__global__ void kernelrmsep(float *Xcal, float *ycal, int ele, float *d_resul)

{

	/* Flat index built from the x components of the block and thread indices only. */
	int pos = blockIdx.x * blockDim.x + threadIdx.x;

	

  		/* Threads whose index is not below ele do not touch the arrays. */
  		if(pos < ele)

  		{

			d_resul[pos] = Xcal[pos] + ycal[pos];

  		}

		/* NOTE(review): this barrier sits outside the branch, so every thread
		   reaches it; the kernel uses no shared memory and no thread reads
		   another thread's result, so it looks unnecessary. */
		__syncthreads();

	

}

/* Main */

/*
 * Host driver for the vector-sum example: fills two 4-element host vectors,
 * allocates three device buffers, copies the inputs to the device, launches
 * kernelrmsep, copies the result back and prints four values.
 * Return values of the CUDA calls are not inspected anywhere in this function.
 */
int main()

{    

	

    /* Host-side buffers (h_ prefix) and device-side buffers (d_ prefix). */
    float *h_Xcal;

    float *h_ycal;

    float *h_resul;

    float *d_Xcal = 0;

    float *d_ycal = 0;

    float *d_resul = 0;

	

    int i;

	

    /* Launch shape: a 5x5 grid of blocks, each with 5x5 threads.
       NOTE(review): the kernel derives its index from the x components only. */
    dim3 blocks(5,5); 

    dim3 threads(5,5);

			

    /* Each host vector holds 4 floats. */
    h_Xcal = (float*)malloc(4*sizeof(float));

    h_ycal = (float*)malloc(4*sizeof(float));

    h_resul = (float*)malloc(4*sizeof(float));

h_Xcal[0] = 1; h_Xcal[1] = 2; h_Xcal[2] = 3; h_Xcal[3] = 4; 

    h_ycal[0] = 1; h_ycal[1] = 2; h_ycal[2] = 3; h_ycal[3] = 4; 

	

    /* Each device buffer is allocated with room for 4 floats. */
    cudaMalloc((void**)&d_Xcal, sizeof(float)*4); 

    cudaMalloc((void**)&d_ycal, sizeof(float)*4); 

    cudaMalloc((void**)&d_resul, sizeof(float)*4);

	

    /* NOTE(review): the size argument here is sizeof(float), i.e. the size of
       one float, although the buffers were allocated for 4 floats. */
    cudaMemcpy( d_Xcal, h_Xcal, sizeof(float) , cudaMemcpyHostToDevice);

    cudaMemcpy( d_ycal, h_ycal, sizeof(float) , cudaMemcpyHostToDevice);

	

    /* The element count passed to the kernel is 4. */
    kernelrmsep<<<blocks,threads>>>(d_Xcal, d_ycal, 4, d_resul);

	

    /* NOTE(review): same one-float size argument on the copy back to the host. */
    cudaMemcpy( h_resul, d_resul, sizeof(float) , cudaMemcpyDeviceToHost);

/* Prints all 4 entries of h_resul, one per line. */
for(i = 0; i < 4; i++)

    {

	printf("%f \n", h_resul[i]);

    }

	printf("\n\n\n");

	/* Waits for a character on stdin before returning (presumably to keep the console open). */
	getchar();

	

    /* NOTE(review): none of the malloc/cudaMalloc buffers are released before returning. */
    return EXIT_SUCCESS;

}

It is adding only the first position of the vectors; why isn’t it adding the other elements?

Thank you!

Best Regards,

Gustav.

Hi,

You have a couple of mistakes.

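You have 4 elements in each vector, and the kernel only uses the x index. There is no need to launch 5 threads in both the x and y dimensions for the block, or 5 blocks in both dimensions for the grid; one block of 4 threads is enough: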

dim3 blocks(1)

dim3 threads(4)

When you copy stuff from and to the GPU you need to specify the amount of Bytes you are going to transfer.

cudaMemcpy( d_Xcal, h_Xcal, 4*sizeof(float) , cudaMemcpyHostToDevice);

cudaMemcpy( d_ycal, h_ycal, 4*sizeof(float) , cudaMemcpyHostToDevice);

cudaMemcpy( h_resul, d_resul, 4*sizeof(float) , cudaMemcpyDeviceToHost);

You don’t need the __syncthreads() in the kernel because the threads are all working on different elements and never read each other’s results.

Don’t forget to free the memory when you are done.

cudaFree(d_Xcal);

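cudaFree(d_ycal); cudaFree(d_resul); free(h_Xcal); free(h_ycal); free(h_resul); (device buffers from cudaMalloc are released with cudaFree, host buffers from malloc with free.)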

That should be it I think.

Feel free to ask if you want to have something explained.

brano · June 16, 2011, 1:54pm

Hello everyone!

I am new to CUDA and I’m trying to do a simple sum of two vectors, but I’m having problems. This is my code:

#include <stdio.h>

/*
 * Element-wise vector sum: each thread that passes the bounds check writes
 * d_resul[pos] = Xcal[pos] + ycal[pos] for its own index pos.
 *
 * Xcal, ycal : input vectors
 * ele        : number of elements to process
 * d_resul    : output vector
 */
__global__ void kernelrmsep(float *Xcal, float *ycal, int ele, float *d_resul)

{

	/* Flat index built from the x components of the block and thread indices only. */
	int pos = blockIdx.x * blockDim.x + threadIdx.x;

	

  		/* Threads whose index is not below ele do not touch the arrays. */
  		if(pos < ele)

  		{

			d_resul[pos] = Xcal[pos] + ycal[pos];

  		}

		/* NOTE(review): this barrier sits outside the branch, so every thread
		   reaches it; the kernel uses no shared memory and no thread reads
		   another thread's result, so it looks unnecessary. */
		__syncthreads();

	

}

/* Main */

/*
 * Host driver for the vector-sum example: fills two 4-element host vectors,
 * allocates three device buffers, copies the inputs to the device, launches
 * kernelrmsep, copies the result back and prints four values.
 * Return values of the CUDA calls are not inspected anywhere in this function.
 */
int main()

{    

	

    /* Host-side buffers (h_ prefix) and device-side buffers (d_ prefix). */
    float *h_Xcal;

    float *h_ycal;

    float *h_resul;

    float *d_Xcal = 0;

    float *d_ycal = 0;

    float *d_resul = 0;

	

    int i;

	

    /* Launch shape: a 5x5 grid of blocks, each with 5x5 threads.
       NOTE(review): the kernel derives its index from the x components only. */
    dim3 blocks(5,5); 

    dim3 threads(5,5);

			

    /* Each host vector holds 4 floats. */
    h_Xcal = (float*)malloc(4*sizeof(float));

    h_ycal = (float*)malloc(4*sizeof(float));

    h_resul = (float*)malloc(4*sizeof(float));

h_Xcal[0] = 1; h_Xcal[1] = 2; h_Xcal[2] = 3; h_Xcal[3] = 4; 

    h_ycal[0] = 1; h_ycal[1] = 2; h_ycal[2] = 3; h_ycal[3] = 4; 

	

    /* Each device buffer is allocated with room for 4 floats. */
    cudaMalloc((void**)&d_Xcal, sizeof(float)*4); 

    cudaMalloc((void**)&d_ycal, sizeof(float)*4); 

    cudaMalloc((void**)&d_resul, sizeof(float)*4);

	

    /* NOTE(review): the size argument here is sizeof(float), i.e. the size of
       one float, although the buffers were allocated for 4 floats. */
    cudaMemcpy( d_Xcal, h_Xcal, sizeof(float) , cudaMemcpyHostToDevice);

    cudaMemcpy( d_ycal, h_ycal, sizeof(float) , cudaMemcpyHostToDevice);

	

    /* The element count passed to the kernel is 4. */
    kernelrmsep<<<blocks,threads>>>(d_Xcal, d_ycal, 4, d_resul);

	

    /* NOTE(review): same one-float size argument on the copy back to the host. */
    cudaMemcpy( h_resul, d_resul, sizeof(float) , cudaMemcpyDeviceToHost);

/* Prints all 4 entries of h_resul, one per line. */
for(i = 0; i < 4; i++)

    {

	printf("%f \n", h_resul[i]);

    }

	printf("\n\n\n");

	/* Waits for a character on stdin before returning (presumably to keep the console open). */
	getchar();

	

    /* NOTE(review): none of the malloc/cudaMalloc buffers are released before returning. */
    return EXIT_SUCCESS;

}

It is adding only the first position of the vectors; why isn’t it adding the other elements?

Thank you!

Best Regards,

Gustav.

Hi,

You have a couple of mistakes.

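You have 4 elements in each vector, and the kernel only uses the x index. There is no need to launch 5 threads in both the x and y dimensions for the block, or 5 blocks in both dimensions for the grid; one block of 4 threads is enough: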

dim3 blocks(1)

dim3 threads(4)

When you copy stuff from and to the GPU you need to specify the amount of Bytes you are going to transfer.

cudaMemcpy( d_Xcal, h_Xcal, 4*sizeof(float) , cudaMemcpyHostToDevice);

cudaMemcpy( d_ycal, h_ycal, 4*sizeof(float) , cudaMemcpyHostToDevice);

cudaMemcpy( h_resul, d_resul, 4*sizeof(float) , cudaMemcpyDeviceToHost);

You don’t need the __syncthreads() in the kernel because the threads are all working on different elements and never read each other’s results.

Don’t forget to free the memory when you are done.

cudaFree(d_Xcal);

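cudaFree(d_ycal); cudaFree(d_resul); free(h_Xcal); free(h_ycal); free(h_resul); (device buffers from cudaMalloc are released with cudaFree, host buffers from malloc with free.)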

That should be it I think.

Feel free to ask if you want to have something explained.

MeinLieberTanz · June 16, 2011, 2:06pm

Hi Gusvinhal,

Here is the working code:

#include <stdio.h>

/*
 * Element-wise vector sum: d_resul[pos] = Xcal[pos] + ycal[pos].
 *
 * Xcal, ycal : device pointers to the input vectors (at least ele floats each)
 * ele        : number of elements to add
 * d_resul    : device pointer to the output vector (at least ele floats)
 *
 * Launch layout: the element index is built from the x components of the
 * block and thread indices only, so a 1D launch with
 * gridDim.x * blockDim.x >= ele covers all the data.
 *
 * No shared memory is used and no thread reads a value written by another
 * thread, so no block-level barrier is required.
 */
__global__ void kernelrmsep(float *Xcal, float *ycal, int ele, float *d_resul)
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;

    /* Guard the tail: surplus threads (pos >= ele) must not touch memory. */
    if (pos < ele)
    {
        d_resul[pos] = Xcal[pos] + ycal[pos];
    }
}

/* Main */

/*
 * Host driver for the vector-sum example.
 *
 * Builds two 4-element host vectors {1,2,3,4}, copies them to the device,
 * launches kernelrmsep to add them element-wise, copies the result back and
 * prints one value per line.
 *
 * Every CUDA call is checked; after the first failure the remaining CUDA
 * steps are skipped and the error string is written to stderr. All host and
 * device buffers are released on every path.
 *
 * Returns EXIT_SUCCESS when the result was computed and printed,
 * EXIT_FAILURE otherwise.
 */
int main()
{
    /* Element count and the matching buffer size in BYTES (count * sizeof). */
    const int ele = 4;
    const size_t bytes = ele * sizeof(float);

    /* 1D launch: ceil-div so the grid covers ele for any block size. */
    const int threadsPerBlock = 4;
    dim3 threads(threadsPerBlock, 1);
    dim3 blocks((ele + threadsPerBlock - 1) / threadsPerBlock, 1);

    float *h_Xcal = (float*)malloc(bytes);
    float *h_ycal = (float*)malloc(bytes);
    float *h_resul = (float*)malloc(bytes);

    float *d_Xcal = 0;
    float *d_ycal = 0;
    float *d_resul = 0;

    cudaError_t err = cudaSuccess;
    int status = EXIT_FAILURE;
    int i;

    if (h_Xcal == NULL || h_ycal == NULL || h_resul == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
    }
    else
    {
        /* Both inputs are 1, 2, 3, 4. */
        for (i = 0; i < ele; i++)
        {
            h_Xcal[i] = (float)(i + 1);
            h_ycal[i] = (float)(i + 1);
        }

        /* Each step runs only if everything before it succeeded. */
        err = cudaMalloc((void**)&d_Xcal, bytes);
        if (err == cudaSuccess) err = cudaMalloc((void**)&d_ycal, bytes);
        if (err == cudaSuccess) err = cudaMalloc((void**)&d_resul, bytes);

        if (err == cudaSuccess) err = cudaMemcpy(d_Xcal, h_Xcal, bytes, cudaMemcpyHostToDevice);
        if (err == cudaSuccess) err = cudaMemcpy(d_ycal, h_ycal, bytes, cudaMemcpyHostToDevice);

        if (err == cudaSuccess)
        {
            kernelrmsep<<<blocks,threads>>>(d_Xcal, d_ycal, ele, d_resul);
            /* A launch returns nothing; bad configurations show up here. */
            err = cudaGetLastError();
        }

        /* Blocking copy: waits for the kernel and reports its execution errors. */
        if (err == cudaSuccess) err = cudaMemcpy(h_resul, d_resul, bytes, cudaMemcpyDeviceToHost);

        if (err == cudaSuccess)
        {
            for (i = 0; i < ele; i++)
            {
                printf("%f \n", h_resul[i]);
            }
            status = EXIT_SUCCESS;
        }
        else
        {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        }
    }

    /* cudaFree(0) and free(NULL) are no-ops, so partial setups are safe here. */
    cudaFree(d_Xcal);
    cudaFree(d_ycal);
    cudaFree(d_resul);
    free(h_Xcal);
    free(h_ycal);
    free(h_resul);

    printf("\n\n\n");

    /* Wait for a key press before exiting, as the original program did. */
    getchar();

    return status;
}

MeinLieberTanz · June 16, 2011, 2:06pm

Hi Gusvinhal,

Here is the working code:

#include <stdio.h>

/*
 * Element-wise vector sum: d_resul[pos] = Xcal[pos] + ycal[pos].
 *
 * Xcal, ycal : device pointers to the input vectors (at least ele floats each)
 * ele        : number of elements to add
 * d_resul    : device pointer to the output vector (at least ele floats)
 *
 * Launch layout: the element index is built from the x components of the
 * block and thread indices only, so a 1D launch with
 * gridDim.x * blockDim.x >= ele covers all the data.
 *
 * No shared memory is used and no thread reads a value written by another
 * thread, so no block-level barrier is required.
 */
__global__ void kernelrmsep(float *Xcal, float *ycal, int ele, float *d_resul)
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;

    /* Guard the tail: surplus threads (pos >= ele) must not touch memory. */
    if (pos < ele)
    {
        d_resul[pos] = Xcal[pos] + ycal[pos];
    }
}

/* Main */

/*
 * Host driver for the vector-sum example.
 *
 * Builds two 4-element host vectors {1,2,3,4}, copies them to the device,
 * launches kernelrmsep to add them element-wise, copies the result back and
 * prints one value per line.
 *
 * Every CUDA call is checked; after the first failure the remaining CUDA
 * steps are skipped and the error string is written to stderr. All host and
 * device buffers are released on every path.
 *
 * Returns EXIT_SUCCESS when the result was computed and printed,
 * EXIT_FAILURE otherwise.
 */
int main()
{
    /* Element count and the matching buffer size in BYTES (count * sizeof). */
    const int ele = 4;
    const size_t bytes = ele * sizeof(float);

    /* 1D launch: ceil-div so the grid covers ele for any block size. */
    const int threadsPerBlock = 4;
    dim3 threads(threadsPerBlock, 1);
    dim3 blocks((ele + threadsPerBlock - 1) / threadsPerBlock, 1);

    float *h_Xcal = (float*)malloc(bytes);
    float *h_ycal = (float*)malloc(bytes);
    float *h_resul = (float*)malloc(bytes);

    float *d_Xcal = 0;
    float *d_ycal = 0;
    float *d_resul = 0;

    cudaError_t err = cudaSuccess;
    int status = EXIT_FAILURE;
    int i;

    if (h_Xcal == NULL || h_ycal == NULL || h_resul == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
    }
    else
    {
        /* Both inputs are 1, 2, 3, 4. */
        for (i = 0; i < ele; i++)
        {
            h_Xcal[i] = (float)(i + 1);
            h_ycal[i] = (float)(i + 1);
        }

        /* Each step runs only if everything before it succeeded. */
        err = cudaMalloc((void**)&d_Xcal, bytes);
        if (err == cudaSuccess) err = cudaMalloc((void**)&d_ycal, bytes);
        if (err == cudaSuccess) err = cudaMalloc((void**)&d_resul, bytes);

        if (err == cudaSuccess) err = cudaMemcpy(d_Xcal, h_Xcal, bytes, cudaMemcpyHostToDevice);
        if (err == cudaSuccess) err = cudaMemcpy(d_ycal, h_ycal, bytes, cudaMemcpyHostToDevice);

        if (err == cudaSuccess)
        {
            kernelrmsep<<<blocks,threads>>>(d_Xcal, d_ycal, ele, d_resul);
            /* A launch returns nothing; bad configurations show up here. */
            err = cudaGetLastError();
        }

        /* Blocking copy: waits for the kernel and reports its execution errors. */
        if (err == cudaSuccess) err = cudaMemcpy(h_resul, d_resul, bytes, cudaMemcpyDeviceToHost);

        if (err == cudaSuccess)
        {
            for (i = 0; i < ele; i++)
            {
                printf("%f \n", h_resul[i]);
            }
            status = EXIT_SUCCESS;
        }
        else
        {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        }
    }

    /* cudaFree(0) and free(NULL) are no-ops, so partial setups are safe here. */
    cudaFree(d_Xcal);
    cudaFree(d_ycal);
    cudaFree(d_resul);
    free(h_Xcal);
    free(h_ycal);
    free(h_resul);

    printf("\n\n\n");

    /* Wait for a key press before exiting, as the original program did. */
    getchar();

    return status;
}

Gusvinhal · June 16, 2011, 2:24pm

Thank you brano and Julianus!!!

Now it worked!

Gusvinhal · June 16, 2011, 2:24pm

Thank you brano and Julianus!!!

Now it worked!

Topic		Replies	Views
Sum vectors CUDA Programming and Performance	24	4884	December 21, 2011
Cant modify data on the GPU CUDA Programming and Performance	16	10239	December 20, 2008
HELP with vector sum CUDA Programming and Performance	6	2201	May 11, 2010
Getting started with CUDA ... cannot add simple vectors CUDA Programming and Performance	9	20916	January 31, 2011
Different results on device and Emulation mode CUDA Programming and Performance	5	3548	February 5, 2009
CUDA - calculation of a sum CUDA Programming and Performance	7	5444	April 30, 2010
compilation CUDA Programming and Performance	3	7868	March 25, 2010
beginner question Checking if GPU is the answer to me CUDA Programming and Performance	20	12081	September 4, 2008
Urgent help with threads please! CUDA Programming and Performance	21	10784	March 6, 2008
Working with really large arrays in CUDA (how to prevent negative indexes?) CUDA Programming and Performance	5	2174	November 19, 2019

Help! Sum of vectors

Related topics