How do I multiply the corresponding elements of vector A and vector B?

I want to multiply two vector corresponding elements, I use pointers to describe them, and below is a global function I wrote. When I use a small length vector to multiply the corresponding elements, I can get the correct answer, but when the length of the data increases, my result is wrong, I want to know where I am wrong, and is there a good way to achieve my goal?

/*
InputPtr: vector A;
Array: vector B;
Len: total length of the vectors, A and B have the same length.
*/
__global__ void DotMultiply(cuFloatComplex* InputPtr, float* Array, int Len) 
{
	int i = threadIdx.x;

	__shared__ cuFloatComplex s_Input[512];
	__shared__ float s_Array[512];

	if (i < blockDim.x && i + blockDim.x * blockIdx.x < Len)
	{
		s_Input[i] = InputPtr[i + blockDim.x * blockIdx.x];
		s_Array[i] = Array[i + blockDim.x * blockIdx.x];
	}
	else
	{
		s_Input[i].x = 0;
		s_Input[i].y = 0;
		s_Array[i] = 0;
	}
	__syncthreads();
	if (i < blockDim.x)
	{
		s_Input[i].x = s_Input[i].x * s_Array[i];
		s_Input[i].y = s_Input[i].y * s_Array[i];
	}
	__syncthreads();
	if (i < blockDim.x && i + blockDim.x * blockIdx.x < Len)
	{
		InputPtr[i + blockDim.x * blockIdx.x] = s_Input[i];
	}
}```

So you would like to compute a[i]=a[i]*b[i] for i < len in parallel?

1 Like

I don’t seem to have any trouble with the code you have posted:

# cat t29.cu
#include <cuComplex.h>
#include <iostream>

/*
InputPtr: vector A;
Array: vector B;
Len: total length of the vectors, A and B have the same length.
*/
__global__ void DotMultiply(cuFloatComplex* InputPtr, float* Array, int Len)
{
        int i = threadIdx.x;

        __shared__ cuFloatComplex s_Input[512];
        __shared__ float s_Array[512];

        if (i < blockDim.x && i + blockDim.x * blockIdx.x < Len)
        {
                s_Input[i] = InputPtr[i + blockDim.x * blockIdx.x];
                s_Array[i] = Array[i + blockDim.x * blockIdx.x];
        }
        else
        {
                s_Input[i].x = 0;
                s_Input[i].y = 0;
                s_Array[i] = 0;
        }
        __syncthreads();
        if (i < blockDim.x)
        {
                s_Input[i].x = s_Input[i].x * s_Array[i];
                s_Input[i].y = s_Input[i].y * s_Array[i];
        }
        __syncthreads();
        if (i < blockDim.x && i + blockDim.x * blockIdx.x < Len)
        {
                InputPtr[i + blockDim.x * blockIdx.x] = s_Input[i];
        }
}

const int dlen = 1048575;
int main(){

  cuFloatComplex *di;
  float *da;
  cudaMallocManaged(&di, dlen *sizeof(di[0]));
  cudaMallocManaged(&da, dlen *sizeof(da[0]));
  for (int i = 0; i < dlen; i++) {di[i] = make_cuFloatComplex(1.0f, 2.0f);  da[i] = 2.0f;}
  DotMultiply<<<(dlen+511)/512, 512>>>(di, da, dlen);
  cudaDeviceSynchronize();
  for (int i = 0; i < dlen; i++) if ((di[i].x != 2.0f) || (di[i].y != 4.0f)) std::cout << "mismatch at: " << i << std::endl;
}
# nvcc -o t29 t29.cu
# compute-sanitizer ./t29
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
#

If you’re having a problem, it may lie in something you haven’t shown.

1 Like

That’s weird, but thank you for your answer, I will have another try.

If i use sing this method, would it take more time to compute, or would using shared memory be faster?

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.