a problem complex array add with cuda ?????

xuanyz · August 16, 2017, 11:05pm

I created my own complex-number structure and wrote code to add two complex arrays with CUDA, but the code does not work correctly. Can anyone help me?
The code is as follows:

#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

//-----------------------------about the complex number------------------------------------------------------------
// Single-precision complex number stored as two floats.
typedef struct FCOMPLEX
{
    float r;  // real part
    float i;  // imaginary part
} fcomplex;

// Builds an fcomplex whose real part is `re` and whose imaginary part is `im`.
// The __host__ __device__ qualifiers make the function callable from both
// host code and device code.
__host__ __device__
fcomplex Complex(float re, float im)
{
    fcomplex c;
    c.r = re;
    c.i = im;
    return c;
}

// Returns the complex sum a + b: the real parts and the imaginary parts are
// added independently. Callable from both host code and device code.
__host__ __device__
fcomplex Cadd(fcomplex a, fcomplex b)
{
    fcomplex c;
    c.r = a.r + b.r;
    c.i = a.i + b.i;
    return c;
}

//---------------------------------------------------------------------------------------------------

cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size);

// Element-wise complex addition: c[i] = a[i] + b[i].
// Each thread handles the element selected by its threadIdx.x. blockIdx is
// not used and there is no bounds check, so the kernel must be launched as a
// single block with exactly one thread per array element.
__global__ void addKernel(fcomplex *c, const fcomplex *a, const fcomplex *b)
{
    int i = threadIdx.x;
    c[i] = Cadd(a[i], b[i]);
}

// Adds two five-element complex arrays through addWithCuda and prints the
// real parts and then the imaginary parts of the result.
// Returns 0 on success and 1 if either addWithCuda or cudaDeviceReset fails.
int main()
{
    const int count = 5;
    const fcomplex lhs[count] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5) };
    const fcomplex rhs[count] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5) };
    // Pre-filled with (1, 1); addWithCuda is expected to overwrite every element.
    fcomplex sum[count] = { Complex(1, 1), Complex(1, 1), Complex(1, 1), Complex(1, 1), Complex(1, 1) };

    // Run the element-wise addition on the GPU.
    const cudaError_t addStatus = addWithCuda(sum, lhs, rhs, count);
    if (addStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // Real parts first, then imaginary parts.
    printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.r = {%f,%f,%f,%f,%f}\n",
           sum[0].r, sum[1].r, sum[2].r, sum[3].r, sum[4].r);
    printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.i = {%f,%f,%f,%f,%f}\n",
           sum[0].i, sum[1].i, sum[2].i, sum[3].i, sum[4].i);

    // Reset the device before exiting so that profiling and tracing tools
    // such as Nsight and Visual Profiler can show complete traces.
    const cudaError_t resetStatus = cudaDeviceReset();
    if (resetStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
// Intended flow: select device 0, allocate device buffers for a, b and c,
// copy a and b to the device, launch addKernel with one thread per element,
// copy the result back into c, then free the device buffers.
// Returns the last cudaError_t stored in cudaStatus.
//
// NOTE(review): this version is the faulty code the thread is about. It
// dereferences the uninitialized pointers dev_a/dev_b/dev_c and is reported
// to seg fault when run; the notes below mark the individual problems.
cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size)
{
// NOTE(review): none of these three pointers is ever assigned in this function.
fcomplex *dev_a ;
fcomplex *dev_b ;
fcomplex *dev_c ;
cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    goto Error;
}

// Allocate GPU buffers for three vectors (two input, one output).
// NOTE(review): &dev_c->r is the address of a member reached through the
// uninitialized dev_c, so cudaMalloc is asked to store its result at an
// invalid location rather than in dev_c itself. The same applies to every
// cudaMalloc below.
// NOTE(review): each pair of calls overwrites cudaStatus, so a failure of
// the first call of a pair is never checked.
cudaStatus = cudaMalloc((void**)&dev_c->r, size * sizeof(float));
cudaStatus = cudaMalloc((void**)&dev_c->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

 cudaStatus = cudaMalloc((void**)&dev_a->r, size * sizeof(float));
 cudaStatus = cudaMalloc((void**)&dev_a->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_b->r, size * sizeof(float));
cudaStatus = cudaMalloc((void**)&dev_b->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

// Copy input vectors from host memory to GPU buffers.
// NOTE(review): &a->r is the address of a[0].r. Copying `size` floats from
// there reads the interleaved r/i members of the leading elements, not the
// `size` real parts. The destination &dev_a->r is again computed from an
// uninitialized pointer. The same applies to the copies of b and of c below.
cudaStatus = cudaMemcpy(&dev_a->r, &a->r, size * sizeof(float), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(&dev_a->i, &a->i, size * sizeof(float), cudaMemcpyHostToDevice);

if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(&dev_b->r, &b->r, size * sizeof(float), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(&dev_b->i, &b->i, size * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

// Launch a kernel on the GPU with one thread for each element.
// The launch uses a single block of `size` threads.
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}

// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    goto Error;
}

// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(&c->r, &dev_c->r, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaStatus = cudaMemcpy(&c->i, &dev_c->i, size * sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

// Common exit path for success and for every failure above.
// NOTE(review): the pointers freed here were never assigned, so cudaFree
// receives indeterminate values.
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);

return cudaStatus;

}

Robert_Crovella · August 17, 2017, 2:23am

This is broken:

fcomplex *dev_c ;
...
cudaStatus = cudaMalloc((void**)&dev_c->r, size * sizeof(float));

dev_c initially doesn’t point to anything valid. So you can’t store anything at dev_c->r, because you haven’t allocated any storage for dev_c

When I run your code, this behavior leads to a seg fault. This is simply a lack of understanding of proper usage of pointers in C.

This methodology is also similarly broken with your cudaMemcpy operations.

Here’s a fixed code:

$ cat t399.cu
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

//-----------------------------about the complex number------------------------------------------------------------
// Complex number in single precision, laid out as real part followed by
// imaginary part.
typedef struct FCOMPLEX {
    float r;   // real component
    float i;   // imaginary component
} fcomplex;

// Constructs an fcomplex from its real part `re` and imaginary part `im`.
// Callable from both host and device code.
__host__ __device__
fcomplex Complex(float re, float im)
{
    // Members are initialized in declaration order: r first, then i.
    fcomplex value = { re, im };
    return value;
}

// Complex addition: returns a + b, computed component by component.
// Callable from both host and device code.
__host__ __device__
fcomplex Cadd(fcomplex a, fcomplex b)
{
    // `a` is passed by value, so it can be used as the accumulator without
    // affecting the caller's copy.
    a.r += b.r;
    a.i += b.i;
    return a;
}

//---------------------------------------------------------------------------------------------------

cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size);

// Element-wise complex addition on the device: c[k] = a[k] + b[k].
// The element index is taken from threadIdx.x alone and is not range-checked,
// so the kernel expects a single-block launch with one thread per element.
__global__ void addKernel(fcomplex *c, const fcomplex *a, const fcomplex *b)
{
    const int idx = threadIdx.x;
    const fcomplex lhs = a[idx];
    const fcomplex rhs = b[idx];
    c[idx] = Cadd(lhs, rhs);
}

// Entry point: adds two fixed five-element complex arrays on the GPU via
// addWithCuda, prints the real and imaginary parts of the sums, and resets
// the device. Exit code is 0 on success and 1 on any CUDA failure.
int main()
{
    const int n = 5;
    const fcomplex inputA[n] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5) };
    const fcomplex inputB[n] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5) };
    // Initial contents are placeholders that the addition overwrites.
    fcomplex result[n] = { Complex(1, 1), Complex(1, 1), Complex(1, 1), Complex(1, 1), Complex(1, 1) };

    // Element-wise addition on the device.
    cudaError_t status = addWithCuda(result, inputA, inputB, n);
    if (cudaSuccess != status) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.r = {%f,%f,%f,%f,%f}\n",
           result[0].r, result[1].r, result[2].r, result[3].r, result[4].r);
    printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.i = {%f,%f,%f,%f,%f}\n",
           result[0].i, result[1].i, result[2].i, result[3].i, result[4].i);

    // cudaDeviceReset is called before exiting so that profiling and tracing
    // tools such as Nsight and Visual Profiler show complete traces.
    status = cudaDeviceReset();
    if (cudaSuccess != status) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add two complex vectors in parallel.
//
// Parameters:
//   c    - host output array; receives a[k] + b[k] for k in [0, size)
//   a, b - host input arrays holding `size` elements each
//   size - number of elements. The kernel is launched as a single block of
//          `size` threads, so a size larger than the device's per-block
//          thread limit is reported as a launch failure. A size of 0 is
//          treated as a no-op.
//
// Returns cudaSuccess on success, otherwise the status of the first CUDA
// call that failed. Device buffers are released on every exit path.
cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size)
{
    // Start from NULL so that the cleanup at Error: never passes an
    // indeterminate pointer to cudaFree when an early step fails
    // (cudaFree performs no operation on a null pointer).
    fcomplex *dev_a = NULL;
    fcomplex *dev_b = NULL;
    fcomplex *dev_c = NULL;
    cudaError_t cudaStatus;
    // Declared before the first goto so that no jump bypasses an initialization.
    const size_t bytes = size * sizeof(fcomplex);

    // Nothing to add; a launch with zero threads would be rejected as an
    // invalid configuration.
    if (size == 0) {
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    // Each buffer holds `size` whole fcomplex elements.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Shared exit path: reached by falling through on success and by goto on
    // failure. Buffers that were never allocated are still NULL here.
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
$ nvcc -arch=sm_61 -o t399 t399.cu
$ cuda-memcheck ./t399
========= CUDA-MEMCHECK
{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.r = {2.000000,4.000000,6.000000,8.000000,10.000000}
{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.i = {2.000000,4.000000,6.000000,8.000000,10.000000}
========= ERROR SUMMARY: 0 errors
$

xuanyz · August 17, 2017, 6:10am

(1) Thanks for txbob’s reply. Your reply is one way that deals with my problem completely. Thank you again.
But I wonder how to solve the problem the way I tried in my code, that is, treating the complex array as two separate parts (real and imaginary) and allocating GPU buffers for the real and imaginary parts separately. How should I do that?

(2) Another question. If I create a structure as follows:
// Detection-grid parameters: the grid resolution in the r and z directions
// and the number of grid elements in the angular, r and z directions.
// __align__(16) requests 16-byte alignment for the structure.
typedef struct __align__(16) {
    float dr; // Detection grid resolution, r-direction [cm]
    float dz; // Detection grid resolution, z-direction [cm]
    int na;   // Number of grid elements in angular-direction [-]
    int nr;   // Number of grid elements in r-direction
    int nz;   // Number of grid elements in z-direction
} DetStruct;
and I use the struct DetStruct as the element type to create an array Detstruct_all_dev[20],
could I allocate GPU buffers for Detstruct_all_dev[20] as follows:

DetStruct * Detstruct_all_dev
cudaStatus = cudaMalloc((void**)&Detstruct_all_dev, size * sizeof(DetStruct));

???

Topic		Replies	Views
a problem complex array add with cuda ????? CUDA Programming and Performance	0	498	August 16, 2017
cudaSynchronizeDevice() returns error code 6 CUDA Programming and Performance	7	8588	June 16, 2011
I can't not get true answer at 3D array calculation CUDA Programming and Performance	12	1334	January 13, 2017
Error: Failed to suspend device for CUDA device 0 CUDA Programming and Performance	8	4517	January 4, 2023
I am new to cuda programming. In this code, c matric return by GPU is Zero matrix. I tried different... CUDA Programming and Performance	0	442	July 3, 2018
newbie in Cuda needs help with 2D arrays CUDA Programming and Performance	9	919	March 9, 2018
Struct in CUDA can i use this struct in CUDA CUDA Programming and Performance	15	89361	June 26, 2009
Error in my code... CUDA Programming and Performance	11	2531	December 19, 2014
How to choose how many threads/blocks to have? CUDA Programming and Performance	43	50576	June 7, 2022
Unable to correctly use a 2D CUDA array with a texture object CUDA Programming and Performance cuda	2	258	February 25, 2024

a problem complex array add with cuda ?????

Related topics