A problem with complex array addition in CUDA

I created my own complex-number structure and wrote code to add complex arrays with CUDA, but the code does not work. Can anyone help me?
The code is as follows:

#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

//-----------------------------about the complex number------------------------------------------------------------
// Single-precision complex number stored as two consecutive floats.
typedef struct FCOMPLEX {
    float r;  // real part
    float i;  // imaginary part
} fcomplex;

// Builds an fcomplex from its real part `re` and imaginary part `im`.
// Callable from both host and device code.
__host__ __device__
fcomplex Complex(float re, float im)
{
    fcomplex c;
    c.r = re;
    c.i = im;
    return c;
}

// Returns the component-wise sum a + b of two complex numbers.
// Callable from both host and device code.
__host__ __device__
fcomplex Cadd(fcomplex a, fcomplex b)
{
    fcomplex c;
    c.r = a.r + b.r;
    c.i = a.i + b.i;
    return c;
}

//---------------------------------------------------------------------------------------------------

cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size);

// Element-wise complex addition: c[i] = a[i] + b[i].
// Each thread handles the element selected by its threadIdx.x, so the kernel
// expects a single-block launch with one thread per element; there is no
// bounds check, so the block must not be larger than the arrays.
__global__ void addKernel(fcomplex *c, const fcomplex *a, const fcomplex *b)
{
    int i = threadIdx.x;
    c[i] = Cadd(a[i], b[i]);
}

// Adds two fixed five-element complex arrays through addWithCuda and prints
// the real and imaginary parts of the result. Returns 0 on success, 1 if the
// addition or the final device reset reports an error.
int main()
{
    const int elementCount = 5;
    const fcomplex lhs[elementCount] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5) };
    const fcomplex rhs[elementCount] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5) };

    // The output starts as (1, 1) in every slot and is overwritten by the copy back.
    fcomplex sum[elementCount];
    for (int k = 0; k < elementCount; ++k) {
        sum[k] = Complex(1, 1);
    }

    // Add vectors in parallel.
    if (addWithCuda(sum, lhs, rhs, elementCount) != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.r = {%f,%f,%f,%f,%f}\n",
           sum[0].r, sum[1].r, sum[2].r, sum[3].r, sum[4].r);
    printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.i = {%f,%f,%f,%f,%f}\n",
           sum[0].i, sum[1].i, sum[2].i, sum[3].i, sum[4].i);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
// Intended to copy a and b to the GPU, run addKernel, and copy the result
// into c; returns the last recorded CUDA status.
// NOTE(review): this version is kept exactly as posted. The review notes below
// mark the places where the device pointers are misused.
cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size)
{
// NOTE(review): these three pointers are declared without a value and are
// never assigned anywhere in this function.
fcomplex *dev_a ;
fcomplex *dev_b ;
fcomplex *dev_c ;
cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    goto Error;
}

// Allocate GPU buffers for three vectors (two input, one output).
// NOTE(review): &dev_c->r is the address of a member reached through the
// unassigned dev_c, not the address of dev_c itself, so cudaMalloc is asked to
// store the new device pointer at an indeterminate location. The same applies
// to the dev_a and dev_b allocations below.
// NOTE(review): the status of the first call in each pair is overwritten by
// the second call before it is checked.
cudaStatus = cudaMalloc((void**)&dev_c->r, size * sizeof(float));
cudaStatus = cudaMalloc((void**)&dev_c->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

 cudaStatus = cudaMalloc((void**)&dev_a->r, size * sizeof(float));
 cudaStatus = cudaMalloc((void**)&dev_a->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_b->r, size * sizeof(float));
cudaStatus = cudaMalloc((void**)&dev_b->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

// Copy input vectors from host memory to GPU buffers.
// NOTE(review): &a->r is the address of the first element's r member. Because
// fcomplex stores r and i side by side, copying size * sizeof(float) bytes from
// there reads the first `size` floats of the interleaved r,i,r,i,... data, not
// the r members of all elements. The destination &dev_a->r is again derived
// from the unassigned dev_a.
cudaStatus = cudaMemcpy(&dev_a->r, &a->r, size * sizeof(float), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(&dev_a->i, &a->i, size * sizeof(float), cudaMemcpyHostToDevice);

if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(&dev_b->r, &b->r, size * sizeof(float), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(&dev_b->i, &b->i, size * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

// Launch a kernel on the GPU with one thread for each element.
// NOTE(review): addKernel indexes its arguments as arrays of fcomplex, which
// does not match the separate per-member float buffers attempted above.
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}

// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    goto Error;
}

// Copy output vector from GPU buffer to host memory.
// NOTE(review): same addressing problem as the host-to-device copies above.
cudaStatus = cudaMemcpy(&c->r, &dev_c->r, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaStatus = cudaMemcpy(&c->i, &dev_c->i, size * sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

// Common exit path for both success and failure.
// NOTE(review): dev_c, dev_a and dev_b were never assigned, so these calls
// pass indeterminate pointer values to cudaFree.
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);

return cudaStatus;

}

This is broken:

fcomplex *dev_c ;
...
cudaStatus = cudaMalloc((void**)&dev_c->r, size * sizeof(float));

dev_c initially doesn’t point to anything valid, so you can’t store anything at dev_c->r, because you haven’t allocated any storage for dev_c.

When I run your code, this behavior leads to a seg fault. This is simply a lack of understanding of proper usage of pointers in C.

This methodology is also similarly broken with your cudaMemcpy operations.

Here’s a fixed code:

$ cat t399.cu
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

//-----------------------------about the complex number------------------------------------------------------------
// Single-precision complex value: real part followed by imaginary part.
typedef struct FCOMPLEX {
    float r;  // real component
    float i;  // imaginary component
} fcomplex;

// Packs a real part and an imaginary part into an fcomplex value.
// Usable from host and device code alike.
__host__ __device__
fcomplex Complex(float re, float im)
{
    fcomplex value = { re, im };  // members are declared in the order r, i
    return value;
}

// Complex addition: real and imaginary parts are summed independently.
// Usable from host and device code alike.
__host__ __device__
fcomplex Cadd(fcomplex a, fcomplex b)
{
    fcomplex sum = { a.r + b.r, a.i + b.i };
    return sum;
}

//---------------------------------------------------------------------------------------------------

cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size);

// Element-wise complex addition: c[i] = a[i] + b[i].
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. The element
// index is the flat global thread index, so a single-block launch behaves
// exactly as before (index == threadIdx.x) while multi-block launches now cover
// distinct elements instead of every block rewriting the first blockDim.x ones.
// There is no bounds check: the total number of launched threads must not
// exceed the number of elements in c, a and b.
__global__ void addKernel(fcomplex *c, const fcomplex *a, const fcomplex *b)
{
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    c[i] = Cadd(a[i], b[i]);
}

// Demo driver: sums two fixed five-element complex arrays via addWithCuda and
// prints the real and imaginary parts of the result. Exit code is 0 on
// success and 1 when the addition or the final device reset fails.
int main()
{
    const int count = 5;
    const fcomplex first[count]  = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5) };
    const fcomplex second[count] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5) };
    fcomplex result[count] = { Complex(1, 1), Complex(1, 1), Complex(1, 1), Complex(1, 1), Complex(1, 1) };

    // Add vectors in parallel.
    const cudaError_t addStatus = addWithCuda(result, first, second, count);
    if (addStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.r = {%f,%f,%f,%f,%f}\n",
           result[0].r, result[1].r, result[2].r, result[3].r, result[4].r);
    printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.i = {%f,%f,%f,%f,%f}\n",
           result[0].i, result[1].i, result[2].i, result[3].i, result[4].i);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    const cudaError_t resetStatus = cudaDeviceReset();
    if (resetStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Copies `size` elements of `a` and `b` to the GPU, runs addKernel to compute
// c[i] = a[i] + b[i], and copies the result back into `c`.
//
// Parameters:
//   c    - host output array with room for `size` elements
//   a, b - host input arrays of `size` elements each
//   size - number of elements; 0 is accepted and is a no-op
// Returns cudaSuccess, or the status of the first CUDA call that failed (a
// message is also written to stderr). Device buffers are released on every
// exit path.
//
// The kernel is launched as one block of `size` threads, so `size` must not
// exceed the device's maximum threads per block; a larger value is reported as
// a launch failure.
cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size)
{
    // Nothing to add: a launch with zero threads per block would be rejected
    // as an invalid configuration, so treat the empty case as success.
    if (size == 0) {
        return cudaSuccess;
    }

    // Start from NULL so the cleanup at Error: never hands an indeterminate
    // pointer to cudaFree when an early step fails (cudaFree(NULL) is a no-op).
    fcomplex *dev_a = NULL;
    fcomplex *dev_b = NULL;
    fcomplex *dev_c = NULL;
    const size_t bytes = size * sizeof(fcomplex);
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Shared exit path for success and failure: release whatever was allocated.
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
$ nvcc -arch=sm_61 -o t399 t399.cu
$ cuda-memcheck ./t399
========= CUDA-MEMCHECK
{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.r = {2.000000,4.000000,6.000000,8.000000,10.000000}
{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.i = {2.000000,4.000000,6.000000,8.000000,10.000000}
========= ERROR SUMMARY: 0 errors
$

(1) Thanks for txbob’s reply. Your reply is one way to solve my problem completely. Thank you again.
But I wonder how to handle the problem the way I did in my code: I mean treating the complex array as two separate parts, real and imaginary, and allocating GPU buffers for the real and imaginary parts separately. How should I do that?

(2) Another question. If I create a structure as follows:
// Detection grid description, aligned to a 16-byte boundary.
typedef struct __align__(16) {
    float dr; // Detection grid resolution, r-direction [cm]
    float dz; // Detection grid resolution, z-direction [cm]
    int na;   // Number of grid elements in angular-direction
    int nr;   // Number of grid elements in r-direction
    int nz;   // Number of grid elements in z-direction
} DetStruct;
and I use the struct DetStruct as the element type to create an array Detstruct_all_dev[20],
could I allocate GPU buffers for Detstruct_all_dev[20] as follows:

DetStruct * Detstruct_all_dev
cudaStatus = cudaMalloc((void**)&Detstruct_all_dev, size * sizeof(DetStruct));

???