I created a my complex Structures and writed the codes about complex array add with cuda,but the codes can not work well , can anyone help me?
the codes are as followed:
#include <math.h>
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdio.h>
//-----------------------------about the complex number------------------------------------------------------------
typedef struct FCOMPLEX {
float r;
float i;} fcomplex;
host device
fcomplex Complex(float re, float im)
{
fcomplex c;
c.r=re;
c.i=im;
return c;
}
host device
fcomplex Cadd(fcomplex a, fcomplex b)
{
fcomplex c;
c.r=a.r+b.r;
c.i=a.i+b.i;
return c;
}
//---------------------------------------------------------------------------------------------------
cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size);
global void addKernel(fcomplex *c, const fcomplex *a, const fcomplex *b)
{
int i = threadIdx.x;
c[i]=Cadd( a[i], b[i]);
}
int main()
{
const int arraySize = 5;
const fcomplex a[arraySize] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5)};
const fcomplex b[arraySize] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5) };
fcomplex c[arraySize] = { Complex(1, 1),Complex(1, 1) ,Complex(1, 1) ,Complex(1, 1) ,Complex(1, 1) };
// Add vectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.r = {%f,%f,%f,%f,%f}\n",
c[0].r, c[1].r, c[2].r, c[3].r, c[4].r);
printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.i = {%f,%f,%f,%f,%f}\n",
c[0].i, c[1].i, c[2].i, c[3].i, c[4].i);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size)
{
fcomplex *dev_a ;
fcomplex *dev_b ;
fcomplex *dev_c ;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c->r, size * sizeof(float));
cudaStatus = cudaMalloc((void**)&dev_c->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a->r, size * sizeof(float));
cudaStatus = cudaMalloc((void**)&dev_a->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b->r, size * sizeof(float));
cudaStatus = cudaMalloc((void**)&dev_b->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(&dev_a->r, &a->r, size * sizeof(float), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(&dev_a->i, &a->i, size * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(&dev_b->r, &b->r, size * sizeof(float), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(&dev_b->i, &b->i, size * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(&c->r, &dev_c->r, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaStatus = cudaMemcpy(&c->i, &dev_c->i, size * sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}