Hi,
I’m trying to write a mex gateway function to add two complex float array. I want to do it with atomic functions. here is my code:
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include "cuda.h"
#include <iostream>
#include <mex.h>
#include "gpu/mxGPUArray.h"
#include "matrix.h"
#include <thrust/complex.h>
#include <string.h>
//#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
//
//inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
//{
// if (code != cudaSuccess)
// {
// fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
// if (abort) exit(code);
// }
//}
//
typedef thrust::complex<float> fcomp;
__global__ void add(fcomp * Device_DataRes, fcomp * Device_Data1, fcomp * Device_Data2, int N) {
int TID = threadIdx.y * blockDim.x + threadIdx.x;
int BlockOFFset = blockDim.x * blockDim.y * blockIdx.x;
int GID_RowBased = BlockOFFset + TID;
if (GID_RowBased < N) {
//Device_DataRes[GID_RowBased] = Device_Data1[GID_RowBased] + Device_Data2[GID_RowBased];
// Device_Data1[GID_RowBased] = Device_Data1[GID_RowBased] + Device_Data2[GID_RowBased];
atomicAdd(&Device_Data1[GID_RowBased], Device_Data2[GID_RowBased]);
}
}
void mexFunction(int nlhs, mxArray* plhs[],
int nrhs, const mxArray* prhs[]) {
mxInitGPU();
int N = 1000;
int ArrayByteSize = sizeof(fcomp) * N;
fcomp* Device_Data1;
fcomp* Device_Data2;
fcomp* DataRes;
fcomp* Device_DataRes;
mxComplexSingle* Data1 = mxGetComplexSingles(prhs[0]);
mxComplexSingle* Data2 = mxGetComplexSingles(prhs[1]);
(cudaMalloc((void**)&Device_Data1, ArrayByteSize));
(cudaMemcpy(Device_Data1, Data1, ArrayByteSize, cudaMemcpyHostToDevice));
(cudaMalloc((void**)&Device_Data2, ArrayByteSize));
(cudaMemcpy(Device_Data2, Data2, ArrayByteSize, cudaMemcpyHostToDevice));
plhs[0] = mxCreateNumericMatrix(N, 1, mxSINGLE_CLASS, mxCOMPLEX);
DataRes = static_cast<fcomp*> (mxGetData(plhs[0]));
(cudaMalloc((void**)&Device_DataRes, ArrayByteSize));
dim3 block(1024);
int GridX = (N / block.x + 1);
dim3 grid(GridX);//SystemSetup.NumberOfTransmitter
add << <grid, block >> > (Device_DataRes, Device_Data1, Device_Data2, N);
(cudaMemcpy(DataRes, Device_Data1, ArrayByteSize, cudaMemcpyDeviceToHost));
cudaFree(Device_Data1);
cudaFree(Device_Data2);
cudaFree(Device_DataRes);
//mxGPUDestroyGPUArray(MediumX);
}
The code compiles (and runs ) fine if i comment the line related to the atomic add and use the line above it. I need to use atomic function as they raise a flag if two threads want to copy to the same location (not the case in this code of course).
is there any way to do it?
Thanks in advance.
Moein.