Hi,
I’m trying to transfer a data to a KUDA kernel, do some processings and have the output. I have already evaluated the Kernel in visual studio and it works as it should. However, when I make a MEX file out of the code, It does not provide me the output i expect. Here is my MEX gateway code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
using namespace std;
#include <mex.h>
__global__ void kernel_Reconstruction2(int* Dev_RfData, int* ReconstructedImage_GPU, int transmit, int
NStart_Transmit) {
int TID = threadIdx.y * blockDim.x + threadIdx.x;
int BlockOFFset = blockDim.x * blockDim.y * blockIdx.x;
int RowOFFset = blockDim.x * blockDim.y * gridDim.x * blockIdx.y;
int GID = RowOFFset + BlockOFFset + TID;
// here is the processing
}
void mexFunction(int nlhs, mxArray* plhs[ ],
int nrhs, const mxArray* prhs[ ]) {
int* RfData; // RF data; a pinned memory was dedicated to this
int* ReconstructedImage_GPU;
RfData = (int*)mxGetPr(prhs[0]);
plhs[0] = mxCreateNumericMatrix(1, 64 * 64, mxINT32_CLASS, mxREAL);
ReconstructedImage_GPU = (int*)mxGetData(plhs[0]);
printf("RfData : %d , %d, %d . \n", RfData[22], RfData[25], RfData[35]);
int ArrayByteSize_RfData = sizeof(int) * (96 * 96 * 4096);
int BYTES_PER_STREAM = ArrayByteSize_RfData / 96;
//Memory allocation: RfData ; we send the RF data to the device with streaming
int* Device_RfData; // device pointer to the RF data.
(cudaMalloc((int**)&Device_RfData, ArrayByteSize_RfData));
int* Device_ReconstructedImage_GPU; // device pointer to the reconstructed image
int ArrayByteSize_ReconstructedImage_GPU = sizeof(int) * (96*96);
(cudaMalloc((int**)&Device_ReconstructedImage_GPU, ArrayByteSize_ReconstructedImage_GPU));
printf("The CUDA reconstruction started... \n");
dim3 block(1024, 1);
dim3 grid(64 * 64, 96);//SystemSetup.NumberOfTransmitter
cudaStream_t* streams = new cudaStream_t[96]; //SystemSetup.NumberOfTransmitter
int NStart_Transmit{};
for (int transmit = 0; transmit < 96; transmit++) {
cudaStreamCreate(&streams[transmit]);
NStart_Transmit = transmit * (96 * 4096);
cudaMemcpyAsync(&Device_RfData[NStart_Transmit], &RfData[NStart_Transmit], BYTES_PER_STREAM, cudaMemcpyHostToDevice, streams[transmit]);
kernel_Reconstruction2 << <grid, block, 0, streams[transmit] >> > (&Device_RfData[NStart_Transmit], Device_ReconstructedImage_GPU, transmit, NStart_Transmit);
(cudaPeekAtLastError());
}
for (int transmit = 0; transmit < 96; transmit++) { cudaStreamDestroy(streams[transmit]); } // destroy the streams
delete[] streams;
cudaDeviceSynchronize();
(cudaMemcpy(ReconstructedImage_GPU, Device_ReconstructedImage_GPU,
ArrayByteSize_ReconstructedImage_GPU, cudaMemcpyDeviceToHost));
(cudaFree(Device_RfData));
(cudaFree(Device_ReconstructedImage_GPU));
}
As you can see, with " printf(“RfData : %d , %d, %d . \n”, RfData[22], RfData[25], RfData[35]);" i try to check the RfData. It is as it should be. I’m sure that the processing inside the Kernel is correct (I have already checked it out with my CUDA poject in visual studio). So, the only possible cause of problem in my opinion is "something is wrong with “Device_RfData” and the data available in GPU. I tried to use “printf” inside the kernel to check if the samples of “Dev_RfData” are correctly available to the threads or not, but it is not possible to use “printf” in a Kernel when i use a MEX file in matlab.
Am I missing something here? Please help.
Moein.