Hi!
When I run my program I get no errors (every cudaError_t check returns cudaSuccess), but the data after my kernel processing looks like it didn't change at all.
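To be concrete, the kind of post-launch check I mean is sketched below; in my actual code the cudaDeviceSynchronize() right after the launch is checked through the CUDA_RUNTIME_HANDLE macro shown further down:

// Sketch of the standard post-launch check (in my code the cudaDeviceSynchronize()
// result is checked through the CUDA_RUNTIME_HANDLE macro):
cudaError_t launch_err = cudaGetLastError();       // synchronous launch errors (invalid configuration, too much shared memory, ...)
cudaError_t sync_err   = cudaDeviceSynchronize();  // errors raised while the kernel itself runs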
I tried to check my kernel's memory accesses, but running the program under cuda-memcheck makes it crash and reboots the system.
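I'm invoking it in the plain way (the binary name here is just a placeholder for my executable):

cuda-memcheck ./my_app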
I was able to isolate the problem (by commenting out sections of the code) and found that it happens when the data is loaded from global memory to shared memory.
I have an array W in global memory of 1024*512*16*8 float elements (representing a 4D array).
I launch the kernel with 8192 blocks, and I want each block (1024 threads) to load 32 KB of W from global to shared memory.
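To spell out the numbers (they all follow from the constants in the code below):

W total:     Np*Nrg*Nch*Nb = 1024*512*16*8        = 67,108,864 floats = 256 MB
Per block:   Nch*Nb*num_of_W_chunks = 16*8*64     = 8,192 floats      = 32 KB of shared memory
Grid:        (Np*Nrg)/num_of_W_chunks = 524288/64 = 8,192 blocks
Per thread:  8,192 floats / 1024 threads          = 8 elements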
I'm working on a Jetson AGX Xavier with CUDA 10.2.
Here is my code. Can someone tell me what I am doing wrong?
Thank you.
#include <iostream>
#include <cmath>          // sin() is used to generate the test data
#include <cuda_runtime.h>
#include <cuda.h>
#include <complex>

using namespace std;
#define CUDA_RUNTIME_HANDLE(cuda_err, string_API_name) \
{ \
    cudaError_t err = cuda_err; \
    if (err != cudaSuccess) \
        cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": " << string_API_name << ": " << cudaGetErrorString(err) << endl; \
}
typedef struct complexIn{
    int16_t real;
    int16_t imag;
} complexIn_t;

typedef struct complexOut{
    float real;
    float imag;
} complex_t;
__global__ void kernel_processing(complexIn_t *dataIn, complex_t *dataOut,
                                  float *W, int Np, int Nrg, int Nch, int Nb,
                                  int num_of_W_chunks, int elem_per_thread, int stride){
    extern __shared__ float W_shared[];

    // Load this block's chunk of W from global to shared memory:
    // each thread copies elem_per_thread elements, spaced `stride` apart.
    for(int i = 0; i < elem_per_thread; i++){
        W_shared[threadIdx.x + i*stride] = W[blockIdx.x*blockDim.x*elem_per_thread + threadIdx.x + i*stride];
    }
    __syncthreads();

    // Some processing
}
int main( int argc, char **argv ) {
    cout << "complex float size: " << sizeof(complex_t) << endl;
    cout << "complex int16 size: " << sizeof(complexIn_t) << endl;

    int Np  = 1024;
    int Nrg = 512;
    int Nch = 16;
    int Nb  = 8;

    complexIn_t *dataHostIn = new complexIn_t[Np*Nrg*Nch];
    complex_t *dataHostOut = new complex_t[Np*Nrg*Nb];
    float *W = new float[Np*Nrg*Nb*Nch];

    // Initialize dataHostIn, size Np*Nrg*Nch
    for(int i = 0; i < Np*Nrg*Nch; i++){
        dataHostIn[i].real = (int16_t)sin(17*i);
        dataHostIn[i].imag = (int16_t)sin(16*i);
    }
    // Initialize dataHostOut, size Np*Nrg*Nb
    for(int i = 0; i < Np*Nrg*Nb; i++){
        dataHostOut[i].real = 0;
        dataHostOut[i].imag = 0;
    }
    // Initialize W, size Np*Nrg*Nb*Nch
    for(int i = 0; i < Np*Nrg*Nb*Nch; i++){
        W[i] = (float)sin(25*i);
    }
    cout << endl;

    complexIn_t *dataDeviceIn;
    CUDA_RUNTIME_HANDLE(cudaMalloc(&dataDeviceIn, Np*Nrg*Nch*sizeof(complexIn_t)), "cudaMalloc");
    CUDA_RUNTIME_HANDLE(cudaMemcpy(dataDeviceIn, dataHostIn, Np*Nrg*Nch*sizeof(complexIn_t), cudaMemcpyDefault), "cudaMemcpy");

    float *W_Device;
    CUDA_RUNTIME_HANDLE(cudaMalloc(&W_Device, Np*Nrg*Nb*Nch*sizeof(float)), "cudaMalloc");
    CUDA_RUNTIME_HANDLE(cudaMemcpy(W_Device, W, Np*Nrg*Nb*Nch*sizeof(float), cudaMemcpyDefault), "cudaMemcpy");

    complex_t *dataDeviceOut;
    CUDA_RUNTIME_HANDLE(cudaMalloc(&dataDeviceOut, Np*Nrg*Nb*sizeof(complex_t)), "cudaMalloc");

    int W_chunk_size = Nch*Nb;                                             // 16*8 = 128 floats per chunk
    int num_of_W_chunks = 64;                                              // chunks handled per block
    int Nelements_in_W = W_chunk_size*num_of_W_chunks;                     // 8192 floats loaded per block
    int elem_per_thread = Nelements_in_W/1024;                             // 8
    int stride = Nelements_in_W/elem_per_thread;                           // 1024
    int shared_mem_per_block = W_chunk_size*num_of_W_chunks*sizeof(float); // 32 KB of dynamic shared memory
    int BlockDim = 1024;
    int GridDim = (Np*Nrg)/num_of_W_chunks;                                // 8192 blocks

    CUDA_RUNTIME_HANDLE(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    kernel_processing<<<GridDim, BlockDim, shared_mem_per_block>>>(dataDeviceIn,
        dataDeviceOut, W_Device, Np, Nrg, Nch, Nb, num_of_W_chunks, elem_per_thread, stride);
    CUDA_RUNTIME_HANDLE(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    CUDA_RUNTIME_HANDLE(cudaMemcpy(dataHostOut, dataDeviceOut, Np*Nrg*Nb*sizeof(complex_t), cudaMemcpyDefault), "cudaMemcpy");
    CUDA_RUNTIME_HANDLE(cudaFree(dataDeviceIn), "cudaFree");
    CUDA_RUNTIME_HANDLE(cudaFree(dataDeviceOut), "cudaFree");
    CUDA_RUNTIME_HANDLE(cudaFree(W_Device), "cudaFree");

    cout << "----------- Check the output: -----------" << endl;
    cout << "dataOut[555] = " << dataHostOut[555].real << " + " << dataHostOut[555].imag << "i" << endl;

    delete[] dataHostIn;
    delete[] dataHostOut;
    delete[] W;
    return 0;
}
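In case it helps, this is the kind of cut-down test I mean by "isolating the problem": the same global-to-shared load pattern as in kernel_processing, but with the shared buffer copied straight back to a separate global buffer so the host can compare it against W. This is a sketch of the idea, not my exact test code:

__global__ void load_only_test(const float *W, float *W_copy,
                               int elem_per_thread, int stride)
{
    extern __shared__ float W_shared[];

    // Same global-to-shared load pattern as in kernel_processing
    for (int i = 0; i < elem_per_thread; i++) {
        W_shared[threadIdx.x + i*stride] =
            W[blockIdx.x*blockDim.x*elem_per_thread + threadIdx.x + i*stride];
    }
    __syncthreads();

    // Copy the shared buffer back out so the host can compare it element by element with W
    for (int i = 0; i < elem_per_thread; i++) {
        W_copy[blockIdx.x*blockDim.x*elem_per_thread + threadIdx.x + i*stride] =
            W_shared[threadIdx.x + i*stride];
    }
}

// Launched with the same configuration as kernel_processing, e.g.:
// load_only_test<<<GridDim, BlockDim, shared_mem_per_block>>>(W_Device, W_copy_Device, elem_per_thread, stride);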