Hi!
I am using CUDA to speed up the calculation of a 1-D convolution for two arrays. The device I am using is a Jetson Nano.
My program gets two arrays from shared memory, and their sizes from the command line. It then uses CUDA to calculate the convolution between A and B.
This is my code.
Note: I cut out the part where the arrays are copied from shared memory. That part works fine, but it is big and messy.
#define USECPSEC 1000000ULL
#define nTPB 256
#define mytype double
/**
 * Full 1-D convolution: out[k] = sum over i of A[i] * B[k - i], for k = 0 .. N-2.
 *
 * Preconditions (same contract the original body relied on):
 *  - A and B must each be readable up to index N-2: for k = N-2 the sum
 *    touches A[0..N-2] and B[0..N-2].
 *    NOTE(review): main() passes N = sizeA + sizeB while allocating only
 *    sizeA elements for A and sizeB for B, so for unequal/large arrays these
 *    reads go out of bounds. Fixing that requires passing the two sizes as
 *    separate parameters — an interface change, flagged rather than made here.
 *  - out must hold at least N-1 elements.
 *
 * Grid-stride loop: produces all N-1 outputs for ANY launch configuration.
 * The original one-thread-per-output form silently skipped out[N-2] whenever
 * the caller launched fewer than N-1 threads — which main()'s
 * ceil((N-2)/nTPB) grid does exactly when N-2 is a multiple of nTPB.
 */
__global__ void conv_Kernel2(const mytype * __restrict__ A, const mytype * __restrict__ B, mytype *out, const int N){
  const int stride = blockDim.x * gridDim.x;
  for (int idx = threadIdx.x + blockDim.x * blockIdx.x; idx < N - 1; idx += stride){
    mytype my_sum = 0;
    // Under the guard idx < N-1, the original's (idx >= N) branch was
    // unreachable, so its condition reduces to i <= idx.
    for (int i = 0; i <= idx; i++)
      my_sum += A[i] * B[idx - i];
    out[idx] = my_sum;
  }
}
// Positions of the expected command-line arguments inside argv;
// ARG_NUM doubles as the minimum expected argc.
enum ARGS{ARG_NAME, ARG_VEC1_SIZE, ARG_VEC2_SIZE, ARG_NUM};
/**
* this program gets via shared memory two arrays of double, called A and B, and gets as arguments the sizes of A and B.
*/
int main(int argc, char *argv[]){
double *h_A, *d_A, *h_result, *d_result, *h_B, *d_B;
cout<<"initializing ..."<<endl;
//----------------------------------------------------
//get A and B (2 arrays of double) from shared memory copy them into h_A and h_B
//----------------------------------------------------
//Take the size of the two vectors from command line...
if (argc < ARG_NUM)
    throw runtime_error("Error - expected two arguments: sizeA sizeB");
const int arg_vecA_size = atoi(argv[ARG_VEC1_SIZE]); //size of A
const int arg_vecB_size = atoi(argv[ARG_VEC2_SIZE]); //size of B
if (arg_vecA_size <= 0 || arg_vecB_size <= 0)
    throw runtime_error("Error - vector sizes must be positive integers");
// All byte counts in size_t so large sizes cannot overflow int arithmetic.
// NOTE(review): the full convolution has sizeA+sizeB-1 elements; this program
// allocates one extra slot (sizeA+sizeB) and passes that sum as N, matching
// the kernel's idx < N-1 guard.
const size_t n_out     = (size_t)arg_vecA_size + (size_t)arg_vecB_size;
const size_t out_bytes = n_out * sizeof(mytype);
cout<<"Allocating memory ..."<<endl;
//allocation of memory
h_result = (double *)malloc(out_bytes);
if (h_result == NULL)
    throw runtime_error("Error - malloc of h_result");
//Allocation of cuda memory
if(cudaMalloc(&d_B, arg_vecB_size * sizeof(mytype)) != cudaSuccess){
throw runtime_error("Error - cudaMalloc of d_B");
};
if(cudaMalloc(&d_A, arg_vecA_size * sizeof(mytype))!=cudaSuccess){
throw runtime_error("Error - cudaMalloc of d_A");
};
if(cudaMalloc(&d_result, out_bytes)!=cudaSuccess){
throw runtime_error("Error - cudaMalloc of d_result");
};
for (size_t i=0; i < n_out; i++){
h_result[i] = 0;
}
cout<<"Copying memory on device..."<<endl;
//copy memory on device
if(cudaMemset(d_result, 0, out_bytes)!=cudaSuccess){
throw runtime_error("Error on cudaMemset of d_result"); // was mislabeled "cudaMemcpy"
};
if(cudaMemcpy(d_A, h_A, arg_vecA_size * sizeof(mytype), cudaMemcpyHostToDevice)!=cudaSuccess){
throw runtime_error(" Error on cudaMemcpy of d_A");
};
if(cudaMemcpy(d_B, h_B, arg_vecB_size * sizeof(mytype), cudaMemcpyHostToDevice)!=cudaSuccess){
throw runtime_error("Error on cudaMemcpy of d_B");
};
cout<<"Launching Kernel..."<<endl;
// The convolution has n_out-1 valid outputs (indices 0..n_out-2), so n_out-1
// threads are needed. The original ceil-division used n_out-2, which drops
// the last output element whenever n_out-2 is an exact multiple of nTPB.
const int n_threads_needed = (int)(n_out - 1);
conv_Kernel2<<<(n_threads_needed + nTPB - 1)/nTPB, nTPB>>>(d_A, d_B, d_result, (int)n_out);
// A kernel launch is asynchronous and carries no status of its own:
// configuration errors surface via cudaGetLastError(), execution errors
// surface at the next synchronizing call. On a Jetson Nano with a display
// attached, a long-running kernel is killed by the watchdog and that shows
// up here (or in a later cudaMemcpy) as cudaErrorLaunchTimeout (702) — the
// copy itself is not the slow part.
if(cudaGetLastError() != cudaSuccess)
    throw runtime_error("Error on kernel launch of conv_Kernel2");
cudaError_t error = cudaDeviceSynchronize();
if( error != cudaSuccess){
cerr << "error number is "<< error << " (" << cudaGetErrorString(error) << ")" << endl;
throw runtime_error("Error during kernel execution of conv_Kernel2");
};
error = cudaMemcpy(h_result, d_result, out_bytes, cudaMemcpyDeviceToHost);
if( error != cudaSuccess){
cerr << "error number is "<< error << " (" << cudaGetErrorString(error) << ")" << endl;
throw runtime_error("Error on cudaMemcpy of d_result back to host"); // message used to say "cudaMalloc"
};
// Release device and host allocations before exiting.
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_result);
free(h_result);
return 0;
}
This code works correctly with small A and B.
The problem arises when A and B are big (e.g. 705,830 elements in A and 794,029 in B; note the code
allocates h_result with arg_vecA_size + arg_vecB_size = 1,499,859 elements — not 705830 × 794029).
In that case I get the exception `runtime_error("Error on cudaMalloc of d_result back to host")`,
which corresponds to the `cudaMemcpy` from device to host.
The value returned by `cudaMemcpy` is 702, which corresponds to `cudaErrorLaunchTimeout`.
At first this made me think the process of copying the full array back to RAM is too slow to complete
in time — but note that `cudaMemcpy` also reports errors left over from the preceding asynchronous
kernel launch, so the timeout may actually come from the kernel itself exceeding the display
watchdog limit rather than from the copy.
Does anyone have an idea of what could be causing it? And how to resolve it? Thanks