I have the following host function, which sets up and launches the kernel:
// Host-side entry point: allocates device buffers, uploads the inputs, launches
// the evolve kernel, and copies the per-thread fitness values back into sFitness.
//
// In the lines shown here only sPWM, srbackgSeq and sFitness are referenced; the
// remaining parameters are presumably consumed by the elided cudaMemcpy calls —
// TODO confirm (argc, argv, popSize, scoreMotif and scoreBackg in particular).
//
// NOTE(review): every allocation and copy below uses a fixed, hard-coded byte
// count instead of popSize / matrixLen / numMotifSeq / numBackgSeq. Each host
// buffer passed in must therefore be at least as large as the matching *_size
// value — verify against the caller.
extern "C" void
runCuda(const int argc, const char** argv, float *sPWM,
int popSize, int matrixLen, float specificity, float *sFitness,
int numMotifSeq, char *smotifSeq, char *srmotifSeq,
int *motifSeqLen, int numBackgSeq, char *sbackgSeq, char *srbackgSeq,
int *backgSeqLen, float *scoreMotif, float *scoreBackg)
{
// NOTE(review): CUT_DEVICE_INIT is defined outside this snippet; it runs on
// every call to runCuda — confirm that repeated initialisation is intended.
CUT_DEVICE_INIT();
// Device-side pointers (the "d" prefix mirrors the host parameter names)
float *dsPWM, *dsFitness, *dspecificity;
int *dmotifSeqLen, *dbackgSeqLen, *dmatrixLen, *dnumMotifSeq, *dnumBackgSeq;
char *dsmotifSeq, *dsrmotifSeq, *dsbackgSeq, *dsrbackgSeq;
// Byte size of each device buffer (fixed upper bounds, independent of the arguments)
int dsPWM_size = sizeof(float)*100*17*4;
// 100 floats: one per thread of the 10 x 10 launch configured below
int dsFitness_size = sizeof(float)*100;
int dmotifSeqLen_size = sizeof(int)*5001;
int dbackgSeqLen_size = sizeof(int)*5001;
// Scalars are passed to the kernel through single-element device buffers
int dmatrixLen_size = sizeof(int)*1;
int dspecificity_size = sizeof(float)*1;
int dnumMotifSeq_size = sizeof(int)*1;
int dnumBackgSeq_size = sizeof(int)*1;
int dsmotifSeq_size = sizeof(char)*302*10001;
int dsrmotifSeq_size = sizeof(char)*302*10001;
int dsbackgSeq_size = sizeof(char)*1500*10001;
int dsrbackgSeq_size = sizeof(char)*1500*10001;
// allocate device memory (done on every call and released again at the end)
CUDA_SAFE_CALL(cudaMalloc((void**) &dsPWM, dsPWM_size));
.... (all the other 10 malloc)
CUDA_SAFE_CALL(cudaMalloc((void**) &dsrbackgSeq, dsrbackgSeq_size));
// copy host memory to device; each copy reads the full *_size bytes from the host pointer
CUDA_SAFE_CALL(cudaMemcpy(dsPWM, sPWM, dsPWM_size, cudaMemcpyHostToDevice));
.... (all the other 10 memcpy)
CUDA_SAFE_CALL(cudaMemcpy(dsrbackgSeq, srbackgSeq, dsrbackgSeq_size, cudaMemcpyHostToDevice));
// setup execution parameters: 10 blocks of 10 threads, i.e. 100 threads in total
dim3 grid(10, 1, 1);
dim3 threads(10, 1, 1);
// execute the kernel
evolve<<< grid, threads >>> (dsPWM, dmatrixLen, dspecificity, dsFitness,
dnumMotifSeq, dsmotifSeq, dsrmotifSeq, dmotifSeqLen,
dnumBackgSeq, dsbackgSeq, dsrbackgSeq, dbackgSeqLen);
// check if kernel execution generated an error
// NOTE(review): CUT_CHECK_ERROR is defined outside this snippet; confirm that it
// is active in the build configuration being used.
CUT_CHECK_ERROR("Kernel execution failed");
// copy results from device to host
CUDA_SAFE_CALL(cudaMemcpy(sFitness, dsFitness, dsFitness_size, cudaMemcpyDeviceToHost));
// cleanup memory
CUDA_SAFE_CALL(cudaFree(dsPWM));
... (all the free)
}
The kernel is simple
// Per-thread fitness evaluation: each thread transforms one PWM, scores the
// motif and background sequence sets against it, and writes one ROC-based
// fitness value to sFitness.
//
// Launch layout: 1D grid of 1D blocks. Thread `tid` reads its PWM starting at
// sPWM[tid * matLen * 4] and writes sFitness[tid]. There is no bounds guard, so
// the launch must not create more threads than there are PWMs / sFitness slots.
//
// Parameters (all device pointers):
//   sPWM          packed PWMs, matLen * 4 floats per thread
//   matrixLen     single int, read once into matLen
//   specificity   single float; 1.0f - specificity is passed to cal_ROC_s
//   sFitness      output, one float per thread
//   numMotifSeq / numBackgSeq   single ints, sequence counts
//   smotifSeq, srmotifSeq, motifSeqLen   motif sequence data, forwarded to score_seq_s
//   sbackgSeq, srbackgSeq, backgSeqLen   background sequence data, forwarded to score_seq_s
//
// NOTE(review): stPWM holds 100 floats while a PWM spans matLen * 4 floats —
// presumably matLen <= 25 is required; verify against transform_pwm_s.
// NOTE(review): fscoreMotif (302) and fscoreBackg (1501) are per-thread arrays of
// about 7.6 KB in total; they most likely end up in slow local memory — confirm
// with the compiler's resource-usage output.
__global__ void evolve(float *sPWM, int *matrixLen, float *specificity, float *sFitness,
                       int *numMotifSeq, char *smotifSeq, char *srmotifSeq, int *motifSeqLen,
                       int *numBackgSeq, char *sbackgSeq, char *srbackgSeq, int *backgSeqLen)
{
    // Read the scalar inputs from global memory once and reuse the local copies.
    const int   matLen = *matrixLen;
    const float spec   = *specificity;
    const int   nMotif = *numMotifSeq;
    const int   nBackg = *numBackgSeq;

    // Per-thread scratch storage.
    float stPWM[100];
    float fsumMin, fsumMax;
    float fscoreMotif[302], fscoreBackg[1501];

    // Flat global thread index; selects this thread's PWM and output slot.
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Build the working matrix for this thread and its min/max position sums.
    transform_pwm_s(&sPWM[tid * matLen * 4], stPWM, matLen);
    sum_position_min_s(&fsumMin, stPWM, matLen);
    sum_position_max_s(&fsumMax, stPWM, matLen);

    // Score both sequence sets against the working matrix.
    score_seq_s(nMotif, smotifSeq, srmotifSeq, motifSeqLen,
                stPWM, matLen, fsumMin, fsumMax, fscoreMotif);
    score_seq_s(nBackg, sbackgSeq, srbackgSeq, backgSeqLen,
                stPWM, matLen, fsumMin, fsumMax, fscoreBackg);

    // This store is the kernel's only visible output.
    sFitness[tid] = cal_ROC_s(fscoreMotif, nMotif, fscoreBackg, nBackg, 1.0f - spec);
}
From main() I want to call runCuda multiple times. I have the following problems:
-
Without the assignment of the result (sFitness[tid] = …) in the kernel, the code runs in about 0.06 seconds; when I assign the result back, it takes 66 seconds to run. What happened?
-
I think I’m using too much memory, but it is not clear to me how much is too much. In the kernel, when I declare fscoreMotif with more than 500 elements, I get a compilation error (using too much local memory). At the current size, it compiles without complaint.
-
In main(), the first time I call runCuda things work perfectly: I get the correct result for the data I passed in (after 66 seconds). The second iteration, however, runs in just 18 seconds and gives me a wrong result, and the third iteration causes a segmentation fault. I’m not sure what went wrong. Is there any way to reset everything to its initial state after every call to runCuda?
-
I want to put CUDA_EXIT(argc, argv) inside runCuda, but when it runs, it prints a message saying something like it cannot parse argv. Can anyone give me any idea about this?
Thanks a lot and best regards