I am trying to set up a large arrary on the device and then copy smaller sections to it. I hope to use the large array as a ring buffer for frames.
In C++ I would do it like this
// Create Buffer
pFilterBuff = new float[ImageSize*FilterNumFrames];
ppFilterFrame = new float*[FilterNumFrames];
for(int i=0;i<FilterNumFrames;i++){
(1) ppFilterFrame[i] = &pFilterBuff[i*ImageSize];
memset(ppFilterFrame[i],0,(sizeof(float)*ImageSize));
}
..... Some Host Code to load frame from file ....
for( int i=0;i<NumFrames;i++){
//Calculate Indexs for my buffer
int BuffIndx = i % FilterNumFrames;
int StartIndx = BuffIndx+1;
// Load Cuurent Frame to Buffer
memcpy(ppFilterFrame[BuffIndx],pCurrentFrame,(sizeof(float)*ImageSize));
//Do Calulations on Buffer
for(int i=StartIndx;i<StartIndx+FilterNumFrames;i++){
Indx = i%FilterNumFrames;
for(int j=0;j<ImageSize;j++){
(2) pResultFrame[j] += Alpha[AlphaIndex]*ppFilterFrame[Indx][j];
}
AlphaIndex++;
}
}
I have got a version of this to compile in cuda and in emulation it gives the correct answer, But when I run it out of emulation mode it givels an unknown launch failure.
Below is what I am doing now (I am only showing that deals code for the buffer).
float* dg_ppFilterBuffer[FilterNumFrames];
//Allocate Memory on Device
for(int i=0;i<FilterNumFrames;i++){
CUDA_SAFE_CALL(cudaMalloc((void**)&dg_ppFilterBuffer[i],(sizeof(float)*ImageSize)));
}
//Copy Current Frame to Device
CUDA_SAFE_CALL(cudaMemcpy(dg_ppFilterBuffer[BuffIndx],pThisFrame,(sizeof(float)*ImageSize),cudaMemcpyHostToDevice));
//Perform Filter
if(nFrameNum >= FilterNumFrames-1){
FilterFrames_Global_kernel_new<<<grid,threads>>>(StartIndx,dg_ppFilterBuffer,dg_pResultFrame);
}
Here is my Kernel Code
__global__ void
FilterFrames_Global_kernel_new(int StartIndex,float** dg_pFilterFrame,float* dg_pResultFrame)
{
int Findx = 0;
int Mindex = (blockIdx.y*ImageW) + (blockIdx.x*(blockDim.x)) + threadIdx.x;
int Alphaindex = 0;
pSums[threadIdx.x] = 0;
for(int i=StartIndex;i<StartIndex+FilterNumFrames;i++){
Findx = (i%FilterNumFrames);
pSums[threadIdx.x] += dg_Alpha[Alphaindex] * dg_pFilterFrame[Findx][Mindex];
Alphaindex++;
}
dg_pResultFrame[Mindex] = pSums[threadIdx.x];
__syncthreads();
}
The only solution I have found is to build the buffer on the host computer and then recopy the whole buffer every frame, but this make the code run slower on the card then the host. I am sure it is because of the copy time.
Does anyone have any suggestions to this problem? Can anyonje see where I have an error?
Thanks!