I’ve written a simple program to experiment with Unified memory in CUDA. The main function runs until I try to retrieve the data from shared memory [line 103].
However, if I change line 40 to something like output[index] = input[index]*0.2f; then the program runs properly and displays a correct (darker) image. In my mind the current line 40 and the above mentioned alternative are both doing the same thing: assigning a value to the array. There must be some nuance that I am missing. Can someone please shed some light on where I am going wrong?
void __device__ getCoordFromIndex(int index, int stride, int channels, int* X, int* Y, int* C){
*Y = (index / stride);
*X = (index - (*Y*stride)) / channels;
*C = index - (*Y*stride) - (*X*channels);
}
void __global__ cudaBoxBlur(unsigned char* input, unsigned char* output, int width, int height, int channels, int radius){
int index = blockIdx.x * blockDim.x + threadIdx.x;
int W = width;
int H = height;
int C = channels;
int rowstride = W*C;
int samples = radius * 2 + 1;
samples *= samples;
float sum = 0.0f;
int X, Y, Z;
X = 0;
Y = 0;
Z = 0;
getCoordFromIndex(index,rowstride,C,&X,&Y,&Z);
//add border condition checks
for (int y = -radius; y <= radius; y++){
int rowoffset = 0;
rowoffset = min(max(Y+y,0),H-1);
rowoffset*=rowstride;
for (int x = -radius; x <= radius; x++){
int coloffset = 0;
coloffset = min(max(X+x,0),W-1);
coloffset *= C;
int i = index + coloffset + rowoffset;
sum += input[i];
}
}
sum /= samples;
unsigned char out = unsigned char(int(sum+0.5f));
output[index] = out;
}
void toSharedMem(cv::Mat* image, unsigned char* shared){
int p = 0;
int H = image->rows;
int W = image->cols;
int C = image->channels();
for (int y = 0; y < H; y++){
uchar* row = image->ptr<uchar>(y);
std::copy_n(row,W*C,&shared[p]);
p+=W*C;
}
}
void fromSharedMem(cv::Mat* output, unsigned char* shared){
int p = 0;
int H = output->rows;
int W = output->cols;
int C = output->channels();
int rowlength = W*C;
std::cout<<H<<" "<<W<<" "<<C<<std::endl;
for (int y = 0; y < H; y++){
uchar* row = output->ptr<uchar>(y);
std::cout<<"actual copy"<<std::endl;
std::copy_n(&shared[p],rowlength,row);
p+=W*C;
}
}
int main(int argc, char** argv){
cudaSetDevice(1);
cv::Mat input = cv::imread(argv[1],-1);
int H = input.rows;
int W = input.cols;
int C = input.channels();
unsigned char* input_imgdata = new unsigned char[H*W*C];
unsigned char* output_imgdata = new unsigned char[H*W*C];
cudaMallocManaged(&input_imgdata, H*W*C*sizeof(char));
cudaMallocManaged(&output_imgdata,H*W*C*sizeof(char));
toSharedMem(&input,input_imgdata);
toSharedMem(&input,output_imgdata);
int blockSize = 1024;
int numBlocks = (H*W*C + blockSize - 1) / blockSize;
int radius = 10;
std::cout << "Begin\n";
cudaBoxBlur<<<numBlocks,blockSize>>>(input_imgdata, output_imgdata, W, H, C, radius);
cudaDeviceSynchronize();
cv::Mat output;
input.copyTo(output);
output*=0;
std::cout<<"Copying"<<std::endl;
fromSharedMem(&output,output_imgdata);
std::cout<<"Copied"<<std::endl;
cv::resize(output,output,cv::Size(1280,960));
cv::imshow("",output);
int k = (int)cv::waitKey(0);
cudaFree(input_imgdata);
cudaFree(output_imgdata);
return 0;
}