Hi Robert, I’m working with seung51hoon now.
I have prepared complete code that reproduces this problem:
void main() {
int frameWidth = 4096;
int frameHeight = 4096;
unsigned int pitch = 4096;
int imageWidth = 718;
int imageHeight = 956;
uint8_t * srcBuffer;
uint8_t * refBuffer;
uint8_t * scratchBuffer;
unsigned int sizeScratch;
float * result;
unsigned int offset;
NppiSize size = {
frameWidth,
frameHeight
};
NppiSize lumaROI = {
imageWidth,
imageHeight
};
NppiSize chromaROI = {
imageWidth / 2,
imageHeight / 2
};
cuInit(0);
cudaMalloc(&srcBuffer, frameWidth * frameHeight * 1.5);
cudaMalloc(&refBuffer, frameWidth * frameHeight * 1.5);
nppiSSIMGetBufferHostSize_8u_C1R(size, &sizeScratch);
sizeScratch *= 3;
cudaMalloc((void **)&scratchBuffer, sizeScratch);
cudaMalloc((void **)&result, sizeof(Npp32f) * 1);
printf("src : %p, ref : %p, scratch : %p\n", srcBuffer, refBuffer, scratchBuffer);
//Y
printf("Luma\n");
nppiSSIM_8u_C1R(srcBuffer, pitch, refBuffer, pitch, lumaROI, result, scratchBuffer);
//U
printf("U\n");
offset = pitch * frameHeight;
nppiSSIM_8u_C1R(srcBuffer + offset, pitch / 2, refBuffer + offset, pitch / 2, chromaROI, result, scratchBuffer);
//V
printf("V\n");
offset += (pitch * frameHeight / 4);
nppiSSIM_8u_C1R(srcBuffer + offset, pitch / 2, refBuffer + offset, pitch / 2, chromaROI, result, scratchBuffer);
cudaFree(srcBuffer);
cudaFree(refBuffer);
cudaFree(scratchBuffer);
}
When I use
sizeScratch *= 3;
there is no problem:
========= CUDA-MEMCHECK
src : 0x7fb08e000000, ref : 0x7fb080000000, scratch : 0x7fb081800000
Luma
U
V
========= ERROR SUMMARY: 0 errors
But when I decrease the multiplier below 3 (for example, to 2 or 1), the following occurs:
cuda-memcheck ./pymTest sample/test.conf
========= CUDA-MEMCHECK
src : 0x7f00b0000000, ref : 0x7f00ae000000, scratch : 0x7f00af800000
Luma
U
========= Invalid __global__ write of size 4
========= at 0x000019f0 in void TwoPassFilter32f<float, int=1, SSIMTwoPassFunctor<float, int=1>, int=11>(Image<float, int=1>, Image<float, int=1>, Image<float, int=1>, Image<float, int=1>, Image<float, int=1>, NppiSize, float)
========= by thread (31,7,0) in block (7,6,0)
========= Address 0x7f00afbe02b8 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x24c3ad]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x256312]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x256507]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x285295]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x239712]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x23d7b3]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 (nppiSSIM_8u_C1R + 0x13) [0x23a263]
========= Host Frame:/home1/irteam/test/js100/pym/libpym.so (test + 0x1fb) [0x1c88f]
========= Host Frame:./pymTest (main + 0x19) [0x3708]
========= Host Frame:/usr/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21c05]
========= Host Frame:./pymTest [0x1499]
=========
========= Invalid __global__ write of size 4
========= at 0x000019f0 in void TwoPassFilter32f<float, int=1, SSIMTwoPassFunctor<float, int=1>, int=11>(Image<float, int=1>, Image<float, int=1>, Image<float, int=1>, Image<float, int=1>, Image<float, int=1>, NppiSize, float)
========= by thread (30,7,0) in block (7,6,0)
========= Address 0x7f00afbe02b4 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x24c3ad]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x256312]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x256507]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x285295]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x239712]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x23d7b3]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 (nppiSSIM_8u_C1R + 0x13) [0x23a263]
========= Host Frame:/home1/irteam/test/js100/pym/libpym.so (test + 0x1fb) [0x1c88f]
========= Host Frame:./pymTest (main + 0x19) [0x3708]
========= Host Frame:/usr/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21c05]
========= Host Frame:./pymTest [0x1499]
=========
========= Invalid __global__ write of size 4
========= at 0x000019f0 in void TwoPassFilter32f<float, int=1, SSIMTwoPassFunctor<float, int=1>, int=11>(Image<float, int=1>, Image<float, int=1>, Image<float, int=1>, Image<float, int=1>, Image<float, int=1>, NppiSize, float)
========= by thread (29,7,0) in block (7,6,0)
========= Address 0x7f00afbe02b0 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x24c3ad]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x256312]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x256507]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x285295]
========= Host Frame:/usr/local/cuda-9.2/lib64/libnppist.so.9.2 [0x239712]
Could you give us a hint?
We have been spending a lot of time on this problem since we decided to apply SSIM in our application.