Ok
So, this is the output with (I hope) “proper CUDA error checking”:
Size of scratch buffer = 392
nppiMean_StdDev_32f_C1MR error output = 0
GPUassert: an illegal memory access was encountered NppBug_Mean_StdDev_32f_C1MR.cpp 82
Error number 77
So it seems to be an out-of-bounds problem that I can’t locate. I reviewed all the code and, according to the NPP documentation page 1828, section 7.101.2.5, all the parameters I’m passing to “nppiMean_StdDev_32f_C1MR” are correct.
Now it is easy to change the size of the input parameters with the defines I added to the code. The results are always the same.
I hope I’m making an embarrassing mistake, and this can be solved fast. Otherwise, if there is a bug in “nppiMean_StdDev_32f_C1MR” I would like to know.
Along with the code, I add a textual review of what I understand the NPP function expects as parameters.
Thank you very much.
Oscar
#include <cuda.h>
#include <cuda_runtime.h>
#include <npp.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
// Wraps a CUDA runtime call and reports any failure together with the source
// location of the call. The do/while(0) form makes the macro behave as a
// single statement, so it is safe inside an unbraced if/else.
#define gpuErrchk(ans) \
  do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Checks a CUDA runtime status code.
//
// code:  status returned by a CUDA runtime call.
// file:  source file of the call site (used only in the message).
// line:  source line of the call site (used only in the message).
// abort: when true (the default) the process exits with `code` as its exit
//        status after printing the diagnostic; when false the function only
//        prints and returns.
//
// Does nothing when `code` is cudaSuccess.
inline void gpuAssert(cudaError_t code,
                      const char *file,
                      int line,
                      bool abort = true) {
  if (code == cudaSuccess) {
    return;
  }
  fprintf(stderr, "GPUassert: %s %s %d\n",
          cudaGetErrorString(code), file, line);
  // Explicit cast: an enum should not be handed to a %d conversion as-is.
  fprintf(stderr, "Error number %d\n", static_cast<int>(code));
  if (abort) {
    exit(static_cast<int>(code));
  }
}
// Image dimensions in pixels: X_SIZE is the row width, Y_SIZE the row count.
#define X_SIZE 128
#define Y_SIZE 16
// Total pixel count. Parenthesized so the macro expands as a single value in
// any surrounding expression (e.g. `n / TOTAL_SIZE`, `TOTAL_SIZE % k`).
#define TOTAL_SIZE (X_SIZE * Y_SIZE)
int main(int argc, char* argv[]) {
NppiSize total_npp;
int scratchBuffSize;
Npp8u *d_scratch;
Npp64f mean_f = 13.0, std_f = 2.0;
Npp32f * d_input;
Npp8u * d_mask;
NppStatus err;
total_npp.width = X_SIZE;
total_npp.height = Y_SIZE;
scratchBuffSize = 0;
nppiMeanStdDevGetBufferHostSize_32f_C1MR(total_npp, &scratchBuffSize);
std::cout << "Size of scratch buffer = " << scratchBuffSize << std::endl;
gpuErrchk(cudaMalloc(reinterpret_cast<void **>(&d_scratch),
scratchBuffSize * sizeof(Npp8u)));
Npp32f * h_input2;
Npp8u* h_mask2;
h_input2 = reinterpret_cast<Npp32f*>(malloc(sizeof(Npp32f) * TOTAL_SIZE));
h_mask2 =
reinterpret_cast<Npp8u*>(malloc(sizeof(Npp8u) * TOTAL_SIZE));
for (int i = 0; i < TOTAL_SIZE; i++) {
h_input2[i] = 50.0;
h_mask2[i] = 1;
}
gpuErrchk(cudaMalloc(reinterpret_cast<void **>(&d_input),
sizeof(Npp32f) * TOTAL_SIZE));
gpuErrchk(cudaMalloc(reinterpret_cast<void **>(&d_mask),
sizeof(Npp8u) * TOTAL_SIZE));
gpuErrchk(cudaMemcpy(d_input, h_input2,
TOTAL_SIZE * sizeof(Npp32f), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_mask, h_mask2,
TOTAL_SIZE * sizeof(Npp8u), cudaMemcpyHostToDevice));
err = nppiMean_StdDev_32f_C1MR(d_input, X_SIZE * sizeof(Npp32f),
d_mask, X_SIZE * sizeof(Npp8u),
total_npp, d_scratch,
&mean_f, &std_f);
std::cout << "nppiMean_StdDev_32f_C1MR error output = "
<< int(err) << std::endl;
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
std::cout << "mean_f = " << mean_f << std::endl;
std::cout << "std_f = " << std_f << std::endl;
gpuErrchk(cudaFree(d_scratch));
gpuErrchk(cudaFree(d_input));
gpuErrchk(cudaFree(d_mask));
return 0;
}
nppiMean_StdDev_32f_C1MR parameter analysis:
d_input:
source image pointer.
It is a GPU 1D array in memory of size X_SIZE * Y_SIZE and type Npp32f.
Indexed as a 2D image by the kernel.
X_SIZE * sizeof(Npp32f):
source image line step.
As we don’t add any padding, the line step equals to X_SIZE * sizeof(Npp32f)
d_mask:
mask image pointer.
It is a GPU 1D array in memory of size X_SIZE * Y_SIZE and type Npp8u.
Indexed as a 2D image by the kernel.
X_SIZE * sizeof(Npp8u):
mask line step.
As we don’t add any padding, the line step equals to X_SIZE * sizeof(Npp8u)
total_npp:
region of interest.
It is a struct with integer (int) components “width” and “height”.
The region of interest in our case is all the image. Therefore “width = X_SIZE” and “height = Y_SIZE”.
d_scratch:
pointer to GPU scratch memory.
The size of this pointer is calculated with total_npp, using the function “nppiMeanStdDevGetBufferHostSize_32f_C1MR”, as told in the documentation.
&mean_f:
pointer to the computed mean.
Type Npp64f.
Initialized in the Host code so the variable is assigned a valid address in memory.
It should be “50.0” after execution.
&std_f:
pointer to the computed standard deviation.
Type Npp64f.
Initialized in the Host code so the variable is assigned a valid address in memory.
It should be “0” after execution.