I have the same problem with NPP graphcut (NPP_CUDA_KERNEL_EXECUTION_ERROR). The dataset I use seems to be correct (there are checks in the code below). Do you see any problems?
int main(int argc, char** argv) {
int* terminals = load("terminals.txt");
int* left_tr = load("left_tr.txt");
for (int y = 0; y < height; ++y)
assert(left_tr[y] == 0);
int* right_tr = load("right_tr.txt");
for (int y = 0; y < height; ++y)
assert(right_tr[(width - 1) * height + y] == 0);
int* top = load("top.txt");
for (int x = 0; x < width; ++x)
assert(top[x] == 0);
int* bottom = load("bottom.txt");
for (int x = 0; x < width; ++x)
assert(bottom[(height - 1) * width + x] == 0);
int step, step_tr, step_labels;
Npp32s* d_terminals = nppiMalloc_32s_C1(width, height, &step);
cout << cudaMemcpy2D(d_terminals, step, terminals, width * sizeof(int), width * sizeof(int), height, cudaMemcpyHostToDevice) << endl;
Npp32s* d_left_tr = nppiMalloc_32s_C1(height, width, &step_tr);
cout << cudaMemcpy2D(d_left_tr, step_tr, left_tr, height * sizeof(int), height * sizeof(int), width, cudaMemcpyHostToDevice) << endl;
Npp32s* d_right_tr = nppiMalloc_32s_C1(height, width, &step_tr);
cout << cudaMemcpy2D(d_right_tr, step_tr, right_tr, height * sizeof(int), height * sizeof(int), width, cudaMemcpyHostToDevice) << endl;
Npp32s* d_top = nppiMalloc_32s_C1(width, height, &step);
cout << cudaMemcpy2D(d_top, step, top, width * sizeof(int), width * sizeof(int), height, cudaMemcpyHostToDevice) << endl;
Npp32s* d_bottom = nppiMalloc_32s_C1(width, height, &step);
cout << cudaMemcpy2D(d_bottom, step, bottom, width * sizeof(int), width * sizeof(int), height, cudaMemcpyHostToDevice) << endl;
Npp8u* d_labels = nppiMalloc_8u_C1(width, height, &step_labels);
NppiSize size;
size.width = width;
size.height = height;
Npp8u* d_buf;
int buf_size;
cout << nppiGraphcutGetSize(size, &buf_size) << endl;
cout << cudaMalloc(&d_buf, buf_size) << endl;
// It prints -3 = NPP_CUDA_KERNEL_EXECUTION_ERROR
cout << "gc: " << nppiGraphcut_32s8u(d_terminals, d_left_tr, d_right_tr, d_top, d_bottom,
step_tr, step, size, d_labels, step_labels, d_buf) << endl;
cout << cudaGetLastError() << endl;
cout << cudaDeviceSynchronize() << endl;
nppiFree(d_buf);
nppiFree(d_terminals);
nppiFree(d_left_tr);
nppiFree(d_right_tr);
nppiFree(d_top);
nppiFree(d_bottom);
nppiFree(d_labels);
delete[] terminals;
delete[] left_tr;
delete[] right_tr;
delete[] top;
delete[] bottom;
return 0;
}
NppGraphCutReproCase.zip (64.7 KB)