Sorry, I wasn’t clear first time. I want to find all the local maxima in the cube. A voxel is a local maximum if it is greater than it’s 26 neighbours in the 3d volume.

I currently have the following:

[codebox]**global** void integer_modes2(float *d_V, int d0, int d1, int d2, int *I)

{

```
int wid = threadIdx.x + (threadIdx.y + threadIdx.z * 8) * 8;
int bz = blockIdx.y / (d1 / 8);
int by = blockIdx.y - (bz * (d1 / 8));
bz *= 8;
by *= 8;
int w_num = wid / WARP_SIZE;
wid = wid % WARP_SIZE;
```

// Cache for 10 x 10 x 10

```
__shared__ float v[10][10][10];
int x, y, z;
```

// Each warp will load 10 floats coalesced

```
for (z = 0; z < 10; ++z) {
for (y = 0; y < 10; ++y) {
if (((y + z * 10) % ((8*8*8)/WARP_SIZE)) == w_num) {
if ((z == 0 && bz == 0) || (z == 9 && bz == d2-8) || (y == 0 && by == 0) || (y == 9 && by == d1-8)) {
// Boundary case
if (wid < 10) {
v[z][y][wid] = BOUNDARY_VAL;
}
} else {
// Compute the offset for each thread
x = (wid+blockIdx.x*8-1) + ((y+by) + (z+bz) * d1) * d0 % WARP_SIZE;
if (x < 10) {
// Boundary case
if ((x == 0 && blockIdx.x == 0) || (x == 9 && blockIdx.x == d0/8-1)) {
v[z][y][x] = BOUNDARY_VAL;
} else {
int vidx = (x+blockIdx.x*8-1) + ((y+by) + (z+bz) * d1) * d0;
v[z][y][x] = d_V[vidx];
}
}
}
}
}
}
```

// Record value

```
__syncthreads();
float val = v[threadIdx.z+1][threadIdx.y+1][threadIdx.x+1];
if (val == 0.0f)
return;
```

// Determine if mode

```
for (z = 0; z < 2; ++z) {
for (y = 0; y < 2; ++y) {
for (x = 0; x < 2; ++x) {
if (val < v[threadIdx.z+z][threadIdx.y+y][threadIdx.x+x])
return;
}
}
}
int off = atomicAdd(&I[MAX_NUM_FEATURES], 1);
if (off < MAX_NUM_FEATURES)
I[off] = (threadIdx.x + blockIdx.x * 8) + ((threadIdx.y + by) + (threadIdx.z + bz) * d1) * d0;
```

}[/codebox]

I launch the kernel with:

integer_modes2<<<dim3(d0/8, (d1/8)*(d2/8)), dim3(8, 8, 8)>>>(d_V, d0, d1, d2, I);

However, I currently get an unspecified launch failure. Am I on the right track, and if so, why do I get the error? If not, does anyone have another suggestion?

Many thanks.