I’ve got some relatively simple code that takes an input buffer of complex data and folds each block down to a couple of values (max, index, sum and sum-of-absolute). I’ve been having some issues that I’ve had a hard time tracking down and I’m convinced that I’ve found a compiler bug.

See the following code:

[codebox]#include <time.h>

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <cuda.h>

#include <cutil.h>

// Simple complex-float pair, 8-byte aligned so one cfloat_t can be moved
// with a single 64-bit load/store.
typedef struct __align__(8) cfloat_t {
    // NOTE(review): do NOT re-enable this default constructor.  This type
    // backs __shared__ arrays in fold_kernel, and CUDA does not support
    // dynamic initialization of __shared__ variables: with a non-trivial
    // default constructor, every thread in the block races to zero the
    // shared array on entry, clobbering values other threads have already
    // written.  That race is the nondeterministic-sum behavior described
    // in the post below — it is a usage restriction, not a compiler bug.
    //__device__ __host__ cfloat_t() { re = 0.0; im = 0.0; }
    float re;  // real part
    float im;  // imaginary part
} cfloat_t;

#define NUM_BLOCKS 256

#define THR_PER_BLK 128

// Reduce each block's THR_PER_BLK complex samples down to four per-block
// results: the largest magnitude, that sample's global index, the complex
// sum, and the sum of magnitudes.
//
// Launch layout: <<<NUM_BLOCKS, THR_PER_BLK>>>.  The fold tree below is
// sized from the compile-time THR_PER_BLK, so blockDim.x MUST equal
// THR_PER_BLK (and be a power of two).  Each output array d_max / d_idx /
// d_sum / d_abs holds exactly one entry per block.
__global__ void fold_kernel(cfloat_t *data,
                            float *d_max, uint32_t *d_idx,
                            cfloat_t *d_sum, float *d_abs) {

    // One slot per thread in the lower half of the block; the first pass
    // below combines two global elements per thread so only blockDim.x/2
    // shared slots are needed.
    __shared__ float    max_buff[THR_PER_BLK/2];
    __shared__ uint32_t idx_buff[THR_PER_BLK/2];
    __shared__ cfloat_t sum_buff[THR_PER_BLK/2];
    __shared__ float    abs_buff[THR_PER_BLK/2];

    // Initial comparison straight from global memory to cut shared memory
    // usage in half.
    if (threadIdx.x < blockDim.x/2) {
        const uint32_t idx_lower = blockIdx.x * blockDim.x + threadIdx.x;
        const uint32_t idx_upper = idx_lower + (blockDim.x/2);

        cfloat_t val_1 = data[idx_lower];
        // sqrtf, not sqrt: the original promoted every operand to double.
        float mag_1 = sqrtf(val_1.re * val_1.re + val_1.im * val_1.im);

        cfloat_t val_2 = data[idx_upper];
        float mag_2 = sqrtf(val_2.re * val_2.re + val_2.im * val_2.im);

        // Initial sum and sum-of-absolutes.
        sum_buff[threadIdx.x].re = val_1.re + val_2.re;
        sum_buff[threadIdx.x].im = val_1.im + val_2.im;
        abs_buff[threadIdx.x] = mag_1 + mag_2;

        // Keep the larger magnitude and remember which global index it
        // came from.
        if (mag_2 > mag_1) {
            max_buff[threadIdx.x] = mag_2;
            idx_buff[threadIdx.x] = idx_upper;
        } else {
            max_buff[threadIdx.x] = mag_1;
            idx_buff[threadIdx.x] = idx_lower;
        }
    }
    __syncthreads();

    // Tree-fold the THR_PER_BLK/2 shared partials down to a single value
    // per block.  The barrier sits outside the divergent branch so every
    // thread reaches it.
    for (size_t half = (THR_PER_BLK >> 2); half > 0; half >>= 1) {
        if (threadIdx.x < half) {
            const size_t idx_1 = threadIdx.x;
            const size_t idx_2 = threadIdx.x + half;

            // Fold sum and sum of absolutes.
            sum_buff[idx_1].re += sum_buff[idx_2].re;
            sum_buff[idx_1].im += sum_buff[idx_2].im;
            abs_buff[idx_1] += abs_buff[idx_2];

            // Fold max + index together so they stay consistent.
            if (max_buff[idx_2] > max_buff[idx_1]) {
                max_buff[idx_1] = max_buff[idx_2];
                idx_buff[idx_1] = idx_buff[idx_2];
            }
        }
        __syncthreads();
    }

    // Thread 0 publishes this block's results.
    if (threadIdx.x == 0) {
        d_max[blockIdx.x] = max_buff[0];
        d_idx[blockIdx.x] = idx_buff[0];
        d_sum[blockIdx.x] = sum_buff[0];
        d_abs[blockIdx.x] = abs_buff[0];
    }
}

// Host-side driver: fills a buffer with constant (1,0) samples, runs
// fold_kernel over it, and prints the per-block and overall reductions.
// With constant input each block of THR_PER_BLK samples must sum to
// (THR_PER_BLK, 0), which makes failures easy to spot by eye.
__host__ void test_fold_kernel() {

    const uint32_t BUFF_SIZE = 32768;   // == NUM_BLOCKS * THR_PER_BLK
    const uint32_t NUM_ITER = 1;

    cfloat_t *h_array = (cfloat_t*)malloc(BUFF_SIZE*sizeof(cfloat_t));
    cfloat_t *d_array; CUDA_SAFE_CALL(cudaMalloc((void**)&d_array, BUFF_SIZE*sizeof(cfloat_t)));

    // Per-block result buffers, host and device.
    float    *h_max_arr = (float*)   malloc(NUM_BLOCKS*sizeof(float));
    uint32_t *h_idx_arr = (uint32_t*)malloc(NUM_BLOCKS*sizeof(uint32_t));
    cfloat_t *h_sum_arr = (cfloat_t*)malloc(NUM_BLOCKS*sizeof(cfloat_t));
    float    *h_abs_arr = (float*)   malloc(NUM_BLOCKS*sizeof(float));

    float    *d_max_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_max_arr, NUM_BLOCKS*sizeof(float)));
    uint32_t *d_idx_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_idx_arr, NUM_BLOCKS*sizeof(uint32_t)));
    cfloat_t *d_sum_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t)));
    float    *d_abs_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_abs_arr, NUM_BLOCKS*sizeof(float)));

    // < CHANGE HERE >
    for (size_t ii = 0; ii < BUFF_SIZE; ii++) {
        h_array[ii].re = 1.0f; //(float)ii;
        h_array[ii].im = 0.0f;
    }

    for (size_t iter = 0; iter < NUM_ITER; iter++) {
        CUDA_SAFE_CALL(
            cudaMemcpy(d_array, h_array, BUFF_SIZE*sizeof(cfloat_t), cudaMemcpyHostToDevice));

        fold_kernel<<<NUM_BLOCKS, THR_PER_BLK>>>(d_array, d_max_arr, d_idx_arr, d_sum_arr, d_abs_arr);
        // Launches return no status directly; pick up configuration errors
        // here.  The blocking memcpys below synchronize with the kernel and
        // surface any execution errors.
        CUDA_SAFE_CALL(cudaGetLastError());

        // Copy results back
        CUDA_SAFE_CALL(cudaMemcpy(h_max_arr, d_max_arr, NUM_BLOCKS*sizeof(float), cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_idx_arr, d_idx_arr, NUM_BLOCKS*sizeof(uint32_t), cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_sum_arr, d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t), cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_abs_arr, d_abs_arr, NUM_BLOCKS*sizeof(float), cudaMemcpyDeviceToHost));

        printf("Sums: \n");
        for (size_t ii = 0; ii < NUM_BLOCKS; ii++) {
            // %zu: ii is size_t — the original %i was undefined behavior.
            printf(" %zu -> (%f, %f)\n", ii, h_sum_arr[ii].re, h_sum_arr[ii].im);
        }

        // Reduce the per-block partials on the host.
        float max = 0.0f;
        uint32_t max_idx = 0;
        cfloat_t sum; sum.re = 0.0f; sum.im = 0.0f;
        float abs = 0.0f;
        for (size_t ii = 0; ii < NUM_BLOCKS; ii++) {
            if (h_max_arr[ii] > max) {
                max = h_max_arr[ii];
                max_idx = h_idx_arr[ii];
            }
            sum.re += h_sum_arr[ii].re;
            sum.im += h_sum_arr[ii].im;
            abs += h_abs_arr[ii];
        }

        printf("Sum: (%f, %f), Abs: %f Max: %f Idx: %u\n",
               sum.re, sum.im, abs, max, max_idx);
    }

    // The original leaked every allocation made above.
    free(h_array);
    free(h_max_arr);
    free(h_idx_arr);
    free(h_sum_arr);
    free(h_abs_arr);
    CUDA_SAFE_CALL(cudaFree(d_array));
    CUDA_SAFE_CALL(cudaFree(d_max_arr));
    CUDA_SAFE_CALL(cudaFree(d_idx_arr));
    CUDA_SAFE_CALL(cudaFree(d_sum_arr));
    CUDA_SAFE_CALL(cudaFree(d_abs_arr));
}

// Entry point: run the fold-kernel regression test once.
int main() {
    test_fold_kernel();
    return EXIT_SUCCESS;
}

Which can be compiled with:

```
NVCC_OPTIONS = --ptxas-options=-v
NVCC_INCLUDE = -I/usr/local/cuda/include -I/usr/local/cuda/sdk/C/common/inc
test: test.cu
nvcc $(NVCC_OPTIONS) $(NVCC_INCLUDE) test.cu -o test
```

When the constructor on the struct is commented out, the code runs just as you’d expect: I feed it 32K (1.0, 0.0) values and get (32768.0, 0.0) as the output. However, when you uncomment the constructor and recompile, some non-deterministic behavior takes over and some blocks don’t sum up to 128 as they should.

I’m totally open to the notion that I’ve missed something, but it seems like that default constructor should be supported. Is anyone able to shed some light on this, or should I go ahead and file a bug report? I’ve confirmed this behavior on CUDA 3.1 under both Fedora Core 13 and RHEL 5.