Possible compiler bug when using a constructor in a struct?

I’ve got some relatively simple code that takes an input buffer of complex data and folds each block down to a few values (max, index, sum, and sum-of-absolute-values). I’ve been having some issues that I’ve had a hard time tracking down, and I’m now convinced that I’ve found a compiler bug.

See the following code:

[codebox]#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#include <cuda.h>
#include <cutil.h>

typedef struct __align__(8) cfloat_t {
    //__device__ __host__ cfloat_t() { re = 0.0; im = 0.0; }
    float re;
    float im;
} cfloat_t;

#define NUM_BLOCKS  256
#define THR_PER_BLK 128

__global__ void fold_kernel(cfloat_t *data,
                            float    *d_max, uint32_t *d_idx,
                            cfloat_t *d_sum, float    *d_abs) {
    __shared__ float    max_buff[THR_PER_BLK/2];
    __shared__ uint32_t idx_buff[THR_PER_BLK/2];
    __shared__ cfloat_t sum_buff[THR_PER_BLK/2];
    __shared__ float    abs_buff[THR_PER_BLK/2];

    // Do initial comparison straight from global memory to cut shared memory usage in half
    if (threadIdx.x < blockDim.x/2) {
        const uint32_t idx_lower = blockIdx.x * blockDim.x + threadIdx.x;
        const uint32_t idx_upper = idx_lower + (blockDim.x/2);

        cfloat_t val_1 = data[idx_lower];
        float    mag_1 = sqrt(val_1.re * val_1.re + val_1.im * val_1.im);
        cfloat_t val_2 = data[idx_upper];
        float    mag_2 = sqrt(val_2.re * val_2.re + val_2.im * val_2.im);

        // Do initial sum and sum-of-absolutes
        sum_buff[threadIdx.x].re = val_1.re + val_2.re;
        sum_buff[threadIdx.x].im = val_1.im + val_2.im;
        abs_buff[threadIdx.x]    = mag_1 + mag_2;

        if (mag_2 > mag_1) {
            max_buff[threadIdx.x] = mag_2;
            idx_buff[threadIdx.x] = idx_upper;
        } else {
            max_buff[threadIdx.x] = mag_1;
            idx_buff[threadIdx.x] = idx_lower;
        }
    }
    __syncthreads();

    // Fold down into a single value per block
    for (size_t ii = (THR_PER_BLK >> 2); ii > 0; ii >>= 1) {
        const size_t half = ii;
        if (threadIdx.x < half) {
            const size_t idx_1 = threadIdx.x;
            const size_t idx_2 = threadIdx.x + half;

            // Fold sum and sum of absolutes
            sum_buff[idx_1].re += sum_buff[idx_2].re;
            sum_buff[idx_1].im += sum_buff[idx_2].im;
            abs_buff[idx_1]    += abs_buff[idx_2];

            if (max_buff[idx_2] > max_buff[idx_1]) {
                max_buff[idx_1] = max_buff[idx_2];
                idx_buff[idx_1] = idx_buff[idx_2];
            }
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        d_max[blockIdx.x] = max_buff[0];
        d_idx[blockIdx.x] = idx_buff[0];
        d_sum[blockIdx.x] = sum_buff[0];
        d_abs[blockIdx.x] = abs_buff[0];
    }
}

__host__ void test_fold_kernel() {
    const uint32_t BUFF_SIZE = 32768;
    const uint32_t NUM_ITER  = 1;

    cfloat_t *h_array = (cfloat_t *)malloc(BUFF_SIZE*sizeof(cfloat_t));
    cfloat_t *d_array; CUDA_SAFE_CALL(cudaMalloc((void**)&d_array, BUFF_SIZE*sizeof(cfloat_t)));

    float    *h_max_arr = (float *)   malloc(NUM_BLOCKS*sizeof(float));
    uint32_t *h_idx_arr = (uint32_t *)malloc(NUM_BLOCKS*sizeof(uint32_t));
    cfloat_t *h_sum_arr = (cfloat_t *)malloc(NUM_BLOCKS*sizeof(cfloat_t));
    float    *h_abs_arr = (float *)   malloc(NUM_BLOCKS*sizeof(float));

    float    *d_max_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_max_arr, NUM_BLOCKS*sizeof(float)));
    uint32_t *d_idx_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_idx_arr, NUM_BLOCKS*sizeof(uint32_t)));
    cfloat_t *d_sum_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t)));
    float    *d_abs_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_abs_arr, NUM_BLOCKS*sizeof(float)));

    // < CHANGE HERE >
    for (size_t ii = 0; ii < BUFF_SIZE; ii++) {
        h_array[ii].re = 1.0; //(float)ii;
        h_array[ii].im = 0.0;
    }

    for (size_t iter = 0; iter < NUM_ITER; iter++) {
        CUDA_SAFE_CALL(
            cudaMemcpy(d_array, h_array, BUFF_SIZE*sizeof(cfloat_t), cudaMemcpyHostToDevice));

        fold_kernel<<<NUM_BLOCKS, THR_PER_BLK>>>(d_array, d_max_arr, d_idx_arr, d_sum_arr, d_abs_arr);

        // Copy results back
        CUDA_SAFE_CALL(cudaMemcpy(h_max_arr, d_max_arr, NUM_BLOCKS*sizeof(float),    cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_idx_arr, d_idx_arr, NUM_BLOCKS*sizeof(uint32_t), cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_sum_arr, d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t), cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_abs_arr, d_abs_arr, NUM_BLOCKS*sizeof(float),    cudaMemcpyDeviceToHost));

        printf("Sums: \n");
        for (size_t ii = 0; ii < NUM_BLOCKS; ii++) {
            printf(" %i -> (%f, %f)\n", ii, h_sum_arr[ii].re, h_sum_arr[ii].im);
        }

        float    max     = 0.0;
        uint32_t max_idx = 0;
        cfloat_t sum; sum.re = 0.0; sum.im = 0.0;
        float    abs     = 0.0;
        for (size_t ii = 0; ii < NUM_BLOCKS; ii++) {
            if (h_max_arr[ii] > max) {
                max     = h_max_arr[ii];
                max_idx = h_idx_arr[ii];
            }
            sum.re += h_sum_arr[ii].re;
            sum.im += h_sum_arr[ii].im;
            abs    += h_abs_arr[ii];
        }

        printf("Sum: (%f, %f), Abs: %f Max: %f Idx: %u\n",
               sum.re, sum.im, abs, max, max_idx);
    }
}

int main() {
    test_fold_kernel();
    return 0;
}[/codebox]

Which can be compiled with:

NVCC_OPTIONS = --ptxas-options=-v
NVCC_INCLUDE = -I/usr/local/cuda/include -I/usr/local/cuda/sdk/C/common/inc

test: test.cu
	nvcc $(NVCC_OPTIONS) $(NVCC_INCLUDE) test.cu -o test

When the constructor on the struct is commented out, the code runs just as you’d expect: I feed it 32K (1.0, 0.0) values and get (32768.0, 0.0) as the output. However, when the constructor is uncommented and the code recompiled, non-deterministic behavior takes over and some blocks don’t sum to 128 as they should.

I’m totally open to the notion that I’ve missed something, but it seems like a default constructor like that should be supported. Can anyone shed some light on this, or should I go ahead and file a bug report? I’ve confirmed this behavior with CUDA 3.1 on Fedora Core 13 and RHEL 5.


I don’t think it’s explicitly documented, but constructor calls on __shared__ or __device__ variables aren’t likely to be supported. Which thread would be responsible for running the constructor? And __device__ variables are even worse, since it would take two kernel executions (one to do the construction, one to run your code).

But you can do it all manually… and in doing so you’ll see why it’s not like the automatic construction that makes sense with single-threaded CPU-style execution.

For example, in your code, you might write:

__shared__ cfloat_t sum_buff[THR_PER_BLK/2];

// Default-construct a temporary and copy-assign it into shared memory,
// spreading the work across the threads of the block.
for (int i = threadIdx.x; i < THR_PER_BLK/2; i += blockDim.x)
    sum_buff[i] = cfloat_t();

__syncthreads();

/* ... rest of code here ... */

Annoying? Yes, though not terrible, and it’s exactly the kind of complexity that would be bad for the compiler to hide from you.

__device__ memory classes are significantly more complex and require host-side code as well, since you need multiple kernel launches! (Or host-side construction, which is often, but not always, possible.)
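To illustrate that second option, here’s a minimal sketch of host-side construction (my own illustration, not from the original code; the __device__ variable d_origin and the helper init_device_origin are made-up names):

// Sketch only: d_origin and init_device_origin are hypothetical.
__device__ cfloat_t d_origin;

void init_device_origin() {
    cfloat_t h_origin;      // construct and initialize entirely on the host
    h_origin.re = 0.0f;
    h_origin.im = 0.0f;
    // Copy the fully-initialized object into the __device__ variable.
    cudaMemcpyToSymbol(d_origin, &h_origin, sizeof(cfloat_t));
}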

Class construction at the thread level does make sense and is indeed done at runtime, since it’s clear which thread is responsible for doing it.
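For contrast, a quick sketch of the per-thread case (again just an illustration, and it assumes the constructor from the original post is uncommented):

// Sketch only: each thread constructs its own local object, so there is no
// ambiguity about which thread runs the constructor.
__global__ void thread_local_example(cfloat_t *out) {
    cfloat_t acc;                    // default constructor runs per thread
    acc.re = (float)threadIdx.x;     // then ordinary member access, as usual
    out[blockIdx.x * blockDim.x + threadIdx.x] = acc;
}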


I can see the constructor calls on the shared memory being ambiguous, but one of the first things I do is assign (by direct member access) to the values of sum_buff, so even if constructing that array is non-deterministic, it shouldn’t matter as it’s overwritten anyhow…


A very good question!

Though the first and simplest thing to double, triple, and quadruple check is that THR_PER_BLK really is equal to the number of threads per block. If it’s off, you’d have threads reading and writing past the end of the shared memory arrays and corrupting each other, giving behavior like what you describe.
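One way to make that check self-enforcing (a sketch of my own, with a hypothetical one-int d_error buffer, not part of the original code) is to have the kernel flag and refuse a mismatched launch:

// Sketch only: d_error is a hypothetical device buffer for reporting a
// mismatch between the compile-time constant and the launch configuration.
__global__ void fold_kernel_checked(cfloat_t *data,
                                    float    *d_max, uint32_t *d_idx,
                                    cfloat_t *d_sum, float    *d_abs,
                                    int      *d_error) {
    if (blockDim.x != THR_PER_BLK) {
        if (threadIdx.x == 0) *d_error = 1;   // flag the bad launch and bail out
        return;
    }
    // ... original fold_kernel body here ...
}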


I’m very confident that it is, since removing the constructor makes it work correctly, but I’ll double-check.
