Possible compiler bug when using a constructor in a struct?

I’ve got some relatively simple code that takes an input buffer of complex data and folds each block down to a few values (max, index, sum, and sum-of-absolute-values). I’ve been having some issues that I’ve had a hard time tracking down, and I’m now convinced that I’ve found a compiler bug.

See the following code:

[codebox]#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#include <cuda.h>
#include <cutil.h>

typedef struct __align__(8) cfloat_t {
    //__device__ __host__ cfloat_t() { re = 0.0; im = 0.0; }
    float re;
    float im;
} cfloat_t;

#define NUM_BLOCKS  256
#define THR_PER_BLK 128

__global__ void fold_kernel(cfloat_t *data,
                            float    *d_max, uint32_t *d_idx,
                            cfloat_t *d_sum, float    *d_abs) {
    __shared__ float    max_buff[THR_PER_BLK/2];
    __shared__ uint32_t idx_buff[THR_PER_BLK/2];
    __shared__ cfloat_t sum_buff[THR_PER_BLK/2];
    __shared__ float    abs_buff[THR_PER_BLK/2];

    // Do initial comparison straight from global memory to cut shared memory usage in half
    if (threadIdx.x < blockDim.x/2) {
        const uint32_t idx_lower = blockIdx.x * blockDim.x + threadIdx.x;
        const uint32_t idx_upper = idx_lower + (blockDim.x/2);

        cfloat_t val_1 = data[idx_lower];
        float    mag_1 = sqrt(val_1.re * val_1.re + val_1.im * val_1.im);
        cfloat_t val_2 = data[idx_upper];
        float    mag_2 = sqrt(val_2.re * val_2.re + val_2.im * val_2.im);

        // Do initial sum and sum-of-absolutes
        sum_buff[threadIdx.x].re = val_1.re + val_2.re;
        sum_buff[threadIdx.x].im = val_1.im + val_2.im;
        abs_buff[threadIdx.x]    = mag_1 + mag_2;

        if (mag_2 > mag_1) {
            max_buff[threadIdx.x] = mag_2;
            idx_buff[threadIdx.x] = idx_upper;
        } else {
            max_buff[threadIdx.x] = mag_1;
            idx_buff[threadIdx.x] = idx_lower;
        }
    }
    __syncthreads();

    // Fold down into a single value per block
    for (size_t ii = (THR_PER_BLK >> 2); ii > 0; ii >>= 1) {
        const size_t half = ii;
        if (threadIdx.x < half) {
            const size_t idx_1 = threadIdx.x;
            const size_t idx_2 = threadIdx.x + half;

            // Fold sum and sum of absolutes
            sum_buff[idx_1].re += sum_buff[idx_2].re;
            sum_buff[idx_1].im += sum_buff[idx_2].im;
            abs_buff[idx_1]    += abs_buff[idx_2];

            if (max_buff[idx_2] > max_buff[idx_1]) {
                max_buff[idx_1] = max_buff[idx_2];
                idx_buff[idx_1] = idx_buff[idx_2];
            }
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        d_max[blockIdx.x] = max_buff[0];
        d_idx[blockIdx.x] = idx_buff[0];
        d_sum[blockIdx.x] = sum_buff[0];
        d_abs[blockIdx.x] = abs_buff[0];
    }
}

__host__ void test_fold_kernel() {
    const uint32_t BUFF_SIZE = 32768;
    const uint32_t NUM_ITER  = 1;

    cfloat_t *h_array = (cfloat_t *)malloc(BUFF_SIZE*sizeof(cfloat_t));
    cfloat_t *d_array; CUDA_SAFE_CALL(cudaMalloc((void**)&d_array, BUFF_SIZE*sizeof(cfloat_t)));

    float    *h_max_arr = (float *)   malloc(NUM_BLOCKS*sizeof(float));
    uint32_t *h_idx_arr = (uint32_t *)malloc(NUM_BLOCKS*sizeof(uint32_t));
    cfloat_t *h_sum_arr = (cfloat_t *)malloc(NUM_BLOCKS*sizeof(cfloat_t));
    float    *h_abs_arr = (float *)   malloc(NUM_BLOCKS*sizeof(float));

    float    *d_max_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_max_arr, NUM_BLOCKS*sizeof(float)));
    uint32_t *d_idx_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_idx_arr, NUM_BLOCKS*sizeof(uint32_t)));
    cfloat_t *d_sum_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t)));
    float    *d_abs_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_abs_arr, NUM_BLOCKS*sizeof(float)));

    // < CHANGE HERE >
    for (size_t ii = 0; ii < BUFF_SIZE; ii++) {
        h_array[ii].re = 1.0; //(float)ii;
        h_array[ii].im = 0.0;
    }

    for (size_t iter = 0; iter < NUM_ITER; iter++) {
        CUDA_SAFE_CALL(
            cudaMemcpy(d_array, h_array, BUFF_SIZE*sizeof(cfloat_t), cudaMemcpyHostToDevice));

        fold_kernel<<<NUM_BLOCKS, THR_PER_BLK>>>(d_array, d_max_arr, d_idx_arr, d_sum_arr, d_abs_arr);

        // Copy results back
        CUDA_SAFE_CALL(cudaMemcpy(h_max_arr, d_max_arr, NUM_BLOCKS*sizeof(float),    cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_idx_arr, d_idx_arr, NUM_BLOCKS*sizeof(uint32_t), cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_sum_arr, d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t), cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_abs_arr, d_abs_arr, NUM_BLOCKS*sizeof(float),    cudaMemcpyDeviceToHost));

        printf("Sums: \n");
        for (size_t ii = 0; ii < NUM_BLOCKS; ii++) {
            printf(" %i -> (%f, %f)\n", ii, h_sum_arr[ii].re, h_sum_arr[ii].im);
        }

        float    max     = 0.0;
        uint32_t max_idx = 0;
        cfloat_t sum; sum.re = 0.0; sum.im = 0.0;
        float    abs     = 0.0;
        for (size_t ii = 0; ii < NUM_BLOCKS; ii++) {
            if (h_max_arr[ii] > max) {
                max     = h_max_arr[ii];
                max_idx = h_idx_arr[ii];
            }
            sum.re += h_sum_arr[ii].re;
            sum.im += h_sum_arr[ii].im;
            abs    += h_abs_arr[ii];
        }

        printf("Sum: (%f, %f), Abs: %f Max: %f Idx: %u\n",
               sum.re, sum.im, abs, max, max_idx);
    }
}

int main() {
    test_fold_kernel();
    return 0;
}[/codebox]

Which can be compiled with:

NVCC_OPTIONS = --ptxas-options=-v
NVCC_INCLUDE = -I/usr/local/cuda/include -I/usr/local/cuda/sdk/C/common/inc

test: test.cu
	nvcc $(NVCC_OPTIONS) $(NVCC_INCLUDE) test.cu -o test

When the constructor on the struct is commented out, the code runs just as you’d expect: I feed it 32K (1.0, 0.0) values and get (32768.0, 0.0) as the output. However, when the constructor is uncommented and the code recompiled, non-deterministic behavior takes over and some blocks don’t sum to 128 as they should.

I’m totally open to the notion that I’ve missed something, but it seems like a default constructor like that should be supported. Can anyone shed some light on this, or should I go ahead and file a bug report? I’ve confirmed this behavior with CUDA 3.1 on Fedora Core 13 and RHEL 5.


I don’t think it’s explicitly documented, but constructor calls on __shared__ or __device__ variables aren’t likely to be supported. Which thread would be responsible for running the constructor? And __device__ variables are even worse, since it would take two kernel executions (one to do the construction, one to run your code).

But you can do it all manually… and in doing so you’ll see why it’s not like the automatic construction that makes sense with single-threaded CPU-style execution.

For example, in your code, you might write:

__shared__ cfloat_t sum_buff[THR_PER_BLK/2];

// Default-construct a temporary and copy-assign it into shared memory,
// spreading the work across the threads of the block.
for (int i = threadIdx.x; i < THR_PER_BLK/2; i += blockDim.x)
    sum_buff[i] = cfloat_t();

__syncthreads();

/* ... rest of code here ... */

Annoying? Yes, though not terrible, and it’s exactly the kind of complexity that would be bad for the compiler to hide from you.

__device__ memory classes are significantly more complex and require host-side code as well, since you need multiple kernel launches! (Or host-side construction, which is often, but not always, possible.)
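To illustrate that second option, here’s a minimal sketch of host-side construction (my own illustration, not from the original code; the __device__ variable d_origin and the helper init_device_origin are made-up names):

// Sketch only: d_origin and init_device_origin are hypothetical.
__device__ cfloat_t d_origin;

void init_device_origin() {
    cfloat_t h_origin;      // construct and initialize entirely on the host
    h_origin.re = 0.0f;
    h_origin.im = 0.0f;
    // Copy the fully-initialized object into the __device__ variable.
    cudaMemcpyToSymbol(d_origin, &h_origin, sizeof(cfloat_t));
}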

Class construction at the thread level does make sense and is indeed done at runtime, since it’s clear which thread is responsible for doing it.
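For contrast, a quick sketch of the per-thread case (again just an illustration, and it assumes the constructor from the original post is uncommented):

// Sketch only: each thread constructs its own local object, so there is no
// ambiguity about which thread runs the constructor.
__global__ void thread_local_example(cfloat_t *out) {
    cfloat_t acc;                    // default constructor runs per thread
    acc.re = (float)threadIdx.x;     // then ordinary member access, as usual
    out[blockIdx.x * blockDim.x + threadIdx.x] = acc;
}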


I can see the constructor calls on the shared memory being ambiguous, but one of the first things I do is assign (by direct member access) to the values of sum_buff, so even if constructing that array is non-deterministic, it shouldn’t matter as it’s overwritten anyhow…


A very good question!

Though the first and simplest thing to double, triple, and quadruple check is that THR_PER_BLK really is equal to the number of threads per block. If it’s off, you’d have threads reading and writing past the end of the shared memory arrays and corrupting each other, giving behavior like what you describe.
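One way to make that check self-enforcing (a sketch of my own, with a hypothetical one-int d_error buffer, not part of the original code) is to have the kernel flag and refuse a mismatched launch:

// Sketch only: d_error is a hypothetical device buffer for reporting a
// mismatch between the compile-time constant and the launch configuration.
__global__ void fold_kernel_checked(cfloat_t *data,
                                    float    *d_max, uint32_t *d_idx,
                                    cfloat_t *d_sum, float    *d_abs,
                                    int      *d_error) {
    if (blockDim.x != THR_PER_BLK) {
        if (threadIdx.x == 0) *d_error = 1;   // flag the bad launch and bail out
        return;
    }
    // ... original fold_kernel body here ...
}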


I’m very confident that it is, since removing the constructor makes it work correctly, but I’ll double-check.
