I’ve got some relatively simple code that takes an input buffer of complex data and folds each block down to a couple of values (max, index, sum and sum-of-absolute). I’ve been having some issues that I’ve had a hard time tracking down and I’m convinced that I’ve found a compiler bug.
See the following code:
[codebox]
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <cuda.h>
#include <cutil.h>
// 8-byte-aligned complex float: the alignment lets the GPU load/store a
// whole cfloat_t as one 64-bit transaction from global memory.
//
// NOTE(review): do NOT give this struct a default constructor.  This type is
// used for __shared__ arrays in fold_kernel, and CUDA does not support
// dynamic initialization (i.e. constructors) of __shared__ variables — which
// is the likely cause of the nondeterministic sums observed when the
// constructor below is uncommented.  This is a documented CUDA restriction,
// not a compiler bug.
typedef struct __align__(8) cfloat_t {
    //__device__ __host__ cfloat_t() { re = 0.0; im = 0.0; }
    float re;   // real part
    float im;   // imaginary part
} cfloat_t;

#define NUM_BLOCKS  256   // grid size used by test_fold_kernel
#define THR_PER_BLK 128   // threads per block; each thread folds 2 inputs
// Per-block reduction: for each block of THR_PER_BLK consecutive input
// samples, computes the maximum magnitude (and the global index where it
// occurs), the complex sum, and the sum of magnitudes, writing one result
// per block to d_max/d_idx/d_sum/d_abs.
//
// Launch configuration: 1-D grid, blockDim.x == THR_PER_BLK (must be a
// power of two); `data` must hold at least gridDim.x * blockDim.x elements.
__global__ void fold_kernel(cfloat_t *data,
                            float *d_max, uint32_t *d_idx,
                            cfloat_t *d_sum, float *d_abs) {
    // Buffers are half the block size: the first comparison is done straight
    // from global memory, folding two inputs per thread, which cuts shared
    // memory usage in half.
    __shared__ float    max_buff[THR_PER_BLK/2];
    __shared__ uint32_t idx_buff[THR_PER_BLK/2];
    __shared__ cfloat_t sum_buff[THR_PER_BLK/2];
    __shared__ float    abs_buff[THR_PER_BLK/2];
    if (threadIdx.x < blockDim.x/2) {
        const uint32_t idx_lower = blockIdx.x * blockDim.x + threadIdx.x;
        const uint32_t idx_upper = idx_lower + (blockDim.x/2);
        cfloat_t val_1 = data[idx_lower];
        // sqrtf, not sqrt: sqrt() promotes to double, which is unnecessary
        // here and slow (or emulated) on hardware without double support.
        float mag_1 = sqrtf(val_1.re * val_1.re + val_1.im * val_1.im);
        cfloat_t val_2 = data[idx_upper];
        float mag_2 = sqrtf(val_2.re * val_2.re + val_2.im * val_2.im);
        // Initial sum and sum-of-absolutes for this thread's pair.
        sum_buff[threadIdx.x].re = val_1.re + val_2.re;
        sum_buff[threadIdx.x].im = val_1.im + val_2.im;
        abs_buff[threadIdx.x] = mag_1 + mag_2;
        // Keep whichever of the pair has the larger magnitude.
        if (mag_2 > mag_1) {
            max_buff[threadIdx.x] = mag_2;
            idx_buff[threadIdx.x] = idx_upper;
        } else {
            max_buff[threadIdx.x] = mag_1;
            idx_buff[threadIdx.x] = idx_lower;
        }
    }
    __syncthreads();  // all shared buffers written before the tree fold reads them
    // Tree-fold the THR_PER_BLK/2 partial results down to a single value.
    for (size_t half = (THR_PER_BLK >> 2); half > 0; half >>= 1) {
        if (threadIdx.x < half) {
            const size_t idx_1 = threadIdx.x;
            const size_t idx_2 = threadIdx.x + half;
            // Fold sum and sum-of-absolutes.
            sum_buff[idx_1].re += sum_buff[idx_2].re;
            sum_buff[idx_1].im += sum_buff[idx_2].im;
            abs_buff[idx_1] += abs_buff[idx_2];
            // Fold max/index, keeping the larger magnitude.
            if (max_buff[idx_2] > max_buff[idx_1]) {
                max_buff[idx_1] = max_buff[idx_2];
                idx_buff[idx_1] = idx_buff[idx_2];
            }
        }
        // Barrier is outside the divergent `if`, so every thread in the
        // block reaches it on every iteration.
        __syncthreads();
    }
    // Thread 0 publishes this block's results.
    if (threadIdx.x == 0) {
        d_max[blockIdx.x] = max_buff[0];
        d_idx[blockIdx.x] = idx_buff[0];
        d_sum[blockIdx.x] = sum_buff[0];
        d_abs[blockIdx.x] = abs_buff[0];
    }
}
// Host-side driver: fills a buffer with (1.0, 0.0) samples, runs fold_kernel
// over NUM_BLOCKS blocks, copies back the per-block results, and prints both
// the per-block sums and the overall reduction.  For the all-ones input the
// expected overall sum is (BUFF_SIZE, 0.0) and each block sum is THR_PER_BLK.
__host__ void test_fold_kernel() {
    const uint32_t BUFF_SIZE = 32768;   // == NUM_BLOCKS * THR_PER_BLK
    const uint32_t NUM_ITER = 1;
    cfloat_t *h_array = (cfloat_t*)malloc(BUFF_SIZE*sizeof(cfloat_t));
    cfloat_t *d_array; CUDA_SAFE_CALL(cudaMalloc((void**)&d_array, BUFF_SIZE*sizeof(cfloat_t)));
    float    *h_max_arr = (float*)   malloc(NUM_BLOCKS*sizeof(float));
    uint32_t *h_idx_arr = (uint32_t*)malloc(NUM_BLOCKS*sizeof(uint32_t));
    cfloat_t *h_sum_arr = (cfloat_t*)malloc(NUM_BLOCKS*sizeof(cfloat_t));
    float    *h_abs_arr = (float*)   malloc(NUM_BLOCKS*sizeof(float));
    float    *d_max_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_max_arr, NUM_BLOCKS*sizeof(float)));
    uint32_t *d_idx_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_idx_arr, NUM_BLOCKS*sizeof(uint32_t)));
    cfloat_t *d_sum_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t)));
    float    *d_abs_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_abs_arr, NUM_BLOCKS*sizeof(float)));
    // < CHANGE HERE > — test pattern; float literals so no double math sneaks in.
    for (size_t ii = 0; ii < BUFF_SIZE; ii++) {
        h_array[ii].re = 1.0f; //(float)ii;
        h_array[ii].im = 0.0f;
    }
    for (size_t iter = 0; iter < NUM_ITER; iter++) {
        CUDA_SAFE_CALL(
            cudaMemcpy(d_array, h_array, BUFF_SIZE*sizeof(cfloat_t), cudaMemcpyHostToDevice));
        fold_kernel<<<NUM_BLOCKS, THR_PER_BLK>>>(d_array, d_max_arr, d_idx_arr, d_sum_arr, d_abs_arr);
        // Kernel launches don't return an error status directly — check it
        // explicitly; the blocking memcpys below also synchronize with the kernel.
        CUDA_SAFE_CALL(cudaGetLastError());
        // Copy results back.
        CUDA_SAFE_CALL(cudaMemcpy(h_max_arr, d_max_arr, NUM_BLOCKS*sizeof(float), cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_idx_arr, d_idx_arr, NUM_BLOCKS*sizeof(uint32_t), cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_sum_arr, d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t), cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_abs_arr, d_abs_arr, NUM_BLOCKS*sizeof(float), cudaMemcpyDeviceToHost));
        printf("Sums: \n");
        for (size_t ii = 0; ii < NUM_BLOCKS; ii++) {
            // Cast: %i with a size_t argument is undefined behavior on LP64.
            printf(" %lu -> (%f, %f)\n", (unsigned long)ii, h_sum_arr[ii].re, h_sum_arr[ii].im);
        }
        // Fold the per-block results down to the final answers on the host.
        float max = 0.0f;
        uint32_t max_idx = 0;
        cfloat_t sum; sum.re = 0.0f; sum.im = 0.0f;
        float abs_total = 0.0f;   // renamed: `abs` shadows the stdlib function
        for (size_t ii = 0; ii < NUM_BLOCKS; ii++) {
            if (h_max_arr[ii] > max) {
                max = h_max_arr[ii];
                max_idx = h_idx_arr[ii];
            }
            sum.re += h_sum_arr[ii].re;
            sum.im += h_sum_arr[ii].im;
            abs_total += h_abs_arr[ii];
        }
        printf("Sum: (%f, %f), Abs: %f Max: %f Idx: %u\n",
               sum.re, sum.im, abs_total, max, max_idx);
    }
    // Release device and host buffers (previously leaked).
    CUDA_SAFE_CALL(cudaFree(d_array));
    CUDA_SAFE_CALL(cudaFree(d_max_arr));
    CUDA_SAFE_CALL(cudaFree(d_idx_arr));
    CUDA_SAFE_CALL(cudaFree(d_sum_arr));
    CUDA_SAFE_CALL(cudaFree(d_abs_arr));
    free(h_array);
    free(h_max_arr);
    free(h_idx_arr);
    free(h_sum_arr);
    free(h_abs_arr);
}
// Entry point: run the fold-kernel test once and exit.
int main(void) {
    test_fold_kernel();
    return EXIT_SUCCESS;
}[/codebox]
Which can be compiled with:
NVCC_OPTIONS = --ptxas-options=-v
NVCC_INCLUDE = -I/usr/local/cuda/include -I/usr/local/cuda/sdk/C/common/inc
test: test.cu
nvcc $(NVCC_OPTIONS) $(NVCC_INCLUDE) test.cu -o test
When the constructor on the struct is commented out, the code runs just as you’d expect: I feed it 32K (1.0,0.0) values and get (32768.0,0.0) as the output. However, when you uncomment the constructor and recompile, some non-deterministic behavior takes over and some blocks don’t sum up to 128 as they should.
I’m totally open to the notion that I’ve missed something, but it seems like that default constructor should be supported. Is anyone able to shed some light on this, or should I go ahead and file a bug report? I’ve confirmed this behavior with CUDA 3.1 on Fedora Core 13 and RHEL 5.