Logarithmic fold operation behaving strangely

I’ve got some code I wrote, the goal being to fold a block of complex samples down into max, max index, sum and sum-of-moduli for some signal processing work I’m doing. My problem is that while testing I’m noticing some strange behavior. If I feed the kernel all (1.0, 0.0) samples, then it sums and produces the value one would expect. But, I can put a ramp into the buffer (buffer[ii] = ii) and the result is no longer correct. I’ve pored over the code and it looks correct to me, but I thought perhaps someone would see something I’ve missed or be able to point out some nuance of the CUDA runtime that I’m missing. I’m running CUDA 3.1 on a 8800GTS.

[codebox]#include <time.h>

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <cuda.h>

#include <cutil.h>

typedef struct cfloat_t {

float re;

float im;

} cfloat_t;

#define NUM_BLOCKS 256

#define THR_PER_BLK 128

__global__ void fold_kernel(cfloat_t *data,
						float	*d_max, uint32_t *d_idx,
						cfloat_t *d_sum, float	*d_abs) {
// Reduce one block's worth (blockDim.x) of complex samples down to four
// per-block results: the maximum modulus, the global sample index of that
// maximum, the complex sum, and the sum of moduli.  Thread 0 writes the
// block's results to d_max/d_idx/d_sum/d_abs[blockIdx.x].
//
// Preconditions: blockDim.x == THR_PER_BLK (the shared buffers are sized
// statically from the macro) and THR_PER_BLK is a power of two.
__shared__ float	max_buff[THR_PER_BLK/2];

__shared__ uint32_t idx_buff[THR_PER_BLK/2];

__shared__ cfloat_t sum_buff[THR_PER_BLK/2];

__shared__ float	abs_buff[THR_PER_BLK/2];

// Do first comparison out of global memory to cut shared memory usage in half

if (threadIdx.x < blockDim.x/2) {

	const uint32_t idx_lower = blockIdx.x * blockDim.x + threadIdx.x;

	const uint32_t idx_upper = idx_lower + (blockDim.x/2);

	cfloat_t val_1 = data[idx_lower];

	// sqrtf() keeps the arithmetic in single precision; a bare sqrt() is
	// the double overload, which pre-SM1.3 parts (e.g. an 8800-class GPU)
	// cannot execute natively.
	float	mag_1 = sqrtf(val_1.re * val_1.re + val_1.im * val_1.im);

	cfloat_t val_2 = data[idx_upper];

	float	mag_2 = sqrtf(val_2.re * val_2.re + val_2.im * val_2.im);

	// Do initial sum and sum-of-absolutes

	sum_buff[threadIdx.x].re = val_1.re + val_2.re;

	sum_buff[threadIdx.x].im = val_1.im + val_2.im;

	abs_buff[threadIdx.x]	= mag_1 + mag_2;

	// Initial max/argmax; ties go to the lower index.
	if (mag_2 > mag_1) {

		max_buff[threadIdx.x] = mag_2;

		idx_buff[threadIdx.x] = idx_upper;

	} else {

		max_buff[threadIdx.x] = mag_1;

		idx_buff[threadIdx.x] = idx_lower;

	}

}

// All blockDim.x/2 partials must be visible before the tree reduction reads them.
__syncthreads();

// Reduce to single value per block.  The barrier sits OUTSIDE the divergent
// branch so that every thread in the block reaches it each iteration.

for (size_t half = (THR_PER_BLK >> 2); half > 0; half >>= 1) {

	if (threadIdx.x < half) {

		const size_t idx_1 = threadIdx.x;

		const size_t idx_2 = threadIdx.x + half;

		// Fold sum and sum of absolutes

		sum_buff[idx_1].re += sum_buff[idx_2].re;

		sum_buff[idx_1].im += sum_buff[idx_2].im;

		abs_buff[idx_1]	+= abs_buff[idx_2];

		if (max_buff[idx_2] > max_buff[idx_1]) {

			max_buff[idx_1] = max_buff[idx_2];

			idx_buff[idx_1] = idx_buff[idx_2];

		}

	}

	__syncthreads();

}

// Thread 0 publishes this block's results.
if (threadIdx.x == 0) {

	d_max[blockIdx.x] = max_buff[0];

	d_idx[blockIdx.x] = idx_buff[0];

	d_sum[blockIdx.x] = sum_buff[0];

	d_abs[blockIdx.x] = abs_buff[0];

}

}

__host__ void test_fold_kernel() {
// Drive fold_kernel over a BUFF_SIZE-sample buffer and print per-block and
// whole-buffer sum, sum-of-moduli, max and argmax.
//
// NOTE(review): with a ramp fill (re = (float)ii) the true sum is
// 32767*32768/2 ~= 5.4e8, which exceeds float's exact-integer range
// (2^24 = 16777216).  Host and device totals will then differ by rounding
// even when the kernel is correct — compare with a tolerance, not exactly.
const uint32_t BUFF_SIZE   = 32768;  // must equal NUM_BLOCKS * THR_PER_BLK

const uint32_t NUM_ITER	= 1;

cfloat_t *h_array = (cfloat_t*)malloc(BUFF_SIZE*sizeof(cfloat_t));

cfloat_t *d_array; CUDA_SAFE_CALL(cudaMalloc((void**)&d_array, BUFF_SIZE*sizeof(cfloat_t)));

float	*h_max_arr = (float*)   malloc(NUM_BLOCKS*sizeof(float));

uint32_t *h_idx_arr = (uint32_t*)malloc(NUM_BLOCKS*sizeof(uint32_t));

cfloat_t *h_sum_arr = (cfloat_t*)malloc(NUM_BLOCKS*sizeof(cfloat_t));

float	*h_abs_arr = (float*)   malloc(NUM_BLOCKS*sizeof(float));

float	*d_max_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_max_arr, NUM_BLOCKS*sizeof(float)));

uint32_t *d_idx_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_idx_arr, NUM_BLOCKS*sizeof(uint32_t)));

cfloat_t *d_sum_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t)));

float	*d_abs_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_abs_arr, NUM_BLOCKS*sizeof(float)));

// < CHANGE HERE >

for (size_t ii=0; ii < BUFF_SIZE; ii++) {

	h_array[ii].re = 1.0f; //(float)ii;

	h_array[ii].im = 0.0f;

}

for (size_t iter = 0; iter < NUM_ITER; iter++) {

	CUDA_SAFE_CALL(

	  cudaMemcpy(d_array, h_array, BUFF_SIZE*sizeof(cfloat_t), cudaMemcpyHostToDevice));

	fold_kernel<<<NUM_BLOCKS, THR_PER_BLK>>>(d_array, d_max_arr, d_idx_arr, d_sum_arr, d_abs_arr);

	// Kernel launches are asynchronous and report nothing themselves:
	// pick up launch-configuration errors now and execution errors at sync.
	CUDA_SAFE_CALL(cudaGetLastError());

	CUDA_SAFE_CALL(cudaThreadSynchronize());

	// Copy results back

	CUDA_SAFE_CALL(cudaMemcpy(h_max_arr, d_max_arr, NUM_BLOCKS*sizeof(float),	cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(h_idx_arr, d_idx_arr, NUM_BLOCKS*sizeof(uint32_t), cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(h_sum_arr, d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t), cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(h_abs_arr, d_abs_arr, NUM_BLOCKS*sizeof(float),	cudaMemcpyDeviceToHost));

	printf("Sums: \n");

	for (size_t ii=0; ii < NUM_BLOCKS; ii++) {

		// ii is size_t: passing it through "%i" is undefined on LP64
		// hosts and can garble the output, so cast explicitly.
		printf(" %u -> (%f, %f)\n", (unsigned)ii, h_sum_arr[ii].re, h_sum_arr[ii].im);

	}

	// Fold the per-block partials down to whole-buffer results on the host.
	float	max	 = 0.0f;

	uint32_t max_idx = 0;

	cfloat_t sum; sum.re = 0.0f; sum.im = 0.0f;

	float	abs	 = 0.0f;

	for (size_t ii=0; ii < NUM_BLOCKS; ii++) {

		if (h_max_arr[ii] > max) {

			max	 = h_max_arr[ii];

			max_idx = h_idx_arr[ii];

		}

		sum.re += h_sum_arr[ii].re;

		sum.im += h_sum_arr[ii].im;

		abs	+= h_abs_arr[ii];

	}

	printf("Sum: (%f, %f), Abs: %f  Max: %f  Idx: %u\n",

		   sum.re, sum.im, abs, max, max_idx);

}

// Release device and host buffers (the original leaked all of them).
CUDA_SAFE_CALL(cudaFree(d_abs_arr));
CUDA_SAFE_CALL(cudaFree(d_sum_arr));
CUDA_SAFE_CALL(cudaFree(d_idx_arr));
CUDA_SAFE_CALL(cudaFree(d_max_arr));
CUDA_SAFE_CALL(cudaFree(d_array));
free(h_abs_arr);
free(h_sum_arr);
free(h_idx_arr);
free(h_max_arr);
free(h_array);
}

// Entry point: run the fold-kernel smoke test, then exit cleanly.
int main() {
	test_fold_kernel();
	return 0;
}

I’ve got some code I wrote, the goal being to fold a block of complex samples down into max, max index, sum and sum-of-moduli for some signal processing work I’m doing. My problem is that while testing I’m noticing some strange behavior. If I feed the kernel all (1.0, 0.0) samples, then it sums and produces the value one would expect. But, I can put a ramp into the buffer (buffer[ii] = ii) and the result is no longer correct. I’ve pored over the code and it looks correct to me, but I thought perhaps someone would see something I’ve missed or be able to point out some nuance of the CUDA runtime that I’m missing. I’m running CUDA 3.1 on a 8800GTS.

[codebox]#include <time.h>

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <cuda.h>

#include <cutil.h>

typedef struct cfloat_t {

float re;

float im;

} cfloat_t;

#define NUM_BLOCKS 256

#define THR_PER_BLK 128

__global__ void fold_kernel(cfloat_t *data,
						float	*d_max, uint32_t *d_idx,
						cfloat_t *d_sum, float	*d_abs) {
// Reduce one block's worth (blockDim.x) of complex samples down to four
// per-block results: the maximum modulus, the global sample index of that
// maximum, the complex sum, and the sum of moduli.  Thread 0 writes the
// block's results to d_max/d_idx/d_sum/d_abs[blockIdx.x].
//
// Preconditions: blockDim.x == THR_PER_BLK (the shared buffers are sized
// statically from the macro) and THR_PER_BLK is a power of two.
__shared__ float	max_buff[THR_PER_BLK/2];

__shared__ uint32_t idx_buff[THR_PER_BLK/2];

__shared__ cfloat_t sum_buff[THR_PER_BLK/2];

__shared__ float	abs_buff[THR_PER_BLK/2];

// Do first comparison out of global memory to cut shared memory usage in half

if (threadIdx.x < blockDim.x/2) {

	const uint32_t idx_lower = blockIdx.x * blockDim.x + threadIdx.x;

	const uint32_t idx_upper = idx_lower + (blockDim.x/2);

	cfloat_t val_1 = data[idx_lower];

	// sqrtf() keeps the arithmetic in single precision; a bare sqrt() is
	// the double overload, which pre-SM1.3 parts (e.g. an 8800-class GPU)
	// cannot execute natively.
	float	mag_1 = sqrtf(val_1.re * val_1.re + val_1.im * val_1.im);

	cfloat_t val_2 = data[idx_upper];

	float	mag_2 = sqrtf(val_2.re * val_2.re + val_2.im * val_2.im);

	// Do initial sum and sum-of-absolutes

	sum_buff[threadIdx.x].re = val_1.re + val_2.re;

	sum_buff[threadIdx.x].im = val_1.im + val_2.im;

	abs_buff[threadIdx.x]	= mag_1 + mag_2;

	// Initial max/argmax; ties go to the lower index.
	if (mag_2 > mag_1) {

		max_buff[threadIdx.x] = mag_2;

		idx_buff[threadIdx.x] = idx_upper;

	} else {

		max_buff[threadIdx.x] = mag_1;

		idx_buff[threadIdx.x] = idx_lower;

	}

}

// All blockDim.x/2 partials must be visible before the tree reduction reads them.
__syncthreads();

// Reduce to single value per block.  The barrier sits OUTSIDE the divergent
// branch so that every thread in the block reaches it each iteration.

for (size_t half = (THR_PER_BLK >> 2); half > 0; half >>= 1) {

	if (threadIdx.x < half) {

		const size_t idx_1 = threadIdx.x;

		const size_t idx_2 = threadIdx.x + half;

		// Fold sum and sum of absolutes

		sum_buff[idx_1].re += sum_buff[idx_2].re;

		sum_buff[idx_1].im += sum_buff[idx_2].im;

		abs_buff[idx_1]	+= abs_buff[idx_2];

		if (max_buff[idx_2] > max_buff[idx_1]) {

			max_buff[idx_1] = max_buff[idx_2];

			idx_buff[idx_1] = idx_buff[idx_2];

		}

	}

	__syncthreads();

}

// Thread 0 publishes this block's results.
if (threadIdx.x == 0) {

	d_max[blockIdx.x] = max_buff[0];

	d_idx[blockIdx.x] = idx_buff[0];

	d_sum[blockIdx.x] = sum_buff[0];

	d_abs[blockIdx.x] = abs_buff[0];

}

}

__host__ void test_fold_kernel() {
// Drive fold_kernel over a BUFF_SIZE-sample buffer and print per-block and
// whole-buffer sum, sum-of-moduli, max and argmax.
//
// NOTE(review): with a ramp fill (re = (float)ii) the true sum is
// 32767*32768/2 ~= 5.4e8, which exceeds float's exact-integer range
// (2^24 = 16777216).  Host and device totals will then differ by rounding
// even when the kernel is correct — compare with a tolerance, not exactly.
const uint32_t BUFF_SIZE   = 32768;  // must equal NUM_BLOCKS * THR_PER_BLK

const uint32_t NUM_ITER	= 1;

cfloat_t *h_array = (cfloat_t*)malloc(BUFF_SIZE*sizeof(cfloat_t));

cfloat_t *d_array; CUDA_SAFE_CALL(cudaMalloc((void**)&d_array, BUFF_SIZE*sizeof(cfloat_t)));

float	*h_max_arr = (float*)   malloc(NUM_BLOCKS*sizeof(float));

uint32_t *h_idx_arr = (uint32_t*)malloc(NUM_BLOCKS*sizeof(uint32_t));

cfloat_t *h_sum_arr = (cfloat_t*)malloc(NUM_BLOCKS*sizeof(cfloat_t));

float	*h_abs_arr = (float*)   malloc(NUM_BLOCKS*sizeof(float));

float	*d_max_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_max_arr, NUM_BLOCKS*sizeof(float)));

uint32_t *d_idx_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_idx_arr, NUM_BLOCKS*sizeof(uint32_t)));

cfloat_t *d_sum_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t)));

float	*d_abs_arr; CUDA_SAFE_CALL(cudaMalloc((void**)&d_abs_arr, NUM_BLOCKS*sizeof(float)));

// < CHANGE HERE >

for (size_t ii=0; ii < BUFF_SIZE; ii++) {

	h_array[ii].re = 1.0f; //(float)ii;

	h_array[ii].im = 0.0f;

}

for (size_t iter = 0; iter < NUM_ITER; iter++) {

	CUDA_SAFE_CALL(

	  cudaMemcpy(d_array, h_array, BUFF_SIZE*sizeof(cfloat_t), cudaMemcpyHostToDevice));

	fold_kernel<<<NUM_BLOCKS, THR_PER_BLK>>>(d_array, d_max_arr, d_idx_arr, d_sum_arr, d_abs_arr);

	// Kernel launches are asynchronous and report nothing themselves:
	// pick up launch-configuration errors now and execution errors at sync.
	CUDA_SAFE_CALL(cudaGetLastError());

	CUDA_SAFE_CALL(cudaThreadSynchronize());

	// Copy results back

	CUDA_SAFE_CALL(cudaMemcpy(h_max_arr, d_max_arr, NUM_BLOCKS*sizeof(float),	cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(h_idx_arr, d_idx_arr, NUM_BLOCKS*sizeof(uint32_t), cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(h_sum_arr, d_sum_arr, NUM_BLOCKS*sizeof(cfloat_t), cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(h_abs_arr, d_abs_arr, NUM_BLOCKS*sizeof(float),	cudaMemcpyDeviceToHost));

	printf("Sums: \n");

	for (size_t ii=0; ii < NUM_BLOCKS; ii++) {

		// ii is size_t: passing it through "%i" is undefined on LP64
		// hosts and can garble the output, so cast explicitly.
		printf(" %u -> (%f, %f)\n", (unsigned)ii, h_sum_arr[ii].re, h_sum_arr[ii].im);

	}

	// Fold the per-block partials down to whole-buffer results on the host.
	float	max	 = 0.0f;

	uint32_t max_idx = 0;

	cfloat_t sum; sum.re = 0.0f; sum.im = 0.0f;

	float	abs	 = 0.0f;

	for (size_t ii=0; ii < NUM_BLOCKS; ii++) {

		if (h_max_arr[ii] > max) {

			max	 = h_max_arr[ii];

			max_idx = h_idx_arr[ii];

		}

		sum.re += h_sum_arr[ii].re;

		sum.im += h_sum_arr[ii].im;

		abs	+= h_abs_arr[ii];

	}

	printf("Sum: (%f, %f), Abs: %f  Max: %f  Idx: %u\n",

		   sum.re, sum.im, abs, max, max_idx);

}

// Release device and host buffers (the original leaked all of them).
CUDA_SAFE_CALL(cudaFree(d_abs_arr));
CUDA_SAFE_CALL(cudaFree(d_sum_arr));
CUDA_SAFE_CALL(cudaFree(d_idx_arr));
CUDA_SAFE_CALL(cudaFree(d_max_arr));
CUDA_SAFE_CALL(cudaFree(d_array));
free(h_abs_arr);
free(h_sum_arr);
free(h_idx_arr);
free(h_max_arr);
free(h_array);
}

// Entry point: run the fold-kernel smoke test, then exit cleanly.
int main() {
	test_fold_kernel();
	return 0;
}