Need help on Paralle Reduction

Hello friends, I got this kernel code posted by forum member avidday. But after trying many hours, I still could not get it run properly.

His kernel code is here:
global void columnsumreduction(const int m, const int lda,
float *in, float *out)
{
extern shared float buff;

unsigned int tidx   = threadIdx.x;
unsigned int idx    = tidx + __mul24(blockIdx.x, lda);
unsigned int idxmax = m + __mul24(blockIdx.x, lda);

buff[tidx] = 0.;

// Compute partial sums down the column until we
// have covered the whole column
while (idx < idxmax) {
    buff[tidx] += in[idx];
    idx += blockDim.x;
}

__syncthreads();

// Parallel reduction of the partial sum
// in shared memory
if (blockDim.x == 512) {
    if (tidx < 256)
        buff[tidx] += buff[tidx + 256];
    __syncthreads();
}

if (blockDim.x >= 256) {
    if (tidx < 128)
        buff[tidx] += buff[tidx + 128];
    __syncthreads();
}

if (blockDim.x >= 128) {
    if (tidx < 64)
        buff[tidx] += buff[tidx + 64];

    __syncthreads();
}

if (tidx < 32) {
    if (blockDim.x >=  64) {
        buff[tidx] += buff[tidx + 32];
    }
    if (blockDim.x >=  32) {
        buff[tidx] += buff[tidx + 16];
    }
    if (blockDim.x >=  16) {
        buff[tidx] += buff[tidx + 8];
    }
    if (blockDim.x >=   8) {
        buff[tidx] += buff[tidx + 4];
    }
    if (blockDim.x >=   4) {
        buff[tidx] += buff[tidx + 2];
    }
    if (blockDim.x >=   2) {
        buff[tidx] += buff[tidx + 1];
    }
}

// write result for this block to global mem
if (tidx == 0)
    out[blockIdx.x] = buff[0];

}

The following is the host function I wrote trying to use his kernel:
int main()
{
float* in;
float* out;
int m = 4;
int lda = 8;

in =(float*)malloc(32sizeof(float));
out =(float
)malloc(8*sizeof(float));

for(int i =0; i<32; i++)
{
in [i]= 1.2f;
out[i]= 0;
}

float* d_in;
float* d_out;
cudaMalloc((void**)&d_in,32sizeof(float));
cudaMemcpy(d_in,in,32
sizeof(float),cudaMemcpyHostToDevice);

cudaMalloc((void**)&d_out,8sizeof(float));
//cudaMemcpy(d_out,out,8
sizeof(float),cudaMemcpyHostToDevice)
;

columnsumreduction<<<8,4>>>(m, lda, d_in, d_out);

cudaMemcpy(out,d_out,8*sizeof(float),cudaMemcpyDeviceToHost)
;
}

It can be compiled by NVCC, but could not be executed. The error message is

read_image: malloc.c:3074: sYSMALLOc: Assertion `(old_top == (((mbinptr) (((char *) &((av)->bins[((1) - 1) * 2])) - __builtin_offsetof (struct malloc_chunk, fd)))) && old_size == 0) || ((unsigned long) (old_size) >= (unsigned long)((((__builtin_offsetof (struct malloc_chunk, fd_nextsize))+((2 * (sizeof(size_t))) - 1)) & ~((2 * (sizeof(size_t))) - 1))) && ((old_top)->size & 0x1) && ((unsigned long)old_end & pagemask) == 0)’ failed.
Aborted

Please Help! Many many thanks!