Hello friends, I got this kernel code posted by forum member avidday. But after trying many hours, I still could not get it run properly.
His kernel code is here:
global void columnsumreduction(const int m, const int lda,
float *in, float *out)
{
extern shared float buff;
unsigned int tidx = threadIdx.x;
unsigned int idx = tidx + __mul24(blockIdx.x, lda);
unsigned int idxmax = m + __mul24(blockIdx.x, lda);
buff[tidx] = 0.;
// Compute partial sums down the column until we
// have covered the whole column
while (idx < idxmax) {
buff[tidx] += in[idx];
idx += blockDim.x;
}
__syncthreads();
// Parallel reduction of the partial sum
// in shared memory
if (blockDim.x == 512) {
if (tidx < 256)
buff[tidx] += buff[tidx + 256];
__syncthreads();
}
if (blockDim.x >= 256) {
if (tidx < 128)
buff[tidx] += buff[tidx + 128];
__syncthreads();
}
if (blockDim.x >= 128) {
if (tidx < 64)
buff[tidx] += buff[tidx + 64];
__syncthreads();
}
if (tidx < 32) {
if (blockDim.x >= 64) {
buff[tidx] += buff[tidx + 32];
}
if (blockDim.x >= 32) {
buff[tidx] += buff[tidx + 16];
}
if (blockDim.x >= 16) {
buff[tidx] += buff[tidx + 8];
}
if (blockDim.x >= 8) {
buff[tidx] += buff[tidx + 4];
}
if (blockDim.x >= 4) {
buff[tidx] += buff[tidx + 2];
}
if (blockDim.x >= 2) {
buff[tidx] += buff[tidx + 1];
}
}
// write result for this block to global mem
if (tidx == 0)
out[blockIdx.x] = buff[0];
}
The following is the host function I wrote trying to use his kernel:
int main()
{
float* in;
float* out;
int m = 4;
int lda = 8;
in =(float*)malloc(32sizeof(float));
out =(float)malloc(8*sizeof(float));
for(int i =0; i<32; i++)
{
in [i]= 1.2f;
out[i]= 0;
}
float* d_in;
float* d_out;
cudaMalloc((void**)&d_in,32sizeof(float));
cudaMemcpy(d_in,in,32sizeof(float),cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_out,8sizeof(float));
//cudaMemcpy(d_out,out,8sizeof(float),cudaMemcpyHostToDevice)
;
columnsumreduction<<<8,4>>>(m, lda, d_in, d_out);
cudaMemcpy(out,d_out,8*sizeof(float),cudaMemcpyDeviceToHost)
;
}
It can be compiled by NVCC, but could not be executed. The error message is
read_image: malloc.c:3074: sYSMALLOc: Assertion `(old_top == (((mbinptr) (((char *) &((av)->bins[((1) - 1) * 2])) - __builtin_offsetof (struct malloc_chunk, fd)))) && old_size == 0) || ((unsigned long) (old_size) >= (unsigned long)((((__builtin_offsetof (struct malloc_chunk, fd_nextsize))+((2 * (sizeof(size_t))) - 1)) & ~((2 * (sizeof(size_t))) - 1))) && ((old_top)->size & 0x1) && ((unsigned long)old_end & pagemask) == 0)’ failed.
Aborted
Please Help! Many many thanks!