What does the error "call can not be configured" mean?

#include "FilterDefinition.h"
#include <cuda.h>
#include <math.h>
#include <math_constants.h>
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#include "devFilterBank.h"

/*
 * Host-side driver that runs devApplyFilterBankKernel once per output sample.
 *
 * For every t in [0, n) a window of (2 * fd->hTap + 1) input samples starting
 * at x[t] is copied to the device, the kernel is launched with a single block
 * of NUM_THREADS threads, and the NUM_THREADS partial results it writes are
 * summed on the host into y[t].
 *
 * Parameters:
 *   fd - host filter description; copied to the device as-is. Only fd->hTap
 *        is read on the host.
 *   h  - host array of (fd->hTap + 1) filter coefficients.
 *   n  - number of output samples to produce; nothing is written if n <= 0.
 *   x  - host input samples. NOTE(review): the loop reads
 *        x[t .. t + 2 * fd->hTap] for every t < n, so x must hold at least
 *        n + 2 * fd->hTap elements - confirm against the callers.
 *   y  - host output array receiving n values.
 *
 * Errors from the CUDA runtime are routed through CUDA_SAFE_CALL; a failed
 * host allocation prints a message and terminates the process.
 */
void hostApplyFilterBank(FilterDefinition* fd, float* h, int n, const float* x, float* y) {
    FilterDefinition *d_fd = NULL;
    float *d_h = NULL;
    float *d_x = NULL;
    float *d_sigmas = NULL;
    float *sigmas = NULL;

    /* Buffer sizes are loop-invariant, so compute them once (in bytes). */
    const size_t sigmasBytes = sizeof(float) * NUM_THREADS;
    const size_t windowBytes = sizeof(float) * (2 * fd->hTap + 1);
    const size_t coeffBytes  = sizeof(float) * (fd->hTap + 1);

    sigmas = (float *)malloc(sigmasBytes);
    if (sigmas == NULL) {
        fprintf(stderr, "hostApplyFilterBank: out of host memory\n");
        exit(EXIT_FAILURE);
    }

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sigmas, sigmasBytes));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_x, windowBytes));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_h, coeffBytes));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_fd, sizeof(FilterDefinition)));

    /*
     * Dynamic shared memory requested per block: room for every buffer the
     * kernel receives. NOTE(review): the kernel is not visible here; if it
     * declares no extern __shared__ storage this can be reduced to 0.
     */
    const size_t mem_size = sigmasBytes + windowBytes + coeffBytes + sizeof(FilterDefinition);

    dim3 grid(1, 1, 1);
    dim3 threads(NUM_THREADS, 1, 1);

    /* cudaMemcpy counts bytes, so the coefficient copy must be scaled by sizeof(float). */
    CUDA_SAFE_CALL(cudaMemcpy(d_h, h, coeffBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_fd, fd, sizeof(FilterDefinition), cudaMemcpyHostToDevice));

    for (int t = 0; t < n; t++) {
        CUDA_SAFE_CALL(cudaMemcpy(d_x, &(x[t]), windowBytes, cudaMemcpyHostToDevice));

        devApplyFilterBankKernel<<< grid, threads, mem_size >>>(d_sigmas, d_fd, d_h, d_x);
        /* A launch reports configuration errors only through cudaGetLastError(). */
        CUDA_SAFE_CALL(cudaGetLastError());

        /* Blocking copy: returns only after the kernel above has finished. */
        CUDA_SAFE_CALL(cudaMemcpy(sigmas, d_sigmas, sigmasBytes, cudaMemcpyDeviceToHost));

        /* Reduce the per-thread partial results on the host. */
        float z = 0.0F;
        for (int k = 0; k < NUM_THREADS; k++) {
            z += sigmas[k];
        }

        y[t] = z;
    }

    /* cudaFree takes the device pointer itself, not the address of the host variable. */
    CUDA_SAFE_CALL(cudaFree(d_fd));
    CUDA_SAFE_CALL(cudaFree(d_h));
    CUDA_SAFE_CALL(cudaFree(d_x));
    CUDA_SAFE_CALL(cudaFree(d_sigmas));

    free(sigmas);
}

Interesting code, did you have a question?

You’re not going to get very good performance with only one thread block.

When compiling this code, I got an error message:

“Call can not be configured.”

I wonder what this message means.

You should declare your kernel function with the `__global__` qualifier.

This should hopefully solve the problem.