Why this code was crashed

This code was crashed in if ch = 1.
Of course this code is not complete. There are no device running.
But crashing, I can’t understood.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include <math_constants.h>

const int DATASIZE = 48000;
const int FIR_CHMAX = 10;
const float FIR_FLOAT_FS = 48000.0F;

short* indata;
float* h;
short* outdata;
short* refdata;

struct FilterDefinitionF {
int nTap;
float fFH;
float fFL;
};

struct SourceSignal {
short* lpwSource;
int nSourceSize;
};

float FIR_ComputeFilterReferenceKernel(FilterDefinitionF* def, short* work, float* h, int i, int nn) {
float z = 0.0F;

for(int n=0; n < def->nTap; n++) {
	short x0 = work[i + n] + work[i + nn - n];
	z += h[n] * static_cast<float>(x0);			
}

return z;

}

void FIR_ComputeFilterFloat(FilterDefinitionF* def, SourceSignal source, short* destination) {
float* h = (float )malloc(def->nTap * sizeof(float));
float fWH = 2.0F * CUDART_PI_F * def->fFH / FIR_FLOAT_FS;
float fWL = 2.0F * CUDART_PI_F * def->fFL / FIR_FLOAT_FS;
float
d_h = NULL;
const int nThreads = 1;
FilterDefinitionF* d_def = NULL;

int datnum = def->nTap + source.nSourceSize;
int datnumtmp = def->nTap + source.nSourceSize + def->nTap;
int nn = 2 * def->nTap;
	
short* work = (short *)malloc(datnumtmp * sizeof(short));
short* d_work = NULL;
float* d_c = NULL;
float* c;
cudaError_t result;

for(int n = def->nTap; n > 0; n--) {
	h[def->nTap - n] = sinf(fWH * static_cast<float>(n)) - sinf(fWL* static_cast<float>(n)) / (static_cast<float>(CUDART_PI_F) * static_cast<float>(n));
}
	
int s = def->nTap;
int s2 = def->nTap + source.nSourceSize;
	
for(int i=0; i < def->nTap; i++) {
	work[i]=0;
}
for(int i=0; i < source.nSourceSize; i++) {
	work[s + i]= source.lpwSource[i];
}
for(int i=0; i < def->nTap; i++) {
	work[s2 + i]=0;
}

result = CUDA_SAFE_CALL(cudaMalloc((void**)&d_def, sizeof(FilterDefinitionF)));
if(result == cudaErrorMemoryAllocation) {
	puts("Error");
	return;
}
result = CUDA_SAFE_CALL(cudaMalloc((void**)&d_work, sizeof(short) * datnumtmp));
if(result == cudaErrorMemoryAllocation) {
	puts("Error");
	return;
}
result = CUDA_SAFE_CALL(cudaMalloc((void**)&d_h, sizeof(float) * def->nTap));
if(result == cudaErrorMemoryAllocation) {
	puts("Error");
	return;
}
result = CUDA_SAFE_CALL(cudaMalloc((void**)&d_c, sizeof(float) * nThreads));
if(result == cudaErrorMemoryAllocation) {
	puts("Error");
	return;
}

c = (float *)malloc(sizeof(float) * nThreads);

CUDA_SAFE_CALL(cudaMemcpy(d_def, def, sizeof(FilterDefinitionF), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_work, work, sizeof(short) * datnumtmp, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_h, h, sizeof(float) * def->nTap, cudaMemcpyHostToDevice));

for(int i=0; i < source.nSourceSize; i++)
{
	float z = 0.0F;

	for(int n=0; n < def->nTap; n++) {
		short x0 = work[i + n] + work[i + nn - n];
		z += h[n] * static_cast<float>(x0);			
	}
	
	destination[i] = static_cast<short>( z + h[def->nTap] * static_cast<float>(work[ i + def->nTap]));
}
	
free(work);
free(h);

CUDA_SAFE_CALL(cudaFree((void**)&d_def));
CUDA_SAFE_CALL(cudaFree((void**)&d_work));
CUDA_SAFE_CALL(cudaFree((void**)&d_h));
CUDA_SAFE_CALL(cudaFree((void**)&d_c));
free©;

}

void FIR_ComputeFilterReference(FilterDefinitionF* def, SourceSignal source, short* destination) {
float* h = (float *)malloc(def->nTap * sizeof(float));
float fWH = 2.0F * CUDART_PI_F * def->fFH / FIR_FLOAT_FS;
float fWL = 2.0F * CUDART_PI_F * def->fFL / FIR_FLOAT_FS;

int datnum = def->nTap + source.nSourceSize;
int datnumtmp = def->nTap + source.nSourceSize + def->nTap;
int nn = 2 * def->nTap;
	
short* work = (short *)malloc(datnumtmp * sizeof(short));		

for(int n = def->nTap; n > 0; n--) {
	h[def->nTap - n] = sinf(fWH * static_cast<float>(n)) - sinf(fWL* static_cast<float>(n)) / (static_cast<float>(CUDART_PI_F) * static_cast<float>(n));
}
	
int s = def->nTap;
int s2 = def->nTap + source.nSourceSize;
	
for(int i=0; i < def->nTap; i++) {
	work[i]=0;
}
for(int i=0; i < source.nSourceSize; i++) {
	work[s + i]= source.lpwSource[i];
}
for(int i=0; i < def->nTap; i++) {
	work[s2 + i]=0;
}

for(int i=0; i < source.nSourceSize; i++)
{
	float z = FIR_ComputeFilterReferenceKernel(def, work, h, i, nn);
	
	destination[i] = static_cast<short>( z + h[def->nTap] * static_cast<float>(work[ i + def->nTap]));
}
	
free(work);
free(h);

}

bool FIR_ApplyFilterBankFloat(int ch, SourceSignal source, short* destination) {
const float fL[10]={ 20.0, 40.0, 80.0, 160.0, 320.0, 640.0, 1280.0, 2560.0, 5120.0, 10280.0 };
const float fH[10]={ 40.0, 80.0, 160.0, 320.0, 640.0, 1280.0, 2560.0, 5120.0, 10240.0, 20480.0 };
const int TAP[10]={ 29000, 25000, 12500, 6250, 3125, 1600, 1600, 1600, 1600, 1600 };

printf("Dev CH: %d\n", ch);
if(ch >= 0 && ch < FIR_CHMAX) {		
	FilterDefinitionF* def = (FilterDefinitionF*)malloc(sizeof(FilterDefinitionF));
	def->nTap = TAP[ch];
	def->fFL = fL[ch];
	def->fFH = fH[ch];
	
	FIR_ComputeFilterFloat(def, source, destination);
	free(def);
	
	return true;
}
else {
	return false;
}

}

bool FIR_ApplyFilterBankReference(int ch, SourceSignal source, short* destination) {
const float fL[10]={ 20.0, 40.0, 80.0, 160.0, 320.0, 640.0, 1280.0, 2560.0, 5120.0, 10280.0 };
const float fH[10]={ 40.0, 80.0, 160.0, 320.0, 640.0, 1280.0, 2560.0, 5120.0, 10240.0, 20480.0 };
const int TAP[10]={ 29000, 25000, 12500, 6250, 3125, 1600, 1600, 1600, 1600, 1600 };

printf("Ref CH: %d\n", ch);
if(ch >= 0 && ch < FIR_CHMAX) {		
	FilterDefinitionF* def = (FilterDefinitionF*)malloc(sizeof(FilterDefinitionF));
	def->nTap = TAP[ch];
	def->fFL = fL[ch];
	def->fFH = fH[ch];
	
	FIR_ComputeFilterReference(def, source, destination);
	free(def);
	
	return true;
}
else {
	return false;
}

}

void PrepareMemory() {
indata = (short *)malloc(DATASIZE * sizeof(short));
outdata = (short *)malloc(DATASIZE * sizeof(short));
refdata = (short *)malloc(DATASIZE * sizeof(short));
}

void ReleaseMemory() {
free(indata);
free(outdata);
free(refdata);
}

void InitSample() {
for(int i = 0; i < DATASIZE; i++) {
// indata[i] = 0.0F;
indata[i] = i;
}
indata[DATASIZE / 2] = 255.0F;
}

void DiffRef() {
double z = 0.0;

for(int i = 0; i < DATASIZE; i++) {
	z += fabs((double)outdata[i] - (double)refdata[i]);
}

printf("Diff: %f\n", z);

}

void RunTest() {
bool result;
PrepareMemory();
InitSample();

SourceSignal ss;
ss.lpwSource = indata;
ss.nSourceSize = DATASIZE;

unsigned int timer = 0;
for(int n = 0; n < FIR_CHMAX; n++) {
    CUT_SAFE_CALL( cutCreateTimer( &timer));
	CUT_SAFE_CALL( cutStartTimer( timer));
	result = FIR_ApplyFilterBankFloat(n, ss, outdata);
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf( "Device Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

	if(result) printf("TRUE\n");

    CUT_SAFE_CALL( cutCreateTimer( &timer));
	CUT_SAFE_CALL( cutStartTimer( timer));
	result = FIR_ApplyFilterBankReference(n, ss, refdata);
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf( "Reference Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

	if(result) printf("TRUE\n");
	
	DiffRef();
}

ReleaseMemory();

}

int main(int argc, char** argv) {
CUT_DEVICE_INIT();
RunTest();

CUT_EXIT(argc, argv);

}