cudaErrorInvalidValue on Second Iteration

Hello,
I have seen a couple of forum posts on this, but none seem to apply to my code. I am getting the dreaded cudaErrorInvalidValue on the second call (second iteration) of this function from the host code. The code below handles all the data prep and launches the kernel. Can someone lend some insight into why I'm getting this error only the second time?

// Allocates device storage for `count` elements of T, stores the allocation in
// *devPtr and fills it from hostSrc.
// Returns the first failing CUDA status. If the allocation itself fails,
// *devPtr is forced to nullptr so the caller can cudaFree() it unconditionally.
template <typename T>
static cudaError_t CalcshapefunctionUpload(T** devPtr, const T* hostSrc, size_t count)
{
	const size_t bytes = count * sizeof(T);
	cudaError_t status = cudaMalloc((void**)devPtr, bytes);
	if (status != cudaSuccess) {
		*devPtr = nullptr;
		return status;
	}
	return cudaMemcpy(*devPtr, hostSrc, bytes, cudaMemcpyHostToDevice);
}

// Copies `count` elements of T from a device buffer back into host memory.
template <typename T>
static cudaError_t CalcshapefunctionDownload(T* hostDst, const T* devSrc, size_t count)
{
	return cudaMemcpy(hostDst, devSrc, count * sizeof(T), cudaMemcpyDeviceToHost);
}

// Host-side driver for the Calcshapefunction kernel: uploads every input
// buffer to device 0, launches the kernel with <<<NumBlocks, NumThreads>>>,
// waits for it, copies every buffer back into the caller's arrays and frees
// the device memory again.
//
// Parameters
//   xForces, yForces, zForces, Temperature
//       Host arrays; NumThreads * NumBlocks + 1 doubles are copied to the
//       device and back for each of them, so each array must hold at least
//       that many elements.
//       NOTE(review): the "+ 1" is kept from the original code - confirm it
//       matches both the kernel's indexing and the caller's allocations.
//   m_ChipTable
//       Host array of numfeed * numspeed * numrake * numbackrake * 4 doubles.
//   shapefunction
//       Host array of 16 doubles.
//   ifeed, ispeed, irake, ibackrake
//       Host arrays of 3 ints each.
//   numspeed, numrake, numbackrake
//       Table dimensions; also forwarded to the kernel.
//   numfeed
//       Table dimension; used only to size m_ChipTable.
//   NumThreads, NumBlocks
//       Threads per block and number of blocks of the launch.
//
// Returns cudaSuccess, or the status of the first CUDA call that failed.
// Non-positive launch dimensions yield cudaErrorInvalidConfiguration and
// negative table dimensions yield cudaErrorInvalidValue before any device
// work is attempted. Device buffers are released on every path.
cudaError_t CalcshapefunctionPrep(double* xForces, double* yForces, double* zForces, double* Temperature, double* m_ChipTable, double* shapefunction,
int* ifeed, int numspeed, int numrake, int* ispeed, int* irake, int numbackrake, int* ibackrake, int numfeed,int NumThreads, int NumBlocks)
{
	// Reject sizes that would otherwise turn into bogus (huge or wrapped)
	// allocation sizes once converted to size_t.
	if (NumThreads <= 0 || NumBlocks <= 0) {
		return cudaErrorInvalidConfiguration;
	}
	if (numfeed < 0 || numspeed < 0 || numrake < 0 || numbackrake < 0) {
		return cudaErrorInvalidValue;
	}

	// Every local is declared (and initialised) before the first `goto`:
	// C++ does not allow a jump to bypass an initialised declaration, and the
	// cleanup code must never see an indeterminate pointer.
	cudaError_t cudaStatus = cudaSuccess;

	double* xForcesLocal = nullptr;
	double* yForcesLocal = nullptr;
	double* zForcesLocal = nullptr;
	double* TemperatureLocal = nullptr;
	double* m_ChipTableLocal = nullptr;
	double* shapefunctionLocal = nullptr;
	int* ifeedlocal = nullptr;
	int* ispeedLocal = nullptr;
	int* irakeLocal = nullptr;
	int* ibackrakeLocal = nullptr;

	// Element counts. Each operand is widened to size_t before multiplying so
	// the products cannot overflow `int`.
	const size_t dataCount = (size_t)NumThreads * (size_t)NumBlocks + 1;
	const size_t chipTableCount = (size_t)numfeed * (size_t)numspeed * (size_t)numrake * (size_t)numbackrake * 4;
	const size_t shapefunctionCount = 16;
	const size_t indexCount = 3;   // ifeed / ispeed / irake / ibackrake

	// Choose which GPU to run on, change this on a multi-GPU system.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) goto Cleanup;

	// ---- Host -> device -------------------------------------------------
	cudaStatus = CalcshapefunctionUpload(&xForcesLocal, xForces, dataCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionUpload(&yForcesLocal, yForces, dataCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionUpload(&zForcesLocal, zForces, dataCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionUpload(&TemperatureLocal, Temperature, dataCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;

	cudaStatus = CalcshapefunctionUpload(&ifeedlocal, ifeed, indexCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionUpload(&m_ChipTableLocal, m_ChipTable, chipTableCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionUpload(&shapefunctionLocal, shapefunction, shapefunctionCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionUpload(&ispeedLocal, ispeed, indexCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionUpload(&irakeLocal, irake, indexCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionUpload(&ibackrakeLocal, ibackrake, indexCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;

	// ---- Kernel launch --------------------------------------------------
	// cudaGetLastError() reports the last error of *any* earlier runtime call
	// in this host thread. Clear whatever may still be pending (e.g. from a
	// previous iteration) so the check after the launch reflects this launch
	// only. Every call made by this function so far has already been checked.
	(void)cudaGetLastError();

	Calcshapefunction <<<NumBlocks, NumThreads>>> (xForcesLocal, yForcesLocal, zForcesLocal, TemperatureLocal, m_ChipTableLocal,
		shapefunctionLocal, ifeedlocal, numspeed, numrake, ispeedLocal, irakeLocal, numbackrake, ibackrakeLocal);

	// Launch-configuration errors surface here ...
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) goto Cleanup;

	// ... and errors raised while the kernel runs surface at the sync.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) goto Cleanup;

	// ---- Device -> host -------------------------------------------------
	// All ten buffers are copied back, as the original code did; which of
	// them the kernel actually writes is not visible from this function.
	cudaStatus = CalcshapefunctionDownload(xForces, xForcesLocal, dataCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionDownload(yForces, yForcesLocal, dataCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionDownload(zForces, zForcesLocal, dataCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionDownload(Temperature, TemperatureLocal, dataCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;

	cudaStatus = CalcshapefunctionDownload(ifeed, ifeedlocal, indexCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionDownload(m_ChipTable, m_ChipTableLocal, chipTableCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionDownload(shapefunction, shapefunctionLocal, shapefunctionCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionDownload(ispeed, ispeedLocal, indexCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionDownload(irake, irakeLocal, indexCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;
	cudaStatus = CalcshapefunctionDownload(ibackrake, ibackrakeLocal, indexCount);
	if (cudaStatus != cudaSuccess) goto Cleanup;

Cleanup:
	// Reached on success and on every failure. Pointers that were never
	// allocated are still nullptr, for which cudaFree() is a no-op. The
	// results of the cleanup calls are intentionally not propagated so the
	// caller sees the status of the first real failure (or cudaSuccess).
	cudaFree(xForcesLocal);
	cudaFree(yForcesLocal);
	cudaFree(zForcesLocal);
	cudaFree(TemperatureLocal);

	cudaFree(ifeedlocal);
	cudaFree(m_ChipTableLocal);
	cudaFree(shapefunctionLocal);
	cudaFree(ispeedLocal);
	cudaFree(irakeLocal);
	cudaFree(ibackrakeLocal);
	cudaDeviceSynchronize();
	return cudaStatus;

}