JCUDA code crashes/terminates

My requirement is that I have to call the kernel 300 times, and each kernel call convolves two vectors of size 2000. The JCuda code and the kernel code follow. Execution runs smoothly for 52 iterations, but on the 53rd iteration the program terminates on its own, and sometimes the display is interrupted as well. Can someone help me with this?
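For reference, the operation is a full discrete convolution: for A of length n and B of length m, the result C has length n + m - 1, with

    C[k] = sum over j of A[j] * B[k - j]    (for all j with 0 <= j < n and 0 <= k - j < m)

The plain-Java loop that fills the expected array below computes exactly this and serves as the reference check.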

JCUDA CODE:
//===========================================================
// Name : convolve.java
// Author : Deepti Deshpande
// Version : 1.5
// Description : CUDA version of Convolve
//===========================================================
import java.io.IOException;
import jcuda.*;
import jcuda.runtime.JCuda;
import jcuda.driver.*;
import static jcuda.driver.JCudaDriver.*;

/*
 * This is a JCuda version of Convolve that instantiates the cudaConvolve.cu method
 */
public class convolve
{

    public static void main(String[] args) throws IOException
    {

     int asize = 15;
     int bsize = 20;
     double expected[]=new double[asize+bsize-1];
     double expected1[]=new double[asize+bsize-1];
     double Res[][] = new double[2][asize+bsize-1];
     
    
     int m,k;
     
     
     // Allocate and fill the host input arrays A and B with random
     // values, printing each element as it is generated.
     double A[] = new double[asize];
     for(int i = 0; i < asize; i++)
     {
             A[i] = Math.random();
             System.out.println("A["+i+"]="+A[i]);
         
     }
     double B[] = new double[bsize];
     for(int i = 0; i < bsize; i++)
     {
             B[i] = Math.random();
             System.out.println("B["+i+"]="+B[i]);
         
     }
     for(int j = 0; j < asize; j++) {
     	  m = 0;
     	  k = j;
     	  while(m < bsize ) {
     		  expected[k] += A[j]*B[m];
     		  expected1[k] =(1.0)+(k*1.0);
     		  k++;
     		  m++;
     	  }
       }
     CUdeviceptr aMem = new CUdeviceptr();
     JCuda.cudaMalloc(aMem, asize * Sizeof.DOUBLE);
     CUdeviceptr bMem = new CUdeviceptr();
     JCuda.cudaMalloc(bMem, bsize * Sizeof.DOUBLE);
     cuMemcpyHtoD(aMem,
                 Pointer.to(A), A.length * Sizeof.DOUBLE);
     	cuMemcpyHtoD(bMem,
                 Pointer.to(B), B.length * Sizeof.DOUBLE);
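     // (Note: the cudaMalloc calls above are runtime-API calls while
     // cuMemcpyHtoD is a driver-API call; as far as I understand, JCuda
     // allows mixing the two here.)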
     for(int z=0;z<200;z++){
     Res = convol(aMem,bMem,1.0,1.0,A.length,B.length);
     boolean passed = true;
     for(int i = 0; i < (asize+bsize)-1; i++)
     {           
    

        // System.out.println("expected["+i+"]="+expected[i]+"\tC["+i+"]="+Res[1][i]+"\texpected1["+i+"]="+expected1[i]+"\tD["+i+"]="+Res[0][i]);
        if ((Math.abs(expected[i]-Res[1][i]) > 0.000001))
        {
            System.out.println("expected["+i+"]="+expected[i]+" but calculated value is C["+i+"]="+Res[1][i]);
            passed = false;
            break;
        }
        else if((Math.abs(expected1[i]-Res[0][i]) > 0.000001)){
            System.out.println("expected1["+i+"]="+expected1[i]+" but calculated value is D["+i+"]="+Res[0][i]);
            passed = false;
            break;
        }

        JCuda.cudaThreadSynchronize();
    }
    System.out.println("Test "+(passed?"PASSED "+z:"FAILED"));
	}
	System.out.println("Hitchikers guide to galaxy");
}

// System.out.printf("Execution time:\n 1. Copy data from Host Memory to Device Memory = %5.3fms \n 2. CUDA convolve = %5.3fms\n 3. Copy data from Device Memory to Host Memory = %5.3fms\n 4. Java convolve = %5.3fms\n 5. Total time in CUDA computation = %5.3fms",durationCopy/1e6,durationComp/1e6,durationCopy1/1e6,durationJava/1e6,(durationCopy+durationCopy1+durationComp)/1e6);

	public static double[][]  convol(CUdeviceptr a,CUdeviceptr b, double d,double dinc,int asize,int bsize){
		cuInit(0);
        CUcontext pctx = new CUcontext();
        CUdevice dev = new CUdevice();
        cuDeviceGet(dev, 0);
        cuCtxCreate(pctx, 0, dev);
        CUmodule module = new CUmodule();
        cuModuleLoad(module, "/home/ddeshpande/cuda-workspace/cudaConvolve/src/cudaConvolve.ptx");
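        // (Note: cuInit, the context creation, and the module load above all
        // run on every call to convol(), i.e. once per iteration of the z loop.)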

        // Obtain a function pointer to the "MatrixConvolveKernel" function.
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "MatrixConvolveKernel");

    	double Res[][] = new double[2][asize+bsize];

        // CUdeviceptr aMem = new CUdeviceptr();
        // JCuda.cudaMalloc(aMem, asize * Sizeof.DOUBLE);
        // CUdeviceptr bMem = new CUdeviceptr();
        // JCuda.cudaMalloc(bMem, bsize * Sizeof.DOUBLE);
        CUdeviceptr cMem = new CUdeviceptr();
        JCuda.cudaMalloc(cMem, (asize+bsize) * Sizeof.DOUBLE);

        CUdeviceptr dMem = new CUdeviceptr();
        JCuda.cudaMalloc(dMem, (asize+bsize) * Sizeof.DOUBLE);

        // cuMemcpyHtoD(aMem,
        //     Pointer.to(a), asize * Sizeof.DOUBLE);
        //
        // cuMemcpyHtoD(bMem,
        //     Pointer.to(b), bsize * Sizeof.DOUBLE);
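        // Note on the parameter-passing convention (as I understand the JCuda
        // driver API): each kernel argument is wrapped in its own Pointer, and
        // cuLaunchKernel receives a single Pointer to that list, so scalar
        // arguments like asize must be boxed in one-element arrays.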
        Pointer kernelParams = Pointer.to(
            Pointer.to(a),
            Pointer.to(b),
            Pointer.to(cMem),
            Pointer.to(new int[]{asize}),
            Pointer.to(new int[]{bsize}),
            Pointer.to(dMem),
            Pointer.to(new double[]{d}),
            Pointer.to(new double[]{dinc})
        );

                // Call the kernel function.
                cuLaunchKernel(function, 
                    1, 1, 1,     // Grid dimension 
                    1024, 1, 1,  // Block dimension
                    (asize+bsize), null, // Shared memory size and stream 
                    kernelParams, null // Kernel- and extra parameters
                ); 
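                // (As written, this launches a single block of 1024 threads and
                // requests (asize+bsize) bytes of dynamic shared memory; the
                // kernel declares its shared array statically.)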
                cuCtxSynchronize();
                // Allocate host output memory and copy the device output
                // to the host.
                cuMemcpyDtoH(Pointer.to(Res[1]), cMem,
                    (asize+bsize) * Sizeof.DOUBLE);
                cuMemcpyDtoH(Pointer.to(Res[0]), dMem,
                        (asize+bsize) * Sizeof.DOUBLE);

        // for(int i=0;i<(asize+bsize)-1;i++){
        //     System.out.println("Res[0]["+i+"]="+Res[0][i]);
        // }
        JCuda.cudaFree(a);
        JCuda.cudaFree(b);
        JCuda.cudaFree(cMem);
        JCuda.cudaFree(dMem);
return Res;

	}

}

CUDA KERNEL function:

//=====================================================
// Name : cudaConvolve.cu
// Author : Deepti Deshpande
// Version : 1.3
// Description : CUDA version of Convolve
//======================================================

extern "C"
__global__ void MatrixConvolveKernel(double *aMem, double *bMem, double *cMem, int aWidth, int bWidth, double *eMem, double d, double dinc) {
    /*
     * Shared memory to be accessed by all the convolve vectors in each execution thread
     */

    __shared__ double cSharedMem[4100];

    /*
     * Initialize the shared memory to zero
     */
    for(int i=0;i<4100-1;i++){
        cSharedMem[i]=0;
        eMem[i]=0;
    }
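    // (Note: every thread of the block runs the zero-initialization loop above
    // over all 4099 slots, and eMem is the global output buffer that the host
    // allocated with (asize+bsize) elements.)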

    /*
     * The GPU declaration has a 3 x 1024 structure, i.e., there are 3 blocks of
     * 1024 threads each. So we iterate over the blocks, and for each thread in
     * a block we perform the convolve operation using the global thread ID:
     * the current block offset plus the current thread ID. All thread memory
     * accesses have to be synchronized.
     */

    for(int i=0;i<aWidth;i+=(blockDim.x)){
        /*
         * Compute the global thread ID
         */
        int tx = i + threadIdx.x;

        int m=0;

        if(tx<aWidth){ // If the global thread ID is within limits

            for(m=0;m < bWidth;m++){
                __syncthreads();
                cSharedMem[tx+m] += aMem[tx]*bMem[m]; // Compute the convolve in GPU memory
            }
        }
    }

    /*
     * Store the result back from the shared memory to the result set
     */
    for(int i=0;i<(aWidth+bWidth)-1;i++){
        cMem[i]=cSharedMem[i];
        eMem[i]=d+(i*dinc);
    }

}
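One more note in case it helps with reproducing this: JCuda can be configured so that failing CUDA calls throw a CudaException instead of silently returning error codes. A minimal sketch of what I would add at the top of main (using the setExceptionsEnabled methods):

    // Make failing CUDA calls throw instead of silently returning error codes.
    JCuda.setExceptionsEnabled(true);
    JCudaDriver.setExceptionsEnabled(true);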