Program is giving an illegal memory access was encountered error with higher number of threads

Hello,
I am working on Merge Sort program for my university project and as soon as I go beyond block 10 and threads 204 in the program it’s giving above error, also when I am running this program on different GPU it’s working fine.

device void gpu_bottomUpMerge(int source, int dest, int start, int middle, int end) {
int i = start;
int j = middle;
for (int k = start; k < end; k++) {
if (i < middle && (j >= end || source[i] < source[j])) {
dest[k] = source[i];
i++;
} else {
dest[k] = source[j];
j++;
}
}
}

global void gpu_mergesort(int source, int dest, int size, int width, int slices, dim3* threads, dim3* blocks) {

int idx = blockDim .x * blockIdx .x + threadIdx .x;

int start = width*idx*slices, 
     middle, 
     end;

for (int slice = 0; slice < slices; slice++) {
    if (start >= size)
        break;

    middle = min(start + (width >> 1), size);
    end = min(start + width, size);
    gpu_bottomUpMerge(source, dest, start, middle, end);
    start += width;
}

}

void mergesort(int data, int size, dim3 threadsPerBlock, dim3 blocksPerGrid) {

// Allocate two arrays on the GPU we switch back and forth between them during the sort

int* D_data;
int* D_swp;
dim3* D_threads;
dim3* D_blocks;

// Actually allocate the two arrays

HANDLE_ERROR(cudaMalloc((void**) &D_data, size * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**) &D_swp, size * sizeof(int)));

// Copy from our input list into the first array
HANDLE_ERROR(cudaMemcpy(D_data, data, size * sizeof(int), cudaMemcpyHostToDevice));  

int* A = D_data;
int* B = D_swp;

int nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z * blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;

// Divide the list and give pieces of it to each thread, letting the pieces grow bigger and bigger until the whole list is sorted
for (int width = 2; width < (size << 1); width <<= 1) {
    int slices = size / ((nThreads) * width) + 1;
  
    // Actually call the kernel
    gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);
    cudaDeviceSynchronize();
    
    // Switch the input / output arrays instead of copying them around
    A = A == D_data ? D_swp : D_data;
    B = B == D_data ? D_swp : D_data;
}

// Get the list back from the GPU 
HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost));

// Free the GPU memory
HANDLE_ERROR(cudaFree(A));
HANDLE_ERROR(cudaFree(B));

}

Also when I am debugging it

Invalid global read of size 4
========= at 0x00000148 in /home/sharmpra/mergesort.cu:150:gpu_mergesort(int*, int*, int, int, int, dim3*, dim3*)
========= by thread (39,0,0) in block (9,0,0)
========= Address 0x934bc0000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./a.out [0x1e831]
========= Host Frame:./a.out [0x3c3d3]
========= Host Frame:./a.out [0x38a8]
========= Host Frame:./a.out [0x37b1]
========= Host Frame:./a.out [0x3810]
========= Host Frame:./a.out [0x33d1]
========= Host Frame:./a.out [0x35ae]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
========= Host Frame:./a.out [0x2bc9]

========= Invalid global read of size 4
========= at 0x00000148 in /home/sharmpra/mergesort.cu:150:gpu_mergesort(int*, int*, int, int, int, dim3*, dim3*)
========= by thread (38,0,0) in block (9,0,0)
========= Address 0x9347c0000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./a.out [0x1e831]
========= Host Frame:./a.out [0x3c3d3]
========= Host Frame:./a.out [0x38a8]
========= Host Frame:./a.out [0x37b1]
========= Host Frame:./a.out [0x3810]
========= Host Frame:./a.out [0x33d1]
========= Host Frame:./a.out [0x35ae]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
========= Host Frame:./a.out [0x2bc9]
unspecified launch failure in mergesort.cu at line 215
========= Program hit cudaErrorLaunchFailure (error 4) due to “unspecified launch failure” on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib64/libcuda.so.1 [0x2ef503]
========= Host Frame:./a.out [0x3e8df]
========= Host Frame:./a.out [0x3434]
========= Host Frame:./a.out [0x35ae]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
========= Host Frame:./a.out [0x2bc9]

Line 150 in the program is :- for (int k = start; k < end; k++) from gpu_bottomUpMerge function.
Line 215 in the program is :- HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost)); from mergesort function.

I would really appreciate if any of you could guide me as I am new with cuda.
Thanks in advance.

Thanks, all I managed to figure out the solution.