Hello,
I am working on Merge Sort program for my university project and as soon as I go beyond block 10 and threads 204 in the program it’s giving above error, also when I am running this program on different GPU it’s working fine.
device void gpu_bottomUpMerge(int source, int dest, int start, int middle, int end) {
int i = start;
int j = middle;
for (int k = start; k < end; k++) {
if (i < middle && (j >= end || source[i] < source[j])) {
dest[k] = source[i];
i++;
} else {
dest[k] = source[j];
j++;
}
}
}
global void gpu_mergesort(int source, int dest, int size, int width, int slices, dim3* threads, dim3* blocks) {
int idx = blockDim .x * blockIdx .x + threadIdx .x;
int start = width*idx*slices,
middle,
end;
for (int slice = 0; slice < slices; slice++) {
if (start >= size)
break;
middle = min(start + (width >> 1), size);
end = min(start + width, size);
gpu_bottomUpMerge(source, dest, start, middle, end);
start += width;
}
}
void mergesort(int data, int size, dim3 threadsPerBlock, dim3 blocksPerGrid) {
// Allocate two arrays on the GPU we switch back and forth between them during the sort
int* D_data;
int* D_swp;
dim3* D_threads;
dim3* D_blocks;
// Actually allocate the two arrays
HANDLE_ERROR(cudaMalloc((void**) &D_data, size * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**) &D_swp, size * sizeof(int)));
// Copy from our input list into the first array
HANDLE_ERROR(cudaMemcpy(D_data, data, size * sizeof(int), cudaMemcpyHostToDevice));
int* A = D_data;
int* B = D_swp;
int nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z * blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;
// Divide the list and give pieces of it to each thread, letting the pieces grow bigger and bigger until the whole list is sorted
for (int width = 2; width < (size << 1); width <<= 1) {
int slices = size / ((nThreads) * width) + 1;
// Actually call the kernel
gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);
cudaDeviceSynchronize();
// Switch the input / output arrays instead of copying them around
A = A == D_data ? D_swp : D_data;
B = B == D_data ? D_swp : D_data;
}
// Get the list back from the GPU
HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost));
// Free the GPU memory
HANDLE_ERROR(cudaFree(A));
HANDLE_ERROR(cudaFree(B));
}
Also when I am debugging it
Invalid global read of size 4
========= at 0x00000148 in /home/sharmpra/mergesort.cu:150:gpu_mergesort(int*, int*, int, int, int, dim3*, dim3*)
========= by thread (39,0,0) in block (9,0,0)
========= Address 0x934bc0000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./a.out [0x1e831]
========= Host Frame:./a.out [0x3c3d3]
========= Host Frame:./a.out [0x38a8]
========= Host Frame:./a.out [0x37b1]
========= Host Frame:./a.out [0x3810]
========= Host Frame:./a.out [0x33d1]
========= Host Frame:./a.out [0x35ae]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
========= Host Frame:./a.out [0x2bc9]
========= Invalid global read of size 4
========= at 0x00000148 in /home/sharmpra/mergesort.cu:150:gpu_mergesort(int*, int*, int, int, int, dim3*, dim3*)
========= by thread (38,0,0) in block (9,0,0)
========= Address 0x9347c0000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./a.out [0x1e831]
========= Host Frame:./a.out [0x3c3d3]
========= Host Frame:./a.out [0x38a8]
========= Host Frame:./a.out [0x37b1]
========= Host Frame:./a.out [0x3810]
========= Host Frame:./a.out [0x33d1]
========= Host Frame:./a.out [0x35ae]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
========= Host Frame:./a.out [0x2bc9]
unspecified launch failure in mergesort.cu at line 215
========= Program hit cudaErrorLaunchFailure (error 4) due to “unspecified launch failure” on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib64/libcuda.so.1 [0x2ef503]
========= Host Frame:./a.out [0x3e8df]
========= Host Frame:./a.out [0x3434]
========= Host Frame:./a.out [0x35ae]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
========= Host Frame:./a.out [0x2bc9]
Line 150 in the program is :- for (int k = start; k < end; k++) from gpu_bottomUpMerge function.
Line 215 in the program is :- HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost)); from mergesort function.
I would really appreciate if any of you could guide me as I am new with cuda.
Thanks in advance.