Hi,
I recently started working with CUDA and I found out the CUDA3.1 supports recursion so I installed that and wrote a recursive quicksort code which compiles and runs perfectly in emulation mode but gives me this error when I try to compile it in release mode:
nvcc error : ‘ptxas’ died due to signal 11 (Invalid memory reference)
make: *** [obj/x86_64/release/quicksort.cu.o] Error 11
If anyone has seen such an error and knows what could be causing it, any help will be great!
Here is my code:
[codebox]
/*
-
QuickSort.cu
-
######## QUICK SORT ########
*/
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
#define N 10
int* r_values;
int* d_values;
device inline void swap(int &a, int &b) {
int tmp = a;
a = b;
b = tmp;
}
device inline int part(int* values, int start, int end, int pivotIdx) {
int pivotVal = values[pivotIdx];
swap(values[end], values[pivotIdx]);
int storeIdx = start;
for (int i = start; i < end; ++i) {
if (values[i] < pivotVal) {
swap(values[i], values[storeIdx]);
storeIdx++;
}
}
swap(values[storeIdx], values[end]);
return storeIdx;
}
device inline void sort(int* values, int start, int end) {
if (end > start) {
int pivotIdx = start;
int pivotN_Idx = part(values, start, end, pivotIdx);
sort(values, start, pivotN_Idx-1);
sort(values, pivotN_Idx+1, end);
}
}
global static void qs(int* values, int start, int end) {
extern __shared__ int shared[];
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx < N) {
shared[idx] = values[idx];
__syncthreads();
sort(shared, start, end);
__syncthreads();
values[idx] = shared[idx];
}
}
int main(int argc, char **argv) {
srand(time(NULL));
unsigned int hTimer;
size_t size = N * sizeof(int);
// allocate host memory
r_values = (int*)malloc(size);
// initialize a random dataset
for (int i = 0; i < N; i++) {
r_values[i] = rand();
printf("%d ", r_values[i]);
}
printf("\n");
// allocate device memory & copy data to device
cutilSafeCall( cudaMalloc((void**)&d_values, size) );
cutilSafeCall( cudaMemcpy(d_values, r_values, size, cudaMemcpyHostToDevice) );
cutilCheckError( cutCreateTimer(&hTimer) );
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError( cutResetTimer(hTimer) );
cutilCheckError( cutStartTimer(hTimer) );
qs<<< 1, N, size*2 >>>(d_values, 0, N-1);
cutilCheckMsg("Kernel execution failed...");
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError( cutStopTimer(hTimer) );
double gpuTime = cutGetTimerValue(hTimer);
printf("SORTED...\n");
printf("\nDuration: %f ms\n", gpuTime);
// copy data back to host
cutilSafeCall( cudaMemcpy(r_values, d_values, size, cudaMemcpyDeviceToHost) );
// test print
for (int i = 0; i < N; i++) {
printf("%d ", r_values[i]);
}
printf("\n");
// free memory
cutilSafeCall( cudaFree(d_values) );
free(r_values);
cutilExit(argc, argv);
cudaThreadExit();
}
[/codebox]