Quick Sort with CUDA 3.1 nvcc error : 'ptxas' died due to signal 11 (Invalid memory ref

LFC_2010 · July 1, 2010, 1:53pm

Hi,

I recently started working with CUDA and I found out the CUDA3.1 supports recursion so I installed that and wrote a recursive quicksort code which compiles and runs perfectly in emulation mode but gives me this error when I try to compile it in release mode:

nvcc error : ‘ptxas’ died due to signal 11 (Invalid memory reference)

make: *** [obj/x86_64/release/quicksort.cu.o] Error 11

If anyone has seen such an error and knows what could be causing it, any help will be great!

Here is my code:

[codebox]

/*

QuickSort.cu
######## QUICK SORT ########

*/

#include <time.h>

#include <stdio.h>

#include <stdlib.h>

#include <cutil_inline.h>

#define N 10

int* r_values;

int* d_values;

device inline void swap(int &a, int &b) {

 int tmp = a;

 a = b;

 b = tmp;

}

device inline int part(int* values, int start, int end, int pivotIdx) {

 int pivotVal = values[pivotIdx];

 swap(values[end], values[pivotIdx]);

 int storeIdx = start;

 for (int i = start; i < end; ++i) {

     if (values[i] < pivotVal) {

         swap(values[i], values[storeIdx]);

         storeIdx++;

     }

 }

 swap(values[storeIdx], values[end]);

 return storeIdx;

}

device inline void sort(int* values, int start, int end) {

     if (end > start) {

     	int pivotIdx = start;

        int pivotN_Idx = part(values, start, end, pivotIdx);

        sort(values, start, pivotN_Idx-1);

        sort(values, pivotN_Idx+1, end);

}

global static void qs(int* values, int start, int end) {

 extern __shared__ int shared[];

const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;

 if (idx < N) {

     shared[idx] = values[idx];

     __syncthreads();

     sort(shared, start, end);

     __syncthreads();

     values[idx] = shared[idx];

 }

}

int main(int argc, char **argv) {

srand(time(NULL));

unsigned int hTimer;

size_t size = N * sizeof(int);



// allocate host memory

r_values = (int*)malloc(size);



// initialize a random dataset

for (int i = 0; i < N; i++) {

	r_values[i] = rand();

	printf("%d ", r_values[i]);

}

printf("\n");



// allocate device memory & copy data to device	

cutilSafeCall( cudaMalloc((void**)&d_values, size) );

cutilSafeCall( cudaMemcpy(d_values, r_values, size, cudaMemcpyHostToDevice) );	



	cutilCheckError( cutCreateTimer(&hTimer) );

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutResetTimer(hTimer) );

cutilCheckError( cutStartTimer(hTimer) );

qs<<< 1, N, size*2 >>>(d_values, 0, N-1);

cutilCheckMsg("Kernel execution failed...");

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutStopTimer(hTimer) );

double gpuTime = cutGetTimerValue(hTimer);

printf("SORTED...\n");

printf("\nDuration: %f ms\n", gpuTime);



// copy data back to host

cutilSafeCall( cudaMemcpy(r_values, d_values, size, cudaMemcpyDeviceToHost) );



// test print

for (int i = 0; i < N; i++) {

	printf("%d ", r_values[i]);

}

printf("\n");



// free memory

cutilSafeCall( cudaFree(d_values) );

free(r_values);



cutilExit(argc, argv);

cudaThreadExit();

}

[/codebox]

njuffa · July 1, 2010, 11:28pm

Sorry, I am unable to reproduce this behavior. This program compiled and ran without a hitch on my RHEL 5.3 Linux64 box with CUDA 3.1. Note that for my convenience, I removed all instances of cutil, but this does not affect the compilation of device code which is what is failing in your case. I compiled as follows:

nvcc -Xopencc -Wall -arch=sm_20 -o quicksort quicksort.cu

When I ran the program, it produced this output:

476255832 109670421 307584693 354572635 1445835247 257020044 262013741 484395766 1645755123 957813597
SORTED…
109670421 257020044 262013741 307584693 354572635 476255832 484395766 957813597 1445835247 1645755123

Could you note the exact commandline you used to invoke nvcc, and give more information on the platform you are on? It seems you are on a 64-bit platform of some sort (as I see x86_64 in your build log).

njuffa · July 1, 2010, 11:28pm

Sorry, I am unable to reproduce this behavior. This program compiled and ran without a hitch on my RHEL 5.3 Linux64 box with CUDA 3.1. Note that for my convenience, I removed all instances of cutil, but this does not affect the compilation of device code which is what is failing in your case. I compiled as follows:

nvcc -Xopencc -Wall -arch=sm_20 -o quicksort quicksort.cu

When I ran the program, it produced this output:

476255832 109670421 307584693 354572635 1445835247 257020044 262013741 484395766 1645755123 957813597
SORTED…
109670421 257020044 262013741 307584693 354572635 476255832 484395766 957813597 1445835247 1645755123

Could you note the exact commandline you used to invoke nvcc, and give more information on the platform you are on? It seems you are on a 64-bit platform of some sort (as I see x86_64 in your build log).

LFC_2010 · July 6, 2010, 1:56pm

Sorry, I am unable to reproduce this behavior. This program compiled and ran without a hitch on my RHEL 5.3 Linux64 box with CUDA 3.1. Note that for my convenience, I removed all instances of cutil, but this does not affect the compilation of device code which is what is failing in your case. I compiled as follows:

nvcc -Xopencc -Wall -arch=sm_20 -o quicksort quicksort.cu

When I ran the program, it produced this output:

476255832 109670421 307584693 354572635 1445835247 257020044 262013741 484395766 1645755123 957813597

SORTED…

109670421 257020044 262013741 307584693 354572635 476255832 484395766 957813597 1445835247 1645755123

Could you note the exact commandline you used to invoke nvcc, and give more information on the platform you are on? It seems you are on a 64-bit platform of some sort (as I see x86_64 in your build log).

I am new to CUDA and not too familiar with nvcc. I can’t figure out how to compile using nvcc. I just ran the makefile using the make command. I’m on Fedora 12.

Lev · July 6, 2010, 2:31pm

Strange code. Do you understand that all threads of a block will try to sort that one array?

LFC_2010 · July 7, 2010, 2:39pm

I’m sure my code isn’t the best code. It does work in emulation mode at least though. I am really new to CUDA. If you can improve my code and explain the changes you’ve made, that will definitely help me learn parallel sorting better.

Thanks

Lev · July 7, 2010, 6:45pm

This code is example of limitations of emulation mode.

LFC_2010 · July 7, 2010, 9:04pm

well, it also “compiled and ran without a hitch” on njuffa’s system. Can you help me improve my code? I’m just trying to learn.

laughingrice · July 7, 2010, 11:32pm

What card are you compiling this for?

Recursion only works on fermi (compute 2.0) as there are no function pointers for 1.3 devices and below. Could be that you are compiling for the default 1.1 or 1.0 devices.
Before fermi all device function calls are inlined so there is no recursion.

LFC_2010 · July 8, 2010, 2:51pm

I’m compiling for GeForce GTX 480.

LFC_2010 · July 8, 2010, 3:22pm

Sorry, I am unable to reproduce this behavior. This program compiled and ran without a hitch on my RHEL 5.3 Linux64 box with CUDA 3.1. Note that for my convenience, I removed all instances of cutil, but this does not affect the compilation of device code which is what is failing in your case. I compiled as follows:

nvcc -Xopencc -Wall -arch=sm_20 -o quicksort quicksort.cu

When I ran the program, it produced this output:

476255832 109670421 307584693 354572635 1445835247 257020044 262013741 484395766 1645755123 957813597

SORTED…

109670421 257020044 262013741 307584693 354572635 476255832 484395766 957813597 1445835247 1645755123

Could you note the exact commandline you used to invoke nvcc, and give more information on the platform you are on? It seems you are on a 64-bit platform of some sort (as I see x86_64 in your build log).

So I used the same commandline that you used to invoke nvcc and I got this:

In file included from …/…/src/quicksort/quicksort.cu:51:

/syssoft/cuda/bin/…/include/common_functions.h: In function `__cuda_clock’:

/syssoft/cuda/bin/…/include/common_functions.h:72: warning: implicit declaration of function `clock’

…/…/src/quicksort/quicksort.cu: At top level:

/syssoft/cuda/bin/…/include/common_functions.h:71: warning: `__cuda_clock’ defined but not used

/syssoft/cuda/bin/…/include/common_functions.h:76: warning: `__cuda_memset’ defined but not used

/syssoft/cuda/bin/…/include/common_functions.h:85: warning: `__cuda_memcpy’ defined but not used

nvcc error : ‘ptxas’ died due to signal 11 (Invalid memory reference)

Do you know what could be causing this?

arklogic · July 8, 2010, 3:32pm

I’m using a Tesla C1060 card. So I won’t be able to use recursion unless I get a Fermi card?

xmonraz · July 8, 2010, 5:01pm

I got the same error when I compiled against 3.0. The error went away when I compiled against 3.1. Make sure you have the right toolkit.

LFC_2010 · July 8, 2010, 8:08pm

I just reinstalled CUDA 3.1 and compiled again. But i get the same error External Media

Greg_Ross · July 14, 2010, 8:31am

Without a compute 2.0 device, recursion is not possible.

cudauser101 · November 16, 2010, 4:48am

This code doesn’t work. Does anyone have a working code for this?

Hi,

I recently started working with CUDA and I found out the CUDA3.1 supports recursion so I installed that and wrote a recursive quicksort code which compiles and runs perfectly in emulation mode but gives me this error when I try to compile it in release mode:

nvcc error : ‘ptxas’ died due to signal 11 (Invalid memory reference)

make: *** [obj/x86_64/release/quicksort.cu.o] Error 11

If anyone has seen such an error and knows what could be causing it, any help will be great!

Here is my code:

[codebox]

/*

QuickSort.cu

######## QUICK SORT ########

*/

include <time.h>

include <stdio.h>

include <stdlib.h>

include <cutil_inline.h>

define N 10

int* r_values;

int* d_values;

device inline void swap(int &a, int &b) {
 int tmp = a;

 a = b;

 b = tmp;
}

device inline int part(int* values, int start, int end, int pivotIdx) {
 int pivotVal = values[pivotIdx];

 swap(values[end], values[pivotIdx]);

 int storeIdx = start;

 for (int i = start; i < end; ++i) {

     if (values[i] < pivotVal) {

         swap(values[i], values[storeIdx]);

         storeIdx++;

     }

 }

 swap(values[storeIdx], values[end]);

 return storeIdx;
}

device inline void sort(int* values, int start, int end) {
     if (end > start) {

     	int pivotIdx = start;

        int pivotN_Idx = part(values, start, end, pivotIdx);

        sort(values, start, pivotN_Idx-1);

        sort(values, pivotN_Idx+1, end);
}

}

global static void qs(int* values, int start, int end) {
 extern __shared__ int shared[]; 
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
 if (idx < N) {

     shared[idx] = values[idx];

     __syncthreads();

     sort(shared, start, end);

     __syncthreads();

     values[idx] = shared[idx];

 }
}

int main(int argc, char **argv) {
srand(time(NULL));

unsigned int hTimer;

size_t size = N * sizeof(int);



// allocate host memory

r_values = (int*)malloc(size);



// initialize a random dataset

for (int i = 0; i < N; i++) {

	r_values[i] = rand();

	printf("%d ", r_values[i]);

}

printf("\n");



// allocate device memory & copy data to device	

cutilSafeCall( cudaMalloc((void**)&d_values, size) );

cutilSafeCall( cudaMemcpy(d_values, r_values, size, cudaMemcpyHostToDevice) );	



	cutilCheckError( cutCreateTimer(&hTimer) );

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutResetTimer(hTimer) );

cutilCheckError( cutStartTimer(hTimer) );

qs<<< 1, N, size*2 >>>(d_values, 0, N-1);

cutilCheckMsg("Kernel execution failed...");

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutStopTimer(hTimer) );

double gpuTime = cutGetTimerValue(hTimer);

printf("SORTED...\n");

printf("\nDuration: %f ms\n", gpuTime);



// copy data back to host

cutilSafeCall( cudaMemcpy(r_values, d_values, size, cudaMemcpyDeviceToHost) );



// test print

for (int i = 0; i < N; i++) {

	printf("%d ", r_values[i]);

}

printf("\n");



// free memory

cutilSafeCall( cudaFree(d_values) );

free(r_values);



cutilExit(argc, argv);

cudaThreadExit();
}

[/codebox]

cudauser101 · November 16, 2010, 4:48am

This code doesn’t work. Does anyone have a working code for this?

Hi,

I recently started working with CUDA and I found out the CUDA3.1 supports recursion so I installed that and wrote a recursive quicksort code which compiles and runs perfectly in emulation mode but gives me this error when I try to compile it in release mode:

nvcc error : ‘ptxas’ died due to signal 11 (Invalid memory reference)

make: *** [obj/x86_64/release/quicksort.cu.o] Error 11

If anyone has seen such an error and knows what could be causing it, any help will be great!

Here is my code:

[codebox]

/*

QuickSort.cu

######## QUICK SORT ########

*/

include <time.h>

include <stdio.h>

include <stdlib.h>

include <cutil_inline.h>

define N 10

int* r_values;

int* d_values;

device inline void swap(int &a, int &b) {
 int tmp = a;

 a = b;

 b = tmp;
}

device inline int part(int* values, int start, int end, int pivotIdx) {
 int pivotVal = values[pivotIdx];

 swap(values[end], values[pivotIdx]);

 int storeIdx = start;

 for (int i = start; i < end; ++i) {

     if (values[i] < pivotVal) {

         swap(values[i], values[storeIdx]);

         storeIdx++;

     }

 }

 swap(values[storeIdx], values[end]);

 return storeIdx;
}

device inline void sort(int* values, int start, int end) {
     if (end > start) {

     	int pivotIdx = start;

        int pivotN_Idx = part(values, start, end, pivotIdx);

        sort(values, start, pivotN_Idx-1);

        sort(values, pivotN_Idx+1, end);
}

}

global static void qs(int* values, int start, int end) {
 extern __shared__ int shared[]; 
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
 if (idx < N) {

     shared[idx] = values[idx];

     __syncthreads();

     sort(shared, start, end);

     __syncthreads();

     values[idx] = shared[idx];

 }
}

int main(int argc, char **argv) {
srand(time(NULL));

unsigned int hTimer;

size_t size = N * sizeof(int);



// allocate host memory

r_values = (int*)malloc(size);



// initialize a random dataset

for (int i = 0; i < N; i++) {

	r_values[i] = rand();

	printf("%d ", r_values[i]);

}

printf("\n");



// allocate device memory & copy data to device	

cutilSafeCall( cudaMalloc((void**)&d_values, size) );

cutilSafeCall( cudaMemcpy(d_values, r_values, size, cudaMemcpyHostToDevice) );	



	cutilCheckError( cutCreateTimer(&hTimer) );

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutResetTimer(hTimer) );

cutilCheckError( cutStartTimer(hTimer) );

qs<<< 1, N, size*2 >>>(d_values, 0, N-1);

cutilCheckMsg("Kernel execution failed...");

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutStopTimer(hTimer) );

double gpuTime = cutGetTimerValue(hTimer);

printf("SORTED...\n");

printf("\nDuration: %f ms\n", gpuTime);



// copy data back to host

cutilSafeCall( cudaMemcpy(r_values, d_values, size, cudaMemcpyDeviceToHost) );



// test print

for (int i = 0; i < N; i++) {

	printf("%d ", r_values[i]);

}

printf("\n");



// free memory

cutilSafeCall( cudaFree(d_values) );

free(r_values);



cutilExit(argc, argv);

cudaThreadExit();
}

[/codebox]

Lev · December 8, 2010, 11:22pm

The code is totaly wrong. Need to undersntad that gpu perfroms many threads in parallel. It is not just iterations of a cycle.