ULF on a simple example CUDA program

Hallo everyone,

while reading the programming manual I’ve tried to implement a simple parallel vector adding programm: it takes 3 arrays A,B,C with 5 elements each, copies the stuff into global device memory, executes a global kernel on 1 block with 5 threads, this copies the stuff into shared memory and uses 5 threads to add the 5 elements (A[tx]+B[tx]=C[tx]), then the resulting C array is copied back into global memory, the kernel finishes and C is copied into the final host array.

Or so it should happen, but it doesn’t - an “unspecified launch failure” at line 82 (the line where the result matrix is copied from device to host memory).

I’ve tried deviceemulation to check whether the threads go out of array bounds but they don’t.

Valgrind shows a memory misread by the GNU linker.

Any idea what might be going wrong?

Here is the code (I’ve included one call from the cutil at the beginning)

[codebox]#include <stdio.h>

#define ARRSIZE 5

#define CUDA_SAFE_CALL_NO_SYNC( call) do { \

cudaError err = call; \

if( cudaSuccess != err) { \

  fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \

          __FILE__, __LINE__, cudaGetErrorString( err) );              \

  exit(EXIT_FAILURE);                                                  \

} } while (0)

global void vecAdd(float *A,float *B,float *C)


int tx=threadIdx.x;

// Declaration of the shared memory array A,B,C

__shared__ float As[ARRSIZE];

__shared__ float Bs[ARRSIZE];

__shared__ float Cs[ARRSIZE];

// Load the arrays from device memory to shared mem,

// each thread loads one element




// Synchronize to make sure the matrices are loaded


//compute teh stuff threadwise


// And sync…


//load C into device mem (each thread one elem)


//printf("I'm thread nr. %d and I'm setting C[%d]=%d\n",threadIdx.x,tx,C[tx]);


void vecAddser(float *A, float *B, float *C, float n)


for (int i=0;i<n;i++) C[i]=A[i]+B[i];


int main()


float A={1,2,3,4,5};

float B={6,7,8,9,0};

float C={0,0,0,0,0};

float reference={0,0,0,0,0};

float size = ARRSIZE * sizeof(int);

int* Aondev;

int* Bondev;

int* Condev;

//copy the arrays to global mem

CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&Aondev, size));

CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(Aondev, A, size, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&Bondev, size));

CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(Bondev, B, size, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&Condev, size));

CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(Condev, C, size, cudaMemcpyHostToDevice));

//Kernel invocation which says: exec vecAdd on a Grid containing one Block which contains 5 Threads

vecAdd<<<1, ARRSIZE>>>(A,B,C);

vecAddser(A,B,reference,ARRSIZE); //serial stuff for reference

// Read C from the device

CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(C, Condev, size, cudaMemcpyDeviceToHost)); //line 82

// Free device memory




for (int i=0;i<5;i++) printf(“C[%d]=%d\t”,i,C[i]);


for (int i=0;i<5;i++) printf(“R[%d]=%d\t”,i,reference[i]);



The output:

[codebox]Cuda error in file ‘vectorf.cu’ in line 82 : unspecified launch failure.[/codebox]

Here is the valgrind output:

[codebox]==19956== Memcheck, a memory error detector.

==19956== Copyright © 2002-2007, and GNU GPL’d, by Julian Seward et al.

==19956== Using LibVEX rev 1804, a library for dynamic binary translation.

==19956== Copyright © 2004-2007, and GNU GPL’d, by OpenWorks LLP.

==19956== Using valgrind-3.3.0-Debian, a dynamic binary instrumentation framework.

==19956== Copyright © 2000-2007, and GNU GPL’d, by Julian Seward et al.

==19956== For more details, rerun with: -v


==19956== Invalid read of size 4

==19956== at 0x40151E3: (within /lib/ld-2.7.so)

==19956== by 0x4005C59: (within /lib/ld-2.7.so)

==19956== by 0x4007A87: (within /lib/ld-2.7.so)

==19956== by 0x400BC06: (within /lib/ld-2.7.so)

==19956== by 0x400D5C5: (within /lib/ld-2.7.so)

==19956== by 0x400BDE9: (within /lib/ld-2.7.so)

==19956== by 0x4011593: (within /lib/ld-2.7.so)

==19956== by 0x400D5C5: (within /lib/ld-2.7.so)

==19956== by 0x4010F4D: (within /lib/ld-2.7.so)

==19956== by 0x42F1C18: (within /lib/tls/i686/cmov/libdl-2.7.so)

==19956== by 0x400D5C5: (within /lib/ld-2.7.so)

==19956== by 0x42F22BB: (within /lib/tls/i686/cmov/libdl-2.7.so)

==19956== Address 0x43186a8 is 16 bytes inside a block of size 19 alloc’d

==19956== at 0x4022AB8: malloc (vg_replace_malloc.c:207)

==19956== by 0x4008021: (within /lib/ld-2.7.so)

==19956== by 0x400BC06: (within /lib/ld-2.7.so)

==19956== by 0x400D5C5: (within /lib/ld-2.7.so)

==19956== by 0x400BDE9: (within /lib/ld-2.7.so)

==19956== by 0x4011593: (within /lib/ld-2.7.so)

==19956== by 0x400D5C5: (within /lib/ld-2.7.so)

==19956== by 0x4010F4D: (within /lib/ld-2.7.so)

==19956== by 0x42F1C18: (within /lib/tls/i686/cmov/libdl-2.7.so)

==19956== by 0x400D5C5: (within /lib/ld-2.7.so)

==19956== by 0x42F22BB: (within /lib/tls/i686/cmov/libdl-2.7.so)

==19956== by 0x42F1B50: dlopen (in /lib/tls/i686/cmov/libdl-2.7.so)

Cuda error in file ‘vectorf.cu’ in line 82 : unspecified launch failure.


==19956== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 35 from 1)

==19956== malloc/free: in use at exit: 262,305 bytes in 187 blocks.

==19956== malloc/free: 1,627 allocs, 1,440 frees, 7,984,379 bytes allocated.

==19956== For counts of detected errors, rerun with: -v

==19956== searching for pointers to 187 not-freed blocks.

==19956== checked 516,528 bytes.


==19956== LEAK SUMMARY:

==19956== definitely lost: 0 bytes in 0 blocks.

==19956== possibly lost: 0 bytes in 0 blocks.

==19956== still reachable: 262,305 bytes in 187 blocks.

==19956== suppressed: 0 bytes in 0 blocks.

==19956== Rerun with --leak-check=full to see details of leaked memory.[/codebox]

Umm, never mind. It turned out I’ve given the wrong arrays (host arrays instead of the device arrays) to the kernel.

On a different note: the valgrind “Invalid read” can be easily reproduced with the following code, which is rather scary - a simple cudaMalloc produces a memory misread?

[codebox]#define ARRSIZE 5

int main()


unsigned int arraysize = sizeof(float) * ARRSIZE;

float* Aondev;

cudaMalloc((void**)&Aondev, arraysize);



I put those into a supressions file. cudaMalloc is a pretty low level driver type routine and valgrind probably can’t detect everything it does. There are a lot of default supressions for false positive errors for things in glibc and other system libraries, too.