I'm new to CUDA programming and I want to test this technology on my new 8600M GT because I like parallel programming. I spent a few days reading the documentation, but I don't have much time to dedicate to it, and today I tried a program taken from this forum. I created a .cu file with this code:

#include <stdio.h>

#include <stdlib.h>

#include <cutil.h>

#include <cuda_runtime_api.h>

#include <device_launch_parameters.h>

#define PARALLEL	1

#define BLOCK_SIZE	16

#define GRID_SIZE	20

#define FLAG_SET	8

#define FLAG_UNSET	0

/*
 * work: busy-loop kernel that finally marks one cell of `table` per thread.
 *
 * Expected launch layout (fixed by the indexing below): a 2D grid of
 * GRID_SIZE x GRID_SIZE blocks, each BLOCK_SIZE x BLOCK_SIZE threads.
 * `table` is indexed as a row-major square of side BLOCK_SIZE * GRID_SIZE,
 * so it must hold at least (BLOCK_SIZE * GRID_SIZE)^2 bytes.
 *
 * NOTE(review): the values computed in the loops are never stored, so the
 * compiler is free to eliminate the "heavy job" entirely. If the loops are
 * not eliminated, twelve nested ~4.29e9-iteration loops will not finish in
 * any practical amount of time.
 */
__global__ void work(char* table)
{
	// Block index
	int bx = blockIdx.x;
	int by = blockIdx.y;

	// Thread index
	int tx = threadIdx.x;
	int ty = threadIdx.y;

	// do a heavy job
	const unsigned int times = 4294180900u;	// fits in 32-bit unsigned
	unsigned int i = 0;

	for (unsigned int k = 0; k < times; k++)
	for (unsigned int w = 0; w < times; w++)
	for (unsigned int e = 0; e < times; e++)
	for (unsigned int r = 0; r < times; r++)
	for (unsigned int t = 0; t < times; t++)
	for (unsigned int y = 0; y < times; y++)
	for (unsigned int u = 0; u < times; u++)
	for (unsigned int a = 0; a < times; a++)
	for (unsigned int s = 0; s < times; s++)
	for (unsigned int d = 0; d < times; d++)
	for (unsigned int z = 0; z < times; z++)
	for (i = 0; i < times; i++) {
		// float literals: avoid silent double arithmetic in device code
		float b = (3809.023f * 238.3901f) / 66.215f - 222.321f;
		float bb = 33.02f;
		bb += b * b / (tx + 1);
	}

	// i equals times once the innermost loop has run to completion
	if (i == times) {
		int width = BLOCK_SIZE * GRID_SIZE;
		// first cell of this block's tile in the row-major table
		int offset = width * BLOCK_SIZE * by + BLOCK_SIZE * bx;
		table[offset + width * ty + tx] = FLAG_SET;
	}
}


/*
 * runTest: host driver for the `work` kernel.
 *
 * Allocates a (BLOCK_SIZE * GRID_SIZE)^2-byte table on host and device,
 * clears it to FLAG_UNSET, launches `work` on a GRID_SIZE x GRID_SIZE grid of
 * BLOCK_SIZE x BLOCK_SIZE blocks, copies the table back, prints the elapsed
 * cutil timer value and reports how many cells were not set to FLAG_SET.
 *
 * argc/argv are accepted for interface compatibility and are not used.
 * Requires linking against the SDK's cutil library (cut*Timer functions).
 */
void runTest(int argc, char** argv)
{
	// sizeof yields size_t; cast so the %d conversions are well-defined
	printf("sizeof(char)=%d\n", (int) sizeof(char));
	printf("sizeof(short)=%d\n", (int) sizeof(short));
	printf("sizeof(int)=%d\n", (int) sizeof(int));
	printf("sizeof(long)=%d\n", (int) sizeof(long));

	// init a table to be filled
	unsigned int size = (BLOCK_SIZE * GRID_SIZE) * (BLOCK_SIZE * GRID_SIZE);
	unsigned int mem_size = sizeof(char) * size;
	printf("table size = %u\n", size);
	printf("mem_size = %u bytes, which is %u kb, or %u mb\n",
		mem_size, mem_size / 1024, mem_size / 1024 / 1024);

	char* h_table = (char*) malloc(mem_size);
	if (h_table == NULL) {
		printf("Memory allocation failed!\n");
		return;	// nothing else can be done without the host buffer
	}

	// unset table
	for (unsigned int i = 0; i < size; ++i)
		h_table[i] = FLAG_UNSET;

	// allocate device memory
	char* d_table;
	CUDA_SAFE_CALL(cudaMalloc((void**) &d_table, mem_size));

	// copy host memory to device
	CUDA_SAFE_CALL(cudaMemcpy(d_table, h_table, mem_size,
		cudaMemcpyHostToDevice));

	// create and start timer
	unsigned int timer = 0;
	CUT_SAFE_CALL(cutCreateTimer(&timer));
	CUT_SAFE_CALL(cutStartTimer(timer));

	// setup execution parameters
	dim3 block(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid(GRID_SIZE, GRID_SIZE);

	// execute the kernel
	printf("Ready to send job.\n");
	work<<< grid, block >>>(d_table);
	printf("Job sent!\n");

	// check if kernel execution generated an error
	CUT_CHECK_ERROR("Kernel execution failed");

	if (PARALLEL) {
		// no explicit sync: the blocking cudaMemcpy below waits for the kernel
		printf("Parallel time!\n");
	}
	else {
		printf("Before Thread Sync.\n");
		CUDA_SAFE_CALL(cudaThreadSynchronize());
		printf("After Thread Sync.\n");
	}

	// copy result from device to host
	CUDA_SAFE_CALL(cudaMemcpy(h_table, d_table, mem_size,
		cudaMemcpyDeviceToHost));
	printf("Data copied back!\n");

	// stop and destroy timer
	CUT_SAFE_CALL(cutStopTimer(timer));
	printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
	CUT_SAFE_CALL(cutDeleteTimer(timer));

	// check result
	unsigned int fail_count = 0;
	for (unsigned int i = 0; i < size; ++i) {
		if (h_table[i] != FLAG_SET)
			++fail_count;
	}

	if (fail_count == 0)
		printf("=========== Success! All data is set! =============\n");
	else
		printf("*********** Failed! %u unset! **************\n", fail_count);

	// clean up memory
	free(h_table);
	CUDA_SAFE_CALL(cudaFree(d_table));
}


/* Program entry point: runs the single kernel test and exits with status 0. */
int main(int argc, char** argv)
{
	runTest(argc, argv);
	return 0;
}


and I tried to compile it with:

/usr/local/cuda/bin/nvcc test1cpp.cu -o test1 -I /home/Amirecron/NVIDIA_CUDA_SDK/common/inc -L /usr/local/cuda/include/

Result was:

/tmp/tmpxft_000055cf_00000000-9.o: In function `runTest(int, char**)':

tmpxft_000055cf_00000000-8.i:(.text+0x6e46): undefined reference to `cutCreateTimer'

tmpxft_000055cf_00000000-8.i:(.text+0x6e51): undefined reference to `cutStartTimer'

tmpxft_000055cf_00000000-8.i:(.text+0x6f24): undefined reference to `cutStopTimer'

tmpxft_000055cf_00000000-8.i:(.text+0x6f2f): undefined reference to `cutGetTimerValue'

tmpxft_000055cf_00000000-8.i:(.text+0x6f4a): undefined reference to `cutDeleteTimer'

collect2: ld returned 1 exit status

Can anyone help me? I’m not a real programmer; I do it as a hobby and I don’t know much about C programming.

You are using some routines from the cutil library which is located in the CUDA SDK. You need to link against that library.


/usr/local/cuda/bin/nvcc test1cpp.cu -o main -L /usr/local/cuda/lib/ -L /home/Amirecron/NVIDIA_CUDA_SDK/lib/ -L /home/Amirecron/NVIDIA_CUDA_SDK/common/lib/ -I /home/Amirecron/NVIDIA_CUDA_SDK/common/inc/ -I /usr/local/cuda/include/ -I /home/Amirecron/NVIDIA_CUDA_SDK/common/obj/release/ -l cutil -l glut

I linked the cutil and glut libraries and nvcc now compiles fine. But when I start the program, the result is:


./main: error while loading shared libraries: libcudart.so: cannot open shared object file: No such file or directory

I don’t know how to fix it.

It seems like you did not add the required paths to PATH and LD_LIBRARY_PATH.
Put these two lines in your .bashrc (or the equivalent startup file for the shell you are using):

export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib:$LD_LIBRARY_PATH

This info is in the README btw, you should read the documentation.

Sorry mfatica, I did read the README file, but I export the CUDA paths manually and I must have forgotten.

Sorry and thx