Problem compiling my own CUDA C code

Hello everyone,

I am trying to compile my own program written in CUDA C.

My machine runs Windows 7 x64 with a GTX 260.

I therefore installed the CUDA SDK, the latest driver, the CUDA toolkit, and Visual Studio 2010:

driver: 296.10-desktop-win7-winvista-64bit-international-whql

sdk: gpucomputingsdk_4.1.28_win_64

toolkit: cudatoolkit_4.1.28_win_64

Ok, so far so good …

After installing everything I ran the precompiled examples from the SDK; they run perfectly.

Then I compiled the “bandwidthTest” example from the CUDA SDK, which also worked. I found the new binary under …/bin/Win64/Debug and could run it as well.

Then I tried to compile my own code, so I took the existing bandwidthTest project and copied my code into the file bandwidthTest.cu.

When I compile the code I get the following error:

The code I tried to compile:

#include <cuda.h>
#include <cstdlib>   // for EXIT_SUCCESS

const int N = 1024;
const int blocksize = 16;

// Element-wise addition of two N x N matrices: c = a + b
__global__ void add_matrix(float *a, float *b, float *c, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int index = i + j * N;
    if (i < N && j < N)
        c[index] = a[index] + b[index];
}

int main()
{
    float *a = new float[N*N];
    float *b = new float[N*N];
    float *c = new float[N*N];
    for (int i = 0; i < N*N; ++i) {
        a[i] = 1.0f; b[i] = 3.5f;
    }

    float *ad, *bd, *cd;
    const int size = N*N*sizeof(float);
    cudaMalloc((void**)&ad, size);
    cudaMalloc((void**)&bd, size);
    cudaMalloc((void**)&cd, size);
    cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(bd, b, size, cudaMemcpyHostToDevice);

    dim3 dimBlock(blocksize, blocksize);          // 16 x 16 = 256 threads per block
    dim3 dimGrid(N / dimBlock.x, N / dimBlock.y); // 64 x 64 blocks
    add_matrix<<<dimGrid, dimBlock>>>(ad, bd, cd, N);

    cudaMemcpy(c, cd, size, cudaMemcpyDeviceToHost);

    cudaFree(ad); cudaFree(bd); cudaFree(cd);
    delete[] a; delete[] b; delete[] c;
    return EXIT_SUCCESS;
}
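As an aside, the listing has no error checking at all yet; once it compiles I would wrap the runtime calls roughly like the sketch below (checkCuda is just my own helper, not something from the SDK), so that failures from cudaMalloc/cudaMemcpy or the kernel launch become visible:

#include <cstdio>    // fprintf
#include <cstdlib>   // exit, EXIT_FAILURE

// Hypothetical helper: abort with a message on any CUDA runtime error
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Usage, e.g.:
//   checkCuda(cudaMalloc((void**)&ad, size), "cudaMalloc ad");
//   add_matrix<<<dimGrid, dimBlock>>>(ad, bd, cd, N);
//   checkCuda(cudaGetLastError(), "add_matrix launch");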

Does anybody have an idea what I am doing wrong or how I can compile this simple code?

http://stackoverflow.com/questions/7233291/visual-studio-express-simple-build-configuration-error

OK, got that running now, thanks ;)

I now want to measure the execution time of this snippet as follows:

/*
 * Copyright 1993-2011 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */

/*
 * This is a simple test program to measure the memcopy bandwidth of the GPU.
 * It can measure device to device copy bandwidth, host to device copy bandwidth
 * for pageable and pinned memory, and device to host copy bandwidth for pageable
 * and pinned memory.
 *
 * Usage:
 * ./bandwidthTest [option]...
 */

// includes
//#include <sdkHelper.h>  // helper for shared functions common to CUDA SDK samples
//#include <shrQATest.h>  // This is for automated testing output (--qatest)
//#include <shrUtils.h>
#include <memory>
#include <iostream>
#include <cassert>
#include <cstdlib>
#include <stdio.h>
#include <cuda.h>
#include <Winsock2.h>   // for struct timeval
#include <time.h>
#include <windows.h>

#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
  #define DELTA_EPOCH_IN_MICROSECS  11644473600000000Ui64
#else
  #define DELTA_EPOCH_IN_MICROSECS  11644473600000000ULL
#endif

struct timezone
{
  int  tz_minuteswest; /* minutes W of Greenwich */
  int  tz_dsttime;     /* type of dst correction */
};

// gettimeofday() replacement for Windows, based on GetSystemTimeAsFileTime()
int gettimeofday(struct timeval *tv, struct timezone *tz)
{
  // Define a structure to receive the current Windows filetime
  FILETIME ft;
  // Initialize the present time to 0 and the timezone to UTC
  unsigned __int64 tmpres = 0;
  static int tzflag = 0;

  if (NULL != tv)
  {
    GetSystemTimeAsFileTime(&ft);
    // GetSystemTimeAsFileTime returns the number of 100-nanosecond
    // intervals since Jan 1, 1601 in a structure. Copy the high bits to
    // the 64-bit tmpres, shift it left by 32, then OR in the low 32 bits.
    tmpres |= ft.dwHighDateTime;
    tmpres <<= 32;
    tmpres |= ft.dwLowDateTime;
    // Convert to microseconds by dividing by 10
    tmpres /= 10;
    // The Unix epoch starts on Jan 1 1970. Need to subtract the difference
    // in seconds from Jan 1 1601.
    tmpres -= DELTA_EPOCH_IN_MICROSECS;
    // Finally change microseconds to seconds and place in the seconds value.
    // The modulus picks up the microseconds.
    tv->tv_sec = (long)(tmpres / 1000000UL);
    tv->tv_usec = (long)(tmpres % 1000000UL);
  }

  if (NULL != tz)
  {
    if (!tzflag)
    {
      _tzset();
      tzflag++;
    }
    // Adjust for the timezone west of Greenwich
    tz->tz_minuteswest = _timezone / 60;
    tz->tz_dsttime = _daylight;
  }

  return 0;
}

const int N = 4096;
const int blocksize = 64;   // note: 64 x 64 = 4096 threads per block exceeds the per-block limit (512 on a GTX 260)

struct timeval start_time;
struct timeval end_time;

// Difference between two timevals in milliseconds
int compute_result(struct timeval *end_time, struct timeval *start_time)
{
    if (end_time->tv_sec == 0 && end_time->tv_usec == 0) {
        return 0;
    }
    return (end_time->tv_sec * 1000 + end_time->tv_usec / 1000) - (start_time->tv_sec * 1000 + start_time->tv_usec / 1000);
}

__global__ void add_matrix(float *a, float *b, float *c, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int index = i + j * N;
    if (i < N && j < N)
        c[index] = a[index] + b[index];
}

int main()
{
    float *a = new float[N*N];
    float *b = new float[N*N];
    float *c = new float[N*N];
    for (int i = 0; i < N*N; ++i) {
        a[i] = 1.0f; b[i] = 3.5f;
    }

    float *ad, *bd, *cd;
    const int size = N*N*sizeof(float);
    cudaMalloc((void**)&ad, size);
    cudaMalloc((void**)&bd, size);
    cudaMalloc((void**)&cd, size);
    cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(bd, b, size, cudaMemcpyHostToDevice);

    dim3 dimBlock(blocksize, blocksize);
    dim3 dimGrid(N / dimBlock.x, N / dimBlock.y);

    gettimeofday(&start_time, NULL);
    double t1 = start_time.tv_sec + (start_time.tv_usec / 1000000.0);
    printf("start: %f\n", t1);

    add_matrix<<<dimGrid, dimBlock>>>(ad, bd, cd, N);

    gettimeofday(&end_time, NULL);
    double t2 = end_time.tv_sec + (end_time.tv_usec / 1000000.0);
    printf("end: %f\n", t2);

    cudaMemcpy(c, cd, size, cudaMemcpyDeviceToHost);

    cudaFree(ad); cudaFree(bd); cudaFree(cd);
    delete[] a; delete[] b; delete[] c;

    //printf("Result: %d", compute_result(&end_time, &start_time));
    printf("Result: %f", t2 - t1);
    return EXIT_SUCCESS;
}

But the time difference is always 0 …

If I run the standard C version of this program I get a few milliseconds as a result …

I also found this approach for measuring time:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord( start, 0 );
// your code
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );

float elapsedTime;
cudaEventElapsedTime( &elapsedTime, start, stop );
printf( "Time to generate: %3.1f ms\n", elapsedTime );

But if I replace // your code with my kernel call, it is 0.00 ms as well …

CUDA launches are asynchronous.

If you want to do timing from the host side, you need to synchronize first, either with a cudaMemcpy or with explicit synchronization (cudaDeviceSynchronize).

You should modify your code to:

printf("start: %d\n", t1);

add_matrix<<<dimGrid, dimBlock>>>( ad, bd, cd, N );

cudaDeviceSynchronize();

gettimeofday(&end_time, NULL);
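Note that cudaDeviceSynchronize also returns the first error produced by the preceding asynchronous work, so it is worth checking its return value. A minimal sketch (the message text is just an example):

cudaError_t err = cudaDeviceSynchronize();   // blocks until the kernel is done
if (err != cudaSuccess) {
    // an invalid launch configuration, for example, shows up here
    printf("kernel failed: %s\n", cudaGetErrorString(err));
}
gettimeofday(&end_time, NULL);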

After you call a kernel, control is returned to the host immediately. You need to synchronize the host and the GPU. Try this:

float gputime;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

//....

cudaEventRecord(start, 0);
// ... gpu work
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gputime, start, stop);   // elapsed time in milliseconds

cudaEventDestroy(start);
cudaEventDestroy(stop);

printf(" \n");
printf("Time = %g \n", gputime / 1000.0f);     // printed in seconds

Ah, thanks mfatica :)
cudaDeviceSynchronize() works.

Still, it is better to use CUDA events to time the code. Refer to the CUDA C Best Practices Guide, Chapter 5, for more information on timing and performance metrics (it is only 5 pages).

cheers,

Are these methods still effective when MPI or OpenMP barriers are used, i.e. in cases where some communication between threads/processes is needed?