Hi, I just started programming with CUDA… It took one day to get everything working and to try a small test program I found on YouTube. Is it normal that compiling takes about 20 seconds for such simple code? It takes the same time in release mode…
#include <iostream>
#include "device_launch_parameters.h"
#include <cuda_runtime_api.h>
#include <stdlib.h>
#include <ctime>
using namespace std;
/**
 * Element-wise integer vector addition: c[id] = a[id] + b[id].
 *
 * Each thread handles at most one element, selected by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is
 * >= count do nothing, so the launch may supply more threads than elements;
 * it must supply at least `count` threads for every element to be written.
 *
 * @param a     first input array, readable from device code, >= count elements
 * @param b     second input array, readable from device code, >= count elements
 * @param c     output array, writable from device code, >= count elements
 * @param count number of elements to process
 */
__global__ void add(int* a, int* b, int* c, int count)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: the last block usually has threads past the end.
    if (id < count)
    {
        c[id] = a[id] + b[id];
    }
}
/**
 * Releases every buffer owned by main() and passes the exit code through.
 *
 * Device pointers that were never allocated must be NULL; cudaFree treats a
 * null pointer as a no-op, so this is safe to call from any failure point.
 */
static int releaseAll(int* h_a, int* h_b, int* h_c,
                      int* d_a, int* d_b, int* d_c, int exitCode)
{
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    return exitCode;
}

/**
 * Adds two random integer vectors on the GPU and prints the first results.
 *
 * Steps: fill two host arrays with values in [0, 999], copy them to the
 * device, launch `add`, copy the sums back and print up to 100 of them.
 *
 * @return EXIT_SUCCESS on success, EXIT_FAILURE if any CUDA call fails
 *         (after printing a short message and releasing all buffers).
 */
int main()
{
    int count = 1000;
    const size_t bytes = sizeof(int) * count;

    int* h_a = new int[count];
    int* h_b = new int[count];
    int* h_c = new int[count];

    for (int i = 0; i < count; i++)
    {
        h_a[i] = rand() % 1000;
        h_b[i] = rand() % 1000;
        h_c[i] = 0;
    }

    // Start as NULL so releaseAll() is safe even if an allocation fails midway.
    int* d_a = NULL;
    int* d_b = NULL;
    int* d_c = NULL;

    if (cudaMalloc((void**)&d_a, bytes) != cudaSuccess ||
        cudaMalloc((void**)&d_b, bytes) != cudaSuccess ||
        cudaMalloc((void**)&d_c, bytes) != cudaSuccess)
    {
        cout << "false alloc" << endl;
        return releaseAll(h_a, h_b, h_c, d_a, d_b, d_c, EXIT_FAILURE);
    }

    if (cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice) != cudaSuccess ||
        cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice) != cudaSuccess)
    {
        cout << "false cpy" << endl;
        return releaseAll(h_a, h_b, h_c, d_a, d_b, d_c, EXIT_FAILURE);
    }

    // Ceil-division: just enough blocks to cover `count` elements.
    const int threadsPerBlock = 256;
    const int blocks = (count + threadsPerBlock - 1) / threadsPerBlock;
    add<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c, count);

    // A kernel launch returns nothing; a bad launch is only visible here.
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        cout << "false launch: " << cudaGetErrorString(launchStatus) << endl;
        return releaseAll(h_a, h_b, h_c, d_a, d_b, d_c, EXIT_FAILURE);
    }

    // Blocking copy: waits for the kernel and reports errors raised while it ran.
    if (cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
    {
        cout << "false cpy back" << endl;
        return releaseAll(h_a, h_b, h_c, d_a, d_b, d_c, EXIT_FAILURE);
    }

    // Print at most 100 results, never reading past the end of the arrays.
    const int shown = (count < 100) ? count : 100;
    for (int i = 0; i < shown; i++)
    {
        cout << h_a[i] << "+" << h_b[i] << "=" << h_c[i] << endl;
    }

    return releaseAll(h_a, h_b, h_c, d_a, d_b, d_c, EXIT_SUCCESS);
}
Or do I have to change some options in Visual Studio 2013 Community? What can I do? Please keep the explanations simple; I'm just starting with CUDA.
The build output looks like this:
1> Compiling CUDA source file main.cu...
1> d:\dokumente\visual studio 2013\Projects\Cudatest3\Cudatest3>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\bin\nvcc.exe" -gencode=arch=compute_20,code=\"sm_20,compute_20\" --use-local-env --cl-version 2013 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_LIB -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o Debug\main.cu.obj "d:\dokumente\visual studio 2013\Projects\Cudatest3\Cudatest3\main.cu"
1> main.cu
1> Cudatest3.vcxproj -> d:\dokumente\visual studio 2013\Projects\Cudatest3\Debug\Cudatest3.exe
The long second line (the nvcc command) takes most of the time…
(I'm using an i7-2600K, 16 GB RAM, and a GTX 970.)
Thanks in advance =)
PS: By the way, why do I have to use
#include "device_launch_parameters.h"
#include <cuda_runtime_api.h>
instead of
#include <cuda.h>
as I saw in many tutorials?