Hi !!
I am trying to use the profiler for the first time. I looked through the Visual Profiler Guide, but my problem isn’t mentioned there.
As this is my first use, I wanted to profile a simple example: the sum of two vectors. I wrote the code and got this message when I profiled it:
Error : Profiler data file ‘/home/sebastien/Dropbox/Doctorat/Etudiants/IPD/tp3 - gpu/Solutions/temp_compute_profiler_0_0.csv’ does not contain profiler output.This can happen when:
a) Profiling is disabled during the entire run of the application.
b) The application does not invoke any kernel launches or memory transfers.
c) The application does not release resources (contexts, events, etc.).
The program needs to be modified to properly free up all resources before termination.
Since I didn’t disable profiling, the application does invoke a kernel, and it does release its resources, I don’t know what to do to make it work.
When I look at the profiler’s program runs, I see that some of them failed (#1, 3, 5 and 6); maybe this is related.
Does anyone have an idea of what the problem is?
Thanks for your help,
Best regards
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <cuda.h>
#include <cuda_runtime.h>
/* Number of threads per block used by the host code when launching kernels. */
#define THREAD_PER_BLOCK 512

/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i < limit_d.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. The grid must
 * provide at least limit_d threads in total; surplus threads do nothing.
 *
 * a, b     input vectors (device pointers, at least limit_d elements each)
 * c        output vector (device pointer, at least limit_d elements)
 * limit_d  number of elements to process
 */
__global__ void addition(const int * a, const int * b, int * c, unsigned int limit_d)
{
    /* Use the real block size (blockDim.x) rather than the host-side macro so
     * the kernel stays correct for any launch configuration. The index is
     * computed in size_t so that blockIdx.x * blockDim.x cannot wrap around
     * for very large grids. */
    const size_t id = (size_t) blockIdx.x * blockDim.x + threadIdx.x;

    /* The last block may extend past the end of the vectors. */
    if (id < limit_d)
        c[id] = a[id] + b[id];
}
/*
 * Reports a failed CUDA runtime call and terminates the program.
 *
 * status  value returned by the CUDA runtime call
 * what    short description of the operation, used in the error message
 *
 * On failure the device is reset before exiting so that the CUDA context is
 * torn down explicitly.
 */
static void checkCuda(cudaError_t status, const char * what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two random integer vectors on the CPU and on the GPU, checks that both
 * results agree and prints the time taken by each version.
 *
 * Usage: <program> <vector size>
 *
 * The GPU time includes the host-to-device copies, the kernel and the
 * device-to-host copy. Returns EXIT_SUCCESS when both results match,
 * EXIT_FAILURE on bad arguments, allocation failure, CUDA error or mismatch.
 */
int main(int argc, char * argv[]){
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vector size>\n", argc > 0 ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    /* Parse the size strictly: reject garbage, zero (a launch with 0 blocks
     * is an invalid configuration) and values that do not fit the kernel's
     * unsigned int limit. */
    char * end = NULL;
    const unsigned long parsed = strtoul(argv[1], &end, 10);
    const unsigned int size = (unsigned int) parsed;
    if (end == argv[1] || *end != '\0' || parsed == 0 || size != parsed) {
        fprintf(stderr, "Invalid vector size: '%s'\n", argv[1]);
        return EXIT_FAILURE;
    }
    const size_t bytes = (size_t) size * sizeof(int);

    /* Host buffers: the two inputs, the CPU result and the GPU result. */
    int * a_h  = (int *) malloc(bytes);
    int * b_h  = (int *) malloc(bytes);
    int * c1_h = (int *) malloc(bytes);
    int * c2_h = (int *) malloc(bytes);
    if (a_h == NULL || b_h == NULL || c1_h == NULL || c2_h == NULL) {
        fprintf(stderr, "Host allocation of %u elements failed\n", size);
        free(a_h);
        free(b_h);
        free(c1_h);
        free(c2_h);
        return EXIT_FAILURE;
    }

    /* Device buffers. The runtime API initialises itself on first use, so no
     * driver-API cuInit() call is needed. */
    int * a_d = NULL, * b_d = NULL, * c_d = NULL;
    checkCuda(cudaSetDevice(0), "cudaSetDevice");
    checkCuda(cudaMalloc((void**) &a_d, bytes), "cudaMalloc(a_d)");
    checkCuda(cudaMalloc((void**) &b_d, bytes), "cudaMalloc(b_d)");
    checkCuda(cudaMalloc((void**) &c_d, bytes), "cudaMalloc(c_d)");

    /* Fixed seed so that every run works on the same data. */
    unsigned int i;
    srand(1234);
    for(i=0;i<size;++i){
        a_h[i] = rand()%size;
        b_h[i] = rand()%size;
    }

    struct timeval tv1,tv2;
    unsigned long long diffH = 0, diffD = 0;

    /* CPU reference computation. */
    gettimeofday(&tv1, NULL);
    for(i = 0; i<size; ++i)
        c1_h[i] = a_h[i] + b_h[i];
    gettimeofday(&tv2, NULL);
    diffH+=(tv2.tv_sec-tv1.tv_sec) * 1000000L + (tv2.tv_usec-tv1.tv_usec);

    /* GPU computation, transfers included in the measured time. */
    gettimeofday(&tv1, NULL);
    checkCuda(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice), "copy of a to the device");
    checkCuda(cudaMemcpy(b_d, b_h, bytes, cudaMemcpyHostToDevice), "copy of b to the device");
    /* Ceiling division written so that it cannot overflow for large sizes. */
    const unsigned int blocks = size / THREAD_PER_BLOCK + (size % THREAD_PER_BLOCK != 0);
    addition<<<blocks,THREAD_PER_BLOCK>>>(a_d,b_d,c_d,size);
    /* A launch does not return a status: configuration errors are only
     * visible through cudaGetLastError(). */
    checkCuda(cudaGetLastError(), "kernel launch");
    /* The blocking copy waits for the kernel and reports its execution errors. */
    checkCuda(cudaMemcpy(c2_h, c_d, bytes, cudaMemcpyDeviceToHost), "copy of c to the host");
    gettimeofday(&tv2, NULL);
    diffD+=(tv2.tv_sec-tv1.tv_sec) * 1000000L + (tv2.tv_usec-tv1.tv_usec);

    /* Compare both results and report every mismatching element. */
    int ok = 1;
    for(i=0;i<size;++i)
        if(c1_h[i]!=c2_h[i]){
            ok = 0;
            printf("Différence : %d != %d (= %d + %d)\n", c1_h[i], c2_h[i], a_h[i], b_h[i]);
        }
    if(ok)
        printf("Temps de calcul, CPU [%llu usec] GPU [%llu usec] \n", diffH, diffD);

    checkCuda(cudaFree(a_d), "cudaFree(a_d)");
    checkCuda(cudaFree(b_d), "cudaFree(b_d)");
    checkCuda(cudaFree(c_d), "cudaFree(c_d)");

    free(a_h);
    free(b_h);
    free(c1_h);
    free(c2_h);

    /* Explicitly destroy the CUDA context before exiting. Profiling tools
     * rely on this to flush their data; without it the profiler may report
     * that the application did not release its resources. */
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}