Hello,
I’ve implemented a CUDA kernel that makes simple additions of arrays, but the content of the arrays is not modified. It’s not my first experience with CUDA, and I really don’t understand what’s wrong. I tried to execute the code on two different GPUs and the result (or more likely the absence of result) is the same. Here is my code :
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/time.h>
#include <string.h>
#include <semaphore.h>
#include <assert.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <algorithm>
#include <cuda.h>
/*
#define N 130000
#define NITER 100
#define THREAD_PER_BLOCK 512
*/
#define N 512
#define NITER 1
#define THREAD_PER_BLOCK 512
/*measure time*/
static struct timeval start_time;
__device__ __constant__ unsigned int limit_d;
void init_time() {
gettimeofday(&start_time, NULL);
}
long long get_time() {
struct timeval t;
gettimeofday(&t, NULL);
return (long long) (t.tv_sec - start_time.tv_sec) * 1000000 + (t.tv_usec - start_time.tv_usec);
}
__global__ void addition(double * arr1, double * arr2, double * arr3, unsigned int * limit){
unsigned int id = blockIdx.x * THREAD_PER_BLOCK + threadIdx.x;
arr1[id] = arr1[id] + arr2[id] + arr3[id];
arr2[id] = arr2[id] + 2.0 * arr3[id];
syncthreads();
}
int main(int argc, char *argv[])
{
int i, j;
unsigned int limit_h = N;
printf("Limite : %d \n", limit_h);
long long start, stop;
double t, sum, sol;
//srand(10);
init_time();
/*define initial state*/
double yvar1[N];
double yvar2[N];
double yvar3[N];
double yvar1ini[N];
double yvar2ini[N];
for(i = 0 ; i < N ; i++){
yvar1ini[i] = (double) rand() / (double) RAND_MAX;
yvar2ini[i] = (double) rand() / (double) RAND_MAX;
yvar3[i] = (double) rand() / (double) RAND_MAX;
}
fprintf(stdout,"1;2;3\n");
for(i=0;i<N;++i){
fprintf(stdout,"%f;%f;%f\n",yvar1ini[i],yvar2ini[i],yvar3[i]);
}
/*inplace sequential prediction +=*/
/*
memcpy(yvar1, yvar1ini, N * sizeof(double));
memcpy(yvar2, yvar2ini, N * sizeof(double));
start = get_time();
for (j=0; j < NITER; j++)
for(i = 0 ; i < N ; i++){
yvar1[i] += yvar2[i] + yvar3[i];
yvar2[i] += 2.0 * yvar3[i];
}
stop = get_time();
t= (double) (stop-start) / 1000.0;
printf("inplace sequential prediction += : %f ms\n",t/NITER);
*/
/*GPU parallalization*/
printf("Limite : %d \n", limit_h);
double * yvar1_d, *yvar2_d, *yvar3_d;
unsigned int * limite_d, test;
cudaMalloc((void**) &yvar1_d, sizeof(double) * limit_h);
cudaMalloc((void**) &yvar2_d, sizeof(double) * limit_h);
cudaMalloc((void**) &yvar3_d, sizeof(double) * limit_h);
cudaMalloc((void**) &limite_d, sizeof(unsigned int));
cudaMemcpyToSymbol("limit_d", &limit_h, sizeof(unsigned int));
cudaMemcpy(limite_d, &limit_h, sizeof(unsigned int), cudaMemcpyHostToDevice);
cudaMemcpy(&test, limite_d, sizeof(unsigned int), cudaMemcpyDeviceToHost);
printf("Limite GPU : %d\n", test);
cudaMemcpy(yvar1_d, yvar1ini, sizeof(double)*limit_h, cudaMemcpyHostToDevice);
cudaMemcpy(yvar2_d, yvar2ini, sizeof(double)*limit_h, cudaMemcpyHostToDevice);
cudaMemcpy(yvar3_d, yvar3, sizeof(double)*limit_h, cudaMemcpyHostToDevice);
for(j=0;j<NITER;++j){
unsigned int blocks = limit_h / THREAD_PER_BLOCK + (limit_h % THREAD_PER_BLOCK > 0);
printf("Blocks : %d, threads : %d\n", blocks, THREAD_PER_BLOCK);
addition<<<blocks, THREAD_PER_BLOCK>>>(yvar1_d, yvar2_d, yvar3_d, limite_d);
}
cudaMemcpy(yvar1, yvar1_d, sizeof(double)*limit_h, cudaMemcpyDeviceToHost);
cudaMemcpy(yvar2, yvar2_d, sizeof(double)*limit_h, cudaMemcpyDeviceToHost);
cudaMemcpy(yvar3, yvar3_d, sizeof(double)*limit_h, cudaMemcpyDeviceToHost);
cudaFree(yvar1_d);
cudaFree(yvar2_d);
cudaFree(yvar3_d);
cudaFree(limite_d);
cudaFree(&limit_d);
FILE * saisie;
saisie=fopen("prediction.csv","wt");
fprintf(stdout,"1;2;3\n");
for(i=0;i<N;++i){
fprintf(stdout,"%f;%f;%f\n",yvar1[i],yvar2[i],yvar3[i]);
}
fclose(saisie);
return 0;
}
The results are :
Limite : 10
1;2;3
0.840188;0.394383;0.783099
0.798440;0.911647;0.197551
0.335223;0.768230;0.277775
0.553970;0.477397;0.628871
0.364784;0.513401;0.952230
0.916195;0.635712;0.717297
0.141603;0.606969;0.016301
0.242887;0.137232;0.804177
0.156679;0.400944;0.129790
0.108809;0.998924;0.218257
Limite : 10
Limite GPU : 10
Blocks : 1, threads : 512
1;2;3
0.840188;0.394383;0.783099
0.798440;0.911647;0.197551
0.335223;0.768230;0.277775
0.553970;0.477397;0.628871
0.364784;0.513401;0.952230
0.916195;0.635712;0.717297
0.141603;0.606969;0.016301
0.242887;0.137232;0.804177
0.156679;0.400944;0.129790
0.108809;0.998924;0.218257
As you can see, the data transfers are well executed (the final array of the host is initialized but not filled before the cudaMemcpyDeviceToHost), and one block is launched but the arrays are not modified. Have someone got any idea of the problem ?
Thank you,
Best regards.