The kernel isn't working

Hello,

I’ve implemented a CUDA kernel that makes simple additions of arrays, but the content of the arrays is not modified. It’s not my first experience with CUDA, and I really don’t understand what’s wrong. I tried to execute the code on two different GPUs and the result (or more likely the absence of result) is the same. Here is my code :

#include <stdio.h>

#include <stdlib.h>

#include <pthread.h>

#include <sys/time.h>

#include <string.h>

#include <semaphore.h>

#include <assert.h>

#include <string.h> 

#include <math.h> 

#include <float.h>

#include <algorithm>

#include <cuda.h>

/*

#define N 130000

#define NITER 100

#define THREAD_PER_BLOCK 512

*/

#define N 512

#define NITER 1

#define THREAD_PER_BLOCK 512

/*measure time*/

static struct timeval start_time;

__device__ __constant__ unsigned int limit_d;

void init_time() {

gettimeofday(&start_time, NULL);

}

long long get_time() {

struct timeval t;

gettimeofday(&t, NULL);

return (long long) (t.tv_sec - start_time.tv_sec) * 1000000 + (t.tv_usec - start_time.tv_usec);

}

__global__ void addition(double * arr1, double * arr2, double * arr3, unsigned int * limit){

	unsigned int id = blockIdx.x * THREAD_PER_BLOCK + threadIdx.x;

		arr1[id] = arr1[id] + arr2[id] + arr3[id];

		arr2[id] = arr2[id] + 2.0 * arr3[id];

	syncthreads();

}

int main(int argc, char *argv[])

{

	int i, j;

	unsigned int limit_h = N;

	printf("Limite : %d \n", limit_h);

	long long start, stop;

	double t, sum, sol;

	//srand(10);

	init_time();

	/*define initial state*/

	double yvar1[N];

	double yvar2[N];

        double yvar3[N];

	

	double yvar1ini[N];

	double yvar2ini[N];

	for(i = 0 ; i < N ; i++){

		yvar1ini[i] = (double) rand() / (double) RAND_MAX;

		yvar2ini[i] = (double) rand() / (double) RAND_MAX;

		yvar3[i] = (double) rand() / (double) RAND_MAX;

	}

	

	fprintf(stdout,"1;2;3\n");

	for(i=0;i<N;++i){

		fprintf(stdout,"%f;%f;%f\n",yvar1ini[i],yvar2ini[i],yvar3[i]);

	}

	

	/*inplace sequential prediction +=*/

	/*

	memcpy(yvar1, yvar1ini, N * sizeof(double));

	memcpy(yvar2, yvar2ini, N * sizeof(double));

	start = get_time();

	for (j=0; j < NITER; j++)

		for(i = 0 ; i < N ; i++){

			yvar1[i] += yvar2[i] + yvar3[i];

			yvar2[i] += 2.0 * yvar3[i];

		}

	stop = get_time();

	t= (double) (stop-start) / 1000.0;

	printf("inplace sequential prediction += : %f ms\n",t/NITER);

	*/

	/*GPU parallalization*/

	printf("Limite : %d \n", limit_h);

	

	double * yvar1_d, *yvar2_d, *yvar3_d;

	unsigned int * limite_d, test;

        cudaMalloc((void**) &yvar1_d, sizeof(double) * limit_h);

	cudaMalloc((void**) &yvar2_d, sizeof(double) * limit_h);

	cudaMalloc((void**) &yvar3_d, sizeof(double) * limit_h);

	cudaMalloc((void**) &limite_d, sizeof(unsigned int));

	cudaMemcpyToSymbol("limit_d", &limit_h, sizeof(unsigned int));

	cudaMemcpy(limite_d, &limit_h, sizeof(unsigned int), cudaMemcpyHostToDevice);

	cudaMemcpy(&test, limite_d, sizeof(unsigned int), cudaMemcpyDeviceToHost);

	

	printf("Limite GPU : %d\n", test);

	

	cudaMemcpy(yvar1_d, yvar1ini, sizeof(double)*limit_h, cudaMemcpyHostToDevice);

	cudaMemcpy(yvar2_d, yvar2ini, sizeof(double)*limit_h, cudaMemcpyHostToDevice);

	cudaMemcpy(yvar3_d, yvar3, sizeof(double)*limit_h, cudaMemcpyHostToDevice);

	for(j=0;j<NITER;++j){

		unsigned int blocks = limit_h / THREAD_PER_BLOCK + (limit_h % THREAD_PER_BLOCK > 0);

		printf("Blocks : %d, threads : %d\n", blocks, THREAD_PER_BLOCK);

		addition<<<blocks, THREAD_PER_BLOCK>>>(yvar1_d, yvar2_d, yvar3_d, limite_d);

	}

	cudaMemcpy(yvar1, yvar1_d, sizeof(double)*limit_h, cudaMemcpyDeviceToHost);

	cudaMemcpy(yvar2, yvar2_d, sizeof(double)*limit_h, cudaMemcpyDeviceToHost);

	cudaMemcpy(yvar3, yvar3_d, sizeof(double)*limit_h, cudaMemcpyDeviceToHost);

	

	cudaFree(yvar1_d);

	cudaFree(yvar2_d);

	cudaFree(yvar3_d);

	cudaFree(limite_d);

	cudaFree(&limit_d);

	

	FILE * saisie;

	saisie=fopen("prediction.csv","wt");

	fprintf(stdout,"1;2;3\n");

	for(i=0;i<N;++i){

		fprintf(stdout,"%f;%f;%f\n",yvar1[i],yvar2[i],yvar3[i]);

	}

	fclose(saisie);

	return 0;

}

The results are :

Limite : 10

1;2;3

0.840188;0.394383;0.783099

0.798440;0.911647;0.197551

0.335223;0.768230;0.277775

0.553970;0.477397;0.628871

0.364784;0.513401;0.952230

0.916195;0.635712;0.717297

0.141603;0.606969;0.016301

0.242887;0.137232;0.804177

0.156679;0.400944;0.129790

0.108809;0.998924;0.218257

Limite : 10

Limite GPU : 10

Blocks : 1, threads : 512

1;2;3

0.840188;0.394383;0.783099

0.798440;0.911647;0.197551

0.335223;0.768230;0.277775

0.553970;0.477397;0.628871

0.364784;0.513401;0.952230

0.916195;0.635712;0.717297

0.141603;0.606969;0.016301

0.242887;0.137232;0.804177

0.156679;0.400944;0.129790

0.108809;0.998924;0.218257

As you can see, the data transfers are well executed (the final array of the host is initialized but not filled before the cudaMemcpyDeviceToHost), and one block is launched but the arrays are not modified. Have someone got any idea of the problem ?

Thank you,

Best regards.

The program you posted is so full of bugs it would not even compile. So where do the results come from that you posted?

I didn’t put all the initialization details, I edited my message and it’s compilable now. I tried to remove the limit in the kernel and runned it with arrays of 512 elements, but the result is the same.

The arrays appear to be the same because after invoking the kernel you still print [font=“Courier New”]yvar1ini[/font] and [font=“Courier New”]yvar2ini[/font], not [font=“Courier New”]yvar1[/font] and [font=“Courier New”]yvar2[/font].

Also, [font=“Courier New”]cudaFree(&limit_d);[/font] is going to fail because it was never allocated. Always check return values!

Thanks for the cudaFree, I was wondering if it was usefull or not. The display of the initial arrays was an error (I don’t remember why I did that at some point), even when I display the normally up to date arrays, I’ve got the same absence of results (I’ve edited the code).

Well it works for me.

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

// #define N 512

#define N 10

#define NITER 1

#define THREAD_PER_BLOCK 512

__global__ void addition(double * arr1, double * arr2, double * arr3, unsigned int * limit){

        unsigned int id = blockIdx.x * THREAD_PER_BLOCK + threadIdx.x;

if(id<*limit) {

                arr1[id] = arr1[id] + arr2[id] + arr3[id];

                arr2[id] = arr2[id] + 2.0 * arr3[id];

        }

}

int main(int argc, char *argv[])

{

        int i, j;

        unsigned int limit_h = N;

        printf("Limite : %d \n", limit_h);

//srand(10);

/*define initial state*/

        double yvar1[N];

        double yvar2[N];

        double yvar3[N];

double yvar1ini[N];

        double yvar2ini[N];

for(i = 0 ; i < N ; i++){

                yvar1ini[i] = (double) rand() / (double) RAND_MAX;

                yvar2ini[i] = (double) rand() / (double) RAND_MAX;

                yvar3[i] = (double) rand() / (double) RAND_MAX;

        }

fprintf(stdout,"1;2;3\n");

        for(i=0;i<N;++i){

                fprintf(stdout,"%f;%f;%f\n",yvar1ini[i],yvar2ini[i],yvar3[i]);

        }

/*GPU parallalization*/

        printf("Limite : %d \n", limit_h);

double * yvar1_d, *yvar2_d, *yvar3_d;

        unsigned int * limite_d, test;

        cudaMalloc((void**) &yvar1_d, sizeof(double) * limit_h);

        cudaMalloc((void**) &yvar2_d, sizeof(double) * limit_h);

        cudaMalloc((void**) &yvar3_d, sizeof(double) * limit_h);

        cudaMalloc((void**) &limite_d, sizeof(unsigned int));

        printf("%s.\n", cudaGetErrorString(cudaGetLastError()));

cudaMemcpy(limite_d, &limit_h, sizeof(unsigned int), cudaMemcpyHostToDevice);

        cudaMemcpy(&test, limite_d, sizeof(unsigned int), cudaMemcpyDeviceToHost);

printf("Limite GPU : %d\n", test);

cudaMemcpy(yvar1_d, yvar1ini, sizeof(double)*limit_h, cudaMemcpyHostToDevice);

        cudaMemcpy(yvar2_d, yvar2ini, sizeof(double)*limit_h, cudaMemcpyHostToDevice);

        cudaMemcpy(yvar3_d, yvar3, sizeof(double)*limit_h, cudaMemcpyHostToDevice);

        printf("%s.\n", cudaGetErrorString(cudaGetLastError()));

        for(j=0;j<NITER;++j){

                unsigned int blocks = limit_h / THREAD_PER_BLOCK + (limit_h % THREAD_PER_BLOCK > 0);

                printf("Blocks : %d, threads : %d\n", blocks, THREAD_PER_BLOCK);

                addition<<<blocks, THREAD_PER_BLOCK>>>(yvar1_d, yvar2_d, yvar3_d, limite_d);

                printf("%s.\n", cudaGetErrorString(cudaGetLastError()));

        }

        cudaMemcpy(yvar1, yvar1_d, sizeof(double)*limit_h, cudaMemcpyDeviceToHost);

        cudaMemcpy(yvar2, yvar2_d, sizeof(double)*limit_h, cudaMemcpyDeviceToHost);

        cudaMemcpy(yvar3, yvar3_d, sizeof(double)*limit_h, cudaMemcpyDeviceToHost);

        printf("%s.\n", cudaGetErrorString(cudaGetLastError()));

cudaFree(yvar1_d);

        cudaFree(yvar2_d);

        cudaFree(yvar3_d);

        cudaFree(limite_d);

        printf("%s.\n", cudaGetErrorString(cudaGetLastError()));

fprintf(stdout,"1;2;3\n");

        for(i=0;i<N;++i){

                fprintf(stdout,"%f;%f;%f\n",yvar1[i],yvar2[i],yvar3[i]);

        }

        return 0;

}

result:

[font=“Courier New”]Limite : 10

1;2;3

0.840188;0.394383;0.783099

0.798440;0.911647;0.197551

0.335223;0.768230;0.277775

0.553970;0.477397;0.628871

0.364784;0.513401;0.952230

0.916195;0.635712;0.717297

0.141603;0.606969;0.016301

0.242887;0.137232;0.804177

0.156679;0.400944;0.129790

0.108809;0.998925;0.218257

Limite : 10

no error.

Limite GPU : 10

no error.

Blocks : 1, threads : 512

no error.

no error.

no error.

1;2;3

2.017670;1.960581;0.783099

1.907639;1.306750;0.197551

1.381227;1.323779;0.277775

1.660238;1.735139;0.628871

1.830415;2.417860;0.952230

2.269204;2.070306;0.717297

0.764872;0.639570;0.016301

1.184295;1.745585;0.804177

0.687414;0.660525;0.129790

1.325990;1.435438;0.218257

[/font]

Thank you for your help, I know what’s wrong : I don’t know why but double are not supported on my GPUs. I took the vectorAdd example of the SDK and transform it so it can fulfill my needs. It works because it uses float arrays. When I use double, there is no results. Maybe it’s because I’m on a 64-bits system and 64 bits variables are not supported.

Not all GPUs support double precision: only compute capability 1.3, 2.0 and 2.1 are double precision capable. There is a table in an appendix of the CUDA programming guide which lists cards and their compute capability.

Thanks, I looked in it, but the Quadro FX 5800 and the Tesla C1060 have both 1.3 compute capabilities and should be able to support the double precision (according to the appendinx). That’s just strange.

Are you compiling for compute capability 1.3? The compiler defaults to compute 1.1 (ie. no double precision support) if you don’t tell it otherwise.