Can't call a function with CUDA when i compile my code, the "add" function just won'

can anybody help me trace my code please? i want to use the add function to compute the array, but i just cant do it…
does anyone know what’s wrong with my code, or i missed something?

The Code :

#include “iostream”
#include “conio.h”

#define N 10

global void add( int **a, int **b ) {
int tid = threadIdx.x;
if (tid < N) {
for (int j=0; j<N; j++) {
b[tid][j] = a[tid][j] + b[tid][j];
}
}
}

int main( void ) {
int *a[N], *b[N];
int **dev_a, **dev_b;

cudaMalloc( (void **)&dev_a, Nsizeof(int ));
cudaMalloc( (void **)&dev_b, N
sizeof(int ));
for(int i=0; i<N; i++) {
cudaMalloc( (void **)&a[i], N
N
sizeof(int));
cudaMalloc( (void **)&b[i], NNsizeof(int));
}

cudaMemcpy(dev_a, a, NNsizeof(int ), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N
N*sizeof(int *), cudaMemcpyHostToDevice);

int **cpuArrayA;
int **cpuArrayB;

cpuArrayA = new int*[N];
cpuArrayB = new int*[N];
for(int i=0;i<N;i++) {
cpuArrayA[i] = new int[N];
cpuArrayB[i] = new int[N];
}

for(int i=0;i<N;i++) {
for(int j=0;j<N;j++) {
cpuArrayA[i][j] = i+j;
cpuArrayB[i][j] = i*j;
}
}

for(int i=0; i<N; i++) {
cudaMemcpy(a[i], cpuArrayA[i], Nsizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(b[i], cpuArrayB[i], N
sizeof(int), cudaMemcpyHostToDevice);
}

add<<<1,N>>>( dev_a, dev_b );

for(int i=0; i<N; i++) {
cudaMemcpy(cpuArrayB[i], b[i], N*sizeof(int), cudaMemcpyDeviceToHost);
}

for(int i=0;i<N;i++) {
for(int j=0;j<N;j++) {
printf(“cpuArrayB[%d][%d] = %d\n”, i, j, cpuArrayB[i][j]);
}
}
_getch();
}

can anybody help me trace my code please? i want to use the add function to compute the array, but i just cant do it…
does anyone know what’s wrong with my code, or i missed something?

The Code :

#include “iostream”
#include “conio.h”

#define N 10

global void add( int **a, int **b ) {
int tid = threadIdx.x;
if (tid < N) {
for (int j=0; j<N; j++) {
b[tid][j] = a[tid][j] + b[tid][j];
}
}
}

int main( void ) {
int *a[N], *b[N];
int **dev_a, **dev_b;

cudaMalloc( (void **)&dev_a, Nsizeof(int ));
cudaMalloc( (void **)&dev_b, N
sizeof(int ));
for(int i=0; i<N; i++) {
cudaMalloc( (void **)&a[i], N
N
sizeof(int));
cudaMalloc( (void **)&b[i], NNsizeof(int));
}

cudaMemcpy(dev_a, a, NNsizeof(int ), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N
N*sizeof(int *), cudaMemcpyHostToDevice);

int **cpuArrayA;
int **cpuArrayB;

cpuArrayA = new int*[N];
cpuArrayB = new int*[N];
for(int i=0;i<N;i++) {
cpuArrayA[i] = new int[N];
cpuArrayB[i] = new int[N];
}

for(int i=0;i<N;i++) {
for(int j=0;j<N;j++) {
cpuArrayA[i][j] = i+j;
cpuArrayB[i][j] = i*j;
}
}

for(int i=0; i<N; i++) {
cudaMemcpy(a[i], cpuArrayA[i], Nsizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(b[i], cpuArrayB[i], N
sizeof(int), cudaMemcpyHostToDevice);
}

add<<<1,N>>>( dev_a, dev_b );

for(int i=0; i<N; i++) {
cudaMemcpy(cpuArrayB[i], b[i], N*sizeof(int), cudaMemcpyDeviceToHost);
}

for(int i=0;i<N;i++) {
for(int j=0;j<N;j++) {
printf(“cpuArrayB[%d][%d] = %d\n”, i, j, cpuArrayB[i][j]);
}
}
_getch();
}

You’ve got 2 sets of arrays going for some unknown reason.
You’re initializing the values of the cpuArrays but adding the values of dev_a and dev_b.

You’ve got 2 sets of arrays going for some unknown reason.
You’re initializing the values of the cpuArrays but adding the values of dev_a and dev_b.

oh… so what i need to do then? can you help me? i’m still a beginner in this…

and i have this CUDA for my thesis…

i’ve tried to change the dev_a to cpuArrayA, and dev_b to cpuArrayB but still the same output…

add<<<1,N>>>( dev_a, dev_b );

to :

add<<<1,N>>>( cpuArrayA, cpuArrayB );

oh… so what i need to do then? can you help me? i’m still a beginner in this…

and i have this CUDA for my thesis…

i’ve tried to change the dev_a to cpuArrayA, and dev_b to cpuArrayB but still the same output…

add<<<1,N>>>( dev_a, dev_b );

to :

add<<<1,N>>>( cpuArrayA, cpuArrayB );

I suggest you look at the Vector Addition example in the SDK.

You need to

  1. allocate memory on the host for two 2d arrays
  2. initialize those arrays
  3. allocate memory on the device for two 2d arrays
  4. copy your host arrays to the device arrays
  5. call your kernel
  6. copy your result 2d array back to the host

I suggest you look at the Vector Addition example in the SDK.

You need to

  1. allocate memory on the host for two 2d arrays
  2. initialize those arrays
  3. allocate memory on the device for two 2d arrays
  4. copy your host arrays to the device arrays
  5. call your kernel
  6. copy your result 2d array back to the host

mr. Dittoaway, i’ve changed my code, follow your instruction but i got 0 for the result…

anything wrong with my code?

include <conio.h>

include <stdio.h>

include <cutil_inline.h>

global void add( int **d, int **e, int **f, int N ) {

int tid = threadIdx.x;

if (tid < N) {

    for (int j=0; j<N; j++) {

        f[tid][j] = d[tid][j] + e[tid][j]; 

    } 

}

}

int main( int argc, char** argv ) {

const int N = 10;

int i, j;

int *a[N], *b[N], *c[N];

int **dev_a, **dev_b, **dev_c;

int **cpuArrayA;

int **cpuArrayB;

int **cpuArrayC;

//allocate memory on the host for 2d array

cpuArrayA = (int**)malloc(N * sizeof(int*));

cpuArrayB = (int**)malloc(N * sizeof(int*));

cpuArrayC = (int**)malloc(N * sizeof(int*));

for(i=0; i<N; i++) {

    cpuArrayA[i] = (int*)malloc(N * sizeof(int));

    cpuArrayB[i] = (int*)malloc(N * sizeof(int));

    cpuArrayC[i] = (int*)malloc(N * sizeof(int));

}

//initialize those arrays

for(i=0;i<N;i++) {

    for(j=0;j<N;j++) {

        cpuArrayA[i][j] = i+j;

        cpuArrayB[i][j] = i*j;

    }

}

//allocate memory on the device for two 2d arrays

cudaMalloc( (void **)&dev_a, N*sizeof(int *));

cudaMalloc( (void **)&dev_b, N*sizeof(int *));

cudaMalloc( (void **)&dev_c, N*sizeof(int *));

for(i=0; i<N; i++) {

    cudaMalloc( (void **)&a[i], N*N*sizeof(int));

    cudaMalloc( (void **)&b[i], N*N*sizeof(int));

    cudaMalloc( (void **)&c[i], N*N*sizeof(int));

}

//copy your host arrays to the device arrays

cudaMemcpy(dev_a, a, N*N*sizeof(int *), cudaMemcpyHostToDevice);

cudaMemcpy(dev_b, b, N*N*sizeof(int *), cudaMemcpyHostToDevice);

for(i=0; i<N; i++) {

    cudaMemcpy(a[i], cpuArrayA[i], N*sizeof(int), cudaMemcpyHostToDevice);

    cudaMemcpy(b[i], cpuArrayB[i], N*sizeof(int), cudaMemcpyHostToDevice);

}

//call your kernel

add<<<1,N>>>( dev_a, dev_b, dev_c, N );

//copy your result 2d array back to the host

for(i=0; i<N; i++) {

    cudaMemcpy(cpuArrayC[i], c[i], N*sizeof(int), cudaMemcpyDeviceToHost);

}

for (i=0; i<N; i++) {

    for(j=0;j<N;j++) {

        int sum = cpuArrayA[i][j] + cpuArrayB[i][j];

        printf("CPU cpuArrayC[%d][%d] = %d \n", i, j, sum);

    }

}

for(i=0;i<N;i++) {

    for(j=0;j<N;j++) {

        printf("GPU cpuArrayC[%d][%d] = %d\n", i, j, cpuArrayC[i][j]);

    }

}

_getch();

}

mr. Dittoaway, i’ve changed my code, follow your instruction but i got 0 for the result…

anything wrong with my code?

include <conio.h>

include <stdio.h>

include <cutil_inline.h>

global void add( int **d, int **e, int **f, int N ) {

int tid = threadIdx.x;

if (tid < N) {

    for (int j=0; j<N; j++) {

        f[tid][j] = d[tid][j] + e[tid][j]; 

    } 

}

}

int main( int argc, char** argv ) {

const int N = 10;

int i, j;

int *a[N], *b[N], *c[N];

int **dev_a, **dev_b, **dev_c;

int **cpuArrayA;

int **cpuArrayB;

int **cpuArrayC;

//allocate memory on the host for 2d array

cpuArrayA = (int**)malloc(N * sizeof(int*));

cpuArrayB = (int**)malloc(N * sizeof(int*));

cpuArrayC = (int**)malloc(N * sizeof(int*));

for(i=0; i<N; i++) {

    cpuArrayA[i] = (int*)malloc(N * sizeof(int));

    cpuArrayB[i] = (int*)malloc(N * sizeof(int));

    cpuArrayC[i] = (int*)malloc(N * sizeof(int));

}

//initialize those arrays

for(i=0;i<N;i++) {

    for(j=0;j<N;j++) {

        cpuArrayA[i][j] = i+j;

        cpuArrayB[i][j] = i*j;

    }

}

//allocate memory on the device for two 2d arrays

cudaMalloc( (void **)&dev_a, N*sizeof(int *));

cudaMalloc( (void **)&dev_b, N*sizeof(int *));

cudaMalloc( (void **)&dev_c, N*sizeof(int *));

for(i=0; i<N; i++) {

    cudaMalloc( (void **)&a[i], N*N*sizeof(int));

    cudaMalloc( (void **)&b[i], N*N*sizeof(int));

    cudaMalloc( (void **)&c[i], N*N*sizeof(int));

}

//copy your host arrays to the device arrays

cudaMemcpy(dev_a, a, N*N*sizeof(int *), cudaMemcpyHostToDevice);

cudaMemcpy(dev_b, b, N*N*sizeof(int *), cudaMemcpyHostToDevice);

for(i=0; i<N; i++) {

    cudaMemcpy(a[i], cpuArrayA[i], N*sizeof(int), cudaMemcpyHostToDevice);

    cudaMemcpy(b[i], cpuArrayB[i], N*sizeof(int), cudaMemcpyHostToDevice);

}

//call your kernel

add<<<1,N>>>( dev_a, dev_b, dev_c, N );

//copy your result 2d array back to the host

for(i=0; i<N; i++) {

    cudaMemcpy(cpuArrayC[i], c[i], N*sizeof(int), cudaMemcpyDeviceToHost);

}

for (i=0; i<N; i++) {

    for(j=0;j<N;j++) {

        int sum = cpuArrayA[i][j] + cpuArrayB[i][j];

        printf("CPU cpuArrayC[%d][%d] = %d \n", i, j, sum);

    }

}

for(i=0;i<N;i++) {

    for(j=0;j<N;j++) {

        printf("GPU cpuArrayC[%d][%d] = %d\n", i, j, cpuArrayC[i][j]);

    }

}

_getch();

}

i’m confused while allocate 2D array

i’m confused while allocate 2D array

Yes it’s confusing me too.

It’s much easier to treat a 2d array as a flat 1d array.

float* array1;

size_t array1Size = width * height * sizeof(float);

cudaMalloc((void**)&array1, array1Size);

and in your kernel compute the index yourself

array1[i + width*j] = …

Yes it’s confusing me too.

It’s much easier to treat a 2d array as a flat 1d array.

float* array1;

size_t array1Size = width * height * sizeof(float);

cudaMalloc((void**)&array1, array1Size);

and in your kernel compute the index yourself

array1[i + width*j] = …

oic, i’ll try it… how about 3d array? what is the formula to make it become flat 1d array?

oic, i’ll try it… how about 3d array? what is the formula to make it become flat 1d array?

for(i=0;i<length;i++)

   for(j=0;j<width;j++)

      for(k=0;k<height;k++)

      {

         one_d[i*length*width + j*width + k] = three_d[i][j][k];

      }
for(i=0;i<length;i++)

   for(j=0;j<width;j++)

      for(k=0;k<height;k++)

      {

         one_d[i*length*width + j*width + k] = three_d[i][j][k];

      }

i got it… but i use the different formula for 3d array…

i think it should be

i + j * length + k * length * width

i got it… but i use the different formula for 3d array…

i think it should be

i + j * length + k * length * width