[Cuda C++ dev] Problem with "new" instructions

Hello,

I’m currently trying to learn how to use Cuda, by creating a program that gives the sum of two matrixes. I already have understood that with Cuda, we can’t use array of pointers, so I found a different way (just below), but when I try to allocate my original 2D arrays, I just get “expression has no effect” warnings , always followed by "expected a “;” " and “unrecognized token” errors… On the code below, it’s on the lines where I set b[i] and c[i].

Anybody has an explanation ? Thank you very much…

include <stdio.h>
include

typedef int mytype;

global void matrix_add(int *a, int *b, int *c)
{
c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

mytype **add_matrixes(mytype **a, mytype **b, int nbRows, int nbCols)
{
mytype *a_vec;
mytype *b_vec;
mytype *c_vec;

mytype *d_a;
mytype *d_b;
mytype *d_c;

//--------------------

a_vec = new mytype[nbRows * nbCols];
b_vec = new mytype[nbRows * nbCols];
c_vec = new mytype[nbRows * nbCols];

for (int i = 0; i < nbRows; i++)
{
    for (int j = 0; j < nbCols; j++)
    {
        a_vec[i * nbCols + j] = a[i][j];
        b_vec[i * nbCols + j] = b[i][j];
        c_vec[i * nbCols + j] = 0;
    }
}

cudaMalloc(&d_a, nbRows*nbCols*sizeof(mytype));
cudaMalloc(&d_b, nbRows*nbCols*sizeof(mytype));
cudaMalloc(&d_c, nbRows*nbCols*sizeof(mytype));

//copy to device
cudaMemcpy(d_a, a_vec, nbRows*nbCols*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b_vec, nbRows*nbCols*sizeof(mytype), cudaMemcpyHostToDevice);

matrix_add<<<1, 4>>>(d_a, d_b, d_c);

cudaMemcpy(c_vec, d_c, nbRows*nbCols*sizeof(mytype), cudaMemcpyDeviceToHost);

mytype **c = new mytype*[nbRows];
for (unsigned int i = 0; i < nbRows; i++)
{
    c[i] = new mytype[nbCols];
}

for (int i = 0; i < nbRows; i++)
{
    for (int j = 0; j < nbCols; j++)
    {
        c[i][j] = c_vec[i * nbCols + j];
    }
}

return(c);

}

int main(){

const int rows = 4;
const int cols = 4;

mytype **a = new mytype *[rows];
mytype **b = new mytype *[rows];
mytype **c = new mytype *[rows];

for (int i = 0; i < rows; i++)
{
    a[i] = new mytype[cols];
    b[i] = new mytype[cols];
    c[i] = new mytype[cols];
    for (int j = 0; j < cols; j++)
    {
        a[i][j] = rand() * 100;
        b[i][j] = rand() * 100;
        c[i][j] = 0;
    }
}


for (int i = 0; i < rows; i++)
{
    for (int j = 0; j < cols; j++)
    {
        std::cout << c[i][j] << std::endl;
    }
}

return 0;

}

EDIT :

I have tried a few other ways, and when I do it like below, it works. However, I have once again this error when I want to set the whole array values by hand… In the second version, I have the same error as above on every last value of a row (a[0][3], a[1][3], b[0][3], etc…)

Working :

include <stdio.h>
include

global void matrix_add(int *a, int *b, int *c)
{
c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

int **add_matrixes(int **a, int **b, int nbRows, int nbCols)
{
int *a_vec;
int *b_vec;
int *c_vec;

int *d_a;
int *d_b;
int *d_c;

//--------------------

a_vec = new int[nbRows * nbCols];
b_vec = new int[nbRows * nbCols];
c_vec = new int[nbRows * nbCols];

for (int i = 0; i < nbRows; i++)
{
    for (int j = 0; j < nbCols; j++)
    {
        a_vec[i * nbCols + j] = a[i][j];
        b_vec[i * nbCols + j] = b[i][j];
        c_vec[i * nbCols + j] = 0;
    }
}

cudaMalloc(&d_a, nbRows*nbCols*sizeof(int));
cudaMalloc(&d_b, nbRows*nbCols*sizeof(int));
cudaMalloc(&d_c, nbRows*nbCols*sizeof(int));

//copy to device
cudaMemcpy(d_a, a_vec, nbRows*nbCols*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b_vec, nbRows*nbCols*sizeof(int), cudaMemcpyHostToDevice);

matrix_add<<<1, 16>>>(d_a, d_b, d_c);

cudaMemcpy(c_vec, d_c, nbRows*nbCols*sizeof(int), cudaMemcpyDeviceToHost);

int **c = new int*[nbRows];
for (unsigned int i = 0; i < nbRows; i++)
{
    c[i] = new int[nbCols];
}

for (int i = 0; i < nbRows; i++)
{
    for (int j = 0; j < nbCols; j++)
    {
        c[i][j] = c_vec[i * nbCols + j];
    }
}

return(c);

}

int main(){

const int rows = 4;
const int cols = 4;

int **a = new int*[rows];
int **b = new int*[rows];
int **c;

for (int i = 0; i < rows; i++)
{
    a[i] = new int[cols];
    for (int j = 0; j < cols; j++)
    {
        a[i][j] = rand() * 100;
    }
}
for (int i = 0; i < rows; i++)
{
    b[i] = new int[cols];
    for (int j = 0; j < cols; j++)
    {
        b[i][j] = rand() * 100;
    }
}

c = add_matrixes(a, b, rows, cols);

for (int i = 0; i < rows; i++)
{
    for (int j = 0; j < cols; j++)
    {
        std::cout << "a[" << i << "][" << j << "] = " << a[i][j] << std::endl;
        std::cout << "b[" << i << "][" << j << "] = " << b[i][j] << std::endl;
        std::cout << "c[" << i << "][" << j << "] = " << c[i][j] << std::endl;
    }
}

return 0;

}

Not working :

include <stdio.h>
include

global void matrix_add(int *a, int *b, int *c)
{
c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

int **add_matrixes(int **a, int **b, int nbRows, int nbCols)
{
int *a_vec;
int *b_vec;
int *c_vec;

int *d_a;
int *d_b;
int *d_c;

//--------------------

a_vec = new int[nbRows * nbCols];
b_vec = new int[nbRows * nbCols];
c_vec = new int[nbRows * nbCols];

for (int i = 0; i < nbRows; i++)
{
    for (int j = 0; j < nbCols; j++)
    {
        a_vec[i * nbCols + j] = a[i][j];
        b_vec[i * nbCols + j] = b[i][j];
        c_vec[i * nbCols + j] = 0;
    }
}

cudaMalloc(&d_a, nbRows*nbCols*sizeof(int));
cudaMalloc(&d_b, nbRows*nbCols*sizeof(int));
cudaMalloc(&d_c, nbRows*nbCols*sizeof(int));

//copy to device
cudaMemcpy(d_a, a_vec, nbRows*nbCols*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b_vec, nbRows*nbCols*sizeof(int), cudaMemcpyHostToDevice);

matrix_add<<<1, 16>>>(d_a, d_b, d_c);

cudaMemcpy(c_vec, d_c, nbRows*nbCols*sizeof(int), cudaMemcpyDeviceToHost);

int **c = new int*[nbRows];
for (unsigned int i = 0; i < nbRows; i++)
{
    c[i] = new int[nbCols];
}

for (int i = 0; i < nbRows; i++)
{
    for (int j = 0; j < nbCols; j++)
    {
        c[i][j] = c_vec[i * nbCols + j];
    }
}

return(c);

}

int main(){

const int rows = 5;
const int cols = 5;

int **a = new int*[rows];
int **b = new int*[rows];
int **c;

for (int i = 0; i < rows; i++)
{
    a[i] = new int[cols];
}
a[0][0] = 1;
a[0][1] = 2;
a[0][2] = 3;
a[0][3] = 4;
a[1][0] = 5;
a[1][1] = 6;
a[1][2] = 7;
a[1][3] = 8;
a[2][0] = 9;
a[2][1] = 10;
a[2][2] = 11;
a[2][3] = 12;
a[3][0] = 13;
a[3][1] = 14;
a[3][2] = 15;
a[3][3] = 16;
for (int i = 0; i < rows; i++)
{
    b[i] = new int[cols];
}
b[0][0] = 1;
b[0][1] = 2;
b[0][2] = 3;
b[0][3] = 4;
b[1][0] = 5;
b[1][1] = 6;
b[1][2] = 7;
b[1][3] = 8;
b[2][0] = 9;
b[2][1] = 10;
b[2][2] = 11;
b[2][3] = 12;
b[3][0] = 13;
b[3][1] = 14;
b[3][2] = 15;
b[3][3] = 16;
c = add_matrixes(a, b, rows, cols);

for (int i = 0; i < rows; i++)
{
    for (int j = 0; j < cols; j++)
    {
        std::cout << "a[" << i << "][" << j << "] = " << a[i][j] << std::endl;
        std::cout << "b[" << i << "][" << j << "] = " << b[i][j] << std::endl;
        std::cout << "c[" << i << "][" << j << "] = " << c[i][j] << std::endl;
    }
}

return 0;

}