Hello,
I’m currently trying to learn how to use Cuda, by creating a program that gives the sum of two matrixes. I already have understood that with Cuda, we can’t use array of pointers, so I found a different way (just below), but when I try to allocate my original 2D arrays, I just get “expression has no effect” warnings , always followed by "expected a “;” " and “unrecognized token” errors… On the code below, it’s on the lines where I set b[i] and c[i].
Anybody has an explanation ? Thank you very much…
typedef int mytype;
global void matrix_add(int *a, int *b, int *c)
{
c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}mytype **add_matrixes(mytype **a, mytype **b, int nbRows, int nbCols)
{
mytype *a_vec;
mytype *b_vec;
mytype *c_vec;mytype *d_a; mytype *d_b; mytype *d_c;
//--------------------
a_vec = new mytype[nbRows * nbCols]; b_vec = new mytype[nbRows * nbCols]; c_vec = new mytype[nbRows * nbCols]; for (int i = 0; i < nbRows; i++) { for (int j = 0; j < nbCols; j++) { a_vec[i * nbCols + j] = a[i][j]; b_vec[i * nbCols + j] = b[i][j]; c_vec[i * nbCols + j] = 0; } } cudaMalloc(&d_a, nbRows*nbCols*sizeof(mytype)); cudaMalloc(&d_b, nbRows*nbCols*sizeof(mytype)); cudaMalloc(&d_c, nbRows*nbCols*sizeof(mytype)); //copy to device cudaMemcpy(d_a, a_vec, nbRows*nbCols*sizeof(mytype), cudaMemcpyHostToDevice); cudaMemcpy(d_b, b_vec, nbRows*nbCols*sizeof(mytype), cudaMemcpyHostToDevice); matrix_add<<<1, 4>>>(d_a, d_b, d_c); cudaMemcpy(c_vec, d_c, nbRows*nbCols*sizeof(mytype), cudaMemcpyDeviceToHost); mytype **c = new mytype*[nbRows]; for (unsigned int i = 0; i < nbRows; i++) { c[i] = new mytype[nbCols]; } for (int i = 0; i < nbRows; i++) { for (int j = 0; j < nbCols; j++) { c[i][j] = c_vec[i * nbCols + j]; } } return(c);
}
int main(){
const int rows = 4; const int cols = 4; mytype **a = new mytype *[rows]; mytype **b = new mytype *[rows]; mytype **c = new mytype *[rows]; for (int i = 0; i < rows; i++) { a[i] = new mytype[cols]; b[i] = new mytype[cols]; c[i] = new mytype[cols]; for (int j = 0; j < cols; j++) { a[i][j] = rand() * 100; b[i][j] = rand() * 100; c[i][j] = 0; } } for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { std::cout << c[i][j] << std::endl; } } return 0;
}
EDIT :
I have tried a few other ways, and when I do it like below, it works. However, I have once again this error when I want to set the whole array values by hand… In the second version, I have the same error as above on every last value of a row (a[0][3], a[1][3], b[0][3], etc…)
Working :
global void matrix_add(int *a, int *b, int *c)
{
c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}
int **add_matrixes(int **a, int **b, int nbRows, int nbCols)
{
int *a_vec;
int *b_vec;
int *c_vec;
int *d_a;
int *d_b;
int *d_c;
//--------------------
a_vec = new int[nbRows * nbCols];
b_vec = new int[nbRows * nbCols];
c_vec = new int[nbRows * nbCols];
for (int i = 0; i < nbRows; i++)
{
for (int j = 0; j < nbCols; j++)
{
a_vec[i * nbCols + j] = a[i][j];
b_vec[i * nbCols + j] = b[i][j];
c_vec[i * nbCols + j] = 0;
}
}
cudaMalloc(&d_a, nbRows*nbCols*sizeof(int));
cudaMalloc(&d_b, nbRows*nbCols*sizeof(int));
cudaMalloc(&d_c, nbRows*nbCols*sizeof(int));
//copy to device
cudaMemcpy(d_a, a_vec, nbRows*nbCols*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b_vec, nbRows*nbCols*sizeof(int), cudaMemcpyHostToDevice);
matrix_add<<<1, 16>>>(d_a, d_b, d_c);
cudaMemcpy(c_vec, d_c, nbRows*nbCols*sizeof(int), cudaMemcpyDeviceToHost);
int **c = new int*[nbRows];
for (unsigned int i = 0; i < nbRows; i++)
{
c[i] = new int[nbCols];
}
for (int i = 0; i < nbRows; i++)
{
for (int j = 0; j < nbCols; j++)
{
c[i][j] = c_vec[i * nbCols + j];
}
}
return(c);
}
int main(){
const int rows = 4;
const int cols = 4;
int **a = new int*[rows];
int **b = new int*[rows];
int **c;
for (int i = 0; i < rows; i++)
{
a[i] = new int[cols];
for (int j = 0; j < cols; j++)
{
a[i][j] = rand() * 100;
}
}
for (int i = 0; i < rows; i++)
{
b[i] = new int[cols];
for (int j = 0; j < cols; j++)
{
b[i][j] = rand() * 100;
}
}
c = add_matrixes(a, b, rows, cols);
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
std::cout << "a[" << i << "][" << j << "] = " << a[i][j] << std::endl;
std::cout << "b[" << i << "][" << j << "] = " << b[i][j] << std::endl;
std::cout << "c[" << i << "][" << j << "] = " << c[i][j] << std::endl;
}
}
return 0;
}
Not working :
global void matrix_add(int *a, int *b, int *c)
{
c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}
int **add_matrixes(int **a, int **b, int nbRows, int nbCols)
{
int *a_vec;
int *b_vec;
int *c_vec;
int *d_a;
int *d_b;
int *d_c;
//--------------------
a_vec = new int[nbRows * nbCols];
b_vec = new int[nbRows * nbCols];
c_vec = new int[nbRows * nbCols];
for (int i = 0; i < nbRows; i++)
{
for (int j = 0; j < nbCols; j++)
{
a_vec[i * nbCols + j] = a[i][j];
b_vec[i * nbCols + j] = b[i][j];
c_vec[i * nbCols + j] = 0;
}
}
cudaMalloc(&d_a, nbRows*nbCols*sizeof(int));
cudaMalloc(&d_b, nbRows*nbCols*sizeof(int));
cudaMalloc(&d_c, nbRows*nbCols*sizeof(int));
//copy to device
cudaMemcpy(d_a, a_vec, nbRows*nbCols*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b_vec, nbRows*nbCols*sizeof(int), cudaMemcpyHostToDevice);
matrix_add<<<1, 16>>>(d_a, d_b, d_c);
cudaMemcpy(c_vec, d_c, nbRows*nbCols*sizeof(int), cudaMemcpyDeviceToHost);
int **c = new int*[nbRows];
for (unsigned int i = 0; i < nbRows; i++)
{
c[i] = new int[nbCols];
}
for (int i = 0; i < nbRows; i++)
{
for (int j = 0; j < nbCols; j++)
{
c[i][j] = c_vec[i * nbCols + j];
}
}
return(c);
}
int main(){
const int rows = 5;
const int cols = 5;
int **a = new int*[rows];
int **b = new int*[rows];
int **c;
for (int i = 0; i < rows; i++)
{
a[i] = new int[cols];
}
a[0][0] = 1;
a[0][1] = 2;
a[0][2] = 3;
a[0][3] = 4;
a[1][0] = 5;
a[1][1] = 6;
a[1][2] = 7;
a[1][3] = 8;
a[2][0] = 9;
a[2][1] = 10;
a[2][2] = 11;
a[2][3] = 12;
a[3][0] = 13;
a[3][1] = 14;
a[3][2] = 15;
a[3][3] = 16;
for (int i = 0; i < rows; i++)
{
b[i] = new int[cols];
}
b[0][0] = 1;
b[0][1] = 2;
b[0][2] = 3;
b[0][3] = 4;
b[1][0] = 5;
b[1][1] = 6;
b[1][2] = 7;
b[1][3] = 8;
b[2][0] = 9;
b[2][1] = 10;
b[2][2] = 11;
b[2][3] = 12;
b[3][0] = 13;
b[3][1] = 14;
b[3][2] = 15;
b[3][3] = 16;
c = add_matrixes(a, b, rows, cols);
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
std::cout << "a[" << i << "][" << j << "] = " << a[i][j] << std::endl;
std::cout << "b[" << i << "][" << j << "] = " << b[i][j] << std::endl;
std::cout << "c[" << i << "][" << j << "] = " << c[i][j] << std::endl;
}
}
return 0;
}