2d matrix passing values help with this code

// example1.cpp : Defines the entry point for the console application.
//

#include “stdafx.h”

#include <assert.h>
#include <cuda.h>

#include<stdio.h>

void matmul(float **a,float **b,float **c,int N);
void matmul(float **a,float **b,float **c,int N)
{
int i,j,k;

 for(i=0;i<N;i++)

{
for(j=0;j<N;j++)
{
for(k=0;k<N;k++)
{
c[i][j] += a[i][k]*b[k][j];

             }
             
             
       
       }

}

 for(i=0;i<N;i++)
       {
           for(j=0;j<N;j++)
           {
               printf("%f\t",a[i][j]);      
                                      
           }
        printf("\n");
                                                      
       }
                                                      
                                                      
 for(i=0;i<N;i++)

{
for(j=0;j<N;j++)
{
printf("%f\t",b[i][j]);

     }
   printf("\n");
                                                      
 }
                                                      
  for(i=0;i<N;i++)
   {
    for(j=0;j<N;j++)
      {
        printf("%f\t",c[i][j]);      
                                      
     }
    printf("\n");
                                                      
}

}

global void matmulondevice(float **a,float **b,float **c)
{
int i = threadIdx.x;
int j = threadIdx.y;
c[i][j]= a[i][j]+b[i][j];

}
int main(void)
{
float **a,**b,**c;
float **a_d,**b_d,**c_d;
int N = 2;

a = (float**)malloc(sizeof(float *)*N);
b = (float**)malloc(sizeof(float *)*N);
c = (float**)malloc(sizeof(float *)*N);

cudaMalloc((void **) &a_d, sizeof(float*)*N);
cudaMalloc((void **) &b_d, sizeof(float*)*N);
cudaMalloc((void **) &c_d, sizeof(float*)*N);

int i;
int j;
for(i=0;i<N;i++)
{
                a[i] = (float*)malloc(sizeof(float)*N);
                b[i] = (float*)malloc(sizeof(float)*N);
                c[i] = (float*)malloc(sizeof(float)*N);

}

for(i=0;i<N;i++)
{
cudaMalloc((void **) &a_d[i], sizeof(float)*N);
cudaMalloc((void **) &b_d[i], sizeof(float)*N);
cudaMalloc((void **) &c_d[i], sizeof(float)*N);
}

for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf(“Enter the %d,%d element”,i,j);
scanf("%f",&a[i][j]);
scanf("%f",&b[i][j]);
c[i][j]=0;
}
}

matmul(a,b,c,N);

for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf(“Enter the %d,%d element”,i,j);
scanf("%f",&a[i][j]);
scanf("%f",&b[i][j]);
c[i][j]=0;
}
}

dim3 dimBlock(2,2);

matmulondevice<<<1,dimBlock>>>(a_d,b_d,c_d);

for(i=0;i<N;i++)
{

cudaMemcpy(c[i], c_d[i], sizeof(float)*N, cudaMemcpyDeviceToHost);

}

for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf("%f\t",c[i][j]);

     }
    printf("\n");
                                                      
}

free(a);free(B);free©;
cudaFree(a_d);cudaFree(b_d);cudaFree(c_d);

}

The C function works fine,
The kernel however is the problem, it gives 0 for all values.After the values are retrieved from the device and printed with the loop statement.
My guess is i am not accessing the thredid’s in the right way or i am not passing the values correctly to the kernel.
please help me

  1. your host code, matrix multiplication must be corrected as

[codebox]void matmul(float **a,float **b,float **c,int N)

{

int i,j,k;

for(i=0; i<N;i++){

	for(j=0;j<N;j++){

		c[i][j] = 0.0 ;  // you miss this 

		for(k=0; k<N;k++){

			c[i][j] += a[i][k]*b[k][j];

		}

	}

} 

}[/codebox]

  1. you original code to allocate pointer array in device is wrong

your original code

[codebox] cutilSafeCall( cudaMalloc((void *) &a_d, sizeof(float)*N) ) ;

cutilSafeCall( cudaMalloc((void **) &b_d, sizeof(float*)*N) ) ;

cutilSafeCall( cudaMalloc((void **) &c_d, sizeof(float*)*N) ) ;

for(i=0; i < N ; i++){

	cutilSafeCall( cudaMalloc((void **) &a_d[i], sizeof(float)*N) );

	cutilSafeCall( cudaMalloc((void **) &b_d[i], sizeof(float)*N) );

	cutilSafeCall( cudaMalloc((void **) &c_d[i], sizeof(float)*N) ); 

}[/codebox]

Question: why does your original code fail?

ans: when you do “cudaMalloc((void *) &a_d, sizeof(float)*N)”, then a_d is a pointer pointing

to memory in device, so float* a_d[i] is an address in device, you cannot access this address

in host code. However “cudaMalloc((void **) &a_d[i], sizeof(float)*N)” means that you want to

allocate a device memory with size “sizeof(float)*N” and put the address into

device variable a_d[i]

^^^^^^^^^^^^^^^^^^^^^^

this is the problem because a_d[i] cannot be access by host code.

you can use following code

[codebox] cutilSafeCall( cudaMalloc((void *) &a_d, sizeof(float)*N) ) ;

cutilSafeCall( cudaMalloc((void **) &b_d, sizeof(float*)*N) ) ;

cutilSafeCall( cudaMalloc((void **) &c_d, sizeof(float*)*N) ) ;

float **a_d_host,**b_d_host,**c_d_host;

a_d_host = (float**)malloc(sizeof(float *)*N);  

b_d_host = (float**)malloc(sizeof(float *)*N);

c_d_host = (float**)malloc(sizeof(float *)*N);

for(i=0; i < N ; i++){

	cutilSafeCall( cudaMalloc((void **) &a_d_host[i], sizeof(float)*N) );

	cutilSafeCall( cudaMalloc((void **) &b_d_host[i], sizeof(float)*N) );

	cutilSafeCall( cudaMalloc((void **) &c_d_host[i], sizeof(float)*N) ); 

}

for(i=0;i < N;i++){

	CUDA_SAFE_CALL(cudaMemcpy(a_d, a_d_host, sizeof(float*)*N , cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL(cudaMemcpy(b_d, b_d_host, sizeof(float*)*N , cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL(cudaMemcpy(c_d, c_d_host, sizeof(float*)*N , cudaMemcpyHostToDevice) );

} [/codebox]

now you put all address of device memory into host pointer array a_d_host, b_d_host and c_d_host

and then copy whole pointer array into device memory pointing by a_d, b_d, c_d

  1. copy data from host to device

[codebox] for(i=0;i < N;i++){

	CUDA_SAFE_CALL(cudaMemcpy(a_d_host[i], a[i], sizeof(float)*N , cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL(cudaMemcpy(b_d_host[i], b[i], sizeof(float)*N , cudaMemcpyHostToDevice) );

} [/codebox]
  1. copy data from device to host

[codebox] for(i=0;i<N;i++){

	cudaMemcpy(c[i], c_d_host[i], sizeof(float)*N, cudaMemcpyDeviceToHost);

}[/codebox]
  1. your kernel function do “matrix addition”, not “matrix multiplication”

  2. you can use 1-D array with 2-D logical index, this is more simple

Thank you Lung Sheng Chien. Using a 1D format is easier inface i implemented it after i made the post here . But i really wanted to use this 2D addressing format. I’ll implement your code and get it working soon.

well, i just found this one and had to dig up this old threads …i have a very similar problem…

please take a look at my code…

i wonder whats wrong with my for loop in the middle of the code.

for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMemcpy(c_d[i], c_d_host[i], Ysizeof(int), cudaMemcpyHostToDevice) );

would you please help? Thanks in advance.

===================================================================================================

int main(int argc, char *argv)
{
int i, j;
int **crray2d;
int **c_d;
int **c_d_host;

crray2d = (int**)malloc(Xsizeof(int));
for(i=0; i<X; i++)
crray2d[i] = (int*)malloc(Y*sizeof(int));

CUDA_SAFE_CALL( cudaMalloc((void **) &c_d, Xsizeof(int)) );

c_d_host = (int**)malloc(Xsizeof(int));
for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMalloc((void **) &c_d_host[i], Y*sizeof(int)) );

for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMemcpy(c_d[i], c_d_host[i], Ysizeof(int), cudaMemcpyHostToDevice) );

/* at this point i can not pass this point so the followings are not important now */

//dim3 dimBlock(X,Y);
//MatAdd<<<1,dimBlock>>>(c_d);

//for(i=0;i < X;i++){
// cudaMemcpy(crray2d, c_d, Ysizeof(int), cudaMemcpyDeviceToHost) ;
//}

}

well, i just found this one and had to dig up this old threads …i have a very similar problem…

please take a look at my code…

i wonder whats wrong with my for loop in the middle of the code.

for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMemcpy(c_d[i], c_d_host[i], Ysizeof(int), cudaMemcpyHostToDevice) );

would you please help? Thanks in advance.

===================================================================================================

int main(int argc, char *argv)
{
int i, j;
int **crray2d;
int **c_d;
int **c_d_host;

crray2d = (int**)malloc(Xsizeof(int));
for(i=0; i<X; i++)
crray2d[i] = (int*)malloc(Y*sizeof(int));

CUDA_SAFE_CALL( cudaMalloc((void **) &c_d, Xsizeof(int)) );

c_d_host = (int**)malloc(Xsizeof(int));
for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMalloc((void **) &c_d_host[i], Y*sizeof(int)) );

for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMemcpy(c_d[i], c_d_host[i], Ysizeof(int), cudaMemcpyHostToDevice) );

/* at this point i can not pass this point so the followings are not important now */

//dim3 dimBlock(X,Y);
//MatAdd<<<1,dimBlock>>>(c_d);

//for(i=0;i < X;i++){
// cudaMemcpy(crray2d, c_d, Ysizeof(int), cudaMemcpyDeviceToHost) ;
//}

}