2d matrix passing values help with this code

Whitchurch · September 18, 2009, 1:11am

// example1.cpp : Defines the entry point for the console application.
//

#include “stdafx.h”

#include<stdio.h>

void matmul(float **a,float **b,float **c,int N);
void matmul(float **a,float **b,float **c,int N)
{
int i,j,k;

 for(i=0;i<N;i++)

{
for(j=0;j<N;j++)
{
for(k=0;k<N;k++)
{
c[i][j] += a[i][k]*b[k][j];

}

 for(i=0;i<N;i++)
       {
           for(j=0;j<N;j++)
           {
               printf("%f\t",a[i][j]);      
                                      
           }
        printf("\n");
                                                      
       }
                                                      
                                                      
 for(i=0;i<N;i++)

{
for(j=0;j<N;j++)
{
printf(“%f\t”,b[i][j]);

     }
   printf("\n");
                                                      
 }
                                                      
  for(i=0;i<N;i++)
   {
    for(j=0;j<N;j++)
      {
        printf("%f\t",c[i][j]);      
                                      
     }
    printf("\n");
                                                      
}

}

global void matmulondevice(float **a,float **b,float **c)
{
int i = threadIdx.x;
int j = threadIdx.y;
c[i][j]= a[i][j]+b[i][j];

}
int main(void)
{
float **a,**b,**c;
float **a_d,**b_d,**c_d;
int N = 2;

a = (float**)malloc(sizeof(float *)*N);
b = (float**)malloc(sizeof(float *)*N);
c = (float**)malloc(sizeof(float *)*N);

cudaMalloc((void **) &a_d, sizeof(float*)*N);
cudaMalloc((void **) &b_d, sizeof(float*)*N);
cudaMalloc((void **) &c_d, sizeof(float*)*N);

int i;
int j;
for(i=0;i<N;i++)
{
                a[i] = (float*)malloc(sizeof(float)*N);
                b[i] = (float*)malloc(sizeof(float)*N);
                c[i] = (float*)malloc(sizeof(float)*N);

}

for(i=0;i<N;i++)
{
cudaMalloc((void **) &a_d[i], sizeof(float)*N);
cudaMalloc((void **) &b_d[i], sizeof(float)*N);
cudaMalloc((void **) &c_d[i], sizeof(float)*N);
}

for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf(“Enter the %d,%d element”,i,j);
scanf(“%f”,&a[i][j]);
scanf(“%f”,&b[i][j]);
c[i][j]=0;
}
}

matmul(a,b,c,N);

for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf(“Enter the %d,%d element”,i,j);
scanf(“%f”,&a[i][j]);
scanf(“%f”,&b[i][j]);
c[i][j]=0;
}
}

dim3 dimBlock(2,2);

matmulondevice<<<1,dimBlock>>>(a_d,b_d,c_d);

for(i=0;i<N;i++)
{

cudaMemcpy(c[i], c_d[i], sizeof(float)*N, cudaMemcpyDeviceToHost);

}

for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf(“%f\t”,c[i][j]);

     }
    printf("\n");
                                                      
}

free(a);free(B);free©;
cudaFree(a_d);cudaFree(b_d);cudaFree(c_d);

}

The C function works fine,
The kernel however is the problem, it gives 0 for all values.After the values are retrieved from the device and printed with the loop statement.
My guess is i am not accessing the thredid’s in the right way or i am not passing the values correctly to the kernel.
please help me

LSChien · September 18, 2009, 6:33am

your host code, matrix multiplication must be corrected as

[codebox]void matmul(float **a,float **b,float **c,int N)

{

int i,j,k;

for(i=0; i<N;i++){

	for(j=0;j<N;j++){

		c[i][j] = 0.0 ;  // you miss this 

		for(k=0; k<N;k++){

			c[i][j] += a[i][k]*b[k][j];

		}

	}

}

}[/codebox]

you original code to allocate pointer array in device is wrong

your original code

[codebox] cutilSafeCall( cudaMalloc((void *) &a_d, sizeof(float)*N) ) ;

cutilSafeCall( cudaMalloc((void **) &b_d, sizeof(float*)*N) ) ;

cutilSafeCall( cudaMalloc((void **) &c_d, sizeof(float*)*N) ) ;

for(i=0; i < N ; i++){

	cutilSafeCall( cudaMalloc((void **) &a_d[i], sizeof(float)*N) );

	cutilSafeCall( cudaMalloc((void **) &b_d[i], sizeof(float)*N) );

	cutilSafeCall( cudaMalloc((void **) &c_d[i], sizeof(float)*N) ); 

}[/codebox]

Question: why does your original code fail?

ans: when you do “cudaMalloc((void *) &a_d, sizeof(float)*N)”, then a_d is a pointer pointing

to memory in device, so float* a_d[i] is an address in device, you cannot access this address

in host code. However “cudaMalloc((void **) &a_d[i], sizeof(float)*N)” means that you want to

allocate a device memory with size “sizeof(float)*N” and put the address into

device variable a_d[i]

^^^^^^^^^^^^^^^^^^^^^^

this is the problem because a_d[i] cannot be access by host code.

you can use following code

[codebox] cutilSafeCall( cudaMalloc((void *) &a_d, sizeof(float)*N) ) ;

cutilSafeCall( cudaMalloc((void **) &b_d, sizeof(float*)*N) ) ;

cutilSafeCall( cudaMalloc((void **) &c_d, sizeof(float*)*N) ) ;

float **a_d_host,**b_d_host,**c_d_host;

a_d_host = (float**)malloc(sizeof(float *)*N);  

b_d_host = (float**)malloc(sizeof(float *)*N);

c_d_host = (float**)malloc(sizeof(float *)*N);

for(i=0; i < N ; i++){

	cutilSafeCall( cudaMalloc((void **) &a_d_host[i], sizeof(float)*N) );

	cutilSafeCall( cudaMalloc((void **) &b_d_host[i], sizeof(float)*N) );

	cutilSafeCall( cudaMalloc((void **) &c_d_host[i], sizeof(float)*N) ); 

}

for(i=0;i < N;i++){

	CUDA_SAFE_CALL(cudaMemcpy(a_d, a_d_host, sizeof(float*)*N , cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL(cudaMemcpy(b_d, b_d_host, sizeof(float*)*N , cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL(cudaMemcpy(c_d, c_d_host, sizeof(float*)*N , cudaMemcpyHostToDevice) );

} [/codebox]

now you put all address of device memory into host pointer array a_d_host, b_d_host and c_d_host

and then copy whole pointer array into device memory pointing by a_d, b_d, c_d

copy data from host to device

[codebox] for(i=0;i < N;i++){

	CUDA_SAFE_CALL(cudaMemcpy(a_d_host[i], a[i], sizeof(float)*N , cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL(cudaMemcpy(b_d_host[i], b[i], sizeof(float)*N , cudaMemcpyHostToDevice) );

} [/codebox]

copy data from device to host

[codebox] for(i=0;i<N;i++){

	cudaMemcpy(c[i], c_d_host[i], sizeof(float)*N, cudaMemcpyDeviceToHost);

}[/codebox]

your kernel function do “matrix addition”, not “matrix multiplication”
you can use 1-D array with 2-D logical index, this is more simple

Whitchurch · September 18, 2009, 2:10pm

Thank you Lung Sheng Chien. Using a 1D format is easier inface i implemented it after i made the post here . But i really wanted to use this 2D addressing format. I’ll implement your code and get it working soon.

syoon · November 10, 2010, 10:45pm

well, i just found this one and had to dig up this old threads …i have a very similar problem…

please take a look at my code…

i wonder whats wrong with my for loop in the middle of the code.

for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMemcpy(c_d[i], c_d_host[i], Ysizeof(int), cudaMemcpyHostToDevice) );

would you please help? Thanks in advance.

===================================================================================================

int main(int argc, char *argv)
{
int i, j;
int **crray2d;
int **c_d;
int **c_d_host;

crray2d = (int**)malloc(Xsizeof(int));
for(i=0; i<X; i++)
crray2d[i] = (int*)malloc(Y*sizeof(int));

CUDA_SAFE_CALL( cudaMalloc((void **) &c_d, Xsizeof(int)) );

c_d_host = (int**)malloc(Xsizeof(int));
for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMalloc((void **) &c_d_host[i], Y*sizeof(int)) );

for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMemcpy(c_d[i], c_d_host[i], Ysizeof(int), cudaMemcpyHostToDevice) );

/* at this point i can not pass this point so the followings are not important now */

//dim3 dimBlock(X,Y);
//MatAdd<<<1,dimBlock>>>(c_d);

//for(i=0;i < X;i++){
// cudaMemcpy(crray2d, c_d, Ysizeof(int), cudaMemcpyDeviceToHost) ;
//}

}

syoon · November 10, 2010, 10:45pm

well, i just found this one and had to dig up this old threads …i have a very similar problem…

please take a look at my code…

i wonder whats wrong with my for loop in the middle of the code.

for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMemcpy(c_d[i], c_d_host[i], Ysizeof(int), cudaMemcpyHostToDevice) );

would you please help? Thanks in advance.

===================================================================================================

int main(int argc, char *argv)
{
int i, j;
int **crray2d;
int **c_d;
int **c_d_host;

crray2d = (int**)malloc(Xsizeof(int));
for(i=0; i<X; i++)
crray2d[i] = (int*)malloc(Y*sizeof(int));

CUDA_SAFE_CALL( cudaMalloc((void **) &c_d, Xsizeof(int)) );

c_d_host = (int**)malloc(Xsizeof(int));
for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMalloc((void **) &c_d_host[i], Y*sizeof(int)) );

for(i=0; i<X; i++)
CUDA_SAFE_CALL( cudaMemcpy(c_d[i], c_d_host[i], Ysizeof(int), cudaMemcpyHostToDevice) );

/* at this point i can not pass this point so the followings are not important now */

//dim3 dimBlock(X,Y);
//MatAdd<<<1,dimBlock>>>(c_d);

//for(i=0;i < X;i++){
// cudaMemcpy(crray2d, c_d, Ysizeof(int), cudaMemcpyDeviceToHost) ;
//}

}

Topic		Replies	Views
Help with cuda 2d array CUDA Programming and Performance	6	7452	September 29, 2014
2d array testing in very simple code using CUDA CUDA Programming and Performance	29	30404	November 15, 2010
seems that cuda doesn't support pointer to pointer problem report CUDA Programming and Performance	11	11706	March 29, 2012
cudaMalloc error in big loop CUDA Programming and Performance	12	15608	May 21, 2008
CudaMallocPitch and CudaMemcpy2D CUDA Programming and Performance	7	5558	August 3, 2015
multi dimension array CUDA Programming and Performance	26	32774	February 12, 2010
Copying 2D array from host to device CUDA Programming and Performance	7	7244	July 27, 2010
How to copy Device Struct with pointers to Host? CUDA Programming and Performance	10	8234	July 8, 2014
CudaFree 2D-Array CUDA Programming and Performance	10	9358	August 3, 2009
2D matrix transfer and handling problem Help required CUDA Programming and Performance	7	1464	July 13, 2010

2d matrix passing values help with this code

Related topics