I can use “nvcc -deviceemu **.cu” to compile successfully in Ubuntu.
But when I ran *.out, the error occurred as following:
pitch = 64
Segmentation fault
I check the value of matrix a and b in the kernel function and it indicates clearly that the value isn’t transfered from host to device after calling function “cudaMemcpy2D”.
Then I make some change to the code in the main(). I change the declaration of a from “float *a[n]” to “float a[n][n]” as well as b and c. Compiling and running again, everything is OK.
I don’t understand why it is.
Hope you could help me to make it clear.
What’s wrong with my code?
The following is the source code.
#include “cuda.h”
#include “stdio.h”
#define n 5
global void MatAdd( float a, float b, float c, int w, int h, int pitch )
{
for ( int i=0;i<h;i++ )
{
float a_row = (float)((char)a+ipitch);
float b_row = (float)((char)b+ipitch);
float c_row = (float)((char)c+i*pitch);
for ( int j=0;j<w;j++ )
{
c_row[j] = a_row[j] + b_row[j];
printf ( "%f ",c_row[j] );
printf ( "%f ",a_row[j] );
printf ( "%f ",b_row[j] );
}
}
}
int main(int argc, char* argv)
{
float *a[n],*b[n],*c[n];
float *da,*db,*dc;
int i,j;
for ( i=0;i<n;i++ )
{
a[i] = (float*)malloc ( n*sizeof(float) );
b[i] = (float*)malloc ( n*sizeof(float) );
c[i] = (float*)malloc ( n*sizeof(float) );
for ( j=0;j<n;j++)
{
a[i][j] = 1;
b[i][j] = 1;
c[i][j] = 0;
}
size_t pitch = 0;
cudaMallocPitch ( (void**)&da, &pitch, n*sizeof(float), n );
cudaMallocPitch ( (void**)&db, &pitch, n*sizeof(float), n );
cudaMallocPitch ( (void**)&dc, &pitch, n*sizeof(float), n );
printf ( "pitch = %d\n", pitch );
// dim3 dimBlock(16,16);
//dim3 dimGrid ( (n+dimBlock.x-1)/dimBlock.x,(n+dimBlock.y-1)/dimBlock.y );
cudaMemcpy2D ( da, pitch, a, n*sizeof(float), n*sizeof(float), n, cudaMemcpyHostToDevice );
cudaMemcpy2D ( db, pitch, b, n*sizeof(float), n*sizeof(float), n, cudaMemcpyHostToDevice );
//MatAdd<<<dimGrid, dimBlock>>>( da,db,dc,n,n,pitch );
MatAdd<<<1,1>>>( da,db,dc,n,n,pitch );
cudaMemcpy2D ( c, n*sizeof(float), dc, pitch, n*sizeof(float), n, cudaMemcpyDeviceToHost );
for ( i=0;i<n;i++ )
{
printf ( "\n\n" );
for ( j=0;j<n;j++ )
printf ( "c[%d][%d] = %f ",i,j,c[i][j] );
}
cudaFree ( da );
cudaFree ( db );
cudaFree ( dc );
free ( a );
free (B);
free ©;
}