Problem with printing results: my first matrix multiplication program in CUDA

I have implemented a program that multiplies two matrices.
It works fine, but it gives a segmentation fault when I print the results.

Here is the code, please help me.


#include <stdio.h>

__global__ void multi( float *M1, float *M2, float *M3, size_t p_M1, size_t p_M2, size_t p_M3, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if ( idx < N ){

int k = 0;
float* row_M2 = (float*)((char*)M2 + idx * p_M2);
float* row_M3 = (float*)((char*)M3 + idx * p_M3);

for(k=0;k<N;k++){

	float* row_M1 = (float*)((char*)M1 + k * p_M1);
	row_M3[idx] += row_M1[k] * row_M2[idx];

	}

}
	__syncthreads();

}

int const N = 1000;

int main(){
/* pointers to host memory */
float **Host_M1, **Host_M2, **Host_M3;
/* pointers to device memory */
float *GPU_M1, *GPU_M2, *GPU_M3;
size_t pitch_M1,pitch_M2,pitch_M3;

//int N=1000;
int i,j;



/* Allocate 2darrays  on host*/
Host_M1 = (float**) malloc(N*sizeof(float*));
Host_M2 = (float**) malloc(N*sizeof(float*));
Host_M3 = (float**) malloc(N*sizeof(float*));

printf("OK \n ");


for(int i = 0 ; i<N ; i++){

	Host_M1[i] =  (float*) malloc(N*sizeof(float));
	Host_M2[i] =  (float*) malloc(N*sizeof(float));
	Host_M3[i] =  (float*) malloc(N*sizeof(float));
}

printf("OK mem 2d host\n ");	
/* Allocate 2darrays  on device*/

size_t width = N* sizeof(float);
size_t height = N;

cudaMallocPitch((void**)&GPU_M1, &pitch_M1,width,height);
cudaMallocPitch((void**)&GPU_M2, &pitch_M2,width,height);
cudaMallocPitch((void**)&GPU_M3, &pitch_M3,width,height);

printf("OK mem2d cuda\n ");

/*
cudaMalloc ((void **) &a_d, sizeof(float)*N);
cudaMalloc ((void **) &b_d, sizeof(float)*N);
cudaMalloc ((void **) &c_d, sizeof(float)*N);
*/



/* Initialize arrays a and b */
for (i=0; i<N; i++)
{
	for(int j = 0 ; j<N ; j++){

	Host_M1[i][j] = (float) 1;
	Host_M2[i][j] = (float) 1;

	}
}
	
printf("OK initialize\n ");

/* Copy data from host memory to device memory */
cudaMemcpy2D(GPU_M1, pitch_M1,Host_M1,width, width,height, cudaMemcpyHostToDevice);
cudaMemcpy2D(GPU_M2, pitch_M2,Host_M2,width, width,height, cudaMemcpyHostToDevice);

printf("OK  memcpy H to D\n ");

//cudaMemcpy(b_d, b, sizeof(float)*N, cudaMemcpyHostToDevice);

// Invoke kernel
// here the threads and blocks are structured in a linear way
int threadsPerBlock = 256;
//int blocksPerGrid = (N + threadsPerBlock - 1)/threadsPerBlock;


multi<<<4,threadsPerBlock>>>(GPU_M1,GPU_M2,GPU_M3,pitch_M1,pitch_M2,pitch_M3,N);


printf("OK Kernel\n ");

cudaMemcpy2D(Host_M3,width,GPU_M3,pitch_M3,width,height ,cudaMemcpyDeviceToHost);


printf("OK memcp D to H\n ");

printf("OK done\n");



for(i = 0 ; i < N ; i++){

		for(j = 0 ; j< N ; j++){
		
		printf("%f   ",Host_M3[i][j]);

		}
		printf("\n");

	}



// Time to free the memories 

/*
for(i = 0 ; i< N ; i++){

	free(Host_M1[i]);
	free(Host_M2[i]);
	free(Host_M3[i]);

}
free(Host_M1);
free(Host_M2);
free(Host_M3);


	printf("OK freeHost\n ");

*/
cudaFree(GPU_M1);
cudaFree(GPU_M2);
cudaFree(GPU_M3);

}

You will notice that I have commented out the code that frees host memory, because it gave errors.

Hope to hear from you guys soon.
Thanks

This is the output


bibrak@biebo-laptop:/media/Academics/Academic/Research/HPC/CUDA/Iam new to CUDA/Matrix$ nvcc -o matMulti matMulti.cu
bibrak@biebo-laptop:/media/Academics/Academic/Research/HPC/CUDA/Iam new to CUDA/Matrix$ ./matMulti
OK
OK mem 2d host
OK mem2d cuda
OK initialize
OK memcpy H to D
OK Kernel
OK memcp D to H
OK done
Segmentation fault

Implementing your matrices as a set of separate rows is a very bad idea. Right now, it’s not guaranteed that neighbouring rows of your matrix are stored sequentially, meaning that row 1 can be located in an entirely different part of memory than rows 0 and 2… while your device to host copy assumes that your data is stored in a contiguous block.
You should allocate contiguous storage for N*N elements, and then create N pointers for i = 0…N-1 which point to offset i*N.
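For example, something along these lines (just a rough sketch of the idea, not tested; the name Host_M1_data is only for illustration):

/* one contiguous block of N*N floats */
float *Host_M1_data = (float*) malloc(N * N * sizeof(float));

/* optional row pointers into that block, so Host_M1[i][j] still works on the host */
float **Host_M1 = (float**) malloc(N * sizeof(float*));
for (int i = 0; i < N; i++)
	Host_M1[i] = Host_M1_data + i * N;

That way the whole matrix really is one width*height byte block, which is what the 2D copy expects.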

PS. I believe you’re also overwriting your pointers instead of your data when issuing the memcpy.
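In other words, the copies should read from and write to the contiguous data block rather than the array of row pointers, e.g. (using the hypothetical Host_M1_data / Host_M3_data blocks sketched above):

cudaMemcpy2D(GPU_M1, pitch_M1, Host_M1_data, width, width, height, cudaMemcpyHostToDevice);
cudaMemcpy2D(Host_M3_data, width, GPU_M3, pitch_M3, width, height, cudaMemcpyDeviceToHost);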

N.

Yes, this may be why my GNOME session was crashing when I copied from device to host. Thanks for that.

I have made some changes to the code.
Now it works, but it gives garbage values in the result.


#include <stdio.h>

__global__ void multi( float *M1, float *M2, float *M3, size_t p_M1, size_t p_M2, size_t p_M3, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if ( idx < N ){

int k = 0;
float* row_M2 = (float*)((char*)M2 + idx * p_M2);
float* row_M3 = (float*)((char*)M3 + idx * p_M3);

for(k=0;k<N;k++){

	float* row_M1 = (float*)((char*)M1 + k * p_M1);
	row_M3[idx] += row_M1[k] * row_M2[idx];

	}

}
	__syncthreads();

}

int const N = 1000;

/*
void print_result(float M3[N], int N1 ){

	int i,j;
	for(i = 0 ; i < N1 ; i++){

		for(j = 0 ; j< N1 ; j++){
		
		printf("%fd   ",M3[i][j]);

		}
		printf("\n");

	}


}*/

int main(){
/* pointers to host memory */
float *Host_M1, *Host_M2, *Host_M3;
/* pointers to device memory */
float *GPU_M1, *GPU_M2, *GPU_M3;
size_t pitch_M1,pitch_M2,pitch_M3;

//int N=1000;
int i,j;
/*
int count = 0;
cudaGetDeviceCount (&count);
printf("The count is %d \n",count);

*/

/* Allocate 2darrays  on host*/
Host_M1 = (float*) malloc(N*N*sizeof(float));
Host_M2 = (float*) malloc(N*N*sizeof(float));
Host_M3 =  (float*) malloc(N*N*sizeof(float));


printf("OK mem 2d host\n ");	
/* Allocate 2darrays  on device*/

size_t width = N* sizeof(float);
size_t height = N;

cudaMallocPitch((void**)&GPU_M1, &pitch_M1,width,height);
cudaMallocPitch((void**)&GPU_M2, &pitch_M2,width,height);
cudaMallocPitch((void**)&GPU_M3, &pitch_M3,width,height);

printf("OK mem2d cuda\n ");

/*
cudaMalloc ((void **) &a_d, sizeof(float)*N);
cudaMalloc ((void **) &b_d, sizeof(float)*N);
cudaMalloc ((void **) &c_d, sizeof(float)*N);
*/



/* Initialize arrays a and b */
for (i=0; i<N*N; i++)
{
	

	Host_M1[i] = (float) 1;
	Host_M2[i] = (float) 1;

	
}

/*
for(i = 0 ; i < N*N ; i++){

		printf("%f   ",Host_M1[i]);

		if(i%N == N-1)
		printf("\n");

	}

*/

printf("OK initialize\n\n\n\n\n ");

/* Copy data from host memory to device memory */
cudaMemcpy2D(GPU_M1, pitch_M1,Host_M1,width, width,height, cudaMemcpyHostToDevice);
cudaMemcpy2D(GPU_M2, pitch_M2,Host_M2,width, width,height, cudaMemcpyHostToDevice);

printf("OK  memcpy H to D\n ");

//cudaMemcpy(b_d, b, sizeof(float)*N, cudaMemcpyHostToDevice);

// Invoke kernel
// here the threads and blocks are structured in a linear way
int threadsPerBlock = 256;
//int blocksPerGrid = (N + threadsPerBlock - 1)/threadsPerBlock;


multi<<<4,threadsPerBlock>>>(GPU_M1,GPU_M2,GPU_M3,pitch_M1,pitch_M2,pitch_M3,N);


printf("OK Kernel\n ");

cudaMemcpy2D(Host_M3,width,GPU_M3,pitch_M3,width,height ,cudaMemcpyDeviceToHost);


printf("OK memcp D to H\n ");

printf("OK done\n");

//print_result(Host_M3,N);

//int out = (int)Host_M3[0][0];

//printf("%d   ",out);

for(i = 0 ; i < N*N ; i++){

		
		
		printf("%f   ",Host_M3[i]);

		if(i%N == N-1)
		printf("\n");

	}



// Time to free the memories 

free(Host_M1);
free(Host_M2);
free(Host_M3);


	printf("OK freeHost\n ");

cudaFree(GPU_M1);
cudaFree(GPU_M2);
cudaFree(GPU_M3);

}

Another modification in the kernel:

int idx = blockIdx.x* blockDim.x + threadIdx.x;

int myrow = idx / N;

int point = idx % N;

float* row_M3 = (float*)((char*)M3 + myrow * p_M3);
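Following this through, the whole kernel body could end up roughly like the sketch below, with one thread computing one element M3[myrow][point]. This is only my untested sketch, and it assumes the kernel is launched with at least N*N threads in total:

__global__ void multi( float *M1, float *M2, float *M3, size_t p_M1, size_t p_M2, size_t p_M3, int N)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	if ( idx < N*N ){
		int myrow = idx / N;   /* output row    */
		int point = idx % N;   /* output column */

		float* row_M1 = (float*)((char*)M1 + myrow * p_M1);
		float* row_M3 = (float*)((char*)M3 + myrow * p_M3);

		float sum = 0.0f;
		for(int k = 0; k < N; k++){
			/* walk along row myrow of M1 and down column point of M2 */
			float* row_M2 = (float*)((char*)M2 + k * p_M2);
			sum += row_M1[k] * row_M2[point];
		}
		row_M3[point] = sum;
	}
}

With this indexing the launch configuration would also need about (N*N + threadsPerBlock - 1)/threadsPerBlock blocks instead of the fixed 4 used above.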