cudaMemcpy2D: copying a 2D array from device to host gives garbage - help

This is the code I am running. I have used cudaMemcpy2D to copy a 2D array from device to host, and when I print it, it shows garbage.
Can anybody guide me?
Thanks


#include <stdio.h>

/*
 * multi: writes 0 into every element of an N x N matrix of doubles that is
 * stored row by row with a byte pitch between rows.
 *
 * Launch layout: 1D grid of 1D blocks. The global thread index selects the
 * row; a thread whose index is >= N does nothing, so any launch with
 * gridDim.x * blockDim.x >= N covers the whole matrix. No shared memory.
 *
 * M1   - base pointer of the matrix
 * p_M1 - distance between the starts of two consecutive rows, in BYTES
 * N    - number of rows and number of columns
 */
__global__ void multi( double *M1, size_t p_M1, int N)
{
    int myrow = blockIdx.x * blockDim.x + threadIdx.x;

    if (myrow < N) {
        /* The pitch is a byte count, so the row offset must be applied to a
           char* before casting back to double*. */
        double *row_M1 = (double*)((char*)M1 + myrow * p_M1);

        for (int j = 0; j < N; j++) {
            row_M1[j] = 0;
        }
    }
    /* No barrier is needed: threads never read each other's data. */
}

/* Number of rows and columns of the square matrix used by main. */
const int N = 16;

/*
 * Prints a diagnostic on stderr when a CUDA runtime call failed.
 * Returns 1 if err is not cudaSuccess, 0 otherwise.
 */
static int cudaFailed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return 0;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return 1;
}

/*
 * Allocates an N x N matrix of doubles on the host and a pitched copy on the
 * device, runs the multi kernel on the device matrix, copies the result back
 * with cudaMemcpy2D and prints it one row per line.
 *
 * Every CUDA call and the kernel launch are checked, so a failing launch is
 * reported instead of silently leaving stale data in the host buffer.
 * Returns 0 on success, 1 if any allocation or CUDA call failed.
 */
int main(){
    /* pointer to host memory */
    double *Host_M1 = NULL;
    /* pointer to device memory */
    double *GPU_M1 = NULL;
    size_t pitch_M1 = 0;

    /* Row width in bytes and number of rows, as passed to cudaMallocPitch
       and cudaMemcpy2D below. */
    const size_t width = N * sizeof(double);
    const size_t height = N;

    /* Threads and blocks are laid out linearly, one thread per row; the
       block count is rounded up so that all N rows are covered. */
    const int threadsPerBlock = 8;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    int failed = 0;
    int i;

    /* Allocate 2d array on host */
    Host_M1 = (double*) malloc(N*N*sizeof(double));
    if (Host_M1 == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return 1;
    }
    printf("OK mem 2d host\n ");

    /* Allocate 2d array on device */
    failed = cudaFailed(cudaMallocPitch((void**)&GPU_M1, &pitch_M1, width, height),
                        "cudaMallocPitch");

    /* Initialize the host array */
    for (i = 0; i < N*N; i++) {
        Host_M1[i] = (double) 0;
    }

    /* Invoke kernel */
    if (!failed) {
        multi<<<blocksPerGrid, threadsPerBlock>>>(GPU_M1, pitch_M1, N);
        /* A launch does not return a status; fetch it explicitly. */
        failed = cudaFailed(cudaGetLastError(), "multi launch");
    }
    if (!failed) {
        /* Wait for the kernel so execution errors are attributed to it. */
        failed = cudaFailed(cudaDeviceSynchronize(), "multi execution");
    }
    if (!failed) {
        printf("OK Kernel\n ");

        /* Host rows are tightly packed (pitch == width); device rows use
           the pitch chosen by cudaMallocPitch. */
        failed = cudaFailed(cudaMemcpy2D(Host_M1, width, GPU_M1, pitch_M1,
                                         width, height, cudaMemcpyDeviceToHost),
                            "cudaMemcpy2D device to host");
    }

    if (!failed) {
        /* Print the whole matrix, breaking the line after each row. */
        for (i = 0; i < N*N; i++) {
            printf("%lf   ", Host_M1[i]);
            if (i % N == N - 1) {
                printf("\n");
            }
        }
    }

    /* Time to free the memories */
    free(Host_M1);
    printf("OK freeHost\n ");

    /* cudaFree accepts a null pointer, so this is safe after a failed allocation. */
    failed |= cudaFailed(cudaFree(GPU_M1), "cudaFree");
    printf("OK freeDevice\n ");

    return failed ? 1 : 0;
}

Runs just fine over here; it returns all zeros.
Please use a code box when posting code on the forum :)

N.

Yes, I restarted my computer and now it works. Thanks.

But code similar to the previous program gives unexpected results. Please help.

#include <stdio.h>

/*
 * multi: currently a test kernel that writes 8 into every element of the
 * N x N matrix M3, which is stored row by row with a byte pitch between rows.
 *
 * Launch layout: 1D grid of 1D blocks. The global thread index selects the
 * row; a thread whose index is >= N does nothing, so any launch with
 * gridDim.x * blockDim.x >= N covers the whole matrix. No shared memory.
 *
 * M1, M2     - input matrices; only referenced by the disabled code below
 * M3         - output matrix
 * p_M1..p_M3 - row pitch of the corresponding matrix, in BYTES
 * N          - number of rows and number of columns
 */
__global__ void multi( double *M1, double *M2, double *M3, size_t p_M1,size_t p_M2, size_t p_M3, int N)
{
	int myrow = blockIdx.x * blockDim.x + threadIdx.x;

	if (myrow < N) {
		/* The pitch is a byte count, so the row offset must be applied to
		   a char* before casting back to double*. */
		double* row_M3 = (double*)((char*)M3 + myrow * p_M3);

		for (int i = 0; i < N; i++) {
			row_M3[i] = 8.0;
		}
	}

	/* Disabled matrix product M3 = M1 * M2, one output row per thread.
	   Kept for reference; enable it in place of the fill loop above.

	if (myrow < N) {
		double* row_M3 = (double*)((char*)M3 + myrow * p_M3);
		double* row_M1 = (double*)((char*)M1 + myrow * p_M1);

		for (int j = 0; j < N; j++) {
			row_M3[j] = 0.0;
			for (int k = 0; k < N; k++) {
				double* row_M2 = (double*)((char*)M2 + k * p_M2);
				row_M3[j] += row_M1[k] * row_M2[j];
			}
		}
	}
	*/

	/* No barrier is needed: threads never read each other's data. */
}

	/* Number of rows and columns of each square matrix used by main. */
	const int N = 8;

/*
 * Prints a diagnostic on stderr when a CUDA runtime call failed.
 * Returns 1 if err is not cudaSuccess, 0 otherwise.
 */
static int cudaFailed(cudaError_t err, const char *what)
{
	if (err == cudaSuccess) {
		return 0;
	}
	fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
	return 1;
}

/*
 * Allocates three N x N matrices of doubles on the host and as pitched
 * allocations on the device, uploads M1 and M2, runs the multi kernel,
 * copies M3 back with cudaMemcpy2D and prints it one row per line.
 *
 * Every CUDA call and the kernel launch are checked, so a failing launch is
 * reported instead of silently printing whatever the host buffer contained.
 * Returns 0 on success, 1 if any allocation or CUDA call failed.
 */
int main(){
	/* pointers to host memory */
	double *Host_M1 = NULL, *Host_M2 = NULL, *Host_M3 = NULL;
	/* pointers to device memory */
	double *GPU_M1 = NULL, *GPU_M2 = NULL, *GPU_M3 = NULL;
	size_t pitch_M1 = 0, pitch_M2 = 0, pitch_M3 = 0;

	/* Row width in bytes and number of rows, as passed to cudaMallocPitch
	   and cudaMemcpy2D below. */
	const size_t width = N * sizeof(double);
	const size_t height = N;

	/* Threads and blocks are laid out linearly, one thread per row; the
	   block count is rounded up so that all N rows are covered. */
	const int threadsPerBlock = 4;
	const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	int failed = 0;
	int i;

	/* Allocate 2d arrays on host */
	Host_M1 = (double*) malloc(N*N*sizeof(double));
	Host_M2 = (double*) malloc(N*N*sizeof(double));
	Host_M3 = (double*) malloc(N*N*sizeof(double));
	if (Host_M1 == NULL || Host_M2 == NULL || Host_M3 == NULL) {
		fprintf(stderr, "host malloc failed\n");
		free(Host_M1);
		free(Host_M2);
		free(Host_M3);
		return 1;
	}
	printf("OK mem 2d host\n ");

	/* Allocate 2d arrays on device */
	failed = cudaFailed(cudaMallocPitch((void**)&GPU_M1, &pitch_M1, width, height),
	                    "cudaMallocPitch(GPU_M1)");
	if (!failed) {
		failed = cudaFailed(cudaMallocPitch((void**)&GPU_M2, &pitch_M2, width, height),
		                    "cudaMallocPitch(GPU_M2)");
	}
	if (!failed) {
		failed = cudaFailed(cudaMallocPitch((void**)&GPU_M3, &pitch_M3, width, height),
		                    "cudaMallocPitch(GPU_M3)");
	}
	if (!failed) {
		printf("OK mem2d cuda\n ");
	}

	/* Initialize the input arrays */
	for (i = 0; i < N*N; i++) {
		Host_M1[i] = (double) 4;
		Host_M2[i] = (double) 2;
	}
	printf("OK initialize\n\n\n\n\n ");

	/* Copy data from host memory to device memory. Host rows are tightly
	   packed (pitch == width); device rows use the allocation's pitch. */
	if (!failed) {
		failed = cudaFailed(cudaMemcpy2D(GPU_M1, pitch_M1, Host_M1, width,
		                                 width, height, cudaMemcpyHostToDevice),
		                    "cudaMemcpy2D host to device (M1)");
	}
	if (!failed) {
		failed = cudaFailed(cudaMemcpy2D(GPU_M2, pitch_M2, Host_M2, width,
		                                 width, height, cudaMemcpyHostToDevice),
		                    "cudaMemcpy2D host to device (M2)");
	}
	if (!failed) {
		printf("OK  memcpy H to D\n ");
	}

	/* Invoke kernel */
	if (!failed) {
		multi<<<blocksPerGrid, threadsPerBlock>>>(GPU_M1, GPU_M2, GPU_M3,
		                                          pitch_M1, pitch_M2, pitch_M3, N);
		/* A launch does not return a status; fetch it explicitly. */
		failed = cudaFailed(cudaGetLastError(), "multi launch");
	}
	if (!failed) {
		/* Wait for the kernel so execution errors are attributed to it. */
		failed = cudaFailed(cudaDeviceSynchronize(), "multi execution");
	}
	if (!failed) {
		printf("OK Kernel\n ");

		failed = cudaFailed(cudaMemcpy2D(Host_M3, width, GPU_M3, pitch_M3,
		                                 width, height, cudaMemcpyDeviceToHost),
		                    "cudaMemcpy2D device to host (M3)");
	}

	if (!failed) {
		printf("OK memcp D to H\n ");
		printf("OK done\n");

		/* Print the whole result matrix, breaking the line after each row. */
		for (i = 0; i < N*N; i++) {
			printf("%lf   ", Host_M3[i]);
			if (i % N == N - 1) {
				printf("\n");
			}
		}
	}

	/* Time to free the memories */
	free(Host_M1);
	free(Host_M2);
	free(Host_M3);
	printf("OK freeHost\n ");

	/* Release each device allocation exactly once. cudaFree accepts a null
	   pointer, so this is safe after a failed allocation. */
	failed |= cudaFailed(cudaFree(GPU_M1), "cudaFree(GPU_M1)");
	failed |= cudaFailed(cudaFree(GPU_M2), "cudaFree(GPU_M2)");
	failed |= cudaFailed(cudaFree(GPU_M3), "cudaFree(GPU_M3)");
	printf("OK freeDevice\n ");

	return failed ? 1 : 0;
}

The output is:

bibrak@biebo-laptop:/media/Academics/Academic/Research/HPC/CUDA/Iam new to CUDA/Matrix$ ./matMulti

OK mem 2d host

OK mem2d cuda

OK initialize

OK memcpy H to D

OK Kernel

OK memcp D to H

OK done

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

OK freeHost

OK freeDevice


It should be all 8s.

Again, works just fine over here :)

OK mem 2d host
OK mem2d cuda
OK initialize

OK memcpy H to D
OK Kernel
OK memcp D to H
OK done
8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.0
00000
8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.0
00000
8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.0
00000
8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.0
00000
8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.0
00000
8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.0
00000
8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.0
00000
8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.0
00000
OK freeHost
OK freeDevice

Process returned 0 (0x0) execution time : 0.000 s
Press ENTER to continue.

N.