Yes I restarted my computer and now it works Thanks.
But a code similar to previous causes unpredicted results , please help
#include <stdio.h>
__global__ void multi( double *M1, double *M2, double *M3, size_t p_M1,size_t p_M2, size_t p_M3, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int myrow = idx;
int j= 0,i=0;
//int point = idx % N;
if (idx < N ){
double* row_M3 = (double*)((char*)M3 + myrow * p_M3);
for(i = 0; i< N; i++)
row_M3[i] = (double) 8;
}
/*if ( idx < N ){
int k = 0;
double* row_M3 = (double*)((char*)M3 + myrow * p_M3);
double* row_M1 = (double*)((char*)M1 + myrow * p_M1);
for(j = 0; j <N; j++){
row_M3[j] = (double) 0;
for(k=0;k<N;k++){
double* row_M2 = (double*)((char*)M2 + k * p_M2);
row_M3[j] += row_M1[k] * row_M2[j];
}
}
}*/
__syncthreads();
}
int const N = 8;
int main(){
/* pointers to host memory */
double *Host_M1, *Host_M2, *Host_M3;
/* pointers to device memory */
double *GPU_M1, *GPU_M2, *GPU_M3;
size_t pitch_M1,pitch_M2,pitch_M3;
int i;
/* Allocate 2darrays on host*/
Host_M1 = (double*) malloc(N*N*sizeof(double));
Host_M2 = (double*) malloc(N*N*sizeof(double));
printf("OK mem 2d host\n ");
/* Allocate 2darrays on device*/
size_t width = N* sizeof(double);
size_t height = N;
cudaMallocPitch((void**)&GPU_M1, &pitch_M1,width,height);
cudaMallocPitch((void**)&GPU_M2, &pitch_M2,width,height);
cudaMallocPitch((void**)&GPU_M3, &pitch_M3,width,height);
printf("OK mem2d cuda\n ");
/* Initialize arrays a and b */
for (i=0; i<N*N; i++)
{
Host_M1[i] = (double) 4;
Host_M2[i] = (double) 2;
}
printf("OK initialize\n\n\n\n\n ");
/* Copy data from host memory to device memory */
cudaMemcpy2D(GPU_M1, pitch_M1,Host_M1,width, width,height, cudaMemcpyHostToDevice);
cudaMemcpy2D(GPU_M2, pitch_M2,Host_M2,width, width,height, cudaMemcpyHostToDevice);
printf("OK memcpy H to D\n ");
//cudaMemcpy(b_d, b, sizeof(double)*N, cudaMemcpyHostToDevice);
// Invoke kernel
// here the threads and blocks are stuctured in linear way
int threadsPerBlock = 4;
//int blocksPerGrid = (N + threadsPerBlock - 1)/threadsPerBlock;
multi<<<2,threadsPerBlock>>>(GPU_M1,GPU_M2,GPU_M3,pitch_M1,pitch_M2,pitch_M3,N);
printf("OK Kernel\n ");
Host_M3 = (double*) malloc(N*N*sizeof(double));
cudaMemcpy2D(Host_M3,width,GPU_M3,pitch_M3,width,height ,cudaMemcpyDeviceToHost);
printf("OK memcp D to H\n ");
printf("OK done\n");
for(i = 0; i < N*N; i++){
printf("%lf ",Host_M3[i]);
if(i%N == N-1)
printf("\n");
}
/* printf("%lf \n",Host_M3[0]);
printf("%lf \n",Host_M3[100]);
printf("%lf \n",Host_M3[5001]);
printf("%lf \n",Host_M3[50001]);
printf("%lf \n",Host_M3[N*N - 150000]);
printf("%lf \n",Host_M3[N*N-100000]);
printf("%lf \n",Host_M3[N*N-50000]);
printf("%lf \n",Host_M3[N*N-5000]);
printf("%lf \n",Host_M3[N*N-1001]);
printf("%lf \n",Host_M3[N*N-1]);
printf("%lf M1 \n",Host_M1[N*N - 150000]);
printf("%lf M2 \n",Host_M2[N*N - 150000]);
printf("%lf M1 \n",Host_M1[500]);
printf("%lf M2 \n",Host_M2[N*N-1]);
*/
// Time to free the memories
free(Host_M1);
free(Host_M2);
free(Host_M3);
printf("OK freeHost\n ");
cudaFree(GPU_M1);
cudaFree(GPU_M1);
cudaFree(GPU_M1);
printf("OK freeDevice\n ");
}
the output is :
bibrak@biebo-laptop:/media/Academics/Academic/Research/HPC/CUDA/Iam new to CUDA/Matrix$ ./matMulti
OK mem 2d host
OK mem2d cuda
OK initialize
OK memcpy H to D
OK Kernel
OK memcp D to H
OK done
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
OK freeHost
OK freeDevice
It must be all 8s