I have implemented a program, I multiplies two Matrices.
It works fine.
But gives Segmentation fault if I print the results.
Here is the code , please help me .
#include <stdio.h>
global void multi( float *M1, float *M2, float M3, size_t p_M1,size_t p_M2, size_t p_M3, int N)
{
int idx = blockIdx.x blockDim.x + threadIdx.x;
if ( idx < N ){
int k = 0;
float* row_M2 = (float*)((char*)M2 + idx * p_M2);
float* row_M3 = (float*)((char*)M3 + idx * p_M3);
for(k=0;k<N;k++){
float* row_M1 = (float*)((char*)M1 + k * p_M1);
row_M3[idx] += row_M1[k] * row_M2[idx];
}
}
__syncthreads();
}
int const N = 1000;
int main(){
/* pointers to host memory */
float **Host_M1, **Host_M2, *Host_M3;
/ pointers to device memory */
float *GPU_M1, *GPU_M2, *GPU_M3;
size_t pitch_M1,pitch_M2,pitch_M3;
//int N=1000;
int i,j;
/* Allocate 2darrays on host*/
Host_M1 = (float**) malloc(N*sizeof(float*));
Host_M2 = (float**) malloc(N*sizeof(float*));
Host_M3 = (float**) malloc(N*sizeof(float*));
printf("OK \n ");
for(int i = 0 ; i<N ; i++){
Host_M1[i] = (float*) malloc(N*sizeof(float));
Host_M2[i] = (float*) malloc(N*sizeof(float));
Host_M3[i] = (float*) malloc(N*sizeof(float));
}
printf("OK mem 2d host\n ");
/* Allocate 2darrays on device*/
size_t width = N* sizeof(float);
size_t height = N;
cudaMallocPitch((void**)&GPU_M1, &pitch_M1,width,height);
cudaMallocPitch((void**)&GPU_M2, &pitch_M2,width,height);
cudaMallocPitch((void**)&GPU_M3, &pitch_M3,width,height);
printf("OK mem2d cuda\n ");
/*
cudaMalloc ((void **) &a_d, sizeof(float)*N);
cudaMalloc ((void **) &b_d, sizeof(float)*N);
cudaMalloc ((void **) &c_d, sizeof(float)*N);
*/
/* Initialize arrays a and b */
for (i=0; i<N; i++)
{
for(int j = 0 ; j<N ; j++){
Host_M1[i][j] = (float) 1;
Host_M2[i][j] = (float) 1;
}
}
printf("OK initialize\n ");
/* Copy data from host memory to device memory */
cudaMemcpy2D(GPU_M1, pitch_M1,Host_M1,width, width,height, cudaMemcpyHostToDevice);
cudaMemcpy2D(GPU_M2, pitch_M2,Host_M2,width, width,height, cudaMemcpyHostToDevice);
printf("OK memcpy H to D\n ");
//cudaMemcpy(b_d, b, sizeof(float)*N, cudaMemcpyHostToDevice);
// Invoke kernel
// here the threads and blocks are stuctured in linear way
int threadsPerBlock = 256;
//int blocksPerGrid = (N + threadsPerBlock - 1)/threadsPerBlock;
multi<<<4,threadsPerBlock>>>(GPU_M1,GPU_M2,GPU_M3,pitch_M1,pitch_M2,pitch_M3,N);
printf("OK Kernel\n ");
cudaMemcpy2D(Host_M3,width,GPU_M3,pitch_M3,width,height ,cudaMemcpyDeviceToHost);
printf("OK memcp D to H\n ");
printf("OK done\n");
for(i = 0 ; i < N ; i++){
for(j = 0 ; j< N ; j++){
printf("%f ",Host_M3[i][j]);
}
printf("\n");
}
// Time to free the memories
/*
for(i = 0 ; i< N ; i++){
free(Host_M1[i]);
free(Host_M2[i]);
free(Host_M3[i]);
}
free(Host_M1);
free(Host_M2);
free(Host_M3);
printf("OK freeHost\n ");
*/
cudaFree(GPU_M1);
cudaFree(GPU_M1);
cudaFree(GPU_M1);
}
You will notice that I have commented the code which frees Host memory, becuase it gave errors.
Hope to hear from you guys soon.
Thanks