Segmentation fault (core dumped)

I am new to the CUDA environment. When I try to run a matrix multiplication I get a "Segmentation fault (core dumped)" error. I am running on CUDA release 11.0, V11.0.221. Here is the code:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define BLOCK_SIZE 16

// Naive matrix multiply: C (m x k) = A (m x n) * B (n x k)
__global__ void gpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int sum = 0;
    if (col < k && row < m)
    {
        for (int i = 0; i < n; i++)
        {
            sum += a[row * n + i] * b[i * k + col];
        }
        c[row * k + col] = sum;
    }
}
// Tiled (shared-memory) multiply for square n x n matrices
__global__ void gpu_square_matrix_mult(int *d_a, int *d_b, int *d_result, int n)
{
    __shared__ int tile_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int tile_b[BLOCK_SIZE][BLOCK_SIZE];

    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int tmp = 0;
    int idx;

    for (int sub = 0; sub < gridDim.x; ++sub)
    {
        idx = row * n + sub * BLOCK_SIZE + threadIdx.x;
        if (idx >= n * n)
        {
            tile_a[threadIdx.y][threadIdx.x] = 0;
        }
        else
        {
            tile_a[threadIdx.y][threadIdx.x] = d_a[idx];
        }

        idx = (sub * BLOCK_SIZE + threadIdx.y) * n + col;
        if (idx >= n * n)
        {
            tile_b[threadIdx.y][threadIdx.x] = 0;
        }
        else
        {
            tile_b[threadIdx.y][threadIdx.x] = d_b[idx];
        }

        __syncthreads();
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            tmp += tile_a[threadIdx.y][k] * tile_b[k][threadIdx.x];
        }
        __syncthreads();
    }

    if (row < n && col < n)
    {
        d_result[row * n + col] = tmp;
    }
}
__global__ void gpu_matrix_transpose(int *mat_in, int *mat_out, unsigned int rows, unsigned int cols)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < cols && idy < rows)
    {
        unsigned int pos = idy * cols + idx;
        unsigned int trans_pos = idx * rows + idy;
        mat_out[trans_pos] = mat_in[pos];
    }
}
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k)
{
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            int tmp = 0;
            for (int h = 0; h < n; ++h)
            {
                tmp += h_a[i * n + h] * h_b[h * k + j];
            }
            h_result[i * k + j] = tmp;
        }
    }
}
int main(int argc, char const *argv[])
{
    int m, n, k;
    srand(3333);
    printf("please type in m n and k\n");
    scanf("%d %d %d", &m, &n, &k);

    // allocate pinned host memory
    int *h_a, *h_b, *h_c, *h_cc;
    cudaMallocHost((void **) &h_a, sizeof(int) * m * n);
    cudaMallocHost((void **) &h_b, sizeof(int) * n * k);
    cudaMallocHost((void **) &h_c, sizeof(int) * m * k);
    cudaMallocHost((void **) &h_cc, sizeof(int) * m * k);

    // fill the input matrices with random values
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            h_a[i * n + j] = rand() % 1024;
        }
    }
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < k; ++j) {
            h_b[i * k + j] = rand() % 1024;
        }
    }

    float gpu_elapsed_time_ms, cpu_elapsed_time_ms;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // allocate device memory and copy the inputs over
    int *d_a, *d_b, *d_c;
    cudaMalloc((void **) &d_a, sizeof(int) * m * n);
    cudaMalloc((void **) &d_b, sizeof(int) * n * k);
    cudaMalloc((void **) &d_c, sizeof(int) * m * k);
    cudaMemcpy(d_a, h_a, sizeof(int) * m * n, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(int) * n * k, cudaMemcpyHostToDevice);

    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    // launch the tiled kernel for square matrices, the naive kernel otherwise
    if (m == n && n == k)
    {
        gpu_square_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n);
    }
    else
    {
        gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    }

    cudaMemcpy(h_c, d_c, sizeof(int) * m * k, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop);
    printf("Time elapsed on matrix multiplication of %dx%d . %dx%d on GPU: %f ms.\n\n", m, n, n, k, gpu_elapsed_time_ms);

    // repeat the multiplication on the CPU for reference
    cudaEventRecord(start, 0);
    cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop);
    printf("Time elapsed on matrix multiplication of %dx%d . %dx%d on CPU: %f ms.\n\n", m, n, n, k, cpu_elapsed_time_ms);

    // compare the GPU result against the CPU result
    int all_ok = 1;
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            printf("[%d][%d]:%d == [%d][%d]:%d, ", i, j, h_cc[i * k + j], i, j, h_c[i * k + j]);
            if (h_cc[i * k + j] != h_c[i * k + j])
            {
                all_ok = 0;
            }
        }
        printf("\n");
    }

    if (all_ok)
    {
        printf("all results are correct!!!, speedup = %f\n", cpu_elapsed_time_ms / gpu_elapsed_time_ms);
    }
    else
    {
        printf("incorrect results\n");
    }

    // free device and pinned host memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
    cudaFreeHost(h_cc);

    return 0;
}

“Segmentation fault” indicates the problem is in host code. It is typically caused by an out-of-bounds access, an uninitialized pointer, or something similar. Use the debugger of your choice (or code instrumentation) to identify where in the code the bad access occurs, then work backwards from there.
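
For example, one simple form of instrumentation (just a sketch, not the only way to do it) is to check the status returned by each cudaMallocHost call before the pointer is used; if an allocation fails and the error is ignored, the initialization loops that follow write through an invalid pointer, and that is a host-side segfault:

    cudaError_t err = cudaMallocHost((void **) &h_a, sizeof(int) * m * n);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocHost for h_a failed: %s\n", cudaGetErrorString(err));
        return 1;   // stop before the loops below dereference h_a
    }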

Please explain this to me clearly, as I am very new to this environment.

When I run your code with m,n,k = 3,4,5 I have no trouble:

$ cuda-memcheck ./t1892
========= CUDA-MEMCHECK
please type in m n and k
3 4 5
Time elapsed on matrix multiplication of 3x4 . 4x5 on GPU: 6.787392 ms.

Time elapsed on matrix multiplication of 3x4 . 4x5 on CPU: 0.002976 ms.

[0][0]:1742781 == [0][0]:1742781, [0][1]:1824605 == [0][1]:1824605, [0][2]:1909241 == [0][2]:1909241, [0][3]:2540980 == [0][3]:2540980, [0][4]:1756988 == [0][4]:1756988,
[1][0]:1033315 == [1][0]:1033315, [1][1]:1116463 == [1][1]:1116463, [1][2]:1229773 == [1][2]:1229773, [1][3]:1401812 == [1][3]:1401812, [1][4]:1112162 == [1][4]:1112162,
[2][0]:928447 == [2][0]:928447, [2][1]:917466 == [2][1]:917466, [2][2]:1169810 == [2][2]:1169810, [2][3]:1317321 == [2][3]:1317321, [2][4]:919387 == [2][4]:919387,
all results are correct!!!, speedup = 0.000438
========= ERROR SUMMARY: 0 errors
$

$ nvcc martixmul.cu
$ ./a.out
please type in m n and k
3 4 5
Segmentation fault (core dumped)
$ cuda-memcheck./t1892
-bash: cuda-memcheck./t1892: No such file or directory
$

I tried giving these values but the error persists.

try running

cuda-memcheck ./a.out

My guess would be your calls to cudaMallocHost are returning an error, meaning the machine you are on is not properly set up.
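
A quick way to test that theory (again just a sketch) is to ask the runtime, right at the top of main, whether it can see a usable GPU at all:

    int ndev = 0;
    cudaError_t err = cudaGetDeviceCount(&ndev);
    printf("cudaGetDeviceCount: %s, devices found: %d\n", cudaGetErrorString(err), ndev);
    // anything other than "no error" with at least 1 device suggests a broken CUDA install or driver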

Any time you are having trouble with CUDA code, it's a good idea to do proper CUDA error checking. Google “proper CUDA error checking”, take the first hit, and apply it to your code.
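
The usual pattern looks roughly like the following (a sketch along the lines of that widely cited answer; the gpuErrchk name is just illustrative). Wrap every runtime API call, and check the launch itself with cudaGetLastError:

    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
    {
        if (code != cudaSuccess)
        {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }

    // usage:
    gpuErrchk(cudaMallocHost((void **) &h_a, sizeof(int) * m * n));
    gpuErrchk(cudaMalloc((void **) &d_a, sizeof(int) * m * n));
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    gpuErrchk(cudaGetLastError());        // catches launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());   // catches errors that occur during kernel execution

With that in place, a failing cudaMallocHost (or anything else) reports itself immediately instead of showing up later as a segfault.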