Segmentation fault (core dumped)

21phcsf001 · September 13, 2021, 4:22am

i am new to cuda environment .as i try to run matrix multiplication i get Segmentation fault (core dumped) error.when to try to run on this version release 11.0, V11.0.221. here is the coding attached .

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define BLOCK_SIZE 16
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int m, int n, int k)
{ 
int row = blockIdx.y * blockDim.y + threadIdx.y; 
int col = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0;
if( col < k && row < m) 
{
for(int i = 0; i < n; i++) 
{
sum += a[row * n + i] * b[i * k + col];
}
c[row * k + col] = sum;
}
} 
__global__ void gpu_square_matrix_mult(int *d_a, int *d_b, int *d_result, int n) 
{
__shared__ int tile_a[BLOCK_SIZE][BLOCK_SIZE];
__shared__ int tile_b[BLOCK_SIZE][BLOCK_SIZE];
int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int tmp = 0;
int idx;
for (int sub = 0; sub < gridDim.x; ++sub) 
{
idx = row * n + sub * BLOCK_SIZE + threadIdx.x;
if(idx >= n*n)
{
tile_a[threadIdx.y][threadIdx.x] = 0;
}
else
{
tile_a[threadIdx.y][threadIdx.x] = d_a[idx];
}
idx = (sub * BLOCK_SIZE + threadIdx.y) * n + col;
if(idx >= n*n)
{
tile_b[threadIdx.y][threadIdx.x] = 0;
}  
else
{
tile_b[threadIdx.y][threadIdx.x] = d_b[idx];
}
__syncthreads();
for (int k = 0; k < BLOCK_SIZE; ++k) 
{
tmp += tile_a[threadIdx.y][k] * tile_b[k][threadIdx.x];
}
__syncthreads();
}
if(row < n && col < n)
{
d_result[row * n + col] = tmp;
}
}
__global__ void gpu_matrix_transpose(int* mat_in, int* mat_out, unsigned int rows, unsigned int cols) 
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < cols && idy < rows) 
{
unsigned int pos = idy * cols + idx;
unsigned int trans_pos = idx * rows + idy;
mat_out[trans_pos] = mat_in[pos];
}
}
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
for (int i = 0; i < m; ++i) 
{
for (int j = 0; j < k; ++j) 
{
int tmp = 0.0;
for (int h = 0; h < n; ++h) 
{
tmp += h_a[i * n + h] * h_b[h * k + j];
}
h_result[i * k + j] = tmp;
}
}
}
int main(int argc, char const *argv[])
{
int m, n, k;
srand(3333);
printf("please type in m n and k\n");
scanf("%d %d %d", &m, &n, &k);
int *h_a, *h_b, *h_c, *h_cc;
cudaMallocHost( (void **) &h_a, sizeof(int)*m*n );
cudaMallocHost( (void **) &h_b, sizeof(int)*n*k );
cudaMallocHost( (void **) &h_c, sizeof(int)*m*k );
cudaMallocHost( (void **) &h_cc, sizeof(int)*m*k );
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
h_a[i * n + j] = rand() % 1024;
}
}
for (int i = 0; i < n; ++i) {
for (int j = 0; j < k; ++j) {
h_b[i * k + j] = rand() % 1024;
}
}
float gpu_elapsed_time_ms, cpu_elapsed_time_ms;
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start, 0 );
int *d_a, *d_b, *d_c;
cudaMalloc( (void **) &d_a, sizeof(int)*m*n );
cudaMalloc( (void **) &d_b, sizeof(int)*n*k );
cudaMalloc( (void **) &d_c, sizeof(int)*m*k );
cudaMemcpy( d_a, h_a, sizeof(int)*m*n, cudaMemcpyHostToDevice );
cudaMemcpy( d_b, h_b, sizeof(int)*n*k, cudaMemcpyHostToDevice );
unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 dimGrid(grid_cols, grid_rows);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
if(m == n && n == k)
{
gpu_square_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n);    
}
else
{
gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);    
}
cudaMemcpy(h_c, d_c, sizeof(int)*m*k, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop);
printf("Time elapsed on matrix multiplication of %dx%d . %dx%d on GPU: %f ms.\n\n", m, n, n, k, gpu_elapsed_time_ms);
cudaEventRecord(start, 0);
cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop);
printf("Time elapsed on matrix multiplication of %dx%d . %dx%d on CPU: %f ms.\n\n", m, n, n, k, cpu_elapsed_time_ms);
int all_ok = 1;
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < k; ++j)
{
printf("[%d][%d]:%d == [%d][%d]:%d, ", i, j, h_cc[i*k + j], i, j, h_c[i*k + j]);
if(h_cc[i*k + j] != h_c[i*k + j])
{
all_ok = 0;
}
}
printf("\n");
}
if(all_ok)
{
printf("all results are correct!!!, speedup = %f\n", cpu_elapsed_time_ms / gpu_elapsed_time_ms);
}
else
{
printf("incorrect results\n");
}
cudaFree( d_a );
cudaFree( d_b );
cudaFree( d_c );
cudaFreeHost( h_a );
cudaFreeHost( h_b );
cudaFreeHost( h_c );
cudaFreeHost( h_cc );
return 0;
}

njuffa · September 13, 2021, 5:23am

“Segmentation fault” indicates the problem is in host code. It is due to access out-of-bounds, or an uninitialized pointer, etc. Use the debugger of your choice (or code instrumentation) to identify where in the code the bad access occurs, then work backwards from there.

21phcsf001 · September 13, 2021, 6:08am

please explain me clearly as i am very new to this environment .

Robert_Crovella · September 13, 2021, 3:46pm

When I run your code with m,n,k = 3,4,5 I have no trouble:

$ cuda-memcheck ./t1892
========= CUDA-MEMCHECK
please type in m n and k
3 4 5
Time elapsed on matrix multiplication of 3x4 . 4x5 on GPU: 6.787392 ms.

Time elapsed on matrix multiplication of 3x4 . 4x5 on CPU: 0.002976 ms.

[0][0]:1742781 == [0][0]:1742781, [0][1]:1824605 == [0][1]:1824605, [0][2]:1909241 == [0][2]:1909241, [0][3]:2540980 == [0][3]:2540980, [0][4]:1756988 == [0][4]:1756988,
[1][0]:1033315 == [1][0]:1033315, [1][1]:1116463 == [1][1]:1116463, [1][2]:1229773 == [1][2]:1229773, [1][3]:1401812 == [1][3]:1401812, [1][4]:1112162 == [1][4]:1112162,
[2][0]:928447 == [2][0]:928447, [2][1]:917466 == [2][1]:917466, [2][2]:1169810 == [2][2]:1169810, [2][3]:1317321 == [2][3]:1317321, [2][4]:919387 == [2][4]:919387,
all results are correct!!!, speedup = 0.000438
========= ERROR SUMMARY: 0 errors
$

21phcsf001 · September 14, 2021, 3:57am

$ nvcc martixmul.cu
$ ./a.out
please type in m n and k
3 4 5
Segmentation fault (core dumped)
$ cuda-memcheck./t1892
-bash: cuda-memcheck./t1892: No such file or directory
$

I tried giving this values but the error presist.

Robert_Crovella · September 14, 2021, 2:30pm

try running

cuda-memcheck ./a.out

My guess would be your calls to cudaMallocHost are returning an error, meaning the machine you are on is not properly set up.

Any time you are having trouble with a CUDA code, its a good idea to do proper CUDA error checking. Google “proper CUDA error checking”, take the first hit, and apply it to your code.

Topic		Replies	Views
Segmentation fault (core dumped) CUDA Programming and Performance	4	13077	May 13, 2017
Segmentation fault (core dumped) after running mex cuda code CUDA Programming and Performance	3	2360	February 25, 2014
CUDA C++ Segmentation Fault CUDA Programming and Performance	7	14703	October 1, 2017
Newbie:Trying Matrix Vector Multiplication CUDA Programming and Performance	3	4214	November 10, 2008
matrix multiplication with its transpose in cuda(cudamemcpy from device to host not working) . CUDA Programming and Performance	6	1756	October 5, 2018
an illegal memory access was encountered CUDA Programming and Performance	7	64125	November 10, 2017
Problem with Printing results My first Matrix Multiplication program in CUDA CUDA Programming and Performance	4	2147	July 6, 2009
Segmentation fault when calling virtual function on host CUDA Programming and Performance	9	2464	September 10, 2019
Matrix Multiplication In CUDA CUDA Programming and Performance	6	2539	May 11, 2015
Segmentation Fault when using UMA and pthreads CUDA Programming and Performance cuda , ubuntu	10	1150	March 29, 2023

Segmentation fault (core dumped)

Related topics