i am new to cuda environment .as i try to run matrix multiplication i get Segmentation fault (core dumped) error.when to try to run on this version release 11.0, V11.0.221. here is the coding attached .
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define BLOCK_SIZE 16
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int m, int n, int k)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0;
if( col < k && row < m)
{
for(int i = 0; i < n; i++)
{
sum += a[row * n + i] * b[i * k + col];
}
c[row * k + col] = sum;
}
}
__global__ void gpu_square_matrix_mult(int *d_a, int *d_b, int *d_result, int n)
{
__shared__ int tile_a[BLOCK_SIZE][BLOCK_SIZE];
__shared__ int tile_b[BLOCK_SIZE][BLOCK_SIZE];
int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int tmp = 0;
int idx;
for (int sub = 0; sub < gridDim.x; ++sub)
{
idx = row * n + sub * BLOCK_SIZE + threadIdx.x;
if(idx >= n*n)
{
tile_a[threadIdx.y][threadIdx.x] = 0;
}
else
{
tile_a[threadIdx.y][threadIdx.x] = d_a[idx];
}
idx = (sub * BLOCK_SIZE + threadIdx.y) * n + col;
if(idx >= n*n)
{
tile_b[threadIdx.y][threadIdx.x] = 0;
}
else
{
tile_b[threadIdx.y][threadIdx.x] = d_b[idx];
}
__syncthreads();
for (int k = 0; k < BLOCK_SIZE; ++k)
{
tmp += tile_a[threadIdx.y][k] * tile_b[k][threadIdx.x];
}
__syncthreads();
}
if(row < n && col < n)
{
d_result[row * n + col] = tmp;
}
}
__global__ void gpu_matrix_transpose(int* mat_in, int* mat_out, unsigned int rows, unsigned int cols)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < cols && idy < rows)
{
unsigned int pos = idy * cols + idx;
unsigned int trans_pos = idx * rows + idy;
mat_out[trans_pos] = mat_in[pos];
}
}
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < k; ++j)
{
int tmp = 0.0;
for (int h = 0; h < n; ++h)
{
tmp += h_a[i * n + h] * h_b[h * k + j];
}
h_result[i * k + j] = tmp;
}
}
}
int main(int argc, char const *argv[])
{
int m, n, k;
srand(3333);
printf("please type in m n and k\n");
scanf("%d %d %d", &m, &n, &k);
int *h_a, *h_b, *h_c, *h_cc;
cudaMallocHost( (void **) &h_a, sizeof(int)*m*n );
cudaMallocHost( (void **) &h_b, sizeof(int)*n*k );
cudaMallocHost( (void **) &h_c, sizeof(int)*m*k );
cudaMallocHost( (void **) &h_cc, sizeof(int)*m*k );
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
h_a[i * n + j] = rand() % 1024;
}
}
for (int i = 0; i < n; ++i) {
for (int j = 0; j < k; ++j) {
h_b[i * k + j] = rand() % 1024;
}
}
float gpu_elapsed_time_ms, cpu_elapsed_time_ms;
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start, 0 );
int *d_a, *d_b, *d_c;
cudaMalloc( (void **) &d_a, sizeof(int)*m*n );
cudaMalloc( (void **) &d_b, sizeof(int)*n*k );
cudaMalloc( (void **) &d_c, sizeof(int)*m*k );
cudaMemcpy( d_a, h_a, sizeof(int)*m*n, cudaMemcpyHostToDevice );
cudaMemcpy( d_b, h_b, sizeof(int)*n*k, cudaMemcpyHostToDevice );
unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 dimGrid(grid_cols, grid_rows);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
if(m == n && n == k)
{
gpu_square_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n);
}
else
{
gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
}
cudaMemcpy(h_c, d_c, sizeof(int)*m*k, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop);
printf("Time elapsed on matrix multiplication of %dx%d . %dx%d on GPU: %f ms.\n\n", m, n, n, k, gpu_elapsed_time_ms);
cudaEventRecord(start, 0);
cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop);
printf("Time elapsed on matrix multiplication of %dx%d . %dx%d on CPU: %f ms.\n\n", m, n, n, k, cpu_elapsed_time_ms);
int all_ok = 1;
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < k; ++j)
{
printf("[%d][%d]:%d == [%d][%d]:%d, ", i, j, h_cc[i*k + j], i, j, h_c[i*k + j]);
if(h_cc[i*k + j] != h_c[i*k + j])
{
all_ok = 0;
}
}
printf("\n");
}
if(all_ok)
{
printf("all results are correct!!!, speedup = %f\n", cpu_elapsed_time_ms / gpu_elapsed_time_ms);
}
else
{
printf("incorrect results\n");
}
cudaFree( d_a );
cudaFree( d_b );
cudaFree( d_c );
cudaFreeHost( h_a );
cudaFreeHost( h_b );
cudaFreeHost( h_c );
cudaFreeHost( h_cc );
return 0;
}