Hi All,
I am new to CUDA programming, I was trying to modify existing matrix multiplication program to multiply MM matrix with M1 vector.
But I always get SEGMENTATION FAULT, Can any body tell me why this error occurred and what should be the approach I must use.
I am pasting my code below:
Main program: mmul.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include “kernel.cu”
#define CHECK_STATUS(x) if ((x) != 0) fprintf(stderr, “Error execution %s on line %s\n”, #x, LINE);
void init_host(float A, int m, int n)
{
int i, j;
for(i = 0; i < m; i++) {
for(j = 0; j < n; j++) {
int pos = im + j;
A[pos] = 1.0;
}
}
}
int test_host_C(float* A, int m,int n)
{
int i, j;
for(i = 0; i < m; i++) {
for(j = 0; j < n; j++) {
float sum = n;
int pos = i*m + j;
if(A[pos] != sum) return 0;
}
}
return 1;
}
#define NUMBER_OF_AVERAGES 2
// Thread block size
#define BLOCK_SIZE 16
// Forward declaration of the device multiplication function
// Host multiplication function
// Compute C = A * B
// hA is the height of A
// wA is the width of A
// wB is the width of B
void Mul(float* A, float* B, int hA, int wA, int wB,float* C)
{
int size;
// Load A and B to the device
float* Ad;
size = hA * wA * sizeof(float);
cudaMalloc((void**)&Ad, size);
cudaMemcpy(Ad, A, size, cudaMemcpyHostToDevice);
float* Bd;
size = wA * wB * sizeof(float);
cudaMalloc((void**)&Bd, size);
cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);
// Allocate C on the device
float* Cd;
size = hA * wB * sizeof(float);
cudaMalloc((void**)&Cd, size);
// Compute the execution configuration assuming
// the matrix dimensions are multiples of BLOCK_SIZE
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);
// Launch the device computation
Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);
// Read C from the device
CHECK_STATUS(cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost));
if(test_host_C(C,hA,wB) == 0) {
printf("Test failed for size = %d %d %d\n",hA,wA,wB);
exit(1);
}
// Free device memory
cudaFree(Ad);
cudaFree(Bd);
cudaFree(Cd);
}
int main()
{
float t1, t2, t3, t4;
int i,j;
int m=8192,n=1;
float *A = (float *) malloc(m*m*sizeof(float));
init_host(A,m,m);
printf("Matrix A:\n");
for(i=0;i<m;i++)
{
for(j=0;j<m;j++)
{
int pos=i*m+j;
//printf("%d%d",i,j);
printf("%f ",A[pos]);
}
printf("\n");
}
float *B = (float *) malloc(m*n*sizeof(float));
init_host(B,m,n);
printf("Matrix B:\n");
for(i=0;i<m;i++)
{
for(j=0;j<n;j++)
{
int pos=i*m+j;
//printf("%d%d",i,j);
printf("%f ",B[pos]);
}
printf("\n");
}
float *C = (float *) malloc(m*n*sizeof(float));
Mul(A,B ,m,m,n,C);
printf("Matrix C:\n");
for(i=0;i<m;i++)
{
for(j=0;j<n;j++)
{
int pos=i*m+j;
//printf("%d%d",i,j);
printf("%f ",C[pos]);
}
printf("\n");
}
return 0;
}
Kernel Program: kernel.cu
/* Contains only the kernel definitions.
*/
#include <cuda.h>
#define BLOCK_SIZE 16
global void
Muld(float* A, float* B, int wA, int wB, float* C)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// The element of the block sub-matrix that is computed
// by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B required to
// compute the block sub-matrix
for (int a = aBegin, b = bBegin;a <= aEnd;a += aStep, b += bStep) {
// Shared memory for the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Shared memory for the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from global memory to shared memory;
// each thread loads one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to global memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}
Thanks in Advance
Tilak