# Newbie:Trying Matrix Vector Multiplication

Hi All,

I am new to CUDA programming, I was trying to modify existing matrix multiplication program to multiply MM matrix with M1 vector.

But I always get SEGMENTATION FAULT, Can any body tell me why this error occurred and what should be the approach I must use.

I am pasting my code below:

Main program: mmul.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#include “kernel.cu”

#define CHECK_STATUS(x) if ((x) != 0) fprintf(stderr, “Error execution %s on line %s\n”, #x, LINE);

void init_host(float A, int m, int n)
{
int i, j;
for(i = 0; i < m; i++) {
for(j = 0; j < n; j++) {
int pos = i
m + j;
A[pos] = 1.0;

``````    }
}
``````

}

int test_host_C(float* A, int m,int n)
{
int i, j;
for(i = 0; i < m; i++) {
for(j = 0; j < n; j++) {
float sum = n;
int pos = i*m + j;
if(A[pos] != sum) return 0;
}
}
return 1;

``````    }
``````

#define NUMBER_OF_AVERAGES 2

#define BLOCK_SIZE 16
// Forward declaration of the device multiplication function
// Host multiplication function
// Compute C = A * B
// hA is the height of A
// wA is the width of A
// wB is the width of B
void Mul(float* A, float* B, int hA, int wA, int wB,float* C)
{
int size;

``````         // Load A and B to the device
size = hA * wA * sizeof(float);

float* Bd;
size = wA * wB * sizeof(float);
cudaMalloc((void**)&Bd, size);
cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);

// Allocate C on the device
float* Cd;
size = hA * wB * sizeof(float);
cudaMalloc((void**)&Cd, size);

// Compute the execution configuration assuming
// the matrix dimensions are multiples of BLOCK_SIZE
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);

// Launch the device computation
Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);

// Read C from the device
CHECK_STATUS(cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost));
if(test_host_C(C,hA,wB) == 0)  {
printf("Test failed for size = %d %d %d\n",hA,wA,wB);
exit(1);
}

// Free device memory
cudaFree(Bd);
cudaFree(Cd);
``````

}

int main()
{

``````    float t1, t2, t3, t4;
int i,j;
int m=8192,n=1;
float *A = (float *) malloc(m*m*sizeof(float));
init_host(A,m,m);
printf("Matrix A:\n");
for(i=0;i<m;i++)
{
for(j=0;j<m;j++)
{
int pos=i*m+j;
//printf("%d%d",i,j);
printf("%f ",A[pos]);
}
printf("\n");
}

float *B = (float *) malloc(m*n*sizeof(float));
init_host(B,m,n);
printf("Matrix B:\n");
for(i=0;i<m;i++)
{
for(j=0;j<n;j++)
{
int pos=i*m+j;
//printf("%d%d",i,j);
printf("%f ",B[pos]);
}
printf("\n");
}

float *C = (float *) malloc(m*n*sizeof(float));

Mul(A,B ,m,m,n,C);

printf("Matrix C:\n");
for(i=0;i<m;i++)
{
for(j=0;j<n;j++)
{
int pos=i*m+j;
//printf("%d%d",i,j);
printf("%f ",C[pos]);

}
printf("\n");
}

return 0;
``````

}

Kernel Program: kernel.cu

/* Contains only the kernel definitions.
*/
#include <cuda.h>

#define BLOCK_SIZE 16
global void
Muld(float* A, float* B, int wA, int wB, float* C)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;

``````// Thread index

// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;

// Index of the last sub-matrix of A processed by the block
int aEnd   = aBegin + wA - 1;

// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;

// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;

// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;

// The element of the block sub-matrix that is computed
float Csub = 0;

// Loop over all the sub-matrices of A and B required to
// compute the block sub-matrix
for (int a = aBegin, b = bBegin;a <= aEnd;a += aStep, b += bStep) {

// Shared memory for the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

// Shared memory for the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

// Load the matrices from global memory to shared memory;

As[ty][tx] = A[a + wA * ty + tx];

Bs[ty][tx] = B[b + wB * ty + tx];

// Synchronize to make sure the matrices are loaded

// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];

// Synchronize to make sure that the preceding

// sub-matrices of A and B in the next iteration
}

// Write the block sub-matrix to global memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;

C[c + wB * ty + tx] = Csub;
``````

}