Newbie: Trying Matrix-Vector Multiplication

Hi All,

I am new to CUDA programming. I was trying to modify an existing matrix multiplication program to multiply an M×M matrix with an M×1 vector.

But I always get a SEGMENTATION FAULT. Can anybody tell me why this error occurs and what approach I should use?

I am pasting my code below:

Main program: mmul.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#include "kernel.cu"

#define CHECK_STATUS(x) if ((x) != 0) fprintf(stderr, "Error executing %s on line %d\n", #x, __LINE__);

void init_host(float* A, int m, int n)
{
    int i, j;
    for(i = 0; i < m; i++) {
        for(j = 0; j < n; j++) {
            int pos = i*m + j;
            A[pos] = 1.0;
        }
    }
}

int test_host_C(float* A, int m, int n)
{
    int i, j;
    for(i = 0; i < m; i++) {
        for(j = 0; j < n; j++) {
            float sum = n;
            int pos = i*m + j;
            if(A[pos] != sum) return 0;
        }
    }
    return 1;
}

#define NUMBER_OF_AVERAGES 2

// Thread block size
#define BLOCK_SIZE 16
// Forward declaration of the device multiplication function
// Host multiplication function
// Compute C = A * B
// hA is the height of A
// wA is the width of A
// wB is the width of B
void Mul(float* A, float* B, int hA, int wA, int wB, float* C)
{
    int size;

    // Load A and B to the device
    float* Ad;
    size = hA * wA * sizeof(float);
    cudaMalloc((void**)&Ad, size);
    cudaMemcpy(Ad, A, size, cudaMemcpyHostToDevice);

    float* Bd;
    size = wA * wB * sizeof(float);
    cudaMalloc((void**)&Bd, size);
    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);

    // Allocate C on the device
    float* Cd;
    size = hA * wB * sizeof(float);
    cudaMalloc((void**)&Cd, size);

    // Compute the execution configuration assuming
    // the matrix dimensions are multiples of BLOCK_SIZE
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);

    // Launch the device computation
    Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);

    // Read C from the device
    CHECK_STATUS(cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost));
    if(test_host_C(C, hA, wB) == 0) {
        printf("Test failed for size = %d %d %d\n", hA, wA, wB);
        exit(1);
    }

    // Free device memory
    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Cd);
}

int main()
{
    float t1, t2, t3, t4;
    int i, j;
    int m = 8192, n = 1;

    float *A = (float *) malloc(m*m*sizeof(float));
    init_host(A, m, m);
    printf("Matrix A:\n");
    for(i = 0; i < m; i++)
    {
        for(j = 0; j < m; j++)
        {
            int pos = i*m + j;
            //printf("%d%d",i,j);
            printf("%f ", A[pos]);
        }
        printf("\n");
    }

    float *B = (float *) malloc(m*n*sizeof(float));
    init_host(B, m, n);
    printf("Matrix B:\n");
    for(i = 0; i < m; i++)
    {
        for(j = 0; j < n; j++)
        {
            int pos = i*m + j;
            //printf("%d%d",i,j);
            printf("%f ", B[pos]);
        }
        printf("\n");
    }

    float *C = (float *) malloc(m*n*sizeof(float));

    Mul(A, B, m, m, n, C);

    printf("Matrix C:\n");
    for(i = 0; i < m; i++)
    {
        for(j = 0; j < n; j++)
        {
            int pos = i*m + j;
            //printf("%d%d",i,j);
            printf("%f ", C[pos]);
        }
        printf("\n");
    }

    return 0;
}

Kernel Program: kernel.cu

/* Contains only the kernel definitions.
*/
#include <cuda.h>

#define BLOCK_SIZE 16
__global__ void
Muld(float* A, float* B, int wA, int wB, float* C)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;

    // The element of the block sub-matrix that is computed
    // by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B required to
    // compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {

        // Shared memory for the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Shared memory for the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from global memory to shared memory;
        // each thread loads one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[ty][k] * Bs[k][tx];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to global memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

Thanks in Advance
Tilak

Have you tried using cublasSgemv or cublasDgemv? They implement matrix-vector multiplication for you.
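
If you do go the library route, it looks roughly like this with the CUBLAS v2 API. This is just a rough sketch, not tested, with error checking omitted; note that CUBLAS assumes column-major storage (it doesn't matter here because A is square and all ones, but in general you'd pass CUBLAS_OP_T for a row-major A).

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
    const int m = 8192;                 // A is m x m, x is m x 1
    const float alpha = 1.0f, beta = 0.0f;

    // Host data: A and x filled with ones, so every y[i] should be m
    float *A = (float *) malloc(m * m * sizeof(float));
    float *x = (float *) malloc(m * sizeof(float));
    float *y = (float *) malloc(m * sizeof(float));
    for (int i = 0; i < m * m; i++) A[i] = 1.0f;
    for (int i = 0; i < m; i++)     x[i] = 1.0f;

    // Device copies
    float *Ad, *xd, *yd;
    cudaMalloc((void**)&Ad, m * m * sizeof(float));
    cudaMalloc((void**)&xd, m * sizeof(float));
    cudaMalloc((void**)&yd, m * sizeof(float));
    cudaMemcpy(Ad, A, m * m * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(xd, x, m * sizeof(float), cudaMemcpyHostToDevice);

    // y = alpha * A * x + beta * y
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSgemv(handle, CUBLAS_OP_N, m, m, &alpha, Ad, m, xd, 1, &beta, yd, 1);
    cublasDestroy(handle);

    cudaMemcpy(y, yd, m * sizeof(float), cudaMemcpyDeviceToHost);
    printf("y[0] = %f (expected %d)\n", y[0], m);

    cudaFree(Ad); cudaFree(xd); cudaFree(yd);
    free(A); free(x); free(y);
    return 0;
}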

I wanted to try doing it without using CUBLAS… am I wrong in some calculation?

I didn't look at the entire code, but from a quick look, it seems you have only changed the host-side code and not the device-side code.

The access of the B array, for instance, looks wrong for an n×1 matrix.
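
One concrete example: init_host(B, m, n) indexes with pos = i*m + j, which assumes an m×m matrix. For the m×1 vector B (and likewise C) that writes far past the end of the allocation, which by itself would explain the host-side segmentation fault. For an m×n array the index should be i*n + j, and the same fix applies to the print loops in main.

On the device side, with wB = 1 the launch configuration dimGrid(wB / dimBlock.x, ...) evaluates to a grid width of 0, and the kernel's B[b + wB * ty + tx] would read out of bounds anyway, so the tiled Muld kernel can't be reused as-is. A dedicated matrix-vector kernel is much simpler. Here is a minimal sketch of the idea (untested, and the names are just placeholders): one thread per row of A.

__global__ void MatVec(const float* A, const float* x, float* y, int m, int n)
{
    // One thread per row of the m x n row-major matrix A; computes y = A * x
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < m) {
        float sum = 0.0f;
        for (int k = 0; k < n; ++k)
            sum += A[row * n + k] * x[k];
        y[row] = sum;
    }
}

// Launched with a 1-D grid, e.g.:
// MatVec<<<(m + 255) / 256, 256>>>(Ad, Bd, Cd, m, m);

A tiled shared-memory version is possible too, but I'd get the straightforward one working first.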