Matrix Multiplication Help

Hi.

I’m trying to make a basic matrix multiplication program. The program compiles but currently is giving wrong/nonsense answers. I’m not sure whether the problem lies in the CUDA code or the CPU side. If anyone could help me spot this I’d be very thankful.

Here’s the code:

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <cutil.h>

#include “cublas.h”

#define BLOCK_SIZE 16

// Kernel

global void CudaMul(float *A, float *B, float *C, int noc0, int noc1, int nor1)

{

int k;

int i = blockDim.x*blockIdx.x + threadIdx.x;

int j = blockDim.y*blockIdx.y + threadIdx.y;

if(i < noc0 && j < nor1)

{

	C[j*noc1 + i] = 0;

	for(k = 0; k < noc0; k++)

	{

		C[j*noc1 + i] += A[j*noc0 + k] * B[k*noc1 + i];

	}

}

}

// Main

int main ()

{

float* matrixA;

float* matrixB;

float* matrixC;

int HA, HB, HC, WA, WB, WC, i;

// get matrix size from data file

int* size;

size = (int*)malloc(2 * sizeof(size[0]));

FILE *sizes;

if ( (sizes = fopen( "SZ.dat", "r" )) == NULL ) printf("Can't open file!\n");

for(i=0; i<2; i++)

{

	fscanf(sizes, "%d", &size[i]);

}

fclose(sizes);

// Matrix size

HA = size[0];

WA = size[1];

HB = size[1];

WB = size[0];

HC = HA;

WC = WB;

unsigned int mem_size_A = HA * WA * sizeof(matrixA[0]);

unsigned int mem_size_B = HB * WB * sizeof(matrixB[0]);

unsigned int mem_size_C = HC * WC * sizeof(matrixC[0]);

/* Allocate host memory for the matrices */

matrixA = (float*)malloc(mem_size_A);

if (matrixA == 0) {

    fprintf (stderr, "!!!! host memory allocation error (Matrix A)\n");

    return EXIT_FAILURE;

}

matrixB = (float*)malloc(mem_size_B);

if (matrixB == 0) {

    fprintf (stderr, "!!!! host memory allocation error (Matrix B)\n");

    return EXIT_FAILURE;

}

matrixC = (float*)malloc(mem_size_C);

if (matrixC == 0) {

    fprintf (stderr, "!!!! host memory allocation error (Matrix C)\n");

    return EXIT_FAILURE;

}

// Get data from fiiles

FILE *file;

if ( (file = fopen( "AW.dat", "r" )) == NULL ) printf("Can't open file!\n");

for (i=0;i<(HA*WA);i++)

{

	fscanf( file, "%f", &matrixA[i] );

}

fclose(file);



FILE *file2;

if ( (file2 = fopen( "RR.dat", "r" )) == NULL ) printf("Can't open file!\n");

for (i=0;i<(HB*WB);i++)

{

	fscanf( file2, "%f", &matrixB[i] );

}

fclose(file2);

/* Allocate device memory for the matrices */

float* d_A;

float* d_B;

float* d_C;

cublasAlloc(HA * WA, sizeof(d_A[0]), (void**)&d_A);

cublasAlloc(HB * WB, sizeof(d_B[0]), (void**)&d_B);

cublasAlloc(HC * WC, sizeof(d_C[0]), (void**)&d_C);

// Transfer matrices

cublasSetVector(HA * WA, sizeof(matrixA[0]), matrixB, 1, d_A, 1);

cublasSetVector(HB * WB, sizeof(matrixB[0]), matrixB, 1, d_B, 1);

cublasSetVector(HC * WC, sizeof(matrixC[0]), matrixC, 1, d_C, 1);

// set kernel config

dim3 threads( BLOCK_SIZE, BLOCK_SIZE );

dim3 grid( WC / threads.x,  HC / threads.y );

// invoke kernel

CudaMul<<< grid, threads >>>(d_A, d_B, d_C, WA, WB, HC);

// Copy result to device

cudaMemcpy(matrixC, d_C, mem_size_C, cudaMemcpyDeviceToHost);

// Print results to file

FILE *file3;

file3 = fopen ("DC.dat", "w");

for (i=0; i<HC*WC; i++)

{

	fprintf(file3, "%f ", matrixC[i]);

}

fclose(file3);

 // find C

float* CC;

float* CC_bp;

/* Allocate host memory for CC */

CC = (float*)malloc(HC * WC * sizeof(CC[0]));

if (CC == 0) {

	fprintf (stderr, "!!!! host memory allocation error (CC)\n");

	return EXIT_FAILURE;

}

/* Calculate CC*/

CC[0] = matrixC[0];

for (i=1; i<HC*WC; i++)

{

	CC[i] = (((1+CC[i-1]/100)*(1+matrixC[i]/100))-1)*100;

}

// Print results to file

FILE *file4;

file4 = fopen ("CC.dat", "w");

for (i=0; i<HC*WC; i++)

{

	fprintf(file4, "%f ", CC[i]);

}

fclose(file4);

/* Allocate host memory for CC_bp */

CC_bp = (float*)malloc(WC * HC * sizeof(CC_bp[0]));

if (CC_bp == 0)

{

	fprintf (stderr, "!!!! host memory allocation error (CC_bp)\n");

	return EXIT_FAILURE;

}

/* Calculate CC_bp*/

for (i=0; i<HC*WC; i++)

{

	CC_bp[i] = CC[i] * 100;

}

/* Print CC_bp */

FILE *file5;

if ( (file5 = fopen( "CCbp.dat", "w" )) == NULL ) printf("Can't open file!\n");

for (i=0; i<WC*HC; i++)

{

	fprintf(file5, "%f s\n", (i+1), CC_bp[i]);

}

fclose(file5);

// clean up memory

free(matrixA);

free(matrixB);

free(matrixC);

free(size);

free(CC);

free(CC_bp);

cudaFree(d_A);

cudaFree(d_B);

cudaFree(d_C);

return EXIT_SUCCESS;

}[/codebox]

Thank you in advance!

Tom W

Hi, i have posted this answer that should prove to be useful. CUBLAS will give generally better results in matrix multiplication. Hope it helps, David Lisin.

http://forums.nvidia.com/index.php?showtop…;hl=david+lisin

Thanks I’ll have a look at that. I was however able to solve my problem in the end.

Tom W

hI Tom W

How you can solve your problem, I have the same problem, the answer for multiplication is fake.

thanks a lot

Abdo

There are better ways of doing it using CUBLAS or the examples from the SDK.

However when I have some free time I’ll look into posting the code which works.

Tom W

thank you very much for your concern

I will try will CUBLAS and I wait your code

thanks a lot

Abdo