Hi.
I’m trying to make a basic matrix multiplication program. The program compiles but currently is giving wrong/nonsense answers. I’m not sure whether the problem lies in the CUDA code or the CPU side. If anyone could help me spot this I’d be very thankful.
Here’s the code:
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#include “cublas.h”
#define BLOCK_SIZE 16
// Kernel
global void CudaMul(float *A, float *B, float *C, int noc0, int noc1, int nor1)
{
int k;
int i = blockDim.x*blockIdx.x + threadIdx.x;
int j = blockDim.y*blockIdx.y + threadIdx.y;
if(i < noc0 && j < nor1)
{
C[j*noc1 + i] = 0;
for(k = 0; k < noc0; k++)
{
C[j*noc1 + i] += A[j*noc0 + k] * B[k*noc1 + i];
}
}
}
// Main
int main ()
{
float* matrixA;
float* matrixB;
float* matrixC;
int HA, HB, HC, WA, WB, WC, i;
// get matrix size from data file
int* size;
size = (int*)malloc(2 * sizeof(size[0]));
FILE *sizes;
if ( (sizes = fopen( "SZ.dat", "r" )) == NULL ) printf("Can't open file!\n");
for(i=0; i<2; i++)
{
fscanf(sizes, "%d", &size[i]);
}
fclose(sizes);
// Matrix size
HA = size[0];
WA = size[1];
HB = size[1];
WB = size[0];
HC = HA;
WC = WB;
unsigned int mem_size_A = HA * WA * sizeof(matrixA[0]);
unsigned int mem_size_B = HB * WB * sizeof(matrixB[0]);
unsigned int mem_size_C = HC * WC * sizeof(matrixC[0]);
/* Allocate host memory for the matrices */
matrixA = (float*)malloc(mem_size_A);
if (matrixA == 0) {
fprintf (stderr, "!!!! host memory allocation error (Matrix A)\n");
return EXIT_FAILURE;
}
matrixB = (float*)malloc(mem_size_B);
if (matrixB == 0) {
fprintf (stderr, "!!!! host memory allocation error (Matrix B)\n");
return EXIT_FAILURE;
}
matrixC = (float*)malloc(mem_size_C);
if (matrixC == 0) {
fprintf (stderr, "!!!! host memory allocation error (Matrix C)\n");
return EXIT_FAILURE;
}
// Get data from fiiles
FILE *file;
if ( (file = fopen( "AW.dat", "r" )) == NULL ) printf("Can't open file!\n");
for (i=0;i<(HA*WA);i++)
{
fscanf( file, "%f", &matrixA[i] );
}
fclose(file);
FILE *file2;
if ( (file2 = fopen( "RR.dat", "r" )) == NULL ) printf("Can't open file!\n");
for (i=0;i<(HB*WB);i++)
{
fscanf( file2, "%f", &matrixB[i] );
}
fclose(file2);
/* Allocate device memory for the matrices */
float* d_A;
float* d_B;
float* d_C;
cublasAlloc(HA * WA, sizeof(d_A[0]), (void**)&d_A);
cublasAlloc(HB * WB, sizeof(d_B[0]), (void**)&d_B);
cublasAlloc(HC * WC, sizeof(d_C[0]), (void**)&d_C);
// Transfer matrices
cublasSetVector(HA * WA, sizeof(matrixA[0]), matrixB, 1, d_A, 1);
cublasSetVector(HB * WB, sizeof(matrixB[0]), matrixB, 1, d_B, 1);
cublasSetVector(HC * WC, sizeof(matrixC[0]), matrixC, 1, d_C, 1);
// set kernel config
dim3 threads( BLOCK_SIZE, BLOCK_SIZE );
dim3 grid( WC / threads.x, HC / threads.y );
// invoke kernel
CudaMul<<< grid, threads >>>(d_A, d_B, d_C, WA, WB, HC);
// Copy result to device
cudaMemcpy(matrixC, d_C, mem_size_C, cudaMemcpyDeviceToHost);
// Print results to file
FILE *file3;
file3 = fopen ("DC.dat", "w");
for (i=0; i<HC*WC; i++)
{
fprintf(file3, "%f ", matrixC[i]);
}
fclose(file3);
// find C
float* CC;
float* CC_bp;
/* Allocate host memory for CC */
CC = (float*)malloc(HC * WC * sizeof(CC[0]));
if (CC == 0) {
fprintf (stderr, "!!!! host memory allocation error (CC)\n");
return EXIT_FAILURE;
}
/* Calculate CC*/
CC[0] = matrixC[0];
for (i=1; i<HC*WC; i++)
{
CC[i] = (((1+CC[i-1]/100)*(1+matrixC[i]/100))-1)*100;
}
// Print results to file
FILE *file4;
file4 = fopen ("CC.dat", "w");
for (i=0; i<HC*WC; i++)
{
fprintf(file4, "%f ", CC[i]);
}
fclose(file4);
/* Allocate host memory for CC_bp */
CC_bp = (float*)malloc(WC * HC * sizeof(CC_bp[0]));
if (CC_bp == 0)
{
fprintf (stderr, "!!!! host memory allocation error (CC_bp)\n");
return EXIT_FAILURE;
}
/* Calculate CC_bp*/
for (i=0; i<HC*WC; i++)
{
CC_bp[i] = CC[i] * 100;
}
/* Print CC_bp */
FILE *file5;
if ( (file5 = fopen( "CCbp.dat", "w" )) == NULL ) printf("Can't open file!\n");
for (i=0; i<WC*HC; i++)
{
fprintf(file5, "%f s\n", (i+1), CC_bp[i]);
}
fclose(file5);
// clean up memory
free(matrixA);
free(matrixB);
free(matrixC);
free(size);
free(CC);
free(CC_bp);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
return EXIT_SUCCESS;
}[/codebox]
Thank you in advance!
Tom W