I have been having problems with the matrix multiplication program in in the Programmer’s Guide. I enclose the code in the posting:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <time.h>
// Matrices will be strored in row major order
// M(row, col) = *(M.elements + row * M.width + col)(cuda-gdb) n(cuda-gdb) n
typedef struct {
int width;
int height;
float *elements;
} Matrix;
// Thread block size
#define BLOCK_SIZE 16
// Forward declaration of the matrix multiplication kernel
global void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix Multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE(cuda-gdb) n
void matrixMulGPU(const Matrix A, const Matrix B, const Matrix C)[Current CUDA Thread <<<(0,0),(0,0,0)>>>]
{
// Load A and B to device memory
Matrix d_A;
d_A.width = A.width; d_A.height = A.height;
size_t size = A.width * A.height * sizeof(float);
cudaMalloc((void**)&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
Matrix d_B;
d_B.width = B.width; d_B.height = B.height;
size = B.width * B.height * sizeof(float);
cudaMalloc((void**)&d_B.elements, size);
cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
// Allocate C in device memory
Matrix d_C;
d_C.width = C.width; d_C.height = C.height;
size = C.width * C.height * sizeof(float);
cudaMalloc((void**)&d_C.elements, size);
//Invoke kernel
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.width / dimBlock.x + 1, A.height / dimBlock.y + 1);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// Read C from device memory[Current CUDA Thread <<<(0,0),(0,0,0)>>>]
cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
cudaThreadExit();(cuda-gdb)
}
// Matrix multiplication kernel called by MatMul()
global void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Each thread computes one elements of C
// by accumulating results into Cvalue
float Cvalue = 0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
for (int e = 0; e < A.width; ++e) {
Cvalue += A.elements[row * A.width + e]
* B.elements[e * B.width + col];
}
__syncthreads();
C.elements[row * C.width + col] = Cvalue;
}
float floatRand(float MaxVal) {
return ( (float)rand() / (float)RAND_MAX ) * MaxVal;
}
void timedate() {
time_t temps_act;
time(&temps_act);
printf(“%s\n”, ctime(&temps_act));
}
void matrixMulCPU(Matrix M, Matrix N, Matrix P) {
int i, j, k;
for(i=0; i<M.height; i++) {
for(j=0; j<N.width; j++) {
for(k=0; k<M.width; k++) {[Current CUDA Thread <<<(0,0),(0,0,0)>>>]
P.elements[i*N.width + j] += M.elements[i*M.width + k]
* N.elements[k*N.width + j];
}
}
}
}
int main() {
int size = 10;
int i, j;
int print = true;
Matrix A;
Matrix B;
Matrix C;
Matrix D;
A.width = size;
A.height = size;
A.elements = (float*) malloc(A.widthA.heightsizeof(float));
B.width = size;(cuda-gdb)
B.height = size;
B.elements = (float*) malloc(B.widthB.heightsizeof(float));
C.width = size;
C.height = size;
C.elements = (float*) malloc(C.widthC.heightsizeof(float));
D.width = size;
D.height = size;
D.elements = (float*) malloc(D.widthD.heightsizeof(float));
printf(“matrix generation\n”);
srand ( time(NULL) );
for(i=0; i<size*size; i++) {
A.elements[i] = 2; //floatRand(2);
B.elements[i] = 2; //floatRand(2);
// A.elements[i] = 3;
// B.elements[i] = 3;
C.elements[i] = 0;
D.elements[i] = 0;
}
if(print) {
printf(“A =\n”);
for(i=0; i<A.height; i++) {
for(j=0; j<A.width; j++) {
printf("%f ",A.elements[i*size + j]);
}
printf("\n");
}
}
printf(“\n\n”);
printf(“B =\n”);
for(i=0; i<B.height; i++) {
for(j=0; j<B.width; j++) {
printf(“%f “,B.elements[i*size + j]);
}
printf(”\n”);
}
}
timedate();
printf("GPU matrix multiplication\n");
matrixMulGPU(A, B, C);
timedate();
timedate();
printf("CPU matrix multiplication\n");
matrixMulCPU(A, B, D);
timedate();
//print result
if(print) {
printf("C =\n");
for(i=0; i<C.height; i++) {
for(j=0; j<C.width; j++) {
printf("%f ",C.elements[i*size + j]);
}
printf("\n");
}
printf("\n\n");
printf("D =\n");
for(i=0; i<D.height; i++) {
for(j=0; j<D.width; j++) {
printf("%f ",D.elements[i*size + j]);
}
printf("\n");
}
}
}
This program gives me different output put every time that I run it. The first time it gives me the correct output, the second time something different is in the C matrix or the matrix multiplication carried out by the GPU. These first two runs were carried out on Friday afternoon. This Monday morning I ran the program and got still different output. Again this is only in the C matrix. It is always in the first six columns and can be either in one, two or three rows. Every matrix here should have 40 in each of its elements. This is what comes out in the D matrix or the matrix that the CPU calculates.
Why does the C matrix change every time that I run the program? I will now show the outputs from last Friday and today.
GPU matrix multiplication
Fri Dec 4 15:24:28 2009
Fri Dec 4 15:24:28 2009
CPU matrix multiplication
Fri Dec 4 15:24:28 2009
C =
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
CPU matrix multiplication
Fri Dec 4 15:24:47 2009
C =
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
116.000000 116.000000 116.000000 116.000000 116.000000 116.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
./example
matrix generation
Mon Dec 7 08:58:56 2009
GPU matrix multiplication
Mon Dec 7 08:58:57 2009
Mon Dec 7 08:58:57 2009
CPU matrix multiplication
Mon Dec 7 08:58:57 2009
C =
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
36.000000 36.000000 36.000000 36.000000 36.000000 36.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
36.000000 36.000000 36.000000 36.000000 36.000000 36.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
Mon Dec 7 08:59:02 2009 CPU matrix multiplication
Fri Dec 4 15:24:47 2009
C =
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
116.000000 116.000000 116.000000 116.000000 116.000000 116.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 (cuda-gdb) n
GPU matrix multiplication CPU matrix multiplication
Fri Dec 4 15:24:47 2009
C =
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
116.000000 116.000000 116.000000 116.000000 116.000000 116.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
Mon Dec 7 08:59:03 2009
Mon Dec 7 08:59:03 2009
CPU matrix multiplication
Mon Dec 7 08:59:03 2009
C =
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
36.000000 36.000000 36.000000 36.000000 36.000000 36.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
36.000000 36.000000 36.000000 36.000000 36.000000 36.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
36.000000 36.000000 36.000000 36.000000 36.000000 36.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
Now please understand that no line of code has been changed between these runs, but the run outputs are different for the C matrix in each run of the program. As I stated before the first six columns are affected and they appear randomly; sometimes it is two lines, sometimes it is three lines and sometimes only one line. The numbers in the matrix elements are randomly chosen.
The source for the program is enclosed above, but the issue I believe is with the subprogram MatMulKernel. This is, of course, the GPU kernel. Compiling it for debugging and walking through the code goes for a few lines, then you get.
(cuda-gdb) n
[CUDA execution terminated]
0x00007f72e0b731e0 in ?? () from /usr/lib/libcuda.so
Now if you compile the program in debugging mode and emulation mode the program hangs in the function MatMulKernel, it also hangs when you run it in emulation mode.
I am using Ubuntu 9.04 in 64 bit mode. It says that it cannot access libcuda.so. I did not set up the software, but I do know something about libcuda.so is in the release notes for the 32 bit version of CUDA.
What is going on here?
Newport_j