Hi all,
I just implemented a matrix multiplication code from the programming guide.
I have a 9200M GE in my laptop; it has 256 MB of memory, and I am running Ubuntu 9.04.
I am providing my code below. If I set N = 1024 it works fine, but if N = 2048 it gives a "CUDA out of memory" error.
Please help me find a solution for how to increase N.
[codebox]#include <stdio.h>
// Matrix dimension: the program multiplies two N x N matrices.
// Memory note: three N x N float matrices need 3 * N * N * 4 bytes
// (N = 2048 -> 48 MB), which must fit in device memory alongside
// whatever the display driver already uses on a 256 MB card.
int N = 2048;

// Row-major matrix: element (r, c) lives at elements[r * width + c].
typedef struct {
    int width;       // number of columns
    int height;      // number of rows
    float* elements; // row-major storage, width * height floats
} Matrix;

// Edge length of one square thread block (BSIZE x BSIZE threads).
#define BSIZE 16

// Forward declaration of the matrix-multiply kernel (C = A * B).
// Note: the qualifier is __global__ (double underscores); a bare
// "global" does not compile.
__global__ void matkernel(const Matrix, const Matrix, Matrix);
// Host wrapper: uploads A and B to the device, launches matkernel to
// compute C = A * B, downloads C, and prints the second half of the
// last row plus the kernel's elapsed time.
// Preconditions: A.width == B.height, C.width == B.width,
// C.height == A.height; host buffers sized width * height floats.
void matmul(const Matrix A, const Matrix B, Matrix C){
    cudaError_t err;

    // ---- Allocate and upload A ----
    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    size_t sizeA = (size_t)A.width * A.height * sizeof(float);
    err = cudaMalloc((void**)&d_A.elements, sizeA);
    if (err != cudaSuccess) {
        // This is where "out of memory" shows up when N is too large
        // for the card — report it instead of copying to a bad pointer.
        printf("cudaMalloc(A) failed: %s\n", cudaGetErrorString(err));
        return;
    }
    cudaMemcpy(d_A.elements, A.elements, sizeA, cudaMemcpyHostToDevice);

    // ---- Allocate and upload B ----
    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    size_t sizeB = (size_t)B.width * B.height * sizeof(float);
    err = cudaMalloc((void**)&d_B.elements, sizeB);
    if (err != cudaSuccess) {
        printf("cudaMalloc(B) failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_A.elements);
        return;
    }
    cudaMemcpy(d_B.elements, B.elements, sizeB, cudaMemcpyHostToDevice);

    // ---- Allocate C in device memory (written by the kernel) ----
    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    size_t sizeC = (size_t)C.width * C.height * sizeof(float);
    err = cudaMalloc((void**)&d_C.elements, sizeC);
    if (err != cudaSuccess) {
        printf("cudaMalloc(C) failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_A.elements);
        cudaFree(d_B.elements);
        return;
    }

    // ---- Launch: one thread per element of C, BSIZE x BSIZE blocks.
    // Round the grid up so dimensions that are not multiples of BSIZE
    // are still covered (the kernel bounds-checks its indices).
    dim3 threads(BSIZE, BSIZE);
    dim3 grid((B.width  + threads.x - 1) / threads.x,
              (A.height + threads.y - 1) / threads.y);

    // Kernel launches are asynchronous: host clock() around the launch
    // measures only launch overhead. Use CUDA events instead.
    cudaEvent_t evStart, evStop;
    cudaEventCreate(&evStart);
    cudaEventCreate(&evStop);
    cudaEventRecord(evStart);
    matkernel<<<grid, threads>>>(d_A, d_B, d_C);
    cudaEventRecord(evStop);

    // cudaGetLastError catches launch-configuration errors immediately.
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("ERROR PREP launch FAIL!\n");
    }
    printf("error is %s \n", cudaGetErrorString(err));

    // Wait for the kernel to finish, then read the elapsed time.
    cudaEventSynchronize(evStop);
    float elapsedMs = 0.0f;
    cudaEventElapsedTime(&elapsedMs, evStart, evStop);
    cudaEventDestroy(evStart);
    cudaEventDestroy(evStop);

    // ---- Read C back from the device.
    // (The original had cudaMemcpyDeviceToHost broken across two
    // lines by the forum — that token must be a single identifier.)
    cudaMemcpy(C.elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

    // Print the second half of the last row of C for inspection.
    for (int i = N/2 + (C.width * (N-1)); i < C.width * C.height; i++) {
        printf("%f(%d) ", C.elements[i], i % C.width);
        if (i % C.width == C.width - 1)
            printf("\n");
    }
    printf("\nTime = %f ms ", elapsedMs);

    // ---- Free device memory ----
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Naive matrix multiply: each thread computes one element of C = A * B.
// Expected launch: 2-D grid of BSIZE x BSIZE blocks covering C
// (gridDim.x * blockDim.x >= C.width, gridDim.y * blockDim.y >= C.height).
// Requires A.width == B.height (shared inner dimension).
// Note: the qualifier must be __global__ (double underscores).
__global__ void matkernel(Matrix A, Matrix B, Matrix C){
    // (row, col) is the element of C this thread produces.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard: the grid may be rounded up past the matrix edge;
    // without this, out-of-range threads corrupt memory silently.
    if (row >= C.height || col >= C.width)
        return;

    // Accumulate the dot product of row `row` of A with column `col`
    // of B. The loop runs over A.width because that is the shared
    // inner dimension (A.width == B.height); the result has
    // C.width = B.width and C.height = A.height.
    float Cv = 0.0f;
    for (int e = 0; e < A.width; ++e) {
        Cv += A.elements[row * A.width + e]
            * B.elements[e * B.width + col];
    }

    C.elements[row * C.width + col] = Cv;
}
// Builds two N x N matrices filled with ones, multiplies them on the
// GPU via matmul, and frees the host buffers. Every element of the
// result should equal N.
int main (){
    // C = A x B
    Matrix A;
    Matrix B;
    Matrix C;
    A.width = N;
    A.height = N;
    B.width = N;
    B.height = N;
    C.width = B.width;    // result width follows B
    C.height = A.height;  // result height follows A

    // Host buffers: width * height floats each. (The original used
    // width * width for B and C — only correct because the matrices
    // happen to be square.)
    A.elements = (float*) malloc((size_t)A.width * A.height * sizeof(float));
    B.elements = (float*) malloc((size_t)B.width * B.height * sizeof(float));
    C.elements = (float*) malloc((size_t)C.width * C.height * sizeof(float));
    if (A.elements == NULL || B.elements == NULL || C.elements == NULL) {
        printf("host malloc failed\n");
        free(A.elements);
        free(B.elements);
        free(C.elements);
        return 1;
    }

    // Fill both inputs with 1.0f so the expected product is all N.
    for (int i = 0; i < A.width * A.height; i++) {
        A.elements[i] = 1.0f;
    }
    for (int i = 0; i < B.width * B.height; i++) {
        B.elements[i] = 1.0f;
    }

    matmul(A, B, C);

    free(A.elements);
    free(B.elements);
    free(C.elements);
    return 0;
}
[/codebox]