Hello, everyone!
I wrote a simple CUDA program for matrix dot product that can be found right under this text.
// includes
#include <cutil_inline.h>
#include <shrUtils.h>
#include <cuda.h>
#define Tile_Width 2
#define Width 5
__global__ void MatrixMul(int *Md, int *Nd, int *Pd){
int tx=threadIdx.x;
int ty=threadIdx.y;
float pom=0;
for(int k=0;k<Width;k++){
pom+=Md[ty*Width+k]*Nd[k*Width+tx];
Pd[ty*Width+tx]=pom;
}
}
int main(int argc, char** argv) {
int *Md,*Nd,*Pd;
int *M,*N,*P;
int i,j;
int Amount=Width*Width*sizeof(int);
// Allocation of memory for CPU based matrix M,N and P
M = (int*) malloc(sizeof(Amount));
N = (int*) malloc(sizeof(Amount));
P = (int*) malloc(sizeof(Amount));
// Initialization of matrices M and N
for(i=0;i<Width;i++){
for(j=0;j<Width;j++){
if(i==j){
M[j+i*Width]=1;
N[j+i*Width]=1;
}
if(i!=j){
M[j+i*Width]=0;
N[j+i*Width]=0;
}
}
}
// Allocation of GPU based memory for Md, Nd and Pd matrices
cudaMalloc((void**)&Md,Amount);
cudaMemcpy(Md,M,Amount,cudaMemcpyHostToDevice);
cudaMalloc((void**)&Nd,Amount);
cudaMemcpy(Nd,N,Amount,cudaMemcpyHostToDevice);
cudaMalloc((void**)&Pd,Amount);
// Setting up parameteres for kernel invocation
dim3 dimGrid(1,1,1);
dim3 dimBlock(Width,Width,1);
// Kernel invocation
MatrixMul<<<dimGrid,dimBlock>>>(Md,Nd,Pd);
// Return of results from GPu to CPU
cudaMemcpy(P,Pd,(Width*Width*sizeof(int)),cudaMemcpyDeviceToHost);
// Printing of results
for(i=0;i<Width;i++){
printf("\n");
for(j=0;j<Width;j++){
printf("[%d] ",P[i*Width+j]);
}
}
printf("\n");
}
Program crashes only if Width is 5 or more than that, if it’s 4 or less everything is working OK.
Does anyone know why is this happening?
Width is the y dimension of matrix.
Thank you,
I’am using GeForce 9600m GT on Win7 64-bit in VS 2008 with CUDA 3.0