Hello, again!
I solved that simple matrix multiplication problem I had earlier, but now, when I need to use square matrices that are larger than 16 x 16 elements, I have to use multiple blocks of threads. That is, I have to distribute the matrix across several blocks and access a particular element in some block using the appropriate blockIdx.x and blockIdx.y as well as threadIdx.x and threadIdx.y. The problem arises when I have to do the actual accessing of an element using its array index. So, here is the code; the part marked with a comment in the kernel is missing, and that is the part I wanted to ask you how to solve.
Thanks!
// includes
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
#include <shrUtils.h>
#include <cuda.h>
#define Width 16
#define Tile_Width 16
// Computes Pd = Md * Nd for square, row-major int matrices in device memory.
//
// Launch layout: a 2D grid of Tile_Width x Tile_Width thread blocks, where each
// thread produces exactly one element of Pd. The matrix side length is taken
// to be gridDim.x * Tile_Width, so the grid is expected to be square and to
// tile the matrices exactly (e.g. a 2 x 2 grid of 16 x 16 blocks for 32 x 32
// matrices). No shared memory is used.
//
// Md, Nd: input matrices, each holding side * side ints.
// Pd:     output matrix, side * side ints; every element is overwritten.
__global__ void MatrixMul(int *Md, int *Nd, int *Pd){
    // Global column (x) and row (y) of the output element owned by this thread.
    int x=blockIdx.x*Tile_Width+threadIdx.x;
    int y=blockIdx.y*Tile_Width+threadIdx.y;

    // Side length of the full matrix, i.e. the row stride of all three arrays.
    int n=gridDim.x*Tile_Width;

    // x is always < n by construction; y can only exceed n if the grid is
    // taller than it is wide, in which case those threads have no element.
    if(y>=n){
        return;
    }

    // Dot product of row y of Md with column x of Nd over the whole row,
    // not just one tile.
    int pom=0;
    for(int k=0;k<n;k++){
        pom+=Md[y*n+k]*Nd[k*n+x];
    }

    // Store once, after the sum is complete.
    Pd[y*n+x]=pom;
}
// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what){
    if(status!=cudaSuccess){
        fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Multiplies two (2*Width) x (2*Width) identity matrices on the GPU using a
// 2D grid of Tile_Width x Tile_Width thread blocks, prints the resulting
// matrix row by row, then waits for keyboard input before exiting.
//
// Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
int main(int argc, char** argv) {
    int *Md=NULL,*Nd=NULL,*Pd=NULL; // Pointers to GPU memory
    int *M,*N,*P;                   // Pointers to CPU memory
    int i,j;
    const int Dim=2*Width;                           // Side length of each matrix
    const size_t Amount=(size_t)Dim*Dim*sizeof(int); // Bytes per matrix
    int a; // Target of the final scanf("%d"); must be an int, not a char

    // Allocation of memory for CPU based matrices M, N and P
    M = (int*) malloc(Amount);
    N = (int*) malloc(Amount);
    P = (int*) malloc(Amount);
    if(M==NULL || N==NULL || P==NULL){
        fprintf(stderr,"Host memory allocation failed\n");
        free(M);
        free(N);
        free(P);
        return EXIT_FAILURE;
    }

    // Initialization of matrices M and N as identity matrices
    for(i=0;i<Dim;i++){
        for(j=0;j<Dim;j++){
            int value=(i==j)?1:0;
            M[i*Dim+j]=value;
            N[i*Dim+j]=value;
        }
    }

    // Allocation of GPU based memory for Md, Nd and Pd matrices
    checkCuda(cudaMalloc((void**)&Md,Amount),"cudaMalloc(Md)");
    checkCuda(cudaMemcpy(Md,M,Amount,cudaMemcpyHostToDevice),"copy M to device");
    checkCuda(cudaMalloc((void**)&Nd,Amount),"cudaMalloc(Nd)");
    checkCuda(cudaMemcpy(Nd,N,Amount,cudaMemcpyHostToDevice),"copy N to device");
    checkCuda(cudaMalloc((void**)&Pd,Amount),"cudaMalloc(Pd)");

    // Setting up parameters for kernel invocation: one thread per output
    // element, Tile_Width x Tile_Width threads per block, and enough blocks
    // to cover the whole Dim x Dim matrix (2 x 2 for the current constants).
    dim3 dimGrid(Dim/Tile_Width,Dim/Tile_Width,1);
    dim3 dimBlock(Tile_Width,Tile_Width,1);

    // Kernel invocation
    MatrixMul<<<dimGrid,dimBlock>>>(Md,Nd,Pd);
    checkCuda(cudaGetLastError(),"MatrixMul launch"); // Catches bad launch configs

    // Return of results from GPU to CPU. The whole matrix (Amount bytes) is
    // copied, not a single tile. This blocking copy also waits for the kernel
    // and reports any error raised while it ran.
    checkCuda(cudaMemcpy(P,Pd,Amount,cudaMemcpyDeviceToHost),"copy P to host");

    // Printing of results, one numbered row per line
    for(i=0;i<Dim;i++){
        printf("\n");
        printf("%d. ",i);
        for(j=0;j<Dim;j++){
            printf("[%d]",P[i*Dim+j]);
        }
    }
    printf("\n");
    printf("When you cried I always wiped out all of your teats, ...\n");

    // Release device and host buffers
    checkCuda(cudaFree(Md),"cudaFree(Md)");
    checkCuda(cudaFree(Nd),"cudaFree(Nd)");
    checkCuda(cudaFree(Pd),"cudaFree(Pd)");
    free(M);
    free(N);
    free(P);

    // Keep the console window open until the user enters something
    scanf("%d",&a);
    return 0;
}
As you can see, the idea was to have 4 blocks of 16 x 16 elements, so that I can have one 32 x 32 matrix.