Hi,

Thank you for your attention. Here is my complete code. Please take a look.

In this program, I want to calculate multiple elements of a matrix by using

one thread. Please help to advise. Thanks a lot!

```
/*
-----1 Thread compute multiple elements of matrix product-----
*/
#include <stdio.h>
#include <conio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#pragma comment(lib, "cudart")
//Function executed on host
void cpu_matrixMul(int *a, int *b, int *c, int N){
int row, col, k, sum;
for (row=0; row<N; row++)
for (col=0; col<N; col++){
sum = 0;
for (k=0; k<N; k++)
sum += a[row*N+k]*b[k*N+col];
c[row*N+col]=sum;
}
}
//GPU kernel, each thread computes an area of data size TWxTW
//using one thread block
__global__ void gpu_matrixMul1(int *a, int *b, int *c, int N, int TW){
int start_row = threadIdx.y*TW;
int end_row = start_row+TW;
int start_col = threadIdx.x;
int end_col = start_col+TW;
int k, sum = 0;
int row, col;
if ((row < N) && (col <N)){
for (row = start_row; row < end_row; row++){
for(col = start_col; col < end_col; col++){
for(k = 0; k< N; k++){
sum += a[row*N+k]*b[k*N+col];
c[row*N+col] = sum;
}
}
}
}
}
//GPU kernel, each thread computes an area of data size TWxTW
//using multiple thread blocks
__global__ void gpu_matrixMul2(int *a, int *b, int *c, int N, int TW){
int start_row = blockDim.y*blockIdx.y+threadIdx.y*TW;
int end_row = start_row + TW;
int start_col = blockDim.x*blockIdx.x+threadIdx.x*TW;
int end_col = start_col + TW;
int k, sum = 0;
for (int row = start_row; row < end_row; row++){
for (int col = start_col; col <end_col; col++){
for (k = 0; k < N; k++){
sum += a[row*N+k]*b[k*N+col];
c[row*N+col] = sum;
}
}
}
}
int main (int argc, char *argv[]){
/Declare variables
char key;
int Grid_Dim = 1; //Grid structure
int Block_Dim = 1; //Block structure
int N=10; //Size of matrix in one side
int TW=2; //size of data area computed by one thread
int *a, *b, *c, *d;
int *dev_a, *dev_b, *dev_c;
int size;
//Input data
do{
printf("Input N, current N is %d: ", N);
scanf("%d", &N);
printf("Input TW, current TW %d: ", TW);
scanf("%d", &N);
printf("\nInput number of threads in x/y dimension in a block, current number %d: ", Block_Dim);
scanf("%d", &Block_Dim);
printf("\nInput number if blocks in x/y dimension in a grid, current number %d: ", Grid_Dim);
scanf("%d", &Grid_Dim);
dim3 Grid(Grid_Dim, Grid_Dim); //grid structure
dim3 Block(Block_Dim, Block_Dim); //block structure
size = N*N*sizeof(int); //size of matrix
a=(int*)malloc(size);
b=(int*)malloc(size);
c=(int*)malloc(size);
d=(int*)malloc(size);
//data sample
for(int i= 0; i<N; i++)
for(int j=0; j<N; j++){
a[i*N+j]=j;
b[i*N+j]=j*1;
}
//Print sample data
printf("\nMatrix A and B:\n");
for (int i=0; i<N; i++){
for(int j=0; j<N; j++)
printf("%d ", a[i*N+j]);
printf("\n");
}
//Compute on GPU
cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_c, c, size, cudaMemcpyHostToDevice);
gpu_matrixMul2<<<Grid, Block>>>(dev_a, dev_b, dev_c, N, TW);
cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
//Compute on CPU
cpu_matrixMul(a, b, d, N);
//Compare results
for(int i=0; i<N*N; i++){
if(c[i] = d[i])
printf("\nCORRECT!!! CPU and GPU create same anwser\n");
else
printf("\nERROR!!! CPU and GPU create different anwser\n");
break;
}
printf("\nMatrix result from GPU:\n");
for (int i=0; i<N; i++){
for(int j=0; j<N; j++)
printf("%d ", c[i*N+j]);
printf("\n");
}
printf("\nMatrix result from CPU:\n");
for (int i=0; i<N; i++){
for(int j=0; j<N; j++)
printf("%d ", d[i*N+j]);
printf("\n");
}
printf("\nType n to start a new computation\n");
scanf("%c", &key);
scanf("%c", &key);
}while (key=='n'); //loop of complete program
//Free the memory
free(a);
free(b);
free(c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return 0;
getch();
}
```