#include "stdio.h"
#include <stdlib.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 32
#define N 3
// Computes C = A * B with shared-memory tiling: one BLOCK_SIZE x BLOCK_SIZE
// tile of C per thread block, one element of C per thread.
//
// Layout: A is (hA x wA), B is (wA x wB), C is (hA x wB), all row-major.
//
// Preconditions (NOT checked): wA, wB and the matrix heights must be
// multiples of BLOCK_SIZE, the block must be dim3(BLOCK_SIZE, BLOCK_SIZE),
// and the grid must have one block per output tile. Callers with smaller
// matrices must zero-pad them up to multiples of BLOCK_SIZE.
__global__ void Muld(float *A, float *B, int wA, int wB, float *C) {
    int bx = blockIdx.x;   // tile column of C handled by this block
    int by = blockIdx.y;   // tile row of C handled by this block
    int tx = threadIdx.x;  // column within the tile
    int ty = threadIdx.y;  // row within the tile

    int aBegin = wA * BLOCK_SIZE * by;  // first element of A's tile row
    int aEnd   = aBegin + wA - 1;       // last element of A's tile row
    int aStep  = BLOCK_SIZE;            // advance one tile rightward in A
    int bBegin = BLOCK_SIZE * bx;       // first element of B's tile column
    int bStep  = BLOCK_SIZE * wB;       // advance one tile downward in B

    float Csub = 0.0f;  // this thread's accumulated element of C

    // Walk the tiles of A (left to right) and B (top to bottom) in lockstep.
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];  // current tile of A
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];  // current tile of B

        // Each thread loads one element of each tile.
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];
        __syncthreads();  // both tiles fully loaded before anyone reads them

        // Multiply the two tiles; each thread accumulates one element.
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[ty][k] * Bs[k][tx];

        __syncthreads();  // finish reads before the next iteration overwrites
    }

    // Each thread writes its one element of the result tile.
    // (Removed a stray debug statement "C[1] = 12;" that was executed by
    // every thread and raced with / clobbered the real result at C[1].)
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
// Abort with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Host driver: multiplies two N x N matrices on the GPU and prints the result.
//
// Muld requires dimensions that are multiples of BLOCK_SIZE, and the original
// launch config blocks(N / threads.x, ...) floored to blocks(0, 0, 1) — a
// zero-sized grid that silently ran nothing. Instead, the N x N inputs are
// embedded in zero-padded BLOCK_SIZE x BLOCK_SIZE device buffers (the zero
// padding leaves the top-left N x N of the product intact), and a single
// full tile is launched.
int main() {
    size_t paddedBytes = (size_t)BLOCK_SIZE * BLOCK_SIZE * sizeof(float);
    size_t hostPitch = N * sizeof(float);          // row stride of host arrays
    size_t devPitch = BLOCK_SIZE * sizeof(float);  // row stride of device buffers

    float A[] = {1, 2, 3,
                 3, 4, 5,
                 3, 4, 5},
          B[] = {1, 2, 3,
                 3, 4, 5,
                 3, 4, 5},
          C[N * N];

    float *adev = NULL, *bdev = NULL, *cdev = NULL;
    checkCuda(cudaMalloc((void **)&adev, paddedBytes), "cudaMalloc A");
    checkCuda(cudaMalloc((void **)&bdev, paddedBytes), "cudaMalloc B");
    checkCuda(cudaMalloc((void **)&cdev, paddedBytes), "cudaMalloc C");

    // Zero the padding, then copy the N x N payloads into the top-left corner.
    checkCuda(cudaMemset(adev, 0, paddedBytes), "cudaMemset A");
    checkCuda(cudaMemset(bdev, 0, paddedBytes), "cudaMemset B");
    checkCuda(cudaMemcpy2D(adev, devPitch, A, hostPitch, hostPitch, N,
                           cudaMemcpyHostToDevice), "copy A");
    checkCuda(cudaMemcpy2D(bdev, devPitch, B, hostPitch, hostPitch, N,
                           cudaMemcpyHostToDevice), "copy B");
    // (No host-to-device copy of C: it was uninitialized, and the kernel
    // overwrites every element that is read back.)

    // One BLOCK_SIZE x BLOCK_SIZE block covers the whole padded matrix.
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE), blocks(1, 1, 1);
    Muld<<<blocks, threads>>>(adev, bdev, BLOCK_SIZE, BLOCK_SIZE, cdev);
    checkCuda(cudaGetLastError(), "Muld launch");

    // Blocking copy of the top-left N x N result (also synchronizes).
    checkCuda(cudaMemcpy2D(C, hostPitch, cdev, devPitch, hostPitch, N,
                           cudaMemcpyDeviceToHost), "copy C");

    cudaFree(adev);
    cudaFree(bdev);
    cudaFree(cdev);

    for (int i = 0; i < N * N; i++) printf("%0.2f ", C[i]);
    printf("\n");
    return 0;
}
// NOTE(review): Integer division is floored, so
//   blocks(N / threads.x, N / threads.y, 1) == blocks(3 / 32, 3 / 32, 1) == blocks(0, 0, 1),
// i.e. a zero-sized grid — the kernel launch runs nothing. Round up instead,
// e.g. blocks((N + threads.x - 1) / threads.x, (N + threads.y - 1) / threads.y, 1),
// and note the kernel additionally assumes dimensions padded to multiples of BLOCK_SIZE.