How does an operation like "some_fragment.x[index]" work in the WMMA API?

I have to use the WMMA API to do some matrix multiplication, and I want to access each individual element in the fragment to perform some operation, perhaps as simple as multiplying by a variable, or something more complex.

Below is an example. At first, I thought each thread should access different indices according to its threadIdx, e.g. thread 0 operates on x[0] to x[7], thread 1 on x[8] to x[15], but that doesn't work.

After some experimenting, I found that each thread seems to have a different view of the fragment: for thread 0, x[0] maps to the first element of the matrix, x[1] to the second, x[2] to the 8th, x[3] to the 9th, and so on. So how does this mapping work? I couldn't find it described in any of the docs (or maybe I missed it).

#include <cuda_runtime.h>
#include <mma.h>
#include <iostream>
using namespace nvcuda;

__global__ void wmma_ker(half *a, half *b, float *c) {
    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
    wmma::fill_fragment(c_frag, 0.0f);
    wmma::load_matrix_sync(a_frag, a, 16);
    wmma::load_matrix_sync(b_frag, b, 16);
    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
    // here is the operation: loop over the elements this thread holds
    // (num_elements, not 16 -- each thread holds only a subset of the tile)
    for (int i = 0; i < c_frag.num_elements; i++){
        c_frag.x[i] = c_frag.x[i] * threadIdx.x;
    }
    wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
}

void print_data(half *data){
    for (int i = 0; i < 16; i++){
        for (int j = 0; j < 16; j++){
            std::cout << __half2float(data[16*i+j]) << ",";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl << std::endl;
}

int main(){
    half* data_a_host = (half*)malloc(sizeof(half) * 16 * 16);
    half* data_b_host = (half*)malloc(sizeof(half) * 16 * 16);
    float* data_c_host = (float*)malloc(sizeof(float) * 16 * 16);
    for (int i = 0; i < 256; i++){
        data_a_host[i] = half(i);
        data_b_host[i] = half(i);
    }
    half *data_a, *data_b;
    float* data_c;
    cudaMalloc(&data_a, sizeof(half) * 16 * 16);
    cudaMalloc(&data_b, sizeof(half) * 16 * 16);
    cudaMalloc(&data_c, sizeof(float) * 16 * 16);
    cudaMemcpy(data_a, data_a_host, sizeof(half) * 256, cudaMemcpyHostToDevice);
    cudaMemcpy(data_b, data_b_host, sizeof(half) * 256, cudaMemcpyHostToDevice);
    wmma_ker<<<1, 32>>>(data_a, data_b, data_c);
    cudaMemcpy(data_c_host, data_c, sizeof(float) * 256, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 16; i++){
        for (int j = 0; j < 16; j++){
            std::cout << data_c_host[16*i+j] << ",";
        }
        std::cout << std::endl;
    }
    return 0;
}

The mapping is intentionally not documented: it is unspecified and should not be relied upon even if discovered experimentally.

The only supported use of individual access to fragment elements is when the operation to be done is uniform across all threads in the warp and all fragment elements. From the documentation:

Because the map of matrix elements into each thread’s fragment is unspecified

In the special case where all threads in the warp will apply an element-wise operation uniformly to all fragment elements, direct element access can be implemented using the following fragment class members.
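For illustration, that uniform pattern looks roughly like this (a sketch, assuming a c_frag accumulator fragment as in the code above; the key requirement is that scale has the same value in every thread of the warp):

    // Uniform element-wise scaling: every thread applies the same operation
    // to every element it holds, so the unspecified mapping does not matter.
    float scale = 2.0f;  // must be uniform across the warp
    for (int i = 0; i < c_frag.num_elements; i++) {
        c_frag.x[i] = c_frag.x[i] * scale;
    }

Multiplying by threadIdx.x, as in the kernel above, violates this requirement, which is why the result depends on the unspecified mapping.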

For people looking for direct control of the matrix-multiply operands, I usually suggest using PTX mma instructions instead, since those have a documented per-thread register layout. Here is an example.
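As a rough sketch of what the PTX route looks like (my own illustration, not the linked example; this assumes sm_80 or later and the m16n8k16 shape from the PTX ISA, where A occupies four 32-bit registers of packed halves, B two, and C/D four f32 registers per thread):

    // One warp computes D = A*B + C for a 16x8 output tile (m16n8k16).
    // a[4] and b[2] hold packed pairs of half values; c[4], d[4] are f32.
    // The exact element-to-thread mapping is documented in the PTX ISA.
    __device__ void mma_16x8x16(float d[4], const unsigned a[4],
                                const unsigned b[2], const float c[4]) {
        asm volatile(
            "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
            "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
            : "=f"(d[0]), "=f"(d[1]), "=f"(d[2]), "=f"(d[3])
            : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]),
              "r"(b[0]), "r"(b[1]),
              "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
    }

Because the register layout is specified, you know exactly which matrix elements each thread holds and can manipulate them individually.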

Thanks for your reply. Actually, I only need to do matrix multiplication and element-wise multiplication, so I think WMMA is enough for me. Thanks to your reply, I now know how to handle the element-wise multiplication.

But I still have another question. If I want to copy one fragment to another, say copy C to D, will an operation like the one below be faster than storing C to shared memory and then loading it into D?

for(int i = 0; i < C.num_elements; i++) D.x[i] = C.x[i];

I have no idea. If it were me, and I were using wmma operations, I would use the load and store functionality to copy one fragment to another.
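For reference, that store/load round trip would look roughly like this (a sketch; shmem is a hypothetical 16x16 float buffer in shared memory, and c_frag and d_frag are both 16x16x16 float accumulator fragments):

    __shared__ float shmem[16 * 16];
    // Store C: the unspecified per-thread mapping is resolved to row-major memory...
    wmma::store_matrix_sync(shmem, c_frag, 16, wmma::mem_row_major);
    __syncwarp();
    // ...then load the same tile back into D (accumulator loads need a layout).
    wmma::load_matrix_sync(d_frag, shmem, 16, wmma::mem_row_major);

That said, since C and D here have the same fragment type, the element-wise copy loop is a uniform per-element operation across the warp, so it falls under the documented special case as well.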

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.