[codebox]/************************************************************
created: 2009/09/19
created: 19:9:2009 12:00
filename: vector_matrix_multiplication.cu
file base: vector_matrix_multiplication
file ext: CUDA
author: zhao.kaiyong(at)gmail.com
purpose: vector matrix multiplication
copyright: everyone can use this code, please specify source.
ä»»æ„使用,请注明出处;
<a target='_blank' rel='noopener noreferrer' href='"http://www.hpctech.com"'>http://www.hpctech.com</a>
<a target='_blank' rel='noopener noreferrer' href='"http://openhero.net"'>http://openhero.net</a>
http://www.comp.hkbu.edu.hk/~kyzhao/
*********/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
/************************************************************
************/
/* Init CUDA */
/************************************************************
************/
#if DEVICE_EMULATION
bool InitCUDA(void){return true;}
bool InitCUDA(void)
{
int count = 0;
int i = 0;
cudaGetDeviceCount(&count);
if(count == 0) {
fprintf(stderr, "There is no device.\n");
return false;
}
for(i = 0; i < count; i++) {
cudaDeviceProp prop;
if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
if(prop.major >= 1) {
break;
}
}
}
if(i == count) {
fprintf(stderr, "There is no device supporting CUDA.\n");
return false;
}
cudaSetDevice(i);
printf(“CUDA initialized.\n”);
return true;
}
/************************************************************
************/
/* Example */
/************************************************************
************/
global static void vector_matrix_mult_kernel(float* A, long wA, float* B, long wB, float* C)
{
__shared__ float subA[64];
A = A + threadIdx.x;
B = B + blockIdx.x * 64 + threadIdx.x;
C = C + blockIdx.x * 64 + threadIdx.x;
float subC = 0.0;
for (int i = 0; i < wA; i+=64)
{
subA[threadIdx.x] = A[ i];
__syncthreads();
#pragma unroll
for (int j = 0; j < 64; j++, B += wB)
{
subC += subA[j] * B[0];
}
__syncthreads();
}
C[0] = subC;
}
//global static void vector_matrix_mult_kernel_32T(float* A, long wA, float* B, long wB, float* C)
//{
//}
#define RUN_TEST
#define WA 32
#define WB 64
/************************************************************
************/
/* HelloCUDA */
/************************************************************
************/
int main(int argc, char* argv)
{
if(!InitCUDA()) {
return 0;
}
srand(2009);
long wA = 64* WA;
long wB = 64* WB;
long size_A = wA;
long size_B = wA*wB;
long size_C = wB;
float hA = (float)malloc(sizeof(float) * size_A);
float *hB = (float*)malloc(sizeof(float) * size_B);
float *hC = (float*)malloc(sizeof(float) * size_C);
float *testhC = (float*)malloc(sizeof(float) * size_C);
for (int i = 0; i < size_A; i++)
{
hA[ i] = (float)rand()/(float)RAND_MAX;
}
for (int i = 0; i < size_B; i++)
{
hB[ i] = (float)rand()/(float)RAND_MAX;
}
float *dA = 0;
float *dB = 0;
float *dC = 0;
CUDA_SAFE_CALL( cudaMalloc((void**) &dA, sizeof(float) * size_A));
CUDA_SAFE_CALL( cudaMalloc((void**) &dB, sizeof(float) * size_B));
CUDA_SAFE_CALL( cudaMalloc((void**) &dC, sizeof(float) * size_C));
CUDA_SAFE_CALL( cudaMemcpy(dA, hA, sizeof(float)*size_A, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL( cudaMemcpy(dB, hB, sizeof(float)*size_B, cudaMemcpyHostToDevice));
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
dim3 threads = 64;
dim3 blocks = wB/64;
vector_matrix_mult_kernel<<<blocks, threads>>>(dA, wA, dB, wB, dC);
CUT_CHECK_ERROR("Kernel execution failed\n");
CUDA_SAFE_CALL( cudaMemcpy(hC, dC, sizeof(float) * size_C, cudaMemcpyDeviceToHost));
CUT_SAFE_CALL( cutStopTimer( timer));
printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutResetTimer( timer));
for (int i = 0; i < wB; i++)
{
float subC = 0.0;
for (int j = 0; j <wA; j++)
{
subC += hA[ j] * hB[j*wB + i];
}
testhC[ i] = subC;
}
CUT_SAFE_CALL( cutStopTimer( timer));
printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));
#ifdef RUN_TEST
CUTBoolean res = cutCompareL2fe(testhC, hC, size_C, 1e-6f);
printf("Test %s \n", (1 == res) ? "PASSED":"FAILED");
CUDA_SAFE_CALL( cudaFree(dA));
CUDA_SAFE_CALL( cudaFree(dB));
CUDA_SAFE_CALL( cudaFree(dC));
free(hA);
free(hB);
free(hC);
CUT_EXIT(argc, argv);
return 0;
}
[/codebox]