Vector-matrix multiplication: sharing my code :)

[codebox]/************************************************************

    created:        2009/09/19   12:00
    filename:       vector_matrix_multiplication.cu
    file base:      vector_matrix_multiplication
    file ext:       CUDA
    author:         zhao.kaiyong(at)gmail.com

    purpose:        vector-matrix multiplication

    copyright:      anyone may use this code; please credit the source.
                    http://www.hpctech.com
                    http://openhero.net
                    http://www.comp.hkbu.edu.hk/~kyzhao/

************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>

/************************************************************/
/* Init CUDA */
/************************************************************/

#if __DEVICE_EMULATION__

bool InitCUDA(void) { return true; }

#else

bool InitCUDA(void)
{
    int count = 0;
    int i = 0;

    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }

    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    cudaSetDevice(i);

    printf("CUDA initialized.\n");
    return true;
}

#endif

/************************************************************/
/* Example */
/************************************************************/

/* C (1 x wB) = A (1 x wA) * B (wA x wB).
   Each block computes 64 consecutive elements of C; thread t handles
   column blockIdx.x * 64 + t. A is staged into shared memory 64
   elements at a time, so wA must be a multiple of 64. */
__global__ static void vector_matrix_mult_kernel(float* A, long wA, float* B, long wB, float* C)
{
    __shared__ float subA[64];

    A = A + threadIdx.x;
    B = B + blockIdx.x * 64 + threadIdx.x;
    C = C + blockIdx.x * 64 + threadIdx.x;

    float subC = 0.0f;

    for (int i = 0; i < wA; i += 64)
    {
        /* each thread loads one element of the current 64-wide tile of A */
        subA[threadIdx.x] = A[i];
        __syncthreads();

#pragma unroll
        for (int j = 0; j < 64; j++, B += wB)
        {
            subC += subA[j] * B[0];
        }
        __syncthreads();
    }

    C[0] = subC;
}

//__global__ static void vector_matrix_mult_kernel_32T(float* A, long wA, float* B, long wB, float* C)
//{
//}

#define RUN_TEST

#define WA 32
#define WB 64

/************************************************************/
/* HelloCUDA */
/************************************************************/

int main(int argc, char* argv[])
{
    if (!InitCUDA()) {
        return 0;
    }

    srand(2009);

    long wA = 64 * WA;
    long wB = 64 * WB;

    long size_A = wA;
    long size_B = wA * wB;
    long size_C = wB;

    float *hA     = (float*)malloc(sizeof(float) * size_A);
    float *hB     = (float*)malloc(sizeof(float) * size_B);
    float *hC     = (float*)malloc(sizeof(float) * size_C);
    float *testhC = (float*)malloc(sizeof(float) * size_C);

    for (int i = 0; i < size_A; i++)
    {
        hA[i] = (float)rand() / (float)RAND_MAX;
    }

    for (int i = 0; i < size_B; i++)
    {
        hB[i] = (float)rand() / (float)RAND_MAX;
    }

    float *dA = 0;
    float *dB = 0;
    float *dC = 0;

    CUDA_SAFE_CALL( cudaMalloc((void**) &dA, sizeof(float) * size_A));
    CUDA_SAFE_CALL( cudaMalloc((void**) &dB, sizeof(float) * size_B));
    CUDA_SAFE_CALL( cudaMalloc((void**) &dC, sizeof(float) * size_C));

    CUDA_SAFE_CALL( cudaMemcpy(dA, hA, sizeof(float) * size_A, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL( cudaMemcpy(dB, hB, sizeof(float) * size_B, cudaMemcpyHostToDevice));

    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    dim3 threads(64, 1);
    dim3 blocks(wB / 64, 1);
    vector_matrix_mult_kernel<<<blocks, threads>>>(dA, wA, dB, wB, dC);
    CUT_CHECK_ERROR("Kernel execution failed\n");

    CUDA_SAFE_CALL( cudaMemcpy(hC, dC, sizeof(float) * size_C, cudaMemcpyDeviceToHost));
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("GPU processing time: %f (ms)\n", cutGetTimerValue( timer));

    /* time the CPU reference as well */
    CUT_SAFE_CALL( cutResetTimer( timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    for (int i = 0; i < wB; i++)
    {
        float subC = 0.0f;
        for (int j = 0; j < wA; j++)
        {
            subC += hA[j] * hB[j * wB + i];
        }
        testhC[i] = subC;
    }

    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("CPU processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

#ifdef RUN_TEST
    CUTBoolean res = cutCompareL2fe(testhC, hC, size_C, 1e-6f);
    printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED");
#endif

    CUDA_SAFE_CALL( cudaFree(dA));
    CUDA_SAFE_CALL( cudaFree(dB));
    CUDA_SAFE_CALL( cudaFree(dC));
    free(hA);
    free(hB);
    free(hC);
    free(testhC);

    CUT_EXIT(argc, argv);

    return 0;
}

[/codebox]

Hello,
The code seems to work fine :)
However, try it with #define WA (10*640) and #define WB 1: the test will not pass. I have my own CUDA vector-matrix multiplication code as well and I hit the same problem, and the same thing happens with cublasSgemm.
:)
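
My guess, not a confirmed diagnosis: with WA defined as (10*640) each output element is a dot product over 64*10*640 = 409600 terms, and both the kernel and the CPU reference accumulate that sum in single precision, so the fixed 1e-6 relative tolerance passed to cutCompareL2fe() is probably just too tight for that length rather than the kernel being wrong. Below is a minimal host-side check I would try as a sketch; the check_result() helper and the sqrt(wA)*FLT_EPSILON bound are my own assumptions, not part of the code above. It recomputes the reference in double precision and scales the allowed error with the dot-product length.

[codebox]#include <stdio.h>
#include <math.h>
#include <float.h>

/* Relative L2 error of the GPU result hC against a double-precision
   reference computed on the host from hA (1 x wA) and hB (wA x wB). */
static int check_result(const float* hA, const float* hB, const float* hC,
                        long wA, long wB)
{
    double err2 = 0.0, ref2 = 0.0;
    for (long i = 0; i < wB; i++) {
        double ref = 0.0;                      /* accumulate in double */
        for (long j = 0; j < wA; j++)
            ref += (double)hA[j] * (double)hB[j * wB + i];
        double d = (double)hC[i] - ref;
        err2 += d * d;
        ref2 += ref * ref;
    }
    double rel = sqrt(err2 / ref2);

    /* Rounding error of a length-wA float summation grows roughly like
       sqrt(wA) * FLT_EPSILON for random data, so a fixed 1e-6 bound cannot
       hold once wA gets large. This bound is a heuristic, not a guarantee. */
    double bound = sqrt((double)wA) * FLT_EPSILON;
    printf("relative L2 error: %g (heuristic bound: %g)\n", rel, bound);
    return rel <= bound;
}[/codebox]

If the reported error stays around the heuristic bound, the mismatch is ordinary float rounding and the kernel (and cublasSgemm) are fine; if it is orders of magnitude larger, there really is a bug, for example in the indexing for the WB 1 case.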