I am developing an algorithm on the GPU that involves some calls to cuBLAS on the device. I am using a Tesla K20c (compute capability 3.5) with CUDA 7.0; the K20c has a theoretical peak of 3.52 TFLOPS in single precision. I worked out the flop count for my algorithm, which consists mainly of matrix-vector multiplication and vector-vector addition, as 3,900,109. A back-of-the-envelope calculation then gives 3900109 / 3.52e12 ≈ 1.108e-6 s, i.e. about 1.1 usec.

I have written the kernel to use clock64() to time a portion of it, and I have been calling the kernel through PyCUDA in a for loop during a simulation to collect the elapsed clock ticks. Over the 400 or so cycles of the loop the average is 94451 ticks, and 94451 / 706 MHz is ~133 usec.

I realize I will never get the theoretical peak performance from my GPU, but I feel like my algorithm should be closer to ~1 usec than ~133 usec. I post my kernel code below; if you have any bright ideas on how to make it faster, please let me know. I am kind of new to GPU computing. Thanks.
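
For reference, here is the arithmetic behind those two numbers as a small host-side snippet (the flop and tick counts are the values quoted above; cudaDeviceProp.clockRate is reported in kHz):

```
#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    //lower bound: flop count divided by the K20c's single-precision peak
    double t_peak = 3900109.0 / 3.52e12;              //~1.108e-6 s

    //measured: average clock64() ticks divided by the core clock (kHz -> Hz)
    double t_meas = 94451.0 / (prop.clockRate * 1e3); //~1.34e-4 s at 706 MHz

    printf("peak-bound estimate: %8.3f usec\n", t_peak * 1e6);
    printf("measured average:    %8.3f usec\n", t_meas * 1e6);
    return 0;
}
```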

```
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_cuda.h>
extern "C" {
#define MATRIX_SIZE 1384
#define G_WIDTH 3
#define C_HEIGHT 22
#define BLOCK_SIZE 1384
#define LLCV2_STATUS_TLATCH 6
#define NCHAN 96
#define TLATCH_OFFSET ((NCHAN/2) + LLCV2_STATUS_TLATCH)
#define COUNTSPERVOLT_SCALED ((1. / 30.) * (32767. / 10.)) //converts to counts and then scales by audio amp gain
#define VOLTSPERCOUNT (10. / 32767.)
#define AO32MAX 0x8000 //max output of AO32 in counts 0x8000 = 10V
typedef unsigned int u32;
typedef unsigned short u16;
typedef signed short s16;
__global__ void valiant(const float *Phi, //closed-loop dynamics matrix
const float *Ko, //observer gain
const float *Kc, //controller gain
float *xhatp, //past state estimate
float *xhatf, //future state estimate
u32* d_hb, //device inbuffer<-ACQ196
s16* d_ao_hb, //device outbuffer->AO32
float *u, //system input
//float *y, //output vector
float *tmp, //temp array holder
uint64_t *clk, //time measurement
const u32 tlatch_go, //start feedback when d_hb[TLATCH_OFFSET] = tlatch_go
const u32 tlatch_stop //stop feedback when d_hb[TLATCH_OFFSET] = tlatch_stop
) {
float y[C_HEIGHT]; //measurement vector
//uint tx = threadIdx.x;
//s16 ao[3]; //AO32 output
//set memory locations
//build measurement vector
//unpack two s16 samples from each u32 word of the inbuffer
#pragma unroll
for (int i = 0; i < C_HEIGHT / 2; ++i) {
    y[2 * i]     = (float)((s16)d_hb[i]);
    y[2 * i + 1] = (float)((s16)(d_hb[i] >> 0x10));
}
const float alpha = 1.;
const float nalpha = -1.;
const float zero = 0.;
const float *d_alpha = &alpha;
const float *neg_alpha = &nalpha;
const float *d_zero = &zero;
cublasHandle_t cnpHandle;
cublasStatus_t status = cublasCreate(&cnpHandle);
if (status != CUBLAS_STATUS_SUCCESS)
{
return;
}
uint64_t start_time = clock64(); //start kernel timer
//if (tx == 0) {
if (d_hb[TLATCH_OFFSET] >= tlatch_go && d_hb[TLATCH_OFFSET] <= tlatch_stop) {
//LQG in 3 steps
//xhatf = Phi*xhatp + Ko*y
status = cublasSgemv(cnpHandle,
CUBLAS_OP_N,
MATRIX_SIZE, MATRIX_SIZE,
d_alpha,
Phi, MATRIX_SIZE,
xhatp, 1,
d_zero,
xhatf, 1);
status = cublasSgemv(cnpHandle,
CUBLAS_OP_N,
MATRIX_SIZE, C_HEIGHT,
d_alpha,
Ko, MATRIX_SIZE,
y, 1,
d_zero,
tmp, 1);
status = cublasSaxpy(cnpHandle,
MATRIX_SIZE,
d_alpha,
tmp, 1,
xhatf, 1);
//u = -Kc*xhatf
status = cublasSgemv(cnpHandle,
CUBLAS_OP_N,
G_WIDTH, MATRIX_SIZE,
neg_alpha,
Kc, G_WIDTH,
xhatf, 1,
d_zero,
u, 1);
//do time advancement
status = cublasScopy(cnpHandle,
MATRIX_SIZE,
xhatf, 1,
xhatp, 1);
//}
//__syncthreads();
//build output vector, saturating at the AO32 limit; the magnitude check is
//done on the float value before the cast to s16, since comparing the
//already-cast s16 against AO32MAX (0x8000) could never fail
#pragma unroll
for (int i = 0; i < G_WIDTH; ++i) {
    float v = u[i] * COUNTSPERVOLT_SCALED;
    if (fabsf(v) < (float)AO32MAX)
        d_ao_hb[i] = (s16)v;
    else
        d_ao_hb[i] = (s16)copysignf((float)AO32MAX, v);
}
}
uint64_t stop_time = clock64();
cublasDestroy(cnpHandle); //release the per-launch handle so repeated calls don't leak
//clk[0] = stop_time > start_time ? stop_time - start_time : stop_time + (0xffffffff - start_time);
clk[0] = stop_time - start_time;
}
```
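
As a sanity check on the clock64() numbers I can also time the whole launch from the host with CUDA events. A minimal sketch, with the actual kernel launch (which PyCUDA performs in my case) left as a commented placeholder:

```
#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);

    cudaEventRecord(t0);
    //valiant<<<1, 1>>>(...); //kernel launch goes here
    cudaEventRecord(t1);
    cudaEventSynchronize(t1);

    float ms = 0.f;
    cudaEventElapsedTime(&ms, t0, t1); //includes launch overhead, unlike clock64()
    printf("wall time: %.1f usec\n", ms * 1e3);

    cudaEventDestroy(t0);
    cudaEventDestroy(t1);
    return 0;
}
```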