Disappointing GFLOP/s: how can I improve them?

Hey guys, yeah, me again with another question. The following program calls cublasSaxpy repeatedly and measures the throughput. Even with `#pragma unroll` and a warm-up run, I only get up to 0.6 GFLOP/s. Is there something wrong with my code?

Graphics card: GeForce 8400M GS, driver 186.03, CUDA 2.2

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas.h>

/* Number of float elements in each vector. Kept as a double expression (as
 * before) but parenthesized so it expands safely inside larger expressions
 * such as `x / N`. */
#define N (8 * 1e6)

/* Number of SAXPY calls issued in the timed loop. */
#define ITERS 1000

int main(int argc, char** argv) {

	cublasStatus stat;

	float *xptr;

	float *yptr;

	float *vecx;

	float *vecy;

	double diff, diffs;

	double speed;

	clock_t begin, end;

	int i;

	stat = cublasInit();

	

	if(cublasInit() == CUBLAS_STATUS_NOT_INITIALIZED) {

		printf("CUBLAS init error.\n");

		return 1;

	}

	stat = cublasAlloc(N, sizeof(*vecx), (void **) &xptr);

	

	if(stat == CUBLAS_STATUS_SUCCESS) {

		printf("Memory for vecx allocated\n\n");

	}

	else {

		printf("!!!Memory not allocated\n\n");

		return 1;

	}

	

	stat = cublasAlloc(N, sizeof(*vecy), (void **) &yptr);

	if(stat == CUBLAS_STATUS_SUCCESS) {

		printf("Memory for vecy allocated\n\n");

	}

	else {

		printf("!!!Memory not allocated\n\n");

		return 1;

	}

	vecx = (float *) malloc( N * sizeof(float) );

	vecy = (float *) malloc( N * sizeof(float) );

	

	for(i = 0; i < N; ++i) {

		vecx[i] = 0.5;

		vecy[i] = 0.3;

	}

	cublasSetVector(N, sizeof(*vecx), vecx, 1, xptr, 1);

	cublasSetVector(N, sizeof(*vecy), vecy, 1, yptr, 1);

	

	for(i = 0; i < ITERS; ++i) {

		cublasSaxpy(N, 2.0f, xptr, 1, yptr, 1);

	}

	

	printf("\n\nINFO: Warmup completed\n\n");

	cudaThreadSynchronize();

	

	begin = clock();

	

	#pragma unroll 1000

	for(i = 0; i < ITERS; ++i) {

		cublasSaxpy(N, 2.0f, xptr, 1, yptr, 1);

	}

	cudaThreadSynchronize();

	end = clock();

	diff = end - begin;

	diffs = diff / CLOCKS_PER_SEC;

	speed = (ITERS * 1E-9 * 2 * N) / diffs;

	cublasGetVector(N, sizeof(float), yptr, 1, vecy, 1);

	

	free(vecx);

	free(vecy);

	cublasFree(xptr);

	cublasFree(yptr);

	cublasShutdown();

	printf("RESULTS OF COMPUTING:\n\n");

	printf("Vector size: %6.0f\n", N);

	printf("Number of iterations: %d\n", ITERS);

	printf("Elapsed Time [s]: %3f\n", diffs);

	printf("Speed [GFLOP/s]: %6f", speed);

	scanf("%d", i);

return EXIT_SUCCESS;

}