Concurrent Kernel using GTX 570 on WinXp Concurrent Kernel

Hi all,
I am currently working on concurrent kernel execution.
I wrote a program to test concurrent kernel execution on my desktop (WinXP, VS2008, CUDA 3.2, GTX 570).
However, I don’t see much speedup.
Can anyone give me some advice?
Thank you very much.

The code is as follows:
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>  // malloc/free; stands in for the empty `#include <>` whose header name was lost
#include "cutil_inline.h"
#include "cuda_runtime.h"
#include <cutil.h>
#include "cudaHelper.h"

using namespace std;

/**
 * Busy-work kernel used only to occupy the GPU for a measurable amount of time.
 *
 * Each thread runs n iterations of transcendental math and stores the final
 * (truncated) value at a[threadIdx.x].
 *
 * Launch layout: the output index is threadIdx.x only, so every block of a
 * launch writes the same blockDim.x elements; `a` must therefore point to at
 * least blockDim.x ints. The host code relies on this by handing each launch
 * an nthreads-sized slice of one shared buffer.
 *
 * Note: the i == 0 term is sin(0) + tan(0) == 0, so `value` becomes 0 after
 * the first iteration; the stored result is not meaningful, only the run time.
 *
 * @param a  device pointer to the output slice (>= blockDim.x ints)
 * @param n  number of loop iterations each thread performs
 */
__global__ void mykernel(int *a, int n)
{
    int idx = threadIdx.x;
    int value = 1;

    for (int i = 0; i < n; i++)
    {
        // Single-precision intrinsics: the arguments are already float.
        value *= sinf((float)i) + tanf((float)i);
    }

    a[idx] = value;
}
/**
 * Times nkernels launches of mykernel issued into separate streams, then times
 * a single launch in the default stream for comparison.
 *
 * Command line: passing the "qatest" flag prints a PASSED banner at the end.
 *
 * @return 0 on success, 1 if the host-side stream array cannot be allocated.
 *         CUDA API failures are handled by cutilSafeCall / cutilCheckMsg.
 */
int main(int argc, const char **argv)
{
    int nblocks = 4;
    int nthreads = 64;
    int n = 500000;
    int nkernels = 8;
    int nbytes;

    int devID;
    cudaDeviceProp deviceProps;
    int *d_A = 0;
    int *d_B = 0;
    cudaStream_t *stream;
    cudaEvent_t start, stop;
    float elapsedTime;

    int qatest = 0;

    printf("[concurrentKernels] - Starting...\n\n");
    devID = 0;

    // QA testing mode
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "qatest"))
    {
        qatest = 1;
    }

    cutilSafeCall(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA Device %s has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);
    printf("CUDA Device %s is%s capable of concurrent kernel execution\n", deviceProps.name, (deviceProps.concurrentKernels==0)?" NOT":"");

    // One stream per kernel so the launches are allowed to overlap.
    stream = (cudaStream_t *)malloc(nkernels * sizeof(cudaStream_t));
    if (stream == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory for %d streams\n", nkernels);
        return 1;
    }

    // create streams
    for (int i = 0; i < nkernels; i++)
    {
        cutilSafeCall(cudaStreamCreate(&stream[i]));
    }

    // Events must be created before they can be recorded.
    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    // note: in this sample we will repeatedly overwrite the same
    // block of device mem, but that's okay because we don't really
    // care about the output of the kernel for the purposes of this
    // example.
    nbytes = nkernels * nthreads * sizeof(int);
    cutilSafeCall(cudaMalloc((void **)&d_A, nbytes));
    cutilSafeCall(cudaMalloc((void **)&d_B, nbytes));

    // start timer then launch all kernels in their streams
    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < nkernels; i++)
    {
        // avoid synchronization points (events, error checks, etc.) inside
        // this loop in order to get concurrent execution on devices that support it
        mykernel<<<nblocks, nthreads, 0, stream[i]>>>(&d_A[i*nthreads], n);
    }

    // NOTE(review): `stop` is recorded in stream 0; this relies on the legacy
    // default stream ordering itself after work in the other streams, so the
    // event marks completion of all launches -- confirm for the toolkit in use.
    cutilSafeCall(cudaEventRecord(stop, 0));

    // wait for all streams to finish: the elapsed time is only valid once
    // the stop event has actually completed on the device.
    cutilSafeCall(cudaEventSynchronize(stop));

    // Launch-configuration errors surface here, outside the timed launch loop.
    cutilSafeCall(cudaGetLastError());

    // get total time for all kernels
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("\nAll %d kernels together took %.3fs\n", nkernels, elapsedTime/1000.f);

    cudaEvent_t start1, stop1;
    float elapsedTime1 = 0.0f;

    cutilSafeCall(cudaEventCreate(&start1));
    cutilSafeCall(cudaEventCreate(&stop1));

    // check time to execute a single iteration
    cutilSafeCall(cudaEventRecord(start1, 0));

    mykernel<<<nblocks, nthreads>>>(d_B, n);

    cutilCheckMsg("kernel launch failure");
    cutilSafeCall(cudaEventRecord(stop1, 0));
    cutilSafeCall(cudaEventSynchronize(stop1));
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime1, start1, stop1));

    // This is the time of ONE launch; a fully serialized run of all kernels
    // would be expected to take roughly nkernels times this value.
    printf("if no concurrent execution the time is %.3fs\n",elapsedTime1/1000.f);

    // cleanup
    printf("\nCleaning up...\n");
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    cutilSafeCall(cudaEventDestroy(start1));
    cutilSafeCall(cudaEventDestroy(stop1));

    if (stream)
    {
        for (int i = 0; i < nkernels; i++)
        {
            cutilSafeCall(cudaStreamDestroy(stream[i]));
        }
        free(stream);
    }
    if (d_A)
    {
        cutilSafeCall(cudaFree(d_A));
    }
    if (d_B)
    {
        cutilSafeCall(cudaFree(d_B));
    }

    if (qatest)
    {
        // any errors that might have happened will have already been reported
        printf("[concurrentKernels] - Test Results:\nPASSED\n");
    }

    return 0;
}