Concurrent kernel execution using a GTX 570 on Windows XP

Hi all,
I am currently working on concurrent kernel execution.
I wrote a program to test concurrent kernels on my desktop (Windows XP, VS2008, CUDA 3.2, GTX 570),
but I don’t see much speedup.
Can anyone give me some advice?
Thank you very much.

The code is as follows:
#include <stdio.h>
#include <cuda.h>
#include <cuPrintf.cu>
#include "cutil_inline.h"
#include "cuda_runtime.h"
#include
#include <cutil.h>
#include "cudaHelper.h"

using namespace std;

/*
 * Busy-work kernel used only to keep the GPU occupied for a measurable time.
 *
 * Each thread runs n loop iterations of sinf/tanf arithmetic and then stores
 * its result to a[threadIdx.x].
 *
 * Parameters:
 *   a - device buffer; must hold at least blockDim.x ints.
 *   n - number of loop iterations per thread (controls run time).
 *
 * Launch layout: only threadIdx.x is used for indexing (blockIdx is ignored),
 * so threads with the same threadIdx.x in different blocks store to the same
 * element. Every thread computes the same value, so the stored result does
 * not depend on which thread writes last.
 */
__global__ void mykernel( int *a, int n )
{
    int idx = threadIdx.x;
    int value = 1;
    for (int i = 0; i < n; i++)
    {
        // The int accumulator is converted to float for the multiply and
        // truncated back to int on assignment. The first iteration multiplies
        // by sinf(0) + tanf(0) == 0, so the loop is a delay, not a computation.
        value *= sinf( (float)i ) + tanf( (float)i );
    }
    a[idx] = value;
}
/*
 * Concurrent-kernel timing test.
 *
 * Launches `nkernels` copies of mykernel, each in its own CUDA stream, and
 * measures the wall time for the whole batch with events recorded in the
 * default stream. It then times a single launch in the default stream and
 * prints that time plus the estimate for running `nkernels` of them back to
 * back, so the two figures can be compared.
 *
 * Command line: passing the "qatest" flag (parsed by cutCheckCmdLineFlag)
 * makes the program print a PASSED banner at the end.
 *
 * Returns 0 on success, 1 if the host-side stream array cannot be allocated.
 * CUDA API failures are routed through cutilSafeCall / cutilCheckMsg
 * (SDK helpers; presumably they report the error and terminate).
 */
int main(int argc, const char **argv)
{
    int nblocks  = 4;
    int nthreads = 64;
    int n        = 500000;   // loop iterations per thread inside mykernel
    int nkernels = 8;        // one stream per kernel launch
    int nbytes;

    int devID;
    cudaDeviceProp deviceProps;

    int *d_A = 0;
    int *d_B = 0;
    cudaStream_t *stream;
    cudaEvent_t start, stop;
    float elapsedTime;

    int qatest = 0;

    printf("[concurrentKernels] - Starting...\n\n");
    devID = 0;
    cutilSafeCall(cudaSetDevice(devID));

    // QA testing mode
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "qatest"))
    {
        qatest = 1;
    }

    cutilSafeCall(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA Device %s has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);
    printf("CUDA Device %s is%s capable of concurrent kernel execution\n", deviceProps.name, (deviceProps.concurrentKernels==0)?" NOT":"");

    // Validate the host allocation before the array is handed to
    // cudaStreamCreate; the original only tested it during cleanup.
    stream = (cudaStream_t *)malloc(nkernels * sizeof(cudaStream_t));
    if (stream == NULL)
    {
        fprintf(stderr, "failed to allocate %d stream handles\n", nkernels);
        return 1;
    }

    // create streams
    for (int i = 0; i < nkernels; i++)
    {
        cutilSafeCall(cudaStreamCreate(&stream[i]));
    }

    // note: the kernel output is never read back; the buffers exist only so
    // that every launch has a valid destination. Kernel i writes to the
    // nthreads-element slice of d_A starting at i*nthreads.
    nbytes = nkernels * nthreads * sizeof(int);
    cutilSafeCall(cudaMalloc((void **)&d_A, nbytes));
    cutilSafeCall(cudaMalloc((void **)&d_B, nbytes));

    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    // start timer then launch all kernels in their streams
    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < nkernels; i++)
    {
        // avoid synchronization points (events, error checks, etc.) inside
        // this loop in order to get concurrent execution on devices that support it
        mykernel<<<nblocks, nthreads, 0, stream[i]>>>(&d_A[i*nthreads], n);
    }
    // Host-side query only, issued after all launches so nothing is inserted
    // between them; catches an invalid launch configuration.
    cutilSafeCall(cudaGetLastError());

    cutilSafeCall(cudaEventRecord(stop, 0));
    // wait for all streams to finish
    cutilSafeCall(cudaEventSynchronize(stop));
    // get total time for all kernels
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("\nAll %d kernels together took %.3fs\n", nkernels, elapsedTime/1000.f);

    // check time to execute a single kernel in the default stream
    cudaEvent_t start1, stop1;
    float elapsedTime1 = 0.0f;
    cutilSafeCall(cudaEventCreate(&start1));
    cutilSafeCall(cudaEventCreate(&stop1));

    cutilSafeCall(cudaEventRecord(start1, 0));
    mykernel<<<nblocks, nthreads>>>(d_B, n);
    cutilCheckMsg("kernel launch failure");
    cutilSafeCall(cudaEventRecord(stop1, 0));
    cutilSafeCall(cudaEventSynchronize(stop1));
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime1, start1, stop1));

    printf("if no concurrent execution the time is %.3fs\n",elapsedTime1/1000.f);
    // The figure above is for ONE kernel; the batch above ran nkernels of
    // them, so the fair serial baseline is nkernels times that.
    printf("Expected time for serial execution of %d kernels is %.3fs\n", nkernels, nkernels * elapsedTime1/1000.f);

    // cleanup
    printf("\nCleaning up...\n");
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    // The second event pair was previously leaked.
    cutilSafeCall(cudaEventDestroy(start1));
    cutilSafeCall(cudaEventDestroy(stop1));

    for (int i = 0; i < nkernels; i++)
    {
        cutilSafeCall(cudaStreamDestroy(stream[i]));
    }
    free(stream);

    if (d_A)
        cutilSafeCall(cudaFree(d_A));
    if (d_B)
        cutilSafeCall(cudaFree(d_B));

    if (qatest)
    {
        // any errors that might have happened will have already been reported
        printf("[concurrentKernels] - Test Results:\nPASSED\n");
    }

    return 0;
}