the computing time with more instructions in one thread

I did a experinment with a simple vector add
in case 1, I set only one instruction in one thread, like c[id] = a[id] + b[id];
and in case 2,I set 3 instructions in one thread like c[id] = a[id] + b[id]; d[id] = c[id] * f[id]; u[id] = d[id] * f[id];

after that, I use the cuda profiler to calculate the computing time, and these 2 case have the same computing time.

I am very confused about that and I have several questions.

  1. why these 2 cases have same computing time? In my view, case 2 should have 3 times computing time than case 1 i need to use syncthread between different instructions?

the whole code is following:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include “cuda_runtime.h”
#include “device_launch_parameters.h”

// CUDA kernel. Each thread takes care of one element of c
global void vecAdd(double *a, double *b, double *c, double *d, double *e, double *f, double u, int n)
// Get our global thread ID
int id = blockIdx.x
blockDim.x + threadIdx.x;

// Make sure we do not go out of bounds
int k;
while (id < n) {
	for (k = 0; k < 18000; k++)
		c[id] = a[id] + b[id];
	d[id] = c[id] * f[id];

	u[id] = d[id] * f[id];

	id += gridDim.x*blockDim.x;


int main(int argc, char* argv)
// Size of vectors
int n = 1024;

// Host input vectors
double *h_a;
double *h_b;
//Host output vector
double *h_c;
double *h_d;
double *h_e;
double *h_f;
double *h_u;

// Device input vectors
double *d_a;
double *d_b;
//Device output vector
double *d_c;
double *d_d;
double *d_e;
double *d_f;
double *d_u;

// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);

// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
h_d = (double*)malloc(bytes);
h_e = (double*)malloc(bytes);
h_f = (double*)malloc(bytes);
h_u = (double*)malloc(bytes);

// Allocate memory for each vector on GPU
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
cudaMalloc(&d_d, bytes);
cudaMalloc(&d_e, bytes);
cudaMalloc(&d_f, bytes);
cudaMalloc(&d_u, bytes);

int i;
// Initialize vectors on host
for (i = 0; i < n; i++) {
	h_a[i] = sin(i)*sin(i);
	h_b[i] = cos(i)*cos(i);
	h_e[i] = sin(i)*sin(i);
	h_f[i] = cos(i)*cos(i);

// Copy host vectors to device
cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_e, h_e, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_f, h_f, bytes, cudaMemcpyHostToDevice);

int blockSize, gridSize;

// Number of threads in each thread block
blockSize = 1024;

// Number of thread blocks in grid
gridSize = 1;

// Execute the kernel
vecAdd << <gridSize, blockSize >> >(d_a, d_b, d_c, d_d, d_e, d_f, d_u, n);

// Copy array back to host
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(h_d, d_d, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(h_u, d_u, bytes, cudaMemcpyDeviceToHost);

// Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0;
for (i = 0; i<n; i++)
	sum += h_c[i];
printf("final result: %f\n", sum);

double sum1 = 0;
for (i = 0; i<n; i++)
	sum1 += h_d[i];
printf("final result1: %f\n", sum1);

// Release device memory

// Release host memory

return 0;


Why are you doing the c[id] calculation 18000 times but the others only once?

oh my god, this is my bad

thank you for every one. I know my problem