# Question about warp divergence

In case 1 I make even and odd threads do different work, as follows:
if (id % 2 == 0){
c[id] = a[id] + b[id];
}
else {

			u[id] = a[id] * b[id];}


In case 2 I make each whole warp do the same work, and different warps do different work:

if ((id / 32) % 2 == 0){
c[id] = a[id] + b[id];
}
else {

			u[id] = a[id] * b[id];}


I think case 1 has the problem of warp divergence, and case 2 can avoid warp divergence.
But when I test the computation time, case 2 takes more time.
So I think maybe my method or code has some problem?

the whole code is following:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// NOTE(review): the original post had a bare `#include` here — the forum
// markup most likely swallowed an <angle-bracketed> header name; confirm
// which header was intended before relying on it.

// CUDA kernel. Each thread takes care of one element of c
// CUDA kernel. Each thread takes care of one element.
//
// Branching is aligned to warp boundaries: every thread in a given warp
// (id / 32 is uniform across a warp of 32 lanes) takes the SAME side of the
// if/else, so there is no intra-warp divergence in this version.
//   a, b    : input vectors, length n
//   c       : output of the "even warp" path, c[id] = a[id] + b[id]
//   u       : output of the "odd warp" path,  u[id] = a[id] * b[id]
//   d, e, f : unused by this kernel; kept so the launch signature matches
// The inner loop repeats the work 18000 times purely so the kernel runs
// long enough to be timed meaningfully.
__global__ void vecAdd(double *a, double *b, double *c, double *d, double *e, double *f, double *u, int n)
{
	// Global thread ID (the original post had this line truncated to
	// `int id = blockIdx.x` with no semicolon — it cannot have compiled as
	// posted).
	int id = blockIdx.x * blockDim.x + threadIdx.x;

	// Grid-stride loop: bounds-checked, and correct for any grid size,
	// not just the one launch configuration used in main().
	while (id < n) {
		for (int k = 0; k < 18000; k++)
		{
			if ((id / 32) % 2 == 0) {
				c[id] = a[id] + b[id];
			}
			else {
				u[id] = a[id] * b[id];
			}
		}
		id += gridDim.x * blockDim.x;
	}
}

// Host driver: allocates vectors, runs the kernel once, and prints sums of
// the three output vectors.  Since a[i] + b[i] == sin^2(i) + cos^2(i) == 1,
// the sum of the written half of c should equal the number of "add" elements.
int main(int argc, char** argv)  // was `char* argv` — not a valid main signature
{
	// Number of vector elements.  With blockSize = 512 and gridSize = 1
	// below, one launch covers each element exactly once; the grid-stride
	// loop in the kernel keeps this correct if the config is changed.
	int n = 512;

	// Host input vectors
	double *h_a;
	double *h_b;
	// Host output vectors
	double *h_c;
	double *h_d;
	double *h_e;
	double *h_f;
	double *h_u;

	// Device input vectors
	double *d_a;
	double *d_b;
	// Device output vectors
	double *d_c;
	double *d_d;
	double *d_e;
	double *d_f;
	double *d_u;

	// Size, in bytes, of each vector
	size_t bytes = n * sizeof(double);

	// Allocate memory for each vector on host
	h_a = (double*)malloc(bytes);
	h_b = (double*)malloc(bytes);
	h_c = (double*)malloc(bytes);
	h_d = (double*)malloc(bytes);
	h_e = (double*)malloc(bytes);
	h_f = (double*)malloc(bytes);
	h_u = (double*)malloc(bytes);

	// Allocate memory for each vector on GPU
	cudaMalloc(&d_a, bytes);
	cudaMalloc(&d_b, bytes);
	cudaMalloc(&d_c, bytes);
	cudaMalloc(&d_d, bytes);
	cudaMalloc(&d_e, bytes);
	cudaMalloc(&d_f, bytes);
	cudaMalloc(&d_u, bytes);

	// Zero the output buffers.  Each thread writes only ONE of c[] or u[]
	// (and d[] is never written at all), so without this the host-side sums
	// below fold in uninitialized device memory — which is exactly why the
	// printed "final result" values in the thread are huge garbage numbers.
	cudaMemset(d_c, 0, bytes);
	cudaMemset(d_d, 0, bytes);
	cudaMemset(d_u, 0, bytes);

	int i;
	// Initialize vectors on host: sin^2(i) and cos^2(i), so a[i] + b[i] == 1
	for (i = 0; i < n; i++) {
		h_a[i] = sin(i) * sin(i);
		h_b[i] = cos(i) * cos(i);
		h_e[i] = sin(i) * sin(i);
		h_f[i] = cos(i) * cos(i);
	}

	// Copy host input vectors to device
	cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
	cudaMemcpy(d_e, h_e, bytes, cudaMemcpyHostToDevice);
	cudaMemcpy(d_f, h_f, bytes, cudaMemcpyHostToDevice);

	// Launch configuration: one block of 512 threads = 16 warps
	int blockSize = 512;
	int gridSize = 1;

	// Execute the kernel
	vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, d_d, d_e, d_f, d_u, n);

	// Catch launch-configuration errors right away; in-kernel execution
	// errors surface at the next synchronizing call (the blocking
	// cudaMemcpy below also waits for the kernel to finish).
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// Copy results back to host
	cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
	cudaMemcpy(h_d, d_d, bytes, cudaMemcpyDeviceToHost);
	cudaMemcpy(h_u, d_u, bytes, cudaMemcpyDeviceToHost);

	// Sum vector c; with zeroed outputs, this should equal the count of
	// elements on the "add" path (each contributes ~1.0)
	double sum = 0;
	for (i = 0; i < n; i++)
		sum += h_c[i];
	printf("final result: %f\n", sum);

	// d is never written by the kernel, so this sum should be 0.0 now
	double sum1 = 0;
	for (i = 0; i < n; i++)
		sum1 += h_d[i];
	printf("final result1: %f\n", sum1);

	double sum2 = 0;
	for (i = 0; i < n; i++)
		sum2 += h_u[i];
	// label fixed: the original printed "final result1" twice
	printf("final result2: %f\n", sum2);

	// Release device memory
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
	cudaFree(d_d);
	cudaFree(d_e);
	cudaFree(d_f);
	cudaFree(d_u);

	// Release host memory
	free(h_a);
	free(h_b);
	free(h_c);
	free(h_d);
	free(h_e);
	free(h_f);
	free(h_u);

	return 0;
}

When posting code on this forum, please mark it as code. To do so, paste your code in the edit window, then select the entire section of code, and go to the top edit bar of the edit window, and click on the </> button.

When I modify your code like this:

for (k = 0; k < 18000; k++)
{
#ifdef USE_DIVERGENCE
if ((id % 2) == 0){
#else
if (((id / 32) % 2) == 0){
#endif
c[id] = a[id] + b[id];


and compile it and run it in a linux environment on Pascal Titan X with nvprof, nvprof indicates that the divergent kernel runs slower:

$ nvcc -arch=sm_61 -o t377 t377.cu
$ nvprof ./t377
==23957== NVPROF is profiling process 23957, command: ./t377
final result: 67828802170708287488.000000
final result1: 271772228112728162304.000000
final result1: 68293429946191372288.000000
==23957== Profiling application: ./t377
==23957== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
99.79%  4.3066ms         1  4.3066ms  4.3066ms  4.3066ms  vecAdd(double*, double*, double*, double*, double*, double*, double*, int)
0.13%  5.4080us         4  1.3520us  1.3440us  1.3760us  [CUDA memcpy HtoD]
0.08%  3.4560us         3  1.1520us     704ns  1.3760us  [CUDA memcpy DtoH]

==23957== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
96.46%  302.59ms         7  43.227ms  2.5470us  302.57ms  cudaMalloc
1.40%  4.3812ms         7  625.89us  7.0270us  4.3185ms  cudaMemcpy
1.16%  3.6245ms       364  9.9570us     244ns  426.96us  cuDeviceGetAttribute
0.83%  2.6184ms         4  654.60us  640.11us  697.44us  cuDeviceTotalMem
0.09%  277.62us         4  69.405us  63.943us  82.118us  cuDeviceGetName
0.05%  170.55us         7  24.364us  3.1340us  142.11us  cudaFree
0.01%  24.056us         1  24.056us  24.056us  24.056us  cudaLaunch
0.00%  4.7240us        12     393ns     263ns     880ns  cuDeviceGet
0.00%  3.6490us         8     456ns     135ns  2.2700us  cudaSetupArgument
0.00%  2.9240us         3     974ns     328ns  1.9610us  cuDeviceGetCount
0.00%  1.0070us         1  1.0070us  1.0070us  1.0070us  cudaConfigureCall
$ nvcc -arch=sm_61 -o t377 t377.cu -DUSE_DIVERGENCE
$ nvprof ./t377
==24019== NVPROF is profiling process 24019, command: ./t377
final result: 67828802170708287488.000000
final result1: 271772228112728162304.000000
final result1: 68293429946191372288.000000
==24019== Profiling application: ./t377
==24019== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
99.82%  4.9924ms         1  4.9924ms  4.9924ms  4.9924ms  vecAdd(double*, double*, double*, double*, double*, double*, double*, int)
0.11%  5.3760us         4  1.3440us  1.3440us  1.3440us  [CUDA memcpy HtoD]
0.07%  3.4560us         3  1.1520us     704ns  1.3760us  [CUDA memcpy DtoH]

==24019== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
96.27%  305.04ms         7  43.577ms  2.4850us  305.01ms  cudaMalloc
1.60%  5.0680ms         7  724.00us  7.2070us  5.0044ms  cudaMemcpy
1.16%  3.6875ms       364  10.130us     247ns  467.10us  cuDeviceGetAttribute
0.82%  2.5979ms         4  649.47us  640.12us  671.68us  cuDeviceTotalMem
0.09%  273.59us         4  68.398us  63.528us  80.236us  cuDeviceGetName
0.05%  169.53us         7  24.218us  3.3180us  141.32us  cudaFree
0.01%  25.091us         1  25.091us  25.091us  25.091us  cudaLaunch
0.00%  4.9380us        12     411ns     248ns  1.1160us  cuDeviceGet
0.00%  3.6890us         8     461ns     137ns  2.3120us  cudaSetupArgument
0.00%  3.3250us         3  1.1080us     363ns  2.3460us  cuDeviceGetCount
0.00%  1.1160us         1  1.1160us  1.1160us  1.1160us  cudaConfigureCall
\$


In the non-divergent case, the kernel takes 4.3066ms to execute, whereas in the divergent case the kernel takes about 15% longer at 4.9924ms

but I think it should be 100% longer?

Have you analyzed the difference in machine code between the two execution paths to determine that one is 100% longer than the other? I think you’ll find predicated execution in both cases which is going to blur the difference.

Would you also assume 100% longer if the time to access data from memory or cache was significantly longer than the time it takes to fetch and execute instructions?