I read some papers about warp divergence and did an experiment on it.
In case 1, I make even and odd threads do different work, like the following:
if (id % 2 == 0) {
    c[id] = a[id] + b[id];
}
else {
    u[id] = a[id] * b[id];
}
In case 2, I make the whole warp do the same work and different warps do different work:
if ((id / 32) % 2 == 0) {
    c[id] = a[id] + b[id];
}
else {
    u[id] = a[id] * b[id];
}
I think case 1 has the problem of warp divergence and case 2 avoids it,
but when I measure the computing time, case 2 takes more time.
So I think maybe my method or my code has some problem?
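For reference, my timing code isn't shown below; I measure roughly like this with CUDA events (just a sketch, the variable names are illustrative):
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
// launch the kernel under test
vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, d_d, d_e, d_f, d_u, n);
cudaEventRecord(stop);
cudaEventSynchronize(stop);   // wait for the kernel to finish
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("kernel time: %f ms\n", ms);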
The whole code is the following:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, double *d, double *e, double *f, double *u, int n)
{
// Get our global thread ID
int id = blockIdx.x * blockDim.x + threadIdx.x;
// Make sure we do not go out of bounds
int k;
while (id < n) {
for (k = 0; k < 18000; k++)
{
if ((id / 32) % 2 == 0){
c[id] = a[id] + b[id];
}
else {
u[id] = a[id] * b[id];
}
}
id += gridDim.x*blockDim.x;
}
}
int main(int argc, char* argv[])
{
// Size of vectors
int n = 512;
// Host input vectors
double *h_a;
double *h_b;
//Host output vector
double *h_c;
double *h_d;
double *h_e;
double *h_f;
double *h_u;
// Device input vectors
double *d_a;
double *d_b;
//Device output vector
double *d_c;
double *d_d;
double *d_e;
double *d_f;
double *d_u;
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
h_d = (double*)malloc(bytes);
h_e = (double*)malloc(bytes);
h_f = (double*)malloc(bytes);
h_u = (double*)malloc(bytes);
// Allocate memory for each vector on GPU
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
cudaMalloc(&d_d, bytes);
cudaMalloc(&d_e, bytes);
cudaMalloc(&d_f, bytes);
cudaMalloc(&d_u, bytes);
int i;
// Initialize vectors on host
for (i = 0; i < n; i++) {
h_a[i] = sin(i)*sin(i);
h_b[i] = cos(i)*cos(i);
h_e[i] = sin(i)*sin(i);
h_f[i] = cos(i)*cos(i);
}
// Copy host vectors to device
cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_e, h_e, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_f, h_f, bytes, cudaMemcpyHostToDevice);
int blockSize, gridSize;
// Number of threads in each thread block
blockSize = 512;
// Number of thread blocks in grid
gridSize = 1;
// Execute the kernel
vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, d_d, d_e, d_f, d_u, n);
// Copy array back to host
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(h_d, d_d, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(h_u, d_u, bytes, cudaMemcpyDeviceToHost);
// Sum up vector c and print the result
double sum = 0;
for (i = 0; i<n; i++)
sum += h_c[i];
printf("final result: %f\n", sum);
double sum1 = 0;
for (i = 0; i<n; i++)
sum1 += h_d[i];
printf("final result1: %f\n", sum1);
double sum2 = 0;
for (i = 0; i<n; i++)
sum2 += h_u[i];
printf("final result1: %f\n", sum2);
// Release device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
cudaFree(d_d);
cudaFree(d_e);
cudaFree(d_f);
cudaFree(d_u);
// Release host memory
free(h_a);
free(h_b);
free(h_c);
free(h_d);
free(h_e);
free(h_f);
free(h_u);
return 0;
}
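I have not added error checking after the kernel launch yet; if it matters, I could check with the standard runtime calls, something like this (a sketch):
cudaError_t err = cudaGetLastError();      // errors from the launch itself
if (err != cudaSuccess)
    printf("launch error: %s\n", cudaGetErrorString(err));
err = cudaDeviceSynchronize();             // errors raised while the kernel runs
if (err != cudaSuccess)
    printf("execution error: %s\n", cudaGetErrorString(err));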