Hello,

I was looking for ready-to-use functions that implement right/left bit shifts for CUDA’s int4 type, treating it as a single 128-bit value. As far as I can see, there are no such functions in the SDK, so I had to implement them myself. Please take a look at my right-shift function:

```
/**
 * Logical right shift of a 128-bit value stored in an int4.
 *
 * The four components are treated as one 128-bit unsigned quantity with
 * `x` as the most significant 32-bit word and `w` as the least significant
 * one. Vacated high bits are filled with zeros.
 *
 * @param a_value  128-bit value to shift (x = bits 127..96, w = bits 31..0).
 * @param a_num    Number of bit positions to shift right. Any value is
 *                 accepted; a_num >= 128 yields an all-zero result.
 * @return         a_value shifted right by a_num bits, zero-filled.
 *
 * Marked __host__ __device__ so the same code can be unit-tested on the CPU;
 * device callers are unaffected.
 */
__host__ __device__ int4 right_shift_int4(int4 a_value, unsigned int a_num)
{
    // Shifting by the full width or more leaves no bits behind.
    if (a_num >= 128u) {
        return make_int4(0, 0, 0, 0);
    }

    // Work on unsigned words: `>>` on a signed int sign-extends, which would
    // overwrite the bits carried in from the next-higher word, and `<<` on a
    // negative signed int is undefined before C++20.
    unsigned int x = static_cast<unsigned int>(a_value.x);
    unsigned int y = static_cast<unsigned int>(a_value.y);
    unsigned int z = static_cast<unsigned int>(a_value.z);
    unsigned int w = static_cast<unsigned int>(a_value.w);

    // Whole-word part of the shift, done as two fixed stages (by 64, then by
    // 32) keyed on individual bits of a_num. Each stage is a handful of plain
    // register moves, which the compiler can typically lower to selects
    // instead of a multi-way branch.
    if (a_num & 64u) {
        w = y;
        z = x;
        y = 0u;
        x = 0u;
    }
    if (a_num & 32u) {
        w = z;
        z = y;
        y = x;
        x = 0u;
    }

    // Remaining sub-word shift (0..31 bits). The zero case must be skipped:
    // the carry term would otherwise shift by 32, which is undefined for a
    // 32-bit operand.
    const unsigned int bits = a_num & 31u;
    if (bits != 0u) {
        const unsigned int carry = 32u - bits;
        // Process from the least significant word upwards so each word still
        // sees the un-shifted value of its higher neighbour.
        w = (w >> bits) | (z << carry);
        z = (z >> bits) | (y << carry);
        y = (y >> bits) | (x << carry);
        x >>= bits;
    }

    return make_int4(static_cast<int>(x),
                     static_cast<int>(y),
                     static_cast<int>(z),
                     static_cast<int>(w));
}
```

My question is: is this efficient, or are there better ways to implement this for int4 (for example, to avoid thread divergence)? Thank you.