Bit-by-bit conversion: int4 to cuDoubleComplex Compiler Bug?

Edit: Sorry, immediately after posting this i had the idea. Working code below. Please give comments anyway, if you have some suggestions.

My programs reads many double complex values from global memory. One of my approaches (AoS) is to

[list=1]

[*]store them continuosly as cuDoubleComplex values in pitch linear memory

[*]bind them to an int4 texture

[*]read them as int4 values in kernel

[*]convert from int4 to cuDoubleComplex bit-by-bit in kernel

[*](perform some computations)

[*]write results as cuDoubleComplex to (if needed different location in) global memory

A similar test with int2/double worked well, but when switching to int4/cuDoubleComplex the program does not do what I intended to do. Instead, it takes the first two elements and converts them the usual way.

Here is my try. Using toolkit 4.0.

#include <iostream>

#include <iomanip>

#include <stdio.h>

using namespace std;

#include "/usr/local/cuda/include/cuComplex.h"

void checkCUDAError( const char *msg ) {

  cudaError_t err = cudaGetLastError();

  if( cudaSuccess != err ) {

    cerr << "Cuda error: " << msg << ": " << cudaGetErrorString( err ) << endl;

    exit(EXIT_FAILURE);

  }

}

texture<int4, cudaTextureType2D, cudaReadModeElementType> tex;

__global__ void texTest( cuDoubleComplex* d_out ) {

  int i = blockIdx.x*blockDim.x + threadIdx.x;

int4 temp = tex2D( tex, ( float(i)+0.5f )/32.0f, 0.5f ); // adjust N here!

// if d_out is int4*, the following sometimes works

  /*cuDoubleComplex moep = *((cuDoubleComplex*)&temp);

moep.x *= 100.0;

d_out[i] = *((int4*)&moep);*/

// if d_out is cuDoubleComplex*, the following does not convert bit-by-bit but in the usual way

  int4* temp1 = &temp;

  void** temp2 = (void**)(&temp1);

cuDoubleComplex* moep1 = (cuDoubleComplex*)(*temp2);

cuDoubleComplex moep = *moep1;

moep.x *= 100.0;

d_out[i] = moep;

}

int main() {

  const int N = 32;

  const unsigned threadsPerBlock = 16;

  const unsigned blockCount = N/threadsPerBlock;

// allocate memory, put some data in

  int4* h_in = (int4*)malloc( N*sizeof(int4) );

  cuDoubleComplex* h_out = (cuDoubleComplex*)malloc( N*sizeof(cuDoubleComplex) );

cuDoubleComplex temp;

  for(int i=0; i<N; ++i) {

    temp = make_cuDoubleComplex(5.0+double(i)+0.5, 4.0+double(i));

    //h_in[i] = *( (int4*)(&temp) );  // later this is the purpose

    h_in[i] = make_int4(3+2*i, 100*i, 1000*i, 3564+3*i);  // this is only for testing

  }

cuDoubleComplex* d_out;

  cudaMalloc( &d_out, N*sizeof(cuDoubleComplex) );

int4* d_in;

  size_t pitchBytes = 0;

  cudaMallocPitch( &d_in, &pitchBytes, N*sizeof(int4), 1 );

cudaMemcpy( d_in, h_in, N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice ); // later copy 2D

// 2. bind to int4 texture

  size_t offset = 0;

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<int4>();

  cudaBindTexture2D(&offset, &tex, (int4*)d_in, &channelDesc, N, 1, pitchBytes);  

tex.normalized = true;

  tex.filterMode = cudaFilterModePoint;

  tex.addressMode[0] = cudaAddressModeWrap;

  tex.addressMode[1] = cudaAddressModeWrap;

// run kernel

  texTest<<< blockCount, threadsPerBlock >>>( d_out );

// look at result

  cudaMemcpy( h_out, d_out, N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );

double buffer = 0;

  for(int i=0; i<N; ++i) {

    buffer = *((double*)(&h_out[i]));

    cout << i << "\t" << buffer << endl;

  }

// free

  free(h_in);

  free(h_out);

cudaUnbindTexture(tex);

  cudaFree(d_in);

  cudaFree(d_out);

  checkCUDAError("cuda free operations");

}

If I change the device output to int4, it (sometimes) converts correctly. Here is the (cleaned up) ptx code for the original cuComplexDouble version:

tex.2d.v4.s32.f32 {%r6,%r7,%r8,%r9},[tex,{%f6,%f8,%f10,%f12}];

	.loc	17	41	0

 //  37    cuDoubleComplex moep = *moep1;

 //  38    

 //  39    moep.x *= 100.0;

 //  40    

 //  41    d_out[i] = moep;

	ld.param.u64 	%rd1, [__cudaparm__Z7texTestP7double2_d_out];

	cvt.s64.s32 	%rd2, %r5;

	mul.wide.s32 	%rd3, %r5, 16;

	add.u64 	%rd4, %rd1, %rd3;

	cvt.rn.f64.s32 	%fd1, %r6;

	mov.f64 	%fd2, 0d4059000000000000;	// 100

	mul.f64 	%fd3, %fd1, %fd2;

	cvt.rn.f64.s32 	%fd4, %r8;

	st.global.v2.f64 	[%rd4+0], {%fd3,%fd4};

Here for the int4 version:

tex.2d.v4.s32.f32 {%r6,%r7,%r8,%r9},[tex,{%f6,%f8,%f10,%f12}];

	.loc	17	25	0

 //  23    

 //  24    // if d_out is int4*, the following sometimes works

 //  25    cuDoubleComplex moep = *((cuDoubleComplex*)&temp);

	cvt.rn.f64.s32 	%fd1, %r8;

	mov.f64 	%fd2, %fd1;

	.loc	17	27	0

 //  26    

 //  27    moep.x *= 100.0;

	cvt.rn.f64.s32 	%fd3, %r6;

	mov.f64 	%fd4, 0d4059000000000000;	// 100

	mul.f64 	%fd5, %fd3, %fd4;

	mov.f64 	%fd6, %fd5;

	.loc	17	29	0

 //  28    

 //  29    d_out[i] = *((int4*)&moep);

	ld.param.u64 	%rd1, [__cudaparm__Z7texTestP4int4_d_out];

	cvt.s64.s32 	%rd2, %r5;

	mul.wide.s32 	%rd3, %r5, 16;

	add.u64 	%rd4, %rd1, %rd3;

	cvt.rzi.s32.f64 	%r18, %fd6;

	mov.b64 	%rd5, %fd6;

	mov.u64 	%rd6, %rd5;

	shr.u64 	%rd7, %rd6, 32;

	mov.b64 	%fd7, %rd7;

	cvt.rzi.s32.f64 	%r19, %fd7;

	cvt.rzi.s32.f64 	%r20, %fd2;

	mov.b64 	%rd8, %fd2;

	mov.u64 	%rd9, %rd8;

	shr.u64 	%rd10, %rd9, 32;

	mov.b64 	%fd8, %rd10;

	cvt.rzi.s32.f64 	%r21, %fd8;

	st.global.v4.s32 	[%rd4+0], {%r18,%r19,%r20,%r21};

So how to correctly do this conversion?

Edit: Here is the code that works for me. Using toolkit 4.0, CC 2.1.

#include <iostream>

#include <iomanip>

#include <stdio.h>

using namespace std;

#include "/usr/local/cuda/include/cuComplex.h"

void checkCUDAError( const char *msg ) {

  cudaError_t err = cudaGetLastError();

  if( cudaSuccess != err ) {

    cerr << "Cuda error: " << msg << ": " << cudaGetErrorString( err ) << endl;

    exit(EXIT_FAILURE);

  }

}

texture<int4, cudaTextureType2D, cudaReadModeElementType> tex;

__global__ void texTest( cuDoubleComplex* d_out ) {

  int i = blockIdx.x*blockDim.x + threadIdx.x;

  d_out[i] = make_cuDoubleComplex( 0.0, 0.0 );

int4 temp = tex2D( tex, ( float(i)+0.5f )/32.0f, 0.5f ); // adjust N here!

long re = temp.x;

  re <<= 32;

  re += temp.y;

long im = temp.z;

  im <<= 32;

  im += temp.w;

cuDoubleComplex moep = make_cuDoubleComplex( *((double*)&re), *((double*)&im) );

moep.x *= 0.25;

d_out[i] = moep;

}

int main() {

  const int N = 32;

  const unsigned threadsPerBlock = 16;

  const unsigned blockCount = N/threadsPerBlock;

// allocate memory, put some data in

  int4* h_in = (int4*)malloc( N*sizeof(int4) );

  cuDoubleComplex* h_out = (cuDoubleComplex*)malloc( N*sizeof(cuDoubleComplex) );

cuDoubleComplex temp;

  long re = 0;

  long im = 0;

  for(int i=0; i<N; ++i) {

    temp = make_cuDoubleComplex(8.0+double(i)+0.5, 4.0+double(i));

re = *((long*)&(temp.x));

    h_in[i].x = (int)(re>>32);

    h_in[i].y = (int)((re<<32)>>32);

im = *((long*)&(temp.y));

    h_in[i].w = (int)(im>>32);

    h_in[i].z = (int)((im<<32)>>32);

  }

cuDoubleComplex* d_out;

  cudaMalloc( &d_out, N*sizeof(cuDoubleComplex) );

int4* d_in;

  size_t pitchBytes = 0;

  cudaMallocPitch( &d_in, &pitchBytes, N*sizeof(int4), 1 );

cudaMemcpy( d_in, h_in, N*sizeof(int4), cudaMemcpyHostToDevice ); // later copy 2D

// 2. bind to int4 texture

  size_t offset = 0;

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<int4>();

  cudaBindTexture2D(&offset, &tex, (int4*)d_in, &channelDesc, N, 1, pitchBytes);  

tex.normalized = true;

  tex.filterMode = cudaFilterModePoint;

  tex.addressMode[0] = cudaAddressModeWrap;

  tex.addressMode[1] = cudaAddressModeWrap;

// run kernel

  texTest<<< blockCount, threadsPerBlock >>>( d_out );

// look at result

  cudaMemcpy( h_out, d_out, N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );

double buffer = 0;

  for(int i=0; i<N; ++i) {

    buffer = *((double*)(&h_out[i]));

    cout << i << "\t" << buffer << endl;

  }

// free

  free(h_in);

  free(h_out);

cudaUnbindTexture(tex);

  cudaFree(d_in);

  cudaFree(d_out);

  checkCUDAError("cuda free operations");

}

You are probably looking for __hiloint2double() or __longlong_as_double().

But why not just declare the texture as [font=“Courier New”]double2[/font]?

Ah, these functions are probably the smarter way to do this.

double2 texture did not work for me, I will try this again.

Edit: OK, I have already been at this point in CUDA programming guide: “The type of a texel is restricted to the basic integer and single-precision floating-point types and any of the 1-, 2-, and 4-component vector types defined in Section B.3.1” (3.2.10.1 Texture Memory). However, when trying to use double2:

texture<double2, cudaTextureType2D, cudaReadModeElementType> tex;

// ...

double2 temp = tex2D( tex, ( float(i)+0.5f )/32.0f, 0.5f );

compiler claims:

Where is the mistake? I would like to keep using normalized texture coordinates.

Sorry, my mistake. I thought I had used double textures before, but of course I just used [font=“Courier New”]__hiloint2double()[/font] for them.

Ah, fine. Then my problem is solved using int4 textures and the smart functions you suggested:

int4 temp = tex2D( tex, ( float(i)+0.5f )/32.0f, 0.5f ); // adjust N here!

double re = __hiloint2double(temp.y, temp.x); // Note that int4 is stored y x w z !!! [Edit:] Note comments below!

  double im = __hiloint2double(temp.w, temp.z);

// ...

d_out[i] = make_cuDoubleComplex(re, im);

Any Pointers h_in, h_out, d_in, d_out are cuDoubleComplex now, so it is very easy to write/read the usual way, too.

This problem was annoying me the last days, so External Image a lot, tera.

You are welcome! And the nice thing about the conversions is they compile to zero instructions… (although there have been some instances where the compiler missed to optimize them away).

Please note that the comment in the above code is misleading. In order from least significant to most significant component, an int4 is stored as

int x, y, z, w;

The function signature for __hiloint2double() is:

double __hiloint2double(int hi, int lo);

The order of the constituent parts of a cuDoubleComplex (which is really a double2 on the device) is:

double x /* real part */, y /* imaginary part */;

Combining these three pieces of information, we arrive at the following code snippet in straightforward manner:

int4 temp;

cuDoubleComplex foo;

double re = __hiloint2double(temp.y, temp.x); 

double im = __hiloint2double(temp.w, temp.z);

foo = make_cuDoubleComplex(re, im);

Alternatively, one could also write:

int4 temp;

cuDoubleComplex foo;

foo.x = __hiloint2double(temp.y, temp.x); 

foo.y = __hiloint2double(temp.w, temp.z);

Well, I assumed hi and low means that a double (8 byte) can be constructed by interpreting the two int (2x4 byte) like this: (hi lo). Then I did this test on host code:

long test;

int4 x = make_int4(1, 0, 0, 0);

  test = *((long*)&x);

  cout << test << endl;

int4 y = make_int4(0, 1, 0, 0);

  test = *((long*)&y);

  cout << test << endl;

int4 z = make_int4(0, 0, 1, 0);

  test = *((long*)&z+1);

  cout << test << endl;

int4 w = make_int4(0, 0, 0, 1);

  test = *((long*)&w+1);

  cout << test << endl;

Result is

x: 1

y: 4294967296

z: 1

w: 4294967296

What is wrong with my test?

Texture can not be declared as double only as int and floats. The only way to get double is via 64 bit int.

The following kind of construct invokes undefined behavior in both C and C++:

test = *((long*)&x);

Because re-interpretation casts were needed in CUDA-based libraries when CUDA was still a C-based language, we defined well-defined transfer functions such as __int_as_float(), __float_as_int(), __hiloint2double(), etc from the start. Now that CUDA is based on C++, one can also use reinterpret_cast for this purpose.

Having said that, it seems that the type punning through pointer conversion happens to work as intended in the case shown, and shows the results one would expect. In the first case and second case the “long” result is constructed from the int4 argument as follows (long apparently being a 64-bit type on your platform):

res<63:32>=arg.y, res<31:0>=arg.x

In the first case, arg.y = 0 and arg.x = 1, so res = 1. In the second case, arg.y = 1 and arg.x = 0, so res = 2**32. Analogous for the third and fourth case, where

res<63:32>=arg.w, res<31:0>=arg.z