Edit: Sorry, immediately after posting this i had the idea. Working code below. Please give comments anyway, if you have some suggestions.
My programs reads many double complex values from global memory. One of my approaches (AoS) is to
[list=1]
[*]store them continuosly as cuDoubleComplex values in pitch linear memory
[*]bind them to an int4 texture
[*]read them as int4 values in kernel
[*]convert from int4 to cuDoubleComplex bit-by-bit in kernel
[*](perform some computations)
[*]write results as cuDoubleComplex to (if needed different location in) global memory
A similar test with int2/double worked well, but when switching to int4/cuDoubleComplex the program does not do what I intended to do. Instead, it takes the first two elements and converts them the usual way.
Here is my try. Using toolkit 4.0.
#include <iostream>
#include <iomanip>
#include <stdio.h>
using namespace std;
#include "/usr/local/cuda/include/cuComplex.h"
void checkCUDAError( const char *msg ) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err ) {
cerr << "Cuda error: " << msg << ": " << cudaGetErrorString( err ) << endl;
exit(EXIT_FAILURE);
}
}
texture<int4, cudaTextureType2D, cudaReadModeElementType> tex;
__global__ void texTest( cuDoubleComplex* d_out ) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
int4 temp = tex2D( tex, ( float(i)+0.5f )/32.0f, 0.5f ); // adjust N here!
// if d_out is int4*, the following sometimes works
/*cuDoubleComplex moep = *((cuDoubleComplex*)&temp);
moep.x *= 100.0;
d_out[i] = *((int4*)&moep);*/
// if d_out is cuDoubleComplex*, the following does not convert bit-by-bit but in the usual way
int4* temp1 = &temp;
void** temp2 = (void**)(&temp1);
cuDoubleComplex* moep1 = (cuDoubleComplex*)(*temp2);
cuDoubleComplex moep = *moep1;
moep.x *= 100.0;
d_out[i] = moep;
}
int main() {
const int N = 32;
const unsigned threadsPerBlock = 16;
const unsigned blockCount = N/threadsPerBlock;
// allocate memory, put some data in
int4* h_in = (int4*)malloc( N*sizeof(int4) );
cuDoubleComplex* h_out = (cuDoubleComplex*)malloc( N*sizeof(cuDoubleComplex) );
cuDoubleComplex temp;
for(int i=0; i<N; ++i) {
temp = make_cuDoubleComplex(5.0+double(i)+0.5, 4.0+double(i));
//h_in[i] = *( (int4*)(&temp) ); // later this is the purpose
h_in[i] = make_int4(3+2*i, 100*i, 1000*i, 3564+3*i); // this is only for testing
}
cuDoubleComplex* d_out;
cudaMalloc( &d_out, N*sizeof(cuDoubleComplex) );
int4* d_in;
size_t pitchBytes = 0;
cudaMallocPitch( &d_in, &pitchBytes, N*sizeof(int4), 1 );
cudaMemcpy( d_in, h_in, N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice ); // later copy 2D
// 2. bind to int4 texture
size_t offset = 0;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<int4>();
cudaBindTexture2D(&offset, &tex, (int4*)d_in, &channelDesc, N, 1, pitchBytes);
tex.normalized = true;
tex.filterMode = cudaFilterModePoint;
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
// run kernel
texTest<<< blockCount, threadsPerBlock >>>( d_out );
// look at result
cudaMemcpy( h_out, d_out, N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );
double buffer = 0;
for(int i=0; i<N; ++i) {
buffer = *((double*)(&h_out[i]));
cout << i << "\t" << buffer << endl;
}
// free
free(h_in);
free(h_out);
cudaUnbindTexture(tex);
cudaFree(d_in);
cudaFree(d_out);
checkCUDAError("cuda free operations");
}
If I change the device output to int4, it (sometimes) converts correctly. Here is the (cleaned up) ptx code for the original cuComplexDouble version:
tex.2d.v4.s32.f32 {%r6,%r7,%r8,%r9},[tex,{%f6,%f8,%f10,%f12}];
.loc 17 41 0
// 37 cuDoubleComplex moep = *moep1;
// 38
// 39 moep.x *= 100.0;
// 40
// 41 d_out[i] = moep;
ld.param.u64 %rd1, [__cudaparm__Z7texTestP7double2_d_out];
cvt.s64.s32 %rd2, %r5;
mul.wide.s32 %rd3, %r5, 16;
add.u64 %rd4, %rd1, %rd3;
cvt.rn.f64.s32 %fd1, %r6;
mov.f64 %fd2, 0d4059000000000000; // 100
mul.f64 %fd3, %fd1, %fd2;
cvt.rn.f64.s32 %fd4, %r8;
st.global.v2.f64 [%rd4+0], {%fd3,%fd4};
Here for the int4 version:
tex.2d.v4.s32.f32 {%r6,%r7,%r8,%r9},[tex,{%f6,%f8,%f10,%f12}];
.loc 17 25 0
// 23
// 24 // if d_out is int4*, the following sometimes works
// 25 cuDoubleComplex moep = *((cuDoubleComplex*)&temp);
cvt.rn.f64.s32 %fd1, %r8;
mov.f64 %fd2, %fd1;
.loc 17 27 0
// 26
// 27 moep.x *= 100.0;
cvt.rn.f64.s32 %fd3, %r6;
mov.f64 %fd4, 0d4059000000000000; // 100
mul.f64 %fd5, %fd3, %fd4;
mov.f64 %fd6, %fd5;
.loc 17 29 0
// 28
// 29 d_out[i] = *((int4*)&moep);
ld.param.u64 %rd1, [__cudaparm__Z7texTestP4int4_d_out];
cvt.s64.s32 %rd2, %r5;
mul.wide.s32 %rd3, %r5, 16;
add.u64 %rd4, %rd1, %rd3;
cvt.rzi.s32.f64 %r18, %fd6;
mov.b64 %rd5, %fd6;
mov.u64 %rd6, %rd5;
shr.u64 %rd7, %rd6, 32;
mov.b64 %fd7, %rd7;
cvt.rzi.s32.f64 %r19, %fd7;
cvt.rzi.s32.f64 %r20, %fd2;
mov.b64 %rd8, %fd2;
mov.u64 %rd9, %rd8;
shr.u64 %rd10, %rd9, 32;
mov.b64 %fd8, %rd10;
cvt.rzi.s32.f64 %r21, %fd8;
st.global.v4.s32 [%rd4+0], {%r18,%r19,%r20,%r21};
So how to correctly do this conversion?
Edit: Here is the code that works for me. Using toolkit 4.0, CC 2.1.
#include <iostream>
#include <iomanip>
#include <stdio.h>
using namespace std;
#include "/usr/local/cuda/include/cuComplex.h"
void checkCUDAError( const char *msg ) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err ) {
cerr << "Cuda error: " << msg << ": " << cudaGetErrorString( err ) << endl;
exit(EXIT_FAILURE);
}
}
texture<int4, cudaTextureType2D, cudaReadModeElementType> tex;
__global__ void texTest( cuDoubleComplex* d_out ) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
d_out[i] = make_cuDoubleComplex( 0.0, 0.0 );
int4 temp = tex2D( tex, ( float(i)+0.5f )/32.0f, 0.5f ); // adjust N here!
long re = temp.x;
re <<= 32;
re += temp.y;
long im = temp.z;
im <<= 32;
im += temp.w;
cuDoubleComplex moep = make_cuDoubleComplex( *((double*)&re), *((double*)&im) );
moep.x *= 0.25;
d_out[i] = moep;
}
int main() {
const int N = 32;
const unsigned threadsPerBlock = 16;
const unsigned blockCount = N/threadsPerBlock;
// allocate memory, put some data in
int4* h_in = (int4*)malloc( N*sizeof(int4) );
cuDoubleComplex* h_out = (cuDoubleComplex*)malloc( N*sizeof(cuDoubleComplex) );
cuDoubleComplex temp;
long re = 0;
long im = 0;
for(int i=0; i<N; ++i) {
temp = make_cuDoubleComplex(8.0+double(i)+0.5, 4.0+double(i));
re = *((long*)&(temp.x));
h_in[i].x = (int)(re>>32);
h_in[i].y = (int)((re<<32)>>32);
im = *((long*)&(temp.y));
h_in[i].w = (int)(im>>32);
h_in[i].z = (int)((im<<32)>>32);
}
cuDoubleComplex* d_out;
cudaMalloc( &d_out, N*sizeof(cuDoubleComplex) );
int4* d_in;
size_t pitchBytes = 0;
cudaMallocPitch( &d_in, &pitchBytes, N*sizeof(int4), 1 );
cudaMemcpy( d_in, h_in, N*sizeof(int4), cudaMemcpyHostToDevice ); // later copy 2D
// 2. bind to int4 texture
size_t offset = 0;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<int4>();
cudaBindTexture2D(&offset, &tex, (int4*)d_in, &channelDesc, N, 1, pitchBytes);
tex.normalized = true;
tex.filterMode = cudaFilterModePoint;
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
// run kernel
texTest<<< blockCount, threadsPerBlock >>>( d_out );
// look at result
cudaMemcpy( h_out, d_out, N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );
double buffer = 0;
for(int i=0; i<N; ++i) {
buffer = *((double*)(&h_out[i]));
cout << i << "\t" << buffer << endl;
}
// free
free(h_in);
free(h_out);
cudaUnbindTexture(tex);
cudaFree(d_in);
cudaFree(d_out);
checkCUDAError("cuda free operations");
}