Finally, I have found the cause of this error: it was operator[]. This is very strange, because it is the first time I have had problems with an operator. Interestingly, I cannot reproduce the error with a simple structure; it appears only with my MGML_MATH::VECTOR<4,float> :blink:
So this tricky function produces the bug:
// Fills the four components of `v` with x, y, z, w using the subscript syntax
// v[i] (vec4f's operator[], according to the surrounding report) and returns
// a copy of `v`.
//
// This is the variant reported to produce incorrect PTX when used from
// test_kernel2; it differs from the working variant only in how the
// components are addressed (v[i] here versus v.M[i] there).
//
// x, y, z, w - values stored into components 0, 1, 2, 3 respectively.
// v          - in/out: the vector that is overwritten in place.
// returns    - `v` by value (the caller receives a copy; `v` is also modified).
inline __device__ vec4f make_vec4f(float x, float y, float z, float w, vec4f& v)
{
v[0] = x;
v[1] = y;
v[2] = z;
v[3] = w;
return v;
}
But this function works:
// Fills the four components of `v` with x, y, z, w by writing directly to the
// member array `M` (v.M[i]) and returns a copy of `v`.
//
// This is the variant reported to work; it differs from the failing variant
// only in bypassing the subscript syntax v[i] in favour of v.M[i].
//
// x, y, z, w - values stored into M[0], M[1], M[2], M[3] respectively.
// v          - in/out: the vector that is overwritten in place.
// returns    - `v` by value (the caller receives a copy; `v` is also modified).
inline __device__ vec4f make_vec4f(float x, float y, float z, float w, vec4f& v)
{
v.M[0] = x;
v.M[1] = y;
v.M[2] = z;
v.M[3] = w;
return v;
}
In the following code I used each of them in turn and got different PTX assembler:
// Test kernel used to compare the two make_vec4f variants: each thread loads
// one element from in_a and in_b, builds a constant mask, and stores
// _mm_and_ps(a, mask) into out_result at the same index.
//
// in_a, in_b - input arrays, read at index `tid`.
// out_result - output array, written at index `tid`.
// No bounds check is performed on `tid`.
__global__ void test_kernel2(vec4f* in_a, vec4f* in_b, vec4f* out_result)
{
// Thread coordinates within the whole grid along x and y.
uint x = blockDim.x * blockIdx.x + threadIdx.x;
uint y = blockDim.y * blockIdx.y + threadIdx.y;
// Hard-coded row width used to linearize (x, y) into one index;
// presumably the total launch width in threads — TODO confirm against the launch configuration.
uint w = 8*4;
uint tid = x + w*y;
vec4f a = in_a[tid];
// NOTE(review): b is loaded but never used below.
vec4f b = in_b[tid];
// Mask whose components 0 and 2 have the bit pattern 0xffffffff and whose
// components 1 and 3 have the bit pattern 0x0 (reinterpreted as float).
// NOTE(review): `mask` is passed by reference to make_vec4f inside its own
// initializer, so it is written through the reference before its
// initialization has completed and is then copy-initialized from the
// returned value — confirm this is intended.
vec4f mask = make_vec4f(__int_as_float(0xffffffff),
__int_as_float(0x0),
__int_as_float(0xffffffff),
__int_as_float(0x0),mask);
// _mm_and_ps is a helper defined elsewhere; presumably a per-component
// bitwise AND of a and mask — verify against its definition.
vec4f result = _mm_and_ps(a,mask);
out_result[tid] = result;
}
Correct PTX (it seems to be optimized, but it works):
ld.param.u32 $r10, [__cudaparm__Z12test_kernel2PN9MGML_MATH6VECTORILi4EfEES2_S2__in_a];
add.u32 $r11, $r10, $r9; //
ld.global.f32 $f1, [$r11+8]; // id:93
.loc 24 435 0
ld.global.s32 $r12, [$r11+0]; // id:94
mov.b32 $f2, $r12; //
mov.f32 $f3, $f2; //
mov.f32 $f4, 0f00000000; // 0
mov.f32 $f5, $f4; //
mov.b32 $r13, $f1; //
mov.b32 $f6, $r13; //
mov.f32 $f7, $f6; //
.loc 2 234 0
ld.param.u32 $r14, [__cudaparm__Z12test_kernel2PN9MGML_MATH6VECTORILi4EfEES2_S2__out_result];
Incorrect PTX:
ld.param.u32 $r10, [__cudaparm__Z12test_kernel2PN9MGML_MATH6VECTORILi4EfEES2_S2__in_a];
add.u32 $r11, $r10, $r9; //
ld.global.f32 $f1, [$r11+0]; // id:109
ld.global.f32 $f2, [$r11+4]; // id:110
ld.global.f32 $f3, [$r11+8]; // id:111
.loc 24 423 0
mov.f32 $f4, 0fffffffff; // -1.#QNAN
mov.f32 $f5, $f4; //
mov.f32 $f6, 0f00000000; // 0
mov.f32 $f7, $f6; //
mov.f32 $f8, 0fffffffff; // -1.#QNAN
mov.f32 $f9, $f8; //
mov.f32 $f10, 0f00000000; // 0
mov.f32 $f11, $f10; //
.loc 24 435 0
mov.b32 $r12, $f1; //
cvt.rzi.s32.f32 $r13, $f5; //
and.b32 $r14, $r12, $r13; //
mov.b32 $f12, $r14; //
mov.f32 $f13, $f12; //
mov.b32 $r15, $f2; //
cvt.rzi.s32.f32 $r16, $f7; //
and.b32 $r17, $r15, $r16; //
mov.b32 $f14, $r17; //
mov.f32 $f15, $f14; //
mov.b32 $r18, $f3; //
cvt.rzi.s32.f32 $r19, $f9; //
and.b32 $r20, $r18, $r19; //
mov.b32 $f16, $r20; //
mov.f32 $f17, $f16; //
.loc 2 234 0
ld.param.u32 $r21, [__cudaparm__Z12test_kernel2PN9MGML_MATH6VECTORILi4EfEES2_S2__out_result];
But I cannot reproduce that error with a simple structure with an operator, so one can get it only with MGML_MATH.
Anyway, thanks for the help. I think it is better not to use C++ features as frequently as I do.