Same .cu file works with NVRTC 10.1, but not with NVRTC 10.2 and 11.0

My CUDA C++ file (.cu) used to be compiled to a PTX file by NVRTC 10.1, and the result worked well at run time. However, after upgrading NVRTC to 10.2 and 11.0, the same CUDA file still compiles to PTX, but at run time it fails with an "ErrorLaunchFailed" error.

I also noticed a huge difference between the PTX compiled by NVRTC 10.1 and the PTX compiled by NVRTC 11.0 (and 10.2 as well): the newer versions (11.0 and 10.2) produce much, much shorter PTX than the older version (10.1).

Does anyone have any idea what is going on? The CUDA C++ file, the PTX compiled by NVRTC 10.1, and the PTX compiled by NVRTC 11.0 are attached below.
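For context, the PTX files below are generated through the NVRTC C API, roughly as in this minimal sketch (not my exact build code; the function name, the "pointwise.cu" program name, and the way the source string arrives are placeholders, and the architecture option matches the sm_60 target in the PTX below):

#include <nvrtc.h>
#include <cstdio>
#include <string>
#include <vector>

// Minimal sketch: compile the attached .cu source (passed as a string) to PTX.
std::string compileToPtx(const char* cudaSource) {
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, cudaSource, "pointwise.cu", 0, nullptr, nullptr);

  const char* opts[] = { "--gpu-architecture=compute_60" };
  nvrtcResult res = nvrtcCompileProgram(prog, 1, opts);

  // Dump the compile log; version-to-version warnings can be a useful hint.
  size_t logSize = 0;
  nvrtcGetProgramLogSize(prog, &logSize);
  std::vector<char> log(logSize + 1, '\0');
  nvrtcGetProgramLog(prog, log.data());
  std::fprintf(stderr, "%s\n", log.data());

  std::string ptx;
  if (res == NVRTC_SUCCESS) {
    size_t ptxSize = 0;
    nvrtcGetPTXSize(prog, &ptxSize);
    ptx.resize(ptxSize);
    nvrtcGetPTX(prog, &ptx[0]);
  }
  nvrtcDestroyProgram(&prog);
  return ptx;
}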

CUDA C++ source:

#define __int64 long long
#define __int32 int

#define MAX_CUTORCH_DIMS 25

template <typename IndexType>
struct TensorInfo {
  float* data;
  IndexType sizes[MAX_CUTORCH_DIMS];
  IndexType strides[MAX_CUTORCH_DIMS];
  int dims;
};

// Translate a linear index for the apply to a float* offset;
// specialized on Dims to reduce nvcc compilation time
template <typename IndexType, int Dims>
struct IndexToOffset {
  static __host__ __device__ IndexType get(
      IndexType linearId,
      const TensorInfo<IndexType>& info) {
    IndexType offset = 0;

    // Use static dims
    for (int i = Dims - 1; i >= 0; --i) {
      IndexType curDimIndex = linearId % info.sizes[i];
      IndexType curDimOffset = curDimIndex * info.strides[i];
      offset += curDimOffset;

      if (i > 0) {
        linearId /= info.sizes[i];
      }
    }

    return offset;
  }
};
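// Worked example: for a 2-D tensor with sizes = {3, 4} and strides = {4, 1},
// linearId 7 splits into per-dimension indices {1, 3}, giving the offset
// 1 * 4 + 3 * 1 = 7. With non-contiguous strides (e.g. strides = {1, 3})
// the offset no longer equals linearId; the -2 specialization below simply
// returns linearId, so it is only valid for contiguous layouts.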

template <typename IndexType>
struct IndexToOffset<IndexType, -2> {
  static __forceinline__ __host__ __device__ IndexType
  get(IndexType linearId, const TensorInfo<IndexType>& info) {
    return linearId;
  }
};

template <typename IndexType>
struct IndexToOffset<IndexType, -1> {
  static __forceinline__ __host__ __device__ IndexType
  get(IndexType linearId, const TensorInfo<IndexType>& info) {
    IndexType offset = 0;

    // Use dynamic dims
    for (int i = info.dims - 1; i >= 0; --i) {
      IndexType curDimIndex = linearId % info.sizes[i];
      IndexType curDimOffset = curDimIndex * info.strides[i];
      offset += curDimOffset;

      linearId /= info.sizes[i];
    }

    return offset;
  }
};

template <typename IndexType>
__device__ __forceinline__ IndexType getLinearBlockId() {
  return blockIdx.z * gridDim.y * gridDim.x +
         blockIdx.y * gridDim.x +
         blockIdx.x;
}

template <typename Op, typename IndexType, int ADims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
pointwiseApply1(TensorInfo<IndexType> a,
                IndexType totalElements,
                Op op) {
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x) {
    // Convert `linearIndex` into an offset of `a`
    const IndexType aOffset =
        IndexToOffset<IndexType, ADims>::get(linearIndex, a);

    op(&a.data[aOffset]);
  }
}

template <typename Op, typename IndexType, int ADims, int BDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
pointwiseApply2(TensorInfo<IndexType> a,
                TensorInfo<IndexType> b,
                IndexType totalElements,
                Op op) {
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x) {
    // Convert `linearIndex` into an offset of `a`
    const IndexType aOffset =
        IndexToOffset<IndexType, ADims>::get(linearIndex, a);

    // Convert `linearIndex` into an offset of `b`
    const IndexType bOffset =
        IndexToOffset<IndexType, BDims>::get(linearIndex, b);

    op(&a.data[aOffset], &b.data[bOffset]);
  }
}

template <typename Op, typename IndexType, int ADims, int BDims, int CDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
pointwiseApply3(TensorInfo<IndexType> a,
                TensorInfo<IndexType> b,
                TensorInfo<IndexType> c,
                IndexType totalElements,
                Op op) {
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x) {
    // Convert `linearIndex` into an offset of `a`
    const IndexType aOffset =
        IndexToOffset<IndexType, ADims>::get(linearIndex, a);

    // Convert `linearIndex` into an offset of `b`
    const IndexType bOffset =
        IndexToOffset<IndexType, BDims>::get(linearIndex, b);

    // Convert `linearIndex` into an offset of `c`
    const IndexType cOffset =
        IndexToOffset<IndexType, CDims>::get(linearIndex, c);

    op(&a.data[aOffset], &b.data[bOffset], &c.data[cOffset]);
  }
}

template <typename Op, typename IndexType, int ADims, int BDims, int CDims, int DDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
pointwiseApply4(TensorInfo<IndexType> a,
                TensorInfo<IndexType> b,
                TensorInfo<IndexType> c,
                TensorInfo<IndexType> d,
                IndexType totalElements,
                Op op) {
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x) {
    // Convert `linearIndex` into an offset of `a`
    const IndexType aOffset =
        IndexToOffset<IndexType, ADims>::get(linearIndex, a);

    // Convert `linearIndex` into an offset of `b`
    const IndexType bOffset =
        IndexToOffset<IndexType, BDims>::get(linearIndex, b);

    // Convert `linearIndex` into an offset of `c`
    const IndexType cOffset =
        IndexToOffset<IndexType, CDims>::get(linearIndex, c);

    // Convert `linearIndex` into an offset of `d`
    const IndexType dOffset =
        IndexToOffset<IndexType, DDims>::get(linearIndex, d);

    op(&a.data[aOffset], &b.data[bOffset], &c.data[cOffset], &d.data[dOffset]);
  }
}

template <typename Op, typename IndexType, int ADims, int BDims, int CDims, int DDims, int EDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
pointwiseApply5(TensorInfo<IndexType> a,
                TensorInfo<IndexType> b,
                TensorInfo<IndexType> c,
                TensorInfo<IndexType> d,
                TensorInfo<IndexType> e,
                IndexType totalElements,
                Op op) {
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x) {
    // Convert `linearIndex` into an offset of `a`
    const IndexType aOffset =
        IndexToOffset<IndexType, ADims>::get(linearIndex, a);

    // Convert `linearIndex` into an offset of `b`
    const IndexType bOffset =
        IndexToOffset<IndexType, BDims>::get(linearIndex, b);

    // Convert `linearIndex` into an offset of `c`
    const IndexType cOffset =
        IndexToOffset<IndexType, CDims>::get(linearIndex, c);

    // Convert `linearIndex` into an offset of `d`
    const IndexType dOffset =
        IndexToOffset<IndexType, DDims>::get(linearIndex, d);

    // Convert `linearIndex` into an offset of `e`
    const IndexType eOffset =
        IndexToOffset<IndexType, EDims>::get(linearIndex, e);

    op(&a.data[aOffset], &b.data[bOffset], &c.data[cOffset], &d.data[dOffset], &e.data[eOffset]);
  }
}

#define APPLY_T(INDEX_TYPE, DIMSA, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { __device__ __forceinline__ void operator()(float* v) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> src, __int64 totalElements) \
{ \
  pointwiseApply1<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA>(src, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME()); \
} \
}

#define APPLY_TT(INDEX_TYPE, DIMSA, DIMSB, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { __device__ __forceinline__ void operator()(float* a, float* b) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> tensorA, TensorInfo<INDEX_TYPE> tensorB, __int64 totalElements) \
{ \
  pointwiseApply2<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA, DIMSB>(tensorA, tensorB, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME()); \
} \
}

#define APPLY_TTT(INDEX_TYPE, DIMSA, DIMSB, DIMSC, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { __device__ __forceinline__ void operator()(float* a, float* b, float* c) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> tensorA, TensorInfo<INDEX_TYPE> tensorB, TensorInfo<INDEX_TYPE> tensorC, __int64 totalElements) \
{ \
  pointwiseApply3<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA, DIMSB, DIMSC>(tensorA, tensorB, tensorC, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME()); \
} \
}

#define APPLY_TTTT(INDEX_TYPE, DIMSA, DIMSB, DIMSC, DIMSD, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { __device__ __forceinline__ void operator()(float* a, float* b, float* c, float* d) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> tensorA, TensorInfo<INDEX_TYPE> tensorB, TensorInfo<INDEX_TYPE> tensorC, TensorInfo<INDEX_TYPE> tensorD, __int64 totalElements) \
{ \
  pointwiseApply4<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA, DIMSB, DIMSC, DIMSD>(tensorA, tensorB, tensorC, tensorD, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME()); \
} \
}

#define APPLY_TTTTT(INDEX_TYPE, DIMSA, DIMSB, DIMSC, DIMSD, DIMSE, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { __device__ __forceinline__ void operator()(float* a, float* b, float* c, float* d, float* e) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> tensorA, TensorInfo<INDEX_TYPE> tensorB, TensorInfo<INDEX_TYPE> tensorC, TensorInfo<INDEX_TYPE> tensorD, TensorInfo<INDEX_TYPE> tensorE, __int64 totalElements) \
{ \
  pointwiseApply5<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA, DIMSB, DIMSC, DIMSD, DIMSE>(tensorA, tensorB, tensorC, tensorD, tensorE, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME()); \
} \
}

#define APPLY_TS(INDEX_TYPE, DIMSA, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { \
  float b; \
  __device__ ConcreteOp##KERNEL_NAME(float bVal) { this->b = bVal; } \
  __device__ __forceinline__ void operator()(float* a) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> a, float b, __int64 totalElements) \
{ \
  pointwiseApply1<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA>(a, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME(b)); \
} \
}

#define APPLY_TSS(INDEX_TYPE, DIMSA, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { \
  float b; \
  float c; \
  __device__ ConcreteOp##KERNEL_NAME(float bVal, float cVal) { this->b = bVal; this->c = cVal; } \
  __device__ __forceinline__ void operator()(float* a) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> a, float b, float c, __int64 totalElements) \
{ \
  pointwiseApply1<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA>(a, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME(b, c)); \
} \
}

#define APPLY_TTS(INDEX_TYPE, DIMSA, DIMSB, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { \
  float c; \
  __device__ ConcreteOp##KERNEL_NAME(float cVal) { this->c = cVal; } \
  __device__ __forceinline__ void operator()(float* a, float* b) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> a, TensorInfo<INDEX_TYPE> b, float c, __int64 totalElements) \
{ \
  pointwiseApply2<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA, DIMSB>(a, b, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME(c)); \
} \
}

#define APPLY_TTSS(INDEX_TYPE, DIMSA, DIMSB, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { \
  float c; \
  float d; \
  __device__ ConcreteOp##KERNEL_NAME(float cVal, float dVal) { this->c = cVal; this->d = dVal; } \
  __device__ __forceinline__ void operator()(float* a, float* b) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> a, TensorInfo<INDEX_TYPE> b, float c, float d, __int64 totalElements) \
{ \
  pointwiseApply2<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA, DIMSB>(a, b, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME(c, d)); \
} \
}

#define APPLY_TTTS(INDEX_TYPE, DIMSA, DIMSB, DIMSC, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { \
  float d; \
  __device__ ConcreteOp##KERNEL_NAME(float dVal) { this->d = dVal; } \
  __device__ __forceinline__ void operator()(float* a, float* b, float* c) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> a, TensorInfo<INDEX_TYPE> b, TensorInfo<INDEX_TYPE> c, float d, __int64 totalElements) \
{ \
  pointwiseApply3<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA, DIMSB, DIMSC>(a, b, c, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME(d)); \
} \
}

/*
#define APPLY_TTTSS(INDEX_TYPE, DIMSA, DIMSB, DIMSC, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { \
  float d; \
  float e; \
  __device__ ConcreteOp##KERNEL_NAME(float dVal, float eVal) { this->d = dVal; this->e = eVal; } \
  __device__ __forceinline__ void operator()(float* a, float* b, float* c) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> a, TensorInfo<INDEX_TYPE> b, TensorInfo<INDEX_TYPE> c, float d, float e, __int64 totalElements) \
{ \
  pointwiseApply3<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA, DIMSB, DIMSC>(a, b, c, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME(d, e)); \
} \
}

#define APPLY_TTTTS(INDEX_TYPE, DIMSA, DIMSB, DIMSC, DIMSD, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { \
  float e; \
  __device__ ConcreteOp##KERNEL_NAME(float eVal) { this->e = eVal; } \
  __device__ __forceinline__ void operator()(float* a, float* b, float* c, float* d) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> a, TensorInfo<INDEX_TYPE> b, TensorInfo<INDEX_TYPE> c, TensorInfo<INDEX_TYPE> d, float e, __int64 totalElements) \
{ \
  pointwiseApply4<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA, DIMSB, DIMSC, DIMSD>(a, b, c, d, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME(e)); \
} \
}

#define APPLY_TTTTSS(INDEX_TYPE, DIMSA, DIMSB, DIMSC, DIMSD, KERNEL_NAME, OP_CODE) \
struct ConcreteOp##KERNEL_NAME { \
  float e; \
  float f; \
  __device__ ConcreteOp##KERNEL_NAME(float eVal, float fVal) { this->e = eVal; this->f = fVal; } \
  __device__ __forceinline__ void operator()(float* a, float* b, float* c, float* d) const { OP_CODE } }; \
extern "C" { \
__global__ void KERNEL_NAME(TensorInfo<INDEX_TYPE> a, TensorInfo<INDEX_TYPE> b, TensorInfo<INDEX_TYPE> c, TensorInfo<INDEX_TYPE> d, float e, float f, __int64 totalElements) \
{ \
  pointwiseApply4<ConcreteOp##KERNEL_NAME, INDEX_TYPE, DIMSA, DIMSB, DIMSC, DIMSD>(a, b, c, d, (INDEX_TYPE)totalElements, ConcreteOp##KERNEL_NAME(e, f)); \
} \
}
*/

APPLY_TS(unsigned __int32, -2, fill__int32_M2, *a = b;)
APPLY_TS(unsigned __int32, -1, fill__int32_M1, *a = b;)
APPLY_TT(unsigned __int32, -2, -2, copy__int32_M2_M2, *a = *b;)
APPLY_TT(unsigned __int32, -1, -2, copy__int32_M1_M2, *a = *b;)
APPLY_TT(unsigned __int32, -2, -1, copy__int32_M2_M1, *a = *b;)
APPLY_TT(unsigned __int32, -1, -1, copy__int32_M1_M1, *a = *b;)
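At run time the resulting PTX is loaded and the extern "C" kernels are launched through the CUDA driver API, along the lines of this sketch (not my exact code; TensorInfoHost, launchFill, devPtr, n, and the launch configuration are placeholders for illustration). The "ErrorLaunchFailed" error from the 10.2/11.0 builds is reported at the synchronize step:

#include <cuda.h>
#include <cstring>

// Host-side mirror of the device TensorInfo for unsigned int indices;
// 216 bytes, matching the .param .align 8 .b8 ...[216] entries in the PTX below.
struct TensorInfoHost {
  float* data;
  unsigned int sizes[25];
  unsigned int strides[25];
  int dims;
};

// Sketch: launch the generated fill__int32_M1 kernel over n contiguous floats.
void launchFill(const char* ptx, CUdeviceptr devPtr, long long n) {
  CUmodule mod;
  CUfunction fn;
  cuModuleLoadData(&mod, ptx);
  cuModuleGetFunction(&fn, mod, "fill__int32_M1");

  TensorInfoHost info;
  std::memset(&info, 0, sizeof(info));
  info.data = reinterpret_cast<float*>(devPtr);
  info.sizes[0] = static_cast<unsigned int>(n);
  info.strides[0] = 1;
  info.dims = 1;

  float value = 1.0f;
  void* args[] = { &info, &value, &n };
  cuLaunchKernel(fn,
                 (unsigned int)((n + 255) / 256), 1, 1,  // grid
                 256, 1, 1,                              // block
                 0, nullptr, args, nullptr);
  cuCtxSynchronize();  // the launch-failed error surfaces here
}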

PTX file compiled by NVRTC 10.1, which works at run time:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-26907403
// Cuda compilation tools, release 10.1, V10.1.243
// Based on LLVM 3.4svn
//

.version 6.4
.target sm_60
.address_size 64

// .globl	fill__int32_M2

.visible .entry fill__int32_M2(
.param .align 8 .b8 fill__int32_M2_param_0[216],
.param .f32 fill__int32_M2_param_1,
.param .u64 fill__int32_M2_param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<2>;
.reg .b32 %r<11>;
.reg .b64 %rd<7>;

mov.b64	%rd2, fill__int32_M2_param_0;
ld.param.f32 	%f1, [fill__int32_M2_param_1];
mov.u32 	%r1, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r1, %r7, %r8;
ld.param.u32 	%r3, [fill__int32_M2_param_2];
setp.ge.u32	%p1, %r10, %r3;
@%p1 bra 	BB0_3;

mov.u64 	%rd3, %rd2;
ld.param.u64 	%rd4, [%rd3];
mov.u32 	%r9, %nctaid.x;
mul.lo.s32 	%r4, %r9, %r1;
cvta.to.global.u64 	%rd1, %rd4;

BB0_2:
mul.wide.u32 %rd5, %r10, 4;
add.s64 %rd6, %rd1, %rd5;
st.global.f32 [%rd6], %f1;
add.s32 %r10, %r4, %r10;
setp.lt.u32 %p2, %r10, %r3;
@%p2 bra BB0_2;

BB0_3:
ret;
}

// .weak	_Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T_

.weak .entry _Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T_(
.param .align 8 .b8 _Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_0[216],
.param .u32 _Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_1,
.param .align 4 .b8 _Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_2[4]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<3>;
.reg .f32 %f<3>;
.reg .b32 %r<11>;
.reg .b64 %rd<7>;

mov.b64	%rd2, _Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_0;
ld.param.u32 	%r6, [_Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_1];
ld.param.f32 	%f2, [_Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_2];
mov.u32 	%r1, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r1, %r7, %r8;
setp.ge.u32	%p1, %r10, %r6;
@%p1 bra 	BB1_3;

mov.u64 	%rd3, %rd2;
ld.param.u64 	%rd4, [%rd3];
mov.u32 	%r9, %nctaid.x;
mul.lo.s32 	%r3, %r9, %r1;
cvta.to.global.u64 	%rd1, %rd4;

BB1_2:
mul.wide.u32 %rd5, %r10, 4;
add.s64 %rd6, %rd1, %rd5;
st.global.f32 [%rd6], %f2;
add.s32 %r10, %r3, %r10;
setp.lt.u32 %p2, %r10, %r6;
@%p2 bra BB1_2;

BB1_3:
ret;
}

// .globl	fill__int32_M1

.visible .entry fill__int32_M1(
.param .align 8 .b8 fill__int32_M1_param_0[216],
.param .f32 fill__int32_M1_param_1,
.param .u64 fill__int32_M1_param_2
)
{
.local .align 8 .b8 __local_depot2[216];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<10>;
.reg .f32 %f<2>;
.reg .b32 %r<91>;
.reg .b64 %rd<24>;

mov.u64 	%SPL, __local_depot2;
mov.b64	%rd10, fill__int32_M1_param_0;
ld.param.f32 	%f1, [fill__int32_M1_param_1];
mov.u64 	%rd22, %rd10;
add.u64 	%rd3, %SPL, 0;
ld.param.u32 	%r1, [fill__int32_M1_param_2];
mov.u32 	%r76, 0;
mov.u64 	%rd23, %rd3;

BB2_1:
ld.param.v2.u32 {%r38, %r39}, [%rd22];
st.local.v2.u32 [%rd23], {%r38, %r39};
add.s64 %rd23, %rd23, 8;
add.s64 %rd22, %rd22, 8;
add.s32 %r76, %r76, 1;
setp.lt.u32 %p1, %r76, 27;
@%p1 bra BB2_1;

mov.u32 	%r42, %ctaid.x;
mov.u32 	%r4, %ntid.x;
mov.u32 	%r43, %tid.x;
mad.lo.s32 	%r77, %r4, %r42, %r43;
setp.ge.u32	%p2, %r77, %r1;
@%p2 bra 	BB2_14;

ld.local.u32 	%r6, [%rd3+208];
add.s32 	%r7, %r6, -1;
ld.local.u64 	%rd8, [%rd3];
mov.u32 	%r44, %nctaid.x;
mul.lo.s32 	%r8, %r44, %r4;
and.b32  	%r9, %r6, 3;
mul.wide.s32 	%rd12, %r7, 4;
add.s64 	%rd13, %rd3, %rd12;
add.s64 	%rd9, %rd13, 8;
add.s32 	%r10, %r6, -2;

BB2_4:
mov.u32 %r90, 0;
setp.lt.s32 %p3, %r7, 0;
@%p3 bra BB2_13;

setp.eq.s32	%p4, %r9, 0;
mov.u32 	%r90, 0;
mov.u32 	%r87, %r7;
mov.u32 	%r88, %r77;
@%p4 bra 	BB2_11;

setp.eq.s32	%p5, %r9, 1;
mov.u32 	%r83, 0;
mov.u32 	%r81, %r7;
mov.u32 	%r82, %r77;
@%p5 bra 	BB2_10;

setp.eq.s32	%p6, %r9, 2;
mov.u32 	%r80, 0;
mov.u32 	%r78, %r7;
mov.u32 	%r79, %r77;
@%p6 bra 	BB2_9;

ld.local.u32 	%r49, [%rd9];
rem.u32 	%r50, %r77, %r49;
ld.local.u32 	%r51, [%rd9+100];
mul.lo.s32 	%r80, %r51, %r50;
div.u32 	%r79, %r77, %r49;
mov.u32 	%r78, %r10;

BB2_9:
mul.wide.s32 %rd14, %r78, 4;
add.s64 %rd15, %rd3, %rd14;
ld.local.u32 %r52, [%rd15+8];
rem.u32 %r53, %r79, %r52;
ld.local.u32 %r54, [%rd15+108];
mad.lo.s32 %r83, %r54, %r53, %r80;
div.u32 %r82, %r79, %r52;
add.s32 %r81, %r78, -1;

BB2_10:
mul.wide.s32 %rd16, %r81, 4;
add.s64 %rd17, %rd3, %rd16;
ld.local.u32 %r55, [%rd17+8];
rem.u32 %r56, %r82, %r55;
ld.local.u32 %r57, [%rd17+108];
mad.lo.s32 %r90, %r57, %r56, %r83;
div.u32 %r88, %r82, %r55;
add.s32 %r87, %r81, -1;

BB2_11:
setp.lt.u32 %p7, %r6, 4;
@%p7 bra BB2_13;

BB2_12:
mul.wide.s32 %rd18, %r87, 4;
add.s64 %rd19, %rd3, %rd18;
ld.local.u32 %r58, [%rd19+8];
rem.u32 %r59, %r88, %r58;
ld.local.u32 %r60, [%rd19+108];
mad.lo.s32 %r61, %r60, %r59, %r90;
div.u32 %r62, %r88, %r58;
ld.local.u32 %r63, [%rd19+4];
rem.u32 %r64, %r62, %r63;
ld.local.u32 %r65, [%rd19+104];
mad.lo.s32 %r66, %r65, %r64, %r61;
div.u32 %r67, %r62, %r63;
ld.local.u32 %r68, [%rd19];
rem.u32 %r69, %r67, %r68;
ld.local.u32 %r70, [%rd19+100];
mad.lo.s32 %r71, %r70, %r69, %r66;
div.u32 %r72, %r67, %r68;
ld.local.u32 %r73, [%rd19+-4];
rem.u32 %r74, %r72, %r73;
ld.local.u32 %r75, [%rd19+96];
mad.lo.s32 %r90, %r75, %r74, %r71;
div.u32 %r88, %r72, %r73;
add.s32 %r87, %r87, -4;
setp.gt.s32 %p8, %r87, -1;
@%p8 bra BB2_12;

BB2_13:
mul.wide.u32 %rd20, %r90, 4;
add.s64 %rd21, %rd8, %rd20;
st.f32 [%rd21], %f1;
add.s32 %r77, %r8, %r77;
setp.lt.u32 %p9, %r77, %r1;
@%p9 bra BB2_4;

BB2_14:
ret;
}

// .weak	_Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T_

.weak .entry _Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T_(
.param .align 8 .b8 _Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_0[216],
.param .u32 _Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_1,
.param .align 4 .b8 _Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_2[4]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<9>;
.reg .f32 %f<3>;
.reg .b32 %r<83>;
.reg .b64 %rd<16>;

mov.b64	%rd4, _Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_0;
ld.param.u32 	%r33, [_Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_1];
ld.param.f32 	%f2, [_Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_2];
mov.u64 	%rd1, %rd4;
mov.u32 	%r1, %ntid.x;
mov.u32 	%r34, %ctaid.x;
mov.u32 	%r35, %tid.x;
mad.lo.s32 	%r69, %r1, %r34, %r35;
setp.ge.u32	%p1, %r69, %r33;
@%p1 bra 	BB3_14;

ld.param.u32 	%r3, [%rd1+208];
add.s32 	%r4, %r3, -1;
ld.param.u64 	%rd5, [%rd1];
cvta.to.global.u64 	%rd2, %rd5;
mov.u32 	%r36, %nctaid.x;
mul.lo.s32 	%r5, %r36, %r1;
mul.wide.s32 	%rd6, %r4, 4;
add.s64 	%rd7, %rd1, %rd6;
add.s64 	%rd3, %rd7, 8;

BB3_2:
mov.u32 %r82, 0;
setp.lt.s32 %p2, %r4, 0;
@%p2 bra BB3_13;

and.b32  	%r41, %r3, 3;
mov.u32 	%r82, 0;
setp.eq.s32	%p3, %r41, 0;
@%p3 bra 	BB3_4;

setp.eq.s32	%p4, %r41, 1;
@%p4 bra 	BB3_6;
bra.uni 	BB3_7;

BB3_6:
mov.u32 %r73, %r4;
mov.u32 %r74, %r69;
bra.uni BB3_10;

BB3_4:
mov.u32 %r79, %r4;
mov.u32 %r80, %r69;
bra.uni BB3_11;

BB3_7:
setp.eq.s32 %p5, %r41, 2;
mov.u32 %r70, %r4;
mov.u32 %r71, %r69;
@%p5 bra BB3_9;

ld.param.u32 	%r42, [%rd3];
rem.u32 	%r43, %r69, %r42;
ld.param.u32 	%r44, [%rd3+100];
mul.lo.s32 	%r82, %r44, %r43;
div.u32 	%r71, %r69, %r42;
add.s32 	%r70, %r3, -2;

BB3_9:
mul.wide.s32 %rd8, %r70, 4;
add.s64 %rd9, %rd1, %rd8;
ld.param.u32 %r45, [%rd9+8];
rem.u32 %r46, %r71, %r45;
ld.param.u32 %r47, [%rd9+108];
mad.lo.s32 %r82, %r47, %r46, %r82;
div.u32 %r74, %r71, %r45;
add.s32 %r73, %r70, -1;

BB3_10:
mul.wide.s32 %rd10, %r73, 4;
add.s64 %rd11, %rd1, %rd10;
ld.param.u32 %r48, [%rd11+8];
rem.u32 %r49, %r74, %r48;
ld.param.u32 %r50, [%rd11+108];
mad.lo.s32 %r82, %r50, %r49, %r82;
div.u32 %r80, %r74, %r48;
add.s32 %r79, %r73, -1;

BB3_11:
setp.lt.u32 %p6, %r3, 4;
@%p6 bra BB3_13;

BB3_12:
mul.wide.s32 %rd12, %r79, 4;
add.s64 %rd13, %rd1, %rd12;
ld.param.u32 %r51, [%rd13+8];
rem.u32 %r52, %r80, %r51;
ld.param.u32 %r53, [%rd13+108];
mad.lo.s32 %r54, %r53, %r52, %r82;
div.u32 %r55, %r80, %r51;
ld.param.u32 %r56, [%rd13+4];
rem.u32 %r57, %r55, %r56;
ld.param.u32 %r58, [%rd13+104];
mad.lo.s32 %r59, %r58, %r57, %r54;
div.u32 %r60, %r55, %r56;
ld.param.u32 %r61, [%rd13];
rem.u32 %r62, %r60, %r61;
ld.param.u32 %r63, [%rd13+100];
mad.lo.s32 %r64, %r63, %r62, %r59;
div.u32 %r65, %r60, %r61;
ld.param.u32 %r66, [%rd13+-4];
rem.u32 %r67, %r65, %r66;
ld.param.u32 %r68, [%rd13+96];
mad.lo.s32 %r82, %r68, %r67, %r64;
div.u32 %r80, %r65, %r66;
add.s32 %r79, %r79, -4;
setp.gt.s32 %p7, %r79, -1;
@%p7 bra BB3_12;

BB3_13:
mul.wide.u32 %rd14, %r82, 4;
add.s64 %rd15, %rd2, %rd14;
st.global.f32 [%rd15], %f2;
add.s32 %r69, %r5, %r69;
setp.lt.u32 %p8, %r69, %r33;
@%p8 bra BB3_2;

BB3_14:
ret;
}

// .globl	copy__int32_M2_M2

.visible .entry copy__int32_M2_M2(
.param .align 8 .b8 copy__int32_M2_M2_param_0[216],
.param .align 8 .b8 copy__int32_M2_M2_param_1[216],
.param .u64 copy__int32_M2_M2_param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<2>;
.reg .b32 %r<11>;
.reg .b64 %rd<12>;

mov.b64	%rd3, copy__int32_M2_M2_param_0;
mov.b64	%rd4, copy__int32_M2_M2_param_1;
mov.u32 	%r1, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r1, %r7, %r8;
ld.param.u32 	%r3, [copy__int32_M2_M2_param_2];
setp.ge.u32	%p1, %r10, %r3;
@%p1 bra 	BB4_3;

mov.u64 	%rd5, %rd3;
ld.param.u64 	%rd6, [%rd5];
mov.u64 	%rd7, %rd4;
ld.param.u64 	%rd8, [%rd7];
mov.u32 	%r9, %nctaid.x;
mul.lo.s32 	%r4, %r9, %r1;
cvta.to.global.u64 	%rd1, %rd8;
cvta.to.global.u64 	%rd2, %rd6;

BB4_2:
mul.wide.u32 %rd9, %r10, 4;
add.s64 %rd10, %rd2, %rd9;
add.s64 %rd11, %rd1, %rd9;
ld.global.f32 %f1, [%rd11];
st.global.f32 [%rd10], %f1;
add.s32 %r10, %r4, %r10;
setp.lt.u32 %p2, %r10, %r3;
@%p2 bra BB4_2;

BB4_3:
ret;
}

// .weak	_Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T_

.weak .entry _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T_(
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_0[216],
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_1[216],
.param .u32 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_2,
.param .align 1 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_3[1]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<3>;
.reg .f32 %f<2>;
.reg .b32 %r<11>;
.reg .b64 %rd<12>;

mov.b64	%rd3, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_0;
mov.b64	%rd4, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_1;
ld.param.u32 	%r6, [_Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_2];
mov.u32 	%r1, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r1, %r7, %r8;
setp.ge.u32	%p1, %r10, %r6;
@%p1 bra 	BB5_3;

mov.u64 	%rd5, %rd3;
ld.param.u64 	%rd6, [%rd5];
mov.u64 	%rd7, %rd4;
ld.param.u64 	%rd8, [%rd7];
mov.u32 	%r9, %nctaid.x;
mul.lo.s32 	%r3, %r9, %r1;
cvta.to.global.u64 	%rd1, %rd8;
cvta.to.global.u64 	%rd2, %rd6;

BB5_2:
mul.wide.u32 %rd9, %r10, 4;
add.s64 %rd10, %rd2, %rd9;
add.s64 %rd11, %rd1, %rd9;
ld.global.f32 %f1, [%rd11];
st.global.f32 [%rd10], %f1;
add.s32 %r10, %r3, %r10;
setp.lt.u32 %p2, %r10, %r6;
@%p2 bra BB5_2;

BB5_3:
ret;
}

// .globl	copy__int32_M1_M2

.visible .entry copy__int32_M1_M2(
.param .align 8 .b8 copy__int32_M1_M2_param_0[216],
.param .align 8 .b8 copy__int32_M1_M2_param_1[216],
.param .u64 copy__int32_M1_M2_param_2
)
{
.local .align 8 .b8 __local_depot6[216];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<10>;
.reg .f32 %f<2>;
.reg .b32 %r<91>;
.reg .b64 %rd<30>;

mov.u64 	%SPL, __local_depot6;
mov.b64	%rd12, copy__int32_M1_M2_param_0;
mov.b64	%rd13, copy__int32_M1_M2_param_1;
mov.u64 	%rd28, %rd12;
mov.u64 	%rd2, %rd13;
add.u64 	%rd4, %SPL, 0;
ld.param.u32 	%r1, [copy__int32_M1_M2_param_2];
mov.u32 	%r76, 0;
mov.u64 	%rd29, %rd4;

BB6_1:
ld.param.v2.u32 {%r38, %r39}, [%rd28];
st.local.v2.u32 [%rd29], {%r38, %r39};
add.s64 %rd29, %rd29, 8;
add.s64 %rd28, %rd28, 8;
add.s32 %r76, %r76, 1;
setp.lt.u32 %p1, %r76, 27;
@%p1 bra BB6_1;

mov.u32 	%r4, %ntid.x;
mov.u32 	%r42, %ctaid.x;
mov.u32 	%r43, %tid.x;
mad.lo.s32 	%r77, %r4, %r42, %r43;
setp.ge.u32	%p2, %r77, %r1;
@%p2 bra 	BB6_14;

ld.param.u64 	%rd15, [%rd2];
ld.local.u32 	%r6, [%rd4+208];
add.s32 	%r7, %r6, -1;
ld.local.u64 	%rd9, [%rd4];
mov.u32 	%r44, %nctaid.x;
mul.lo.s32 	%r8, %r44, %r4;
and.b32  	%r9, %r6, 3;
mul.wide.s32 	%rd16, %r7, 4;
add.s64 	%rd17, %rd4, %rd16;
add.s64 	%rd10, %rd17, 8;
add.s32 	%r10, %r6, -2;
cvta.to.global.u64 	%rd11, %rd15;

BB6_4:
mov.u32 %r90, 0;
setp.lt.s32 %p3, %r7, 0;
@%p3 bra BB6_13;

setp.eq.s32	%p4, %r9, 0;
mov.u32 	%r90, 0;
mov.u32 	%r87, %r7;
mov.u32 	%r88, %r77;
@%p4 bra 	BB6_11;

setp.eq.s32	%p5, %r9, 1;
mov.u32 	%r83, 0;
mov.u32 	%r81, %r7;
mov.u32 	%r82, %r77;
@%p5 bra 	BB6_10;

setp.eq.s32	%p6, %r9, 2;
mov.u32 	%r80, 0;
mov.u32 	%r78, %r7;
mov.u32 	%r79, %r77;
@%p6 bra 	BB6_9;

ld.local.u32 	%r49, [%rd10];
rem.u32 	%r50, %r77, %r49;
ld.local.u32 	%r51, [%rd10+100];
mul.lo.s32 	%r80, %r51, %r50;
div.u32 	%r79, %r77, %r49;
mov.u32 	%r78, %r10;

BB6_9:
mul.wide.s32 %rd18, %r78, 4;
add.s64 %rd19, %rd4, %rd18;
ld.local.u32 %r52, [%rd19+8];
rem.u32 %r53, %r79, %r52;
ld.local.u32 %r54, [%rd19+108];
mad.lo.s32 %r83, %r54, %r53, %r80;
div.u32 %r82, %r79, %r52;
add.s32 %r81, %r78, -1;

BB6_10:
mul.wide.s32 %rd20, %r81, 4;
add.s64 %rd21, %rd4, %rd20;
ld.local.u32 %r55, [%rd21+8];
rem.u32 %r56, %r82, %r55;
ld.local.u32 %r57, [%rd21+108];
mad.lo.s32 %r90, %r57, %r56, %r83;
div.u32 %r88, %r82, %r55;
add.s32 %r87, %r81, -1;

BB6_11:
setp.lt.u32 %p7, %r6, 4;
@%p7 bra BB6_13;

BB6_12:
mul.wide.s32 %rd22, %r87, 4;
add.s64 %rd23, %rd4, %rd22;
ld.local.u32 %r58, [%rd23+8];
rem.u32 %r59, %r88, %r58;
ld.local.u32 %r60, [%rd23+108];
mad.lo.s32 %r61, %r60, %r59, %r90;
div.u32 %r62, %r88, %r58;
ld.local.u32 %r63, [%rd23+4];
rem.u32 %r64, %r62, %r63;
ld.local.u32 %r65, [%rd23+104];
mad.lo.s32 %r66, %r65, %r64, %r61;
div.u32 %r67, %r62, %r63;
ld.local.u32 %r68, [%rd23];
rem.u32 %r69, %r67, %r68;
ld.local.u32 %r70, [%rd23+100];
mad.lo.s32 %r71, %r70, %r69, %r66;
div.u32 %r72, %r67, %r68;
ld.local.u32 %r73, [%rd23+-4];
rem.u32 %r74, %r72, %r73;
ld.local.u32 %r75, [%rd23+96];
mad.lo.s32 %r90, %r75, %r74, %r71;
div.u32 %r88, %r72, %r73;
add.s32 %r87, %r87, -4;
setp.gt.s32 %p8, %r87, -1;
@%p8 bra BB6_12;

BB6_13:
mul.wide.u32 %rd24, %r90, 4;
add.s64 %rd25, %rd9, %rd24;
mul.wide.u32 %rd26, %r77, 4;
add.s64 %rd27, %rd11, %rd26;
ld.global.f32 %f1, [%rd27];
st.f32 [%rd25], %f1;
add.s32 %r77, %r8, %r77;
setp.lt.u32 %p9, %r77, %r1;
@%p9 bra BB6_4;

BB6_14:
ret;
}

// .weak	_Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T_

.weak .entry _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T_(
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_0[216],
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_1[216],
.param .u32 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_2,
.param .align 1 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_3[1]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<9>;
.reg .f32 %f<2>;
.reg .b32 %r<83>;
.reg .b64 %rd<22>;

mov.b64	%rd6, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_0;
mov.b64	%rd5, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_1;
ld.param.u32 	%r32, [_Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_2];
mov.u64 	%rd1, %rd6;
mov.u32 	%r33, %ntid.x;
mov.u32 	%r34, %ctaid.x;
mov.u32 	%r35, %tid.x;
mad.lo.s32 	%r69, %r33, %r34, %r35;
setp.ge.u32	%p1, %r69, %r32;
@%p1 bra 	BB7_14;

mov.u64 	%rd7, %rd5;
ld.param.u64 	%rd8, [%rd7];
ld.param.u32 	%r2, [%rd1+208];
add.s32 	%r3, %r2, -1;
ld.param.u64 	%rd9, [%rd1];
cvta.to.global.u64 	%rd2, %rd9;
and.b32  	%r4, %r2, 3;
mul.wide.s32 	%rd10, %r3, 4;
add.s64 	%rd11, %rd1, %rd10;
add.s64 	%rd3, %rd11, 8;
cvta.to.global.u64 	%rd4, %rd8;

BB7_2:
mov.u32 %r82, 0;
setp.lt.s32 %p2, %r3, 0;
@%p2 bra BB7_13;

mov.u32 	%r82, 0;
setp.eq.s32	%p3, %r4, 0;
@%p3 bra 	BB7_4;

setp.eq.s32	%p4, %r4, 1;
@%p4 bra 	BB7_6;
bra.uni 	BB7_7;

BB7_6:
mov.u32 %r73, %r3;
mov.u32 %r74, %r69;
bra.uni BB7_10;

BB7_4:
mov.u32 %r79, %r3;
mov.u32 %r80, %r69;
bra.uni BB7_11;

BB7_7:
setp.eq.s32 %p5, %r4, 2;
mov.u32 %r70, %r3;
mov.u32 %r71, %r69;
@%p5 bra BB7_9;

ld.param.u32 	%r40, [%rd3];
rem.u32 	%r41, %r69, %r40;
ld.param.u32 	%r42, [%rd3+100];
mul.lo.s32 	%r82, %r42, %r41;
div.u32 	%r71, %r69, %r40;
add.s32 	%r70, %r2, -2;

BB7_9:
mul.wide.s32 %rd12, %r70, 4;
add.s64 %rd13, %rd1, %rd12;
ld.param.u32 %r43, [%rd13+8];
rem.u32 %r44, %r71, %r43;
ld.param.u32 %r45, [%rd13+108];
mad.lo.s32 %r82, %r45, %r44, %r82;
div.u32 %r74, %r71, %r43;
add.s32 %r73, %r70, -1;

BB7_10:
mul.wide.s32 %rd14, %r73, 4;
add.s64 %rd15, %rd1, %rd14;
ld.param.u32 %r46, [%rd15+8];
rem.u32 %r47, %r74, %r46;
ld.param.u32 %r48, [%rd15+108];
mad.lo.s32 %r82, %r48, %r47, %r82;
div.u32 %r80, %r74, %r46;
add.s32 %r79, %r73, -1;

BB7_11:
setp.lt.u32 %p6, %r2, 4;
@%p6 bra BB7_13;

BB7_12:
mul.wide.s32 %rd16, %r79, 4;
add.s64 %rd17, %rd1, %rd16;
ld.param.u32 %r49, [%rd17+8];
rem.u32 %r50, %r80, %r49;
ld.param.u32 %r51, [%rd17+108];
mad.lo.s32 %r52, %r51, %r50, %r82;
div.u32 %r53, %r80, %r49;
ld.param.u32 %r54, [%rd17+4];
rem.u32 %r55, %r53, %r54;
ld.param.u32 %r56, [%rd17+104];
mad.lo.s32 %r57, %r56, %r55, %r52;
div.u32 %r58, %r53, %r54;
ld.param.u32 %r59, [%rd17];
rem.u32 %r60, %r58, %r59;
ld.param.u32 %r61, [%rd17+100];
mad.lo.s32 %r62, %r61, %r60, %r57;
div.u32 %r63, %r58, %r59;
ld.param.u32 %r64, [%rd17+-4];
rem.u32 %r65, %r63, %r64;
ld.param.u32 %r66, [%rd17+96];
mad.lo.s32 %r82, %r66, %r65, %r62;
div.u32 %r80, %r63, %r64;
add.s32 %r79, %r79, -4;
setp.gt.s32 %p7, %r79, -1;
@%p7 bra BB7_12;

BB7_13:
mul.wide.u32 %rd18, %r82, 4;
add.s64 %rd19, %rd2, %rd18;
mul.wide.u32 %rd20, %r69, 4;
add.s64 %rd21, %rd4, %rd20;
ld.global.f32 %f1, [%rd21];
st.global.f32 [%rd19], %f1;
mov.u32 %r68, %nctaid.x;
mad.lo.s32 %r69, %r68, %r33, %r69;
setp.lt.u32 %p8, %r69, %r32;
@%p8 bra BB7_2;

BB7_14:
ret;
}

// .globl	copy__int32_M2_M1

.visible .entry copy__int32_M2_M1(
.param .align 8 .b8 copy__int32_M2_M1_param_0[216],
.param .align 8 .b8 copy__int32_M2_M1_param_1[216],
.param .u64 copy__int32_M2_M1_param_2
)
{
.local .align 8 .b8 __local_depot8[216];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<10>;
.reg .f32 %f<2>;
.reg .b32 %r<91>;
.reg .b64 %rd<28>;

mov.u64 	%SPL, __local_depot8;
mov.b64	%rd11, copy__int32_M2_M1_param_1;
mov.u64 	%rd26, %rd11;
add.u64 	%rd4, %SPL, 0;
ld.param.u32 	%r1, [copy__int32_M2_M1_param_2];
ld.param.u64 	%rd13, [copy__int32_M2_M1_param_0];
cvta.to.global.u64 	%rd3, %rd13;
mov.u32 	%r76, 0;
mov.u64 	%rd27, %rd4;

BB8_1:
ld.param.v2.u32 {%r38, %r39}, [%rd26];
st.local.v2.u32 [%rd27], {%r38, %r39};
add.s64 %rd27, %rd27, 8;
add.s64 %rd26, %rd26, 8;
add.s32 %r76, %r76, 1;
setp.lt.u32 %p1, %r76, 27;
@%p1 bra BB8_1;

mov.u32 	%r42, %ctaid.x;
mov.u32 	%r4, %ntid.x;
mov.u32 	%r43, %tid.x;
mad.lo.s32 	%r77, %r4, %r42, %r43;
setp.ge.u32	%p2, %r77, %r1;
@%p2 bra 	BB8_16;

ld.local.u32 	%r6, [%rd4+208];
add.s32 	%r7, %r6, -1;
ld.local.u64 	%rd9, [%rd4];
mov.u32 	%r44, %nctaid.x;
mul.lo.s32 	%r8, %r44, %r4;
and.b32  	%r9, %r6, 3;
mul.wide.s32 	%rd14, %r7, 4;
add.s64 	%rd15, %rd4, %rd14;
add.s64 	%rd10, %rd15, 8;
add.s32 	%r10, %r6, -2;

BB8_4:
mov.u32 %r90, 0;
setp.lt.s32 %p3, %r7, 0;
@%p3 bra BB8_15;

mov.u32 	%r90, 0;
setp.eq.s32	%p4, %r9, 0;
@%p4 bra 	BB8_6;

setp.eq.s32	%p5, %r9, 1;
@%p5 bra 	BB8_8;
bra.uni 	BB8_9;

BB8_8:
mov.u32 %r81, %r7;
mov.u32 %r82, %r77;
bra.uni BB8_12;

BB8_6:
mov.u32 %r87, %r7;
mov.u32 %r88, %r77;
bra.uni BB8_13;

BB8_9:
setp.eq.s32 %p6, %r9, 2;
mov.u32 %r78, %r7;
mov.u32 %r79, %r77;
@%p6 bra BB8_11;

ld.local.u32 	%r49, [%rd10];
rem.u32 	%r50, %r77, %r49;
ld.local.u32 	%r51, [%rd10+100];
mul.lo.s32 	%r90, %r51, %r50;
div.u32 	%r79, %r77, %r49;
mov.u32 	%r78, %r10;

BB8_11:
mul.wide.s32 %rd16, %r78, 4;
add.s64 %rd17, %rd4, %rd16;
ld.local.u32 %r52, [%rd17+8];
rem.u32 %r53, %r79, %r52;
ld.local.u32 %r54, [%rd17+108];
mad.lo.s32 %r90, %r54, %r53, %r90;
div.u32 %r82, %r79, %r52;
add.s32 %r81, %r78, -1;

BB8_12:
mul.wide.s32 %rd18, %r81, 4;
add.s64 %rd19, %rd4, %rd18;
ld.local.u32 %r55, [%rd19+8];
rem.u32 %r56, %r82, %r55;
ld.local.u32 %r57, [%rd19+108];
mad.lo.s32 %r90, %r57, %r56, %r90;
div.u32 %r88, %r82, %r55;
add.s32 %r87, %r81, -1;

BB8_13:
setp.lt.u32 %p7, %r6, 4;
@%p7 bra BB8_15;

BB8_14:
mul.wide.s32 %rd20, %r87, 4;
add.s64 %rd21, %rd4, %rd20;
ld.local.u32 %r58, [%rd21+8];
rem.u32 %r59, %r88, %r58;
ld.local.u32 %r60, [%rd21+108];
mad.lo.s32 %r61, %r60, %r59, %r90;
div.u32 %r62, %r88, %r58;
ld.local.u32 %r63, [%rd21+4];
rem.u32 %r64, %r62, %r63;
ld.local.u32 %r65, [%rd21+104];
mad.lo.s32 %r66, %r65, %r64, %r61;
div.u32 %r67, %r62, %r63;
ld.local.u32 %r68, [%rd21];
rem.u32 %r69, %r67, %r68;
ld.local.u32 %r70, [%rd21+100];
mad.lo.s32 %r71, %r70, %r69, %r66;
div.u32 %r72, %r67, %r68;
ld.local.u32 %r73, [%rd21±4];
rem.u32 %r74, %r72, %r73;
ld.local.u32 %r75, [%rd21+96];
mad.lo.s32 %r90, %r75, %r74, %r71;
div.u32 %r88, %r72, %r73;
add.s32 %r87, %r87, -4;
setp.gt.s32 %p8, %r87, -1;
@%p8 bra BB8_14;

BB8_15:
mul.wide.u32 %rd22, %r77, 4;
add.s64 %rd23, %rd3, %rd22;
mul.wide.u32 %rd24, %r90, 4;
add.s64 %rd25, %rd9, %rd24;
ld.f32 %f1, [%rd25];
st.global.f32 [%rd23], %f1;
add.s32 %r77, %r8, %r77;
setp.lt.u32 %p9, %r77, %r1;
@%p9 bra BB8_4;

BB8_16:
ret;
}

// .weak	_Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T_

.weak .entry _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T_(
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_0[216],
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_1[216],
.param .u32 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_2,
.param .align 1 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_3[1]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<9>;
.reg .f32 %f<2>;
.reg .b32 %r<83>;
.reg .b64 %rd<22>;

mov.b64	%rd5, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_0;
mov.b64	%rd6, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_1;
ld.param.u32 	%r32, [_Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_2];
mov.u64 	%rd1, %rd6;
mov.u32 	%r33, %ntid.x;
mov.u32 	%r34, %ctaid.x;
mov.u32 	%r35, %tid.x;
mad.lo.s32 	%r69, %r33, %r34, %r35;
setp.ge.u32	%p1, %r69, %r32;
@%p1 bra 	BB9_14;

mov.u64 	%rd7, %rd5;
ld.param.u64 	%rd8, [%rd7];
ld.param.u32 	%r2, [%rd1+208];
add.s32 	%r3, %r2, -1;
ld.param.u64 	%rd9, [%rd1];
cvta.to.global.u64 	%rd2, %rd9;
and.b32  	%r4, %r2, 3;
mul.wide.s32 	%rd10, %r3, 4;
add.s64 	%rd11, %rd1, %rd10;
add.s64 	%rd3, %rd11, 8;
cvta.to.global.u64 	%rd4, %rd8;

BB9_2:
mov.u32 %r82, 0;
setp.lt.s32 %p2, %r3, 0;
@%p2 bra BB9_13;

mov.u32 	%r82, 0;
setp.eq.s32	%p3, %r4, 0;
@%p3 bra 	BB9_4;

setp.eq.s32	%p4, %r4, 1;
@%p4 bra 	BB9_6;
bra.uni 	BB9_7;

BB9_6:
mov.u32 %r73, %r3;
mov.u32 %r74, %r69;
bra.uni BB9_10;

BB9_4:
mov.u32 %r79, %r3;
mov.u32 %r80, %r69;
bra.uni BB9_11;

BB9_7:
setp.eq.s32 %p5, %r4, 2;
mov.u32 %r70, %r3;
mov.u32 %r71, %r69;
@%p5 bra BB9_9;

ld.param.u32 	%r40, [%rd3];
rem.u32 	%r41, %r69, %r40;
ld.param.u32 	%r42, [%rd3+100];
mul.lo.s32 	%r82, %r42, %r41;
div.u32 	%r71, %r69, %r40;
add.s32 	%r70, %r2, -2;

BB9_9:
mul.wide.s32 %rd12, %r70, 4;
add.s64 %rd13, %rd1, %rd12;
ld.param.u32 %r43, [%rd13+8];
rem.u32 %r44, %r71, %r43;
ld.param.u32 %r45, [%rd13+108];
mad.lo.s32 %r82, %r45, %r44, %r82;
div.u32 %r74, %r71, %r43;
add.s32 %r73, %r70, -1;

BB9_10:
mul.wide.s32 %rd14, %r73, 4;
add.s64 %rd15, %rd1, %rd14;
ld.param.u32 %r46, [%rd15+8];
rem.u32 %r47, %r74, %r46;
ld.param.u32 %r48, [%rd15+108];
mad.lo.s32 %r82, %r48, %r47, %r82;
div.u32 %r80, %r74, %r46;
add.s32 %r79, %r73, -1;

BB9_11:
setp.lt.u32 %p6, %r2, 4;
@%p6 bra BB9_13;

BB9_12:
mul.wide.s32 %rd16, %r79, 4;
add.s64 %rd17, %rd1, %rd16;
ld.param.u32 %r49, [%rd17+8];
rem.u32 %r50, %r80, %r49;
ld.param.u32 %r51, [%rd17+108];
mad.lo.s32 %r52, %r51, %r50, %r82;
div.u32 %r53, %r80, %r49;
ld.param.u32 %r54, [%rd17+4];
rem.u32 %r55, %r53, %r54;
ld.param.u32 %r56, [%rd17+104];
mad.lo.s32 %r57, %r56, %r55, %r52;
div.u32 %r58, %r53, %r54;
ld.param.u32 %r59, [%rd17];
rem.u32 %r60, %r58, %r59;
ld.param.u32 %r61, [%rd17+100];
mad.lo.s32 %r62, %r61, %r60, %r57;
div.u32 %r63, %r58, %r59;
ld.param.u32 %r64, [%rd17+-4];
rem.u32 %r65, %r63, %r64;
ld.param.u32 %r66, [%rd17+96];
mad.lo.s32 %r82, %r66, %r65, %r62;
div.u32 %r80, %r63, %r64;
add.s32 %r79, %r79, -4;
setp.gt.s32 %p7, %r79, -1;
@%p7 bra BB9_12;

BB9_13:
mul.wide.u32 %rd18, %r69, 4;
add.s64 %rd19, %rd4, %rd18;
mul.wide.u32 %rd20, %r82, 4;
add.s64 %rd21, %rd2, %rd20;
ld.global.f32 %f1, [%rd21];
st.global.f32 [%rd19], %f1;
mov.u32 %r68, %nctaid.x;
mad.lo.s32 %r69, %r68, %r33, %r69;
setp.lt.u32 %p8, %r69, %r32;
@%p8 bra BB9_2;

BB9_14:
ret;
}

// .globl	copy__int32_M1_M1

.visible .entry copy__int32_M1_M1(
.param .align 8 .b8 copy__int32_M1_M1_param_0[216],
.param .align 8 .b8 copy__int32_M1_M1_param_1[216],
.param .u64 copy__int32_M1_M1_param_2
)
{
.local .align 8 .b8 __local_depot10[432];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<17>;
.reg .f32 %f<2>;
.reg .b32 %r<179>;
.reg .b64 %rd<48>;

mov.u64 	%SPL, __local_depot10;
mov.b64	%rd19, copy__int32_M1_M1_param_0;
mov.b64	%rd20, copy__int32_M1_M1_param_1;
mov.u64 	%rd44, %rd19;
mov.u64 	%rd46, %rd20;
add.u64 	%rd5, %SPL, 0;
add.u64 	%rd4, %SPL, 216;
ld.param.u32 	%r1, [copy__int32_M1_M1_param_2];
mov.u32 	%r150, 0;
mov.u64 	%rd45, %rd5;

BB10_1:
ld.param.v2.u32 {%r72, %r73}, [%rd44];
st.local.v2.u32 [%rd45], {%r72, %r73};
add.s64 %rd45, %rd45, 8;
add.s64 %rd44, %rd44, 8;
add.s32 %r150, %r150, 1;
setp.lt.u32 %p1, %r150, 27;
@%p1 bra BB10_1;

mov.u32 	%r151, 0;
mov.u64 	%rd47, %rd4;

BB10_3:
ld.param.v2.u32 {%r77, %r78}, [%rd46];
st.local.v2.u32 [%rd47], {%r77, %r78};
add.s64 %rd47, %rd47, 8;
add.s64 %rd46, %rd46, 8;
add.s32 %r151, %r151, 1;
setp.lt.u32 %p2, %r151, 27;
@%p2 bra BB10_3;

mov.u32 	%r81, %ctaid.x;
mov.u32 	%r6, %ntid.x;
mov.u32 	%r82, %tid.x;
mad.lo.s32 	%r152, %r6, %r81, %r82;
setp.ge.u32	%p3, %r152, %r1;
@%p3 bra 	BB10_25;

ld.local.u64 	%rd15, [%rd5];
ld.local.u64 	%rd16, [%rd4];
mov.u32 	%r83, %nctaid.x;
mul.lo.s32 	%r8, %r83, %r6;
ld.local.u32 	%r9, [%rd5+208];
and.b32  	%r10, %r9, 3;
add.s32 	%r84, %r9, -1;
mul.wide.s32 	%rd24, %r84, 4;
add.s64 	%rd25, %rd5, %rd24;
add.s64 	%rd17, %rd25, 8;
ld.local.u32 	%r11, [%rd4+208];
and.b32  	%r12, %r11, 3;
add.s32 	%r85, %r11, -1;
mul.wide.s32 	%rd26, %r85, 4;
add.s64 	%rd27, %rd4, %rd26;
add.s64 	%rd18, %rd27, 8;

BB10_6:
mov.u32 %r178, 0;
setp.lt.s32 %p4, %r84, 0;
mov.u32 %r165, %r178;
@%p4 bra BB10_15;

setp.eq.s32	%p5, %r10, 0;
mov.u32 	%r165, 0;
mov.u32 	%r162, %r84;
mov.u32 	%r163, %r152;
@%p5 bra 	BB10_13;

setp.eq.s32	%p6, %r10, 1;
add.s32 	%r156, %r9, -1;
mov.u32 	%r158, 0;
mov.u32 	%r157, %r152;
@%p6 bra 	BB10_12;

setp.eq.s32	%p7, %r10, 2;
add.s32 	%r153, %r9, -1;
mov.u32 	%r155, 0;
mov.u32 	%r154, %r152;
@%p7 bra 	BB10_11;

ld.local.u32 	%r91, [%rd17];
rem.u32 	%r92, %r152, %r91;
ld.local.u32 	%r93, [%rd17+100];
mul.lo.s32 	%r155, %r93, %r92;
div.u32 	%r154, %r152, %r91;
add.s32 	%r153, %r9, -2;

BB10_11:
mul.wide.s32 %rd28, %r153, 4;
add.s64 %rd29, %rd5, %rd28;
ld.local.u32 %r94, [%rd29+8];
rem.u32 %r95, %r154, %r94;
ld.local.u32 %r96, [%rd29+108];
mad.lo.s32 %r158, %r96, %r95, %r155;
div.u32 %r157, %r154, %r94;
add.s32 %r156, %r153, -1;

BB10_12:
mul.wide.s32 %rd30, %r156, 4;
add.s64 %rd31, %rd5, %rd30;
ld.local.u32 %r97, [%rd31+8];
rem.u32 %r98, %r157, %r97;
ld.local.u32 %r99, [%rd31+108];
mad.lo.s32 %r165, %r99, %r98, %r158;
div.u32 %r163, %r157, %r97;
add.s32 %r162, %r156, -1;

BB10_13:
setp.lt.u32 %p8, %r9, 4;
@%p8 bra BB10_15;

BB10_14:
mul.wide.s32 %rd32, %r162, 4;
add.s64 %rd33, %rd5, %rd32;
ld.local.u32 %r100, [%rd33+8];
rem.u32 %r101, %r163, %r100;
ld.local.u32 %r102, [%rd33+108];
mad.lo.s32 %r103, %r102, %r101, %r165;
div.u32 %r104, %r163, %r100;
ld.local.u32 %r105, [%rd33+4];
rem.u32 %r106, %r104, %r105;
ld.local.u32 %r107, [%rd33+104];
mad.lo.s32 %r108, %r107, %r106, %r103;
div.u32 %r109, %r104, %r105;
ld.local.u32 %r110, [%rd33];
rem.u32 %r111, %r109, %r110;
ld.local.u32 %r112, [%rd33+100];
mad.lo.s32 %r113, %r112, %r111, %r108;
div.u32 %r114, %r109, %r110;
ld.local.u32 %r115, [%rd33+-4];
rem.u32 %r116, %r114, %r115;
ld.local.u32 %r117, [%rd33+96];
mad.lo.s32 %r165, %r117, %r116, %r113;
div.u32 %r163, %r114, %r115;
add.s32 %r162, %r162, -4;
setp.gt.s32 %p9, %r162, -1;
@%p9 bra BB10_14;

BB10_15:
setp.lt.s32 %p10, %r85, 0;
@%p10 bra BB10_24;

setp.eq.s32	%p11, %r12, 0;
mov.u32 	%r178, 0;
mov.u32 	%r175, %r85;
mov.u32 	%r176, %r152;
@%p11 bra 	BB10_22;

setp.eq.s32	%p12, %r12, 1;
add.s32 	%r169, %r11, -1;
mov.u32 	%r171, 0;
mov.u32 	%r170, %r152;
@%p12 bra 	BB10_21;

setp.eq.s32	%p13, %r12, 2;
add.s32 	%r166, %r11, -1;
mov.u32 	%r168, 0;
mov.u32 	%r167, %r152;
@%p13 bra 	BB10_20;

ld.local.u32 	%r123, [%rd18];
rem.u32 	%r124, %r152, %r123;
ld.local.u32 	%r125, [%rd18+100];
mul.lo.s32 	%r168, %r125, %r124;
div.u32 	%r167, %r152, %r123;
add.s32 	%r166, %r11, -2;

BB10_20:
mul.wide.s32 %rd34, %r166, 4;
add.s64 %rd35, %rd4, %rd34;
ld.local.u32 %r126, [%rd35+8];
rem.u32 %r127, %r167, %r126;
ld.local.u32 %r128, [%rd35+108];
mad.lo.s32 %r171, %r128, %r127, %r168;
div.u32 %r170, %r167, %r126;
add.s32 %r169, %r166, -1;

BB10_21:
mul.wide.s32 %rd36, %r169, 4;
add.s64 %rd37, %rd4, %rd36;
ld.local.u32 %r129, [%rd37+8];
rem.u32 %r130, %r170, %r129;
ld.local.u32 %r131, [%rd37+108];
mad.lo.s32 %r178, %r131, %r130, %r171;
div.u32 %r176, %r170, %r129;
add.s32 %r175, %r169, -1;

BB10_22:
setp.lt.u32 %p14, %r11, 4;
@%p14 bra BB10_24;

BB10_23:
mul.wide.s32 %rd38, %r175, 4;
add.s64 %rd39, %rd4, %rd38;
ld.local.u32 %r132, [%rd39+8];
rem.u32 %r133, %r176, %r132;
ld.local.u32 %r134, [%rd39+108];
mad.lo.s32 %r135, %r134, %r133, %r178;
div.u32 %r136, %r176, %r132;
ld.local.u32 %r137, [%rd39+4];
rem.u32 %r138, %r136, %r137;
ld.local.u32 %r139, [%rd39+104];
mad.lo.s32 %r140, %r139, %r138, %r135;
div.u32 %r141, %r136, %r137;
ld.local.u32 %r142, [%rd39];
rem.u32 %r143, %r141, %r142;
ld.local.u32 %r144, [%rd39+100];
mad.lo.s32 %r145, %r144, %r143, %r140;
div.u32 %r146, %r141, %r142;
ld.local.u32 %r147, [%rd39+-4];
rem.u32 %r148, %r146, %r147;
ld.local.u32 %r149, [%rd39+96];
mad.lo.s32 %r178, %r149, %r148, %r145;
div.u32 %r176, %r146, %r147;
add.s32 %r175, %r175, -4;
setp.gt.s32 %p15, %r175, -1;
@%p15 bra BB10_23;

BB10_24:
mul.wide.u32 %rd40, %r165, 4;
add.s64 %rd41, %rd15, %rd40;
mul.wide.u32 %rd42, %r178, 4;
add.s64 %rd43, %rd16, %rd42;
ld.f32 %f1, [%rd43];
st.f32 [%rd41], %f1;
add.s32 %r152, %r8, %r152;
setp.lt.u32 %p16, %r152, %r1;
@%p16 bra BB10_6;

BB10_25:
ret;
}

// .weak	_Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T_

.weak .entry _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T_(
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_0[216],
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_1[216],
.param .u32 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_2,
.param .align 1 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_3[1]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<15>;
.reg .f32 %f<2>;
.reg .b32 %r<159>;
.reg .b64 %rd<31>;

mov.b64	%rd5, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_0;
mov.b64	%rd6, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_1;
ld.param.u32 	%r58, [_Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_2];
mov.u64 	%rd1, %rd5;
mov.u64 	%rd2, %rd6;
mov.u32 	%r59, %ntid.x;
mov.u32 	%r60, %ctaid.x;
mov.u32 	%r61, %tid.x;
mad.lo.s32 	%r132, %r59, %r60, %r61;
setp.ge.u32	%p1, %r132, %r58;
@%p1 bra 	BB11_25;

ld.param.u32 	%r2, [%rd1+208];
ld.param.u32 	%r3, [%rd2+208];
ld.param.u64 	%rd3, [%rd1];
ld.param.u64 	%rd4, [%rd2];
cvta.to.global.u64 	%rd25, %rd3;
cvta.to.global.u64 	%rd28, %rd4;

BB11_2:
add.s32 %r133, %r2, -1;
mov.u32 %r158, 0;
setp.lt.s32 %p2, %r133, 0;
mov.u32 %r145, %r158;
@%p2 bra BB11_13;

and.b32  	%r67, %r2, 3;
mov.u32 	%r145, 0;
setp.eq.s32	%p3, %r67, 0;
@%p3 bra 	BB11_4;

setp.eq.s32	%p4, %r67, 1;
@%p4 bra 	BB11_6;
bra.uni 	BB11_7;

BB11_6:
mov.u32 %r137, %r132;
bra.uni BB11_10;

BB11_4:
mov.u32 %r143, %r132;
bra.uni BB11_11;

BB11_7:
setp.eq.s32 %p5, %r67, 2;
mov.u32 %r134, %r132;
@%p5 bra BB11_9;

add.s32 	%r68, %r2, -1;
mul.wide.s32 	%rd8, %r68, 4;
add.s64 	%rd9, %rd1, %rd8;
ld.param.u32 	%r69, [%rd9+8];
rem.u32 	%r70, %r132, %r69;
ld.param.u32 	%r71, [%rd9+108];
mul.lo.s32 	%r145, %r71, %r70;
div.u32 	%r134, %r132, %r69;
add.s32 	%r133, %r2, -2;

BB11_9:
mul.wide.s32 %rd10, %r133, 4;
add.s64 %rd11, %rd1, %rd10;
ld.param.u32 %r72, [%rd11+8];
rem.u32 %r73, %r134, %r72;
ld.param.u32 %r74, [%rd11+108];
mad.lo.s32 %r145, %r74, %r73, %r145;
div.u32 %r137, %r134, %r72;
add.s32 %r133, %r133, -1;

BB11_10:
mul.wide.s32 %rd12, %r133, 4;
add.s64 %rd13, %rd1, %rd12;
ld.param.u32 %r75, [%rd13+8];
rem.u32 %r76, %r137, %r75;
ld.param.u32 %r77, [%rd13+108];
mad.lo.s32 %r145, %r77, %r76, %r145;
div.u32 %r143, %r137, %r75;
add.s32 %r133, %r133, -1;

BB11_11:
setp.lt.u32 %p6, %r2, 4;
@%p6 bra BB11_13;

BB11_12:
mul.wide.s32 %rd14, %r133, 4;
add.s64 %rd15, %rd1, %rd14;
ld.param.u32 %r78, [%rd15+8];
rem.u32 %r79, %r143, %r78;
ld.param.u32 %r80, [%rd15+108];
mad.lo.s32 %r81, %r80, %r79, %r145;
div.u32 %r82, %r143, %r78;
ld.param.u32 %r83, [%rd15+4];
rem.u32 %r84, %r82, %r83;
ld.param.u32 %r85, [%rd15+104];
mad.lo.s32 %r86, %r85, %r84, %r81;
div.u32 %r87, %r82, %r83;
ld.param.u32 %r88, [%rd15];
rem.u32 %r89, %r87, %r88;
ld.param.u32 %r90, [%rd15+100];
mad.lo.s32 %r91, %r90, %r89, %r86;
div.u32 %r92, %r87, %r88;
ld.param.u32 %r93, [%rd15+-4];
rem.u32 %r94, %r92, %r93;
ld.param.u32 %r95, [%rd15+96];
mad.lo.s32 %r145, %r95, %r94, %r91;
div.u32 %r143, %r92, %r93;
add.s32 %r133, %r133, -4;
setp.gt.s32 %p7, %r133, -1;
@%p7 bra BB11_12;

BB11_13:
add.s32 %r146, %r3, -1;
setp.lt.s32 %p8, %r146, 0;
@%p8 bra BB11_24;

and.b32  	%r101, %r3, 3;
mov.u32 	%r158, 0;
setp.eq.s32	%p9, %r101, 0;
@%p9 bra 	BB11_15;

setp.eq.s32	%p10, %r101, 1;
@%p10 bra 	BB11_17;
bra.uni 	BB11_18;

BB11_17:
mov.u32 %r150, %r132;
bra.uni BB11_21;

BB11_15:
mov.u32 %r156, %r132;
bra.uni BB11_22;

BB11_18:
setp.eq.s32 %p11, %r101, 2;
mov.u32 %r147, %r132;
@%p11 bra BB11_20;

add.s32 	%r102, %r3, -1;
mul.wide.s32 	%rd17, %r102, 4;
add.s64 	%rd18, %rd2, %rd17;
ld.param.u32 	%r103, [%rd18+8];
rem.u32 	%r104, %r132, %r103;
ld.param.u32 	%r105, [%rd18+108];
mul.lo.s32 	%r158, %r105, %r104;
div.u32 	%r147, %r132, %r103;
add.s32 	%r146, %r3, -2;

BB11_20:
mul.wide.s32 %rd19, %r146, 4;
add.s64 %rd20, %rd2, %rd19;
ld.param.u32 %r106, [%rd20+8];
rem.u32 %r107, %r147, %r106;
ld.param.u32 %r108, [%rd20+108];
mad.lo.s32 %r158, %r108, %r107, %r158;
div.u32 %r150, %r147, %r106;
add.s32 %r146, %r146, -1;

BB11_21:
mul.wide.s32 %rd21, %r146, 4;
add.s64 %rd22, %rd2, %rd21;
ld.param.u32 %r109, [%rd22+8];
rem.u32 %r110, %r150, %r109;
ld.param.u32 %r111, [%rd22+108];
mad.lo.s32 %r158, %r111, %r110, %r158;
div.u32 %r156, %r150, %r109;
add.s32 %r146, %r146, -1;

BB11_22:
setp.lt.u32 %p12, %r3, 4;
@%p12 bra BB11_24;

BB11_23:
mul.wide.s32 %rd23, %r146, 4;
add.s64 %rd24, %rd2, %rd23;
ld.param.u32 %r112, [%rd24+8];
rem.u32 %r113, %r156, %r112;
ld.param.u32 %r114, [%rd24+108];
mad.lo.s32 %r115, %r114, %r113, %r158;
div.u32 %r116, %r156, %r112;
ld.param.u32 %r117, [%rd24+4];
rem.u32 %r118, %r116, %r117;
ld.param.u32 %r119, [%rd24+104];
mad.lo.s32 %r120, %r119, %r118, %r115;
div.u32 %r121, %r116, %r117;
ld.param.u32 %r122, [%rd24];
rem.u32 %r123, %r121, %r122;
ld.param.u32 %r124, [%rd24+100];
mad.lo.s32 %r125, %r124, %r123, %r120;
div.u32 %r126, %r121, %r122;
ld.param.u32 %r127, [%rd24+-4];
rem.u32 %r128, %r126, %r127;
ld.param.u32 %r129, [%rd24+96];
mad.lo.s32 %r158, %r129, %r128, %r125;
div.u32 %r156, %r126, %r127;
add.s32 %r146, %r146, -4;
setp.gt.s32 %p13, %r146, -1;
@%p13 bra BB11_23;

BB11_24:
mul.wide.u32 %rd26, %r145, 4;
add.s64 %rd27, %rd25, %rd26;
mul.wide.u32 %rd29, %r158, 4;
add.s64 %rd30, %rd28, %rd29;
ld.global.f32 %f1, [%rd30];
st.global.f32 [%rd27], %f1;
mov.u32 %r131, %nctaid.x;
mad.lo.s32 %r132, %r131, %r59, %r132;
setp.lt.u32 %p14, %r132, %r58;
@%p14 bra BB11_2;

BB11_25:
ret;
}

PTX file compiled by NVRTC 11.0, which fails at run time with an "ErrorLaunchFailed" error:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-28845127
// Cuda compilation tools, release 11.0, V11.0.221
// Based on LLVM 3.4svn
//

.version 7.0
.target sm_60
.address_size 64

// .globl	fill__int32_M2

.visible .entry fill__int32_M2(
.param .align 8 .b8 fill__int32_M2_param_0[216],
.param .f32 fill__int32_M2_param_1,
.param .u64 fill__int32_M2_param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<2>;
.reg .b32 %r<11>;
.reg .b64 %rd<7>;

mov.b64	%rd2, fill__int32_M2_param_0;
ld.param.f32 	%f1, [fill__int32_M2_param_1];
mov.u32 	%r1, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r1, %r7, %r8;
ld.param.u32 	%r3, [fill__int32_M2_param_2];
setp.ge.u32	%p1, %r10, %r3;
@%p1 bra 	BB0_3;

mov.u64 	%rd3, %rd2;
ld.param.u64 	%rd4, [%rd3];
mov.u32 	%r9, %nctaid.x;
mul.lo.s32 	%r4, %r9, %r1;
cvta.to.global.u64 	%rd1, %rd4;

BB0_2:
mul.wide.u32 %rd5, %r10, 4;
add.s64 %rd6, %rd1, %rd5;
st.global.f32 [%rd6], %f1;
add.s32 %r10, %r4, %r10;
setp.lt.u32 %p2, %r10, %r3;
@%p2 bra BB0_2;

BB0_3:
ret;
}

// .weak	_Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T_

.weak .entry _Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T_(
.param .align 8 .b8 _Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_0[216],
.param .u32 _Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_1,
.param .align 4 .b8 _Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_2[4]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<3>;
.reg .f32 %f<3>;
.reg .b32 %r<11>;
.reg .b64 %rd<7>;

mov.b64	%rd2, _Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_0;
ld.param.u32 	%r6, [_Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_1];
ld.param.f32 	%f2, [_Z15pointwiseApply1I24ConcreteOpfill__int32_M2jLin2EEv10TensorInfoIT0_ES2_T__param_2];
mov.u32 	%r1, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r1, %r7, %r8;
setp.ge.u32	%p1, %r10, %r6;
@%p1 bra 	BB1_3;

mov.u64 	%rd3, %rd2;
ld.param.u64 	%rd4, [%rd3];
mov.u32 	%r9, %nctaid.x;
mul.lo.s32 	%r3, %r9, %r1;
cvta.to.global.u64 	%rd1, %rd4;

BB1_2:
mul.wide.u32 %rd5, %r10, 4;
add.s64 %rd6, %rd1, %rd5;
st.global.f32 [%rd6], %f2;
add.s32 %r10, %r3, %r10;
setp.lt.u32 %p2, %r10, %r6;
@%p2 bra BB1_2;

BB1_3:
ret;
}

// .globl	fill__int32_M1

.visible .entry fill__int32_M1(
.param .align 8 .b8 fill__int32_M1_param_0[216],
.param .f32 fill__int32_M1_param_1,
.param .u64 fill__int32_M1_param_2
)
{
.reg .pred %p<3>;
.reg .b32 %r<11>;

ld.param.u32 	%r1, [fill__int32_M1_param_2];
mov.u32 	%r2, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r2, %r7, %r8;
setp.lt.u32	%p1, %r10, %r1;
@%p1 bra 	BB2_2;
bra.uni 	BB2_1;

BB2_2:
mov.u32 %r9, %nctaid.x;
mul.lo.s32 %r4, %r9, %r2;

BB2_3:
add.s32 %r10, %r4, %r10;
setp.lt.u32 %p2, %r10, %r1;
@%p2 bra BB2_3;

trap;

BB2_1:
ret;
}

// .weak	_Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T_

.weak .entry _Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T_(
.param .align 8 .b8 _Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_0[216],
.param .u32 _Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_1,
.param .align 4 .b8 _Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_2[4]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<9>;
.reg .f32 %f<3>;
.reg .b32 %r<83>;
.reg .b64 %rd<16>;

mov.b64	%rd4, _Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_0;
ld.param.u32 	%r33, [_Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_1];
ld.param.f32 	%f2, [_Z15pointwiseApply1I24ConcreteOpfill__int32_M1jLin1EEv10TensorInfoIT0_ES2_T__param_2];
mov.u64 	%rd1, %rd4;
mov.u32 	%r1, %ntid.x;
mov.u32 	%r34, %ctaid.x;
mov.u32 	%r35, %tid.x;
mad.lo.s32 	%r69, %r1, %r34, %r35;
setp.ge.u32	%p1, %r69, %r33;
@%p1 bra 	BB3_14;

ld.param.u32 	%r3, [%rd1+208];
add.s32 	%r4, %r3, -1;
ld.param.u64 	%rd5, [%rd1];
cvta.to.global.u64 	%rd2, %rd5;
mov.u32 	%r36, %nctaid.x;
mul.lo.s32 	%r5, %r36, %r1;
mul.wide.s32 	%rd6, %r4, 4;
add.s64 	%rd7, %rd1, %rd6;
add.s64 	%rd3, %rd7, 8;

BB3_2:
mov.u32 %r82, 0;
setp.lt.s32 %p2, %r4, 0;
@%p2 bra BB3_13;

and.b32  	%r41, %r3, 3;
mov.u32 	%r82, 0;
setp.eq.s32	%p3, %r41, 0;
@%p3 bra 	BB3_4;

setp.eq.s32	%p4, %r41, 1;
@%p4 bra 	BB3_6;
bra.uni 	BB3_7;

BB3_6:
mov.u32 %r73, %r4;
mov.u32 %r74, %r69;
bra.uni BB3_10;

BB3_4:
mov.u32 %r79, %r4;
mov.u32 %r80, %r69;
bra.uni BB3_11;

BB3_7:
setp.eq.s32 %p5, %r41, 2;
mov.u32 %r70, %r4;
mov.u32 %r71, %r69;
@%p5 bra BB3_9;

ld.param.u32 	%r42, [%rd3];
rem.u32 	%r43, %r69, %r42;
ld.param.u32 	%r44, [%rd3+100];
mul.lo.s32 	%r82, %r44, %r43;
div.u32 	%r71, %r69, %r42;
add.s32 	%r70, %r3, -2;

BB3_9:
mul.wide.s32 %rd8, %r70, 4;
add.s64 %rd9, %rd1, %rd8;
ld.param.u32 %r45, [%rd9+8];
rem.u32 %r46, %r71, %r45;
ld.param.u32 %r47, [%rd9+108];
mad.lo.s32 %r82, %r47, %r46, %r82;
div.u32 %r74, %r71, %r45;
add.s32 %r73, %r70, -1;

BB3_10:
mul.wide.s32 %rd10, %r73, 4;
add.s64 %rd11, %rd1, %rd10;
ld.param.u32 %r48, [%rd11+8];
rem.u32 %r49, %r74, %r48;
ld.param.u32 %r50, [%rd11+108];
mad.lo.s32 %r82, %r50, %r49, %r82;
div.u32 %r80, %r74, %r48;
add.s32 %r79, %r73, -1;

BB3_11:
setp.lt.u32 %p6, %r3, 4;
@%p6 bra BB3_13;

BB3_12:
mul.wide.s32 %rd12, %r79, 4;
add.s64 %rd13, %rd1, %rd12;
ld.param.u32 %r51, [%rd13+8];
rem.u32 %r52, %r80, %r51;
ld.param.u32 %r53, [%rd13+108];
mad.lo.s32 %r54, %r53, %r52, %r82;
div.u32 %r55, %r80, %r51;
ld.param.u32 %r56, [%rd13+4];
rem.u32 %r57, %r55, %r56;
ld.param.u32 %r58, [%rd13+104];
mad.lo.s32 %r59, %r58, %r57, %r54;
div.u32 %r60, %r55, %r56;
ld.param.u32 %r61, [%rd13];
rem.u32 %r62, %r60, %r61;
ld.param.u32 %r63, [%rd13+100];
mad.lo.s32 %r64, %r63, %r62, %r59;
div.u32 %r65, %r60, %r61;
ld.param.u32 	%r66, [%rd13+-4];
rem.u32 %r67, %r65, %r66;
ld.param.u32 %r68, [%rd13+96];
mad.lo.s32 %r82, %r68, %r67, %r64;
div.u32 %r80, %r65, %r66;
add.s32 %r79, %r79, -4;
setp.gt.s32 %p7, %r79, -1;
@%p7 bra BB3_12;

BB3_13:
mul.wide.u32 %rd14, %r82, 4;
add.s64 %rd15, %rd2, %rd14;
st.global.f32 [%rd15], %f2;
add.s32 %r69, %r5, %r69;
setp.lt.u32 %p8, %r69, %r33;
@%p8 bra BB3_2;

BB3_14:
ret;
}
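// Note: unlike the visible wrapper kernels, this weak pointwiseApply1
// instantiation is intact: it walks the sizes/strides arrays (the
// unrolled rem/div/mad chain above) and stores %f2 at the computed
// offset.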

// .globl	copy__int32_M2_M2

.visible .entry copy__int32_M2_M2(
.param .align 8 .b8 copy__int32_M2_M2_param_0[216],
.param .align 8 .b8 copy__int32_M2_M2_param_1[216],
.param .u64 copy__int32_M2_M2_param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<2>;
.reg .b32 %r<11>;
.reg .b64 %rd<12>;

mov.b64	%rd3, copy__int32_M2_M2_param_0;
mov.b64	%rd4, copy__int32_M2_M2_param_1;
mov.u32 	%r1, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r1, %r7, %r8;
ld.param.u32 	%r3, [copy__int32_M2_M2_param_2];
setp.ge.u32	%p1, %r10, %r3;
@%p1 bra 	BB4_3;

mov.u64 	%rd5, %rd3;
ld.param.u64 	%rd6, [%rd5];
mov.u64 	%rd7, %rd4;
ld.param.u64 	%rd8, [%rd7];
mov.u32 	%r9, %nctaid.x;
mul.lo.s32 	%r4, %r9, %r1;
cvta.to.global.u64 	%rd1, %rd8;
cvta.to.global.u64 	%rd2, %rd6;

BB4_2:
mul.wide.u32 %rd9, %r10, 4;
add.s64 %rd10, %rd2, %rd9;
add.s64 %rd11, %rd1, %rd9;
ld.global.f32 %f1, [%rd11];
st.global.f32 [%rd10], %f1;
add.s32 %r10, %r4, %r10;
setp.lt.u32 %p2, %r10, %r3;
@%p2 bra BB4_2;

BB4_3:
ret;
}

// .weak	_Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T_

.weak .entry _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T_(
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_0[216],
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_1[216],
.param .u32 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_2,
.param .align 1 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_3[1]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<3>;
.reg .f32 %f<2>;
.reg .b32 %r<11>;
.reg .b64 %rd<12>;

mov.b64	%rd3, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_0;
mov.b64	%rd4, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_1;
ld.param.u32 	%r6, [_Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M2jLin2ELin2EEv10TensorInfoIT0_ES3_S2_T__param_2];
mov.u32 	%r1, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r1, %r7, %r8;
setp.ge.u32	%p1, %r10, %r6;
@%p1 bra 	BB5_3;

mov.u64 	%rd5, %rd3;
ld.param.u64 	%rd6, [%rd5];
mov.u64 	%rd7, %rd4;
ld.param.u64 	%rd8, [%rd7];
mov.u32 	%r9, %nctaid.x;
mul.lo.s32 	%r3, %r9, %r1;
cvta.to.global.u64 	%rd1, %rd8;
cvta.to.global.u64 	%rd2, %rd6;

BB5_2:
mul.wide.u32 %rd9, %r10, 4;
add.s64 %rd10, %rd2, %rd9;
add.s64 %rd11, %rd1, %rd9;
ld.global.f32 %f1, [%rd11];
st.global.f32 [%rd10], %f1;
add.s32 %r10, %r3, %r10;
setp.lt.u32 %p2, %r10, %r6;
@%p2 bra BB5_2;

BB5_3:
ret;
}

// .globl	copy__int32_M1_M2

.visible .entry copy__int32_M1_M2(
.param .align 8 .b8 copy__int32_M1_M2_param_0[216],
.param .align 8 .b8 copy__int32_M1_M2_param_1[216],
.param .u64 copy__int32_M1_M2_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;

ld.param.u32 	%r1, [copy__int32_M1_M2_param_2];
mov.u32 	%r2, %ntid.x;
mov.u32 	%r3, %ctaid.x;
mov.u32 	%r4, %tid.x;
mad.lo.s32 	%r5, %r2, %r3, %r4;
setp.lt.u32	%p1, %r5, %r1;
@%p1 bra 	BB6_2;

ret;

BB6_2:
trap;
}
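// Note: copy__int32_M1_M2 is reduced to a bounds check; in-range threads
// branch straight to `trap;` with no load or store of tensor data.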

// .weak	_Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T_

.weak .entry _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T_(
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_0[216],
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_1[216],
.param .u32 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_2,
.param .align 1 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_3[1]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<9>;
.reg .f32 %f<2>;
.reg .b32 %r<83>;
.reg .b64 %rd<22>;

mov.b64	%rd6, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_0;
mov.b64	%rd5, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_1;
ld.param.u32 	%r32, [_Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M2jLin1ELin2EEv10TensorInfoIT0_ES3_S2_T__param_2];
mov.u64 	%rd1, %rd6;
mov.u32 	%r33, %ntid.x;
mov.u32 	%r34, %ctaid.x;
mov.u32 	%r35, %tid.x;
mad.lo.s32 	%r69, %r33, %r34, %r35;
setp.ge.u32	%p1, %r69, %r32;
@%p1 bra 	BB7_14;

mov.u64 	%rd7, %rd5;
ld.param.u64 	%rd8, [%rd7];
ld.param.u32 	%r2, [%rd1+208];
add.s32 	%r3, %r2, -1;
ld.param.u64 	%rd9, [%rd1];
cvta.to.global.u64 	%rd2, %rd9;
and.b32  	%r4, %r2, 3;
mul.wide.s32 	%rd10, %r3, 4;
add.s64 	%rd11, %rd1, %rd10;
add.s64 	%rd3, %rd11, 8;
cvta.to.global.u64 	%rd4, %rd8;

BB7_2:
mov.u32 %r82, 0;
setp.lt.s32 %p2, %r3, 0;
@%p2 bra BB7_13;

mov.u32 	%r82, 0;
setp.eq.s32	%p3, %r4, 0;
@%p3 bra 	BB7_4;

setp.eq.s32	%p4, %r4, 1;
@%p4 bra 	BB7_6;
bra.uni 	BB7_7;

BB7_6:
mov.u32 %r73, %r3;
mov.u32 %r74, %r69;
bra.uni BB7_10;

BB7_4:
mov.u32 %r79, %r3;
mov.u32 %r80, %r69;
bra.uni BB7_11;

BB7_7:
setp.eq.s32 %p5, %r4, 2;
mov.u32 %r70, %r3;
mov.u32 %r71, %r69;
@%p5 bra BB7_9;

ld.param.u32 	%r40, [%rd3];
rem.u32 	%r41, %r69, %r40;
ld.param.u32 	%r42, [%rd3+100];
mul.lo.s32 	%r82, %r42, %r41;
div.u32 	%r71, %r69, %r40;
add.s32 	%r70, %r2, -2;

BB7_9:
mul.wide.s32 %rd12, %r70, 4;
add.s64 %rd13, %rd1, %rd12;
ld.param.u32 %r43, [%rd13+8];
rem.u32 %r44, %r71, %r43;
ld.param.u32 %r45, [%rd13+108];
mad.lo.s32 %r82, %r45, %r44, %r82;
div.u32 %r74, %r71, %r43;
add.s32 %r73, %r70, -1;

BB7_10:
mul.wide.s32 %rd14, %r73, 4;
add.s64 %rd15, %rd1, %rd14;
ld.param.u32 %r46, [%rd15+8];
rem.u32 %r47, %r74, %r46;
ld.param.u32 %r48, [%rd15+108];
mad.lo.s32 %r82, %r48, %r47, %r82;
div.u32 %r80, %r74, %r46;
add.s32 %r79, %r73, -1;

BB7_11:
setp.lt.u32 %p6, %r2, 4;
@%p6 bra BB7_13;

BB7_12:
mul.wide.s32 %rd16, %r79, 4;
add.s64 %rd17, %rd1, %rd16;
ld.param.u32 %r49, [%rd17+8];
rem.u32 %r50, %r80, %r49;
ld.param.u32 %r51, [%rd17+108];
mad.lo.s32 %r52, %r51, %r50, %r82;
div.u32 %r53, %r80, %r49;
ld.param.u32 %r54, [%rd17+4];
rem.u32 %r55, %r53, %r54;
ld.param.u32 %r56, [%rd17+104];
mad.lo.s32 %r57, %r56, %r55, %r52;
div.u32 %r58, %r53, %r54;
ld.param.u32 %r59, [%rd17];
rem.u32 %r60, %r58, %r59;
ld.param.u32 %r61, [%rd17+100];
mad.lo.s32 %r62, %r61, %r60, %r57;
div.u32 %r63, %r58, %r59;
ld.param.u32 %r64, [%rd17+-4];
rem.u32 %r65, %r63, %r64;
ld.param.u32 %r66, [%rd17+96];
mad.lo.s32 %r82, %r66, %r65, %r62;
div.u32 %r80, %r63, %r64;
add.s32 %r79, %r79, -4;
setp.gt.s32 %p7, %r79, -1;
@%p7 bra BB7_12;

BB7_13:
mul.wide.u32 %rd18, %r82, 4;
add.s64 %rd19, %rd2, %rd18;
mul.wide.u32 %rd20, %r69, 4;
add.s64 %rd21, %rd4, %rd20;
ld.global.f32 %f1, [%rd21];
st.global.f32 [%rd19], %f1;
mov.u32 %r68, %nctaid.x;
mad.lo.s32 %r69, %r68, %r33, %r69;
setp.lt.u32 %p8, %r69, %r32;
@%p8 bra BB7_2;

BB7_14:
ret;
}

// .globl	copy__int32_M2_M1

.visible .entry copy__int32_M2_M1(
.param .align 8 .b8 copy__int32_M2_M1_param_0[216],
.param .align 8 .b8 copy__int32_M2_M1_param_1[216],
.param .u64 copy__int32_M2_M1_param_2
)
{
.reg .pred %p<3>;
.reg .b32 %r<11>;

ld.param.u32 	%r1, [copy__int32_M2_M1_param_2];
mov.u32 	%r2, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r2, %r7, %r8;
setp.ge.u32	%p1, %r10, %r1;
@%p1 bra 	BB8_3;

mov.u32 	%r9, %nctaid.x;
mul.lo.s32 	%r4, %r9, %r2;

BB8_2:
add.s32 %r10, %r4, %r10;
setp.lt.u32 %p2, %r10, %r1;
@%p2 bra BB8_2;

BB8_3:
ret;
}
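// Note: the BB8_2 loop above performs no memory accesses; the copy body
// has been compiled away to an empty index loop. copy__int32_M1_M1
// further down shows the same pattern.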

// .weak	_Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T_

.weak .entry _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T_(
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_0[216],
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_1[216],
.param .u32 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_2,
.param .align 1 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_3[1]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<9>;
.reg .f32 %f<2>;
.reg .b32 %r<83>;
.reg .b64 %rd<22>;

mov.b64	%rd5, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_0;
mov.b64	%rd6, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_1;
ld.param.u32 	%r32, [_Z15pointwiseApply2I27ConcreteOpcopy__int32_M2_M1jLin2ELin1EEv10TensorInfoIT0_ES3_S2_T__param_2];
mov.u64 	%rd1, %rd6;
mov.u32 	%r33, %ntid.x;
mov.u32 	%r34, %ctaid.x;
mov.u32 	%r35, %tid.x;
mad.lo.s32 	%r69, %r33, %r34, %r35;
setp.ge.u32	%p1, %r69, %r32;
@%p1 bra 	BB9_14;

mov.u64 	%rd7, %rd5;
ld.param.u64 	%rd8, [%rd7];
ld.param.u32 	%r2, [%rd1+208];
add.s32 	%r3, %r2, -1;
ld.param.u64 	%rd9, [%rd1];
cvta.to.global.u64 	%rd2, %rd9;
and.b32  	%r4, %r2, 3;
mul.wide.s32 	%rd10, %r3, 4;
add.s64 	%rd11, %rd1, %rd10;
add.s64 	%rd3, %rd11, 8;
cvta.to.global.u64 	%rd4, %rd8;

BB9_2:
mov.u32 %r82, 0;
setp.lt.s32 %p2, %r3, 0;
@%p2 bra BB9_13;

mov.u32 	%r82, 0;
setp.eq.s32	%p3, %r4, 0;
@%p3 bra 	BB9_4;

setp.eq.s32	%p4, %r4, 1;
@%p4 bra 	BB9_6;
bra.uni 	BB9_7;

BB9_6:
mov.u32 %r73, %r3;
mov.u32 %r74, %r69;
bra.uni BB9_10;

BB9_4:
mov.u32 %r79, %r3;
mov.u32 %r80, %r69;
bra.uni BB9_11;

BB9_7:
setp.eq.s32 %p5, %r4, 2;
mov.u32 %r70, %r3;
mov.u32 %r71, %r69;
@%p5 bra BB9_9;

ld.param.u32 	%r40, [%rd3];
rem.u32 	%r41, %r69, %r40;
ld.param.u32 	%r42, [%rd3+100];
mul.lo.s32 	%r82, %r42, %r41;
div.u32 	%r71, %r69, %r40;
add.s32 	%r70, %r2, -2;

BB9_9:
mul.wide.s32 %rd12, %r70, 4;
add.s64 %rd13, %rd1, %rd12;
ld.param.u32 %r43, [%rd13+8];
rem.u32 %r44, %r71, %r43;
ld.param.u32 %r45, [%rd13+108];
mad.lo.s32 %r82, %r45, %r44, %r82;
div.u32 %r74, %r71, %r43;
add.s32 %r73, %r70, -1;

BB9_10:
mul.wide.s32 %rd14, %r73, 4;
add.s64 %rd15, %rd1, %rd14;
ld.param.u32 %r46, [%rd15+8];
rem.u32 %r47, %r74, %r46;
ld.param.u32 %r48, [%rd15+108];
mad.lo.s32 %r82, %r48, %r47, %r82;
div.u32 %r80, %r74, %r46;
add.s32 %r79, %r73, -1;

BB9_11:
setp.lt.u32 %p6, %r2, 4;
@%p6 bra BB9_13;

BB9_12:
mul.wide.s32 %rd16, %r79, 4;
add.s64 %rd17, %rd1, %rd16;
ld.param.u32 %r49, [%rd17+8];
rem.u32 %r50, %r80, %r49;
ld.param.u32 %r51, [%rd17+108];
mad.lo.s32 %r52, %r51, %r50, %r82;
div.u32 %r53, %r80, %r49;
ld.param.u32 %r54, [%rd17+4];
rem.u32 %r55, %r53, %r54;
ld.param.u32 %r56, [%rd17+104];
mad.lo.s32 %r57, %r56, %r55, %r52;
div.u32 %r58, %r53, %r54;
ld.param.u32 %r59, [%rd17];
rem.u32 %r60, %r58, %r59;
ld.param.u32 %r61, [%rd17+100];
mad.lo.s32 %r62, %r61, %r60, %r57;
div.u32 %r63, %r58, %r59;
ld.param.u32 %r64, [%rd17+-4];
rem.u32 %r65, %r63, %r64;
ld.param.u32 %r66, [%rd17+96];
mad.lo.s32 %r82, %r66, %r65, %r62;
div.u32 %r80, %r63, %r64;
add.s32 %r79, %r79, -4;
setp.gt.s32 %p7, %r79, -1;
@%p7 bra BB9_12;

BB9_13:
mul.wide.u32 %rd18, %r69, 4;
add.s64 %rd19, %rd4, %rd18;
mul.wide.u32 %rd20, %r82, 4;
add.s64 %rd21, %rd2, %rd20;
ld.global.f32 %f1, [%rd21];
st.global.f32 [%rd19], %f1;
mov.u32 %r68, %nctaid.x;
mad.lo.s32 %r69, %r68, %r33, %r69;
setp.lt.u32 %p8, %r69, %r32;
@%p8 bra BB9_2;

BB9_14:
ret;
}

// .globl	copy__int32_M1_M1

.visible .entry copy__int32_M1_M1(
.param .align 8 .b8 copy__int32_M1_M1_param_0[216],
.param .align 8 .b8 copy__int32_M1_M1_param_1[216],
.param .u64 copy__int32_M1_M1_param_2
)
{
.reg .pred %p<3>;
.reg .b32 %r<11>;

ld.param.u32 	%r1, [copy__int32_M1_M1_param_2];
mov.u32 	%r2, %ntid.x;
mov.u32 	%r7, %ctaid.x;
mov.u32 	%r8, %tid.x;
mad.lo.s32 	%r10, %r2, %r7, %r8;
setp.ge.u32	%p1, %r10, %r1;
@%p1 bra 	BB10_3;

mov.u32 	%r9, %nctaid.x;
mul.lo.s32 	%r4, %r9, %r2;

BB10_2:
add.s32 %r10, %r4, %r10;
setp.lt.u32 %p2, %r10, %r1;
@%p2 bra BB10_2;

BB10_3:
ret;
}

// .weak	_Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T_

.weak .entry _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T_(
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_0[216],
.param .align 8 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_1[216],
.param .u32 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_2,
.param .align 1 .b8 _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_3[1]
)
.maxntid 512, 1, 1
.minnctapersm 4
{
.reg .pred %p<15>;
.reg .f32 %f<2>;
.reg .b32 %r<159>;
.reg .b64 %rd<31>;

mov.b64	%rd5, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_0;
mov.b64	%rd6, _Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_1;
ld.param.u32 	%r58, [_Z15pointwiseApply2I27ConcreteOpcopy__int32_M1_M1jLin1ELin1EEv10TensorInfoIT0_ES3_S2_T__param_2];
mov.u64 	%rd1, %rd5;
mov.u64 	%rd2, %rd6;
mov.u32 	%r59, %ntid.x;
mov.u32 	%r60, %ctaid.x;
mov.u32 	%r61, %tid.x;
mad.lo.s32 	%r132, %r59, %r60, %r61;
setp.ge.u32	%p1, %r132, %r58;
@%p1 bra 	BB11_25;

ld.param.u32 	%r2, [%rd1+208];
ld.param.u32 	%r3, [%rd2+208];
ld.param.u64 	%rd3, [%rd1];
ld.param.u64 	%rd4, [%rd2];
cvta.to.global.u64 	%rd25, %rd3;
cvta.to.global.u64 	%rd28, %rd4;

BB11_2:
add.s32 %r133, %r2, -1;
mov.u32 %r158, 0;
setp.lt.s32 %p2, %r133, 0;
mov.u32 %r145, %r158;
@%p2 bra BB11_13;

and.b32  	%r67, %r2, 3;
mov.u32 	%r145, 0;
setp.eq.s32	%p3, %r67, 0;
@%p3 bra 	BB11_4;

setp.eq.s32	%p4, %r67, 1;
@%p4 bra 	BB11_6;
bra.uni 	BB11_7;

BB11_6:
mov.u32 %r137, %r132;
bra.uni BB11_10;

BB11_4:
mov.u32 %r143, %r132;
bra.uni BB11_11;

BB11_7:
setp.eq.s32 %p5, %r67, 2;
mov.u32 %r134, %r132;
@%p5 bra BB11_9;

add.s32 	%r68, %r2, -1;
mul.wide.s32 	%rd8, %r68, 4;
add.s64 	%rd9, %rd1, %rd8;
ld.param.u32 	%r69, [%rd9+8];
rem.u32 	%r70, %r132, %r69;
ld.param.u32 	%r71, [%rd9+108];
mul.lo.s32 	%r145, %r71, %r70;
div.u32 	%r134, %r132, %r69;
add.s32 	%r133, %r2, -2;

BB11_9:
mul.wide.s32 %rd10, %r133, 4;
add.s64 %rd11, %rd1, %rd10;
ld.param.u32 %r72, [%rd11+8];
rem.u32 %r73, %r134, %r72;
ld.param.u32 %r74, [%rd11+108];
mad.lo.s32 %r145, %r74, %r73, %r145;
div.u32 %r137, %r134, %r72;
add.s32 %r133, %r133, -1;

BB11_10:
mul.wide.s32 %rd12, %r133, 4;
add.s64 %rd13, %rd1, %rd12;
ld.param.u32 %r75, [%rd13+8];
rem.u32 %r76, %r137, %r75;
ld.param.u32 %r77, [%rd13+108];
mad.lo.s32 %r145, %r77, %r76, %r145;
div.u32 %r143, %r137, %r75;
add.s32 %r133, %r133, -1;

BB11_11:
setp.lt.u32 %p6, %r2, 4;
@%p6 bra BB11_13;

BB11_12:
mul.wide.s32 %rd14, %r133, 4;
add.s64 %rd15, %rd1, %rd14;
ld.param.u32 %r78, [%rd15+8];
rem.u32 %r79, %r143, %r78;
ld.param.u32 %r80, [%rd15+108];
mad.lo.s32 %r81, %r80, %r79, %r145;
div.u32 %r82, %r143, %r78;
ld.param.u32 %r83, [%rd15+4];
rem.u32 %r84, %r82, %r83;
ld.param.u32 %r85, [%rd15+104];
mad.lo.s32 %r86, %r85, %r84, %r81;
div.u32 %r87, %r82, %r83;
ld.param.u32 %r88, [%rd15];
rem.u32 %r89, %r87, %r88;
ld.param.u32 %r90, [%rd15+100];
mad.lo.s32 %r91, %r90, %r89, %r86;
div.u32 %r92, %r87, %r88;
ld.param.u32 %r93, [%rd15+-4];
rem.u32 %r94, %r92, %r93;
ld.param.u32 %r95, [%rd15+96];
mad.lo.s32 %r145, %r95, %r94, %r91;
div.u32 %r143, %r92, %r93;
add.s32 %r133, %r133, -4;
setp.gt.s32 %p7, %r133, -1;
@%p7 bra BB11_12;

BB11_13:
add.s32 %r146, %r3, -1;
setp.lt.s32 %p8, %r146, 0;
@%p8 bra BB11_24;

and.b32  	%r101, %r3, 3;
mov.u32 	%r158, 0;
setp.eq.s32	%p9, %r101, 0;
@%p9 bra 	BB11_15;

setp.eq.s32	%p10, %r101, 1;
@%p10 bra 	BB11_17;
bra.uni 	BB11_18;

BB11_17:
mov.u32 %r150, %r132;
bra.uni BB11_21;

BB11_15:
mov.u32 %r156, %r132;
bra.uni BB11_22;

BB11_18:
setp.eq.s32 %p11, %r101, 2;
mov.u32 %r147, %r132;
@%p11 bra BB11_20;

add.s32 	%r102, %r3, -1;
mul.wide.s32 	%rd17, %r102, 4;
add.s64 	%rd18, %rd2, %rd17;
ld.param.u32 	%r103, [%rd18+8];
rem.u32 	%r104, %r132, %r103;
ld.param.u32 	%r105, [%rd18+108];
mul.lo.s32 	%r158, %r105, %r104;
div.u32 	%r147, %r132, %r103;
add.s32 	%r146, %r3, -2;

BB11_20:
mul.wide.s32 %rd19, %r146, 4;
add.s64 %rd20, %rd2, %rd19;
ld.param.u32 %r106, [%rd20+8];
rem.u32 %r107, %r147, %r106;
ld.param.u32 %r108, [%rd20+108];
mad.lo.s32 %r158, %r108, %r107, %r158;
div.u32 %r150, %r147, %r106;
add.s32 %r146, %r146, -1;

BB11_21:
mul.wide.s32 %rd21, %r146, 4;
add.s64 %rd22, %rd2, %rd21;
ld.param.u32 %r109, [%rd22+8];
rem.u32 %r110, %r150, %r109;
ld.param.u32 %r111, [%rd22+108];
mad.lo.s32 %r158, %r111, %r110, %r158;
div.u32 %r156, %r150, %r109;
add.s32 %r146, %r146, -1;

BB11_22:
setp.lt.u32 %p12, %r3, 4;
@%p12 bra BB11_24;

BB11_23:
mul.wide.s32 %rd23, %r146, 4;
add.s64 %rd24, %rd2, %rd23;
ld.param.u32 %r112, [%rd24+8];
rem.u32 %r113, %r156, %r112;
ld.param.u32 %r114, [%rd24+108];
mad.lo.s32 %r115, %r114, %r113, %r158;
div.u32 %r116, %r156, %r112;
ld.param.u32 %r117, [%rd24+4];
rem.u32 %r118, %r116, %r117;
ld.param.u32 %r119, [%rd24+104];
mad.lo.s32 %r120, %r119, %r118, %r115;
div.u32 %r121, %r116, %r117;
ld.param.u32 %r122, [%rd24];
rem.u32 %r123, %r121, %r122;
ld.param.u32 %r124, [%rd24+100];
mad.lo.s32 %r125, %r124, %r123, %r120;
div.u32 %r126, %r121, %r122;
ld.param.u32 %r127, [%rd24+-4];
rem.u32 %r128, %r126, %r127;
ld.param.u32 %r129, [%rd24+96];
mad.lo.s32 %r158, %r129, %r128, %r125;
div.u32 %r156, %r126, %r127;
add.s32 %r146, %r146, -4;
setp.gt.s32 %p13, %r146, -1;
@%p13 bra BB11_23;

BB11_24:
mul.wide.u32 %rd26, %r145, 4;
add.s64 %rd27, %rd25, %rd26;
mul.wide.u32 %rd29, %r158, 4;
add.s64 %rd30, %rd28, %rd29;
ld.global.f32 %f1, [%rd30];
st.global.f32 [%rd27], %f1;
mov.u32 %r131, %nctaid.x;
mad.lo.s32 %r132, %r131, %r59, %r132;
setp.lt.u32 %p14, %r132, %r58;
@%p14 bra BB11_2;

BB11_25:
ret;
}