Hello, I’m using cutensor library to perform some real-complex mixed contraction. Both my input and output are strided, so I use the stride
parameter in tensor descriptor to do this. However, for C=A*B+C
, when I set A
to CUDA_C_64F
, B
to CUDA_R_64F
and C
to CUDA_C_64F
, which is said to be supported in documentation, the stride
parameter for C
does not make any effect. The calculation result (though is correct) is as if the stride
for C
is one, no matter what value I set. On the other hand, if all the input and output are consistently CUDA_C_64F
or CUDA_R_64F
, the output is strided as expected.
Below is a short test code, which is also attached. Any feedback is appreciated.
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdio.h>
#include “cuComplex.h”
#include “cutensor.h”
int main()
{
cutensorHandle_t handle;
cutensorInit(&handle);
int mode_A[2] = {'i','j'};
int mode_B[1] = {'j'};
int mode_C[1] = {'i'};
int64_t extent_A[2] = {2, 2};
int64_t extent_B[1] = {2};
int64_t extent_C[1] = {2};
int64_t stride_C[1] = {2};
///////////////////////////
// real-real contraction //
///////////////////////////
double Ar[4] = {1,2,3,4};
double Br[2] = {1,2};
double Cr[4];
double* Ar_dev;
cudaMalloc(&Ar_dev, 4*sizeof(double));
cudaMemcpy(Ar_dev, Ar, 4*sizeof(double), cudaMemcpyHostToDevice);
double* Br_dev;
cudaMalloc(&Br_dev, 2*sizeof(double));
cudaMemcpy(Br_dev, Br, 2*sizeof(double), cudaMemcpyHostToDevice);
double* Cr_dev;
cudaMalloc(&Cr_dev, 4*sizeof(double));
cudaMemset(Cr_dev, 0, 4*sizeof(double));
cutensorTensorDescriptor_t desc_Ar;
cutensorTensorDescriptor_t desc_Br;
cutensorTensorDescriptor_t desc_Cr;
cutensorInitTensorDescriptor(&handle, &desc_Ar, 2, extent_A, NULL, CUDA_R_64F, CUTENSOR_OP_IDENTITY);
cutensorInitTensorDescriptor(&handle, &desc_Br, 1, extent_B, NULL, CUDA_R_64F, CUTENSOR_OP_IDENTITY);
cutensorInitTensorDescriptor(&handle, &desc_Cr, 1, extent_C, stride_C, CUDA_R_64F, CUTENSOR_OP_IDENTITY);
uint32_t alignmentRequirement_Ar;
uint32_t alignmentRequirement_Br;
uint32_t alignmentRequirement_Cr;
cutensorGetAlignmentRequirement(&handle, Ar_dev, &desc_Ar, &alignmentRequirement_Ar);
cutensorGetAlignmentRequirement(&handle, Br_dev, &desc_Br, &alignmentRequirement_Br);
cutensorGetAlignmentRequirement(&handle, Cr_dev, &desc_Cr, &alignmentRequirement_Cr);
cutensorContractionDescriptor_t desc_rr;
cutensorInitContractionDescriptor(&handle, &desc_rr,
&desc_Ar, mode_A, alignmentRequirement_Ar,
&desc_Br, mode_B, alignmentRequirement_Br,
&desc_Cr, mode_C, alignmentRequirement_Cr,
&desc_Cr, mode_C, alignmentRequirement_Cr,
CUTENSOR_COMPUTE_64F);
cutensorContractionFind_t find_rr;
cutensorInitContractionFind(&handle, &find_rr, CUTENSOR_ALGO_DEFAULT);
size_t worksize_rr;
cutensorContractionGetWorkspace(&handle, &desc_rr, &find_rr, CUTENSOR_WORKSPACE_RECOMMENDED, &worksize_rr);
void* work_rr = nullptr;
if (worksize_rr > 0) {
cudaMalloc(&work_rr, worksize_rr);
}
cutensorContractionPlan_t plan_rr;
cutensorInitContractionPlan(&handle, &plan_rr, &desc_rr, &find_rr, worksize_rr);
double alpha_r = 1.0;
double beta_r = 0.0;
cutensorContraction(&handle, &plan_rr, &alpha_r, Ar_dev, Br_dev, &beta_r, Cr_dev, Cr_dev, work_rr, worksize_rr, 0);
cudaMemcpy(Cr, Cr_dev, 4*sizeof(double), cudaMemcpyDeviceToHost);
printf("C=%f, %f, %f, %f\n\n", Cr[0], Cr[1], Cr[2], Cr[3]);
/////////////////////////////////
// complex-complex contraction //
/////////////////////////////////
cuDoubleComplex Ac[4];
for (int i = 0; i < 4; i++) {
Ac[i].x = i+1.0; Ac[i].y = 0;
}
cuDoubleComplex Bc[2];
for (int i = 0; i < 2; i++) {
Bc[i].x = i+1.0; Bc[i].y = 0;
}
cuDoubleComplex Cc[4];
cuDoubleComplex* Ac_dev;
cudaMalloc(&Ac_dev, 4*sizeof(cuDoubleComplex));
cudaMemcpy(Ac_dev, Ac, 4*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
cuDoubleComplex* Bc_dev;
cudaMalloc(&Bc_dev, 2*sizeof(cuDoubleComplex));
cudaMemcpy(Bc_dev, Bc, 2*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
cuDoubleComplex* Cc_dev;
cudaMalloc(&Cc_dev, 4*sizeof(cuDoubleComplex));
cudaMemset(Cc_dev, 0, 4*sizeof(cuDoubleComplex));
cutensorTensorDescriptor_t desc_Ac;
cutensorTensorDescriptor_t desc_Bc;
cutensorTensorDescriptor_t desc_Cc;
cutensorInitTensorDescriptor(&handle, &desc_Ac, 2, extent_A, NULL, CUDA_C_64F, CUTENSOR_OP_IDENTITY);
cutensorInitTensorDescriptor(&handle, &desc_Bc, 1, extent_B, NULL, CUDA_C_64F, CUTENSOR_OP_IDENTITY);
cutensorInitTensorDescriptor(&handle, &desc_Cc, 1, extent_C, stride_C, CUDA_C_64F, CUTENSOR_OP_IDENTITY);
uint32_t alignmentRequirement_Ac;
uint32_t alignmentRequirement_Bc;
uint32_t alignmentRequirement_Cc;
cutensorGetAlignmentRequirement(&handle, Ac_dev, &desc_Ac, &alignmentRequirement_Ac);
cutensorGetAlignmentRequirement(&handle, Bc_dev, &desc_Bc, &alignmentRequirement_Bc);
cutensorGetAlignmentRequirement(&handle, Cc_dev, &desc_Cc, &alignmentRequirement_Cc);
cutensorContractionDescriptor_t desc_cc;
cutensorInitContractionDescriptor(&handle, &desc_cc,
&desc_Ac, mode_A, alignmentRequirement_Ac,
&desc_Bc, mode_B, alignmentRequirement_Bc,
&desc_Cc, mode_C, alignmentRequirement_Cc,
&desc_Cc, mode_C, alignmentRequirement_Cc,
CUTENSOR_COMPUTE_64F);
cutensorContractionFind_t find_cc;
cutensorInitContractionFind(&handle, &find_cc, CUTENSOR_ALGO_DEFAULT);
size_t worksize_cc;
cutensorContractionGetWorkspace(&handle, &desc_cc, &find_cc, CUTENSOR_WORKSPACE_RECOMMENDED, &worksize_cc);
void* work_cc = nullptr;
if (worksize_cc > 0) {
cudaMalloc(&work_cc, worksize_cc);
}
cutensorContractionPlan_t plan_cc;
cutensorInitContractionPlan(&handle, &plan_cc, &desc_cc, &find_cc, worksize_cc);
cuDoubleComplex alpha_c = make_cuDoubleComplex(1.0,0.0);
cuDoubleComplex beta_c = make_cuDoubleComplex(0.0,0.0);
cutensorContraction(&handle, &plan_cc, &alpha_c, Ac_dev, Bc_dev, &beta_c, Cc_dev, Cc_dev, work_cc, worksize_cc, 0);
cudaMemcpy(Cc, Cc_dev, 4*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
printf("C=(%f,%f), (%f,%f), (%f,%f), (%f,%f)\n\n", Cc[0].x, Cc[0].y, Cc[1].x, Cc[1].y, Cc[2].x, Cc[2].y, Cc[3].x, Cc[3].y);
//////////////////////////////
// complex-real contraction //
//////////////////////////////
cudaMemset(Cc_dev, 0, 4*sizeof(cuDoubleComplex));
cutensorContractionDescriptor_t desc_cr;
cutensorInitContractionDescriptor(&handle, &desc_cr,
&desc_Ac, mode_A, alignmentRequirement_Ac,
&desc_Br, mode_B, alignmentRequirement_Br,
&desc_Cc, mode_C, alignmentRequirement_Cc,
&desc_Cc, mode_C, alignmentRequirement_Cc,
CUTENSOR_COMPUTE_64F);
cutensorContractionFind_t find_cr;
cutensorInitContractionFind(&handle, &find_cr, CUTENSOR_ALGO_DEFAULT);
size_t worksize_cr;
cutensorContractionGetWorkspace(&handle, &desc_cr, &find_cr, CUTENSOR_WORKSPACE_RECOMMENDED, &worksize_cr);
void* work_cr = nullptr;
if (worksize_cr > 0) {
cudaMalloc(&work_cr, worksize_cr);
}
cutensorContractionPlan_t plan_cr;
cutensorInitContractionPlan(&handle, &plan_cr, &desc_cr, &find_cr, worksize_cr);
cutensorContraction(&handle, &plan_cr, &alpha_c, Ac_dev, Br_dev, &beta_c, Cc_dev, Cc_dev, work_cr, worksize_cr, 0);
cudaMemcpy(Cc, Cc_dev, 4*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
printf("C=(%f,%f), (%f,%f), (%f,%f), (%f,%f)\n\n", Cc[0].x, Cc[0].y, Cc[1].x, Cc[1].y, Cc[2].x, Cc[2].y, Cc[3].x, Cc[3].y);
}
test result:
source code:
kernel.cu (7.2 KB)