Working with really large arrays in CUDA (how to prevent negative indexes?)

I have a “GeForce RTX 2060” with compute capability 7.5 and the CUDA SDK 10.1 Installed. I have a matrix that is about 6000x6000x3 or 108,000,000 elements. I’ve proved out my algorithm on a smaller matrix and there are no issues. I’ve based my work off the simple VectorAdd program in the SDK. I flatten my matrix so it is a single array. I’ve read about threads and blocks and grids and I feel like I have an understanding. I get systemic indexing errors when I try to work with this larger array. I get weird things like negative numbered arrays.

This makes sense since i’m assigning the index to an ‘int’ but what datatype should I use so it doesn’t overflow?

I couldn’t find where this question was answered but if there’s some other links or conversations I would welcome them.

Here is how I setup my kernel blocks and threads:

int threadsInX = 16;	int threadsInY = 4;	int threadsInZ = 16; //threadsInX*threadsInY*threadsInZ can not be > 1024
	   
	int blocksInX =  ((Xrows + threadsInX - 1) / threadsInX) > threadsInX ? ((Xrows + threadsInX - 1) / threadsInX) : threadsInX;
	int blocksInY = ((Xcols + threadsInY - 1) / threadsInY); // > threadsInY ? ((Xcols + threadsInY - 1) / threadsInY) : threadsInY;
	int blocksInZ =  ((Xrows + threadsInZ - 1) / threadsInZ) > threadsInZ ? ((Xrows + threadsInZ - 1) / threadsInZ) : threadsInZ;

	dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
	dim3 dimBlock = dim3(threadsInX, threadsInY, threadsInZ);
		
	printf("launch reshapeAndPermute CUDA kernel with %d %d %d blocks of %i, %i, %i threads\n", blocksInX, blocksInY, blocksInZ, threadsInX, threadsInY, threadsInZ);
	reshapeAndPermute << <dimGrid, dimBlock >> > (d_A, d_B, d_X, Xrows, Xcols, Xrows);

Here is the kernel itself (Still being debugged)

/**
 * CUDA Kernel Device code
 *
 * Flattens two 2D vector X into A and B. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void reshapeAndPermute(double* A, double* B, double* X, int numElementsRows, int numElementsCols, int numElementsDepth)
{
	unsigned int ii = (blockIdx.x * blockDim.x) + threadIdx.x;
	unsigned int jj = (blockIdx.y * blockDim.y) + threadIdx.y;
	unsigned int kk = (blockIdx.z * blockDim.z) + threadIdx.z;	

	__syncthreads();
	//so many checks...
	if((ii < numElementsRows && kk < numElementsDepth) && jj < numElementsCols){
		int idxA = ii + numElementsRows * (jj + kk * numElementsDepth);
		int idxX = ii + numElementsRows * jj;
		if ((abs(idxX) < (numElementsRows * numElementsCols) && abs(idxA) < (numElementsRows * numElementsDepth * numElementsCols + numElementsCols * numElementsRows))){
			A[idxA] = X[idxX];
			printf(" %i %i %i %d %d %d %d \r\n", ii, jj, kk, ii + numElementsRows * jj, ii + numElementsRows * (jj + kk * numElementsDepth),idxX, idxA);
			//printf("X[%i %i] = %f = A[%i %i %i] = %f  \r\n", ii, jj , X[idxX], ii, jj, kk, A[idxA]);
		}
	}

	if (ii < numElementsRows) {
		if (jj < numElementsCols) {
			if (kk < numElementsDepth) {

				int idxB = kk + numElementsRows * (jj + ii * numElementsDepth);
				int idxXB = kk + numElementsDepth * jj;
				if ((abs(idxXB) < (numElementsRows * numElementsCols) && abs(idxXB) < (numElementsRows * numElementsDepth * numElementsCols + numElementsCols * numElementsRows))) {
					//B[idxB] = X[idxXB]; //do permute
					printf(" %i %i %i %d %d \r\n", ii, jj, kk, idxB, idxXB);
					//printf("X[%i %i] = %f = B[%i %i %i] = %f  \r\n", jj, kk, X[idxXB], ii, jj, kk, B[idxB]); 
				}
			}
		}
	}

}

Any ideas of what’s going wrong here?

If you want help with debugging code, its usually my suggestion to provide a complete code that someone else can work on directly, without having to add anything or change anything.

Since you appear to be on windows, a common problem that people run into is the WDDM TDR function. You might google that and make sure that you are not running into a kernel duration timeout.

Other general recommendations, any time you are having trouble with a CUDA code, are to use proper CUDA error checking, and run your code with cuda-memcheck (which can be done in windows also, from a command prompt. If you prefer, a similar error checking facility is available in the CUDA plugin for Visual Studio.)

Hey thanks for getting back to me! I finally untangled my code for a few full examples I can post.
Thanks for your help looking at this. I’ll also research the issues you posted I haven’t done this yet.

as a side note if we can get this it will help a lot of matlab to C++ coders :-)

In this example you can see there are no issues:

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <assert.h>
#include <ctype.h>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <filesystem>
#include <iostream>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
using namespace std;

cudaError_t debugWrapper(int* c, int* a, int* b, unsigned int Xrows, unsigned int Xcols);

__global__ void reshapeAndPermute(int* A, int* B, int* X, int numElementsRows, int numElementsCols, int numElementsDepth)
{
	int ii = (blockIdx.x * blockDim.x) + threadIdx.x;
	int jj = (blockIdx.y * blockDim.y) + threadIdx.y;
	int kk = (blockIdx.z * blockDim.z) + threadIdx.z;

	__syncthreads();
	//so many checks...
	if ((ii < numElementsRows && kk < numElementsDepth) && jj < numElementsCols) {
		int idxA = ii + numElementsRows * (jj + kk * numElementsDepth);
		int idxX = ii + numElementsRows * jj;
		if ((abs(idxX) < (numElementsRows * numElementsCols) && abs(idxA) < (numElementsRows * numElementsDepth * numElementsCols + numElementsCols * numElementsRows))) {
			A[idxA] = X[idxX];
			//printf(" %i %i %i %d %d %d %d \r\n", ii, jj, kk, ii + numElementsRows * jj, ii + numElementsRows * (jj + kk * numElementsDepth), idxX, idxA);
			//printf("X[%i %i] = %i = A[%i %i %i] = %i  \r\n", ii, jj , X[idxX], ii, jj, kk, A[idxA]);
		}
	}

	if (ii < numElementsRows) {
		if (jj < numElementsCols) {
			if (kk < numElementsDepth) {

				int idxB = kk + numElementsRows * (jj + ii * numElementsDepth);
				int idxXB = kk + numElementsDepth * jj;
				if ((abs(idxXB) < (numElementsRows * numElementsCols) && abs(idxXB) < (numElementsRows * numElementsDepth * numElementsCols + numElementsCols * numElementsRows))) {
					B[idxB] = X[idxXB]; //do permute
					//printf(" %i %i %i %d %d \r\n", ii, jj, kk, idxB, idxXB);
					//printf("X[%i %i] = %i = B[%i %i %i] = %i  \r\n", jj, kk, X[idxXB], ii, jj, kk, B[idxB]); 
				}
			}
		}
	}

}

int main()
{

	/***Calculate Affinity Matrix***/
	//int h_Debug[12] = { 1,4,7,12,2,5,8,14,3,6,10,16 };
	/*
	A = 1  2  3
		4  5  6
		7  8  10
		12 14 16
	*/
	//double h_Debug[12] = { 1,2,3,4,5,6,7,8,10,12,14,16 };
	/* Note the transpose
	A = 1 5 10
		2 6 12
		3 7 14
		4 9 16
	*/
	const int Xrows = 4000; const int Xcols = 3;

    const int arraySize = Xrows*Xcols;
	int numElementsSquare = Xrows * Xcols; //size for the 2D array
	int numElements2D = Xrows * Xcols; //size for the 2D array
	int numElements = Xrows * Xrows * Xcols + Xrows * Xcols; //for the 3D array
	size_t size = numElements * sizeof(double);
	size_t size2D = numElements2D * sizeof(double);
	size_t sizeSquare = numElementsSquare * sizeof(double);

	// Allocate the host input vector A 
	int* h_A = (int*)calloc(size, sizeof(int)); // 3D flattened to 1D
	// Allocate the host input vector B
	int* h_B = (int*)calloc(size, sizeof(int)); // 3D flattened to 1D

	int* h_Debug = (int*)calloc(size2D, sizeof(int));
	// Initialize the host input vectors (for large arrays)
	for (int ii = 0; ii < Xrows; ++ii)
	{
		for (int jj = 0; jj < Xcols; jj++) {
			h_Debug[ii + Xrows*jj] = rand() % 100;         // v1 in the range 0 to 99
			if (ii + Xrows * jj < 10) {
				printf("h_Debug[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_Debug[ii + Xrows * jj]);
			}
		}
	}

    // Add vectors in parallel.
    cudaError_t cudaStatus = debugWrapper(h_Debug, h_A, h_B, Xrows, Xcols);

	for (int ii = 0; ii < 4; ii++) {
		for (int jj = 0; jj < 3; jj++) {
			for (int kk = 0; kk < 4; kk++) {
				printf("h_Debug[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_Debug[ii + Xrows * jj]);

				printf("h_A[%i] = h_A[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_A[ii + Xrows * (jj + kk * Xrows)]);

				printf("h_B[%i] = h_B[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_B[ii + Xrows * (jj + kk * Xrows)]); //do the permute( [3 2 1]) here
			}
		}
	}

    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
	
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t debugWrapper(int *h_X, int *h_A, int *h_B, unsigned int Xrows, unsigned int Xcols)
{
	// Print the vector length to be used, and compute its size
	int numElementsSquare = Xrows * Xrows; //size for the 2D array
	int numElements2D = Xrows * Xcols; //size for the 2D array
	int numElements = Xrows * Xrows * Xcols + Xrows * Xcols; //for the 3D array

	size_t size = numElements * sizeof(double);
	size_t size2D = numElements2D * sizeof(double);
	size_t sizeSquare = numElementsSquare * sizeof(double);

	cudaError_t err = cudaSuccess;
	// Verify that allocations succeeded
	if (h_A == NULL || h_B == NULL || h_X == NULL )
	{
		fprintf(stderr, "Failed to allocate host vectors!\n");
		exit(EXIT_FAILURE);
	}

	// Allocate the device input vector A
	int* d_A = NULL;
	err = cudaMalloc((void**)&d_A, size);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Allocate the device input vector B
	int* d_B = NULL;
	err = cudaMalloc((void**)&d_B, size);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Allocate the device input vector X
	int* d_X = NULL;
	err = cudaMalloc((void**)&d_X, size2D);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to allocate device vector X (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Copy the host input vectors A and B in host memory to the device input vectors in
	// device memory
	printf("Copy input data from the host memory to the CUDA device\n");
	err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaMemcpy(d_X, h_X, size2D, cudaMemcpyHostToDevice);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to copy vector X from host to device (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	//Launch the reshapeAndPermute and flatten CUDA Kernel
	int threadsInX = 16;	int threadsInY = 4;	int threadsInZ = 16; //threadsInX*threadsInY*threadsInZ can not be > 1024

	int blocksInX = ((Xrows + threadsInX - 1) / threadsInX) > threadsInX ? ((Xrows + threadsInX - 1) / threadsInX) : threadsInX;
	int blocksInY = ((Xcols + threadsInY - 1) / threadsInY); // > threadsInY ? ((Xcols + threadsInY - 1) / threadsInY) : threadsInY;
	int blocksInZ = ((Xrows + threadsInZ - 1) / threadsInZ) > threadsInZ ? ((Xrows + threadsInZ - 1) / threadsInZ) : threadsInZ;

	dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
	dim3 dimBlock = dim3(threadsInX, threadsInY, threadsInZ);

	printf("launch reshapeAndPermute CUDA kernel with %d %d %d blocks of %i, %i, %i threads\n", blocksInX, blocksInY, blocksInZ, threadsInX, threadsInY, threadsInZ);
	reshapeAndPermute << <dimGrid, dimBlock >> > (d_A, d_B, d_X, Xrows, Xcols, Xrows);
	err = cudaGetLastError();
	err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
	err = cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost);
	for (int ii = 0; ii < 4; ii++) {
		for (int jj = 0; jj < 3; jj++) {
			for (int kk = 0; kk < 4; kk++) {
				printf("h_X[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_X[ii + Xrows * jj]);

				printf("h_A[%i] = h_A[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_A[ii + Xrows * (jj + kk * Xrows)]);

				printf("h_B[%i] = h_B[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_B[ii + Xrows * (jj + kk * Xrows)]); //do the permute( [3 2 1]) here
			}
		}
	}

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to launch reshapeAndPermute kernel (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	
	// Free device global memory
	err = cudaFree(d_A);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaFree(d_B);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaFree(d_X);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to free device vector X (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Free host memory
	//free(h_A);
	//free(h_B);
	//free(h_X);

	printf("Done\n");

	return err;
}

Here’s the output

h_Debug[0] = h_X[0 0] = 41
h_Debug[4] = h_X[0 1] = 67
h_Debug[8] = h_X[0 2] = 34
h_Debug[1] = h_X[1 0] = 0
h_Debug[5] = h_X[1 1] = 69
h_Debug[9] = h_X[1 2] = 24
h_Debug[2] = h_X[2 0] = 78
h_Debug[6] = h_X[2 1] = 58
h_Debug[3] = h_X[3 0] = 64
h_Debug[7] = h_X[3 1] = 5
Copy input data from the host memory to the CUDA device
launch reshapeAndPermute CUDA kernel with 16 1 16 blocks of 16, 4, 16 threads
h_X[0] = h_X[0 0] = 41
h_A[0] = h_A[0 0 0] = 41
h_B[0] = h_B[0 0 0] = 41
h_X[0] = h_X[0 0] = 41
h_A[16] = h_A[0 0 1] = 41
h_B[16] = h_B[0 0 1] = 41
h_X[0] = h_X[0 0] = 41
h_A[32] = h_A[0 0 2] = 41
h_B[32] = h_B[0 0 2] = 41
h_X[0] = h_X[0 0] = 41
h_A[48] = h_A[0 0 3] = 41
h_B[48] = h_B[0 0 3] = 41
h_X[4] = h_X[0 1] = 67
h_A[4] = h_A[0 1 0] = 67
h_B[4] = h_B[0 1 0] = 67
h_X[4] = h_X[0 1] = 67
h_A[20] = h_A[0 1 1] = 67
h_B[20] = h_B[0 1 1] = 67
h_X[4] = h_X[0 1] = 67
h_A[36] = h_A[0 1 2] = 67
h_B[36] = h_B[0 1 2] = 67
h_X[4] = h_X[0 1] = 67
h_A[52] = h_A[0 1 3] = 67
h_B[52] = h_B[0 1 3] = 67
h_X[8] = h_X[0 2] = 34
h_A[8] = h_A[0 2 0] = 34
h_B[8] = h_B[0 2 0] = 34
h_X[8] = h_X[0 2] = 34
h_A[24] = h_A[0 2 1] = 34
h_B[24] = h_B[0 2 1] = 34
h_X[8] = h_X[0 2] = 34
h_A[40] = h_A[0 2 2] = 34
h_B[40] = h_B[0 2 2] = 34
h_X[8] = h_X[0 2] = 34
h_A[56] = h_A[0 2 3] = 34
h_B[56] = h_B[0 2 3] = 34
h_X[1] = h_X[1 0] = 0
h_A[1] = h_A[1 0 0] = 0
h_B[1] = h_B[1 0 0] = 0
h_X[1] = h_X[1 0] = 0
h_A[17] = h_A[1 0 1] = 0
h_B[17] = h_B[1 0 1] = 0
h_X[1] = h_X[1 0] = 0
h_A[33] = h_A[1 0 2] = 0
h_B[33] = h_B[1 0 2] = 0
h_X[1] = h_X[1 0] = 0
h_A[49] = h_A[1 0 3] = 0
h_B[49] = h_B[1 0 3] = 0
h_X[5] = h_X[1 1] = 69
h_A[5] = h_A[1 1 0] = 69
h_B[5] = h_B[1 1 0] = 69
h_X[5] = h_X[1 1] = 69
h_A[21] = h_A[1 1 1] = 69
h_B[21] = h_B[1 1 1] = 69
h_X[5] = h_X[1 1] = 69
h_A[37] = h_A[1 1 2] = 69
h_B[37] = h_B[1 1 2] = 69
h_X[5] = h_X[1 1] = 69
h_A[53] = h_A[1 1 3] = 69
h_B[53] = h_B[1 1 3] = 69
h_X[9] = h_X[1 2] = 24
h_A[9] = h_A[1 2 0] = 24
h_B[9] = h_B[1 2 0] = 24
h_X[9] = h_X[1 2] = 24
h_A[25] = h_A[1 2 1] = 24
h_B[25] = h_B[1 2 1] = 24
h_X[9] = h_X[1 2] = 24
h_A[41] = h_A[1 2 2] = 24
h_B[41] = h_B[1 2 2] = 24
h_X[9] = h_X[1 2] = 24
h_A[57] = h_A[1 2 3] = 24
h_B[57] = h_B[1 2 3] = 24
h_X[2] = h_X[2 0] = 78
h_A[2] = h_A[2 0 0] = 78
h_B[2] = h_B[2 0 0] = 78
h_X[2] = h_X[2 0] = 78
h_A[18] = h_A[2 0 1] = 78
h_B[18] = h_B[2 0 1] = 78
h_X[2] = h_X[2 0] = 78
h_A[34] = h_A[2 0 2] = 78
h_B[34] = h_B[2 0 2] = 78
h_X[2] = h_X[2 0] = 78
h_A[50] = h_A[2 0 3] = 78
h_B[50] = h_B[2 0 3] = 78
h_X[6] = h_X[2 1] = 58
h_A[6] = h_A[2 1 0] = 58
h_B[6] = h_B[2 1 0] = 58
h_X[6] = h_X[2 1] = 58
h_A[22] = h_A[2 1 1] = 58
h_B[22] = h_B[2 1 1] = 58
h_X[6] = h_X[2 1] = 58
h_A[38] = h_A[2 1 2] = 58
h_B[38] = h_B[2 1 2] = 58
h_X[6] = h_X[2 1] = 58
h_A[54] = h_A[2 1 3] = 58
h_B[54] = h_B[2 1 3] = 58
h_X[10] = h_X[2 2] = 62
h_A[10] = h_A[2 2 0] = 62
h_B[10] = h_B[2 2 0] = 62
h_X[10] = h_X[2 2] = 62
h_A[26] = h_A[2 2 1] = 62
h_B[26] = h_B[2 2 1] = 62
h_X[10] = h_X[2 2] = 62
h_A[42] = h_A[2 2 2] = 62
h_B[42] = h_B[2 2 2] = 62
h_X[10] = h_X[2 2] = 62
h_A[58] = h_A[2 2 3] = 62
h_B[58] = h_B[2 2 3] = 62
h_X[3] = h_X[3 0] = 64
h_A[3] = h_A[3 0 0] = 64
h_B[3] = h_B[3 0 0] = 64
h_X[3] = h_X[3 0] = 64
h_A[19] = h_A[3 0 1] = 64
h_B[19] = h_B[3 0 1] = 64
h_X[3] = h_X[3 0] = 64
h_A[35] = h_A[3 0 2] = 64
h_B[35] = h_B[3 0 2] = 64
h_X[3] = h_X[3 0] = 64
h_A[51] = h_A[3 0 3] = 64
h_B[51] = h_B[3 0 3] = 64
h_X[7] = h_X[3 1] = 5
h_A[7] = h_A[3 1 0] = 5
h_B[7] = h_B[3 1 0] = 5
h_X[7] = h_X[3 1] = 5
h_A[23] = h_A[3 1 1] = 5
h_B[23] = h_B[3 1 1] = 5
h_X[7] = h_X[3 1] = 5
h_A[39] = h_A[3 1 2] = 5
h_B[39] = h_B[3 1 2] = 5
h_X[7] = h_X[3 1] = 5
h_A[55] = h_A[3 1 3] = 5
h_B[55] = h_B[3 1 3] = 5
h_X[11] = h_X[3 2] = 45
h_A[11] = h_A[3 2 0] = 45
h_B[11] = h_B[3 2 0] = 45
h_X[11] = h_X[3 2] = 45
h_A[27] = h_A[3 2 1] = 45
h_B[27] = h_B[3 2 1] = 45
h_X[11] = h_X[3 2] = 45
h_A[43] = h_A[3 2 2] = 45
h_B[43] = h_B[3 2 2] = 45
h_X[11] = h_X[3 2] = 45
h_A[59] = h_A[3 2 3] = 45
h_B[59] = h_B[3 2 3] = 45
Done
h_Debug[0] = h_X[0 0] = 41
h_A[0] = h_A[0 0 0] = 41
h_B[0] = h_B[0 0 0] = 41
h_Debug[0] = h_X[0 0] = 41
h_A[16] = h_A[0 0 1] = 41
h_B[16] = h_B[0 0 1] = 41
h_Debug[0] = h_X[0 0] = 41
h_A[32] = h_A[0 0 2] = 41
h_B[32] = h_B[0 0 2] = 41
h_Debug[0] = h_X[0 0] = 41
h_A[48] = h_A[0 0 3] = 41
h_B[48] = h_B[0 0 3] = 41
h_Debug[4] = h_X[0 1] = 67
h_A[4] = h_A[0 1 0] = 67
h_B[4] = h_B[0 1 0] = 67
h_Debug[4] = h_X[0 1] = 67
h_A[20] = h_A[0 1 1] = 67
h_B[20] = h_B[0 1 1] = 67
h_Debug[4] = h_X[0 1] = 67
h_A[36] = h_A[0 1 2] = 67
h_B[36] = h_B[0 1 2] = 67
h_Debug[4] = h_X[0 1] = 67
h_A[52] = h_A[0 1 3] = 67
h_B[52] = h_B[0 1 3] = 67
h_Debug[8] = h_X[0 2] = 34
h_A[8] = h_A[0 2 0] = 34
h_B[8] = h_B[0 2 0] = 34
h_Debug[8] = h_X[0 2] = 34
h_A[24] = h_A[0 2 1] = 34
h_B[24] = h_B[0 2 1] = 34
h_Debug[8] = h_X[0 2] = 34
h_A[40] = h_A[0 2 2] = 34
h_B[40] = h_B[0 2 2] = 34
h_Debug[8] = h_X[0 2] = 34
h_A[56] = h_A[0 2 3] = 34
h_B[56] = h_B[0 2 3] = 34
h_Debug[1] = h_X[1 0] = 0
h_A[1] = h_A[1 0 0] = 0
h_B[1] = h_B[1 0 0] = 0
h_Debug[1] = h_X[1 0] = 0
h_A[17] = h_A[1 0 1] = 0
h_B[17] = h_B[1 0 1] = 0
h_Debug[1] = h_X[1 0] = 0
h_A[33] = h_A[1 0 2] = 0
h_B[33] = h_B[1 0 2] = 0
h_Debug[1] = h_X[1 0] = 0
h_A[49] = h_A[1 0 3] = 0
h_B[49] = h_B[1 0 3] = 0
h_Debug[5] = h_X[1 1] = 69
h_A[5] = h_A[1 1 0] = 69
h_B[5] = h_B[1 1 0] = 69
h_Debug[5] = h_X[1 1] = 69
h_A[21] = h_A[1 1 1] = 69
h_B[21] = h_B[1 1 1] = 69
h_Debug[5] = h_X[1 1] = 69
h_A[37] = h_A[1 1 2] = 69
h_B[37] = h_B[1 1 2] = 69
h_Debug[5] = h_X[1 1] = 69
h_A[53] = h_A[1 1 3] = 69
h_B[53] = h_B[1 1 3] = 69
h_Debug[9] = h_X[1 2] = 24
h_A[9] = h_A[1 2 0] = 24
h_B[9] = h_B[1 2 0] = 24
h_Debug[9] = h_X[1 2] = 24
h_A[25] = h_A[1 2 1] = 24
h_B[25] = h_B[1 2 1] = 24
h_Debug[9] = h_X[1 2] = 24
h_A[41] = h_A[1 2 2] = 24
h_B[41] = h_B[1 2 2] = 24
h_Debug[9] = h_X[1 2] = 24
h_A[57] = h_A[1 2 3] = 24
h_B[57] = h_B[1 2 3] = 24
h_Debug[2] = h_X[2 0] = 78
h_A[2] = h_A[2 0 0] = 78
h_B[2] = h_B[2 0 0] = 78
h_Debug[2] = h_X[2 0] = 78
h_A[18] = h_A[2 0 1] = 78
h_B[18] = h_B[2 0 1] = 78
h_Debug[2] = h_X[2 0] = 78
h_A[34] = h_A[2 0 2] = 78
h_B[34] = h_B[2 0 2] = 78
h_Debug[2] = h_X[2 0] = 78
h_A[50] = h_A[2 0 3] = 78
h_B[50] = h_B[2 0 3] = 78
h_Debug[6] = h_X[2 1] = 58
h_A[6] = h_A[2 1 0] = 58
h_B[6] = h_B[2 1 0] = 58
h_Debug[6] = h_X[2 1] = 58
h_A[22] = h_A[2 1 1] = 58
h_B[22] = h_B[2 1 1] = 58
h_Debug[6] = h_X[2 1] = 58
h_A[38] = h_A[2 1 2] = 58
h_B[38] = h_B[2 1 2] = 58
h_Debug[6] = h_X[2 1] = 58
h_A[54] = h_A[2 1 3] = 58
h_B[54] = h_B[2 1 3] = 58
h_Debug[10] = h_X[2 2] = 62
h_A[10] = h_A[2 2 0] = 62
h_B[10] = h_B[2 2 0] = 62
h_Debug[10] = h_X[2 2] = 62
h_A[26] = h_A[2 2 1] = 62
h_B[26] = h_B[2 2 1] = 62
h_Debug[10] = h_X[2 2] = 62
h_A[42] = h_A[2 2 2] = 62
h_B[42] = h_B[2 2 2] = 62
h_Debug[10] = h_X[2 2] = 62
h_A[58] = h_A[2 2 3] = 62
h_B[58] = h_B[2 2 3] = 62
h_Debug[3] = h_X[3 0] = 64
h_A[3] = h_A[3 0 0] = 64
h_B[3] = h_B[3 0 0] = 64
h_Debug[3] = h_X[3 0] = 64
h_A[19] = h_A[3 0 1] = 64
h_B[19] = h_B[3 0 1] = 64
h_Debug[3] = h_X[3 0] = 64
h_A[35] = h_A[3 0 2] = 64
h_B[35] = h_B[3 0 2] = 64
h_Debug[3] = h_X[3 0] = 64
h_A[51] = h_A[3 0 3] = 64
h_B[51] = h_B[3 0 3] = 64
h_Debug[7] = h_X[3 1] = 5
h_A[7] = h_A[3 1 0] = 5
h_B[7] = h_B[3 1 0] = 5
h_Debug[7] = h_X[3 1] = 5
h_A[23] = h_A[3 1 1] = 5
h_B[23] = h_B[3 1 1] = 5
h_Debug[7] = h_X[3 1] = 5
h_A[39] = h_A[3 1 2] = 5
h_B[39] = h_B[3 1 2] = 5
h_Debug[7] = h_X[3 1] = 5
h_A[55] = h_A[3 1 3] = 5
h_B[55] = h_B[3 1 3] = 5
h_Debug[11] = h_X[3 2] = 45
h_A[11] = h_A[3 2 0] = 45
h_B[11] = h_B[3 2 0] = 45
h_Debug[11] = h_X[3 2] = 45
h_A[27] = h_A[3 2 1] = 45
h_B[27] = h_B[3 2 1] = 45
h_Debug[11] = h_X[3 2] = 45
h_A[43] = h_A[3 2 2] = 45
h_B[43] = h_B[3 2 2] = 45
h_Debug[11] = h_X[3 2] = 45
h_A[59] = h_A[3 2 3] = 45
h_B[59] = h_B[3 2 3] = 45

C:\Users\shane\source\repos\BugWithReshape\x64\Debug\BugWithReshape.exe (process 12424) exited with code 0.
To automatically close the console when debugging stops, enable Tools->Options->Debugging->Automatically close the console when debugging stops.
Press any key to close this window . . .

But with this example I get an illegal memory access error

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <assert.h>
#include <ctype.h>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <filesystem>
#include <iostream>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
using namespace std;

cudaError_t debugWrapper(int* c, int* a, int* b, unsigned int Xrows, unsigned int Xcols);

__global__ void reshapeAndPermute(int* A, int* B, int* X, int numElementsRows, int numElementsCols, int numElementsDepth)
{
	int ii = (blockIdx.x * blockDim.x) + threadIdx.x;
	int jj = (blockIdx.y * blockDim.y) + threadIdx.y;
	int kk = (blockIdx.z * blockDim.z) + threadIdx.z;

	__syncthreads();
	//so many checks...
	if ((ii < numElementsRows && kk < numElementsDepth) && jj < numElementsCols) {
		int idxA = ii + numElementsRows * (jj + kk * numElementsDepth);
		int idxX = ii + numElementsRows * jj;
		if ((abs(idxX) < (numElementsRows * numElementsCols) && abs(idxA) < (numElementsRows * numElementsDepth * numElementsCols + numElementsCols * numElementsRows))) {
			A[idxA] = X[idxX];
			//printf(" %i %i %i %d %d %d %d \r\n", ii, jj, kk, ii + numElementsRows * jj, ii + numElementsRows * (jj + kk * numElementsDepth), idxX, idxA);
			//printf("X[%i %i] = %i = A[%i %i %i] = %i  \r\n", ii, jj , X[idxX], ii, jj, kk, A[idxA]);
		}
	}

	if (ii < numElementsRows) {
		if (jj < numElementsCols) {
			if (kk < numElementsDepth) {

				int idxB = kk + numElementsRows * (jj + ii * numElementsDepth);
				int idxXB = kk + numElementsDepth * jj;
				if ((abs(idxXB) < (numElementsRows * numElementsCols) && abs(idxXB) < (numElementsRows * numElementsDepth * numElementsCols + numElementsCols * numElementsRows))) {
					B[idxB] = X[idxXB]; //do permute
					//printf(" %i %i %i %d %d \r\n", ii, jj, kk, idxB, idxXB);
					//printf("X[%i %i] = %i = B[%i %i %i] = %i  \r\n", jj, kk, X[idxXB], ii, jj, kk, B[idxB]); 
				}
			}
		}
	}

}

int main()
{

	/***Calculate Affinity Matrix***/
	//int h_Debug[12] = { 1,4,7,12,2,5,8,14,3,6,10,16 };
	/*
	A = 1  2  3
		4  5  6
		7  8  10
		12 14 16
	*/
	//double h_Debug[12] = { 1,2,3,4,5,6,7,8,10,12,14,16 };
	/* Note the transpose
	A = 1 5 10
		2 6 12
		3 7 14
		4 9 16
	*/
	const int Xrows = 6000; const int Xcols = 3;

    const int arraySize = Xrows*Xcols;
	int numElementsSquare = Xrows * Xcols; //size for the 2D array
	int numElements2D = Xrows * Xcols; //size for the 2D array
	int numElements = Xrows * Xrows * Xcols + Xrows * Xcols; //for the 3D array
	size_t size = numElements * sizeof(double);
	size_t size2D = numElements2D * sizeof(double);
	size_t sizeSquare = numElementsSquare * sizeof(double);

	// Allocate the host input vector A 
	int* h_A = (int*)calloc(size, sizeof(int)); // 3D flattened to 1D
	// Allocate the host input vector B
	int* h_B = (int*)calloc(size, sizeof(int)); // 3D flattened to 1D

	int* h_Debug = (int*)calloc(size2D, sizeof(int));
	// Initialize the host input vectors (for large arrays)
	for (int ii = 0; ii < Xrows; ++ii)
	{
		for (int jj = 0; jj < Xcols; jj++) {
			h_Debug[ii + Xrows*jj] = rand() % 100;         // v1 in the range 0 to 99
			if (ii + Xrows * jj < 10) {
				printf("h_Debug[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_Debug[ii + Xrows * jj]);
			}
		}
	}

    // Add vectors in parallel.
    cudaError_t cudaStatus = debugWrapper(h_Debug, h_A, h_B, Xrows, Xcols);

	for (int ii = 0; ii < 4; ii++) {
		for (int jj = 0; jj < 3; jj++) {
			for (int kk = 0; kk < 4; kk++) {
				printf("h_Debug[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_Debug[ii + Xrows * jj]);

				printf("h_A[%i] = h_A[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_A[ii + Xrows * (jj + kk * Xrows)]);

				printf("h_B[%i] = h_B[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_B[ii + Xrows * (jj + kk * Xrows)]); //do the permute( [3 2 1]) here
			}
		}
	}

    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
	
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t debugWrapper(int *h_X, int *h_A, int *h_B, unsigned int Xrows, unsigned int Xcols)
{
	// Print the vector length to be used, and compute its size
	int numElementsSquare = Xrows * Xrows; //size for the 2D array
	int numElements2D = Xrows * Xcols; //size for the 2D array
	int numElements = Xrows * Xrows * Xcols + Xrows * Xcols; //for the 3D array

	size_t size = numElements * sizeof(double);
	size_t size2D = numElements2D * sizeof(double);
	size_t sizeSquare = numElementsSquare * sizeof(double);

	cudaError_t err = cudaSuccess;
	// Verify that allocations succeeded
	if (h_A == NULL || h_B == NULL || h_X == NULL )
	{
		fprintf(stderr, "Failed to allocate host vectors!\n");
		exit(EXIT_FAILURE);
	}

	// Allocate the device input vector A
	int* d_A = NULL;
	err = cudaMalloc((void**)&d_A, size);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Allocate the device input vector B
	int* d_B = NULL;
	err = cudaMalloc((void**)&d_B, size);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Allocate the device input vector X
	int* d_X = NULL;
	err = cudaMalloc((void**)&d_X, size2D);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to allocate device vector X (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Copy the host input vectors A and B in host memory to the device input vectors in
	// device memory
	printf("Copy input data from the host memory to the CUDA device\n");
	err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaMemcpy(d_X, h_X, size2D, cudaMemcpyHostToDevice);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to copy vector X from host to device (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	//Launch the reshapeAndPermute and flatten CUDA Kernel
	int threadsInX = 16;	int threadsInY = 4;	int threadsInZ = 16; //threadsInX*threadsInY*threadsInZ can not be > 1024

	int blocksInX = ((Xrows + threadsInX - 1) / threadsInX) > threadsInX ? ((Xrows + threadsInX - 1) / threadsInX) : threadsInX;
	int blocksInY = ((Xcols + threadsInY - 1) / threadsInY); // > threadsInY ? ((Xcols + threadsInY - 1) / threadsInY) : threadsInY;
	int blocksInZ = ((Xrows + threadsInZ - 1) / threadsInZ) > threadsInZ ? ((Xrows + threadsInZ - 1) / threadsInZ) : threadsInZ;

	dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
	dim3 dimBlock = dim3(threadsInX, threadsInY, threadsInZ);

	printf("launch reshapeAndPermute CUDA kernel with %d %d %d blocks of %i, %i, %i threads\n", blocksInX, blocksInY, blocksInZ, threadsInX, threadsInY, threadsInZ);
	reshapeAndPermute << <dimGrid, dimBlock >> > (d_A, d_B, d_X, Xrows, Xcols, Xrows);
	err = cudaGetLastError();
	err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
	err = cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost);
	for (int ii = 0; ii < 4; ii++) {
		for (int jj = 0; jj < 3; jj++) {
			for (int kk = 0; kk < 4; kk++) {
				printf("h_X[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_X[ii + Xrows * jj]);

				printf("h_A[%i] = h_A[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_A[ii + Xrows * (jj + kk * Xrows)]);

				printf("h_B[%i] = h_B[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_B[ii + Xrows * (jj + kk * Xrows)]); //do the permute( [3 2 1]) here
			}
		}
	}

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to launch reshapeAndPermute kernel (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	
	// Free device global memory
	err = cudaFree(d_A);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaFree(d_B);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaFree(d_X);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to free device vector X (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Free host memory
	//free(h_A);
	//free(h_B);
	//free(h_X);

	printf("Done\n");

	return err;
}

Here’s the output

h_Debug[0] = h_X[0 0] = 41
h_Debug[1] = h_X[1 0] = 0
h_Debug[2] = h_X[2 0] = 78
h_Debug[3] = h_X[3 0] = 64
h_Debug[4] = h_X[4 0] = 81
h_Debug[5] = h_X[5 0] = 91
h_Debug[6] = h_X[6 0] = 27
h_Debug[7] = h_X[7 0] = 4
h_Debug[8] = h_X[8 0] = 92
h_Debug[9] = h_X[9 0] = 16
Copy input data from the host memory to the CUDA device
launch reshapeAndPermute CUDA kernel with 375 1 375 blocks of 16, 4, 16 threads
h_X[0] = h_X[0 0] = 41
h_A[0] = h_A[0 0 0] = 0
h_B[0] = h_B[0 0 0] = 0
h_X[0] = h_X[0 0] = 41
h_A[36000000] = h_A[0 0 1] = 0
h_B[36000000] = h_B[0 0 1] = 0
h_X[0] = h_X[0 0] = 41
h_A[72000000] = h_A[0 0 2] = 0
h_B[72000000] = h_B[0 0 2] = 0
h_X[0] = h_X[0 0] = 41
h_A[108000000] = h_A[0 0 3] = 0
h_B[108000000] = h_B[0 0 3] = 0
h_X[6000] = h_X[0 1] = 67
h_A[6000] = h_A[0 1 0] = 0
h_B[6000] = h_B[0 1 0] = 0
h_X[6000] = h_X[0 1] = 67
h_A[36006000] = h_A[0 1 1] = 0
h_B[36006000] = h_B[0 1 1] = 0
h_X[6000] = h_X[0 1] = 67
h_A[72006000] = h_A[0 1 2] = 0
h_B[72006000] = h_B[0 1 2] = 0
h_X[6000] = h_X[0 1] = 67
h_A[108006000] = h_A[0 1 3] = 0
h_B[108006000] = h_B[0 1 3] = 0
h_X[12000] = h_X[0 2] = 34
h_A[12000] = h_A[0 2 0] = 0
h_B[12000] = h_B[0 2 0] = 0
h_X[12000] = h_X[0 2] = 34
h_A[36012000] = h_A[0 2 1] = 0
h_B[36012000] = h_B[0 2 1] = 0
h_X[12000] = h_X[0 2] = 34
h_A[72012000] = h_A[0 2 2] = 0
h_B[72012000] = h_B[0 2 2] = 0
h_X[12000] = h_X[0 2] = 34
h_A[108012000] = h_A[0 2 3] = 0
h_B[108012000] = h_B[0 2 3] = 0
h_X[1] = h_X[1 0] = 0
h_A[1] = h_A[1 0 0] = 0
h_B[1] = h_B[1 0 0] = 0
h_X[1] = h_X[1 0] = 0
h_A[36000001] = h_A[1 0 1] = 0
h_B[36000001] = h_B[1 0 1] = 0
h_X[1] = h_X[1 0] = 0
h_A[72000001] = h_A[1 0 2] = 0
h_B[72000001] = h_B[1 0 2] = 0
h_X[1] = h_X[1 0] = 0
h_A[108000001] = h_A[1 0 3] = 0
h_B[108000001] = h_B[1 0 3] = 0
h_X[6001] = h_X[1 1] = 69
h_A[6001] = h_A[1 1 0] = 0
h_B[6001] = h_B[1 1 0] = 0
h_X[6001] = h_X[1 1] = 69
h_A[36006001] = h_A[1 1 1] = 0
h_B[36006001] = h_B[1 1 1] = 0
h_X[6001] = h_X[1 1] = 69
h_A[72006001] = h_A[1 1 2] = 0
h_B[72006001] = h_B[1 1 2] = 0
h_X[6001] = h_X[1 1] = 69
h_A[108006001] = h_A[1 1 3] = 0
h_B[108006001] = h_B[1 1 3] = 0
h_X[12001] = h_X[1 2] = 24
h_A[12001] = h_A[1 2 0] = 0
h_B[12001] = h_B[1 2 0] = 0
h_X[12001] = h_X[1 2] = 24
h_A[36012001] = h_A[1 2 1] = 0
h_B[36012001] = h_B[1 2 1] = 0
h_X[12001] = h_X[1 2] = 24
h_A[72012001] = h_A[1 2 2] = 0
h_B[72012001] = h_B[1 2 2] = 0
h_X[12001] = h_X[1 2] = 24
h_A[108012001] = h_A[1 2 3] = 0
h_B[108012001] = h_B[1 2 3] = 0
h_X[2] = h_X[2 0] = 78
h_A[2] = h_A[2 0 0] = 0
h_B[2] = h_B[2 0 0] = 0
h_X[2] = h_X[2 0] = 78
h_A[36000002] = h_A[2 0 1] = 0
h_B[36000002] = h_B[2 0 1] = 0
h_X[2] = h_X[2 0] = 78
h_A[72000002] = h_A[2 0 2] = 0
h_B[72000002] = h_B[2 0 2] = 0
h_X[2] = h_X[2 0] = 78
h_A[108000002] = h_A[2 0 3] = 0
h_B[108000002] = h_B[2 0 3] = 0
h_X[6002] = h_X[2 1] = 58
h_A[6002] = h_A[2 1 0] = 0
h_B[6002] = h_B[2 1 0] = 0
h_X[6002] = h_X[2 1] = 58
h_A[36006002] = h_A[2 1 1] = 0
h_B[36006002] = h_B[2 1 1] = 0
h_X[6002] = h_X[2 1] = 58
h_A[72006002] = h_A[2 1 2] = 0
h_B[72006002] = h_B[2 1 2] = 0
h_X[6002] = h_X[2 1] = 58
h_A[108006002] = h_A[2 1 3] = 0
h_B[108006002] = h_B[2 1 3] = 0
h_X[12002] = h_X[2 2] = 62
h_A[12002] = h_A[2 2 0] = 0
h_B[12002] = h_B[2 2 0] = 0
h_X[12002] = h_X[2 2] = 62
h_A[36012002] = h_A[2 2 1] = 0
h_B[36012002] = h_B[2 2 1] = 0
h_X[12002] = h_X[2 2] = 62
h_A[72012002] = h_A[2 2 2] = 0
h_B[72012002] = h_B[2 2 2] = 0
h_X[12002] = h_X[2 2] = 62
h_A[108012002] = h_A[2 2 3] = 0
h_B[108012002] = h_B[2 2 3] = 0
h_X[3] = h_X[3 0] = 64
h_A[3] = h_A[3 0 0] = 0
h_B[3] = h_B[3 0 0] = 0
h_X[3] = h_X[3 0] = 64
h_A[36000003] = h_A[3 0 1] = 0
h_B[36000003] = h_B[3 0 1] = 0
h_X[3] = h_X[3 0] = 64
h_A[72000003] = h_A[3 0 2] = 0
h_B[72000003] = h_B[3 0 2] = 0
h_X[3] = h_X[3 0] = 64
h_A[108000003] = h_A[3 0 3] = 0
h_B[108000003] = h_B[3 0 3] = 0
h_X[6003] = h_X[3 1] = 5
h_A[6003] = h_A[3 1 0] = 0
h_B[6003] = h_B[3 1 0] = 0
h_X[6003] = h_X[3 1] = 5
h_A[36006003] = h_A[3 1 1] = 0
h_B[36006003] = h_B[3 1 1] = 0
h_X[6003] = h_X[3 1] = 5
h_A[72006003] = h_A[3 1 2] = 0
h_B[72006003] = h_B[3 1 2] = 0
h_X[6003] = h_X[3 1] = 5
h_A[108006003] = h_A[3 1 3] = 0
h_B[108006003] = h_B[3 1 3] = 0
h_X[12003] = h_X[3 2] = 45
h_A[12003] = h_A[3 2 0] = 0
h_B[12003] = h_B[3 2 0] = 0
h_X[12003] = h_X[3 2] = 45
h_A[36012003] = h_A[3 2 1] = 0
h_B[36012003] = h_B[3 2 1] = 0
h_X[12003] = h_X[3 2] = 45
h_A[72012003] = h_A[3 2 2] = 0
h_B[72012003] = h_B[3 2 2] = 0
h_X[12003] = h_X[3 2] = 45
h_A[108012003] = h_A[3 2 3] = 0
h_B[108012003] = h_B[3 2 3] = 0
Failed to launch reshapeAndPermute kernel (error code an illegal memory access was encountered)!

C:\Users\shane\source\repos\BugWithReshape\x64\Debug\BugWithReshape.exe (process 62620) exited with code 1.
To automatically close the console when debugging stops, enable Tools->Options->Debugging->Automatically close the console when debugging stops.
Press any key to close this window . . .

I do too. Invalid global writes at line 40 of your code, here:

B[idxB] = X[idxXB]; //do permute

It seems evident that the calculation of idxB is resulting in values that are out of range for the B array.

I used this method to localize the problem:

https://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218

One possible way to continue the debug process if you wish is by putting an if statement in that kernel, in place of that line of code, something like this:

if (idxB is less than zero or out of range for the B array) printf("idxB is out of range: %d\n", idxB);
else B[idxB] = X[idxXB]; //do permute

You should then be able to work backward from there, by adding more data to the printf statement (such as the elements and indices used to calculate idxB) if needed.

Hi Robert,
This solution seemed to work.
Thank you!