Hey, thanks for getting back to me! I finally untangled my code into a couple of complete examples I can post.
Thanks for your help looking at this. I'll also research the issues you posted; I haven't done that yet.
As a side note, if we can get this working it will help a lot of MATLAB-to-C++ coders :-)
In this first example you can see there are no issues:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <ctype.h>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
using namespace std;
cudaError_t debugWrapper(int* c, int* a, int* b, unsigned int Xrows, unsigned int Xcols);
__global__ void reshapeAndPermute(int* A, int* B, int* X, int numElementsRows, int numElementsCols, int numElementsDepth)
{
    int ii = (blockIdx.x * blockDim.x) + threadIdx.x;
    int jj = (blockIdx.y * blockDim.y) + threadIdx.y;
    int kk = (blockIdx.z * blockDim.z) + threadIdx.z;
    __syncthreads();
    //so many checks...
    if ((ii < numElementsRows && kk < numElementsDepth) && jj < numElementsCols) {
        int idxA = ii + numElementsRows * (jj + kk * numElementsDepth);
        int idxX = ii + numElementsRows * jj;
        if ((abs(idxX) < (numElementsRows * numElementsCols) && abs(idxA) < (numElementsRows * numElementsDepth * numElementsCols + numElementsCols * numElementsRows))) {
            A[idxA] = X[idxX];
            //printf(" %i %i %i %d %d %d %d \r\n", ii, jj, kk, ii + numElementsRows * jj, ii + numElementsRows * (jj + kk * numElementsDepth), idxX, idxA);
            //printf("X[%i %i] = %i = A[%i %i %i] = %i \r\n", ii, jj , X[idxX], ii, jj, kk, A[idxA]);
        }
    }
    if (ii < numElementsRows) {
        if (jj < numElementsCols) {
            if (kk < numElementsDepth) {
                int idxB = kk + numElementsRows * (jj + ii * numElementsDepth);
                int idxXB = kk + numElementsDepth * jj;
                if ((abs(idxXB) < (numElementsRows * numElementsCols) && abs(idxXB) < (numElementsRows * numElementsDepth * numElementsCols + numElementsCols * numElementsRows))) {
                    B[idxB] = X[idxXB]; //do permute
                    //printf(" %i %i %i %d %d \r\n", ii, jj, kk, idxB, idxXB);
                    //printf("X[%i %i] = %i = B[%i %i %i] = %i \r\n", jj, kk, X[idxXB], ii, jj, kk, B[idxB]);
                }
            }
        }
    }
}
int main()
{
/***Calculate Affinity Matrix***/
//int h_Debug[12] = { 1,4,7,12,2,5,8,14,3,6,10,16 };
/*
A = 1 2 3
4 5 6
7 8 10
12 14 16
*/
//double h_Debug[12] = { 1,2,3,4,5,6,7,8,10,12,14,16 };
/* Note the transpose
A = 1 5 10
2 6 12
3 7 14
4 9 16
*/
const int Xrows = 4000; const int Xcols = 3;
const int arraySize = Xrows*Xcols;
int numElementsSquare = Xrows * Xcols; //size for the 2D array
int numElements2D = Xrows * Xcols; //size for the 2D array
int numElements = Xrows * Xrows * Xcols + Xrows * Xcols; //for the 3D array
size_t size = numElements * sizeof(double);
size_t size2D = numElements2D * sizeof(double);
size_t sizeSquare = numElementsSquare * sizeof(double);
// Allocate the host input vector A
int* h_A = (int*)calloc(size, sizeof(int)); // 3D flattened to 1D
// Allocate the host input vector B
int* h_B = (int*)calloc(size, sizeof(int)); // 3D flattened to 1D
int* h_Debug = (int*)calloc(size2D, sizeof(int));
// Initialize the host input vectors (for large arrays)
for (int ii = 0; ii < Xrows; ++ii)
{
for (int jj = 0; jj < Xcols; jj++) {
h_Debug[ii + Xrows*jj] = rand() % 100; // v1 in the range 0 to 99
if (ii + Xrows * jj < 10) {
printf("h_Debug[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_Debug[ii + Xrows * jj]);
}
}
}
// Add vectors in parallel.
cudaError_t cudaStatus = debugWrapper(h_Debug, h_A, h_B, Xrows, Xcols);
for (int ii = 0; ii < 4; ii++) {
for (int jj = 0; jj < 3; jj++) {
for (int kk = 0; kk < 4; kk++) {
printf("h_Debug[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_Debug[ii + Xrows * jj]);
printf("h_A[%i] = h_A[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_A[ii + Xrows * (jj + kk * Xrows)]);
printf("h_B[%i] = h_B[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_B[ii + Xrows * (jj + kk * Xrows)]); //do the permute( [3 2 1]) here
}
}
}
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t debugWrapper(int *h_X, int *h_A, int *h_B, unsigned int Xrows, unsigned int Xcols)
{
// Print the vector length to be used, and compute its size
int numElementsSquare = Xrows * Xrows; //size for the 2D array
int numElements2D = Xrows * Xcols; //size for the 2D array
int numElements = Xrows * Xrows * Xcols + Xrows * Xcols; //for the 3D array
size_t size = numElements * sizeof(double);
size_t size2D = numElements2D * sizeof(double);
size_t sizeSquare = numElementsSquare * sizeof(double);
cudaError_t err = cudaSuccess;
// Verify that allocations succeeded
if (h_A == NULL || h_B == NULL || h_X == NULL )
{
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Allocate the device input vector A
int* d_A = NULL;
err = cudaMalloc((void**)&d_A, size);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Allocate the device input vector B
int* d_B = NULL;
err = cudaMalloc((void**)&d_B, size);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Allocate the device input vector X
int* d_X = NULL;
err = cudaMalloc((void**)&d_X, size2D);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector X (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the host input vectors A and B in host memory to the device input vectors in
// device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_X, h_X, size2D, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector X from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//Launch the reshapeAndPermute and flatten CUDA Kernel
int threadsInX = 16; int threadsInY = 4; int threadsInZ = 16; //threadsInX*threadsInY*threadsInZ can not be > 1024
int blocksInX = ((Xrows + threadsInX - 1) / threadsInX) > threadsInX ? ((Xrows + threadsInX - 1) / threadsInX) : threadsInX;
int blocksInY = ((Xcols + threadsInY - 1) / threadsInY); // > threadsInY ? ((Xcols + threadsInY - 1) / threadsInY) : threadsInY;
int blocksInZ = ((Xrows + threadsInZ - 1) / threadsInZ) > threadsInZ ? ((Xrows + threadsInZ - 1) / threadsInZ) : threadsInZ;
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(threadsInX, threadsInY, threadsInZ);
printf("launch reshapeAndPermute CUDA kernel with %d %d %d blocks of %i, %i, %i threads\n", blocksInX, blocksInY, blocksInZ, threadsInX, threadsInY, threadsInZ);
reshapeAndPermute << <dimGrid, dimBlock >> > (d_A, d_B, d_X, Xrows, Xcols, Xrows);
err = cudaGetLastError();
err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
err = cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost);
for (int ii = 0; ii < 4; ii++) {
for (int jj = 0; jj < 3; jj++) {
for (int kk = 0; kk < 4; kk++) {
printf("h_X[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_X[ii + Xrows * jj]);
printf("h_A[%i] = h_A[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_A[ii + Xrows * (jj + kk * Xrows)]);
printf("h_B[%i] = h_B[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_B[ii + Xrows * (jj + kk * Xrows)]); //do the permute( [3 2 1]) here
}
}
}
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch reshapeAndPermute kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free device global memory
err = cudaFree(d_A);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_B);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_X);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector X (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free host memory
//free(h_A);
//free(h_B);
//free(h_X);
printf("Done\n");
return err;
}
Here’s the output
h_Debug[0] = h_X[0 0] = 41
h_Debug[4] = h_X[0 1] = 67
h_Debug[8] = h_X[0 2] = 34
h_Debug[1] = h_X[1 0] = 0
h_Debug[5] = h_X[1 1] = 69
h_Debug[9] = h_X[1 2] = 24
h_Debug[2] = h_X[2 0] = 78
h_Debug[6] = h_X[2 1] = 58
h_Debug[3] = h_X[3 0] = 64
h_Debug[7] = h_X[3 1] = 5
Copy input data from the host memory to the CUDA device
launch reshapeAndPermute CUDA kernel with 16 1 16 blocks of 16, 4, 16 threads
h_X[0] = h_X[0 0] = 41
h_A[0] = h_A[0 0 0] = 41
h_B[0] = h_B[0 0 0] = 41
h_X[0] = h_X[0 0] = 41
h_A[16] = h_A[0 0 1] = 41
h_B[16] = h_B[0 0 1] = 41
h_X[0] = h_X[0 0] = 41
h_A[32] = h_A[0 0 2] = 41
h_B[32] = h_B[0 0 2] = 41
h_X[0] = h_X[0 0] = 41
h_A[48] = h_A[0 0 3] = 41
h_B[48] = h_B[0 0 3] = 41
h_X[4] = h_X[0 1] = 67
h_A[4] = h_A[0 1 0] = 67
h_B[4] = h_B[0 1 0] = 67
h_X[4] = h_X[0 1] = 67
h_A[20] = h_A[0 1 1] = 67
h_B[20] = h_B[0 1 1] = 67
h_X[4] = h_X[0 1] = 67
h_A[36] = h_A[0 1 2] = 67
h_B[36] = h_B[0 1 2] = 67
h_X[4] = h_X[0 1] = 67
h_A[52] = h_A[0 1 3] = 67
h_B[52] = h_B[0 1 3] = 67
h_X[8] = h_X[0 2] = 34
h_A[8] = h_A[0 2 0] = 34
h_B[8] = h_B[0 2 0] = 34
h_X[8] = h_X[0 2] = 34
h_A[24] = h_A[0 2 1] = 34
h_B[24] = h_B[0 2 1] = 34
h_X[8] = h_X[0 2] = 34
h_A[40] = h_A[0 2 2] = 34
h_B[40] = h_B[0 2 2] = 34
h_X[8] = h_X[0 2] = 34
h_A[56] = h_A[0 2 3] = 34
h_B[56] = h_B[0 2 3] = 34
h_X[1] = h_X[1 0] = 0
h_A[1] = h_A[1 0 0] = 0
h_B[1] = h_B[1 0 0] = 0
h_X[1] = h_X[1 0] = 0
h_A[17] = h_A[1 0 1] = 0
h_B[17] = h_B[1 0 1] = 0
h_X[1] = h_X[1 0] = 0
h_A[33] = h_A[1 0 2] = 0
h_B[33] = h_B[1 0 2] = 0
h_X[1] = h_X[1 0] = 0
h_A[49] = h_A[1 0 3] = 0
h_B[49] = h_B[1 0 3] = 0
h_X[5] = h_X[1 1] = 69
h_A[5] = h_A[1 1 0] = 69
h_B[5] = h_B[1 1 0] = 69
h_X[5] = h_X[1 1] = 69
h_A[21] = h_A[1 1 1] = 69
h_B[21] = h_B[1 1 1] = 69
h_X[5] = h_X[1 1] = 69
h_A[37] = h_A[1 1 2] = 69
h_B[37] = h_B[1 1 2] = 69
h_X[5] = h_X[1 1] = 69
h_A[53] = h_A[1 1 3] = 69
h_B[53] = h_B[1 1 3] = 69
h_X[9] = h_X[1 2] = 24
h_A[9] = h_A[1 2 0] = 24
h_B[9] = h_B[1 2 0] = 24
h_X[9] = h_X[1 2] = 24
h_A[25] = h_A[1 2 1] = 24
h_B[25] = h_B[1 2 1] = 24
h_X[9] = h_X[1 2] = 24
h_A[41] = h_A[1 2 2] = 24
h_B[41] = h_B[1 2 2] = 24
h_X[9] = h_X[1 2] = 24
h_A[57] = h_A[1 2 3] = 24
h_B[57] = h_B[1 2 3] = 24
h_X[2] = h_X[2 0] = 78
h_A[2] = h_A[2 0 0] = 78
h_B[2] = h_B[2 0 0] = 78
h_X[2] = h_X[2 0] = 78
h_A[18] = h_A[2 0 1] = 78
h_B[18] = h_B[2 0 1] = 78
h_X[2] = h_X[2 0] = 78
h_A[34] = h_A[2 0 2] = 78
h_B[34] = h_B[2 0 2] = 78
h_X[2] = h_X[2 0] = 78
h_A[50] = h_A[2 0 3] = 78
h_B[50] = h_B[2 0 3] = 78
h_X[6] = h_X[2 1] = 58
h_A[6] = h_A[2 1 0] = 58
h_B[6] = h_B[2 1 0] = 58
h_X[6] = h_X[2 1] = 58
h_A[22] = h_A[2 1 1] = 58
h_B[22] = h_B[2 1 1] = 58
h_X[6] = h_X[2 1] = 58
h_A[38] = h_A[2 1 2] = 58
h_B[38] = h_B[2 1 2] = 58
h_X[6] = h_X[2 1] = 58
h_A[54] = h_A[2 1 3] = 58
h_B[54] = h_B[2 1 3] = 58
h_X[10] = h_X[2 2] = 62
h_A[10] = h_A[2 2 0] = 62
h_B[10] = h_B[2 2 0] = 62
h_X[10] = h_X[2 2] = 62
h_A[26] = h_A[2 2 1] = 62
h_B[26] = h_B[2 2 1] = 62
h_X[10] = h_X[2 2] = 62
h_A[42] = h_A[2 2 2] = 62
h_B[42] = h_B[2 2 2] = 62
h_X[10] = h_X[2 2] = 62
h_A[58] = h_A[2 2 3] = 62
h_B[58] = h_B[2 2 3] = 62
h_X[3] = h_X[3 0] = 64
h_A[3] = h_A[3 0 0] = 64
h_B[3] = h_B[3 0 0] = 64
h_X[3] = h_X[3 0] = 64
h_A[19] = h_A[3 0 1] = 64
h_B[19] = h_B[3 0 1] = 64
h_X[3] = h_X[3 0] = 64
h_A[35] = h_A[3 0 2] = 64
h_B[35] = h_B[3 0 2] = 64
h_X[3] = h_X[3 0] = 64
h_A[51] = h_A[3 0 3] = 64
h_B[51] = h_B[3 0 3] = 64
h_X[7] = h_X[3 1] = 5
h_A[7] = h_A[3 1 0] = 5
h_B[7] = h_B[3 1 0] = 5
h_X[7] = h_X[3 1] = 5
h_A[23] = h_A[3 1 1] = 5
h_B[23] = h_B[3 1 1] = 5
h_X[7] = h_X[3 1] = 5
h_A[39] = h_A[3 1 2] = 5
h_B[39] = h_B[3 1 2] = 5
h_X[7] = h_X[3 1] = 5
h_A[55] = h_A[3 1 3] = 5
h_B[55] = h_B[3 1 3] = 5
h_X[11] = h_X[3 2] = 45
h_A[11] = h_A[3 2 0] = 45
h_B[11] = h_B[3 2 0] = 45
h_X[11] = h_X[3 2] = 45
h_A[27] = h_A[3 2 1] = 45
h_B[27] = h_B[3 2 1] = 45
h_X[11] = h_X[3 2] = 45
h_A[43] = h_A[3 2 2] = 45
h_B[43] = h_B[3 2 2] = 45
h_X[11] = h_X[3 2] = 45
h_A[59] = h_A[3 2 3] = 45
h_B[59] = h_B[3 2 3] = 45
Done
h_Debug[0] = h_X[0 0] = 41
h_A[0] = h_A[0 0 0] = 41
h_B[0] = h_B[0 0 0] = 41
h_Debug[0] = h_X[0 0] = 41
h_A[16] = h_A[0 0 1] = 41
h_B[16] = h_B[0 0 1] = 41
h_Debug[0] = h_X[0 0] = 41
h_A[32] = h_A[0 0 2] = 41
h_B[32] = h_B[0 0 2] = 41
h_Debug[0] = h_X[0 0] = 41
h_A[48] = h_A[0 0 3] = 41
h_B[48] = h_B[0 0 3] = 41
h_Debug[4] = h_X[0 1] = 67
h_A[4] = h_A[0 1 0] = 67
h_B[4] = h_B[0 1 0] = 67
h_Debug[4] = h_X[0 1] = 67
h_A[20] = h_A[0 1 1] = 67
h_B[20] = h_B[0 1 1] = 67
h_Debug[4] = h_X[0 1] = 67
h_A[36] = h_A[0 1 2] = 67
h_B[36] = h_B[0 1 2] = 67
h_Debug[4] = h_X[0 1] = 67
h_A[52] = h_A[0 1 3] = 67
h_B[52] = h_B[0 1 3] = 67
h_Debug[8] = h_X[0 2] = 34
h_A[8] = h_A[0 2 0] = 34
h_B[8] = h_B[0 2 0] = 34
h_Debug[8] = h_X[0 2] = 34
h_A[24] = h_A[0 2 1] = 34
h_B[24] = h_B[0 2 1] = 34
h_Debug[8] = h_X[0 2] = 34
h_A[40] = h_A[0 2 2] = 34
h_B[40] = h_B[0 2 2] = 34
h_Debug[8] = h_X[0 2] = 34
h_A[56] = h_A[0 2 3] = 34
h_B[56] = h_B[0 2 3] = 34
h_Debug[1] = h_X[1 0] = 0
h_A[1] = h_A[1 0 0] = 0
h_B[1] = h_B[1 0 0] = 0
h_Debug[1] = h_X[1 0] = 0
h_A[17] = h_A[1 0 1] = 0
h_B[17] = h_B[1 0 1] = 0
h_Debug[1] = h_X[1 0] = 0
h_A[33] = h_A[1 0 2] = 0
h_B[33] = h_B[1 0 2] = 0
h_Debug[1] = h_X[1 0] = 0
h_A[49] = h_A[1 0 3] = 0
h_B[49] = h_B[1 0 3] = 0
h_Debug[5] = h_X[1 1] = 69
h_A[5] = h_A[1 1 0] = 69
h_B[5] = h_B[1 1 0] = 69
h_Debug[5] = h_X[1 1] = 69
h_A[21] = h_A[1 1 1] = 69
h_B[21] = h_B[1 1 1] = 69
h_Debug[5] = h_X[1 1] = 69
h_A[37] = h_A[1 1 2] = 69
h_B[37] = h_B[1 1 2] = 69
h_Debug[5] = h_X[1 1] = 69
h_A[53] = h_A[1 1 3] = 69
h_B[53] = h_B[1 1 3] = 69
h_Debug[9] = h_X[1 2] = 24
h_A[9] = h_A[1 2 0] = 24
h_B[9] = h_B[1 2 0] = 24
h_Debug[9] = h_X[1 2] = 24
h_A[25] = h_A[1 2 1] = 24
h_B[25] = h_B[1 2 1] = 24
h_Debug[9] = h_X[1 2] = 24
h_A[41] = h_A[1 2 2] = 24
h_B[41] = h_B[1 2 2] = 24
h_Debug[9] = h_X[1 2] = 24
h_A[57] = h_A[1 2 3] = 24
h_B[57] = h_B[1 2 3] = 24
h_Debug[2] = h_X[2 0] = 78
h_A[2] = h_A[2 0 0] = 78
h_B[2] = h_B[2 0 0] = 78
h_Debug[2] = h_X[2 0] = 78
h_A[18] = h_A[2 0 1] = 78
h_B[18] = h_B[2 0 1] = 78
h_Debug[2] = h_X[2 0] = 78
h_A[34] = h_A[2 0 2] = 78
h_B[34] = h_B[2 0 2] = 78
h_Debug[2] = h_X[2 0] = 78
h_A[50] = h_A[2 0 3] = 78
h_B[50] = h_B[2 0 3] = 78
h_Debug[6] = h_X[2 1] = 58
h_A[6] = h_A[2 1 0] = 58
h_B[6] = h_B[2 1 0] = 58
h_Debug[6] = h_X[2 1] = 58
h_A[22] = h_A[2 1 1] = 58
h_B[22] = h_B[2 1 1] = 58
h_Debug[6] = h_X[2 1] = 58
h_A[38] = h_A[2 1 2] = 58
h_B[38] = h_B[2 1 2] = 58
h_Debug[6] = h_X[2 1] = 58
h_A[54] = h_A[2 1 3] = 58
h_B[54] = h_B[2 1 3] = 58
h_Debug[10] = h_X[2 2] = 62
h_A[10] = h_A[2 2 0] = 62
h_B[10] = h_B[2 2 0] = 62
h_Debug[10] = h_X[2 2] = 62
h_A[26] = h_A[2 2 1] = 62
h_B[26] = h_B[2 2 1] = 62
h_Debug[10] = h_X[2 2] = 62
h_A[42] = h_A[2 2 2] = 62
h_B[42] = h_B[2 2 2] = 62
h_Debug[10] = h_X[2 2] = 62
h_A[58] = h_A[2 2 3] = 62
h_B[58] = h_B[2 2 3] = 62
h_Debug[3] = h_X[3 0] = 64
h_A[3] = h_A[3 0 0] = 64
h_B[3] = h_B[3 0 0] = 64
h_Debug[3] = h_X[3 0] = 64
h_A[19] = h_A[3 0 1] = 64
h_B[19] = h_B[3 0 1] = 64
h_Debug[3] = h_X[3 0] = 64
h_A[35] = h_A[3 0 2] = 64
h_B[35] = h_B[3 0 2] = 64
h_Debug[3] = h_X[3 0] = 64
h_A[51] = h_A[3 0 3] = 64
h_B[51] = h_B[3 0 3] = 64
h_Debug[7] = h_X[3 1] = 5
h_A[7] = h_A[3 1 0] = 5
h_B[7] = h_B[3 1 0] = 5
h_Debug[7] = h_X[3 1] = 5
h_A[23] = h_A[3 1 1] = 5
h_B[23] = h_B[3 1 1] = 5
h_Debug[7] = h_X[3 1] = 5
h_A[39] = h_A[3 1 2] = 5
h_B[39] = h_B[3 1 2] = 5
h_Debug[7] = h_X[3 1] = 5
h_A[55] = h_A[3 1 3] = 5
h_B[55] = h_B[3 1 3] = 5
h_Debug[11] = h_X[3 2] = 45
h_A[11] = h_A[3 2 0] = 45
h_B[11] = h_B[3 2 0] = 45
h_Debug[11] = h_X[3 2] = 45
h_A[27] = h_A[3 2 1] = 45
h_B[27] = h_B[3 2 1] = 45
h_Debug[11] = h_X[3 2] = 45
h_A[43] = h_A[3 2 2] = 45
h_B[43] = h_B[3 2 2] = 45
h_Debug[11] = h_X[3 2] = 45
h_A[59] = h_A[3 2 3] = 45
h_B[59] = h_B[3 2 3] = 45
C:\Users\shane\source\repos\BugWithReshape\x64\Debug\BugWithReshape.exe (process 12424) exited with code 0.
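By the way, instead of eyeballing those prints I've been thinking about adding a small host-side check right after debugWrapper returns. This is just a sketch (verifySlices is a name I made up, it's not in the code above), under my assumption that every kk-slice of h_A should repeat h_X, which is what the prints show:
// Hypothetical helper, not part of the program above: compares the first few rows of every
// kk-slice of h_A against h_X using the same column-major index the kernel writes with,
// instead of relying on the printf dump. Returns the number of mismatches it finds.
#include <cstdio>
int verifySlices(const int* h_X, const int* h_A, int Xrows, int Xcols, int depthToCheck)
{
    int mismatches = 0;
    for (int kk = 0; kk < depthToCheck; kk++) {
        for (int jj = 0; jj < Xcols; jj++) {
            for (int ii = 0; ii < 4 && ii < Xrows; ii++) {              // first few rows, like the prints
                int expected = h_X[ii + Xrows * jj];                    // value in the 2D source array
                int got = h_A[ii + Xrows * (jj + kk * Xrows)];          // same flattened index as the kernel
                if (expected != got) {
                    printf("mismatch at [%i %i %i]: expected %i got %i \r\n", ii, jj, kk, expected, got);
                    mismatches++;
                }
            }
        }
    }
    return mismatches;
}
I'd call it as verifySlices(h_Debug, h_A, Xrows, Xcols, 4) and only trust the run if it returns 0; the analogous check for h_B would use the kk + Xrows * (jj + ii * Xrows) index the kernel writes.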
But with this example (Xrows = 6000) I get an illegal memory access error:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <ctype.h>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
using namespace std;
cudaError_t debugWrapper(int* c, int* a, int* b, unsigned int Xrows, unsigned int Xcols);
__global__ void reshapeAndPermute(int* A, int* B, int* X, int numElementsRows, int numElementsCols, int numElementsDepth)
{
    int ii = (blockIdx.x * blockDim.x) + threadIdx.x;
    int jj = (blockIdx.y * blockDim.y) + threadIdx.y;
    int kk = (blockIdx.z * blockDim.z) + threadIdx.z;
    __syncthreads();
    //so many checks...
    if ((ii < numElementsRows && kk < numElementsDepth) && jj < numElementsCols) {
        int idxA = ii + numElementsRows * (jj + kk * numElementsDepth);
        int idxX = ii + numElementsRows * jj;
        if ((abs(idxX) < (numElementsRows * numElementsCols) && abs(idxA) < (numElementsRows * numElementsDepth * numElementsCols + numElementsCols * numElementsRows))) {
            A[idxA] = X[idxX];
            //printf(" %i %i %i %d %d %d %d \r\n", ii, jj, kk, ii + numElementsRows * jj, ii + numElementsRows * (jj + kk * numElementsDepth), idxX, idxA);
            //printf("X[%i %i] = %i = A[%i %i %i] = %i \r\n", ii, jj , X[idxX], ii, jj, kk, A[idxA]);
        }
    }
    if (ii < numElementsRows) {
        if (jj < numElementsCols) {
            if (kk < numElementsDepth) {
                int idxB = kk + numElementsRows * (jj + ii * numElementsDepth);
                int idxXB = kk + numElementsDepth * jj;
                if ((abs(idxXB) < (numElementsRows * numElementsCols) && abs(idxXB) < (numElementsRows * numElementsDepth * numElementsCols + numElementsCols * numElementsRows))) {
                    B[idxB] = X[idxXB]; //do permute
                    //printf(" %i %i %i %d %d \r\n", ii, jj, kk, idxB, idxXB);
                    //printf("X[%i %i] = %i = B[%i %i %i] = %i \r\n", jj, kk, X[idxXB], ii, jj, kk, B[idxB]);
                }
            }
        }
    }
}
int main()
{
/***Calculate Affinity Matrix***/
//int h_Debug[12] = { 1,4,7,12,2,5,8,14,3,6,10,16 };
/*
A = 1 2 3
4 5 6
7 8 10
12 14 16
*/
//double h_Debug[12] = { 1,2,3,4,5,6,7,8,10,12,14,16 };
/* Note the transpose
A = 1 5 10
2 6 12
3 7 14
4 9 16
*/
const int Xrows = 6000; const int Xcols = 3;
const int arraySize = Xrows*Xcols;
int numElementsSquare = Xrows * Xcols; //size for the 2D array
int numElements2D = Xrows * Xcols; //size for the 2D array
int numElements = Xrows * Xrows * Xcols + Xrows * Xcols; //for the 3D array
size_t size = numElements * sizeof(double);
size_t size2D = numElements2D * sizeof(double);
size_t sizeSquare = numElementsSquare * sizeof(double);
// Allocate the host input vector A
int* h_A = (int*)calloc(size, sizeof(int)); // 3D flattened to 1D
// Allocate the host input vector B
int* h_B = (int*)calloc(size, sizeof(int)); // 3D flattened to 1D
int* h_Debug = (int*)calloc(size2D, sizeof(int));
// Initialize the host input vectors (for large arrays)
for (int ii = 0; ii < Xrows; ++ii)
{
for (int jj = 0; jj < Xcols; jj++) {
h_Debug[ii + Xrows*jj] = rand() % 100; // v1 in the range 0 to 99
if (ii + Xrows * jj < 10) {
printf("h_Debug[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_Debug[ii + Xrows * jj]);
}
}
}
// Add vectors in parallel.
cudaError_t cudaStatus = debugWrapper(h_Debug, h_A, h_B, Xrows, Xcols);
for (int ii = 0; ii < 4; ii++) {
for (int jj = 0; jj < 3; jj++) {
for (int kk = 0; kk < 4; kk++) {
printf("h_Debug[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_Debug[ii + Xrows * jj]);
printf("h_A[%i] = h_A[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_A[ii + Xrows * (jj + kk * Xrows)]);
printf("h_B[%i] = h_B[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_B[ii + Xrows * (jj + kk * Xrows)]); //do the permute( [3 2 1]) here
}
}
}
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t debugWrapper(int *h_X, int *h_A, int *h_B, unsigned int Xrows, unsigned int Xcols)
{
// Print the vector length to be used, and compute its size
int numElementsSquare = Xrows * Xrows; //size for the 2D array
int numElements2D = Xrows * Xcols; //size for the 2D array
int numElements = Xrows * Xrows * Xcols + Xrows * Xcols; //for the 3D array
size_t size = numElements * sizeof(double);
size_t size2D = numElements2D * sizeof(double);
size_t sizeSquare = numElementsSquare * sizeof(double);
cudaError_t err = cudaSuccess;
// Verify that allocations succeeded
if (h_A == NULL || h_B == NULL || h_X == NULL )
{
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Allocate the device input vector A
int* d_A = NULL;
err = cudaMalloc((void**)&d_A, size);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Allocate the device input vector B
int* d_B = NULL;
err = cudaMalloc((void**)&d_B, size);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Allocate the device input vector X
int* d_X = NULL;
err = cudaMalloc((void**)&d_X, size2D);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector X (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the host input vectors A and B in host memory to the device input vectors in
// device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_X, h_X, size2D, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector X from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//Launch the reshapeAndPermute and flatten CUDA Kernel
int threadsInX = 16; int threadsInY = 4; int threadsInZ = 16; //threadsInX*threadsInY*threadsInZ can not be > 1024
int blocksInX = ((Xrows + threadsInX - 1) / threadsInX) > threadsInX ? ((Xrows + threadsInX - 1) / threadsInX) : threadsInX;
int blocksInY = ((Xcols + threadsInY - 1) / threadsInY); // > threadsInY ? ((Xcols + threadsInY - 1) / threadsInY) : threadsInY;
int blocksInZ = ((Xrows + threadsInZ - 1) / threadsInZ) > threadsInZ ? ((Xrows + threadsInZ - 1) / threadsInZ) : threadsInZ;
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(threadsInX, threadsInY, threadsInZ);
printf("launch reshapeAndPermute CUDA kernel with %d %d %d blocks of %i, %i, %i threads\n", blocksInX, blocksInY, blocksInZ, threadsInX, threadsInY, threadsInZ);
reshapeAndPermute << <dimGrid, dimBlock >> > (d_A, d_B, d_X, Xrows, Xcols, Xrows);
err = cudaGetLastError();
err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
err = cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost);
for (int ii = 0; ii < 4; ii++) {
for (int jj = 0; jj < 3; jj++) {
for (int kk = 0; kk < 4; kk++) {
printf("h_X[%i] = h_X[%i %i] = %i \r\n", ii + Xrows * jj, ii, jj, h_X[ii + Xrows * jj]);
printf("h_A[%i] = h_A[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_A[ii + Xrows * (jj + kk * Xrows)]);
printf("h_B[%i] = h_B[%i %i %i] = %i \r\n", ii + Xrows * (jj + kk * Xrows), ii, jj, kk, h_B[ii + Xrows * (jj + kk * Xrows)]); //do the permute( [3 2 1]) here
}
}
}
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch reshapeAndPermute kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free device global memory
err = cudaFree(d_A);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_B);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_X);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector X (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free host memory
//free(h_A);
//free(h_B);
//free(h_X);
printf("Done\n");
return err;
}
Here’s the output
h_Debug[0] = h_X[0 0] = 41
h_Debug[1] = h_X[1 0] = 0
h_Debug[2] = h_X[2 0] = 78
h_Debug[3] = h_X[3 0] = 64
h_Debug[4] = h_X[4 0] = 81
h_Debug[5] = h_X[5 0] = 91
h_Debug[6] = h_X[6 0] = 27
h_Debug[7] = h_X[7 0] = 4
h_Debug[8] = h_X[8 0] = 92
h_Debug[9] = h_X[9 0] = 16
Copy input data from the host memory to the CUDA device
launch reshapeAndPermute CUDA kernel with 375 1 375 blocks of 16, 4, 16 threads
h_X[0] = h_X[0 0] = 41
h_A[0] = h_A[0 0 0] = 0
h_B[0] = h_B[0 0 0] = 0
h_X[0] = h_X[0 0] = 41
h_A[36000000] = h_A[0 0 1] = 0
h_B[36000000] = h_B[0 0 1] = 0
h_X[0] = h_X[0 0] = 41
h_A[72000000] = h_A[0 0 2] = 0
h_B[72000000] = h_B[0 0 2] = 0
h_X[0] = h_X[0 0] = 41
h_A[108000000] = h_A[0 0 3] = 0
h_B[108000000] = h_B[0 0 3] = 0
h_X[6000] = h_X[0 1] = 67
h_A[6000] = h_A[0 1 0] = 0
h_B[6000] = h_B[0 1 0] = 0
h_X[6000] = h_X[0 1] = 67
h_A[36006000] = h_A[0 1 1] = 0
h_B[36006000] = h_B[0 1 1] = 0
h_X[6000] = h_X[0 1] = 67
h_A[72006000] = h_A[0 1 2] = 0
h_B[72006000] = h_B[0 1 2] = 0
h_X[6000] = h_X[0 1] = 67
h_A[108006000] = h_A[0 1 3] = 0
h_B[108006000] = h_B[0 1 3] = 0
h_X[12000] = h_X[0 2] = 34
h_A[12000] = h_A[0 2 0] = 0
h_B[12000] = h_B[0 2 0] = 0
h_X[12000] = h_X[0 2] = 34
h_A[36012000] = h_A[0 2 1] = 0
h_B[36012000] = h_B[0 2 1] = 0
h_X[12000] = h_X[0 2] = 34
h_A[72012000] = h_A[0 2 2] = 0
h_B[72012000] = h_B[0 2 2] = 0
h_X[12000] = h_X[0 2] = 34
h_A[108012000] = h_A[0 2 3] = 0
h_B[108012000] = h_B[0 2 3] = 0
h_X[1] = h_X[1 0] = 0
h_A[1] = h_A[1 0 0] = 0
h_B[1] = h_B[1 0 0] = 0
h_X[1] = h_X[1 0] = 0
h_A[36000001] = h_A[1 0 1] = 0
h_B[36000001] = h_B[1 0 1] = 0
h_X[1] = h_X[1 0] = 0
h_A[72000001] = h_A[1 0 2] = 0
h_B[72000001] = h_B[1 0 2] = 0
h_X[1] = h_X[1 0] = 0
h_A[108000001] = h_A[1 0 3] = 0
h_B[108000001] = h_B[1 0 3] = 0
h_X[6001] = h_X[1 1] = 69
h_A[6001] = h_A[1 1 0] = 0
h_B[6001] = h_B[1 1 0] = 0
h_X[6001] = h_X[1 1] = 69
h_A[36006001] = h_A[1 1 1] = 0
h_B[36006001] = h_B[1 1 1] = 0
h_X[6001] = h_X[1 1] = 69
h_A[72006001] = h_A[1 1 2] = 0
h_B[72006001] = h_B[1 1 2] = 0
h_X[6001] = h_X[1 1] = 69
h_A[108006001] = h_A[1 1 3] = 0
h_B[108006001] = h_B[1 1 3] = 0
h_X[12001] = h_X[1 2] = 24
h_A[12001] = h_A[1 2 0] = 0
h_B[12001] = h_B[1 2 0] = 0
h_X[12001] = h_X[1 2] = 24
h_A[36012001] = h_A[1 2 1] = 0
h_B[36012001] = h_B[1 2 1] = 0
h_X[12001] = h_X[1 2] = 24
h_A[72012001] = h_A[1 2 2] = 0
h_B[72012001] = h_B[1 2 2] = 0
h_X[12001] = h_X[1 2] = 24
h_A[108012001] = h_A[1 2 3] = 0
h_B[108012001] = h_B[1 2 3] = 0
h_X[2] = h_X[2 0] = 78
h_A[2] = h_A[2 0 0] = 0
h_B[2] = h_B[2 0 0] = 0
h_X[2] = h_X[2 0] = 78
h_A[36000002] = h_A[2 0 1] = 0
h_B[36000002] = h_B[2 0 1] = 0
h_X[2] = h_X[2 0] = 78
h_A[72000002] = h_A[2 0 2] = 0
h_B[72000002] = h_B[2 0 2] = 0
h_X[2] = h_X[2 0] = 78
h_A[108000002] = h_A[2 0 3] = 0
h_B[108000002] = h_B[2 0 3] = 0
h_X[6002] = h_X[2 1] = 58
h_A[6002] = h_A[2 1 0] = 0
h_B[6002] = h_B[2 1 0] = 0
h_X[6002] = h_X[2 1] = 58
h_A[36006002] = h_A[2 1 1] = 0
h_B[36006002] = h_B[2 1 1] = 0
h_X[6002] = h_X[2 1] = 58
h_A[72006002] = h_A[2 1 2] = 0
h_B[72006002] = h_B[2 1 2] = 0
h_X[6002] = h_X[2 1] = 58
h_A[108006002] = h_A[2 1 3] = 0
h_B[108006002] = h_B[2 1 3] = 0
h_X[12002] = h_X[2 2] = 62
h_A[12002] = h_A[2 2 0] = 0
h_B[12002] = h_B[2 2 0] = 0
h_X[12002] = h_X[2 2] = 62
h_A[36012002] = h_A[2 2 1] = 0
h_B[36012002] = h_B[2 2 1] = 0
h_X[12002] = h_X[2 2] = 62
h_A[72012002] = h_A[2 2 2] = 0
h_B[72012002] = h_B[2 2 2] = 0
h_X[12002] = h_X[2 2] = 62
h_A[108012002] = h_A[2 2 3] = 0
h_B[108012002] = h_B[2 2 3] = 0
h_X[3] = h_X[3 0] = 64
h_A[3] = h_A[3 0 0] = 0
h_B[3] = h_B[3 0 0] = 0
h_X[3] = h_X[3 0] = 64
h_A[36000003] = h_A[3 0 1] = 0
h_B[36000003] = h_B[3 0 1] = 0
h_X[3] = h_X[3 0] = 64
h_A[72000003] = h_A[3 0 2] = 0
h_B[72000003] = h_B[3 0 2] = 0
h_X[3] = h_X[3 0] = 64
h_A[108000003] = h_A[3 0 3] = 0
h_B[108000003] = h_B[3 0 3] = 0
h_X[6003] = h_X[3 1] = 5
h_A[6003] = h_A[3 1 0] = 0
h_B[6003] = h_B[3 1 0] = 0
h_X[6003] = h_X[3 1] = 5
h_A[36006003] = h_A[3 1 1] = 0
h_B[36006003] = h_B[3 1 1] = 0
h_X[6003] = h_X[3 1] = 5
h_A[72006003] = h_A[3 1 2] = 0
h_B[72006003] = h_B[3 1 2] = 0
h_X[6003] = h_X[3 1] = 5
h_A[108006003] = h_A[3 1 3] = 0
h_B[108006003] = h_B[3 1 3] = 0
h_X[12003] = h_X[3 2] = 45
h_A[12003] = h_A[3 2 0] = 0
h_B[12003] = h_B[3 2 0] = 0
h_X[12003] = h_X[3 2] = 45
h_A[36012003] = h_A[3 2 1] = 0
h_B[36012003] = h_B[3 2 1] = 0
h_X[12003] = h_X[3 2] = 45
h_A[72012003] = h_A[3 2 2] = 0
h_B[72012003] = h_B[3 2 2] = 0
h_X[12003] = h_X[3 2] = 45
h_A[108012003] = h_A[3 2 3] = 0
h_B[108012003] = h_B[3 2 3] = 0
Failed to launch reshapeAndPermute kernel (error code an illegal memory access was encountered)!
C:\Users\shane\source\repos\BugWithReshape\x64\Debug\BugWithReshape.exe (process 62620) exited with code 1.
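One thing I started doing while writing this up (just a guess on my side, not a confirmed diagnosis) is redoing the size and index arithmetic from debugWrapper and the kernel in 64-bit on the host, to see what actually blows past INT_MAX or the allocation once Xrows = 6000. This is a standalone sketch, not part of the program above:
// Host-only sanity check of the arithmetic, done in 64-bit. The formulas are copied from
// debugWrapper and from the worst-case kernel indices (ii = Xrows - 1, jj = Xcols - 1,
// kk = Xrows - 1, with numElementsDepth = Xrows, exactly as the kernel is launched).
#include <climits>
#include <cstdio>
int main()
{
    const long long Xrows = 6000, Xcols = 3;                                            // failing case
    const long long numElements = Xrows * Xrows * Xcols + Xrows * Xcols;                // 3D element count
    const long long sizeBytes = numElements * (long long)sizeof(double);                // bytes given to cudaMalloc
    const long long intsThatFit = sizeBytes / (long long)sizeof(int);                   // capacity of d_A/d_B as ints
    const long long maxIdx = (Xrows - 1) + Xrows * ((Xcols - 1) + (Xrows - 1) * Xrows); // largest idxA / idxB
    printf("numElements         = %lld\n", numElements);
    printf("size in bytes       = %lld\n", sizeBytes);
    printf("ints that fit       = %lld\n", intsThatFit);
    printf("max kernel index    = %lld\n", maxIdx);
    printf("INT_MAX             = %d\n", INT_MAX);
    printf("index > INT_MAX?    %s\n", maxIdx > INT_MAX ? "yes" : "no");
    printf("index > allocation? %s\n", maxIdx > intsThatFit ? "yes" : "no");
    return 0;
}
If I did the math right, the max index comes out to 215,964,017,999, which is way past both INT_MAX and what the int buffers can hold, so that's one of the things I want to rule out next.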