I always get “CUDA error: invalid argument”
Please don’t post code as an attachment on these forums. Post it inline, using the forum tools such as </>
button to format code.
#include <cuda_runtime.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#define WIDTH 8
#define HEIGHT 1024*1024
// Demonstrates pitched 2D allocation plus host<->device and device<->device
// 2D copies.  Note: cudaMemcpy2D copies rows to rows; it cannot transpose a
// matrix, so both device allocations use the same width x height layout and
// every copy moves `width` bytes per row for all `height` rows.
int main() {
    const int width = WIDTH;
    const int height = HEIGHT;
    const size_t elems = (size_t)width * height;

    // Abort with a readable message on any CUDA runtime failure.
    auto check = [](cudaError_t e, const char *what) {
        if (e != cudaSuccess) {
            std::cerr << what << ": " << cudaGetErrorString(e) << std::endl;
            std::exit(-1);
        }
    };

    uint8_t *h_idata = new uint8_t[elems];
    uint8_t *h_odata = new uint8_t[elems];
    for (size_t i = 0; i < elems; ++i) {
        h_idata[i] = static_cast<uint8_t>(i % 256);
    }

    uint8_t *d_idata, *d_odata;
    size_t pitch_src, pitch_dst;
    // Both device buffers share the same logical layout: `width` bytes per
    // row, `height` rows.  The returned pitch may be wider than the row.
    check(cudaMallocPitch((void**)&d_idata, &pitch_src, width * sizeof(uint8_t), height),
          "cudaMallocPitch d_idata");
    printf("src pitch from %d to %zu\n", width, pitch_src);  // %zu: pitch is size_t
    check(cudaMallocPitch((void**)&d_odata, &pitch_dst, width * sizeof(uint8_t), height),
          "cudaMallocPitch d_odata");
    printf("dst pitch from %d to %zu\n", width, pitch_dst);

    check(cudaMemcpy2D(d_idata, pitch_src, h_idata, width * sizeof(uint8_t),
                       width * sizeof(uint8_t), height, cudaMemcpyHostToDevice),
          "cudaMemcpy2D H2D");

    cudaStream_t stream;
    check(cudaStreamCreate(&stream), "cudaStreamCreate");

    // Device-to-device copy: `height` rows of `width` bytes each.  The row
    // count must be valid for the destination allocation — passing the
    // source's row count against a destination with fewer rows is what
    // produced the original "invalid argument" error.
    check(cudaMemcpy2DAsync(d_odata, pitch_dst, d_idata, pitch_src,
                            width * sizeof(uint8_t), height,
                            cudaMemcpyDeviceToDevice, stream),
          "cudaMemcpy2DAsync D2D");
    check(cudaStreamSynchronize(stream), "cudaStreamSynchronize");

    check(cudaMemcpy2D(h_odata, width * sizeof(uint8_t), d_odata, pitch_dst,
                       width * sizeof(uint8_t), height, cudaMemcpyDeviceToHost),
          "cudaMemcpy2D D2H");

    for (int i = 0; i < 10; ++i) {
        std::cout << static_cast<int>(h_odata[i]) << " ";
    }
    std::cout << std::endl;

    delete[] h_idata;
    delete[] h_odata;
    check(cudaFree(d_idata), "cudaFree d_idata");
    check(cudaFree(d_odata), "cudaFree d_odata");
    check(cudaStreamDestroy(stream), "cudaStreamDestroy");
    return 0;
}
//nvcc -g -o cpy2d memcp2d.cu
Please format the code properly. One possible approach:
edit your post using the pencil icon below the post. Select the code. Press the </>
button. Save your changes.
Please do that now. Thanks.
I guess it’s done now
I don’t think you’ll be able to transpose a matrix using a single cudaMemcpy2D
, or a small fixed number of them, independent of matrix dimensions. As njuffa points out below, it should be possible to use one cudaMemcpy2D
call per matrix row or column, to perform a transpose. Given that there is an interest in performance here, that is not likely to be an interesting realization.
The cudaMemcpy2D
function copies rows or portions of rows from one place to another. There is no way to get rows (more than 1) in one matrix to be copied to columns (more than 1) in another, using a single call.
Anyway there are several issues:
- This is incorrect:
specifically the use of height
as the number of rows to copy. While the d_idata
matrix/allocation has that many rows, the d_odata
does not. cudaMemcpy2D
won’t somehow fix that for you. You will get an invalid argument error. If you “fix” that e.g. by changing height
there to width
, then you will run into another issue.
a row-copy size of height*sizeof(uint8_t)
will not fit into a destination pitch of width*sizeof(uint8_t)
.
Additionally, the use of %d
as a printf
format specifier for 64-bit quantities is incorrect.
thanks Robert — can you give some hints for improving the performance of transposing a matrix like 8x1M if I write my own kernel?
Right now I use the NPP transpose, but the transpose performance on this skinny matrix is poor compared with a matrix of 1Mx1M.
a typical piece of advice is to avoid transposing (or moving) data unnecessarily. Instead develop an indexing or adapter that allows you to retrieve the data from a transposed state, for future operations (for example, the trove library listed below).
non-square matrix transpose is (for me) something I would normally leave to a library to handle, like NPP, or cublas geam. (cublas geam can’t be readily/easily used with byte types)
If you wish to write your own kernel for it, the indexing gets hairy because an efficient kernel will probably attempt to use shared memory, which imposes a tile structure. This combined with the arbitrary matrix dimensions (presumably) and the non-square nature creates a raft of boundary checking that in my opinion becomes burdensome to get correct and defect-free.
If you wish to proceed, this library may be a good alternative to writing your own. trove may also be of interest, although both of those libraries are implemented only for specific divisible-by-4 byte types. You can find many forum threads that discuss matrix transpose. Here is one (“notSoNaivaTransKernel”). I haven’t guaranteed that any of this is defect-free. Use at your own risk.
Here’s an example showing there is not much difference between the NPP time and the “notSoNaivaTransKernel” kernel time, for a “very” skinny matrix:
# cat t307.cu
#include <iostream>
#include <cstdlib>
#include <npp.h>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
// Returns microseconds elapsed since `start`; pass 0 (the default) to get
// an absolute microsecond timestamp based on gettimeofday.
unsigned long long dtime_usec(unsigned long long start=0){
  timeval now;
  gettimeofday(&now, 0);
  unsigned long long stamp = now.tv_sec * USECPSEC + now.tv_usec;
  return stamp - start;
}
const int TILE_DIM = 32;    // tile edge, also blockDim.x
const int BLOCK_ROWS = 8;   // rows staged per loop iteration, also blockDim.y

// Tiled out-of-place transpose: writes the transpose of matrA (height rows x
// width cols) into matrB (width rows x height cols).  Launch with
// blockDim = (TILE_DIM, BLOCK_ROWS) and grid = (ceil(width/TILE_DIM),
// ceil(height/TILE_DIM)); bounds checks handle ragged edges.  The +1 column
// pad on the shared tile avoids shared-memory bank conflicts on the
// transposed (column-wise) reads.
template <typename T>
__global__ void notSoNaivaTransKernel(T * __restrict__ matrB, const T * __restrict__ matrA, const int width, const int height)
{
    __shared__ T tile[TILE_DIM][TILE_DIM + 1];
    int ciIndex = blockIdx.x * TILE_DIM + threadIdx.x;  // input column
    int riIndex = blockIdx.y * TILE_DIM + threadIdx.y;  // input row
    int coIndex = blockIdx.y * TILE_DIM + threadIdx.x;  // output column
    int roIndex = blockIdx.x * TILE_DIM + threadIdx.y;  // output row
    size_t index_in = (size_t)ciIndex + ((size_t)(riIndex))* width;
    size_t index_out = (size_t)coIndex + ((size_t)(roIndex))* height;
    // Stage a TILE_DIM x TILE_DIM tile; each thread covers
    // TILE_DIM/BLOCK_ROWS rows.  The row offsets i*width and i*height are
    // promoted to size_t before multiplying: the original computed them in
    // 32-bit signed arithmetic, which overflows for very wide matrices.
    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        if ((ciIndex < width) && (riIndex + i < height))
            tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + (size_t)i * width];
    __syncthreads();  // whole tile written before transposed reads; reached by all threads
    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        if ((coIndex < height) && (roIndex + i < width))
            matrB[index_out + (size_t)i * height] = tile[threadIdx.x][threadIdx.y + i];
}
// Element type under test; nppiTranspose_8u_C1R below requires an 8-bit type.
using mt = Npp8u;
// Default matrix shape (overridable from the command line): a "skinny"
// cxdim-wide by cydim-tall matrix.
const int cydim=9;
const int cxdim=1000000;
// Benchmark driver: transposes an xdim x ydim byte matrix once with the
// custom kernel and once with NPP, verifying and timing each.
// Usage: ./t307 [xdim ydim]
// NOTE(review): CUDA/NPP return codes are mostly unchecked until the
// cudaGetLastError() calls after each verification pass.
int main(int argc, char *argv[]){
int xdim = cxdim;
int ydim = cydim;
if (argc > 2) {xdim = atoi(argv[1]); ydim = atoi(argv[2]);}
// Total element count, computed in size_t to avoid int overflow.
size_t s = ((size_t)xdim) * ydim;
std::cout << "xdim: " << xdim << " ydim: " << ydim << std::endl;
// h: host buffer; d: device input; o: device output (transposed result).
mt *h, *d, *o;
h = new mt[s];
cudaMalloc(&d, sizeof(d[0])*s);
cudaMalloc(&o, sizeof(o[0])*s);
// Fill with a repeating 1..5 pattern so any misplaced element is detectable.
for (size_t i = 0; i < s; i++) h[i] = i%5+1;
// Copy size is `s` bytes — correct only because sizeof(mt) == 1.
cudaMemcpy(d, h, s, cudaMemcpyHostToDevice);
// Launch geometry expected by notSoNaivaTransKernel:
// block (TILE_DIM, BLOCK_ROWS), grid covering the matrix in TILE_DIM tiles.
dim3 dim_grid, dim_block;
dim_block.x = TILE_DIM;
dim_block.y = BLOCK_ROWS;
dim_block.z = 1;
dim_grid.x = (xdim + TILE_DIM - 1) / TILE_DIM;
dim_grid.y = (ydim + TILE_DIM - 1) / TILE_DIM;
dim_grid.z = 1;
notSoNaivaTransKernel<<<dim_grid, dim_block >>>(o, d, xdim, ydim); // warm-up
cudaDeviceSynchronize();
// Clear the output so the timed run writes every element afresh.
cudaMemset(o, 0, sizeof(o[0])*s);
cudaDeviceSynchronize();
// Timed kernel run, bracketed by device syncs (host-side wall-clock timing).
unsigned long long dt = dtime_usec(0);
notSoNaivaTransKernel<<<dim_grid, dim_block >>>(o, d, xdim, ydim);
cudaDeviceSynchronize();
dt = dtime_usec(dt);
cudaMemcpy(h, o, s, cudaMemcpyDeviceToHost);
// Verify: for linear index idx of the transposed (ydim x xdim) result,
// idx2 maps back to where the source value idx%5+1 should have landed.
for (size_t i = 0; i < xdim; i++)
for (size_t j = 0; j < ydim; j++)
{
size_t idx=i*ydim+j;
size_t idx2=idx/xdim+(idx%xdim)*ydim;
if (h[idx2] != (idx)%5+1)
{
std::cout << "Kernel Mismatch at: " << idx2 << " was: " << (int)h[idx2] << " should be: " << (idx)%5+1 << std::endl; return 0;
}
}
// Report the kernel time only if no CUDA error accumulated.
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess) std::cout << "kernel elapsed: " << dt << "us" << std::endl;
else {std::cout << "kernel error: " << cudaGetErrorString(err) << std::endl; return 0;}
// NPP setup: steps are row pitches in bytes (== element counts for Npp8u).
NppiSize ns = {xdim, ydim};
int xstep=xdim;
int ystep=ydim;
cudaMemset(o, 0, sizeof(o[0])*s);
// npp transpose will not work above 2B elements total
// and will require a change for types with different byte sizes
// e.g. change _8u_ to _16u_ or _32u_
// also each dimension has size limits
NppStatus stat = nppiTranspose_8u_C1R(d, xstep, o, ystep, ns); // warm-up
cudaDeviceSynchronize();
cudaMemset(o, 0, sizeof(o[0])*s);
cudaDeviceSynchronize();
// Timed NPP run, same protocol as the kernel run above.
dt = dtime_usec(0);
stat = nppiTranspose_8u_C1R(d, xstep, o, ystep, ns);
cudaDeviceSynchronize();
dt = dtime_usec(dt);
if (stat != NPP_SUCCESS) std::cout << "Npp error: " << (int)stat << std::endl;
// Clear the host buffer so stale kernel results cannot mask an NPP failure.
memset(h, 0, sizeof(h[0])*s);
cudaMemcpy(h, o, s, cudaMemcpyDeviceToHost);
// Same index-mapping verification as for the kernel result.
for (size_t i = 0; i < xdim; i++)
for (size_t j = 0; j < ydim; j++)
{
size_t idx=i*ydim+j;
size_t idx2=idx/xdim+(idx%xdim)*ydim;
if (h[idx2] != (idx)%5+1)
{
std::cout << "NPP Mismatch at: " << idx2 << " was: " << (int)h[idx2] << " should be: " << (idx)%5+1 << std::endl; return 0;
}
}
err = cudaGetLastError();
if (err == cudaSuccess) std::cout << "NPP elapsed: " << dt << "us" << std::endl;
else {std::cout << "NPP error: " << cudaGetErrorString(err) << std::endl; return 0;}
}
# nvcc -o t307 t307.cu -lnppidei
# ./t307 9 1000000
xdim: 9 ydim: 1000000
kernel elapsed: 83us
NPP elapsed: 92us
# ./t307 1000000 9
xdim: 1000000 ydim: 9
kernel elapsed: 93us
NPP elapsed: 93us
# ./t307 1000000 1000
xdim: 1000000 ydim: 1000
kernel elapsed: 29363us
NPP elapsed: 51752us
# ./t307 1000 1000000
xdim: 1000 ydim: 1000000
kernel elapsed: 8366us
NPP elapsed: 11164us
#
A matrix of 1Mx1M is 1000 Billion elements. If those elements are each 1 byte, that is 1000 GB. That won’t fit in any GPU memory today, not sure what you are referring to or measuring there.
If you vary the dimensions in the code above (1000000 or less, for each dimension), I witness at most about a 2x difference between NPP and kernel time, so sometimes the kernel version is up to 2x faster than NPP. I haven’t done an exhaustive or rigorous test.
I will mention this for completeness: A cudaMemcpy2D()
call can provide a strided 1D copy, but the performance will likely be low due to the strided access pattern. In order to transpose a 2D matrix, one would need to use a loop issuing cudaMemcpy2D()
calls, one per row/column. This would be hugely inefficient.
I’ve modified my previous claim to make it consistent with that point.
Thank you for your good suggestions. I will accept NPP for now, and study trove and the in-place transpose code later.
"1Mx1M" was a typo — I meant a square matrix with a similar number of elements to the 8x1M skinny matrix.