[RESOLVED]
Hi,
I wrote a test application that compares 1D arrays to 2D arrays.
There are 3 variants:
-
1D array.
-
2D array.
-
2D array using pitched memory.
Originally I tried to test the offsetCopy from the best_practices_guide, when I noticed this bizarre result.
The code copies a 1024x1024 matrix in the global memory. I execute the kernel 1000 times. It takes 1 sec for the 1D version, and 7 seconds for the 2D arrays (both cases). Okay, what went wrong??
Now I set int szY = 1025, and the pitch now has an effect and is set to 1088. 1D and 2D cases run the same as before, but the 2D pitched version now runs in 2.6 seconds. Can someone please explain this??
Here is the full test code:
#include <iostream>
using namespace std;
#include "../shared/stopwatch.h"
__global__ void offsetCopy1D(float *odata, float* idata, int offset)
{
int szX = blockDim.x * gridDim.x;
int i = blockIdx.x * blockDim.x + threadIdx.x + offset;
i = i % szX;
odata[i] = idata[i];
}
__global__ void offsetCopy2D(float *odata, float* idata, int offset)
{
int szX = blockDim.x * gridDim.x;
int szY = blockDim.y * gridDim.y;
int i = blockIdx.x * blockDim.x + threadIdx.x + offset; // RESOLVED: here is the problem, you should swap i with j.
i = i % szX;
int j = blockIdx.y * blockDim.y + threadIdx.y;
odata[i*szY + j] = idata[i*szY + j];
}
__global__ void offsetCopy2D_pitch(float *odata, float* idata, size_t pitch, int offset)
{
int szX = blockDim.x * gridDim.x;
// int szY = blockDim.y * gridDim.y;
int i = blockIdx.x * blockDim.x + threadIdx.x + offset;
i = i % szX;
int j = blockIdx.y * blockDim.y + threadIdx.y;
odata[i*pitch + j] = idata[i*pitch + j];
}
void test_res()
{
cudaError_t errorCode = cudaGetLastError();
if (errorCode != cudaSuccess) {
cout << "Cuda errorCode = " << cudaGetErrorString(errorCode) << endl;
throw std::exception("Cuda error!");
}
}
int main(int argc, char** argv)
{
Stopwatch sw;
sw.start_print();
float * din = NULL;
float * dout = NULL;
int szX = 1024;
int szY = 1024;
dim3 threads = dim3(16, 16);
dim3 blocks = dim3(szX / threads.x, szY / threads.y);
bool b2D = 1;
bool bPitch = 1;
size_t pitch;
if ( !bPitch ) {
cudaMalloc((void**)&din, sizeof(float) * szX * szY);
cudaMalloc((void**)&dout, sizeof(float) * szX * szY);
} else {
cudaMallocPitch( (void**) &din, &pitch , szY * sizeof(float) , szX);
cudaMallocPitch( (void**) &dout, &pitch , szY * sizeof(float) , szX);
pitch /= sizeof(float);
cout << "Pitch " << pitch << endl;
}
test_res();
int offset = 0;
for ( int it = 0 ; it < 1000 ; it++ ) {
if ( b2D ) {
if ( !bPitch )
offsetCopy2D<<<blocks, threads>>>(dout, din, offset);
else {
offsetCopy2D_pitch<<<blocks, threads>>>(dout, din, pitch, offset);
}
} else {
int nBlocks = blocks.x * blocks.y;
int nThreads = threads.x * threads.y;
offsetCopy1D<<<nBlocks, nThreads>>>(dout, din, offset);
}
test_res();
}
cudaFree(din);
cudaFree(dout);
cudaThreadExit();
test_res();
sw.print();
}