I'm trying to work through the transposeNew documentation to test out the behavior of bank conflicts and partition camping. I both implemented my own version and ran transposeNew (only on the GTX 285, as it won't run properly on the Quadro NVS 140M), and I'm seeing some very strange behavior. I was hoping someone could tell me what I'm doing wrong.
From the timings, it seems like there are no bank conflicts on the GTX 285 (16x16 float shared-memory tile) and no partition camping on the Quadro NVS 140M (matrix width a multiple of 256*6/4 = 384 floats; I also tried 512).
Could the number of banks on the 285 be different than 16?
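(The programming guide says all compute-capability 1.x devices have 16 shared-memory banks, and the transposeNew output below reports SM 1.3, so presumably not. The bank count itself isn't queryable through the runtime API, but the compute capability is; here's the kind of sanity check I mean, just a sketch using cudaGetDeviceProperties:)
[codebox]
// Sketch: print the compute capability of device 0. The bank count isn't
// exposed directly, but every 1.x part (the GTX 285 is 1.3) has 16 banks.
#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("%s: compute %d.%d, warpSize %d, %d multiprocessors\n",
           prop.name, prop.major, prop.minor,
           prop.warpSize, prop.multiProcessorCount);
    return 0;
}
[/codebox]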
Both are run on Windows 7: the GTX 285 on a desktop with CUDA 3.0, the NVS 140M on a ThinkPad T61 with CUDA 3.0 beta (and also under Linux with CUDA 3.0).
On that note, for some reason the CPU version seems to run a lot slower on the E8400 (desktop) than on the T9400 (laptop), despite the E8400 using DDR3 vs. DDR2 and both CPUs having the same amount of cache. Any idea why?
The actual kernels are at the end, if that helps.
Thanks
Note that the matrix sizes are different on the two cards, to account for the different partition sizes and the limited memory on the 140M; to compare the GTX 285 to the 140M, use the MB/s values.
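The MB/s figure is effective bandwidth, essentially the same expression as in the printfs in the code at the end: one read plus one write of the whole matrix, divided by the elapsed time. Pulled out as a helper (effectiveMBps is just an illustrative name):
[codebox]
// Effective bandwidth in MB/s: read + write of the full matrix,
// elapsed time given in milliseconds.
static double effectiveMBps(int w, int h, float timeMs)
{
    double megabytes = (double)w * h * 2 * sizeof(float) / (1024.0 * 1024.0);
    return megabytes * (1000.0 / timeMs);
}
[/codebox]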
Timings on the GTX 285:
CPU copy Time: 109 bandwidth: 1174.31 MB/s
CPU transpose Time: 671 bandwidth: 190.76 MB/s
copy Time: 1.02835 bandwidth: 124471 MB/s
copy shared Time: 1.37606 bandwidth: 93018.9 MB/s
naive Time: 155.646 bandwidth: 822.38 MB/s
shared Time: 10.1434 bandwidth: 12619.1 MB/s speedup: 15.3446
no bank Time: 10.141 bandwidth: 12622.1 MB/s speedup: 15.3482
partition Time: 3.02867 bandwidth: 42262.7 MB/s speedup: 51.3908
Timings on the NVS 140M:
CPU copy Time: 31 bandwidth: 1290.32 MB/s
CPU transpose Time: 47 bandwidth: 851.064 MB/s
copy Time: 10.0499 bandwidth: 3980.13 MB/s
copy shared Time: 12.081 bandwidth: 3310.99 MB/s
naive Time: 61.1975 bandwidth: 653.622 MB/s
shared Time: 19.9262 bandwidth: 2007.41 MB/s speedup: 3.07121
no bank Time: 14.1607 bandwidth: 2824.71 MB/s speedup: 4.32163
partition Time: 25.5887 bandwidth: 1563.19 MB/s speedup: 2.39158
As can be seen, on the GTX 285 the transpose using shared memory runs in 10.1434 ms and the version without bank conflicts (17x16 shared-memory tile) runs in 10.141 ms. On the NVS there is a difference of about a third, so I would think it's not the code.
On the other hand, on the NVS the version that is supposed to avoid partition camping runs a lot slower than the version that does nothing about it.
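For reference, the only intended difference between the shared and no-bank kernels (full code at the end) is the +1 column of padding on the tile. A minimal sketch of that trick, assuming 16 banks and matrix dimensions that are multiples of 16 (the kernel name, w/h parameters, and missing bounds check are just for illustration):
[codebox]
// With a 16-float row pitch, a whole tile column lands in a single bank,
// so reading tile[tx][ty] on the way out is a 16-way bank conflict.
// Padding each row to 17 floats shifts successive rows by one bank.
__global__ void transposeTilePadded(const float *in, float *out, int w, int h)
{
    __shared__ float tile[16][16 + 1];                // +1 padding column
    int x = blockIdx.x * 16 + threadIdx.x;            // input column
    int y = blockIdx.y * 16 + threadIdx.y;            // input row
    tile[threadIdx.y][threadIdx.x] = in[y * w + x];   // coalesced read
    __syncthreads();
    x = blockIdx.y * 16 + threadIdx.x;                // output column
    y = blockIdx.x * 16 + threadIdx.y;                // output row
    out[y * h + x] = tile[threadIdx.x][threadIdx.y];  // coalesced, conflict-free
}
[/codebox]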
Running transposeNew on the GTX 285 shows the same shared-memory behavior:
Device 0: “GeForce GTX 285”
SM Capability 1.3 detected:
CUDA device has 30 Multi-Processors
SM performance scaling factor = 4.2f
Matrix size: 1024x1024 (64x64 tiles), tile size: 16x16, block size: 16x16
transposeNew-Outer-simple copy , Throughput = 77.7585 GB/s, Time = 0.10047 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Inner-simple copy , Throughput = 113.9901 GB/s, Time = 0.06854 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Outer-shared memory copy , Throughput = 51.1425 GB/s, Time = 0.15276 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Inner-shared memory copy , Throughput = 96.5273 GB/s, Time = 0.08094 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Outer-naive transpose , Throughput = 2.5903 GB/s, Time = 3.01609 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Inner-naive transpose , Throughput = 2.6728 GB/s, Time = 2.92296 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Outer-coalesced transpose , Throughput = 18.5979 GB/s, Time = 0.42008 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Inner-coalesced transpose , Throughput = 20.8583 GB/s, Time = 0.37455 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Outer-no bank conflict trans, Throughput = 18.9127 GB/s, Time = 0.41308 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Inner-no bank conflict trans, Throughput = 20.8786 GB/s, Time = 0.37419 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Outer-coarse-grained , Throughput = 18.9515 GB/s, Time = 0.41224 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Inner-coarse-grained , Throughput = 20.8824 GB/s, Time = 0.37412 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Outer-fine-grained , Throughput = 81.3469 GB/s, Time = 0.09604 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Inner-fine-grained , Throughput = 98.6893 GB/s, Time = 0.07916 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Outer-diagonal transpose , Throughput = 29.7126 GB/s, Time = 0.26294 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transposeNew-Inner-diagonal transpose , Throughput = 109.0551 GB/s, Time = 0.07164 s, Size = 1048576 fp32 elements, NumDevsUsed = 1, Workgroup = 256
And the code, just in case:
[codebox]
#include <stdio.h>
#include <windows.h>
#define width 384*6 // 512*4 //
#define height width/2 //2048
#define blockX 16
#define blockY 16
define CUDA_CHK_ERR(errExpr) do { \
cudaError_t err = (errExpr); \
if (err != cudaSuccess) { \
printf("%s (%d): Got error (%d) %s\n", __FILE__, __LINE__, err, cudaGetErrorString(err)); \
return err; \
} \
} while (0)
// plain element-wise copy (coalesced reads and writes)
__global__ void copy(float *in, float *out)
{
int x = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
int y = __mul24(blockIdx.y, blockDim.y) + threadIdx.y;
if (x > width || y > height)
return;
out[__mul24(y, width) + x] = in[__mul24(y, width) + x];
}
// copy staged through a shared-memory tile (no transpose)
__global__ void copyshared(float *in, float *out)
{
__shared__ float shared[blockY][blockX];
int x = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
int y = __mul24(blockIdx.y, blockDim.y) + threadIdx.y;
if (x > width || y > height)
return;
shared[threadIdx.y][threadIdx.x] = in[__mul24(y, width) + x];
__syncthreads();
out[__mul24(y, width) + x] = shared[threadIdx.y][threadIdx.x];
}
// naive transpose: coalesced reads, strided (uncoalesced) writes
__global__ void naive(float *in, float *out)
{
int x = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
int y = __mul24(blockIdx.y, blockDim.y) + threadIdx.y;
if (x > width || y > height)
return;
out[__mul24(x, height) + y] = in[__mul24(y, width) + x];
}
// tiled transpose through shared memory (16x16 tile, no padding)
__global__ void shared(float *in, float *out)
{
__shared__ float shared[blockY][blockX];
int bX = __mul24(blockIdx.x, blockDim.x);
int bY = __mul24(blockIdx.y, blockDim.y);
int x = bX + threadIdx.x;
int y = bY + threadIdx.y;
if (x > width || y > height)
return;
shared[threadIdx.y][threadIdx.x] = in[__mul24(y, width) + x];
x = bY + threadIdx.x;
y = bX + threadIdx.y;
out[__mul24(y, height) + x] = shared[threadIdx.x][threadIdx.y];
}
// tiled transpose with the tile padded to blockX+1 columns to avoid bank conflicts
__global__ void sharedNoBank(float *in, float *out)
{
__shared__ float shared[blockY][blockX + 1];
int bX = __mul24(blockIdx.x, blockDim.x);
int bY = __mul24(blockIdx.y, blockDim.y);
int x = bX + threadIdx.x;
int y = bY + threadIdx.y;
if (x > width || y > height)
return;
shared[threadIdx.y][threadIdx.x] = in[__mul24(y, width) + x];
__syncthreads();
x = bY + threadIdx.x;
y = bX + threadIdx.y;
out[__mul24(y, height) + x] = shared[threadIdx.x][threadIdx.y];
}
// tiled transpose with diagonal block reordering to avoid partition camping
__global__ void partition(float *in, float *out)
{
__shared__ float shared[blockY][blockX + 1];
int blockIdx_x, blockIdx_y;
// diagonal reordering
// if (width == height) {
// blockIdx_y = blockIdx.y;
// blockIdx_x = blockIdx.x + blockIdx.y;
// if (blockIdx_x >= gridDim.x)
// blockIdx_x -= gridDim.x;
blockIdx_y = blockIdx.x;
blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x;
// } else {
// int bid = blockIdx.x + gridDim.x*blockIdx.y;
// blockIdx_y = bid%gridDim.y;
// blockIdx_x = ((bid/gridDim.y)+blockIdx_y)%gridDim.x;
// }
int bX = __mul24(blockIdx_x, blockDim.x);
int bY = __mul24(blockIdx_y, blockDim.y);
int x = bX + threadIdx.x;
int y = bY + threadIdx.y;
if (x > width || y > height)
return;
shared[threadIdx.y][threadIdx.x] = in[__mul24(y, width) + x];
__syncthreads();
x = bY + threadIdx.x;
y = bX + threadIdx.y;
out[__mul24(y, height) + x] = shared[threadIdx.x][threadIdx.y];
}
void cpuCopy(float *in, float *out)
{
for (int y = 0 ; y < height ; y++)
for (int x = 0 ; x < width ; x++)
out[y*width + x] = in[y*width + x];
}
void cpuTranspose(float *in, float *out)
{
for (int y = 0 ; y < height ; y++)
for (int x = 0 ; x < width ; x++)
out[x*height + y] = in[y*width + x];
}
int main()
{
float *in;
float *out;
in = (float *)malloc(width*height*sizeof(float));
out = (float *)malloc(width*height*sizeof(float));
DWORD time_msg_start;
DWORD time_msg_end;
time_msg_start = timeGetTime();
cpuCopy(in, out);
time_msg_end = timeGetTime();
printf("CPU copy\t Time: %ld\t\t bandwidth: %g MB/s\n", (unsigned long)(time_msg_end - time_msg_start),
width*height*2*sizeof(float)/(1024*1024)*(1000.0/float(time_msg_end - time_msg_start)));
//----------------------------------------------
time_msg_start = timeGetTime();
cpuTranspose(in, out);
time_msg_end = timeGetTime();
printf("CPU transpose\t Time: %ld\t\t bandwidth: %g MB/s\n", (unsigned long)(time_msg_end - time_msg_start),
width*height*2*sizeof(float)/(1024*1024)*(1000.0/float(time_msg_end - time_msg_start)));
free(in);
free(out);
CUDA_CHK_ERR(cudaMalloc(&in, width*height*sizeof(float)));
CUDA_CHK_ERR(cudaMalloc(&out, width*height*sizeof(float)));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
dim3 dimBlock(blockX, blockY);
dim3 dimGrid((width - 1) / dimBlock.x + 1, (height - 1) / dimBlock.y + 1);
// ---------------------------------------
copy <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(start, 0);
copy <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time, start, stop);
printf("copy\t\t Time: %g\t\t bandwidth: %g MB/s\n", time, width*height*2*sizeof(float)/(1024*1024)*(1000.0/time));
// ---------------------------------------
copyshared <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(start, 0);
copyshared <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("copy shared\t Time: %g\t\t bandwidth: %g MB/s\n", time, width*height*2*sizeof(float)/(1024*1024)*(1000.0/time));
// ---------------------------------------
naive <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(start, 0);
naive <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float reftime;
cudaEventElapsedTime(&time, start, stop);
printf("naive\t\t Time: %g\t\t bandwidth: %g MB/s\n", time, width*height*2*sizeof(float)/(1024*1024)*(1000.0/time));
reftime = time;
// ---------------------------------------
shared <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(start, 0);
shared <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("shared\t\t Time: %g\t\t bandwidth: %g MB/s\t\tspeedup: %g\n", time, width*height*2*sizeof(float)/(1024*1024)*(1000.0/time), reftime/time);
// ---------------------------------------
sharedNoBank <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(start, 0);
sharedNoBank <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("no bank\t\t Time: %g\t\t bandwidth: %g MB/s\t\tspeedup: %g\n", time, width*height*2*sizeof(float)/(1024*1024)*(1000.0/time), reftime/time);
// ---------------------------------------
partition <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(start, 0);
partition <<< dimGrid, dimBlock >>> (in, out);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("partition\t Time: %g\t\t bandwidth: %g MB/s\t\tspeedup: %g\n", time, width*height*2*sizeof(float)/(1024*1024)*(1000.0/time), reftime/time);
cudaFree(in);
cudaFree(out);
return 0;
}
[/codebox]