Hi,
OK, I think my initial guess about the textures was correct… I'd be glad for confirmation, though.
I ran code that MisterAnderson42 posted here a while back (included below).
Take a look at the attached results of this test, run under Visual Profiler 2.2.
All the texture-related kernels show 0 for every texture statistic; by the way, the values in the other columns look reasonable and are probably not per MP.
What do you guys think?
thanks
eyal
#include <cstdio>
#include <cutil.h>  // SDK helper header providing CUDA_SAFE_CALL

#define BLOCK_SIZE 128
texture<float4, 1, cudaReadModeElementType> tex_float4;
texture<float2, 1, cudaReadModeElementType> tex_float2;
texture<float, 1, cudaReadModeElementType> tex_float;
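// Plain global-memory copy. The unused parameter c only keeps the signature
// uniform so every kernel fits the DO_BMARK macro below.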
template <class T> __global__ void copy_gmem(T* g_idata, T* g_odata, T c)
{
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    g_odata[idx] = g_idata[idx];
}
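// Same copy, but the read goes through the texture cache via tex1Dfetch().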
__global__ void copy_tex_float(float* g_idata, float* g_odata, float c)
{
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    g_odata[idx] = tex1Dfetch(tex_float, idx);
}
__global__ void copy_tex_float2(float2* g_idata, float2* g_odata, float2 c)
{
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    g_odata[idx] = tex1Dfetch(tex_float2, idx);
}
__global__ void copy_tex_float4(float4* g_idata, float4* g_odata, float4 c)
{
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    g_odata[idx] = tex1Dfetch(tex_float4, idx);
}
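// Write-only kernel: stores the constant c, issuing no global reads.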
template <class T> __global__ void write_only(T* g_idata, T* g_odata, T c)
{
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    g_odata[idx] = c;
}
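// Read-only kernels: the loaded value is parked in shared memory and a
// neighbouring slot is bumped so the compiler cannot eliminate the load as
// dead code; nothing is written back to global memory. The unsynchronized
// neighbour access races, but the values are never consumed, so it is harmless
// for a bandwidth benchmark.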
template <class T> __global__ void read_only_gmem(T* g_idata, T* g_odata, T c)
{
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    __shared__ T shared[BLOCK_SIZE];
    shared[threadIdx.x] = g_idata[idx];
    *((float *)(&shared[(threadIdx.x + 1) & (BLOCK_SIZE-1)])) += 1.0f;
}
__global__ void read_only_tex_float(float* g_idata, float* g_odata, float c)
{
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    __shared__ float shared[BLOCK_SIZE];
    shared[threadIdx.x] = tex1Dfetch(tex_float, idx);
    *((float *)(&shared[(threadIdx.x + 1) & (BLOCK_SIZE-1)])) += 1.0f;
}
__global__ void read_only_tex_float2(float2* g_idata, float2* g_odata, float2 c)
{
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    __shared__ float2 shared[BLOCK_SIZE];
    shared[threadIdx.x] = tex1Dfetch(tex_float2, idx);
    *((float *)(&shared[(threadIdx.x + 1) & (BLOCK_SIZE-1)])) += 1.0f;
}
__global__ void read_only_tex_float4(float4* g_idata, float4* g_odata, float4 c)
{
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    __shared__ float4 shared[BLOCK_SIZE];
    shared[threadIdx.x] = tex1Dfetch(tex_float4, idx);
    *((float *)(&shared[(threadIdx.x + 1) & (BLOCK_SIZE-1)])) += 1.0f;
}
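// DO_BMARK: one untimed warm-up launch, then nIters timed launches bracketed
// by CUDA events; prints the effective bandwidth. ntransfer is the number of
// global-memory transfers per element (1 = read-only or write-only, 2 = read + write).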
#define DO_BMARK(kernel, type, value, name, ntransfer) \
{ \
    kernel<<< grid, threads >>>((type *)d_idata, (type *)d_odata, value); \
    \
    cudaEvent_t start, end; \
    CUDA_SAFE_CALL( cudaEventCreate(&start) ); \
    CUDA_SAFE_CALL( cudaEventCreate(&end) ); \
    \
    CUDA_SAFE_CALL( cudaEventRecord(start, 0) ); \
    for (int i = 0; i < nIters; ++i) \
    { \
        kernel<<< grid, threads >>>((type *)d_idata, (type *)d_odata, value); \
    } \
    CUDA_SAFE_CALL( cudaEventRecord(end, 0) ); \
    CUDA_SAFE_CALL( cudaEventSynchronize(end) ); \
    \
    float runTime; \
    CUDA_SAFE_CALL( cudaEventElapsedTime(&runTime, start, end) ); \
    runTime /= float(nIters); \
    printf("%s - Bandwidth: %f GiB/s\n", name, (ntransfer * len * sizeof(type)) / (runTime * 1.0e-3 * 1024*1024*1024)); \
    CUDA_SAFE_CALL( cudaEventDestroy(start) ); \
    CUDA_SAFE_CALL( cudaEventDestroy(end) ); \
}
void BenchMarkTest()
{
    cudaSetDevice( 1 );
    int len = 1 << 22;
    int num_threads = BLOCK_SIZE;
    int nIters = 500;
    float4 *d_idata, *d_odata;
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_idata, sizeof(float4)*len) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_odata, sizeof(float4)*len) );
    printf( "len[%d], sizeof(float4)*len[%d]\n", len, (int)(sizeof(float4)*len) );
    CUDA_SAFE_CALL( cudaBindTexture(0, tex_float, d_idata, sizeof(float) * len) );
    CUDA_SAFE_CALL( cudaBindTexture(0, tex_float2, d_idata, sizeof(float2) * len) );
    CUDA_SAFE_CALL( cudaBindTexture(0, tex_float4, d_idata, sizeof(float4) * len) );
    dim3 threads(num_threads, 1, 1);
    //dim3 grid(len/num_threads, 1, 1);
    dim3 grid(len/num_threads, 10, 1);
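    // The kernels use only blockIdx.x, so grid.y = 10 processes each element
    // ten times; the bandwidth DO_BMARK prints does not account for these extra passes.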
printf( "threads[%d], grid[%d]\n", threads.x, grid.x );
//DO_BMARK(copy_gmem<float>, float, 0.0f, "copy_gmem<float>", 2);
DO_BMARK(copy_gmem<float>, float, 0.0f, "copy_gmem<float>", 2 );
DO_BMARK(copy_gmem<float2>, float2, make_float2(0.0f, 0.0f), "copy_gmem<float2>", 2);
DO_BMARK(copy_gmem<float4>, float4, make_float4(0.0f, 0.0f, 0.0f, 0.0f), "copy_gmem<float4>", 2);
printf("\n");
DO_BMARK(copy_tex_float, float, 0.0f, "copy_tex<float>", 2);
DO_BMARK(copy_tex_float2, float2, make_float2(0.0f, 0.0f), "copy_tex<float2>", 2);
DO_BMARK(copy_tex_float4, float4, make_float4(0.0f, 0.0f, 0.0f, 0.0f), "copy_tex<float4>", 2);
printf("\n");
DO_BMARK(write_only<float>, float, 0.0f, "write_only<float>", 1);
DO_BMARK(write_only<float2>, float2, make_float2(0.0f, 0.0f), "write_only<float2>", 1);
DO_BMARK(write_only<float4>, float4, make_float4(0.0f, 0.0f, 0.0f, 0.0f), "write_only<float4>", 1);
printf("\n");
DO_BMARK(read_only_gmem<float>, float, 0.0f, "read_only_gmem<float>", 1);
DO_BMARK(read_only_gmem<float2>, float2, make_float2(0.0f, 0.0f), "read_only_gmem<float2>", 1);
DO_BMARK(read_only_gmem<float4>, float4, make_float4(0.0f, 0.0f, 0.0f, 0.0f), "read_only_gmem<float4>", 1);
printf("\n");
DO_BMARK(read_only_tex_float, float, 0.0f, "read_only_tex<float>", 1);
DO_BMARK(read_only_tex_float2, float2, make_float2(0.0f, 0.0f), "read_only_tex<float2>", 1);
DO_BMARK(read_only_tex_float4, float4, make_float4(0.0f, 0.0f, 0.0f, 0.0f), "read_only_tex<float4>", 1);

    // Unbind textures and free the device buffers.
    CUDA_SAFE_CALL( cudaUnbindTexture(tex_float) );
    CUDA_SAFE_CALL( cudaUnbindTexture(tex_float2) );
    CUDA_SAFE_CALL( cudaUnbindTexture(tex_float4) );
    CUDA_SAFE_CALL( cudaFree(d_idata) );
    CUDA_SAFE_CALL( cudaFree(d_odata) );
}