Thank you for your reply!
Your measured result for shared memory is 1.5 cycles?
Oh, my god!
Below is my test code for global memory and shared memory:
/************************************************************
- Micro-workload.cu
- This is developed for testing latency.
*********/
#include <stdlib.h>
#include <stdio.h>
#include <cutil.h>
// Sizing unit for the chase buffer: the buffer is 4*ELENUM bytes long.
#define ELENUM (1024)
// Size in bytes of the pointer-chase buffer (host copy, device copy and
// the dynamic shared-memory allocation passed to Share_L).
#define DATA_SIZE (4*ELENUM)
// Outer-loop trip count of each timing kernel; every trip performs
// 16 dependent loads, so ITER*16 loads are timed in total.
#define ITER 40000L
// Kernel that times a chain of dependent loads from global memory.
// Writes the elapsed clock() ticks to *time and the final chain index to *result.
__global__ static void Global_L(unsigned long * Buffer, long * time, long * result);
// Kernel that times a chain of dependent loads from shared memory.
// Needs DATA_SIZE bytes of dynamic shared memory at launch.
// Writes the elapsed clock() ticks to *time and the final chain index to *result.
__global__ static void Share_L(unsigned long * Buffer, long * time, long * result);
///************************************************************
/// Program main
/// Test memory latency
///
/// Builds a pointer-chase table on the host (every 4th element
/// holds the index of the next link, the last link points back
/// to 0), copies it to the device, then runs the shared-memory
/// and global-memory timing kernels with a single thread each
/// and prints the average clock ticks per dependent load.
///************************************************************

// Abort with a readable message when a CUDA runtime call fails.
static void check_cuda(cudaError_t status, const char * what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

int main( int argc, char** argv)
{
    unsigned long * Buf_Glo = NULL, * Buf_Cpu = NULL;
    long * time = NULL;
    long * result = NULL;
    long time_used = 0;
    char in;
    size_t i = 0;
    // Number of table entries that fit in the DATA_SIZE-byte buffer.
    const size_t count = DATA_SIZE/sizeof(unsigned long);

    CUT_DEVICE_INIT();

    // Zero-fill so the entries that are not chain links are deterministic
    // when the whole buffer is copied to the device.
    Buf_Cpu = (unsigned long *)calloc(1, DATA_SIZE);
    if (Buf_Cpu == NULL)
    {
        fprintf(stderr, "host allocation of %d bytes failed\n", (int)DATA_SIZE);
        return EXIT_FAILURE;
    }

    // Each link (every 4th element) stores the index of the next link.
    for (i = 0; i < count; i += 4)
        Buf_Cpu[i] = i + 4;
    // Close the ring: the last link points back to element 0.
    Buf_Cpu[i-4] = 0;

    check_cuda(cudaMalloc((void**) &Buf_Glo, DATA_SIZE), "cudaMalloc(Buf_Glo)");
    check_cuda(cudaMalloc((void**) &time, sizeof(long)), "cudaMalloc(time)");
    check_cuda(cudaMalloc((void**) &result, sizeof(long)), "cudaMalloc(result)");
    check_cuda(cudaMemcpy(Buf_Glo, Buf_Cpu, DATA_SIZE, cudaMemcpyHostToDevice),
               "cudaMemcpy(host -> device table)");

    //shared memory latency test segment
    printf("share memory latency test \n");
    Share_L<<<1, 1, DATA_SIZE>>>(Buf_Glo,time,result);
    check_cuda(cudaGetLastError(), "Share_L launch");
    // The blocking copy also waits for the kernel and reports execution errors.
    check_cuda(cudaMemcpy(&time_used, time, sizeof(long), cudaMemcpyDeviceToHost),
               "cudaMemcpy(Share_L time)");
    printf("Iteration times: %ld, each latency: %f clock\n\n", ITER*16, (float)time_used/(ITER*16));

    //global memory latency test segment
    printf("global memory latency test \n");
    Global_L<<<1, 1, 0>>>(Buf_Glo,time, result);
    check_cuda(cudaGetLastError(), "Global_L launch");
    check_cuda(cudaMemcpy(&time_used, time, sizeof(long), cudaMemcpyDeviceToHost),
               "cudaMemcpy(Global_L time)");
    printf("Iteration times: %ld, each latency: %f clock\n\n", ITER*16, (float)time_used/(ITER*16));

    check_cuda(cudaFree(time), "cudaFree(time)");
    check_cuda(cudaFree(Buf_Glo), "cudaFree(Buf_Glo)");
    check_cuda(cudaFree(result), "cudaFree(result)");
    free(Buf_Cpu);

    // Wait for a key press before exiting.
    scanf("%c", &in);
    return 0;
}
// Global memory latency kernel.
//
// Walks the index chain stored in Buffer: each load yields the index of
// the next load, so the loads are serially dependent. ITER*16 loads are
// performed between two clock() samples.
//
// Buffer : device table of DATA_SIZE bytes holding the chain.
// time   : receives the elapsed clock() ticks for the timed loop.
// result : receives the final chain index (presumably stored so the
//          compiler cannot discard the load chain).
//
// The host in this file launches it as <<<1, 1, 0>>>; every launched
// thread would write the same output locations.
__global__ static void Global_L(unsigned long * Buffer, long * time, long * result)
{
    unsigned long i = 0, index = 0, iteration = ITER;
    // Keeps every index inside the table.
    unsigned long mask = DATA_SIZE/sizeof(unsigned long)-1;

    * time = 0;
    clock_t start = clock();
    for(i = 0; i < iteration; i++)
    {
        // 16 explicitly repeated dependent loads per trip.
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
        index = Buffer[index] & mask;
    }
    *time = (clock() - start);
    *result = index;
}
// Shared memory latency kernel.
//
// Copies the index chain from Buffer into dynamic shared memory, then
// walks it there: each load yields the index of the next load, so the
// loads are serially dependent. ITER*16 loads are performed between two
// clock() samples; the copy into shared memory is not timed.
//
// Buffer : device table of DATA_SIZE bytes holding the chain.
// time   : receives the elapsed clock() ticks for the timed loop.
// result : receives the final chain index (presumably stored so the
//          compiler cannot discard the load chain).
//
// Requires DATA_SIZE bytes of dynamic shared memory as the third launch
// argument. The host in this file launches it as <<<1, 1, DATA_SIZE>>>;
// there is no barrier after the copy, so it is only meant for that
// single-thread launch.
__global__ static void Share_L(unsigned long * Buffer, long * time, long * result)
{
    unsigned long i = 0, index = 0, iteration = ITER;
    // Dynamic shared array; its size comes from the launch configuration.
    extern __shared__ unsigned long shared[];
    // Keeps every index inside the table.
    unsigned long mask = DATA_SIZE/sizeof(unsigned long)-1;

    *time = 0;
    //load the global memory into shared memory
    for(i=0; i< DATA_SIZE/sizeof(unsigned long); ++i)
    {
        shared[i] = Buffer[i];
    }
    clock_t start = clock();
    for(i = 0; i < iteration; i++)
    {
        // 16 explicitly repeated dependent loads per trip.
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
        index = shared[index] & mask;
    }
    // Plain store, matching Global_L; the slot was zeroed above so the value is unchanged.
    *time = (clock() - start);
    *result = index;
}
Are there any problems?
Could you share your test code with me?