What conclusion can I draw from this experiment?

I measured the execution time of a vector adder for different block sizes; I used only one block in this experiment.
blocksize execution time
1 3.6
50 4.22
100 4.3
200 4.28
300 4.3
400 4.31
500 4.38
600 4.38
700 4.78
800 5.18
900 5.78
1000 6.4
Can I conclude that one SM can run about 600 threads concurrently?
I also have a question; could anybody help me?
1. Why does the execution time increase sharply when the block size increases from 1 to 50 and from 600 to 1000?
thank you very much

blocksize ----------------------------------- execution time
1---------------------------------------------3.6
50--------------------------------------------4.22
100-------------------------------------------4.3
200-------------------------------------------4.28
300-------------------------------------------4.3
400-------------------------------------------4.31
500-------------------------------------------4.38
600-------------------------------------------4.38
700-------------------------------------------4.78
800-------------------------------------------5.18
900-------------------------------------------5.78
1000------------------------------------------6.4

The reason for the sharp increase from 600 to 1000 is most likely occupancy of the multiprocessor core.

As you make your thread blocks bigger, I would assume that you cross a threshold where a SMX can only execute one block at a time, instead of 2 or 3 simultaneously. The most likely reason for being able to run just 1 block would be the register file which cannot hold enough registers to work on several blocks simultaneously.

The Cuda Visual Profiler or NSight Visual Studio or Eclipse Edition will show you the achieved occupancy and give some hints how to improve this.

Some of the observed numbers could also be caused by the method used to measure execution time. You could post your code so we can see whether the execution time was measured correctly.

cbuchner1, he used a single block

chickennight, before asking a question, it is better to show your code AND describe what it does. It's not even obvious how much work you do, how well the arrays are cached, and so on.

Right, single block. So occupancy can’t be the reason.

I second that request to see actual code ;)

And instead of random poking around, it may be better to learn how the GPU works. Unfortunately, I don't know of any simple and concise source for such info, but you may go through

https://users.ices.utexas.edu/~sreepai/fermi-tbs/


thank you for your help

Thank you for your help. But my tutor needs us to use OpenCL to test the NVIDIA card, so I can only use OpenCL; sorry for that. My OpenCL code follows:
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
// The header names on two #include lines were lost when this code was pasted.
// <iostream> is restored because main() prints with std::cout; nothing else
// in this listing requires a further header.
#include <iostream>

// OpenCL C source for the vecAdd kernel, compiled at run time by the host.
// Each work item starts at its own global index and then advances by the
// total number of work items, so every element of c is written even when
// fewer than n work items are launched.
// The inner loop repeats the same add 180000 times -- presumably only to make
// the kernel run long enough to time; confirm with the author.
// NOTE: the literal previously used typographic quotes and lacked the opening
// quote on the #pragma line, so it did not compile; the kernel text itself is
// unchanged.
const char *kernelSource = "\n"
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
"__kernel void vecAdd( __global double *a, \n"
" __global double *b, \n"
" __global double *c, \n"
" const unsigned int n) \n"
"{ \n"
" //Get our global thread ID \n"
" int id = get_group_id(0)*get_local_size(0) + get_local_id(0); \n"
" \n"
" int k; \n"
" while (id < n) { \n"
" for (k = 0; k < 180000; k++) \n"
" c[id] = a[id] + b[id]; \n"
" id += get_local_size(0) * get_num_groups(0); \n"
"} } \n"
"\n";

int main(int argc, char* argv)
{
// Length of vectors
unsigned int n = 1200;

// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;

// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;

cl_platform_id cpPlatform;        // OpenCL platform
cl_device_id device_id;           // device ID
cl_context context;               // context
cl_command_queue queue;           // command queue
cl_program program;               // program
cl_kernel kernel;                 // kernel

// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);


// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);

// Initialize vectors on host
int i;
for (i = 0; i < n; i++)
{
	h_a[i] = sinf(i)*sinf(i);
	h_b[i] = cosf(i)*cosf(i);
}
size_t globalSize, localSize;
cl_int err;

// Number of work items in each local work group
localSize = 1;

// Number of total work items - localSize must be devisor
globalSize =1;

// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);

// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

// Create a context  
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

// Create a command queue 
queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);

// Create the compute program from the source buffer
program = clCreateProgramWithSource(context, 1,
	(const char **)& kernelSource, NULL, &err);

// Build the program executable 
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

// Create the compute kernel in the program we wish to run

kernel = clCreateKernel(program, "vecAdd", &err);

// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);

// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
	bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
	bytes, h_b, 0, NULL, NULL);

// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);

// Execute the kernel over the entire range of the data set  
cl_event prof_event;
cl_ulong ev_start_time = (cl_ulong)0;
cl_ulong ev_end_time = (cl_ulong)0;
double rum_time;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
	0, NULL, &prof_event);

// Wait for the command queue to get serviced before reading back results
clFinish(queue);
//read time
err = clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
	sizeof(cl_ulong), &ev_start_time, NULL);
err = clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
	sizeof(cl_ulong), &ev_end_time, NULL);
if (err)
	perror("error\n");
rum_time = (double)(ev_end_time - ev_start_time);
std::cout << "execution time:" << rum_time <<std:: endl;

// Read the results from the device
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
	bytes, h_c, 0, NULL, NULL);

//Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0;
for (i = 0; i<n; i++)
	sum += h_c[i];
printf("final result: %f\n", sum);

//

// release OpenCL resources
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);

//release host memory
free(h_a);
free(h_b);
free(h_c);
system("pause");
return 0;

}