issue with cuda streams

Hi everyone!

I’m trying to use cuda streams in my matrix vector multiplication program, but I have some problems… Version without streams working very well . I suspect the problem is the memory allocation, because the vector of solutions is empty. Any ideas?

I’m working with mvs2010, cuda 3.2, geforce 560gtx

#include <iostream>

#include <stdio.h>

#include <assert.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <time.h>

#define BLOCK_SIZE 2

#define DATA_SIZE n * n

using namespace std;

__global__ void matixVectorMultiplicationINT(int n, int *A_G, int *x_G, int *y_G)

{

	int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;

	int  t = 0;

	for(int j = 0; j < n; j++)

	{

		t += A_G[i + j*n] * x_G[j];

	}

	y_G[i] = t;

}

void dataReset( int n, int *vec)

{

	for(int i = 0; i < n; i++)

	{

		vec[i] = 0;

	}

}

void showVector( int n, int *vec)

{

	for(int i = 0; i < n; i++)

	{

		cout << vec[i] << endl;

	}

}

float randomNumber()

{

	return rand();

}

void generateRandomVector( int n, int *vec)

{

	for(int i = 0; i < n; i++)

	{

		vec[i] = i+1;

	}

}

void generateRandomMatrix(int n, int** mat)

{

	int temp = 1;

	for (int i = 0; i < n; i++)

    {

        for (int j = 0; j < n; j++)    

		{

			mat[i][j] = temp;

			temp++;

		}        

    }

}

void showMatrix(int n, int** mat)

{

	for (int i = 0; i < n; i++)

    {

        for (int j=0; j<n; j++)    

		{

			cout << mat[i][j] << "\t";

		}

		cout<<endl;

    }

}

void linearizeMatrix(int n, int ** mat, int linMatrix[])

{

	int tempId = 0;

	for(int i = 0; i < n; i++)

	{

		for(int j = 0; j < n; j++)

		{

			linMatrix[tempId] = mat[j][i];

			tempId++;

		}

	}

}

int main()

{

	int n = 8; //dimension of matrix

	dim3 threads(BLOCK_SIZE);

	dim3 grid(n/BLOCK_SIZE);

	

	srand((unsigned)time(0));

	cudaDeviceProp prop;

	int whichDevice;

	cudaGetDevice(&whichDevice);

	cudaGetDeviceProperties(&prop, whichDevice);

	if(!prop.deviceOverlap)

	{

		cout << "Urzadzenie nie wspiera cudaStreams!";

		return 0;

	}

	cudaStream_t stream;

	cudaStreamCreate(&stream);

	cudaEvent_t start, stop;

	float elapsedTime;

	

	cudaEventCreate(&start);

	cudaEventCreate(&stop);

	int ** matrix;

	int *hostMatrix;

	int *hostVector;

	int *hostSolutionVector;

	int *deviceMatrix;

	int *deviceVector;

	int *deviceSolutionVector;

	//allcate memory for host data

	matrix = new int *[n];

	for(int i = 0; i < n; i++)

	{

		matrix[i] = new int[n];

	}

	hostVector = new int[n];

	hostSolutionVector = new int[n];

	generateRandomVector(n, hostVector);

	dataReset(n, hostSolutionVector);

	generateRandomMatrix(n, matrix);

	hostMatrix = new int[DATA_SIZE];

	linearizeMatrix(n, matrix, hostMatrix);

	cout << "Matrix:" << endl;

	showMatrix(n, matrix);

	cout << endl;

	cout<< "Vector:" << endl;

	showVector(n, hostVector);

	cout << endl;

	//alokacja pamieci

	cudaMalloc((void**)&deviceMatrix, sizeof(int) * DATA_SIZE);

	cudaMalloc((void**)&deviceVector, sizeof(int) * n);

	cudaMalloc((void**)&deviceSolutionVector, sizeof(int) * n);

	cudaHostAlloc((void**)&hostMatrix, sizeof(int) * DATA_SIZE, cudaHostAllocDefault);

	cudaHostAlloc((void**)&hostVector, sizeof(int) * n, cudaHostAllocDefault);

	cudaHostAlloc((void**)&hostVector, sizeof(int) * n, cudaHostAllocDefault);

	cudaMemcpyAsync(deviceMatrix, hostMatrix, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice, stream);

	cudaMemcpyAsync(deviceVector, hostVector, sizeof(int) * n, cudaMemcpyHostToDevice, stream);

	

	cudaEventRecord(start, 0);

	

	matixVectorMultiplicationINT<<<grid,threads,0, stream>>>(n, deviceMatrix, deviceVector, deviceSolutionVector);

	

	cudaEventRecord(stop, 0);

	

	cudaStreamSynchronize(stream);

	cudaMemcpyAsync(hostSolutionVector, deviceSolutionVector, sizeof(int) * n, cudaMemcpyDeviceToHost, stream);

	

	cudaEventSynchronize(stop);

	cudaEventElapsedTime(&elapsedTime, start, stop);

	cout << "Computation time: " << elapsedTime << " ms" << endl;

	cout << "\nSolution vector:" <<endl;

	showVector(n, hostSolutionVector);

	

	//bye bye stream

	cudaStreamDestroy(stream);

	

	//free device memory

	cudaFree(deviceMatrix);

	cudaFree(deviceVector);

	cudaFree(deviceSolutionVector);

	//free Host memory

	cudaFreeHost(hostMatrix);

	cudaFreeHost(hostVector);

	cudaFreeHost(hostSolutionVector);

	delete [] matrix;

	cout << "\npress Enter..." << endl;

	getchar();

	return 0;

}

Regards

Hubert

you really need to check return values from cuda* calls

Thank you for fast reply to my post. How can I check return values? I need to use cuda_safe_call ?

you can use cuda_safe_call (if you don’t mind using cutil), or you can manually get and check the return values from the calls. They return a type cudaError_t, which you can then check to see if the function call succeeded or failed (and if it failed, what the error message was). A word of caution, beware of the asynchronous calls’ errors, as they can be misleading at times.

these are the possible values for cudaError_t (found in the driver_types.h file)

/**

 * CUDA error types

 */

/*DEVICE_BUILTIN*/

enum cudaError

{

  /**

   * The API call returned with no errors. In the case of query calls, this

   * can also mean that the operation being queried is complete (see

   * ::cudaEventQuery() and ::cudaStreamQuery()).

   */

  cudaSuccess                           =      0,

/**

   * The device function being invoked (usually via ::cudaLaunch()) was not

   * previously configured via the ::cudaConfigureCall() function.

   */

  cudaErrorMissingConfiguration         =      1,

/**

   * The API call failed because it was unable to allocate enough memory to

   * perform the requested operation.

   */

  cudaErrorMemoryAllocation             =      2,

/**

   * The API call failed because the CUDA driver and runtime could not be

   * initialized.

   */

  cudaErrorInitializationError          =      3,

/**

   * An exception occurred on the device while executing a kernel. Common

   * causes include dereferencing an invalid device pointer and accessing

   * out of bounds shared memory. The device cannot be used until

   * ::cudaThreadExit() is called. All existing device memory allocations

   * are invalid and must be reconstructed if the program is to continue

   * using CUDA.

   */

  cudaErrorLaunchFailure                =      4,

/**

   * This indicated that a previous kernel launch failed. This was previously

   * used for device emulation of kernel launches.

   * \deprecated

   * This error return is deprecated as of CUDA 3.1. Device emulation mode was

   * removed with the CUDA 3.1 release.

   */

  cudaErrorPriorLaunchFailure           =      5,

/**

   * This indicates that the device kernel took too long to execute. This can

   * only occur if timeouts are enabled - see the device property

   * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"

   * for more information. The device cannot be used until ::cudaThreadExit()

   * is called. All existing device memory allocations are invalid and must be

   * reconstructed if the program is to continue using CUDA.

   */

  cudaErrorLaunchTimeout                =      6,

/**

   * This indicates that a launch did not occur because it did not have

   * appropriate resources. Although this error is similar to

   * ::cudaErrorInvalidConfiguration, this error usually indicates that the

   * user has attempted to pass too many arguments to the device kernel, or the

   * kernel launch specifies too many threads for the kernel's register count.

   */

  cudaErrorLaunchOutOfResources         =      7,

/**

   * The requested device function does not exist or is not compiled for the

   * proper device architecture.

   */

  cudaErrorInvalidDeviceFunction        =      8,

/**

   * This indicates that a kernel launch is requesting resources that can

   * never be satisfied by the current device. Requesting more shared memory

   * per block than the device supports will trigger this error, as will

   * requesting too many threads or blocks. See ::cudaDeviceProp for more

   * device limitations.

   */

  cudaErrorInvalidConfiguration         =      9,

/**

   * This indicates that the device ordinal supplied by the user does not

   * correspond to a valid CUDA device.

   */

  cudaErrorInvalidDevice                =     10,

/**

   * This indicates that one or more of the parameters passed to the API call

   * is not within an acceptable range of values.

   */

  cudaErrorInvalidValue                 =     11,

/**

   * This indicates that one or more of the pitch-related parameters passed

   * to the API call is not within the acceptable range for pitch.

   */

  cudaErrorInvalidPitchValue            =     12,

/**

   * This indicates that the symbol name/identifier passed to the API call

   * is not a valid name or identifier.

   */

  cudaErrorInvalidSymbol                =     13,

/**

   * This indicates that the buffer object could not be mapped.

   */

  cudaErrorMapBufferObjectFailed        =     14,

/**

   * This indicates that the buffer object could not be unmapped.

   */

  cudaErrorUnmapBufferObjectFailed      =     15,

/**

   * This indicates that at least one host pointer passed to the API call is

   * not a valid host pointer.

   */

  cudaErrorInvalidHostPointer           =     16,

/**

   * This indicates that at least one device pointer passed to the API call is

   * not a valid device pointer.

   */

  cudaErrorInvalidDevicePointer         =     17,

/**

   * This indicates that the texture passed to the API call is not a valid

   * texture.

   */

  cudaErrorInvalidTexture               =     18,

/**

   * This indicates that the texture binding is not valid. This occurs if you

   * call ::cudaGetTextureAlignmentOffset() with an unbound texture.

   */

  cudaErrorInvalidTextureBinding        =     19,

/**

   * This indicates that the channel descriptor passed to the API call is not

   * valid. This occurs if the format is not one of the formats specified by

   * ::cudaChannelFormatKind, or if one of the dimensions is invalid.

   */

  cudaErrorInvalidChannelDescriptor     =     20,

/**

   * This indicates that the direction of the memcpy passed to the API call is

   * not one of the types specified by ::cudaMemcpyKind.

   */

  cudaErrorInvalidMemcpyDirection       =     21,

/**

   * This indicated that the user has taken the address of a constant variable,

   * which was forbidden up until the CUDA 3.1 release.

   * \deprecated

   * This error return is deprecated as of CUDA 3.1. Variables in constant

   * memory may now have their address taken by the runtime via

   * ::cudaGetSymbolAddress().

   */

  cudaErrorAddressOfConstant            =     22,

/**

   * This indicated that a texture fetch was not able to be performed.

   * This was previously used for device emulation of texture operations.

   * \deprecated

   * This error return is deprecated as of CUDA 3.1. Device emulation mode was

   * removed with the CUDA 3.1 release.

   */

  cudaErrorTextureFetchFailed           =     23,

/**

   * This indicated that a texture was not bound for access.

   * This was previously used for device emulation of texture operations.

   * \deprecated

   * This error return is deprecated as of CUDA 3.1. Device emulation mode was

   * removed with the CUDA 3.1 release.

   */

  cudaErrorTextureNotBound              =     24,

/**

   * This indicated that a synchronization operation had failed.

   * This was previously used for some device emulation functions.

   * \deprecated

   * This error return is deprecated as of CUDA 3.1. Device emulation mode was

   * removed with the CUDA 3.1 release.

   */

  cudaErrorSynchronizationError         =     25,

/**

   * This indicates that a non-float texture was being accessed with linear

   * filtering. This is not supported by CUDA.

   */

  cudaErrorInvalidFilterSetting         =     26,

/**

   * This indicates that an attempt was made to read a non-float texture as a

   * normalized float. This is not supported by CUDA.

   */

  cudaErrorInvalidNormSetting           =     27,

/**

   * Mixing of device and device emulation code was not allowed.

   * \deprecated

   * This error return is deprecated as of CUDA 3.1. Device emulation mode was

   * removed with the CUDA 3.1 release.

   */

  cudaErrorMixedDeviceExecution         =     28,

/**

   * This indicated an issue with calling API functions during the unload

   * process of the CUDA runtime in prior releases.

   * \deprecated

   * This error return is deprecated as of CUDA 3.2.

   */

  cudaErrorCudartUnloading              =     29,

/**

   * This indicates that an unknown internal error has occurred.

   */

  cudaErrorUnknown                      =     30,

/**

   * This indicates that the API call is not yet implemented. Production

   * releases of CUDA will never return this error.

   */

  cudaErrorNotYetImplemented            =     31,

/**

   * This indicated that an emulated device pointer exceeded the 32-bit address

   * range.

   * \deprecated

   * This error return is deprecated as of CUDA 3.1. Device emulation mode was

   * removed with the CUDA 3.1 release.

   */

  cudaErrorMemoryValueTooLarge          =     32,

/**

   * This indicates that a resource handle passed to the API call was not

   * valid. Resource handles are opaque types like ::cudaStream_t and

   * ::cudaEvent_t.

   */

  cudaErrorInvalidResourceHandle        =     33,

/**

   * This indicates that asynchronous operations issued previously have not

   * completed yet. This result is not actually an error, but must be indicated

   * differently than ::cudaSuccess (which indicates completion). Calls that

   * may return this value include ::cudaEventQuery() and ::cudaStreamQuery().

   */

  cudaErrorNotReady                     =     34,

/**

   * This indicates that the installed NVIDIA CUDA driver is older than the

   * CUDA runtime library. This is not a supported configuration. Users should

   * install an updated NVIDIA display driver to allow the application to run.

   */

  cudaErrorInsufficientDriver           =     35,

/**

   * This indicates that the user has called ::cudaSetDevice(),

   * ::cudaSetValidDevices(), ::cudaSetDeviceFlags(),

   * ::cudaD3D9SetDirect3DDevice(), ::cudaD3D10SetDirect3DDevice,

   * ::cudaD3D11SetDirect3DDevice(), * or ::cudaVDPAUSetVDPAUDevice() after

   * initializing the CUDA runtime by calling non-device management operations

   * (allocating memory and launching kernels are examples of non-device

   * management operations). This error can also be returned if using

   * runtime/driver interoperability and there is an existing ::CUcontext

   * active on the host thread.

   */

  cudaErrorSetOnActiveProcess           =     36,

/**

   * This indicates that the surface passed to the API call is not a valid

   * surface.

   */

  cudaErrorInvalidSurface               =     37,

/**

   * This indicates that no CUDA-capable devices were detected by the installed

   * CUDA driver.

   */

  cudaErrorNoDevice                     =     38,

/**

   * This indicates that an uncorrectable ECC error was detected during

   * execution.

   */

  cudaErrorECCUncorrectable             =     39,

/**

   * This indicates that a link to a shared object failed to resolve.

   */

  cudaErrorSharedObjectSymbolNotFound   =     40,

/**

   * This indicates that initialization of a shared object failed.

   */

  cudaErrorSharedObjectInitFailed       =     41,

/**

   * This indicates that the ::cudaLimit passed to the API call is not

   * supported by the active device.

   */

  cudaErrorUnsupportedLimit             =     42,

/**

   * This indicates that multiple global or constant variables (across separate

   * CUDA source files in the application) share the same string name.

   */

  cudaErrorDuplicateVariableName        =     43,

/**

   * This indicates that multiple textures (across separate CUDA source

   * files in the application) share the same string name.

   */

  cudaErrorDuplicateTextureName         =     44,

/**

   * This indicates that multiple surfaces (across separate CUDA source

   * files in the application) share the same string name.

   */

  cudaErrorDuplicateSurfaceName         =     45,

/**

   * This indicates that all CUDA devices are busy or unavailable at the current

   * time. Devices are often busy/unavailable due to use of

   * ::cudaComputeModeExclusive, ::cudaComputeModeProhibited or when long

   * running CUDA kernels have filled up the GPU and are blocking new work

   * from starting. They can also be unavailable due to memory constraints

   * on a device that already has active CUDA work being performed.

   */

  cudaErrorDevicesUnavailable           =     46,

/**

   * This indicates that the device kernel image is invalid.

   */

  cudaErrorInvalidKernelImage           =     47,

/**

   * This indicates that there is no kernel image available that is suitable

   * for the device. This can occur when a user specifies code generation

   * options for a particular CUDA source file that do not include the

   * corresponding device configuration.

   */

  cudaErrorNoKernelImageForDevice       =     48,

/**

   * This indicates that the current context is not compatible with this

   * the CUDA Runtime. This can only occur if you are using CUDA

   * Runtime/Driver interoperability and have created an existing Driver

   * context using the driver API. The Driver context may be incompatible

   * either because the Driver context was created using an older version 

   * of the API, because the Runtime API call expects a primary driver 

   * contextand the Driver context is not primary, or because the Driver 

   * context has been destroyed. Please see \ref CUDART_DRIVER "Interactions 

   * with the CUDA Driver API" for more information.

   */

  cudaErrorIncompatibleDriverContext    =     49,

/**

   * This error indicates that a call to ::cudaDeviceEnablePeerAccess() is

   * trying to re-enable peer addressing on from a context which has already

   * had peer addressing enabled.

   */

  cudaErrorPeerAccessAlreadyEnabled     =     50,

/**

   * This error indicates that ::cudaDeviceDisablePeerAccess() is trying to 

   * disable peer addressing which has not been enabled yet via 

   * ::cudaDeviceEnablePeerAccess().

   */

  cudaErrorPeerAccessNotEnabled         =     51,

/**

   * This indicates that a call tried to access an exclusive-thread device that 

   * is already in use by a different thread.

   */

  cudaErrorDeviceAlreadyInUse           =     54,

/**

   * This indicates profiler has been disabled for this run and thus runtime 

   * APIs cannot be used to profile subsets of the program. This can 

   * happen when the application is running with external profiling tools

   * like visual profiler.

   */

  cudaErrorProfilerDisabled             =     55,

/**

   * This indicates profiler has not been initialized yet. cudaProfilerInitialize()

   * must be called before calling cudaProfilerStart and cudaProfilerStop to 

   * initialize profiler.

   */

  cudaErrorProfilerNotInitialized       =     56,

/**

   * This indicates profiler is already started. This error can be returned if 

   * cudaProfilerStart() is called multiple times without subsequent call

   * to cudaProfilerStop().

   */

  cudaErrorProfilerAlreadyStarted       =     57,

/**

   * This indicates profiler is already stopped. This error can be returned if 

   * cudaProfilerStop() is called without starting profiler using cudaProfilerStart().

   */

   cudaErrorProfilerAlreadyStopped       =    58,

/**

   * This indicates an internal startup failure in the CUDA runtime.

   */

  cudaErrorStartupFailure               =   0x7f,

/**

   * Any unhandled CUDA driver error is added to this value and returned via

   * the runtime. Production releases of CUDA should not return such errors.

   */

  cudaErrorApiFailureBase               =  10000

};

.....

/**

 * CUDA Error types

 */

/*DEVICE_BUILTIN*/

typedef enum cudaError cudaError_t;

don’t use cuda_safe_call or anything like that ever–cutil is not officially supported by anyone and is not guaranteed to do what you expect ever. check the return values manually.