OpenCL + CUDA 4.0 + Tesla T10: write on image object not working

hi,

I don’t know if it is a know issue but I’m experiencing problems when using CUDA 4.0 OpenCL on a Tesla T10 to write on 2d image object.

I tried to run the same code on a Radeon 6990 with AMD OpenCL, on two machines with Tesla M2050 with CUDA 4.0 OpenCL, on a GeForce 8600M GT with Apple OpenCL and it seems to run correctly.

To exclude a faulty T10 I also ran the code on two other machines with T10s and get the same problem.

for reference, here is the kernel that I ran:

kernel void write_img(write_only image2d_t output)

{

	if (get_global_id(0) < SXD4 && get_global_id(1) < SY)

	{

		const int2 idx = (int2)(get_global_id(0),get_global_id(1));

		write_imagef(output, idx, (float4)((float)((idx.x)%2)));

	}

}

once downloaded on an array with clEnqueueReadImage I would expect to get this pattern:

0000111100001111…

and instead, on a T10 I get:

1111111111111111… and after the 1st half of the image there are just random numbers

it seems that the odd indexed columns are written at idx.x/2 and the 2nd half of the image is not written

is there a solution for this issue?

thank you

I need help. I wrote a simple CUDA program for multiplying two vectors and I also wrote the same code in OpenCL but I got huge differences in measuring time of host memora allocation, device memory allocation and copying results from host to device and vice versa.

Can anyone help me? what could be the reason? I’m using Nvidia Geforce 9800 GT

CUDA

/*

  • Host code.

*/

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil.h>

// includes, kernels

#include <VecAdd_kernel.cu>

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int

main(void)

{

float *ulazd_A, *ulazd_B, *izlazd_C;

float *ulazh_A, *ulazh_B, *izlazh_C;

int velicinaBloka =512;

int N = 65536;

printf("Velicina vektora je: %d\n\n", N); 

unsigned int brojac = 0;

float H_ALL_vrijeme = 0.0f;

float D_ALL_vrijeme = 0.0f;

float H_to_D_vrijeme = 0.0f;

float D_to_H_vrijeme = 0.0f;

float D_vrijeme = 0.0f;

cudaEvent_t start1, start2, start3, start4, start5, start6;

cudaEvent_t stop1, stop2, stop3, stop4, stop5, stop6; 

cudaEventCreate( &start1 ) ;

 cudaEventCreate( &start2 ) ;

 cudaEventCreate( &start3 ) ;

 cudaEventCreate( &start4 ) ;

 cudaEventCreate( &start5 ) ;

 cudaEventCreate( &start6 ) ;

 cudaEventCreate( &stop1 ) ;

 cudaEventCreate( &stop2 ) ;

 cudaEventCreate( &stop3 ) ;

 cudaEventCreate( &stop4 ) ;

 cudaEventCreate( &stop5 ) ;

 cudaEventCreate( &stop6 ) ;

cutCreateTimer(&brojac);

cutStartTimer( brojac);

//dodijeli memoriju domacina

cudaEventRecord( start1, 0 );

ulazh_A = (float*)malloc(sizeof(float)*N);

ulazh_B = (float*)malloc(sizeof(float)*N);

izlazh_C = (float*)malloc(sizeof(float)*N);

cudaEventRecord( stop1, 0 );

cudaEventSynchronize(stop1);

cudaEventElapsedTime(&H_ALL_vrijeme,start1, stop1);

//ispuni vektore nasumičnim realnim brojevima

for (int i=0;i<N;i++)

{

ulazh_A[i] = (float)rand()/(float)RAND_MAX;

ulazh_B[i] = (float)rand()/(float)9;

}

//dodijeli memoriju uređaja

cudaEventRecord( start2, 0 );

cudaMalloc((void **)&ulazd_A, N*sizeof(float));

cudaMalloc((void **)&ulazd_B, N*sizeof(float));

cudaMalloc((void **)&izlazd_C, N*sizeof(float));

cudaEventRecord( stop2, 0 );

cudaEventSynchronize(stop2);

cudaEventElapsedTime(&D_ALL_vrijeme,start2, stop2);

//xxxx

dim3 dimBlock(velicinaBloka);

dim3 dimGrid (ceil(float(N)/float(dimBlock.x)));

//kopiraj memoriju s domacina na uredjaj

cudaEventRecord( start3, 0 );

cudaMemcpy(ulazd_A, ulazh_A, sizeof(float)*N, cudaMemcpyHostToDevice);

cudaMemcpy(ulazd_B, ulazh_B, sizeof(float)*N, cudaMemcpyHostToDevice);

cudaEventRecord( stop3, 0 );

cudaEventSynchronize(stop3);

cudaEventElapsedTime(&H_to_D_vrijeme,start3, stop3);

// izvrsi jezgru

cudaEventRecord( start4, 0 );	

VecAdd<<< dimGrid,dimBlock >>>(ulazd_A,ulazd_B,izlazd_C);

//cudaThreadSynchronize();

cudaEventRecord( stop4, 0 );

cudaEventSynchronize(stop4);

cudaEventElapsedTime(&D_vrijeme,start4, stop4);

//kopiraj rezultate s uređaja na domacina

cudaEventRecord( start6, 0 );

cudaMemcpy(izlazh_C, izlazd_C, N*sizeof(float), cudaMemcpyDeviceToHost);

cudaThreadSynchronize();

cudaEventRecord( stop6, 0 );

cudaEventSynchronize(stop6);

cudaEventElapsedTime(&D_to_H_vrijeme,start6, stop6);

cudaThreadSynchronize();

//zaustavi brojac

cutStopTimer( brojac);

printf("GPU Vrijeme procesiranja: %f ms\n", cutGetTimerValue(brojac));    

//ispisi rezultat

printf(“Vrijeme alociranja memorije domacina: %f ms\n”, H_ALL_vrijeme);

printf("Vrijeme alociranja memorije uredjaja: %f ms\n", D_ALL_vrijeme);

printf("Vrijeme kopiranja podataka s CPU na GPU: %f ms\n", H_to_D_vrijeme);

printf("Vrijeme izvodenja jezgre: %f ms\n", D_vrijeme);

printf("Vrijeme kopiranja podataka s GPU-a na RAM: %f ms\n", D_to_H_vrijeme); 

system (“PAUSE”);

// oslobodi memoriju

free( ulazh_A);

free( ulazh_B);

free( izlazh_C);

cudaFree(ulazd_A);

cudaFree(ulazd_B);

cudaFree(izlazd_C);

cudaEventDestroy(start1);

cudaEventDestroy(stop1);

cudaEventDestroy(start2);

cudaEventDestroy(stop2);

cudaEventDestroy(start3);

cudaEventDestroy(stop3);

cudaEventDestroy(start4);

cudaEventDestroy(stop4);

cudaEventDestroy(start5);

cudaEventDestroy(stop5);

cudaEventDestroy(start6);

cudaEventDestroy(stop6);

cutDeleteTimer(brojac);

cudaThreadExit();

cutDeleteTimer(brojac);

}

OPENCL

/*

*Code

*/

#include <stdio.h>

#include <shrQATest.h>

#include <oclUtils.h>

#include

// OpenCL application to compute a simple vector multiplication

// computation between 2 arrays on the GPU

// ******************************************************************

#include <stdio.h>

#include <stdlib.h>

#include

#include <string.h>

#include

#include <assert.h>

#include <time.h>

#include <Windows.h>

#include “Timer.h”

// OpenCL izvorni kod

const char* OpenCLSource = {

“__kernel void VectorAdd(__global float* GPUVector1, __global float* GPUVector2,__global float* GPUOutputVector)”,

“{”,

" // Indeks elemenata koji se trebaju pomnožiti \n",

" unsigned int n = get_global_id(0);",

" // pomnoži elemente vektora a i b i spremi u c \n",

" GPUVector1[n] = GPUVector2[n] + GPUOutputVector[n];",

“}”

};

// broj elemenata vektora

#define SIZE 10000

// Main function

// *********************************************************************

int main(int argc, char **argv)

{

 printf("Velicina vektora je: %d \n\n", SIZE); 

// Alociraj memoriju domaćina

CPerfCounter t;

t.Reset();

t.Start();

float* HostVector1 = (float *)malloc(sizeof(float) * SIZE);

float* HostVector2 = (float *)malloc(sizeof(float) * SIZE);

float* HostOutputVector = (float *)malloc(sizeof(float) * SIZE);

t.Stop();

printf(“Vrijeme potrebno za alokaciju memorije domacina: %2f ms\n\n”, t.GetElapsedTime());

t.Reset();

//kreiraj događaje za GPU

cl_event events [3];

//cl_event CPUtoGPU;

//cl_event CPUtoGPU2;

// Ispuni vektore domaćina s nasumičnim realnim brojevima

for(int c = 0; c < SIZE; c++)

{

HostVector1[c] = (float)rand()/(float)RAND_MAX;

HostVector2[c] = (float)rand()/(float)9;

}

//OpenCL varijable

cl_int status;

cl_platform_id platform;

cl_context GPUContext;

size_t ParmDataBytes;

cl_device_id* GPUDevices;

cl_command_queue GPUCommandQueue;

cl_mem GPUVector1;

cl_mem GPUVector2;

cl_mem GPUOutputVector;

cl_program OpenCLProgram;

cl_kernel OpenCLVectorAdd;

// platforma

status = clGetPlatformIDs (1, &platform, NULL);

if (status != CL_SUCCESS) {

	printf( "Greska: Nema dostupnih OpenCL platformi!");

	return 0;

	}

// postavi kontekst

cl_context_properties props[3];

props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM;

props[1] = (cl_context_properties)platform;

props[2] = (cl_context_properties)0;

// Kreiraj OpenCL kontekst

GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,NULL, NULL, &status);

if (status != CL_SUCCESS) {

	printf( "Greska u kreiranju konteksta " );

	return 0;

}

// Dohvati listu GPU uređaja koji su u kontekstu

clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);

GPUDevices = (cl_device_id*)malloc(ParmDataBytes);

clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);

// Kreiraj commandqueuq za GPU uređaj

GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], CL_QUEUE_PROFILING_ENABLE, &status);

if (status != CL_SUCCESS) {

	printf( "Greska: Neuspjesno kreiranje memorije sa slijednim pristupom naredbama!" );

	return 0;

	}

// Alociraj memoriju uređaja

t.Reset();

t.Start();

GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, sizeof(float) * SIZE, NULL, &status);

if (status != CL_SUCCESS) {

	printf( "Greska u kreiranju memorijskog meduspremnika GPUVector1: " );

	return 0;

	}

GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, sizeof(float) * SIZE, NULL, &status);

if (status != CL_SUCCESS) {

	printf( "Greska u kreiranju memorijskog meduspremnika GPUVector2: " );

	return 0;

	}

// AlocirajGPU memoriju za rezultantni

GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,sizeof(float) * SIZE, NULL, NULL);

if (status != CL_SUCCESS) {

	printf( "Greska u kreiranju memorijskog meduspremnika GPUOutputVector: " );

	return 0;

	}

t.Stop();

printf( “Alociranje memorije ureÄ‘aja: %2f ms\n\n”, t.GetElapsedTime());

t.Reset();

// Kopiraj podatke s CPU na GPU

t.Reset();

t.Start();

clEnqueueWriteBuffer(GPUCommandQueue, GPUVector1, CL_FALSE, 0, SIZE * sizeof(float), HostVector1, 0, NULL, &events[3]);

status = clWaitForEvents(1, &events[3]);

if(status != CL_SUCCESS) {

  printf( "Greska u cekanju na kopiranje podataka s CPU na GPU!" );

	return 0;

}

/* Calculate performance */

cl_ulong startTime2;

cl_ulong endTime2;

/* Get kernel profiling info */

status = clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime2, 0);

if(status != CL_SUCCESS) {

  printf( "Greska u proracunu vremena jedan! %d", status );

  return 0;

}

status = clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime2, 0);

if(status != CL_SUCCESS) {

  printf( "Greska u proracunu vremena CPU na GPU!" );

return 0;

}

double vrijeme1 = (endTime2 - startTime2) * 1.0e-6f;

printf( “Vrijeme kopiranja podataka s CPU na GPU HostVector1: %lf ms\n”, vrijeme1);

clEnqueueWriteBuffer(GPUCommandQueue, GPUVector2, CL_TRUE, 0, SIZE * sizeof(float), HostVector2, 0, NULL, &events[2]);

t.Stop();

printf( “Vrijeme kopiranja podataka s CPU na CGU: %2f ms\n\n”, t.GetElapsedTime());

t.Reset();

status = clWaitForEvents(1, &events[2]);

if(status != CL_SUCCESS) {

  printf( "Greska u cekanju na kopiranje podataka s CPU na GPU!" );

	return 0;

}

/* Calculate performance */

cl_ulong startTime3;

cl_ulong endTime3;

/* Get kernel profiling info */

status = clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime3, 0);

if(status != CL_SUCCESS) {

  printf( "Greska u proracunu vremena jedan! %d", status );

  return 0;

}

status = clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime3, 0);

if(status != CL_SUCCESS) {

  printf( "Greska u proracunu vremena CPU na GPU!" );

return 0;

}

double vrijeme2 = (endTime3 - startTime3) * 1.0e-6f;

//printf( “Vrijeme kopiranja podataka s CPU na CGU: %2f ms\n”, vrijeme2);

printf( “Ukupno vrijeme kopiranja podataka s CPU na GPU event: %2f ms\n”, vrijeme2+vrijeme1);

clReleaseEvent(events[2]);

clReleaseEvent(events[3]);

// Create OpenCL program with source code

OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, OpenCLSource, NULL, &status);

if (status != CL_SUCCESS) {

	printf( "Greska u kreiranju programa " );

	return 0;

}

// Build the program (OpenCL JIT compilation)

clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);

if (status != CL_SUCCESS) {

	printf( "Greska u izgradnji programa" );

	return 0;

	}

// Create a handle to the compiled OpenCL function (Kernel)

OpenCLVectorAdd = clCreateKernel(OpenCLProgram, “VectorAdd”, &status);

if (status != CL_SUCCESS) {

	printf( "Greska u kreiranju jezgre " );

	return 0;

	}

// In the next step we associate the GPU memory with the Kernel arguments

clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector);

clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);

clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);

// Launch the Kernel on the GPU

size_t WorkSize[1] = {SIZE}; // one dimensional Range

t.Reset();

t.Start();

status = clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, &events[0]);

if (status != CL_SUCCESS) {

	printf( "Greska u pokretanju jezgre!" );

	return 0;

}

t.Stop();

printf( “Vrijeme izvodjenja jezgree: %lf ms\n”, t.GetElapsedTime());

t.Reset();

status = clWaitForEvents(1, &events[0]);

if(status != CL_SUCCESS) {

  printf( "Greska u cekanju na dogadjaj!" );

	return 0;

}

/* Calculate performance */

cl_ulong startTime;

cl_ulong endTime;

/* Get kernel profiling info */

status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);

if(status != CL_SUCCESS) {

  printf( "Greska u proracunu vremena jedan! %d", status );

  return 0;

}

status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);

if(status != CL_SUCCESS) {

  printf( "Greska u proracunu vremena dva!" );

  return 0;

}

double KernelExecutionTime = (endTime - startTime) * 1.0e-6f;

printf( “Vrijeme izvodjenja jezgre event: %2f ms\n”, KernelExecutionTime);

clReleaseEvent(events[0]);

// Copy the output in GPU memory back to CPU memory

t.Reset();

t.Start();

clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, CL_FALSE, 0, SIZE * sizeof(float), HostOutputVector, 0, NULL, &events[1]);

t.Stop();

printf( “Vrijeme kopiranja podataka s GPU na CPU: %lf ms\n”, t.GetElapsedTime());

t.Reset();

status = clWaitForEvents(1, &events[1]);

if(status != CL_SUCCESS) {

  printf( "Greska u cekanju na kopiranje podataka s GPU na CPU!" );

	return 0;

}

/* Calculate performance */

cl_ulong startTime1;

cl_ulong endTime1;

/* Get kernel profiling info */

status = clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime1, NULL);

if(status != CL_SUCCESS) {

  printf( "Greska u proracunu vremena jedan! %d", status );

  return 0;

}

status = clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime1, NULL);

if(status != CL_SUCCESS) {

  printf( "Greska u proracunu vremena dva!" );

return 0;

}

double vrijeme = (endTime1 - startTime1) * 1.0e-6f;

printf( “Vrijeme kopiranja podataka s GPU na CPU pomocu event: %lf ms\n”, vrijeme);

clReleaseEvent(events[1]);

// Cleanup

free(GPUDevices);

free (HostVector1);

free (HostVector2);

free (HostOutputVector);

clReleaseKernel(OpenCLVectorAdd);

clReleaseProgram(OpenCLProgram);

clReleaseCommandQueue(GPUCommandQueue);

clReleaseContext(GPUContext);

clReleaseMemObject(GPUVector1);

clReleaseMemObject(GPUVector2);

clReleaseMemObject(GPUOutputVector);

}