Please help code 6 error.

Cuda code to run in a few months, but the analysis does not solve the problem.

Currently using the laptop built-in graphics card is Geforce 610m.

INPUT Picture size is 100X100. And that the output picture, Fringe_Pattren size is 200X200. Making the operation a hologram and I want to use the GPU.

I have been trying to create CGH. Using the GPU I want to improve the speed.
However, an error message is generated. [“cudaDeviceSynchronize returned error code 6 after launching addKerel!” testCuda failed! … ~ ~ Press ~ ~ key ~ ~ ~ …]

Four loops in the first place was the C + + source. I studied were made of two loops. However, an error occurs.

How to use Thread and block? I’m asking for help …

//---------------------------------------------------------------------------------------
Origonal code is
//---------------------------------------------------------------------------------------
#include<stdio.h>
#include<math.h>
#include
#include<boost/thread.hpp>
using namespace std;
using namespace boost;

#include <time.h>

#define height 200
#define width 200

#define Fringe_height 400
#define Fringe_width 400

#define ImageMul 20
#define PI 3.14159265358979323846
#define CGHSCALE 1/255
#define DEFAULTDEPTH 0.5
#define Intensity 5//Light
#define RADIANS PI/180
#define THETA 0.544
#define WaveNum 9926043.4154818337
#define P 0.0000104
int x,y,z,k,l;
float R,DEPTH;
float **Fringe_Pattern, **Image;
FILE *read_file; // file read
FILE *write_file;

void part1()
{
for(k=0; k<Fringe_width; k++){
for(l=0; l<Fringe_height; l++){
for(int i=0; i<width; i++){
for(y=0; y<height; y++){
if(Image[i][y] > 0){
DEPTH = (255-Image[i][y]) * CGHSCALE + DEFAULTDEPTH;
R = sqrt(DEPTHDEPTH + (kP-iPImageMul)(kP-iPImageMul) + (lP-yPImageMul)(lP-yP*ImageMul));
Fringe_Pattern[k][l] += Image[i][y]cos(WaveNumR);
}
}
}
}
}

}

void main()
{
unsigned char tmp;

int temp;

float max=0, min=0;
float max_1=0, min_1=0;
float Resize_Temp = 0;




Image					= new float *[width];					// Input_Image
Fringe_Pattern			= new float *[Fringe_width];			

// Digital_Hologram

for(x=0; x<width; x++){
	Image[x]					= new float [height];
}

for(x=0; x<Fringe_width; x++){
	Fringe_Pattern[x]			= new float [Fringe_height];
}


read_file = fopen("image/002.raw", "rb");
for(x=0; x<width; x++){										
	for(y=0; y<height; y++){
		fread(&tmp, 1, 1, read_file);
		Image[x][y] = (int)tmp;
	}
}

for(k=0; k<Fringe_width; k++){
	for(l=0; l<Fringe_height; l++){
		Fringe_Pattern[k][l] = 0;
	}
}

clock_t start,end;

double duration;

start=clock();

////////////////////////////////////////////////
////////////////////////////////////////////////

thread t_part1(&part1);

t_part1.join();

////////////////////////////////////////////////
////////////////////////////////////////////////
end=clock();

duration = ((double)(end - start)/CLOCKS_PER_SEC );

cout << "Time "<<duration<<endl;

for(x=0; x<Fringe_width; x++){
	for(y=0; y<Fringe_height; y++){
		if (Fringe_Pattern[x][y] > max){
			max = Fringe_Pattern[x][y];
		}
		if (Fringe_Pattern[x][y] < min){
			min = Fringe_Pattern[x][y];
		}		
	}
}

for(x=0; x<Fringe_width; x++){
	for(y=0; y<Fringe_height; y++){
		Fringe_Pattern[x][y] = ((Fringe_Pattern[x][y]-min)/(max-min))*255;
	}
}

write_file = fopen("result/rabbit_imageMul20.raw", "wb");   //Write File

for(x=0; x<Fringe_width; x++){
	for(y=0; y<Fringe_height; y++){
		temp = Fringe_Pattern[x][y];
		fwrite(&temp, 1, 1, write_file);
	}		
}

}

//-----------------------------------------------------------------------------------------------
Changed cuda code is
//-----------------------------------------------------------------------------------------------
#include “cuda_runtime.h”
#include “device_launch_parameters.h”

#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdlib.h>

#define height 200
#define width 200
#define Fringe_height 100
#define Fringe_width 100

#define ImageMul 20
#define CGHSCALE 1/255
#define DEFAULTDEPTH 0.5
#define WaveNum 9926043.4154818337
#define P 0.0000104

cudaError_t testCuda(float *Image, float *Fringe_Pattern);

global void testKernel(float *dev_Image, float *dev_Fringe_Pattern)
{
int a, b, i, j, x, y;
float DEPTH, R;

for(a=0; a<Fringe_height*Fringe_width; a++)
{
	i=a/Fringe_width;
	j=a%Fringe_width;

	for(b=0; b<height*width; b++)
	{
		x=b/width;
		y=b%width;

		if(dev_Image[x*width+y] > 0)
		{
			DEPTH = (255-dev_Image[x][y]) * CGHSCALE + DEFAULTDEPTH;
			R = sqrt(DEPTH*DEPTH + (i*P-x*P*ImageMul)*(i*P-x*P*ImageMul) + (j*P-y*P*ImageMul)*(j*P-y*P*ImageMul));
			dev_Fringe_Pattern[i][j] += dev_Image[x][y]*cos(WaveNum*R);
		}
	}
}

}

int main()
{
unsigned char tmp;
int temp;
int i, j;
float max=0, min=0;
clock_t start, end;
double duration;
FILE *fin, *fout;
float *Fringe_Pattern, *Image;

start=clock();

Image = (float*)calloc(sizeof(float), height*width);
Fringe_Pattern = (float*)calloc(sizeof(float), Fringe_height*Fringe_width);

//fin = fopen("input.raw", "rb");

fin = fopen("resize_rabbit.raw", "rb");
for(i=0; i<height; i++)
{										
	for(j=0; j<width; j++)
	{
		fread(&tmp, 1, 1, fin);
		Image[i*width+j] = (int)tmp;
	}
}

//cudaError_t cudaStatus = testCuda(Image, Fringe_Pattern);
//if (cudaStatus != cudaSuccess) {
//	fprintf(stderr, "testCuda failed!");
//	return 1;
//}
for(i=0; i<Fringe_height; i++)
{
	for(j=0; j<Fringe_width; j++)
	{
		if(Fringe_Pattern[i*Fringe_width+j] > max)
			max = Fringe_Pattern[i*Fringe_width+j];
		
		if(Fringe_Pattern[i*Fringe_width+j] < min)
			min = Fringe_Pattern[i*Fringe_width+j];
	}
}

for(i=0; i<Fringe_height; i++)
{
	for(j=0; j<Fringe_width; j++)
		Fringe_Pattern[i*Fringe_width+j] = ((Fringe_Pattern[i*Fringe_width+j]-min)/(max-min))*255;
}

fout = fopen("result.raw", "wb");

for(i=0; i<Fringe_height; i++)
{
	for(j=0; j<Fringe_width; j++)
	{
		temp = Fringe_Pattern[i*Fringe_width+j];
		fwrite(&temp, 1, 1, fout);
	}		
}

cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaDeviceReset failed!");
	return 1;
}

free(Image);
free(Fringe_Pattern);
fclose(fin);
fclose(fout);

end=clock();
duration = ((double)(end - start)/CLOCKS_PER_SEC );
printf("Time : %f\n", duration);

return 0;

}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t testCuda(float *Image, float *Fringe_Pattern)
{
float *dev_Image, *dev_Fringe_Pattern;
cudaError_t cudaStatus;

cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_Image, height*width*sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_Fringe_Pattern, Fringe_height*Fringe_width*sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(dev_Image, Image, height*width*sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

testKernel<<<1, 1>>>(dev_Image, dev_Fringe_Pattern);

cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}

cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    goto Error;
}

cudaStatus = cudaMemcpy(Fringe_Pattern, dev_Fringe_Pattern, Fringe_height*Fringe_width*sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

Error:
cudaFree(dev_Image);
cudaFree(dev_Fringe_Pattern);

return cudaStatus;

}

Please, note that your code contains no addKernel global function and that the error message you are setting in the code is misleading. Your code contains only a testKernel global function.

cudaThreadSynchronize() returns an error if one of the preceding tasks has failed.

Most probably your testKernel is taking too long to complete. Notice that you are launching testKernel with only 1 block and 1 thread. Also, notice that testKernel is actually a sequential C/C++ function to which you have just added the global decorator. This is not the way you should parallelize a sequential code.

I recomment either reading some textbook like “CUDA By Example” that will smoothly introduce you to the topic of parallel processing on GPU or some directive-based accelerator, like OpenACC, if you don’t want to directly use CUDA.