How to deal with "invalid argument" errors when launching kernels with arguments pointing to large data?

I have encountered an “invalid argument” error when attempting to launch my kernel. One of the kernel’s arguments is a pointer to a relatively large amount of data (more than 50 MB) residing in global memory. After I shrank the data, the error was no longer reported. Why does this happen, and how can I get rid of it without shrinking the data? Thank you!

In before @tera shows up with his signature…

But in case he doesn’t, run your program with cuda-memcheck to see if there are any invalid-address/out-of-bounds errors. If there are, the offending indices need to be fixed.
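For example (with ./myApp standing in for your executable):

cuda-memcheck ./myApp

Any out-of-bounds or misaligned accesses in your kernels, as well as error codes returned by CUDA API calls, will be reported with a backtrace.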

Thanks for the reply. I tried your suggestion, but I don’t quite understand the output of cuda-memcheck, which is as follows (the name of my program is calcFStats_MITDB):

========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaLaunchKernel.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3572d3]
=========     Host Frame:./calcFStats_MITDB [0x575b5]
=========     Host Frame:./calcFStats_MITDB [0x70bc]
=========     Host Frame:./calcFStats_MITDB (_Z54__device_stub__Z17evalShCands_fStatPfPtS0_S0_PdS1_ttttPfPtS0_S0_PdS1_tttt + 0x274) [0x6f2b]
=========     Host Frame:./calcFStats_MITDB (_Z17evalShCands_fStatPfPtS0_S0_PdS1_tttt + 0x7d) [0x6fcc]
=========     Host Frame:./calcFStats_MITDB (main + 0x1513) [0x5e0f]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
=========     Host Frame:./calcFStats_MITDB (_start + 0x29) [0x47b9]
=========
invalid argument
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaGetLastError.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3572d3]
=========     Host Frame:./calcFStats_MITDB [0x54253]
=========     Host Frame:./calcFStats_MITDB (main + 0x154f) [0x5e4b]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
=========     Host Frame:./calcFStats_MITDB (_start + 0x29) [0x47b9]
=========

Any idea what this indicates? Thanks!

Also, I found that if I broke this argument down into three arguments, each pointing to a part of the data in global memory, the error went away, even though the actual global memory usage was higher than before. Why does this happen?

Please show the code line that launches your kernel.
It is also advisable to do error checking, otherwise errors can go unnoticed. Copy my signature into your program and do a Google search on “proper CUDA error checking” to see how to use this macro.
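In case it doesn’t render here, the macro is along the following lines (this is the widely circulated version from the “proper CUDA error checking” writeups):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true){
	if (code != cudaSuccess){
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}

//usage: wrap every CUDA API call, and check kernel launches explicitly, e.g.
//	gpuErrchk(cudaMalloc((void**)&tss_in, trainTssBytes));
//	evalShCands_fStat<<<gridSize, blockSize, sharedMemBytes>>>(/* arguments */);
//	gpuErrchk(cudaPeekAtLastError());	//catches launch/configuration errors
//	gpuErrchk(cudaDeviceSynchronize());	//catches errors during kernel execution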

My entire code is as follows. The kernel launch is the evalShCands_fStat<<<…>>> call near the end of main().

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <float.h>
#include <string.h>	//for strtok
#include <stdlib.h>	//for atoi/atof
#include <stdio.h>	//for printf/sprintf
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <time.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <ios>
#include <string>
#include <algorithm>
#include <vector>

typedef unsigned short ushort;
typedef unsigned int uint;

#define MAX_SLEN_DIGITS 4
#define MAX_CHAR_PER_LINE 200000
#define MAX_THREADS_PER_BLOCK 1024
#define NUM_SUBCLASS 14
#define NUM_CLASS 5
#define NUM_TRAINREC 22

__device__ void decomposeDouble(double val, int *elms){
	elms[0] = (int)val;
	//double r = (val - elms[0]) * 1e8;
	elms[1] = (int)((val - elms[0]) * 1e8);
	elms[2] = (int)(((val - elms[0]) * 1e8 - elms[1]) * 1e8);
}

//__device__ void quickSort(double *vec, double *sortedVec){
//	int p, r;
//	p = 0;
//	r = sizeof(vec) / sizeof(double) - 1;
//
//}

__device__ void calcFStat(double &f, double *distVec, int *labels, const int n){
	double s, s2, sp, sn, ss_t, ss_g, ss_w;
	int np, nn, i;
	s = s2 = sp = np = 0;
	for (i = 0; i < n; i++){
		s += distVec[i];
		s2 += distVec[i] * distVec[i];
		if (labels[i] == 1){
			sp += distVec[i];
			np++;
		}
	}
	nn = n - np;
	sn = s - sp;	//sum over the negative class: total minus positive
	ss_t = s2 - s * s / n;
	ss_g = sp * sp / np + sn * sn / nn - s * s / n;
	ss_w = ss_t - ss_g;

	f = ss_g * (n - 2) / ss_w;
}

__global__ void evalShCands_fStat(float *tss_in, ushort *labels_in, ushort *numBySubClass_in, ushort *subStarts_in, 
	double *fStats_sub_out, double *fStats_out, const ushort numTrain, const ushort tsLen, const ushort sLen, const ushort nextTsIdx){

	ushort tid = threadIdx.x;
	ushort curIdx = blockIdx.x * tsLen + tid;

	if (curIdx == 0)
		printf("I am running!\n");

	//extern __shared__ float array[];

	//float *ts1 = (float*)array;
	//float *ts2 = (float*)&ts1[tsLen];
	//int *buf = (int*)&ts2[tsLen];
	//int *buf_s0 = (int*)&buf[tsLen * 3];
	//ushort *labels = (ushort*)&buf_s0[tsLen * 3];
	//ushort *numBySubClass = (ushort *)&labels[numTrain];
	//ushort *subStarts = (ushort *)&numBySubClass[NUM_SUBCLASS];
	//
	//ushort blockSize = blockDim.x;
	//ushort numIters = numTrain % blockSize ? numTrain / blockSize + 1 : numTrain / blockSize;
	//ushort i, numThis;
	//for (i = 0; i < numIters; i++){
	//	numThis = (i == numIters - 1) ? numTrain - (i * blockSize) : blockSize;
	//	if (tid < numThis)
	//		labels[i * blockSize + tid] = labels_in[i * blockSize + tid];
	//}
	//numIters = NUM_SUBCLASS % blockSize ? NUM_SUBCLASS / blockSize + 1 : NUM_SUBCLASS / blockSize;
	//for (i = 0; i < numIters; i++){
	//	numThis = (i == numIters - 1) ? NUM_SUBCLASS - (i * blockSize) : blockSize;
	//	if (tid < numThis)
	//		numBySubClass[i * blockSize + tid] = numBySubClass_in[i * blockSize + tid];
	//}
	//numIters = (NUM_CLASS + 1) % blockSize ? (NUM_CLASS + 1) / blockSize + 1 : (NUM_CLASS + 1) / blockSize;
	//for (i = 0; i < numIters; i++){
	//	numThis = (i == numIters - 1) ? (NUM_CLASS + 1) - (i * blockSize) : blockSize;
	//	if (tid < numThis)
	//		subStarts[i * blockSize + tid] = subStarts_in[i * blockSize + tid];
	//}

	//ts1[tid] = tss_in[nextTsIdx * tsLen + curIdx];
	//__syncthreads();

	////numIters = tsLen % blockSize ? tsLen / blockSize + 1 : tsLen / blockSize;
	////for (i = 0; i < numIters; i++){
	////	numThis = (i == numIters - 1) ? tsLen - (i * blockSize) : blockSize;
	////	if(tid < numThis)
	////		ts1[i * blockSize + tid] = tss_in[nextTsIdx * tsLen + curIdx]??? //start here, there are other variables to handle after this!;
	////}
	//__syncthreads();

	//bool isValid = tid < tsLen - sLen + 1;

	//ushort idx, j;
	//int elms[3];
	//double term1, term2, s1, s1_2, s2, s2_2, mu1, sigma1, mu2, sigma2, dotPr, corr, nnCorr, nnDist_2,
	//	s, s_2, sc_sub[NUM_SUBCLASS];
	//s = s_2 = 0;
	//for (i = 0; i < NUM_SUBCLASS; i++)
	//	sc_sub[i] = 0;

	//if (isValid){	//the case where normalization should NOT be applied is not handled yet!!!
	//	s1 = s1_2 = 0;
	//	for (j = 0; j < sLen; j++){
	//		term1 = ts1[tid + j];
	//		s1 += term1;
	//		s1_2 += term1 * term1;
	//	}
	//	mu1 = s1 / sLen;
	//	sigma1 = s1_2 / sLen > mu1 * mu1 ? sqrt(s1_2 / sLen - mu1 * mu1) : 1;
	//}

	//for (i = 0; i < numTrain; i++){	//the case where normalization should NOT be applied is not handled yet!!!

	//	//load the time series into the shared memory
	//	ts2[tid] = tss_in[i * tsLen + tid];
	//	__syncthreads();

	//	if (isValid){
	//		//initiation
	//		s2 = s2_2 = 0;
	//		for (j = 0; j < sLen; j++){
	//			term2 = ts2[j];
	//			s2 += term2;
	//			s2_2 += term2 * term2;
	//		}
	//		mu2 = s2 / sLen;
	//		sigma2 = s2_2 / sLen > mu2 * mu2 ? sqrt(s2_2 / sLen - mu2 * mu2) : 1;

	//		dotPr = 0;
	//		for (j = 0; j < sLen; j++){
	//			term1 = ts1[j];
	//			term2 = ts2[tid + j];
	//			dotPr += term1 * term2;
	//		}
	//		decomposeDouble(dotPr, elms);
	//		buf_s0[3 * tid] = elms[0];
	//		buf_s0[3 * tid + 1] = elms[1];
	//		buf_s0[3 * tid + 2] = elms[2];

	//		dotPr = 0;
	//		for (j = 0; j < sLen; j++){
	//			term1 = ts1[tid + j];
	//			term2 = ts2[j];
	//			dotPr += term1 * term2;
	//		}
	//		decomposeDouble(dotPr, elms);
	//		buf[3 * tid] = elms[0];
	//		buf[3 * tid + 1] = elms[1];
	//		buf[3 * tid + 2] = elms[2];

	//		nnCorr = (dotPr - sLen * mu1 * mu2) / (sLen * sigma1 * sigma2);	//to maximize
	//	}
	//	__syncthreads();

	//	for (j = 1; j < tsLen - sLen + 1; j++){
	//		if (isValid){
	//			term1 = ts2[j + sLen - 1];
	//			term2 = ts2[j - 1];
	//			s2 = s2 + term1 - term2;
	//			s2_2 = s2_2 + term1 * term1 - term2 * term2;
	//			mu2 = s2 / sLen;
	//			sigma2 = s2_2 / sLen > mu2 * mu2 ? sqrt(s2_2 / sLen - mu2 * mu2) : 1;

	//			if (!tid){
	//				elms[0] = buf_s0[3 * j];
	//				elms[1] = buf_s0[3 * j + 1];
	//				elms[2] = buf_s0[3 * j + 2];
	//				dotPr = elms[0] + (double)elms[1] / 1e8 + (double)elms[2] / 1e16;
	//			}
	//			else{
	//				elms[0] = buf[3 * (tid - 1)];
	//				elms[1] = buf[3 * (tid - 1) + 1];
	//				elms[2] = buf[3 * (tid - 1) + 2];
	//				term1 = ts1[tid + sLen - 1];
	//				term2 = ts2[j + sLen - 1];
	//				dotPr = elms[0] + (double)elms[1] / 1e8 + (double)elms[2] / 1e16 + term1 * term2;
	//				term1 = ts1[tid - 1];
	//				term2 = ts2[j - 1];
	//				dotPr -= term1 * term2;
	//			}
	//		}
	//		__syncthreads();

	//		if (isValid){
	//			decomposeDouble(dotPr, elms);
	//			buf[3 * tid] = elms[0];
	//			buf[3 * tid + 1] = elms[1];
	//			buf[3 * tid + 2] = elms[2];
	//		}
	//		__syncthreads();

	//		if (isValid){
	//			corr = (dotPr - sLen * mu1 * mu2) / (sLen * sigma1 * sigma2);
	//			if (corr > nnCorr)
	//				nnCorr = corr;
	//		}
	//	}
	//	if (isValid){
	//		idx = curIdx * numTrain + i;
	//		if (nnCorr > 1 || nextTsIdx + blockIdx.x == i)
	//			nnDist_2 = 0;
	//		else
	//			nnDist_2 = 2 * (1 - nnCorr);

	//		s += sqrt(nnDist_2);
	//		s_2 += nnDist_2;
	//		sc_sub[labels[i]] += sqrt(nnDist_2);
	//	}
	//}

	//if (isValid){

	//	idx = 1;
	//	ushort numCurClass = 0;
	//	double sc = 0;
	//	double ss_t, ss_g, ss_g_sub;
	//	ss_g = ss_g_sub = -s * s / numTrain;
	//	ss_t = s_2 + ss_g;
	//	for (i = 0; i <= NUM_SUBCLASS; i++){
	//		if (i != NUM_SUBCLASS && numBySubClass[i])
	//			ss_g_sub += sc_sub[i] * sc_sub[i] / numBySubClass[i];
	//		if (subStarts[idx] == i){
	//			if(numCurClass)
	//				ss_g += sc * sc / numCurClass;
	//			idx++;
	//			sc = sc_sub[i];
	//			numCurClass = numBySubClass[i];
	//		}
	//		else{
	//			sc += sc_sub[i];
	//			numCurClass += numBySubClass[i];
	//		}
	//	}
	//	fStats_out[curIdx] = ss_g * (numTrain - NUM_CLASS) / ((NUM_CLASS - 1) * (ss_t - ss_g));
	//	fStats_sub_out[curIdx] = ss_g_sub * (numTrain - NUM_SUBCLASS) / ((NUM_SUBCLASS - 1) * (ss_t - ss_g_sub));
	//	
	//}
}

int main(int argc, char** argv){

	std::string path = "/home/user/MITDB";
	std::cout << path << std::endl;
	const std::string segPolicy = "100_100";	//100_100 / 100_200
	const std::string preprocessPolicy = "raw";		//raw / denoised
	const std::string leadId = "A";	//A / B
	const ushort tsLen = 200;
	const ushort deviceId = 0;
	const int maxThreadsPerBlock = 1024;	
	const int maxBlocksPerGrid = 256;
	const int minSLen = 10;
	const int maxSLen = tsLen;
	const int sLenStep = (maxSLen - minSLen) / 10 > 0 ? (maxSLen - minSLen) / 10 : 10;
	/*const std::string savePath = "E:/PhysioNet/MITDB/Models";*/

	const ushort numBySubClass[]{38102, 3949, 3783, 16, 16, 810, 100, 32, 2, 3683, 105, 415, 0, 8};
	const ushort subStarts[]{0, 5, 9, 11, 12, 14};
	const std::string trainIDs[] = {"101", "106", "108", "109", "112", "114", "115", "116", "118",
		"119", "122", "124", "201", "203", "205", "207", "208", "209", "215", "220", "223", "230"};
	ushort numTrain = 0;
	for (int i = 0; i < NUM_SUBCLASS; i++){
		numTrain += numBySubClass[i];
	}
	std::cout << numTrain << std::endl;

	//load to host
	size_t trainTssBytes = numTrain * tsLen * sizeof(float);
	float *trainTss = (float*)malloc(trainTssBytes);
	if (trainTss == NULL)
		std::cout << "Malloc error!" << std::endl;
	size_t trainLabelsBytes = numTrain * sizeof(ushort);
	ushort *trainLabels = (ushort*)malloc(trainLabelsBytes);

	char buf[MAX_CHAR_PER_LINE];
	char *tmp;
	const std::string dataPath = path + "/UCR_" + segPolicy + "/" + preprocessPolicy;
	std::ifstream fTrain_in;
	ushort nextTsId = 0;
	long nextId = 0;
	for (ushort ind = 0; ind < NUM_TRAINREC; ind++){
		std::string trainFName = dataPath + "/" + trainIDs[ind] + "_" + leadId + "_" + segPolicy + "_" + preprocessPolicy + "_ALL";

		fTrain_in.open(trainFName.c_str());
		if (!fTrain_in){
			exit(1);
		}
		while (fTrain_in.getline(buf, MAX_CHAR_PER_LINE, '\n')){	//note: getline takes a single-char delimiter, so '\r\n' is invalid; any stray '\r' is stripped by strtok below
			tmp = strtok(buf, " ,\t\r\n");
			if (tmp == NULL)
				break;

			trainLabels[nextTsId] = atoi(tmp);
			tmp = strtok(NULL, " ,\t\r\n");
			while (tmp != NULL){
				trainTss[nextId++] = atof(tmp);
				tmp = strtok(NULL, " ,\t\r\n");
			}
			nextTsId++;
		}
		fTrain_in.close();
	}

	//load to device
	cudaError_t cudaerr;
	cudaerr = cudaSetDevice(deviceId);
	if (cudaerr != cudaSuccess){
		printf("CudaSetDevice failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
		exit(1);
	}

	ushort *labels_in;
	cudaerr = cudaMalloc((void**)&labels_in, trainLabelsBytes);
	if (cudaerr != cudaSuccess){
		printf("CudaMalloc for labels_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}
	cudaerr = cudaMemcpy(labels_in, trainLabels, trainLabelsBytes, cudaMemcpyHostToDevice);
	if (cudaerr != cudaSuccess){
		printf("CudaMemcpy for labels_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}

	ushort *numBySubClass_in;
	cudaerr = cudaMalloc((void**)&numBySubClass_in, NUM_SUBCLASS * sizeof(ushort));
	if (cudaerr != cudaSuccess){
		printf("CudaMalloc for numBySubClass_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}
	cudaerr = cudaMemcpy(numBySubClass_in, numBySubClass, NUM_SUBCLASS * sizeof(ushort), cudaMemcpyHostToDevice);
	if (cudaerr != cudaSuccess){
		printf("CudaMemcpy for numBySubClass_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}

	ushort *subStarts_in;
	cudaerr = cudaMalloc((void**)&subStarts_in, (NUM_CLASS + 1) * sizeof(ushort));
	if (cudaerr != cudaSuccess){
		printf("CudaMalloc for subStarts_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}
	cudaerr = cudaMemcpy(subStarts_in, subStarts, (NUM_CLASS + 1) * sizeof(ushort), cudaMemcpyHostToDevice);
	if (cudaerr != cudaSuccess){
		printf("CudaMemcpy for subStarts_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}

	float *tss_in;
	printf("%d\n", trainTssBytes);
	cudaerr = cudaMalloc((void**)&tss_in, trainTssBytes);
	if (cudaerr != cudaSuccess){
		printf("CudaMalloc for tss_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}
	cudaerr = cudaMemcpy(tss_in, trainTss, trainTssBytes, cudaMemcpyHostToDevice);
	if (cudaerr != cudaSuccess){
		printf("CudaMemcpy for tss_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}

	/*float *tss_in_1;
	printf("%d\n", trainTssBytes);
	cudaerr = cudaMalloc((void**)&tss_in_1, trainTssBytes);
	if (cudaerr != cudaSuccess){
		printf("CudaMalloc for tss_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}
	cudaerr = cudaMemcpy(tss_in_1, trainTss, trainTssBytes, cudaMemcpyHostToDevice);
	if (cudaerr != cudaSuccess){
		printf("CudaMemcpy for tss_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}

	float *tss_in_2;
	printf("%d\n", trainTssBytes);
	cudaerr = cudaMalloc((void**)&tss_in_2, trainTssBytes);
	if (cudaerr != cudaSuccess){
		printf("CudaMalloc for tss_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}
	cudaerr = cudaMemcpy(tss_in_2, trainTss, trainTssBytes, cudaMemcpyHostToDevice);
	if (cudaerr != cudaSuccess){
		printf("CudaMemcpy for tss_in failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
	}*/

	int blockSize = tsLen < MAX_THREADS_PER_BLOCK ? tsLen : MAX_THREADS_PER_BLOCK;
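	//note: the (numTrain + NUM_SUBCLASS + NUM_CLASS + 1) * sizeof(ushort) term
	//below alone exceeds 100 KB when numTrain is around 51,000, far more than
	//the 48 KB of shared memory available per block on most GPUs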
	size_t sharedMemBytes = (2 * sizeof(double) + 6 * sizeof(int)) * tsLen + 
		(numTrain + NUM_SUBCLASS + NUM_CLASS + 1) * sizeof(ushort);

	std::string fStatPath = path + "/" + "F-statistics";
	if (access(fStatPath.c_str(), F_OK) == -1)
		//mkdir(fStatPath.c_str()); 
		mkdir(fStatPath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
	std::string fStatFName;

	int gridSize, numRemaining;
	ushort sLen, nextTsIdx;
	size_t fStatBytes;
	double *fStats, *fStats_out, *fStats_sub, *fStats_sub_out;
	char s_sLen[MAX_SLEN_DIGITS];

	std::ofstream fFstat;
	fFstat.precision(15);

	clock_t tic, toc;
	tic = clock();
	for (sLen = minSLen; sLen <= maxSLen; sLen += sLenStep){
		/*itoa(sLen, s_sLen, 10);*/
		sprintf(s_sLen, "%d", sLen);
		fStatFName = fStatPath + "/" + "Fstats_" + s_sLen + ".txt";
		if (access(fStatFName.c_str(), F_OK) == 0)
			remove(fStatFName.c_str());
		fStatFName = fStatPath + "/" + "Fstats_sub_" + s_sLen + ".txt";
		if (access(fStatFName.c_str(), F_OK) == 0)
			remove(fStatFName.c_str());

		nextTsIdx = 0;
		numRemaining = numTrain;
		while (numRemaining > 0){
			std::cout << "Subsequence length is " << sLen << ". ";
			std::cout << "Number of remaining time series is " << numRemaining << ". ";
			gridSize = numRemaining < maxBlocksPerGrid ? numRemaining : maxBlocksPerGrid;
			std::cout << "Number of time series in this iteration is " << gridSize << "." << std::endl;
			fStatBytes = gridSize * tsLen * sizeof(double);

			cudaerr = cudaMalloc((void**)&fStats_out, fStatBytes);
			if (cudaerr != cudaSuccess){
				printf("CudaMalloc for fStats_out failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
			}

			cudaerr = cudaMalloc((void**)&fStats_sub_out, fStatBytes);
			if (cudaerr != cudaSuccess){
				printf("CudaMalloc for fStats_sub_out failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
			}
			cudaError_t error_check = cudaGetLastError();
			printf("%s\n", cudaGetErrorString(error_check));

			printf("%d %d %d\n", gridSize, blockSize, numTrain);
			evalShCands_fStat << <gridSize, blockSize, sharedMemBytes >> >(tss_in, labels_in, numBySubClass_in, subStarts_in, fStats_sub_out, fStats_out, numTrain, tsLen, sLen, nextTsIdx);
			cudaerr = cudaDeviceSynchronize();	//cudaThreadSynchronize() is deprecated
			if (cudaerr != cudaSuccess)
				printf("Kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
			error_check = cudaGetLastError();
			printf("%s\n", cudaGetErrorString(error_check));
			getchar();

			fStats = (double*)malloc(fStatBytes);
			cudaerr = cudaMemcpy(fStats, fStats_out, fStatBytes, cudaMemcpyDeviceToHost);
			if (cudaerr != cudaSuccess){
				printf("CudaMemcpy for fStats failed with error \"%s\". sLen = %d, numRemaining = %d\n", cudaGetErrorString(cudaerr), sLen, numRemaining);
			}
			cudaerr = cudaFree(fStats_out);
			if (cudaerr != cudaSuccess){
				printf("CudaFree for fStats_out failed with error \"%s\". sLen = %d, numRemaining = %d\n", cudaGetErrorString(cudaerr), sLen, numRemaining);
				exit(1);
			}

			fStats_sub = (double*)malloc(fStatBytes);
			cudaerr = cudaMemcpy(fStats_sub, fStats_sub_out, fStatBytes, cudaMemcpyDeviceToHost);
			if (cudaerr != cudaSuccess){
				printf("CudaMemcpy for fStats_sub failed with error \"%s\". sLen = %d, numRemaining = %d\n", cudaGetErrorString(cudaerr), sLen, numRemaining);
			}
			cudaerr = cudaFree(fStats_sub_out);
			if (cudaerr != cudaSuccess){
				printf("CudaFree for fStats_sub_out failed with error \"%s\". sLen = %d, numRemaining = %d\n", cudaGetErrorString(cudaerr), sLen, numRemaining);
				exit(1);
			}

			fStatFName = fStatPath + "/" + "Fstats_" + s_sLen + ".txt" ;
			fFstat.open(fStatFName.c_str(), std::ofstream::out | std::ofstream::app);
			for (int i = 0; i < gridSize; i++){
				for (int j = 0; j < tsLen - sLen + 1; j++){
					fFstat << fStats[i * tsLen + j] << " ";
				}
				fFstat << std::endl;
			}
			fFstat.close();
			free(fStats);

			fStatFName = fStatPath + "/" + "Fstats_sub_" + s_sLen + ".txt";
			fFstat.open(fStatFName.c_str(), std::ofstream::out | std::ofstream::app);
			for (int i = 0; i < gridSize; i++){
				for (int j = 0; j < tsLen - sLen + 1; j++){
					fFstat << fStats_sub[i * tsLen + j] << " ";
				}
				fFstat << std::endl;
			}
			fFstat.close();
			free(fStats_sub);

			numRemaining -= gridSize;
			nextTsIdx += gridSize;
		}
	}
	toc = clock();
	double time = (double)(toc - tic) / ((double)CLOCKS_PER_SEC);
	printf("Total running time is %f\n", time);

	free(trainTss);
	cudaFree(tss_in);
	free(trainLabels);
	cudaFree(labels_in);
	cudaFree(numBySubClass_in);
	cudaFree(subStarts_in);
	getchar();
	return 0;
}

I actually did error checking at almost every step, and found that there were no errors before the kernel launch. The argument in question is tss_in, which points to about 50 MB of data in global memory.

change the printf statement immediately before the kernel launch to be as follows:

printf("%d %d %d %d\n", gridSize, blockSize, numTrain, sharedMemBytes);

rerun the failing case

paste the printout from that line here

That solves the problem! I was requesting more shared memory than is available per block. Never thought about that! Thanks a lot!

The kernel configuration parameters are the ones contained within the <<<…>>> syntax. If you get an invalid configuration argument error, then your focus should be there. Inspect all the configuration arguments you are passing, and compare them to the various limits published in the programming guide (e.g. table 14).
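A minimal sketch of that kind of sanity check, assuming the deviceId and sharedMemBytes variables from the code above:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, deviceId);
printf("max threads per block: %d\n", prop.maxThreadsPerBlock);
printf("max shared memory per block: %zu bytes\n", prop.sharedMemPerBlock);
if (sharedMemBytes > prop.sharedMemPerBlock)
	printf("dynamic shared memory request of %zu bytes exceeds the per-block limit\n", sharedMemBytes);

Comparing the launch configuration against the device properties this way turns a cryptic “invalid argument” at launch time into an actionable message.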