Error while using TensorFlow on GPU

While running the following code, I get the error shown below. Is it because the GPU is running out of memory? I would not expect that to cause a failure, since I assumed that when GPU memory is exhausted, execution would continue using main (host) memory.

Note

Also, I’d like to add that this code ran just fine on Google Colab but fails with the error below in a local Jupyter Notebook.

Code


import tensorflow as tf

from tensorflow.keras.optimizers import Adam
epochs=50
model.compile(loss="binary_crossentropy",optimizer='adam',metrics=['accuracy'])
fitted_model=model.fit(X_train,y_train,epochs=epochs,validation_split=0.3, use_multiprocessing=True)

Error (Command Line)

2021-06-07 16:54:06.549654: W tensorflow/core/framework/op_kernel.cc:1755] Internal: 'cuModuleGetFunction(&function, module, kernel_name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
2021-06-07 16:54:06.549706: W tensorflow/core/framework/op_kernel.cc:1755] Internal: 'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, 0, stream, params, nullptr)' failed with 'CUDA_ERROR_INVALID_HANDLE'
[I 16:55:19.628 NotebookApp] Saving file at /temp/Set2.ipynb

Traceback

---------------------------------------------------------------------------
InternalError                             Traceback (most recent call last)
<ipython-input-5-c072d84ff9ec> in <module>
      4 epochs=50
      5 model.compile(loss="binary_crossentropy",optimizer='adam',metrics=['accuracy'])
----> 6 fitted_model=model.fit(X_train,y_train,epochs=epochs,validation_split=0.3, use_multiprocessing=True)

c:\users\harvish's pc\desktop\logo detection\myvenv\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
   1181                 _r=1):
   1182               callbacks.on_train_batch_begin(step)
-> 1183               tmp_logs = self.train_function(iterator)
   1184               if data_handler.should_sync:
   1185                 context.async_wait()

c:\users\harvish's pc\desktop\logo detection\myvenv\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds)
    887 
    888       with OptionalXlaContext(self._jit_compile):
--> 889         result = self._call(*args, **kwds)
    890 
    891       new_tracing_count = self.experimental_get_tracing_count()

c:\users\harvish's pc\desktop\logo detection\myvenv\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds)
    948         # Lifting succeeded, so variables are initialized and we can run the
    949         # stateless function.
--> 950         return self._stateless_fn(*args, **kwds)
    951     else:
    952       _, _, _, filtered_flat_args = \

c:\users\harvish's pc\desktop\logo detection\myvenv\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs)
   3021       (graph_function,
   3022        filtered_flat_args) = self._maybe_define_function(args, kwargs)
-> 3023     return graph_function._call_flat(
   3024         filtered_flat_args, captured_inputs=graph_function.captured_inputs)  # pylint: disable=protected-access
   3025 

c:\users\harvish's pc\desktop\logo detection\myvenv\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
   1958         and executing_eagerly):
   1959       # No tape is watching; skip to running the function.
-> 1960       return self._build_call_outputs(self._inference_function.call(
   1961           ctx, args, cancellation_manager=cancellation_manager))
   1962     forward_backward = self._select_forward_and_backward_functions(

c:\users\harvish's pc\desktop\logo detection\myvenv\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager)
    589       with _InterpolateFunctionError(self):
    590         if cancellation_manager is None:
--> 591           outputs = execute.execute(
    592               str(self.signature.name),
    593               num_outputs=self._num_outputs,

c:\users\harvish's pc\desktop\logo detection\myvenv\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     57   try:
     58     ctx.ensure_initialized()
---> 59     tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
     60                                         inputs, attrs, num_outputs)
     61   except core._NotOkStatusException as e:

InternalError:  'cuModuleGetFunction(&function, module, kernel_name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
	 [[node sequential/dropout/dropout/Mul_1 (defined at <ipython-input-5-c072d84ff9ec>:6) ]] [Op:__inference_train_function_1066]

Function call stack:
train_function

Update

I was able to work around this issue by reducing the batch_size from 8 (or 4) to 2, but I still have no explanation for why this helps.

1 Like

you made my day!!