We have been observing CUDA out-of-memory errors in CryoSPARC Live for some time. Jobs start normally and process several dozen micrographs, then begin producing out-of-memory errors, first intermittently and then reliably.
The same jobs run outside of the Live GUI produce no out-of-memory errors. GPU and system RAM usage are each less than half of the total available for the first few micrographs (as observed with nvidia-smi and htop, respectively), but GPU memory usage appears to climb slowly over time until the error begins to happen; a minimal logging sketch to quantify this is included below, before the error message.
This error happens even immediately after the system has been rebooted, and when no other users are using the server.
We are using CryoSPARC v4.5.3.
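To track the slow climb in GPU memory across micrographs, a minimal monitoring sketch like the following can be run alongside the Live session (this is an illustrative script, not part of CryoSPARC; it assumes nvidia-smi is on PATH, and the poll interval and output filename are arbitrary choices):

```python
#!/usr/bin/env python3
# Minimal sketch: poll nvidia-smi periodically and append per-GPU used memory
# to a CSV, so the slow climb can be correlated with processed micrographs.
# Assumptions: nvidia-smi is on PATH; interval and filename are arbitrary.
import csv
import subprocess
import time

LOG_PATH = "gpu_mem_log.csv"   # hypothetical output file
INTERVAL_S = 60                # poll once per minute

def used_mib_per_gpu():
    """Return a list of used-memory values (MiB), one entry per GPU."""
    out = subprocess.run(
        ["nvidia-smi", "--query-gpu=memory.used",
         "--format=csv,noheader,nounits"],
        capture_output=True, text=True, check=True,
    ).stdout
    return [int(line) for line in out.splitlines() if line.strip()]

if __name__ == "__main__":
    with open(LOG_PATH, "a", newline="") as f:
        writer = csv.writer(f)
        while True:
            writer.writerow(
                [time.strftime("%Y-%m-%d %H:%M:%S")] + used_mib_per_gpu()
            )
            f.flush()
            time.sleep(INTERVAL_S)
```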
Error message:
Traceback (most recent call last):
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/driver.py", line 851, in _attempt_allocation
return allocator()
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/driver.py", line 1054, in allocator
return driver.cuMemAlloc(size)
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/driver.py", line 344, in verbose_cuda_api_call
return self._check_cuda_python_error(fname, libfn(*args))
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/driver.py", line 408, in _check_cuda_python_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [CUresult.CUDA_ERROR_OUT_OF_MEMORY] Call to cuMemAlloc results in CUDA_ERROR_OUT_OF_MEMORY
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "cryosparc_master/cryosparc_compute/jobs/rtp_workers/run.py", line 381, in cryosparc_master.cryosparc_compute.jobs.rtp_workers.run.rtp_worker
File "cryosparc_master/cryosparc_compute/jobs/rtp_workers/run.py", line 450, in cryosparc_master.cryosparc_compute.jobs.rtp_workers.run.process_movie
File "cryosparc_master/cryosparc_compute/jobs/rtp_workers/run.py", line 596, in cryosparc_master.cryosparc_compute.jobs.rtp_workers.run.do_patch_motion
File "cryosparc_master/cryosparc_compute/jobs/rtp_workers/run.py", line 625, in cryosparc_master.cryosparc_compute.jobs.rtp_workers.run.do_patch_motion
File "cryosparc_master/cryosparc_compute/jobs/rtp_workers/run.py", line 602, in cryosparc_master.cryosparc_compute.jobs.rtp_workers.run.do_patch_motion
File "cryosparc_master/cryosparc_compute/jobs/motioncorrection/patchmotion.py", line 292, in cryosparc_master.cryosparc_compute.jobs.motioncorrection.patchmotion.unbend_motion_correction
File "cryosparc_master/cryosparc_compute/jobs/motioncorrection/patchmotion.py", line 394, in cryosparc_master.cryosparc_compute.jobs.motioncorrection.patchmotion.unbend_motion_correction
File "cryosparc_master/cryosparc_compute/gpu/gpucore.py", line 390, in cryosparc_master.cryosparc_compute.gpu.gpucore.EngineBaseThread.ensure_allocated
File "/var/home/cryosparc_user/cryosparc_worker/cryosparc_compute/gpu/gpuarray.py", line 270, in empty
return device_array(shape, dtype, stream=stream)
File "/var/home/cryosparc_user/cryosparc_worker/cryosparc_compute/gpu/gpuarray.py", line 226, in device_array
arr = GPUArray(shape=shape, strides=strides, dtype=dtype, stream=stream)
File "/var/home/cryosparc_user/cryosparc_worker/cryosparc_compute/gpu/gpuarray.py", line 21, in __init__
super().__init__(shape, strides, dtype, stream, gpu_data)
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/devicearray.py", line 103, in __init__
gpu_data = devices.get_context().memalloc(self.alloc_size)
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/driver.py", line 1372, in memalloc
return self.memory_manager.memalloc(bytesize)
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/driver.py", line 1056, in memalloc
ptr = self._attempt_allocation(allocator)
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/driver.py", line 863, in _attempt_allocation
return allocator()
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/driver.py", line 1054, in allocator
return driver.cuMemAlloc(size)
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/driver.py", line 344, in verbose_cuda_api_call
return self._check_cuda_python_error(fname, libfn(*args))
File "/var/home/cryosparc_user/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/lib/python3.10/site-packages/numba/cuda/cudadrv/driver.py", line 408, in _check_cuda_python_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [CUresult.CUDA_ERROR_OUT_OF_MEMORY] Call to cuMemAlloc results in CUDA_ERROR_OUT_OF_MEMORY