single workstation
CRYOSPARC_PATH=/home/sparc/cryosparc_worker/bin
PYTHONPATH=/home/sparc/cryosparc_worker
CRYOSPARC_CUDA_PATH=/usr/local/cuda
LD_LIBRARY_PATH=/usr/local/cuda/lib64:/home/sparc/cryosparc_worker/deps/external/cudnn/lib
PATH=/usr/local/cuda/bin:/home/sparc/cryosparc_worker/bin:/home/sparc/cryosparc_worker/deps/anaconda/envs/cryosparc_worker_env/bin:/home/sparc/cryosparc_worker/deps/anaconda/condabin:/home/sparc/cryosparc_master/bin:/home/sparc/cryosparc_master/bin:/home/sparc/cryosparc_master/bin:/home/sparc/cryosparc_master/bin:/home/sparc/cryosparc_master/bin:/home/sparc/cryosparc_master/bin:/home/sparc/cryosparc_master/bin:/home/sparc/cryosparc_master/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
which nvcc
/usr/local/cuda/bin/nvcc
nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Jun__8_16:49:14_PDT_2022
Cuda compilation tools, release 11.7, V11.7.99
Build cuda_11.7.r11.7/compiler.31442593_0
python -c "import pycuda.driver; print(pycuda.driver.get_version())"
(11, 7, 0)
/sbin/ldconfig -p | grep -i cuda
libpcsamplingutil.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libpcsamplingutil.so
libnvrtc.so.11.2 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvrtc.so.11.2
libnvrtc.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvrtc.so
libnvrtc-builtins.so.11.7 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvrtc-builtins.so.11.7
libnvrtc-builtins.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvrtc-builtins.so
libnvperf_target.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvperf_target.so
libnvperf_host.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvperf_host.so
libnvjpeg.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg.so.11
libnvjpeg.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg.so
libnvblas.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvblas.so.11
libnvblas.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvblas.so
libnvToolsExt.so.1 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvToolsExt.so.1
libnvToolsExt.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnvToolsExt.so
libnpps.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnpps.so.11
libnpps.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnpps.so
libnppitc.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppitc.so.11
libnppitc.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppitc.so
libnppisu.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppisu.so.11
libnppisu.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppisu.so
libnppist.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppist.so.11
libnppist.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppist.so
libnppim.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppim.so.11
libnppim.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppim.so
libnppig.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppig.so.11
libnppig.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppig.so
libnppif.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.11
libnppif.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so
libnppidei.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppidei.so.11
libnppidei.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppidei.so
libnppicc.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppicc.so.11
libnppicc.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppicc.so
libnppial.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppial.so.11
libnppial.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppial.so
libnppc.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppc.so.11
libnppc.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libnppc.so
libicudata.so.70 (libc6,x86-64) => /lib/x86_64-linux-gnu/libicudata.so.70
libicudata.so.70 (ELF) => /lib/i386-linux-gnu/libicudata.so.70
libcusparse.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcusparse.so.11
libcusparse.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcusparse.so
libcusolverMg.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcusolverMg.so.11
libcusolverMg.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcusolverMg.so
libcusolver.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcusolver.so.11
libcusolver.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcusolver.so
libcurand.so.10 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcurand.so.10
libcurand.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcurand.so
libcupti.so.11.7 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcupti.so.11.7
libcupti.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcupti.so
libcuinj64.so.11.7 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcuinj64.so.11.7
libcuinj64.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcuinj64.so
libcufile_rdma.so.1 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcufile_rdma.so.1
libcufile_rdma.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcufile_rdma.so
libcufile.so.0 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcufile.so.0
libcufile.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcufile.so
libcufftw.so.10 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcufftw.so.10
libcufftw.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcufftw.so
libcufft.so.10 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcufft.so.10
libcufft.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcufft.so
libcudart.so.11.0 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.11.0
libcudart.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcudart.so
libcudadebugger.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcudadebugger.so.1
libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
libcuda.so.1 (libc6) => /lib/i386-linux-gnu/libcuda.so.1
libcuda.so (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so
libcuda.so (libc6) => /lib/i386-linux-gnu/libcuda.so
libcublasLt.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so.11
libcublasLt.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so
libcublas.so.11 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.11
libcublas.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so
libcheckpoint.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libcheckpoint.so
libaccinj64.so.11.7 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libaccinj64.so.11.7
libaccinj64.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libaccinj64.so
libOpenCL.so.1 (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libOpenCL.so.1
libOpenCL.so (libc6,x86-64) => /usr/local/cuda/targets/x86_64-linux/lib/libOpenCL.so
uname -a
Linux grizzly 6.2.0-33-generic #33~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Sep 7 10:33:52 UTC 2 x86_64 x86_64 x86_64 GNU/Linux
free -g
total used free shared buff/cache available
Mem: 376 22 16 1 337 350
Swap: 7 1 6
nvidia-smi
Thu Oct 19 16:38:02 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12 Driver Version: 535.104.12 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 2080 Ti On | 00000000:1B:00.0 Off | N/A |
| 27% 38C P8 20W / 250W | 828MiB / 11264MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce RTX 2080 Ti On | 00000000:1C:00.0 Off | N/A |
| 32% 39C P2 71W / 250W | 296MiB / 11264MiB | 7% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 2 NVIDIA GeForce RTX 2080 Ti On | 00000000:1D:00.0 Off | N/A |
| 30% 33C P8 2W / 250W | 8MiB / 11264MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 3 NVIDIA GeForce RTX 2080 Ti On | 00000000:1E:00.0 Off | N/A |
| 31% 34C P8 3W / 250W | 8MiB / 11264MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 4 NVIDIA GeForce RTX 2080 Ti On | 00000000:3D:00.0 Off | N/A |
| 30% 28C P8 14W / 250W | 8MiB / 11264MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 5 NVIDIA GeForce RTX 2080 Ti On | 00000000:3F:00.0 Off | N/A |
| 30% 29C P8 5W / 250W | 8MiB / 11264MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 6 NVIDIA GeForce RTX 2080 Ti On | 00000000:40:00.0 Off | N/A |
| 30% 29C P8 18W / 250W | 8MiB / 11264MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 7 NVIDIA GeForce RTX 2080 Ti On | 00000000:41:00.0 Off | N/A |
| 30% 30C P8 15W / 250W | 8MiB / 11264MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1951 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 20896 C python 818MiB |
| 1 N/A N/A 1951 G /usr/lib/xorg/Xorg 4MiB |
| 1 N/A N/A 21045 C python 284MiB |
| 2 N/A N/A 1951 G /usr/lib/xorg/Xorg 4MiB |
| 3 N/A N/A 1951 G /usr/lib/xorg/Xorg 4MiB |
| 4 N/A N/A 1951 G /usr/lib/xorg/Xorg 4MiB |
| 5 N/A N/A 1951 G /usr/lib/xorg/Xorg 4MiB |
| 6 N/A N/A 1951 G /usr/lib/xorg/Xorg 4MiB |
| 7 N/A N/A 1951 G /usr/lib/xorg/Xorg 4MiB |
+---------------------------------------------------------------------------------------+
I ran two identical NU-refine jobs. One finished normally (Total time 3175.85s).
The other ran for 2.5 hours and then stopped producing output, although it is still listed as running.
No error messages were reported.
ps -ef | grep sparc | grep J94
sparc 20879 3546 0 09:36 ? 00:00:00 bash /home/sparc/cryosparc_worker/bin/cryosparcw run --project P3 --job J94 --master_hostname grizzly.mskcc.org --master_command_core_port 39002
sparc 20895 20879 0 09:36 ? 00:00:48 python -c import cryosparc_compute.run as run; run.run() --project P3 --job J94 --master_hostname grizzly.mskcc.org --master_command_core_port 39002
sparc 20896 20895 99 09:36 ? 09:04:01 python -c import cryosparc_compute.run as run; run.run() --project P3 --job J94 --master_hostname grizzly.mskcc.org --master_command_core_port 39002
top
20896 sparc 20 0 50.2g 13.2g 414776 R 100.3 3.5 484:02.84 python
See the nvidia-smi output above.
The problem seems to occur with any type of job. After a reboot, jobs run normally at first, then slow down or stall with no error messages. For testing, I ran two identical jobs.
Please advise.
Thank you,
Yehuda