The computer on which the CryoSPARC master processes crash:
- also acts as a CryoSPARC worker for this CryoSPARC master
- also acts as a CryoSPARC master for other CryoSPARC workers.
cryosparcm status | grep -v LICENSE
----------------------------------------------------------------------------
CryoSPARC System master node installed at
/home/em7/Software/cryosparc/cryosparc_master
Current cryoSPARC version: v4.4.1+240110
----------------------------------------------------------------------------
CryoSPARC process status:
app RUNNING pid 2403428, uptime 7:09:28
app_api RUNNING pid 2403447, uptime 7:09:27
app_api_dev STOPPED Not started
command_core RUNNING pid 2403331, uptime 7:09:38
command_rtp RUNNING pid 2403389, uptime 7:09:30
command_vis RUNNING pid 2403363, uptime 7:09:31
database RUNNING pid 2403220, uptime 7:09:41
----------------------------------------------------------------------------
License is valid
----------------------------------------------------------------------------
global config variables:
export CRYOSPARC_MASTER_HOSTNAME="em7"
export CRYOSPARC_DB_PATH="/home/em7/Software/cryosparc/cryosparc_database"
export CRYOSPARC_BASE_PORT=39000
export CRYOSPARC_DB_CONNECTION_TIMEOUT_MS=20000
export CRYOSPARC_INSECURE=false
export CRYOSPARC_DB_ENABLE_AUTH=true
export CRYOSPARC_CLUSTER_JOB_MONITOR_INTERVAL=10
export CRYOSPARC_CLUSTER_JOB_MONITOR_MAX_RETRIES=1000000
export CRYOSPARC_PROJECT_DIR_PREFIX='CS-'
export CRYOSPARC_DEVELOP=false
export CRYOSPARC_CLICK_WRAP=true
cryosparcm cli "get_scheduler_targets()"
[{'cache_path': '/media/raid0/', 'cache_quota_mb': None, 'cache_reserve_mb': 10000, 'desc': None, 'gpus': [{'id': 0, 'mem': 25383469056, 'name': 'NVIDIA GeForce RTX 4090'}, {'id': 1, 'mem': 25386352640, 'name': 'NVIDIA GeForce RTX 4090'}], 'hostname': 'em7', 'lane': 'default', 'monitor_port': None, 'name': 'em7', 'resource_fixed': {'SSD': True}, 'resource_slots': {'CPU': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47], 'GPU': [0, 1], 'RAM': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]}, 'ssh_str': 'em7@em7', 'title': 'Worker node em7', 'type': 'node', 'worker_bin_path': '/home/em7/Software/cryosparc/cryosparc_worker/bin/cryosparcw'}, {'cache_path': '/media/hrishi/scratch2/', 'cache_quota_mb': None, 'cache_reserve_mb': 10000, 'desc': None, 'gpus': [{'id': 0, 'mem': 11536039936, 'name': 'NVIDIA GeForce RTX 2080 Ti'}, {'id': 1, 'mem': 11539054592, 'name': 'NVIDIA GeForce RTX 2080 Ti'}], 'hostname': 'em1', 'lane': 'em1', 'monitor_port': None, 'name': 'em1', 'resource_fixed': {'SSD': True}, 'resource_slots': {'CPU': [0, 1, 2, 3, 4, 5, 6, 7], 'GPU': [0, 1], 'RAM': [0, 1, 2, 3, 4, 5, 6, 7]}, 'ssh_str': 'hrishi@em1', 'title': 'Worker node em1', 'type': 'node', 'worker_bin_path': '/home/hrishi/Software/cryosparc/cryosparc_worker/bin/cryosparcw'}, {'cache_path': '/media/scratch/', 'cache_quota_mb': None, 'cache_reserve_mb': 10000, 'desc': None, 'gpus': [{'id': 0, 'mem': 16900292608, 'name': 'Quadro RTX 5000'}, {'id': 1, 'mem': 16891707392, 'name': 'Quadro RTX 5000'}], 'hostname': 'em4', 'lane': 'em4', 'monitor_port': None, 'name': 'em4', 'resource_fixed': {'SSD': True}, 'resource_slots': {'CPU': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27], 'GPU': [0, 1], 'RAM': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]}, 'ssh_str': 'hrishi@em4', 'title': 'Worker node em4', 'type': 'node', 'worker_bin_path': '/home/hrishi/Software/cryosparc_worker/bin/cryosparcw'}, {'cache_path': '/media/scratch/', 'cache_quota_mb': None, 'cache_reserve_mb': 10000, 'desc': None, 'gpus': [{'id': 0, 'mem': 50962300928, 'name': 'Quadro RTX 8000'}, {'id': 1, 'mem': 50962300928, 'name': 'Quadro RTX 8000'}], 'hostname': 'preprocess1', 'lane': 'preprocess1', 'monitor_port': None, 'name': 'preprocess1', 'resource_fixed': {'SSD': True}, 'resource_slots': {'CPU': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71], 'GPU': [0, 1], 'RAM': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]}, 'ssh_str': 'hallberg@preprocess1', 'title': 'Worker node preprocess1', 'type': 'node', 'worker_bin_path': '/home/hallberg/software/cryosparc/cryosparc_worker/bin/cryosparcw'}, {'cache_path': '/media/em6/scratch/', 'cache_quota_mb': None, 'cache_reserve_mb': 10000, 'desc': None, 'gpus': [{'id': 0, 'mem': 21116682240, 'name': 'NVIDIA RTX 4000 Ada Generation'}, {'id': 1, 'mem': 21125267456, 'name': 'NVIDIA RTX 4000 Ada Generation'}], 'hostname': 'em6-1', 'lane': 'em6-1', 'monitor_port': None, 'name': 'em6-1', 'resource_fixed': {'SSD': True}, 'resource_slots': {'CPU': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], 'GPU': [0, 1], 'RAM': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]}, 'ssh_str': 'em6@em6-1', 'title': 'Worker node 
em6-1', 'type': 'node', 'worker_bin_path': '/home/em6/cryosparc/cryosparc_worker/bin/cryosparcw'}, {'cache_path': '/media/em5/scratch/', 'cache_quota_mb': None, 'cache_reserve_mb': 10000, 'desc': None, 'gpus': [{'id': 0, 'mem': 21116682240, 'name': 'NVIDIA RTX 4000 Ada Generation'}, {'id': 1, 'mem': 21125267456, 'name': 'NVIDIA RTX 4000 Ada Generation'}], 'hostname': 'em5', 'lane': 'em5', 'monitor_port': None, 'name': 'em5', 'resource_fixed': {'SSD': True}, 'resource_slots': {'CPU': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], 'GPU': [0, 1], 'RAM': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]}, 'ssh_str': 'em5@em5', 'title': 'Worker node em5', 'type': 'node', 'worker_bin_path': '/home/em5/Software/cryosprac/cryosparc_worker/bin/cryosparcw'}]
Previously most workstations were configured to run master and worker at the same time. In this configuration they were not connected to another worker or master on a different workstation. This caused the same issues of unpredictable crashes on each of the workstations.
Another setup that crashed quite a while ago; in this case, master and worker are on the same computer:
cryosparcm status | grep -v LICENSE
----------------------------------------------------------------------------
CryoSPARC System master node installed at
/home/supervisor/cryosparc/cryosparc_master
Current cryoSPARC version: v4.4.1
----------------------------------------------------------------------------
CryoSPARC process status:
app RUNNING pid 3909865, uptime 43 days, 5:56:05
app_api RUNNING pid 3909929, uptime 43 days, 5:56:03
app_api_dev STOPPED Not started
command_core RUNNING pid 3909514, uptime 43 days, 5:56:15
command_rtp RUNNING pid 3909721, uptime 43 days, 5:56:08
command_vis RUNNING pid 3909663, uptime 43 days, 5:56:09
database RUNNING pid 3909316, uptime 43 days, 5:56:18
----------------------------------------------------------------------------
License is valid
----------------------------------------------------------------------------
global config variables:
export CRYOSPARC_MASTER_HOSTNAME="3DEM-Workstation"
export CRYOSPARC_DB_PATH="/home/supervisor/cryosparc/cryosparc_database"
export CRYOSPARC_BASE_PORT=39000
export CRYOSPARC_DB_CONNECTION_TIMEOUT_MS=20000
export CRYOSPARC_INSECURE=false
export CRYOSPARC_DB_ENABLE_AUTH=true
export CRYOSPARC_CLUSTER_JOB_MONITOR_INTERVAL=10
export CRYOSPARC_CLUSTER_JOB_MONITOR_MAX_RETRIES=1000000
export CRYOSPARC_PROJECT_DIR_PREFIX='CS-'
export CRYOSPARC_DEVELOP=false
export CRYOSPARC_CLICK_WRAP=true
cryosparcm cli "get_scheduler_targets()"
[{'cache_path': '/media/supervisor/DATA/cryosparc_cache', 'cache_quota_mb': None, 'cache_reserve_mb': 10000, 'desc': None, 'gpus': [{'id': 0, 'mem': 12630294528, 'name': 'NVIDIA GeForce RTX 3080 Ti'}, {'id': 1, 'mem': 12630294528, 'name': 'NVIDIA GeForce RTX 3080 Ti'}, {'id': 2, 'mem': 12624723968, 'name': 'NVIDIA GeForce RTX 3080 Ti'}, {'id': 3, 'mem': 12630294528, 'name': 'NVIDIA GeForce RTX 3080 Ti'}], 'hostname': '3DEM-Workstation', 'lane': 'default', 'monitor_port': None, 'name': '3DEM-Workstation', 'resource_fixed': {'SSD': True}, 'resource_slots': {'CPU': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], 'GPU': [0, 1, 2, 3], 'RAM': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]}, 'ssh_str': 'supervisor@3DEM-Workstation', 'title': 'Worker node 3DEM-Workstation', 'type': 'node', 'worker_bin_path': '/home/supervisor/cryosparc/cryosparc_worker/bin/cryosparcw'}]