Our MongoDB instance had grown to over 1.4TB so I decided to take a closer look. Worked out what I think is going on… lots of images that seem destined for the UI go into the DB, but not necessarily all; some seemed to be actively created on-the-fly from .cs and .mrc files? In any case, I found quite a few GridFS files that seem to get orphaned from live sessions as a result of getting updated during the course of processing. I wrote the following to go through and clean up live sessions, and was hoping someone from the dev team might comment on whether I missed any active references… I was also hoping someone on the dev team might comment on actively caching all the generated images in the DB as png files rather than generating them on-the-fly — that would make it easier to “freeze” a live session so that it remains actively browseable after all/most of the source data has been removed.
def get(obj, key, default):
    """Return obj[key], falling back to *default*.

    Falls back when *obj* is None (e.g. a failed ``find_one`` lookup),
    when *key* is absent, or when the stored value is None.

    Args:
        obj: a mapping (Mongo document) or None.
        key: the key to look up.
        default: value returned when no usable value is stored.
    """
    # Guard against obj being None: find_one() returns None when no
    # document matches, and `key in None` would raise TypeError.
    if obj is None or key not in obj:
        return default
    value = obj[key]
    # A stored None is treated the same as a missing key.
    return default if value is None else value
def session_delete_unreferenced_gridfs_files(pid, sid, delete=False):
    """Report (and optionally delete) GridFS files of a live session that are
    no longer referenced by the session document or any of its exposures.

    Args:
        pid: project uid used in the Mongo query.
        sid: session uid used in the Mongo query.
        delete: when True, actually delete the unreferenced files;
            otherwise only print a size report (dry run).
    """
    query = {'project_uid': pid, 'session_uid': sid}
    session = db.workspaces.find_one(query)
    if session is None:
        # No matching session document — bail out instead of crashing on
        # the lookups below.
        print(f'{query} matched no session document')
        return
    # Collect every GridFS id string still referenced by the session
    # document itself...
    gridfs_str_oids = []
    for info_key in ('phase2_class2D_info', 'phase2_abinit_info',
                     'template_creation_info'):
        gridfs_str_oids += [x['fileid'] for x in get(session, info_key, [])]
    # ...plus the preview images referenced by each exposure.
    for exposure in db.exposures.find(query):
        gridfs_str_oids += get(exposure, 'preview_img_1x', [])
        gridfs_str_oids += get(exposure, 'preview_img_2x', [])
    referenced_gridfs_oids = {ObjectId(x) for x in gridfs_str_oids}
    # NOTE(review): GridFS files are tagged with 'job_uid' rather than
    # 'session_uid' — assumed intentional; confirm against the schema.
    dbfs = gridfs.GridFS(db)
    to_delete = []
    total_size = 0
    unreferenced_size = 0
    for dbfile in dbfs.find({'project_uid': pid, 'job_uid': sid}):
        total_size += dbfile.length
        if dbfile._id in referenced_gridfs_oids:
            continue
        unreferenced_size += dbfile.length
        to_delete.append(dbfile)
    print(f'{query} is {hsize(total_size)} and {hsize(unreferenced_size)} is unreferenced')
    if delete:
        for dbfile in to_delete:
            dbfs.delete(dbfile._id)
def hsize(bytes):
    """Format a byte count as a human-readable string (e.g. '1.4TB').

    Uses binary (1024-based) units; counts of 1024 or less in the
    smallest matching unit are printed as plain bytes.
    """
    units = (
        ('TB', 1024 ** 4),
        ('GB', 1024 ** 3),
        ('MB', 1024 ** 2),
        ('KB', 1024),
    )
    for suffix, factor in units:
        if bytes > factor:
            return f'{bytes / factor:.1f}{suffix}'
    return f'{bytes}B'