Assign custom half-sets?

Hi,

Is there any way (maybe CS tools?) to assign custom half-sets for particles?

I can assign custom half maps using the low level results interface, but would like (for a custom workflow) to assign custom half-sets (after treating two halves of the data independently in ab initio/local refinement, for example).

Using custom half maps works fine for just calculating a straight FSC, but it does not support orientation diagnostics, local refinement, etc.

Cheers
Oli

EDIT: Ignore these and use the one Rich has linked to below!

Hacked-together ChatGPT solution for this - I have done some testing and it seems to give sensible results (after quite a few iterations that did not). I still need to pull it apart and do more extensive testing, but it is a starting point:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Combine two particle sets and enforce half-sets by overwriting an EXISTING
column CryoSPARC v4.x actually uses. Priority:

  1) alignments3D/split   (u4 on v4.x; used by Reconstruct Only)
  2) alignments3D/halfset
  3) split  (top-level)

The script:
  • loads two particle outputs
  • finds the first existing column above (with its original dtype)
  • concatenates A+B
  • overwrites that column to A->0 and B->1
  • if the other two names exist in either input, mirrors the same 0/1 values
    using their own original dtypes
  • declares only blob/ctf/alignments3D slots
  • reloads saved output and reports counts for alignments3D/split if present,
    else for the canonical column it used
"""

from pathlib import Path
import json
import argparse
import numpy as np

# Robust imports for v4/v5
try:
    from cryosparc.tools import CryoSPARC, Dataset, APIError
except Exception:
    from cryosparc.tools import CryoSPARC, Dataset
    try:
        from cryosparc.errors import APIError
    except Exception:
        class APIError(Exception): pass

# ---------------- CLI ----------------
p = argparse.ArgumentParser(
    description="Combine two CryoSPARC particle sets and set half-sets (A->0, B->1) by overwriting an existing column."
)
p.add_argument("--project","-p",required=True)
p.add_argument("--workspace","-w",required=True)
p.add_argument("--jobA","-a",required=True)
p.add_argument("--jobB","-b",required=True)
p.add_argument("--outputA","-oa",default=None)
p.add_argument("--outputB","-ob",default=None)
p.add_argument("--title","-t",default=None)
p.add_argument("--instance-info","-i",default=str(Path.home()/ "instance_info.json"))
p.add_argument("--allow-mixed-psize",action="store_true")
args = p.parse_args()

print(">> combine_halfsets_align3d_split_fixed starting...")

# ---------------- Connect ----------------
data = json.loads(Path(args.instance_info).read_text())
cs = CryoSPARC(**data)
if not cs.test_connection():
    raise RuntimeError("Could not connect to CryoSPARC")

project = cs.find_project(args.project)
if project is None:
    raise RuntimeError(f"Project not found: {args.project}")
jobA = project.find_job(args.jobA)
jobB = project.find_job(args.jobB)
if jobA is None or jobB is None:
    raise RuntimeError("Could not find one or both jobs in the project")

# ---------------- Load particle outputs ----------------
COMMON = ["particles","particles_selected","particles_keep","particles_out","particles_split"]

def try_load(j, nm):
    try:
        ds = j.load_output(nm)
        if ds is None or len(ds)==0: return None
        if "blob/path" not in ds.fields(): return None
        return ds
    except Exception:
        return None

def autodetect(j, forced=None):
    if forced:
        ds = try_load(j, forced)
        if ds is None: raise RuntimeError(f"{j.uid}: cannot load '{forced}'")
        return forced, ds
    for nm in COMMON:
        ds = try_load(j, nm)
        if ds is not None: return nm, ds
    raise RuntimeError(f"{j.uid}: no usable particle output among {COMMON}")

outA_name, A = autodetect(jobA, args.outputA)
outB_name, B = autodetect(jobB, args.outputB)
nA, nB = len(A), len(B)
print(f"Loaded: {jobA.uid}:{outA_name} n={nA:,} | {jobB.uid}:{outB_name} n={nB:,}")
if nA==0 or nB==0:
    raise RuntimeError("One of the inputs is empty.")

fA, fB = set(A.fields()), set(B.fields())

# Required fields
req = {"blob/path","blob/psize_A","alignments3D/pose","alignments3D/shift"}
missA = req - fA; missB = req - fB
if missA or missB:
    raise RuntimeError(f"Missing required fields. A:{sorted(missA)}  B:{sorted(missB)}")

# Pixel size consistency
if not args.allow_mixed_psize:
    uA = np.unique(np.asarray(A["blob/psize_A"]).ravel())
    uB = np.unique(np.asarray(B["blob/psize_A"]).ravel())
    if len(uA)!=1 or len(uB)!=1 or uA[0]!=uB[0]:
        raise RuntimeError(f"Pixel sizes differ (A:{uA}, B:{uB}). Re-extract/rebin or use --allow-mixed-psize.")

# ---------------- Choose canonical column ----------------
pref_order = ["alignments3D/split", "alignments3D/halfset", "split"]  # v4 uses alignments3D/split
canon_name = None
canon_dtype = None

for nm in pref_order:
    if nm in fA:
        canon_name = nm; canon_dtype = A[nm].dtype; break
    if nm in fB:
        canon_name = nm; canon_dtype = B[nm].dtype; break

if canon_name is None:
    raise RuntimeError(
        "No existing half-set column found ('alignments3D/split', 'alignments3D/halfset', or 'split'). "
        "On v4.x, Reconstruct expects 'alignments3D/split' in the alignments3D group."
    )

print(f"Canonical column: '{canon_name}'  dtype={canon_dtype}")

# ---------------- Build combined dataset ----------------
def tail(arr):
    try: return tuple(arr.shape[1:])
    except Exception: return ()

def empty_like(field, ref, n):
    tsh = tail(ref)
    try: dt = ref.dtype
    except Exception: dt = np.float32
    if np.issubdtype(getattr(dt,"type",dt), np.floating):
        return np.full((n,)+tsh, 0.0, dtype=dt)
    if np.issubdtype(getattr(dt,"type",dt), np.integer):
        return np.zeros((n,)+tsh, dtype=dt)
    arr = np.empty((n,)+tsh, dtype=dt)
    try: arr[...] = None
    except Exception:
        arr = np.empty((n,)+tsh, dtype=object); arr[...] = None
    return arr

def empty_for(field, refds, n):
    if field in refds.fields():
        return empty_like(field, refds[field], n)
    return np.array([None]*n, dtype=object)

# ensure blob/idx
idxA = A["blob/idx"] if "blob/idx" in fA else np.zeros(nA, dtype=np.int64)
idxB = B["blob/idx"] if "blob/idx" in fB else np.zeros(nB, dtype=np.int64)

# union of fields; we'll overwrite split fields after concat
union = sorted(fA.union(fB))
cols = {}
for f in union:
    colA = A[f] if f in fA else empty_for(f, B, nA)
    colB = B[f] if f in fB else empty_for(f, A, nB)
    if f=="blob/idx": colA,colB = idxA,idxB
    if tail(colA)!=tail(colB):
        continue
    cols[f] = np.concatenate([np.asarray(colA), np.asarray(colB)], axis=0)

N = nA + nB

# ---------------- Overwrite canonical column to A->0, B->1 ----------------
new_values = np.empty(N, dtype=canon_dtype)
new_values[:nA] = 0
new_values[nA:] = 1
cols[canon_name] = new_values

# If the other names exist in either input, mirror values there too (with THEIR dtypes)
other_names = [nm for nm in pref_order if nm != canon_name]
for nm in other_names:
    if nm in (fA | fB):
        dt = (A[nm].dtype if nm in fA else B[nm].dtype)
        tmp = np.empty(N, dtype=dt)
        tmp[:nA]=0; tmp[nA:]=1
        cols[nm] = tmp

combined = Dataset(sorted(cols.items(), key=lambda kv: kv[0]))

# ---------------- Sanity check ----------------
def counts(arr):
    u,c = np.unique(arr, return_counts=True)
    return {int(k): int(v) for k,v in zip(u,c)}

if "alignments3D/split" in combined.fields():
    c_pre = counts(combined["alignments3D/split"])
    print(f"alignments3D/split counts (pre-save): {c_pre} (expected 0:{nA}, 1:{nB})")
else:
    c_pre = counts(combined[canon_name])
    print(f"{canon_name} counts (pre-save): {c_pre} (expected 0:{nA}, 1:{nB})")

# ---------------- External job save ----------------
title = args.title or f"Combined half-sets from {jobA.uid} ({outA_name}) + {jobB.uid} ({outB_name})"
ej = project.create_external_job(workspace_uid=args.workspace, title=title)

# minimal slots
try:
    ver = cs.version(); major = int(str(ver).lstrip('v').split('.')[0])
except Exception:
    major = 4
IS_V5 = major >= 5
if IS_V5:
    slots_in  = [{"name":"blob","dtype":"particle_blob"},
                 {"name":"ctf","dtype":"ctf"},
                 {"name":"alignments3D","dtype":"alignments3D"}]
    slots_out = [{"name":"blob","dtype":"particle_blob"},
                 {"name":"ctf","dtype":"ctf"},
                 {"name":"alignments3D","dtype":"alignments3D"}]
else:
    slots_in  = ["blob","ctf","alignments3D"]
    slots_out = ["blob","ctf","alignments3D"]

ej.add_input(type="particle", name="input_particles_A", slots=slots_in)
ej.add_input(type="particle", name="input_particles_B", slots=slots_in)
ej.connect(target_input="input_particles_A", source_job_uid=jobA.uid, source_output=outA_name)
ej.connect(target_input="input_particles_B", source_job_uid=jobB.uid, source_output=outB_name)
ej.add_output(type="particle", name="particles", slots=slots_out, alloc=combined)

with ej.run():
    ej.log(f"Canonical column overwritten: '{canon_name}' -> A:0, B:1 (dtype {canon_dtype})")
    if "alignments3D/split" in combined.fields():
        ej.log(f"alignments3D/split counts (pre-save): {c_pre}")
    else:
        ej.log(f"{canon_name} counts (pre-save): {c_pre}")
    ej.save_output("particles", combined)

ej.stop()

# ---------------- Verify after reload ----------------
try:
    saved = project.find_job(ej.uid).load_output("particles")
    if "alignments3D/split" in saved.fields():
        c_post = counts(saved["alignments3D/split"])
        print(f"[VERIFY] alignments3D/split counts (post-save): {c_post}")
    else:
        c_post = counts(saved[canon_name])
        print(f"[VERIFY] {canon_name} counts (post-save): {c_post}")
except Exception as e:
    print(f"[VERIFY] Reload failed: {e}")

print(f"Created External Job: '{title}'")
print("Done.")

EDIT2:
Simpler version with unnecessary cruft removed:

#!/usr/bin/env python3
# Combine two particle sets and enforce half-sets via alignments3D/split (A->0, B->1).
# Works with CryoSPARC v4.7x and v5. Minimal, dtype-agnostic (reuses input dtype).

from pathlib import Path
import json
import argparse
import numpy as np

try:
    from cryosparc.tools import CryoSPARC, Dataset, APIError
except Exception:
    from cryosparc.tools import CryoSPARC, Dataset
    try:
        from cryosparc.errors import APIError
    except Exception:
        class APIError(Exception): pass

CANDIDATE_OUTPUTS = ("particles", "particles_selected", "particles_keep", "particles_out", "particles_split")
WANTED_PREFIXES   = ("blob/", "ctf/", "alignments3D/")

def autodetect_particles(job):
    for nm in CANDIDATE_OUTPUTS:
        try:
            ds = job.load_output(nm)
            if ds and len(ds) and "blob/path" in ds.fields():
                return nm, ds
        except Exception:
            pass
    raise RuntimeError(f"{job.uid}: no usable particle output found.")

def trailing_shape(arr):
    try: return arr.shape[1:]
    except Exception: return ()

def empty_like(ref, n):
    tsh = trailing_shape(ref)
    dt  = getattr(ref, "dtype", object)
    # integer/unsigned/float columns get zeros in the original dtype; anything else becomes an object column of None
    if hasattr(dt, "kind") and dt.kind in "iuf":
        return np.zeros((n,)+tsh, dtype=dt)
    out = np.empty((n,)+tsh, dtype=object); out[...] = None
    return out

def empty_for(field, refds, n):
    return empty_like(refds[field], n) if field in refds.fields() else np.array([None]*n, dtype=object)

def counts(arr):
    u, c = np.unique(arr, return_counts=True)
    return {int(k): int(v) for k, v in zip(u, c)}

# ---------------- CLI ----------------
p = argparse.ArgumentParser(description="Combine two CryoSPARC particle sets and enforce half-sets via alignments3D/split.")
p.add_argument("--project","-p", required=True)
p.add_argument("--workspace","-w", required=True)
p.add_argument("--jobA","-a", required=True, help="First job (becomes half-set 0)")
p.add_argument("--jobB","-b", required=True, help="Second job (becomes half-set 1)")
p.add_argument("--title","-t", default=None)
p.add_argument("--instance-info","-i", default=str(Path.home()/ "instance_info.json"))
args = p.parse_args()

print(">> combine_halfsets_min starting...")

# Connect
cfg = json.loads(Path(args.instance_info).read_text())
cs  = CryoSPARC(**cfg)
if not cs.test_connection(): raise RuntimeError("Could not connect to CryoSPARC")

project = cs.find_project(args.project)
if project is None: raise RuntimeError(f"Project not found: {args.project}")
jobA = project.find_job(args.jobA)
if jobA is None: raise RuntimeError(f"Job not found: {args.jobA}")
jobB = project.find_job(args.jobB)
if jobB is None: raise RuntimeError(f"Job not found: {args.jobB}")

# Load inputs
outA_name, A = autodetect_particles(jobA)
outB_name, B = autodetect_particles(jobB)
nA, nB = len(A), len(B)
print(f"Loaded: {jobA.uid}:{outA_name} n={nA:,} | {jobB.uid}:{outB_name} n={nB:,}")
if nA == 0 or nB == 0: raise RuntimeError("One input is empty.")

fA, fB = set(A.fields()), set(B.fields())

# Sanity fields
req = {"blob/path", "blob/psize_A", "alignments3D/pose", "alignments3D/shift"}
missA, missB = req - fA, req - fB
if missA or missB:
    raise RuntimeError(f"Missing required fields. A:{sorted(missA)} B:{sorted(missB)}")

# Strict pixel-size equality
uA = np.unique(np.asarray(A["blob/psize_A"]).ravel())
uB = np.unique(np.asarray(B["blob/psize_A"]).ravel())
if len(uA) != 1: raise RuntimeError(f"Set A has mixed pixel sizes: {uA}")
if len(uB) != 1: raise RuntimeError(f"Set B has mixed pixel sizes: {uB}")
if uA[0] != uB[0]: raise RuntimeError(f"Pixel sizes differ (A:{uA[0]}, B:{uB[0]}).")

# Require alignments3D/split in at least one input
if "alignments3D/split" not in (fA | fB):
    raise RuntimeError("Neither input has 'alignments3D/split'.")

split_dtype = (A["alignments3D/split"].dtype if "alignments3D/split" in fA else B["alignments3D/split"].dtype)

# Build combined table with only wanted groups and compatible trailing shapes
keepA = [f for f in fA if f.startswith(WANTED_PREFIXES)]
keepB = [f for f in fB if f.startswith(WANTED_PREFIXES)]
union = sorted(set(keepA).union(keepB))

# Ensure blob/idx exists on both sides
idxA = A["blob/idx"] if "blob/idx" in fA else np.zeros(nA, dtype=np.int64)
idxB = B["blob/idx"] if "blob/idx" in fB else np.zeros(nB, dtype=np.int64)

cols = {}
for f in union:
    colA = A[f] if f in fA else empty_for(f, B, nA)
    colB = B[f] if f in fB else empty_for(f, A, nB)
    if f == "blob/idx": colA, colB = idxA, idxB
    if trailing_shape(colA) != trailing_shape(colB):
        continue
    cols[f] = np.concatenate([np.asarray(colA), np.asarray(colB)], axis=0)

# Overwrite alignments3D/split to A->0, B->1 using the existing dtype
N = nA + nB
new_split = np.empty(N, dtype=split_dtype)
new_split[:nA] = 0
new_split[nA:] = 1
cols["alignments3D/split"] = new_split

combined = Dataset(sorted(cols.items(), key=lambda kv: kv[0]))

report = counts(combined["alignments3D/split"])
print(f"alignments3D/split (pre-save): {report}  (expected 0:{nA}, 1:{nB})")

# External job
title = args.title or f"Combined half-sets from {jobA.uid} ({outA_name}) + {jobB.uid} ({outB_name})"
ej = project.create_external_job(workspace_uid=args.workspace, title=title)

try:
    major = int(str(cs.version()).lstrip("v").split(".")[0])
except Exception:
    major = 4
if major >= 5:
    in_slots  = [{"name":"blob","dtype":"particle_blob"},
                 {"name":"ctf","dtype":"ctf"},
                 {"name":"alignments3D","dtype":"alignments3D"}]
    out_slots = [{"name":"blob","dtype":"particle_blob"},
                 {"name":"ctf","dtype":"ctf"},
                 {"name":"alignments3D","dtype":"alignments3D"}]
else:
    in_slots, out_slots = ["blob","ctf","alignments3D"], ["blob","ctf","alignments3D"]

ej.add_input(type="particle", name="input_particles_A", slots=in_slots)
ej.add_input(type="particle", name="input_particles_B", slots=in_slots)
ej.connect(target_input="input_particles_A", source_job_uid=jobA.uid, source_output=outA_name)
ej.connect(target_input="input_particles_B", source_job_uid=jobB.uid, source_output=outB_name)
ej.add_output(type="particle", name="particles", slots=out_slots, alloc=combined)

with ej.run():
    ej.log(f"Overwrote alignments3D/split → A:0 (n={nA:,}), B:1 (n={nB:,})")
    ej.log(f"alignments3D/split counts (pre-save): {report}")
    ej.save_output("particles", combined)
ej.stop()

# Verify
try:
    saved = project.find_job(ej.uid).load_output("particles")
    post = counts(saved["alignments3D/split"])
    print(f"[VERIFY] alignments3D/split (post-save): {post}")
except Exception as e:
    print(f"[VERIFY] reload failed: {e}")

print(f"Created External Job: '{title}'")
print("Done.")

Hi @olibclarke! I think we’ve actually discussed something similar previously 🙂. There’s a sample script that’s a bit simpler than ChatGPT’s solution in this post if you’d like to compare.


Oh great, that is much better - thanks Rich!