# Source code for desispec.scripts.zcatalog

#!/usr/bin/env python

"""
Combine individual redrock files into a single zcatalog

Stephen Bailey
Lawrence Berkeley National Lab
Fall 2015

substantially updated Fall 2023
"""

from __future__ import absolute_import, division, print_function

import sys, os, glob
import argparse
import importlib.resources
import multiprocessing as mp

import numpy as np
from numpy.lib.recfunctions import append_fields

import fitsio
from astropy.table import Table, hstack, vstack

from desiutil.log import get_logger
from desispec import io
from desispec.zcatalog import find_primary_spectra
from desispec.io.util import get_tempfilename, checkgzip, replace_prefix, write_bintable
from desispec.io.table import read_table
from desispec.coaddition import coadd_fibermap
from desispec.util import parse_keyval
from desiutil.annotate import load_csv_units
import desiutil.depend

def load_sv1_ivar_w12(hpix, targetids):
    """
    Load FLUX_IVAR_W1/W2 from sv1 target files for requested targetids

    Args:
        hpix (int): nside=8 nested healpix
        targetids (array): TARGETIDs to include

    Returns table of TARGETID, FLUX_IVAR_W1, FLUX_IVAR_W2.  TARGETIDs that
    are not found in any target file are omitted from the result (an error
    is logged with the number missing).  If no target files match at all,
    a zero-length table with the same three columns is returned.

    Requires the ``$DESI_TARGET`` environment variable (KeyError if unset).

    Note: this is only for the special case of sv1 dark/bright and the
    FLUX_IVAR_W1/W2 columns which were not included in fiberassign for
    tiles designed before 20201212.

    Note: nside=8 nested healpix is hardcoded for simplicity because that is
    what was used for sv1 target selection and this is not trying to be a
    more generic targetid lookup function.
    """
    log = get_logger()
    #- the targets could come from any version of desitarget, so search all,
    #- but once a TARGETID is found it will be the same answer (for FLUX_IVAR*)
    #- as any other version because it is propagated from the same dr9 input
    #- Tractor files.
    targetdir = os.path.join(os.environ['DESI_TARGET'], 'catalogs', 'dr9')
    fileglob = f'{targetdir}/*/targets/sv1/resolve/*/sv1targets-*-hp-{hpix}.fits'
    sv1targetfiles = sorted(glob.glob(fileglob))
    nfiles = len(sv1targetfiles)
    ntarg = len(np.unique(targetids))
    log.info(f'Searching {nfiles} sv1 target files for {ntarg} targets in nside=8 healpix={hpix}')
    columns = ['TARGETID', 'FLUX_IVAR_W1', 'FLUX_IVAR_W2']
    targets = list()
    found_targetids = list()
    for filename in sv1targetfiles:
        tx = fitsio.read(filename, 1, columns=columns)
        #- keep requested targets that were not already found in an earlier file
        keep = np.isin(tx['TARGETID'], targetids)
        keep &= ~np.isin(tx['TARGETID'], found_targetids)
        targets.append(tx[keep])
        found_targetids.extend(tx['TARGETID'][keep])

        #- stop reading files as soon as every requested target is found
        if np.all(np.isin(targetids, found_targetids)):
            break

    if targets:
        targets = np.hstack(targets)
    else:
        #- np.hstack raises ValueError on an empty list, so return an empty
        #- table instead; callers can still index it by column name.
        #- NOTE(review): dtypes below are assumed to match the target files;
        #- they only matter for this zero-length placeholder.
        log.error(f'No sv1 target files found matching {fileglob}')
        targets = np.zeros(0, dtype=[('TARGETID', 'i8'),
                                     ('FLUX_IVAR_W1', 'f4'),
                                     ('FLUX_IVAR_W2', 'f4')])

    missing = np.isin(targetids, targets['TARGETID'], invert=True)
    if np.any(missing):
        nmissing = np.sum(missing)
        log.error(f'{nmissing} TARGETIDs not found in sv1 healpix={hpix}')

    return targets

def _wrap_read_redrock(optdict):
    """
    read_redrock wrapper to expand dictionary of named args for multiprocessing

    Args:
        optdict (dict): keyword arguments to pass to read_redrock

    Returns whatever read_redrock(**optdict) returns
    """
    return read_redrock(**optdict)

def read_redrock(rrfile, group=None, recoadd_fibermap=False, minimal=False, pertile=False, counter=None):
    """
    Read a Redrock file, combining REDSHIFTS, FIBERMAP, and TSNR2 HDUs

    Args:
        rrfile (str): full path to redrock filename

    Options:
        group (str): add group-specific columns for perexp, pernight,
            cumulative, healpix
        recoadd_fibermap (bool): recoadd fibermap from spectra file in same dir
        minimal (bool): only propagate minimal subset of columns
        pertile (bool): input Redrock file is single tile (not healpix)
        counter (tuple): (i,n) log loading ith file out of n

    Returns (zcat, expfibermap) where zcat is a join of the redrock REDSHIFTS
    catalog and the coadded FIBERMAP (plus the TSNR2 columns unless
    ``minimal`` is set).

    Returns a bare None (not a tuple) if ``group`` is given and the file
    header has a SPGRP keyword that does not match it; callers must check
    for None before unpacking.
    """
    log = get_logger()
    if counter is not None:
        i, n = counter
        log.info(f'Reading {i}/{n} {rrfile}')
    else:
        log.info(f'Reading {rrfile}')

    with fitsio.FITS(rrfile) as fx:
        hdr = fx[0].read_header()
        if group is not None and 'SPGRP' in hdr and \
                hdr['SPGRP'] != group:
            log.warning("Skipping {} with SPGRP {} != group {}".format(
                rrfile, hdr['SPGRP'], group))
            return None

        redshifts = fx['REDSHIFTS'].read()

        if recoadd_fibermap:
            spectra_filename = checkgzip(replace_prefix(rrfile, 'redrock', 'spectra'))
            log.info('Recoadding fibermap from %s', os.path.basename(spectra_filename))
            fibermap_orig = read_table(spectra_filename)
            fibermap, expfibermap = coadd_fibermap(fibermap_orig, onetile=pertile)
        else:
            fibermap = Table(fx['FIBERMAP'].read())
            expfibermap = fx['EXP_FIBERMAP'].read()

        tsnr2 = fx['TSNR2'].read()

        #- the tables are joined row-by-row with hstack below, so they must
        #- already be row-matched on TARGETID
        assert np.all(redshifts['TARGETID'] == fibermap['TARGETID']), \
                f'REDSHIFTS and FIBERMAP TARGETIDs do not match in {rrfile}'
        assert np.all(redshifts['TARGETID'] == tsnr2['TARGETID']), \
                f'REDSHIFTS and TSNR2 TARGETIDs do not match in {rrfile}'

    if minimal:
        # basic set of target information
        fmcols = ['TARGET_RA', 'TARGET_DEC', 'FLUX_G', 'FLUX_R', 'FLUX_Z']

        # add targeting columns
        for colname in fibermap.dtype.names:
            if colname.endswith('_TARGET') and colname != 'FA_TARGET':
                fmcols.append(colname)

        # add columns needed for uniqueness that differ for healpix vs. tiles
        extracols = ['TILEID', 'LASTNIGHT', 'HEALPIX', 'SURVEY', 'PROGRAM']
        for colname in extracols:
            if colname in fibermap.dtype.names:
                fmcols.append(colname)

        # NIGHT header -> fibermap LASTNIGHT
        if ('LASTNIGHT' not in fmcols) and ('NIGHT' in hdr):
            fibermap['LASTNIGHT'] = np.int32(hdr['NIGHT'])
            fmcols.append('LASTNIGHT')

        data = hstack( [Table(redshifts), Table(fibermap[fmcols])] )

    else:
        #- all fibermap and TSNR2 columns, dropping their duplicate TARGETID
        fmcols = list(fibermap.dtype.names)
        fmcols.remove('TARGETID')
        tsnr2cols = list(tsnr2.dtype.names)
        tsnr2cols.remove('TARGETID')
        data = hstack([
            Table(redshifts),
            Table(fibermap[fmcols]),
            Table(tsnr2[tsnr2cols]),
            ])

    #- Add group specific columns, recognizing that some of them may
    #- have already been inherited from the fibermap.
    #- Put these columns right after TARGETID
    nrows = len(data)
    icol = 1
    if group in ('perexp', 'pernight', 'cumulative'):
        if 'TILEID' not in data.colnames:
            data.add_column(np.full(nrows, hdr['TILEID'], dtype=np.int32),
                    index=icol, name='TILEID')
            icol += 1
        if 'PETAL_LOC' not in data.colnames:
            data.add_column(np.full(nrows, hdr['PETAL'], dtype=np.int16),
                    index=icol, name='PETAL_LOC')
            icol += 1

    if group == 'perexp':
        data.add_column(np.full(nrows, hdr['NIGHT'], dtype=np.int32),
                index=icol, name='NIGHT')
        icol += 1
        data.add_column(np.full(nrows, hdr['EXPID'], dtype=np.int32),
                index=icol, name='EXPID')
    elif group == 'pernight':
        data.add_column(np.full(nrows, hdr['NIGHT'], dtype=np.int32),
                index=icol, name='NIGHT')
    elif group == 'cumulative':
        if 'LASTNIGHT' not in data.colnames:
            data.add_column(np.full(nrows, hdr['NIGHT'], dtype=np.int32),
                    index=icol, name='LASTNIGHT')
    elif group == 'healpix':
        data.add_column(np.full(nrows, hdr['HPXPIXEL'], dtype=np.int32),
                index=icol, name='HEALPIX')

    #- advanced unconditionally (even if no column was added above) to keep
    #- the output column order unchanged
    icol += 1

    # SPGRPVAL = night for pernight, expid for perexp, subset for custom coadds
    if 'SPGRPVAL' in hdr:
        val = hdr['SPGRPVAL']
        # if int, use int32 when it fits, otherwise int64;
        # for anything else let numpy pick the dtype.
        # Use an explicit range check: np.int32(val) raises OverflowError
        # for out-of-range Python ints with NumPy >= 2 instead of wrapping.
        if isinstance(val, int):
            i32 = np.iinfo(np.int32)
            if i32.min <= val <= i32.max:
                dtype = np.int32
            else:
                dtype = np.int64
        else:
            dtype = None

        data.add_column(np.full(nrows, val, dtype=dtype),
                index=icol, name='SPGRPVAL')
    else:
        log.warning(f'SPGRPVAL keyword missing from {rrfile}')

    return data, expfibermap


#--------------------------------------------------------------------------

def parse(options=None):
    """
    Parse command line options for the zcatalog script

    Options:
        options (list): argument strings to parse; passed straight to
            argparse, which falls back to sys.argv[1:] when this is None

    Returns argparse.Namespace of the parsed options
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    #- (flags, add_argument keywords), in the order shown by --help
    specs = (
        (("-i", "--indir"),
            dict(type=str, help="input directory")),
        (("-o", "--outfile"),
            dict(type=str, help="output file")),
        (("--minimal",),
            dict(action="store_true",
                 help="only include minimal output columns")),
        (("-t", "--tiles"),
            dict(type=str,
                 help="ascii file with tileids to include (one per line)")),
        (("--survey",),
            dict(type=str, help="DESI survey, e.g. sv1, sv3, main")),
        (("--program",),
            dict(type=str, help="DESI program, e.g bright, dark")),
        (("-g", "--group"),
            dict(type=str,
                 help="Add columns specific to this spectral grouping "
                      "e.g. pernight adds NIGHT column from input header keyword")),
        (("--header",),
            dict(type=str, nargs="*",
                 help="KEYWORD=VALUE entries to add to the output header")),
        (("--patch-missing-ivar-w12",),
            dict(action="store_true",
                 help="Use target files to patch missing FLUX_IVAR_W1/W2 values")),
        (("--recoadd-fibermap",),
            dict(action="store_true",
                 help="Re-coadd FIBERMAP from spectra files")),
        (("--add-units",),
            dict(action="store_true",
                 help="Add units to output catalog from desidatamodel "
                      "column descriptions")),
        (("--nproc",),
            dict(type=int, default=1,
                 help="Number of multiprocessing processes to use")),
    )
    for flags, keywords in specs:
        cli.add_argument(*flags, **keywords)

    return cli.parse_args(options)


def main(args=None):
    """
    Combine individual redrock files into a single zcatalog file

    Options:
        args: argparse.Namespace from parse(), or a list of command line
            option strings to parse, or None to parse sys.argv

    Finds the input redrock files (from --indir, or the specprod healpix or
    tiles trees depending upon --group), reads them (optionally in parallel),
    stacks the per-file catalogs, adds the FIRSTNIGHT and
    ZCAT_NSPEC/ZCAT_PRIMARY columns where applicable, and writes the result
    to args.outfile via a temporary file.

    Raises ValueError if no redrock files are found, if none of them
    produce a catalog, or if FIRSTNIGHT cannot be set for some tile.
    Calls sys.exit(1) for unusable options or a missing desidatamodel.
    """

    if not isinstance(args, argparse.Namespace):
        args = parse(options=args)

    log=get_logger()

    if args.outfile is None:
        args.outfile = io.findfile('zcatalog')

    #- If adding units, check dependencies before doing a lot of work
    if args.add_units:
        try:
            import desidatamodel
        except ImportError:
            log.critical('Unable to import desidatamodel, required to add units (try "module load desidatamodel" first)')
            sys.exit(1)

    if args.indir:
        indir = args.indir
        redrockfiles = sorted(io.iterfiles(f'{indir}', prefix='redrock', suffix='.fits'))
        pertile = (args.group != 'healpix')  # assume tile-based input unless explicitly healpix
    elif args.group == 'healpix':
        pertile = False
        survey = args.survey if args.survey is not None else "*"
        program = args.program if args.program is not None else "*"
        indir = os.path.join(io.specprod_root(), 'healpix')

        #- specprod/healpix/SURVEY/PROGRAM/HPIXGROUP/HPIX/redrock*.fits
        globstr = os.path.join(indir, survey, program, '*', '*', 'redrock*.fits')
        log.info(f'Looking for healpix redrock files in {globstr}')
        redrockfiles = sorted(glob.glob(globstr))
    else:
        #- the tiles directory is specprod/tiles/GROUP, so without --indir
        #- a group is required; fail clearly instead of with a TypeError
        #- from os.path.join(..., None)
        if args.group is None:
            log.critical('--group is required when --indir is not given')
            sys.exit(1)

        pertile = True
        tilefile = args.tiles if args.tiles is not None else io.findfile('tiles')
        indir = os.path.join(io.specprod_root(), 'tiles', args.group)

        log.info(f'Loading tiles from {tilefile}')
        tiles = Table.read(tilefile)
        if args.survey is not None:
            keep = tiles['SURVEY'] == args.survey
            tiles = tiles[keep]
            if len(tiles) == 0:
                log.critical(f'No tiles kept after filtering by SURVEY={args.survey}')
                sys.exit(1)

        if args.program is not None:
            keep = tiles['PROGRAM'] == args.program
            tiles = tiles[keep]
            if len(tiles) == 0:
                log.critical(f'No tiles kept after filtering by PROGRAM={args.program}')
                sys.exit(1)

        tileids = tiles['TILEID']

        redrockfiles = list()
        for tileid in tileids:
            tmp = sorted(io.iterfiles(f'{indir}/{tileid}', prefix='redrock', suffix='.fits'))
            if len(tmp) > 0:
                redrockfiles.extend(tmp)
            else:
                log.error(f'no redrock files found in {indir}/{tileid}')


    nfiles = len(redrockfiles)
    if nfiles == 0:
        msg = f'No redrock files found in {indir}'
        log.critical(msg)
        raise ValueError(msg)
    log.info(f'Reading {nfiles} redrock files')

    #- build list of args to support multiprocessing parallelism
    read_args = list()
    for ifile, rrfile in enumerate(redrockfiles):
        read_args.append(dict(rrfile=rrfile, group=args.group, pertile=pertile,
                              recoadd_fibermap=args.recoadd_fibermap, minimal=args.minimal,
                              counter=(ifile+1, nfiles)))

    #- Read individual Redrock files
    if args.nproc>1:
        with mp.Pool(args.nproc) as pool:
            results = pool.map(_wrap_read_redrock, read_args)
    else:
        results = [_wrap_read_redrock(a) for a in read_args]

    #- Stack catalogs
    zcatdata = list()
    exp_fibermaps = list()
    dependencies = dict()
    for result in results:
        #- read_redrock returns a bare None for files that it skipped
        #- (SPGRP mismatch), which cannot be unpacked as a tuple
        if result is None:
            continue

        data, expfibermap = result
        if data is not None:
            desiutil.depend.mergedep(data.meta, dependencies)
            desiutil.depend.remove_dependencies(data.meta)
            zcatdata.append(data)

        if expfibermap is not None:
            exp_fibermaps.append(expfibermap)

    if not zcatdata:
        msg = f'No catalogs loaded from {nfiles} redrock files in {indir}'
        log.critical(msg)
        raise ValueError(msg)

    log.info('Stacking zcat')
    zcat = vstack(zcatdata)
    desiutil.depend.mergedep(dependencies, zcat.meta)
    if exp_fibermaps:
        log.info('Stacking exposure fibermaps')
        expfm = np.hstack(exp_fibermaps)
    else:
        expfm = None

    #- Add FIRSTNIGHT for tile-based cumulative catalogs
    #- (LASTNIGHT was added while reading from NIGHT header keyword)
    if args.group == 'cumulative' and expfm is not None and 'FIRSTNIGHT' not in zcat.colnames:
        log.info('Adding FIRSTNIGHT per tile')
        icol = zcat.colnames.index('LASTNIGHT')
        zcat.add_column(np.zeros(len(zcat), dtype=np.int32),
                    index=icol, name='FIRSTNIGHT')
        #- FIRSTNIGHT = earliest exposure NIGHT of each tile
        for tilefm in Table(expfm[['TILEID', 'NIGHT']]).group_by('TILEID').groups:
            tileid = tilefm['TILEID'][0]
            iitile = zcat['TILEID'] == tileid
            zcat['FIRSTNIGHT'][iitile] = np.min(tilefm['NIGHT'])

        #- all FIRSTNIGHT entries should be filled (no more zeros)
        bad = zcat['FIRSTNIGHT'] == 0
        if np.any(bad):
            badtiles = np.unique(zcat['TILEID'][bad])
            raise ValueError(f'FIRSTNIGHT not set for tiles {badtiles}')

    #- if TARGETIDs appear more than once, which one is best within this catalog?
    if 'TSNR2_LRG' in zcat.colnames and 'ZWARN' in zcat.colnames:
        log.info('Finding best spectrum for each target')
        nspec, primary = find_primary_spectra(zcat)
        zcat['ZCAT_NSPEC'] = nspec.astype(np.int16)
        zcat['ZCAT_PRIMARY'] = primary
    else:
        log.info('Missing TSNR2_LRG or ZWARN; not adding ZCAT_PRIMARY/_NSPEC')

    #- Used for fuji, should not be needed for later prods
    if args.patch_missing_ivar_w12:
        from desimodel.footprint import radec2pix
        missing = (zcat['FLUX_IVAR_W1'] < 0) | (zcat['FLUX_IVAR_W2'] < 0)
        missing &= zcat['OBJTYPE'] == 'TGT'
        missing &= zcat['TARGETID'] > 0

        if not np.any(missing):
            log.info('No targets missing FLUX_IVAR_W1/W2 to patch')
        else:
            #- Load targets from sv1 targeting files
            ra = zcat['TARGET_RA']
            dec = zcat['TARGET_DEC']
            nside = 8  #- use for sv1 targeting
            hpix8 = radec2pix(nside, ra, dec)
            for hpix in np.unique(hpix8[missing]):
                hpixmiss = (hpix == hpix8) & missing
                targets = load_sv1_ivar_w12(hpix, zcat['TARGETID'][hpixmiss])

                #- create dict[TARGETID] -> row number
                targetid2idx = dict(zip(targets['TARGETID'],
                                        np.arange(len(targets))))

                #- patch missing values, if they are in the targets file
                for i in np.where(hpixmiss)[0]:
                    tid = zcat['TARGETID'][i]
                    try:
                        j = targetid2idx[ tid ]
                        zcat['FLUX_IVAR_W1'][i] = targets['FLUX_IVAR_W1'][j]
                        zcat['FLUX_IVAR_W2'][i] = targets['FLUX_IVAR_W2'][j]
                    except KeyError:
                        log.warning(f'TARGETID {tid} (row {i}) not found in sv1 targets')

    #- we're done adding columns, convert to numpy array for fitsio
    zcat = np.array(zcat)

    #- Inherit header from first input, but remove keywords that don't apply
    #- across multiple files
    header = fitsio.read_header(redrockfiles[0], 0)
    for key in ['SPGRPVAL', 'TILEID', 'SPECTRO', 'PETAL', 'NIGHT', 'EXPID', 'HPXPIXEL',
                'NAXIS', 'BITPIX', 'SIMPLE', 'EXTEND']:
        if key in header:
            header.delete(key)

    #- Intercept previous incorrect boolean special cases
    if 'HPXNEST' in header:
        if header['HPXNEST'] == 'True':
            log.info("Correcting header HPXNEST='True' string to boolean True")
            header['HPXNEST'] = True
        elif header['HPXNEST'] == 'False':
            # False is not expected for DESI, but cover it for completeness
            log.info("Correcting header HPXNEST='False' string to boolean False")
            header['HPXNEST'] = False

    #- Add extra keywords if requested
    if args.header is not None:
        for keyval in args.header:
            key, value = parse_keyval(keyval)
            header[key] = value

    if args.survey is not None:
        header['SURVEY'] = args.survey

    if args.program is not None:
        header['PROGRAM'] = args.program

    #- Add units if requested
    if args.add_units:
        datamodeldir = str(importlib.resources.files('desidatamodel'))
        unitsfile = os.path.join(datamodeldir, 'data', 'column_descriptions.csv')
        log.info(f'Adding units from {unitsfile}')
        #- the column comments are also returned but not used here
        units, _ = load_csv_units(unitsfile)
    else:
        units = dict()

    #- write to a temporary file and rename it into place only when
    #- everything has been written
    log.info(f'Writing {args.outfile}')
    tmpfile = get_tempfilename(args.outfile)

    write_bintable(tmpfile, zcat, header=header, extname='ZCATALOG',
                   units=units, clobber=True)

    if not args.minimal and expfm is not None:
        write_bintable(tmpfile, expfm, extname='EXP_FIBERMAP', units=units)

    os.rename(tmpfile, args.outfile)

    log.info("Successfully wrote {}".format(args.outfile))