Source code for desispec.zcatalog

"""
desispec.zcatalog
=================

The zcatalog.py script contains the following utility functions for redshift catalogs:

(1) find_primary_spectra:

Given an input table with possibly multiple spectra per TARGETID, this function returns arrays for
a primary flag (whether a spectrum is the primary ["best"] spectrum based on the ZWARN value and
on a sort column [default='TSNR2_LRG']) and for the number of spectra per TARGETID.

Usage::

    nspec, spec_primary = find_primary_spectra(table, sort_column = 'TSNR2_LRG')

(2) create_summary_catalog:

This function combines individual redshift catalogs from a given release into a single compilation
catalog. The output catalog is saved in a FITS file in a user-specified location and filename.

This function can be used for 'fuji' (EDR) or 'guadalupe' (Main) by setting the keyword `specprod`.
By default, this function aggregates all the columns for the redshift catalogs, and adds columns to
quantify the number of coadded spectra listed in the catalogs for each TARGETID and to identify the
primary ("best") spectrum out of them using the `find_primary_spectra` function. Optionally, the
script can return a list of pre-selected "summary" column, or a list of user-specified
columns via the `columns_list` keyword.

Usage::

   create_summary_catalog(specprod, specgroup = 'zpix', all_columns = True, \
                          columns_list = None, output_filename = './zcat-all.fits')

Ragadeepika Pucha, Stephanie Juneau, and DESI data team
Version: 2022, March 31st
"""

####################################################################################################
####################################################################################################

import numpy as np
import os
from glob import glob
from astropy.io import fits
from astropy.table import Table, Column, vstack, join

## DESI related functions
from desispec.io import specprod_root
from desispec.io.util import get_tempfilename, write_bintable
from desiutil.log import get_logger
import desiutil.depend

####################################################################################################
####################################################################################################

log = get_logger()

[docs]def find_primary_spectra(table, sort_column = 'TSNR2_LRG'): """ Function to select the best "primary" spectrum for objects with multiple (coadded) spectra. The best spectrum is selected based on the following steps: 1. Spectra with ZWARN=0 are given top preference. 2. Among multiple entries with ZWARN=0, spectra with the highest value of 'sort_column' are given the preference. 3. If there are no spectra with ZWARN=0, spectra with the highest value of 'sort_column' are given the preference. Parameters ---------- table : Numpy array or Astropy Table The input table should be a redshift catalog, with at least the following columns: 'TARGETID', 'ZWARN' and the sort_column (Default:'TSNR2_LRG') Additional columns will be ignored. sort_column : str Column name that will be used to select the best spectrum. Default = 'TSNR2_LRG' The higher values of the column will be considered as better (descending order). Returns ------- nspec : Numpy int array Array of number of entries available per target spec_primary : Numpy bool array Array of spec_primary (= TRUE for the best spectrum) """ ## Convert into an astropy table table = Table(table) ## Main columns that are required: ## TARGETID, ZWARN, SORT_COLUMN that is given by the user (default=TSNR2_LRG) tsel = table['TARGETID', 'ZWARN', sort_column] ## Adding a row number column to sort the final table back to the same order as the input row = Column(np.arange(1, len(table)+1), name = 'ROW_NUMBER') tsel.add_column(row) ## Add the SPECPRIMARY and NSPEC columns - initialized to 0 nspec = Column(np.array([0]*len(table)), name = 'NSPEC', dtype = '>i4') spec_prim = Column(np.array([0]*len(table)), name = 'SPECPRIMARY', dtype = 'bool') tsel.add_column(nspec) tsel.add_column(spec_prim) ## Create a ZWARN_NOT_ZERO column ## This helps to sort in the order such that ZWARN=0 is on top. ## The rest are then arranged based on the 'SORT_COLUMN' tsel['ZWARN_NOT_ZERO'] = (tsel['ZWARN'] != 0).astype(int) ## Create an inverse sort_column -- this is for sorting in the decreasing order of sort_column ## Higher values of sort_column are considered as better tsel['INV_SORT_COLUMN'] = 1/(tsel[sort_column] + 1e-99*(tsel[sort_column] == 0.0)) ## The extra term in the denominator is added to avoid cases where the sort_column is 0, leading ## to unreal values when taking its inverse. ## Sort by TARGETID, ZWARN_NOT_ZERO, and inverse sort_column -- in this order tsel.sort(['TARGETID', 'ZWARN_NOT_ZERO', 'INV_SORT_COLUMN']) ## Selecting the unique targets, along with their indices and number of occurrences targets, indices, return_indices, num = np.unique(tsel['TARGETID'].data, return_index = True, \ return_inverse = True, return_counts = True) ## Since we sorted the table by TARGETID, ZWARN_NOT_ZERO and inverse sort_column ## The first occurence of each target is the PRIMARY (with ZWARN = 0 or with higher sort_column) ## This logic sets SPECPRIMARY = 1 for every first occurence of each target in this sorted table tsel['SPECPRIMARY'][indices] = 1 # Set the NSPEC for every target tsel['NSPEC'] = num[return_indices].astype('>i2') # Note: SPECPRIMARY for negative TARGETIDs (stuck positioners on sky locations) is a bit # meaningless, but tile-based perexp and pernight catalogs can have repeats of those # and they are treated like other targets so that there is strictly one SPECPRIMARY # entry per TARGETID ## Sort by ROW_NUMBER to get the original order - tsel.sort('ROW_NUMBER') ## Final nspec and specprimary arrays - nspec = tsel['NSPEC'].data spec_primary = tsel['SPECPRIMARY'].data return (nspec, spec_primary)
#################################################################################################### ####################################################################################################
[docs]def _get_survey_program_from_filename(filename): """ Return SURVEY,PROGRAM parsed from zpix/ztile filename; fragile! """ # zpix-SURVEY-PROGRAM.fits or ztile-SURVEY-PROGRAM-SPECGROUP.fits base = os.path.splitext(os.path.basename(filename))[0] arr = base.split('-') survey = arr[1] program = arr[2] return survey, program
[docs]def create_summary_catalog(specgroup, indir=None, specprod=None, all_columns = True, columns_list = None, output_filename=None): """ This function combines all the individual redshift catalogs for either 'zpix' or 'ztile' with the desired columns (all columns, or a pre-selected list, or a user-given list). It further adds 'NSPEC' and 'PRIMARY' columns, two for SV(or MAIN), and two for the entire combined redshift catalog. Parameters ---------- specgroup : str The option to run the code on ztile* files or zpix* files. It can either be 'zpix' or 'ztile' indir : str Input directory to look for zpix/ztile files. specprod : str Internal Release Name for the DESI Spectral Release. Used to derive input directory if indir is not provided. all_columns : bool Whether or not to include all the columns into the final table. Default is True. columns_list : list If all_columns = False, list of columns to include in the final table. If None, a list of pre-decided summary columns will be used. Default is None. The 'SV/MAIN' primary flag columns as well as the primary flag columns for the entire catalog witll be included. output_filename : str Path+Filename for the output summary redshift catalog. The output FITS file will be saved at this path. If not specified, the output filename will be derived from specgroup and $SPECPROD Returns ------- None The function saves a FITS file at the location and file name specified by `output_filename` with the summary redshift catalog in HDU1. """ ############################### Checking the inputs ################################## ## Initial check 1 ## If specgroup = something else by mistake valid_specgroups = ('zpix', 'ztile') if specgroup not in valid_specgroups: errmsg = f'{specgroup=} not recognized, should be one of {valid_specgroups}' log.error(errmsg) raise ValueError(errmsg) ## set indir if needed if indir is None: indir = specprod_root(specprod) + '/zcatalog' log.info(f'Using input directory {indir}') ## Initial check 2 ## Test whether the input directory exists if not os.path.isdir(indir): msg = f'"{indir}" directory does not exist' log.error(msg) raise ValueError(msg) ## Set output_filename if needed if output_filename is None: specprod = os.environ['SPECPROD'] if specgroup == 'zpix': output_filename = f'zall-pix-{specprod}.fits' elif specgroup == 'ztile': output_filename = f'zall-tilecumulative-{specprod}.fits' else: # not yet used; future-proofing output_filename = f'zall-{specgroup}-{specprod}.fits' log.info(f'Will write output to {output_filename}') ###################################################################################### ## Find all the filenames for a given specgroup if (specgroup == 'zpix'): ## List of all zpix* catalogs: zpix-survey-program.fits zcat = glob(f'{indir}/zpix-*.fits') elif (specgroup == 'ztile'): ## List of all ztile* catalogs, considering only cumulative catalogs zcat = glob(f'{indir}/ztile-*cumulative.fits') ## Sorting the list of zcatalogs by name ## This is to keep it neat, clean, and in order zcat.sort() ## Get all the zcatalogs for a given spectral release and specgroup ## Add the required columns or select a few of them ## Adding all the tables into a single list tables = [] ## Handle DEPNAMnn DEPVERnn keyword merging separately dependencies = dict() ## Looping through the different zcatalogs and adding the survey and program columns for filename in zcat: basefile = os.path.basename(filename) ## Load the ZCATALOG table, along with the meta data log.info(f'Reading {filename}') t = Table.read(filename, hdu = 'ZCATALOG') ## Merge DEPNAMnn and DEPVERnn, then remove from header desiutil.depend.mergedep(t.meta, dependencies) desiutil.depend.remove_dependencies(t.meta) ## Remove other keys that we don't want to propagate for key in ('CHECKSUM', 'DATASUM'): if key in t.meta: del t.meta[key] ## Get SURVEY and PROGRAM from header, then remove from header ## because we are stacking catalogs from multiple surveys and programs if 'SURVEY' in t.meta: survey = t.meta['SURVEY'] del t.meta['SURVEY'] else: # parse filename if needed, but complain about it survey = _get_survey_program_from_filename(filename)[0] log.warning(f'{filename} header missing SURVEY; guessing {survey} from filename') if 'PROGRAM' in t.meta: program = t.meta['PROGRAM'] del t.meta['PROGRAM'] else: program = _get_survey_program_from_filename(filename)[1] log.warning(f'{filename} header missing PROGRAM; guessing {program} from filename') log.debug(f'{basefile} SURVEY={survey} PROGRAM={program}') ## We keep the rest of the meta data ## Add the SURVEY and PROGRAM columns ## SURVEY is added as a str7 and PROGRAM is added as str6 (to match other catalogs) ## 'special' consists of seven characters and is the maximum character for a SURVEY string ## 'backup' and 'bright' consists of six characters and is the maximum for a PROGRAM string col1 = Column(np.array([survey]*len(t)), name = 'SURVEY', dtype = '<U7') col2 = Column(np.array([program]*len(t)), name = 'PROGRAM', dtype = '<U6') t.add_column(col1, 1) t.add_column(col2, 2) ## The SURVEY and PROGRAM columns are added as second and third columns, ## immediately after TARGETID ## Appending the tables to the list tables.append(t) ## Stacking all the tables into a final table tab = vstack(tables) ## The output of this will have Masked Columns ## We will fix this at the end ## Selecting primary spectra for the whole combined ZCATALOG ## For SV, it selects the best spectrum including cmx+special+sv1+sv2+sv3 ## For Main, it selects the best spectrum for main+special nspec, specprim = find_primary_spectra(tab) ## Replacing the existing 'ZCAT_NSPEC' and 'ZCAT_PRIMARY' ## If all_columns = False, and user-list does not contain this column - ## these columns will be added log.debug('Updating ZCAT_PRIMARY and ZCAT_NSPEC') tab['ZCAT_NSPEC'] = nspec tab['ZCAT_PRIMARY'] = specprim ############################### Adding SV/Main Primary Flags ################################## survey_col = tab['SURVEY'].astype(str) ## Check if SV1|SV2|SV3 targets are available and add SV Primary Flag columns if ('sv1' in survey_col)|('sv2' in survey_col)|('sv3' in survey_col): log.debug('Found SV inputs; adding SV_PRIMARY and SV_NSPEC columns') ## Add empty columns for SV NSPEC and PRIMARY col1 = Column(np.array([0]*len(tab)), name = 'SV_NSPEC', dtype = '>i2') col2 = Column(np.array([0]*len(tab)), name = 'SV_PRIMARY', dtype = 'bool') tab.add_columns([col1, col2]) ## Selecting primary spectra for targets within just SV ## For SV, it selects the primary spectra out of SV1+SV2+SV3 for every individual TARGETID ## Ignores cmx+special in SV is_sv = np.char.startswith(survey_col.data, 'sv') nspec, specprim = find_primary_spectra(tab[is_sv]) tab['SV_NSPEC'][is_sv] = nspec tab['SV_PRIMARY'][is_sv] = specprim ## Check if 'main' targets are available and add Main Primary Flag Columns if ('main' in survey_col): log.debug('Found main survey inputs; adding MAIN_PRIMARY and MAIN_NSPEC columns') ## Add empty columns for Main NSPEC and PRIMARY col1 = Column(np.array([0]*len(tab)), name = 'MAIN_NSPEC', dtype = '>i2') col2 = Column(np.array([0]*len(tab)), name = 'MAIN_PRIMARY', dtype = 'bool') tab.add_columns([col1, col2]) ## Selecting primary spectra for targets within just MAIN ## It selects the primary spectra just for 'main' and ignores 'special' is_main = (survey_col.data == 'main') nspec, specprim = find_primary_spectra(tab[is_main]) tab['MAIN_NSPEC'][is_main] = nspec tab['MAIN_PRIMARY'][is_main] = specprim ############################################################################################### ## For convenience, sort by SURVEY, PROGRAM, and (HEALPIX or TILEID) if (specgroup == 'zpix'): tab.sort(['SURVEY', 'PROGRAM', 'HEALPIX', 'TARGETID']) elif (specgroup == 'ztile'): tab.sort(['SURVEY', 'PROGRAM', 'TILEID', 'LASTNIGHT', 'FIBER']) ## Convert the masked column table to normal astropy table and select required columns final_table = update_table_columns(table = tab, specgroup = specgroup, \ all_columns = all_columns, columns_list = columns_list) ## Add merged DEPNAMnn / DEPVERnn dependencies back into final table desiutil.depend.mergedep(dependencies, final_table.meta) ## Write final output via a temporary filename tmpfile = get_tempfilename(output_filename) write_bintable(tmpfile, final_table, extname='ZCATALOG', clobber=True) os.rename(tmpfile, output_filename) log.info(f'Wrote {output_filename}')
#################################################################################################### ####################################################################################################
[docs]def update_table_columns(table, specgroup = 'zpix', all_columns = True, columns_list = None): """ This function fills the ``*TARGET`` masked columns and returns the final table with the required columns. Parameters ---------- table : Astropy Table A table. specgroup : str The option to run the code on ztile* files or zpix* files. It can either be 'zpix' or 'ztile'. Default is 'zpix' all_columns : bool Whether or not to include all the columns into the final table. Default is True. columns_list : list If all_columns = False, list of columns to include in the final table. If None, a list of pre-decided summary columns will be used. Default is None. The 'SV/MAIN' primary flag columns as well as the primary flag columns for the entire catalog witll be included. Returns ------- t_final : Astropy Table Final table with non-masked columns with required columns. """ ## Due to stacking tables with different columns, ## We have *TARGET columns that are Masked. We need to fill the empty columns with zero. ## This is important - otherwise Astropy fills the table with different values. ## Array of all columns: tab_cols = np.array(table.colnames) ## Pick out columns ending with '_TARGET' sel = np.char.endswith(tab_cols, '_TARGET') ## This include FA_TARGET -- which needs to be removed ## Make a list of all the '*_TARGET' columns, which contain DESI targetting information ## This is both for correcting the masked columns, as well as ## for rearranging the columns into a proper order target_cols = list(tab_cols[sel]) target_cols.remove('FA_TARGET') for col in target_cols: ## Fill the *TARGET columns that are masked with 0 table[col].fill_value = 0 ## Table with filled values tab = table.filled() ## Selecting the required columns for the final table ## If all_columns is True, then rearraning the columns into a proper order ## If all_columns is False, then only a subset of columns is selected ## If no input user-list of columns is given, a pre-selected list of columns is used ## to create a summary redshift catalog ## Find all the existing NSPEC and PRIMARY flag columns and order them nspec_cols = list(tab_cols[np.char.endswith(tab_cols, '_NSPEC')]) prim_cols = list(tab_cols[np.char.endswith(tab_cols, '_PRIMARY')]) nspec_cols.sort() prim_cols.sort() ## Ordering the primary columns ## If SV/MAIN flag columns also exist, the order is - ## MAIN_NSPEC, MAIN_PRIMARY, SV_NSPEC, SV_PRIMARY, ZCAT_NSPEC, ZCAT_PRIMARY ## This is to add these columns separately in the end primary_cols = [] for xx in range(len(nspec_cols)): primary_cols.append(nspec_cols[xx]) primary_cols.append(prim_cols[xx]) if all_columns: ## Rearranging the columns to order all the *TARGET columns together ## TARGET columns sit between NUMOBS_INIT and PLATE_RA columns ## Last column in TSNR2_LRG in all the redshift catalogs ## We will add the PRIMARY columns in the end ## The indices of NUMOBS_INIT, PLATE_RA, and ZCAT_PRIMARY columns nobs = np.where(np.array(tab.colnames) == 'NUMOBS_INIT')[0][0] pra = np.where(np.array(tab.colnames) == 'PLATE_RA')[0][0] tsnr = np.where(np.array(tab.colnames) == 'TSNR2_LRG')[0][0] ## List of all columns all_cols = tab.colnames ## Reorder the columns ## This reorder is important for stacking the different redshift catalogs ## Also to keep it neat and clean req_columns = all_cols[0:nobs+1] + target_cols + all_cols[pra:tsnr+1]+primary_cols else: if (columns_list == None): ## These are the pre-selected list of columns pre_selected_cols = ['TARGETID', 'SURVEY', 'PROGRAM', 'TARGET_RA', 'TARGET_DEC', 'Z', 'ZERR', 'ZWARN', 'COADD_FIBERSTATUS', 'CHI2', 'DELTACHI2', 'MASKBITS', 'SPECTYPE', 'FLUX_G', 'FLUX_R', 'FLUX_Z', 'FLUX_W1', 'FLUX_W2', 'FLUX_IVAR_G', 'FLUX_IVAR_R', 'FLUX_IVAR_Z','FLUX_IVAR_W1', 'FLUX_IVAR_W2', 'TSNR2_LRG', 'TSNR2_BGS', 'TSNR2_ELG', 'TSNR2_QSO', 'TSNR2_LYA'] + target_cols + primary_cols ## Add HEALPIX for zpix* files, and TILEID, LASTNIGHT for ztile* files if (specgroup == 'zpix'): req_columns = pre_selected_cols[0:3]+['HEALPIX']+pre_selected_cols[3:] else: req_columns = pre_selected_cols[0:3]+['TILEID', 'LASTNIGHT']+pre_selected_cols[3:] else: ## Adding the primary flag columns to the user-requested list req_columns = columns_list + primary_cols ## Final table with the required columns t_final = tab[req_columns] return (t_final)
#################################################################################################### ####################################################################################################