"""
desispec.gpu
============
Utility functions for working with GPUs
"""
import os
import socket
from desiutil.log import get_logger
#- Require both cupy and numba.cuda,
#- but track availability separately for debugging
try:
    import cupy
    import cupyx.scipy.ndimage
    _cupy_available = cupy.is_available()  #- True if cupy detects a GPU
except ImportError:
    _cupy_available = False

try:
    import numba.cuda
    _numba_cuda_available = numba.cuda.is_available()
except ImportError:
    _numba_cuda_available = False
#- context manager for temporarily turning off GPU usage,
#- e.g. within multiprocessing.map
_context_use_gpu = True
class NoGPU:
    """Context manager to temporarily disable GPU usage, e.g.

    with desispec.gpu.NoGPU():
        blat()
    """
    def __enter__(self):
        global _context_use_gpu
        _context_use_gpu = False

    def __exit__(self, exc_type, exc_value, exc_tb):
        global _context_use_gpu
        _context_use_gpu = True

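#- Example usage (a sketch; `blat` stands in for any CPU-only function,
#- as in the docstring above):
#-
#-     with NoGPU():
#-         assert not is_gpu_available()
#-         blat()                      #- runs with GPU code paths disabled
#-
#- The flag is restored on exit, so later code may use the GPU again.
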
def is_gpu_available():
    """Return whether cupy and numba.cuda are installed and a GPU
    is available to use, and $DESI_NO_GPU is *not* set"""
    return (_cupy_available and _numba_cuda_available and _context_use_gpu and
            ('DESI_NO_GPU' not in os.environ))

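#- Example: setting $DESI_NO_GPU disables GPU usage even when cupy and
#- numba.cuda both detect a device (a sketch; note that any value of the
#- variable counts as "set"):
#-
#-     os.environ['DESI_NO_GPU'] = '1'
#-     assert not is_gpu_available()
#-     del os.environ['DESI_NO_GPU']
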
def free_gpu_memory():
    """Release all cupy GPU memory; ok to call even if no GPUs"""
    if is_gpu_available():
        mempool = cupy.get_default_memory_pool()
        mempool.free_all_blocks()

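#- Example: release the cupy memory pool between large processing steps so
#- that other ranks sharing the same GPU can allocate (a sketch;
#- `extract_spectra` is a hypothetical cupy-based function):
#-
#-     spectra = extract_spectra(image)   #- allocates large GPU arrays
#-     free_gpu_memory()                  #- safe no-op if no GPU present
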
def redistribute_gpu_ranks(comm, method='round-robin'):
    """Redistribute which MPI ranks are assigned to which GPUs

    Args:
        comm: MPI communicator, or None

    Options:
        method: 'round-robin' (default) or 'contiguous'

    Returns:
        device_id assigned (-1 if no GPUs)

    'round-robin' assigns ranks to GPUs cyclically, e.g. 8 ranks on
    4 GPUs would be assigned [0,1,2,3,0,1,2,3].
    'contiguous' assigns contiguous ranks to the same GPU, e.g. 8 ranks
    on 4 GPUs would be assigned [0,0,1,1,2,2,3,3].

    CAUTION: If the MPI communicator spans multiple nodes, this assumes
    that all nodes have the same number of GPUs, the same number of
    ranks per node, and that the MPI ranks are themselves contiguously
    assigned to nodes (which is often the case, but not required in
    general).

    If `comm` is None, assign the process to GPU 0 (if present).

    Note: this also calls free_gpu_memory to release memory on each
    GPU before assigning ranks to other GPUs.
    """
    device_id = -1  #- default if no GPUs
    if is_gpu_available():
        #- Free GPU memory pool before reallocating ranks so that the
        #- memory associated with the previous device isn't tied up
        #- by a rank that isn't using that device anymore.
        free_gpu_memory()
        log = get_logger()
        ngpu = cupy.cuda.runtime.getDeviceCount()
        if comm is None:
            device_id = 0
            cupy.cuda.Device(device_id).use()
            log.info(f'No MPI communicator; assigning process to GPU {device_id}/{ngpu}')
        else:
            if method == 'round-robin':
                device_id = comm.rank % ngpu
            elif method == 'contiguous':
                #- Handle case of MPI communicator spanning multiple hosts,
                #- but assume that all hosts have the same number of GPUs
                #- and that the MPI ranks are contiguously assigned across
                #- hosts
                hostnames = comm.gather(socket.gethostname(), root=0)
                nhosts = 0
                if comm.rank == 0:
                    nhosts = len(set(hostnames))
                    log.debug('nhosts=%d', nhosts)
                nhosts = comm.bcast(nhosts, root=0)
                device_id = int(comm.rank / (comm.size / (ngpu*nhosts))) % ngpu
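                #- e.g. 8 ranks across 2 hosts with 2 GPUs each:
                #- ranks-per-GPU = 8/(2*2) = 2, so ranks [0,1]->GPU 0 and
                #- [2,3]->GPU 1 on the first host, then [4,5]->GPU 0 and
                #- [6,7]->GPU 1 on the second host (the % ngpu wraps the
                #- device_id back into this host's range)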
            else:
                msg = f'method should be "round-robin" or "contiguous", not "{method}"'
                log.error(msg)
                raise ValueError(msg)

            cupy.cuda.Device(device_id).use()
            log.debug('Assigning rank=%d to GPU=%d/%d', comm.rank, device_id, ngpu)
            device_assignments = comm.gather(device_id, root=0)
            if comm.rank == 0:
                log.info(f'Assigned MPI ranks to GPUs {device_assignments}')

    return device_id
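
#- Example usage with mpi4py (a sketch; assumes an MPI launch, e.g. via
#- `srun` or `mpirun`, and that mpi4py is installed):
#-
#-     from mpi4py import MPI
#-     import desispec.gpu
#-
#-     comm = MPI.COMM_WORLD
#-     device_id = desispec.gpu.redistribute_gpu_ranks(comm, method='contiguous')
#-     if device_id >= 0:
#-         #- subsequent cupy allocations on this rank use GPU device_id
#-         pass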