Source code for desispec.gpu

"""
desispec.gpu
============

Utility functions for working with GPUs
"""

import os
import socket
from desiutil.log import get_logger

#- Require both cupy and numba.cuda,
#- but track availability separately for debugging
try:
    import cupy
    import cupyx.scipy.ndimage
    _cupy_available = cupy.is_available()  #- True if cupy detects a GPU
except ImportError:
    _cupy_available = False

try:
    import numba.cuda
    _numba_cuda_available = numba.cuda.is_available()
except ImportError:
    _numba_cuda_available = False

#- context manager for temporarily turning off GPU usage,
#- e.g. within multiprocessing.map
_context_use_gpu = True

class NoGPU:
    """Context manager to temporarily disable GPU usage, e.g.

    with desispec.gpu.NoGPU():
        blat()
    """
    def __enter__(self):
        global _context_use_gpu
        _context_use_gpu = False

    def __exit__(self, exc_type, exc_value, exc_tb):
        global _context_use_gpu
        _context_use_gpu = True
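
#- Illustrative usage sketch (not part of the original module): NoGPU is
#- intended for temporarily forcing CPU code paths, e.g. for workers launched
#- via multiprocessing.map.  The helper below is hypothetical and only shows
#- how availability toggles inside and outside the context.
def _example_nogpu_usage():
    #- GPU code paths enabled here (if hardware + cupy/numba.cuda are present)
    before = is_gpu_available()
    with NoGPU():
        #- GPU code paths disabled regardless of hardware
        inside = is_gpu_available()
    #- GPU availability restored after leaving the context
    after = is_gpu_available()
    return before, inside, after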

def is_gpu_available():
    """Return whether cupy and numba.cuda are installed and a GPU
    is available to use, and $DESI_NO_GPU is *not* set"""
    return (_cupy_available and _numba_cuda_available
            and _context_use_gpu
            and ('DESI_NO_GPU' not in os.environ))

def free_gpu_memory():
    """Release all cupy GPU memory; ok to call even if no GPUs"""
    if is_gpu_available():
        mempool = cupy.get_default_memory_pool()
        mempool.free_all_blocks()
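
#- Illustrative sketch (not part of the original module) of the intended
#- calling pattern: branch on is_gpu_available() and release the cupy memory
#- pool once the GPU work is done.  The function name and the trivial sum
#- computation are assumptions for illustration only.
def _example_gpu_fallback(data):
    if is_gpu_available():
        #- do the work on the GPU, then copy the result back to host memory
        result = cupy.asnumpy(cupy.asarray(data).sum())
        free_gpu_memory()  #- return pooled device memory to the driver
    else:
        #- CPU fallback path
        import numpy as np
        result = np.asarray(data).sum()
    return result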

def redistribute_gpu_ranks(comm, method='round-robin'):
    """Redistribute which MPI ranks are assigned to which GPUs

    Args:
        comm: MPI communicator, or None

    Options:
        method: 'round-robin' (default) or 'contiguous'

    Returns:
        device_id assigned (-1 if no GPUs)

    'round-robin' assigns ranks cyclically, e.g. 8 ranks on 4 GPUs would be
    assigned [0,1,2,3,0,1,2,3].  'contiguous' assigns contiguous ranks to the
    same GPU, e.g. [0,0,1,1,2,2,3,3].

    CAUTION: If the MPI communicator spans multiple nodes, this assumes that
    all nodes have the same number of GPUs, the same number of ranks per node,
    and that the MPI ranks are themselves contiguously assigned to nodes
    (which is often the case, but not required in general).

    If `comm` is None, assign the process to GPU 0 (if present).

    Note: this also calls free_gpu_memory to release memory on each GPU
    before assigning ranks to other GPUs.
    """
    device_id = -1  #- default if no GPUs
    if is_gpu_available():
        #- Free GPU memory pool before reallocating ranks so that the
        #- memory associated with the previous device isn't tied up
        #- by a rank that isn't using that device anymore.
        free_gpu_memory()

        log = get_logger()
        ngpu = cupy.cuda.runtime.getDeviceCount()

        if comm is None:
            device_id = 0
            cupy.cuda.Device(device_id).use()
            log.info(f'No MPI communicator; assigning process to GPU {device_id}/{ngpu}')
        else:
            if method == 'round-robin':
                device_id = comm.rank % ngpu
            elif method == 'contiguous':
                #- Handle case of MPI communicator spanning multiple hosts,
                #- but assume that all hosts have the same number of GPUs
                #- and that the MPI ranks are contiguously assigned across
                #- hosts
                hostnames = comm.gather(socket.gethostname(), root=0)
                nhosts = 0
                if comm.rank == 0:
                    nhosts = len(set(hostnames))
                    log.debug('nhosts=%d', nhosts)

                nhosts = comm.bcast(nhosts, root=0)
                device_id = int(comm.rank / (comm.size / (ngpu*nhosts))) % ngpu
            else:
                msg = f'method should be "round-robin" or "contiguous", not "{method}"'
                log.error(msg)
                raise ValueError(msg)

            cupy.cuda.Device(device_id).use()
            log.debug('Assigning rank=%d to GPU=%d/%d', comm.rank, device_id, ngpu)

            device_assignments = comm.gather(device_id, root=0)
            if comm.rank == 0:
                log.info(f'Assigned MPI ranks to GPUs {device_assignments}')

    return device_id
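
#- Illustrative sketch (not part of the original module): a typical MPI driver
#- would call redistribute_gpu_ranks once after startup so that each rank
#- selects its GPU before doing any device work.  mpi4py is assumed to be
#- available and is imported lazily here; the function name is hypothetical.
def _example_mpi_gpu_setup(method='round-robin'):
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    #- Each rank switches to its assigned device; device_id is -1 if no
    #- usable GPU was found (e.g. cupy missing or $DESI_NO_GPU set)
    device_id = redistribute_gpu_ranks(comm, method=method)
    return comm, device_id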