Source code for neuromaps.datasets.annotations

# -*- coding: utf-8 -*-
"""
Functions for fetching annotations (from the internet, if necessary)
"""

from collections import defaultdict
from pathlib import Path
import re
import shutil

from nilearn.datasets.utils import _fetch_file

from neuromaps.datasets.utils import (get_data_dir, get_dataset_info,
                                      _get_token, _get_session)

# Extracts the (source, desc, space, den/res) entities from an annotation
# filename. The final group is one or more digits followed by one or two of
# the unit letters 'k' / 'm' (e.g. '10k', '164k', '2mm'). Note that the unit
# is a plain character class: a '|' inside brackets would be matched literally.
MATCH = re.compile(
    r'source-(\S+)_desc-(\S+)_space-(\S+)_(?:den|res)-(\d+[km]{1,2})_'
)


def _groupby_match(fnames, return_single=False):
    """"
    Groups files in `fnames` by (source, desc, space, res/den)

    Parameters
    ----------
    fnames : list-of-str
        Filenames to be grouped
    return_single : bool, optional
        If there is only group of filenames return a list instead of a dict.
        Default: False

    Returns
    -------
    groups : dict-of-str
        Where keys are tuple (source, desc, space, res/den) and values are
        lists of filenames
    """

    out = defaultdict(list)
    for fn in fnames:
        out[MATCH.search(fn).groups()].append(fn)

    out = {k: v if len(v) > 1 else v[0] for k, v in out.items()}

    if return_single and len(out) == 1:
        out = list(out.values())[0]

    return out


def _match_annot(info, **kwargs):
    """
    Matches datasets in `info` to relevant keys

    Parameters
    ----------
    info : list-of-dict
        Information on annotations
    kwargs : key-value pairs
        Values of data in `info` on which to match

    Returns
    -------
    matched : list-of-dict
        Annotations with specified values for keys
    """

    # tags should always be a list
    tags = kwargs.get('tags')
    if tags is not None and isinstance(tags, str):
        kwargs['tags'] = [tags]

    # 'den' and 'res' are a special case because these are mutually exclusive
    # values (only one will ever be set for a given annotation) so we want to
    # match on _either_, not both, if and only if both are provided as keys.
    # if only one is specified as a key then we should exclude the other!
    denres = []
    for vals in (kwargs.get('den'), kwargs.get('res')):
        vals = [vals] if isinstance(vals, str) else vals
        if vals is not None:
            denres.extend(vals)

    out = []
    for dset in info:
        match = True
        for key in ('source', 'desc', 'space', 'hemi', 'tags', 'format'):
            comp, value = dset.get(key), kwargs.get(key)
            if value is None:
                continue
            elif value is not None and comp is None:
                match = False
            elif isinstance(value, str):
                if value != 'all':
                    match = match and comp == value
            else:
                func = all if key == 'tags' else any
                match = match and func(f in comp for f in value)
        if len(denres) > 0:
            match = match and (dset.get('den') or dset.get('res')) in denres
        if match:
            out.append(dset)

    return out


def available_annotations(source=None, desc=None, space=None, den=None,
                          res=None, hemi=None, tags=None, format=None,
                          return_restricted=False):
    """
    Lists datasets available via :func:`~.fetch_annotation`

    Parameters
    ----------
    source, desc, space, den, res, hemi, tags, format : str or list-of-str
        Values on which to match annotations. If not specified annotations with
        any value for the relevant key will be matched. Default: None
    return_restricted : bool, optional
        Whether to return restricted annotations. These will only be
        accessible with a valid OSF token. Default: False

    Returns
    -------
    datasets : list-of-tuple
        List of available annotations, where each entry is a tuple of
        (source, desc, space, den/res)
    """

    criteria = dict(source=source, desc=desc, space=space, den=den, res=res,
                    hemi=hemi, tags=tags, format=format)
    catalog = get_dataset_info('annotations', return_restricted)
    matched = _match_annot(catalog, **criteria)

    # only the group keys are of interest here, not the grouped filenames
    grouped = _groupby_match([entry['fname'] for entry in matched],
                             return_single=False)

    return list(grouped)


def available_tags(return_restricted=False):
    """
    Returns available tags for querying annotations

    Parameters
    ----------
    return_restricted : bool, optional
        Whether to include tags of restricted annotations. These will only be
        accessible with a valid OSF token. Default: False

    Returns
    -------
    tags : list-of-str
        Sorted list of the unique tags across all annotations
    """

    catalog = get_dataset_info('annotations', return_restricted)
    # annotations without tags store None and are skipped
    unique = {
        tag
        for entry in catalog
        if entry['tags'] is not None
        for tag in entry['tags']
    }
    return sorted(unique)


def fetch_annotation(*, source=None, desc=None, space=None, den=None, res=None,
                     hemi=None, tags=None, format=None, return_single=True,
                     token=None, data_dir=None, verbose=1):
    """
    Downloads files for brain annotations matching requested variables

    Parameters
    ----------
    source, desc, space, den, res, hemi, tags, format : str or list-of-str
        Values on which to match annotations. At least one must be provided.
        If not specified annotations with any value for the relevant key will
        be matched. Default: None
    return_single : bool, optional
        If only one annotation is found matching input parameters return its
        filepath(s) directly instead of the standard dictionary. Default: True
    token : str, optional
        OSF personal access token for accessing restricted annotations. Will
        also check the environmental variable 'NEUROMAPS_OSF_TOKEN' if not
        provided; if that is not set no token will be provided and restricted
        annotations will be inaccessible. Default: None
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NEUROMAPS_DATA'; if that is not set, will
        use `~/neuromaps-data` instead. Default: None
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more updates.
        Default: 1

    Returns
    -------
    data : dict
        Dictionary of downloaded annotations where dictionary keys are tuples
        (source, desc, space, den/res) and values are the corresponding
        filepaths: a list when an annotation has several files, or a single
        str when it has only one

    Raises
    ------
    ValueError
        If none of the matching parameters are provided
    """

    # check input parameters to ensure we're fetching _something_
    criteria = dict(source=source, desc=desc, space=space, den=den, res=res,
                    hemi=hemi, tags=tags, format=format)
    if all(val is None for val in criteria.values()):
        raise ValueError('Must provide at least one parameters on which to '
                         'match annotations. If you want to fetch all '
                         'annotations set any of the parameters to "all".')

    # restricted annotations are only listed when a (non-empty) token exists
    token = _get_token(token=token)
    return_restricted = bool(token)
    data_dir = get_data_dir(data_dir=data_dir)
    catalog = get_dataset_info('annotations', return_restricted)
    info = _match_annot(catalog, **criteria)
    if verbose > 1:
        print(f'Identified {len(info)} datsets matching specified parameters')

    # get session for requests
    session = _get_session(token=token)

    # TODO: current work-around to handle that _fetch_files() does not support
    # session instances. hopefully a future version will and we can just use
    # that function to handle this instead of calling _fetch_file() directly
    annot_root = Path(data_dir) / 'annotations'
    data = []
    for dset in info:
        target = annot_root / dset['rel_path'] / dset['fname']
        # files already present on disk are not downloaded again
        if not target.exists():
            fetched = _fetch_file(dset['url'], str(target.parent),
                                  verbose=verbose, md5sum=dset['checksum'],
                                  session=session)
            # place the download under the expected annotation filename
            shutil.move(fetched, target)
        data.append(str(target))

    return _groupby_match(data, return_single=return_single)