Source code for skysurvey.tools.speedutils

"""
This module provides utility functions for efficient DataFrame concatenation and array pair matching.
"""

import itertools
import pandas
import numpy as np


def isin_pair_elements(elements, test_elements):
    """
    Test whether each pair of integers in elements is present in test_elements.

    Only the first two columns of each input are used. Pairs are compared
    exactly: two pairs match only if both members are equal.

    Parameters
    ----------
    elements: array_like
        Array of integer pairs to test, one pair per row.
    test_elements: array_like
        Array of integer pairs defining the reference set, one pair per row.

    Returns
    -------
    isin: ndarray, bool
        Boolean array with one entry per row of `elements`. True if the
        corresponding pair in `elements` is present in `test_elements`,
        False otherwise.
    """
    elements = np.asarray(elements)
    test_elements = np.asarray(test_elements)
    n_elements = len(elements)

    # Encode both sets together so that they share one consistent encoding.
    first = np.concatenate([elements[:, 0], test_elements[:, 0]])
    second = np.concatenate([elements[:, 1], test_elements[:, 1]])
    if first.size == 0:
        return np.zeros(n_elements, dtype=bool)

    # Fast path: pack each pair into a single int64 key. The packing is only
    # used when it is provably injective, i.e. the first member fits in a
    # signed 32-bit integer and the second in an unsigned 32-bit integer.
    if (np.issubdtype(first.dtype, np.integer)
            and np.issubdtype(second.dtype, np.integer)):
        fits_in_64_bits = (
            -(2 ** 31) <= int(first.min())
            and int(first.max()) < 2 ** 31
            and 0 <= int(second.min())
            and int(second.max()) < 2 ** 32
        )
        if fits_in_64_bits:
            # Cast before shifting so narrow input dtypes cannot overflow.
            packed = (first.astype(np.int64) << 32) | second.astype(np.int64)
            return np.isin(packed[:n_elements], packed[n_elements:])

    # General path: give every distinct row a unique integer id. Slower than
    # bit packing, but free of collisions whatever the values are.
    _, pair_ids = np.unique(
        np.column_stack([first, second]), axis=0, return_inverse=True
    )
    # The shape of the inverse differs across numpy versions; flatten it.
    pair_ids = pair_ids.ravel()
    return np.isin(pair_ids[:n_elements], pair_ids[n_elements:])
# pandas concat tricks suggested by: AntoineGillesLordet (https://github.com/MickaelRigault/skysurvey/issues/35)
# arranged by: Mickael Rigault
def chunk_dfs(dfs, chunk_size):
    """
    Group the items of an iterable of DataFrames into consecutive chunks.

    Parameters
    ----------
    dfs: iterable of `pandas.DataFrame`
        Iterable yielding the DataFrames to group.
    chunk_size: int
        Number of DataFrames collected before a chunk is emitted.

    Yields
    ------
    chunk : list of `pandas.DataFrame`
        DataFrames belonging to the current chunk, in input order.
    size : int
        Number of DataFrames in the chunk; equal to `chunk_size` except
        possibly for the final, shorter chunk.
    """
    pending = []
    for frame in dfs:
        pending.append(frame)
        if len(pending) != chunk_size:
            continue
        # A full chunk is ready: hand it over and start a fresh list.
        yield pending, chunk_size
        pending = []

    # Whatever is left over forms the last (shorter) chunk.
    if len(pending) > 0:
        yield pending, len(pending)
def concat_chunk(dfs, **kwargs):
    """
    Concatenate one chunk of DataFrames with `pandas.concat`.

    Parameters
    ----------
    dfs: iterable of `pandas.DataFrame`
        DataFrames to concatenate; the iterable is consumed item by item.
    **kwargs
        Additional keyword arguments forwarded to `pandas.concat`.

    Returns
    -------
    `pandas.DataFrame`
        Concatenated DataFrame.
    """
    # Iterate explicitly so that any iterable is handed to pandas as a plain
    # sequence of its items.
    frames = list(dfs)
    return pandas.concat(frames, **kwargs)
def eff_concat(dfs, chunk_size, keys=None, **kwargs):
    """
    Efficiently concatenate a large number of DataFrames by chunking.

    The DataFrames are first concatenated `chunk_size` at a time, and the
    resulting intermediate DataFrames are then concatenated together. The
    result is meant to match `pandas.concat(dfs, keys=keys, **kwargs)`.

    Parameters
    ----------
    dfs: iterable of `pandas.DataFrame`
        DataFrames to concatenate. The iterable is fully consumed.
    chunk_size: int
        Number of DataFrames per chunk. If there are fewer DataFrames than
        `chunk_size` (or `chunk_size` is smaller than 1), a single plain
        concatenation is performed.
    keys : sequence, optional
        Keys to use for indexing, passed to `pandas.concat`. When chunking,
        the corresponding slice of keys is passed to each chunk, so `keys`
        must support slicing. Default is None.
    **kwargs
        Additional keyword arguments passed to `pandas.concat`, both when
        building each chunk and when merging the chunks (except `names` and
        `levels`, which only apply where `keys` are used).

    Returns
    -------
    `pandas.DataFrame`
        Concatenated DataFrame.
    """
    # The number of frames is needed up front, so materialise the input once.
    dfs = list(dfs)
    chunk_size = int(chunk_size)
    n_dfs = len(dfs)

    # Not enough frames for chunking to pay off: plain concatenation.
    if chunk_size < 1 or n_dfs < chunk_size:
        return pandas.concat(dfs, keys=keys, **kwargs)

    def _concat_slice(start):
        """Concatenate the frames (and matching keys) of one chunk."""
        stop = start + chunk_size
        chunk_keys = None if keys is None else keys[start:stop]
        return pandas.concat(dfs[start:stop], keys=chunk_keys, **kwargs)

    # The merging step must honour the same options as the per-chunk step
    # (axis, join, ignore_index, ...), otherwise the result differs from a
    # direct concatenation. `names` and `levels` describe the `keys` level,
    # which is already built inside each chunk, so they are not repeated.
    merge_kwargs = {
        name: value
        for name, value in kwargs.items()
        if name not in ("names", "levels")
    }
    chunks = [_concat_slice(start) for start in range(0, n_dfs, chunk_size)]
    return pandas.concat(chunks, **merge_kwargs)