Source code for mhkit.dolfyn.adv.clean

"""Module containing functions to clean data
"""

import warnings
import numpy as np
from ..velocity import VelBinner
from ..tools.misc import group, slice1d_along_axis


sin = np.sin
cos = np.cos


def clean_fill(u, mask, npt=12, method="cubic", maxgap=6):
    """
    Interpolate over mask values in timeseries data using the specified
    method.

    Parameters
    ----------
    u : xarray.DataArray
        The dataArray to clean.
    mask : numpy.ndarray of bool
        Logical tensor of elements to "nan" out (from `spike_thresh`,
        `range_limit`, or `GN2002`) and replace
    npt : int
        The number of points on either side of the bad values that
        interpolation occurs over
    method : string
        Interpolation method to use (linear, cubic, pchip, etc).
        Default is 'cubic'
    maxgap : numeric
        Maximum gap of missing data to interpolate across. Default is 6

    Returns
    -------
    da : xarray.DataArray
        The dataArray with nan's filled in

    See Also
    --------
    xarray.DataArray.interpolate_na()
    """
    # Apply mask
    u.values[..., mask] = np.nan

    # Remove bad data for 2D+ and 1D timeseries variables
    if "dir" in u.dims:
        for i in range(u.shape[0]):
            u[i] = _interp_nan(u[i], npt, method, maxgap)
    else:
        u = _interp_nan(u, npt, method, maxgap)

    return u
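
# Usage sketch for `clean_fill` (illustrative; assumes `ds` is an ADV
# dataset with a "vel" variable, e.g. loaded via `mhkit.dolfyn.read`):
#
#     >>> mask = spike_thresh(ds["vel"][0], thresh=10)
#     >>> ds["vel"][0] = clean_fill(ds["vel"][0], mask, npt=12,
#     ...                           method="cubic", maxgap=6)
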
def _interp_nan(da, npt, method, maxgap):
    """
    Interpolate over the NaN values in `da`.

    Parameters
    ----------
    da : xarray.DataArray
        The field to be cleaned
    npt : int
        The number of points on either side of the gap that the fit
        occurs over
    method : string
        Interpolation scheme to use (linear, cubic, pchip, etc)
    maxgap : int
        Max number of consecutive NaN's to interpolate across

    Returns
    -------
    da : xarray.DataArray
        The dataArray with nan's filled in
    """
    searching = True
    bds = da.isnull().values
    ntail = 0
    pos = 0
    # The index array:
    i = np.arange(len(da), dtype=np.uint32)

    while pos < len(da):
        if searching:
            # Check the point
            if bds[pos]:
                # If it's bad, mark the start
                start = max(pos - npt, 0)
                # And stop searching.
                searching = False
            pos += 1
            # Continue...
        else:
            # Another bad point?
            if bds[pos]:  # Yes
                # Reset ntail
                ntail = 0
            else:  # No
                # Add to the tail of the block.
                ntail += 1
            pos += 1

            if ntail == npt or pos == len(da):
                # This is the block we are interpolating over
                i_int = i[start:pos]
                da[i_int] = da[i_int].interpolate_na(
                    dim=da.dims[-1], method=method, use_coordinate=True, limit=maxgap
                )
                # Reset
                searching = True
                ntail = 0

    return da
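
# Worked example of the block-finding loop above (illustrative): with
# npt=12 and a single NaN run at indices 100-103, `start` is pinned at
# 100 - 12 = 88, and scanning continues until 12 consecutive good points
# follow the run (pos = 116), so `interpolate_na` is applied to the
# window da[88:116].
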
def fill_nan_ensemble_mean(u, mask, fs, window):
    """
    Fill missing values with the ensemble mean.

    Parameters
    ----------
    u : xarray.DataArray (..., time)
        The dataArray to clean. Can be 1D or 2D.
    mask : numpy.ndarray of bool
        Logical tensor of elements to "nan" out (from `spike_thresh`,
        `range_limit`, or `GN2002`) and replace
    fs : int
        Instrument sampling frequency [Hz]
    window : int
        Size of window in seconds used to calculate ensemble means

    Returns
    -------
    da : xarray.DataArray
        The dataArray with nan's filled in

    Notes
    -----
    Gaps larger than the ensemble size will not get filled in.
    """
    u = u.where(~mask)
    bnr = VelBinner(n_bin=window * fs, fs=fs)

    if len(u.shape) == 1:
        var = u.values[None, :]
    else:
        var = u.values

    vel = np.empty(var.shape)
    vel_reshaped = bnr.reshape(var)
    vel_mean = np.nanmean(vel_reshaped, axis=-1)

    # If there are extra datapoints trimmed off after the last ensemble,
    # take them into account by filling in another ensemble with means
    diff = vel.shape[-1] - vel_reshaped.size // vel.shape[0]
    # diff = number of extra points
    extra_nans = vel_reshaped.shape[-1] - diff
    if diff:
        vel = np.empty((var.shape[0], var.shape[-1] + extra_nans))
        extra = var[:, -diff:]
        empty = np.empty((vel.shape[0], extra_nans)) * np.nan
        extra = np.concatenate((extra, empty), axis=-1)
        vel_reshaped = np.concatenate((vel_reshaped, extra[:, None, :]), axis=1)
        extra_mean = np.nanmean(extra, axis=-1)
        vel_mean = np.concatenate((vel_mean, extra_mean[:, None]), axis=-1)

    # Create a matrix the same size as the reshaped array, and mask out the
    # non-missing values. Then add the two matrices together.
    vel_mean_matrix = np.tile(vel_mean[..., None], (1, 1, bnr.n_bin))
    vel_missing = np.isnan(vel_reshaped)
    vel_mask = np.ma.masked_array(vel_mean_matrix, ~vel_missing).filled(np.nan)
    vel_filled = np.where(
        np.isnan(vel_reshaped), vel_mask, vel_reshaped + np.nan_to_num(vel_mask)
    )

    # "Unshape" the data
    for i in range(var.shape[0]):
        vel[i] = np.ravel(vel_filled[i], "C")

    if diff:
        # Trim off the extra means
        u.values = np.squeeze(vel[:, :-extra_nans])
    else:
        u.values = np.squeeze(vel)

    return u
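
# Usage sketch for `fill_nan_ensemble_mean` (illustrative; the 16 Hz
# sampling rate and 60 s averaging window are assumptions, not defaults):
#
#     >>> mask = GN2002(ds["vel"])
#     >>> ds["vel"] = fill_nan_ensemble_mean(ds["vel"], mask, fs=16, window=60)
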
def spike_thresh(u, thresh=10):
    """
    Returns a logical vector where a spike in `u` of magnitude greater than
    `thresh` occurs. Both negative and positive spikes are found.

    Parameters
    ----------
    u : xarray.DataArray
        The timeseries data to clean.
    thresh : int
        Magnitude of velocity spike, must be positive. Default = 10

    Returns
    -------
    mask : numpy.ndarray
        Logical vector with spikes labeled as 'True'
    """
    du = np.diff(u.values, prepend=0)
    mask = (du > thresh) | (du < -thresh)

    return mask
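
# For example (illustrative), with thresh=10 a jump from 1 m/s to 12 m/s
# between consecutive samples gives du = 11 > 10, so the second sample is
# flagged; `prepend=0` keeps `mask` the same length as `u`.
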
def range_limit(u, range=[-5, 5]):
    """
    Returns a logical vector that is True where the values of `u` are outside
    of `range`.

    Parameters
    ----------
    u : xarray.DataArray
        The timeseries data to clean.
    range : list
        Min and max values beyond which data are masked. Default is [-5, 5]

    Returns
    -------
    mask : numpy.ndarray
        Logical vector with out-of-range values labeled as 'True'
    """
    return ~((range[0] < u.values) & (u.values < range[1]))
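
# Masks from the screening functions can be combined before filling
# (illustrative sketch; the [-3, 3] bounds are arbitrary):
#
#     >>> mask = spike_thresh(ds["vel"][0]) | range_limit(ds["vel"][0], [-3, 3])
#     >>> ds["vel"][0] = clean_fill(ds["vel"][0], mask)
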
def _calcab(al, Lu_std_u, Lu_std_d2u):
    """Solve equations 10 and 11 of Goring & Nikora (2002)"""
    return tuple(
        np.linalg.solve(
            np.array([[cos(al) ** 2, sin(al) ** 2], [sin(al) ** 2, cos(al) ** 2]]),
            np.array([(Lu_std_u) ** 2, (Lu_std_d2u) ** 2]),
        )
    )


def _phaseSpaceThresh(u):
    """Return a boolean mask of points falling outside the phase-space
    ellipsoid of Goring & Nikora (2002)."""
    if u.ndim == 1:
        u = u[:, None]
    u = np.array(u)
    Lu = (2 * np.log(u.shape[0])) ** 0.5
    u = u - u.mean(0)
    du = np.zeros_like(u)
    d2u = np.zeros_like(u)
    # Take the centered difference.
    du[1:-1] = (u[2:] - u[:-2]) / 2
    # And again.
    d2u[2:-2] = (du[1:-1][2:] - du[1:-1][:-2]) / 2
    p = u**2 + du**2 + d2u**2
    std_u = np.std(u, axis=0)
    std_du = np.std(du, axis=0)
    std_d2u = np.std(d2u, axis=0)
    alpha = np.arctan2(np.sum(u * d2u, axis=0), np.sum(u**2, axis=0))
    a = np.empty_like(alpha)
    b = np.empty_like(alpha)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", category=RuntimeWarning, message="invalid value encountered in "
        )
        for idx, al in enumerate(alpha):
            a[idx], b[idx] = _calcab(al, Lu * std_u[idx], Lu * std_d2u[idx])
    theta = np.arctan2(du, u)
    phi = np.arctan2((du**2 + u**2) ** 0.5, d2u)
    pe = (
        ((sin(phi) * cos(theta) * cos(alpha) + cos(phi) * sin(alpha)) ** 2) / a
        + ((sin(phi) * cos(theta) * sin(alpha) - cos(phi) * cos(alpha)) ** 2) / b
        + ((sin(phi) * sin(theta)) ** 2) / (Lu * std_du) ** 2
    ) ** -1
    pe[:, np.isnan(pe[0, :])] = 0
    return (p > pe).flatten("F")
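
# The linear system in `_calcab` above solves for the squared semi-axes
# (a, b) of the phase-space ellipse in the rotated u-d2u plane:
#
#     cos(al)**2 * a + sin(al)**2 * b = (Lu * std_u) ** 2
#     sin(al)**2 * a + cos(al)**2 * b = (Lu * std_d2u) ** 2
#
# where Lu = sqrt(2 * ln(n)) is the "universal threshold" for n samples.
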
def GN2002(u, npt=5000):
    """
    The Goring & Nikora 2002 'despiking' method, with the Wahl (2003)
    correction. Returns a logical vector that is True where spikes are
    identified.

    Parameters
    ----------
    u : xarray.DataArray
        The velocity array (1D or 3D) to clean.
    npt : int
        The number of points over which to perform the method.
        Default = 5000

    Returns
    -------
    mask : numpy.ndarray
        Logical vector with spikes labeled as 'True'
    """
    if not isinstance(u, np.ndarray):
        return GN2002(u.values, npt=npt)

    if u.ndim > 1:
        mask = np.zeros(u.shape, dtype="bool")
        for slc in slice1d_along_axis(u.shape, -1):
            mask[slc] = GN2002(u[slc], npt=npt)
        return mask

    mask = np.zeros(len(u), dtype="bool")

    # Find large bad segments (>npt/10):
    # group returns a vector of slice objects.
    bad_segs = group(np.isnan(u), min_length=int(npt // 10))
    if bad_segs.size > 2:
        # Break them up into separate regions:
        sp = 0
        ep = len(u)

        # Skip start and end bad_segs:
        if bad_segs[0].start == sp:
            sp = bad_segs[0].stop
            bad_segs = bad_segs[1:]
        if bad_segs[-1].stop == ep:
            ep = bad_segs[-1].start
            bad_segs = bad_segs[:-1]

        for ind in range(len(bad_segs)):
            bs = bad_segs[ind]  # bs is a slice object.
            # Clean the good region:
            mask[sp : bs.start] = GN2002(u[sp : bs.start], npt=npt)
            sp = bs.stop

        # Clean the last good region.
        mask[sp:ep] = GN2002(u[sp:ep], npt=npt)
        return mask

    c = 0
    ntot = len(u)
    nbins = int(ntot // npt)
    mask_last = np.zeros_like(mask) + np.inf
    mask[0] = True  # make sure we start.
    while mask.any():
        mask[: nbins * npt] = _phaseSpaceThresh(
            np.array(np.reshape(u[: (nbins * npt)], (npt, nbins), order="F"))
        )
        mask[-npt:] = _phaseSpaceThresh(u[-npt:])
        c += 1
        if c >= 100:
            raise Exception("GN2002 loop-limit exceeded.")
        if mask.sum() >= mask_last.sum():
            break
        mask_last = mask.copy()

    return mask
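
# End-to-end sketch (illustrative; the file path is a placeholder):
#
#     >>> import mhkit.dolfyn as dolfyn
#     >>> ds = dolfyn.read("vector_data.VEC")
#     >>> mask = GN2002(ds["vel"], npt=5000)
#     >>> ds["vel"] = clean_fill(ds["vel"], mask, maxgap=6)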