# Source code for mhkit.wave.io.hindcast.wind_toolkit

"""
Wind Toolkit Data Utility Functions
===================================

This module contains a collection of utility functions designed to facilitate 
the extraction, caching, and visualization of wind data from the WIND Toolkit 
hindcast dataset hosted on AWS. This dataset includes offshore wind hindcast data 
with various parameters like wind speed, direction, temperature, and pressure.

Key Functions:
--------------
- `region_selection`: Determines which predefined wind region a given latitude 
  and longitude fall within.
  
- `get_region_data`: Retrieves latitude and longitude data points for a specified 
  wind region. Uses caching to speed up repeated requests.
  
- `plot_region`: Plots the geographical extent of a specified wind region and 
  can overlay a given latitude-longitude point.
  
- `elevation_to_string`: Converts a parameter (e.g., 'windspeed') and elevation 
  values (e.g., [20, 40, 120]) to the formatted strings used in the WIND Toolkit.
  
- `request_wtk_point_data`: Fetches specified wind data parameters for given 
  latitude-longitude points and years from the WIND Toolkit hindcast dataset. 
  Supports caching for faster repeated data retrieval.

Dependencies:
-------------
- rex: Library to handle renewable energy datasets.
- pandas: Data manipulation and analysis.
- os, hashlib, pickle: Used for caching functionality.
- matplotlib: Used for plotting.

Notes:
------
- To access the WIND Toolkit hindcast data, users need to configure `h5pyd` 
  for data access on HSDS (see the metocean_example or WPTO_hindcast_example
  notebook for more details).
  
- While some functions perform basic checks (e.g., verifying that latitude 
  and longitude are within a predefined region), it's essential to understand 
  the boundaries of each region and the available parameters and elevations in the dataset.

Author: 
-------
akeeste
ssolson

Date:
-----
2023-09-26

"""

import os
import hashlib
import pickle
import pandas as pd

from rex import MultiYearWindX
import matplotlib.pyplot as plt
from mhkit.utils.cache import handle_caching
from mhkit.utils.type_handling import convert_to_dataset



# [docs]
def region_selection(lat_lon, preferred_region=""):
    """
    Returns the name of the predefined region in which the given coordinates reside.
    Can be used to check if the passed lat/lon pair is within the WIND Toolkit hindcast dataset.

    Parameters
    ----------
    lat_lon : tuple
        Latitude and longitude coordinates as floats or integers

    preferred_region : string (optional)
        Region to return when the coordinates fall inside the area shared by
        the 'Offshore_CA' and 'NW_Pacific' regions. Must be 'Offshore_CA' or
        'NW_Pacific' in that case; it is not consulted otherwise. Default = ''

    Returns
    -------
    region : string
        Name of predefined region for given coordinates

    Raises
    ------
    TypeError
        If an argument has the wrong type, if the coordinates are outside
        every predefined region, or if the coordinates fall in the overlap
        area and preferred_region is not one of the two overlapping regions.
    ValueError
        If lat_lon does not contain exactly two values.
    """
    if not isinstance(lat_lon, tuple):
        raise TypeError(f"lat_lon must be of type tuple, got {type(lat_lon).__name__}")

    if len(lat_lon) != 2:
        raise ValueError(f"lat_lon must be of length 2, got length {len(lat_lon)}")

    for value in lat_lon:
        if not isinstance(value, (float, int)):
            raise TypeError(
                f"lat_lon values must be floats or ints, got {type(value).__name__}"
            )

    if not isinstance(preferred_region, str):
        raise TypeError(
            f"preferred_region must be a string, got {type(preferred_region).__name__}"
        )

    # Note that this check is fast, but not robust because regions are not
    # rectangular on a lat-lon grid.
    # "CA_NWP_overlap" is listed first on purpose: when a point lies in the
    # shared area it is the first match and triggers the preferred_region logic.
    region_bounds = {
        "CA_NWP_overlap": {"lat": [41.213, 42.642], "lon": [-129.090, -121.672]},
        "Offshore_CA": {"lat": [31.932, 42.642], "lon": [-129.090, -115.806]},
        "Hawaii": {"lat": [15.565, 26.221], "lon": [-164.451, -151.278]},
        "NW_Pacific": {"lat": [41.213, 49.579], "lon": [-130.831, -121.672]},
        "Mid_Atlantic": {"lat": [37.273, 42.211], "lon": [-76.427, -64.800]},
    }

    lat, lon = lat_lon
    matches = [
        name
        for name, bounds in region_bounds.items()
        if bounds["lat"][0] <= lat <= bounds["lat"][1]
        and bounds["lon"][0] <= lon <= bounds["lon"][1]
    ]

    # Check for "no match" before indexing into the list; otherwise an
    # out-of-bounds point surfaces as an IndexError instead of this message.
    if not matches:
        raise TypeError(
            f"Coordinates {lat_lon} out of bounds. Must be within {region_bounds}"
        )

    if matches[0] != "CA_NWP_overlap":
        return matches[0]

    if preferred_region not in ("Offshore_CA", "NW_Pacific"):
        raise TypeError(
            f"Preferred_region ({preferred_region}) must be 'Offshore_CA' or 'NW_Pacific' when lat_lon {lat_lon} falls in the overlap region"
        )
    return preferred_region




# [docs]
def get_region_data(region):
    """
    Retrieves the latitude and longitude data points for the specified region
    from the cache if available; otherwise, fetches the data and caches it for
    subsequent calls.

    The cache file lives in ``~/.cache/mhkit/hindcast`` and is named after the
    MD5 hash of the `region` string. If that file exists it is unpickled and
    returned. If not, the coordinates are read through rex, pickled to the
    cache file, and then returned.

    Parameters
    ----------
    region : str
        Name of the predefined region in the WIND Toolkit for which to
        retrieve latitude and longitude data points. It is case-sensitive.
        Examples: 'Offshore_CA','Hawaii','Mid_Atlantic','NW_Pacific'

    Returns
    -------
    lats : numpy.ndarray
        A 1D array containing the latitude coordinates of data points
        in the specified region.

    lons : numpy.ndarray
        A 1D array containing the longitude coordinates of data points
        in the specified region.

    Raises
    ------
    TypeError
        If region is not a string.

    Example
    -------
    >>> lats, lons = get_region_data('Offshore_CA')
    """
    if not isinstance(region, str):
        raise TypeError("region must be of type string")

    # Define the path to the cache directory and make sure it exists
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "mhkit", "hindcast")
    os.makedirs(cache_dir, exist_ok=True)

    # The cache file name is derived only from the region name
    hash_id = hashlib.md5(region.encode()).hexdigest()
    cache_file = os.path.join(cache_dir, f"{hash_id}.pkl")

    if os.path.isfile(cache_file):
        # NOTE(review): pickle.load can execute arbitrary code; this relies on
        # the user's own cache directory being trusted.
        with open(cache_file, "rb") as f:
            lats, lons = pickle.load(f)
        return lats, lons

    wind_path = "/nrel/wtk/" + region.lower() + "/" + region + "_*.h5"
    wind_kwargs = {
        "tree": None,
        "unscale": True,
        "str_decode": True,
        "hsds": True,
        # Only coordinates are read; presumably the grid is identical for
        # every year, so one year is enough — TODO confirm.
        "years": [2019],
    }

    # Get the latitude and longitude list from the region in rex. The context
    # manager closes the remote handle once the coordinates have been read.
    with MultiYearWindX(wind_path, **wind_kwargs) as rex_wind:
        lats = rex_wind.lat_lon[:, 0]
        lons = rex_wind.lat_lon[:, 1]

    # Save data to cache
    with open(cache_file, "wb") as f:
        pickle.dump((lats, lons), f)

    return lats, lons




# [docs]
def plot_region(region, lat_lon=None, ax=None):
    """
    Draws every grid point of a WIND Toolkit region so its footprint can be
    inspected; regions are not all rectangular in latitude/longitude.

    Parameters
    ----------
    region : string
        Name of predefined region in the WIND Toolkit
        Options: 'Offshore_CA','Hawaii','Mid_Atlantic','NW_Pacific'
    lat_lon : tuple (optional)
        Latitude and longitude pair (in that order) drawn on top of the
        region, to help choose a valid location for data analysis.
    ax : matplotlib axes object (optional)
        Axes to draw on. If None, a new figure and axes are created.

    Returns
    ---------
    ax : matplotlib pyplot axes
        The axes holding the plot.

    Raises
    ------
    TypeError
        If region is not a string.
    ValueError
        If region is not one of the supported region names.
    """
    if not isinstance(region, str):
        raise TypeError("region must be of type string")

    supported_regions = ("Offshore_CA", "Hawaii", "Mid_Atlantic", "NW_Pacific")
    if region not in supported_regions:
        raise ValueError(
            f'{region} not in list of supported regions: {", ".join(supported_regions)}'
        )

    region_lats, region_lons = get_region_data(region)

    # Create axes only when the caller did not supply any
    if ax is None:
        _, ax = plt.subplots()

    # Longitude goes on the x axis, latitude on the y axis
    ax.plot(region_lons, region_lats, "o", label=f"{region} region")
    if lat_lon is not None:
        point_lat = lat_lon[0]
        point_lon = lat_lon[1]
        ax.plot(point_lon, point_lat, "o", label="Specified lat-lon point")

    ax.set_xlabel("Longitude (deg)")
    ax.set_ylabel("Latitude (deg)")
    ax.grid()
    ax.set_title(f"Extent of the WIND Toolkit {region} region")
    ax.legend()

    return ax




# [docs]
def elevation_to_string(parameter, elevations):
    """
    Takes in a parameter (e.g. 'windspeed') and elevations (e.g. [20, 40, 120])
    and returns the formatted strings that are input to WIND Toolkit (e.g. windspeed_10m).
    Does not check parameter against the elevation levels. This is done in request_wtk_point_data.

    Parameters
    ----------
    parameter: string
        Name of the WIND toolkit parameter.
        Options: 'windspeed', 'winddirection', 'temperature', 'pressure'
    elevations : int, float or list
        A single elevation or a list of elevations.
        Values can range from approximately 20 to 200 in increments of 20, depending
        on the parameter in question. See Documentation for request_wtk_point_data
        for the full list of available parameters.
        Whole-number floats (e.g. 20.0) are written without the decimal part
        so the result matches the 'windspeed_20m' naming pattern.

    Returns
    ---------
    parameter_list: list
        Formatted List of WIND Toolkit parameter strings, one per elevation

    Raises
    ------
    TypeError
        If parameter is not a string, or elevations is not a number or list.
    ValueError
        If parameter is not one of the supported parameter names.
    """

    if not isinstance(parameter, str):
        raise TypeError(f"parameter must be a string, got {type(parameter)}")

    # bool is a subclass of int, so it has to be rejected explicitly
    if isinstance(elevations, bool) or not isinstance(elevations, (int, float, list)):
        raise TypeError(f"elevations must be a number or list, got {type(elevations)}")

    if parameter not in ["windspeed", "winddirection", "temperature", "pressure"]:
        raise ValueError(f"Invalid parameter: {parameter}")

    # A scalar passes validation, so it must also be usable: wrap it rather
    # than failing when iterating over a non-iterable number.
    if not isinstance(elevations, list):
        elevations = [elevations]

    def _format(elevation):
        # 20.0 -> "20" so that the name reads "windspeed_20m", not "windspeed_20.0m"
        if isinstance(elevation, float) and elevation.is_integer():
            elevation = int(elevation)
        return f"{parameter}_{elevation}m"

    return [_format(elevation) for elevation in elevations]




# [docs]
def request_wtk_point_data(
    time_interval,
    parameter,
    lat_lon,
    years,
    preferred_region="",
    tree=None,
    unscale=True,
    str_decode=True,
    hsds=True,
    clear_cache=False,
    to_pandas=True,
):
    """
    Returns data from the WIND Toolkit offshore wind hindcast hosted on
    AWS at the specified latitude and longitude point(s), or the closest
    available point(s). Visit https://registry.opendata.aws/nrel-pds-wtk/
    for more information about the dataset and available locations and years.

    Calls with multiple parameters must have the same time interval. Calls
    with multiple locations must use the same region (use the plot_region function).

    Results are cached on disk under ``~/.cache/mhkit/hindcast``, keyed by the
    request arguments; a repeated request is answered from the cache.

    Note: To access the WIND Toolkit hindcast data, you will need to
    configure h5pyd for data access on HSDS. Please see the
    metocean_example or WPTO_hindcast_example notebook for more information.

    Parameters
    ----------
    time_interval : string
        Data set type of interest
        Options: '1-hour' '5-minute'
    parameter : string or list of strings
        Dataset parameter to be downloaded. Other parameters may be available.
        This list is limited to those available at both 5-minute and 1-hour
        time intervals for all regions.
        Options:
            'precipitationrate_0m', 'inversemoninobukhovlength_2m',
            'relativehumidity_2m', 'surface_sea_temperature',
            'pressure_0m', 'pressure_100m', 'pressure_200m',
            'temperature_10m', 'temperature_20m', 'temperature_40m',
            'temperature_60m', 'temperature_80m', 'temperature_100m',
            'temperature_120m', 'temperature_140m', 'temperature_160m',
            'temperature_180m', 'temperature_200m',
            'winddirection_10m', 'winddirection_20m', 'winddirection_40m',
            'winddirection_60m', 'winddirection_80m', 'winddirection_100m',
            'winddirection_120m', 'winddirection_140m', 'winddirection_160m',
            'winddirection_180m', 'winddirection_200m',
            'windspeed_10m', 'windspeed_20m', 'windspeed_40m',
            'windspeed_60m', 'windspeed_80m', 'windspeed_100m',
            'windspeed_120m', 'windspeed_140m', 'windspeed_160m',
            'windspeed_180m', 'windspeed_200m'
    lat_lon : tuple or list of tuples
        Latitude longitude pairs at which to extract data. Use plot_region() or
        region_selection() to see the corresponding region for a given location.
    years : list
        Year(s) to be accessed. The years 2000-2019 available (up to 2020
        for Mid-Atlantic). Examples: [2015] or [2004,2006,2007]
    preferred_region : string (optional)
        Region that the lat_lon belongs to ('Offshore_CA' or 'NW_Pacific').
        Required when a lat_lon point falls in both the Offshore California
        and NW Pacific regions. Overlap region defined by
        latitude = (41.213, 42.642) and longitude = (-129.090, -121.672).
        Default = ''
    tree : str | cKDTree (optional)
        cKDTree or path to .pkl file containing pre-computed tree
        of lat, lon coordinates, default = None.
        NOTE(review): the type check below only lets a string or None
        through, so a cKDTree instance is currently rejected.
    unscale : bool (optional)
        Boolean flag to automatically unscale variables on extraction
        Default = True
    str_decode : bool (optional)
        Boolean flag to decode the bytestring meta data into normal
        strings. Setting this to False will speed up the meta data read.
        Default = True
    hsds : bool (optional)
        Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
        behind HSDS. Setting to False will indicate to look for files on
        local machine, not AWS. Default = True
    clear_cache : bool (optional)
        Boolean flag to clear the cache related to this specific request.
        Default is False.
    to_pandas: bool (optional)
        Flag to output pandas instead of xarray. Default = True.

    Returns
    ---------
    data: DataFrame
        Data indexed by datetime with columns named for parameter and
        corresponding metadata index. When to_pandas is False the frame is
        passed through convert_to_dataset and meta is stored in its attrs.
    meta: DataFrame
        Location metadata for the requested data location

    Raises
    ------
    TypeError
        If an argument has the wrong type, if time_interval is not
        '1-hour' or '5-minute', or if the requested locations do not all
        fall in the same region.
    ValueError
        If parameter is an empty list.
    """

    if not isinstance(parameter, (str, list)):
        raise TypeError("parameter must be of type string or list")
    if not isinstance(lat_lon, (list, tuple)):
        raise TypeError("lat_lon must be of type list or tuple")
    if not isinstance(time_interval, str):
        raise TypeError("time_interval must be a string")
    if not isinstance(years, list):
        raise TypeError("years must be a list")
    if not isinstance(preferred_region, str):
        raise TypeError("preferred_region must be a string")
    if not isinstance(tree, (str, type(None))):
        raise TypeError("tree must be a string or None")
    if not isinstance(unscale, bool):
        raise TypeError("unscale must be bool type")
    if not isinstance(str_decode, bool):
        raise TypeError("str_decode must be bool type")
    if not isinstance(hsds, bool):
        raise TypeError("hsds must be bool type")
    if not isinstance(clear_cache, bool):
        raise TypeError("clear_cache must be of type bool")
    if not isinstance(to_pandas, bool):
        raise TypeError("to_pandas must be of type bool")
    # Fail early instead of opening a remote connection and then failing
    # on an empty concatenation.
    if isinstance(parameter, list) and not parameter:
        raise ValueError("parameter must contain at least one parameter name")

    # Define the path to the cache directory
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "mhkit", "hindcast")

    # Construct a string representation of the function parameters.
    # to_pandas is deliberately not part of the key: the cache stores the
    # pandas form and the conversion is applied after loading.
    hash_params = f"{time_interval}_{parameter}_{lat_lon}_{years}_{preferred_region}_{tree}_{unscale}_{str_decode}_{hsds}"

    # Use handle_caching to manage caching.
    data, meta, _ = handle_caching(
        hash_params,
        cache_dir,
        cache_content={"data": None, "metadata": None, "write_json": None},
        clear_cache_file=clear_cache,
    )

    if data is None or meta is None:
        # A single location is a pair of numbers; region_selection accepts
        # ints as well as floats, so both must be recognised here. Anything
        # else is treated as a sequence of locations.
        if isinstance(lat_lon[0], (float, int)):
            region = region_selection(lat_lon, preferred_region)
        else:
            regions = [region_selection(loc, preferred_region) for loc in lat_lon]
            if regions.count(regions[0]) != len(regions):
                raise TypeError("Coordinates must be within the same region!")
            region = regions[0]

        if time_interval == "1-hour":
            wind_path = f"/nrel/wtk/{region.lower()}/{region}_*.h5"
        elif time_interval == "5-minute":
            wind_path = f"/nrel/wtk/{region.lower()}-5min/{region}_*.h5"
        else:
            raise TypeError(
                f"Invalid time_interval '{time_interval}', must be '1-hour' or '5-minute'"
            )

        wind_kwargs = {
            "tree": tree,
            "unscale": unscale,
            "str_decode": str_decode,
            "hsds": hsds,
            "years": years,
        }

        requested = parameter if isinstance(parameter, list) else [parameter]
        frames = []
        with MultiYearWindX(wind_path, **wind_kwargs) as rex_wind:
            for name in requested:
                frame = rex_wind.get_lat_lon_df(name, lat_lon)
                # Original column labels are used below to look up metadata rows
                site_ids = frame.columns[:]
                # Rename by position so that column "<name>_<i>" always lines
                # up with row i of meta, even if two requested points resolve
                # to the same original column label.
                frame.columns = [f"{name}_{i}" for i in range(len(site_ids))]
                frames.append(frame)

            # Every parameter is read at the same lat_lon, so the labels from
            # the last frame are used for the metadata lookup.
            meta = rex_wind.meta.loc[site_ids, :]
            meta = meta.reset_index(drop=True)

        if isinstance(parameter, list):
            data = pd.concat(frames, axis=1)
        else:
            data = frames[0]

        # Save the retrieved data and metadata to cache.
        handle_caching(
            hash_params,
            cache_dir,
            cache_content={"data": data, "metadata": meta, "write_json": None},
        )

    if not to_pandas:
        data = convert_to_dataset(data)
        data.attrs = meta

    return data, meta