Source code for mhkit.wave.io.hindcast.wind_toolkit

"""
Wind Toolkit Data Utility Functions
===================================

This module contains a collection of utility functions designed to facilitate 
the extraction, caching, and visualization of wind data from the WIND Toolkit 
hindcast dataset hosted on AWS. This dataset includes offshore wind hindcast data 
with various parameters like wind speed, direction, temperature, and pressure.

Key Functions:
--------------
- `region_selection`: Determines which predefined wind region a given latitude 
  and longitude fall within.
  
- `get_region_data`: Retrieves latitude and longitude data points for a specified 
  wind region. Uses caching to speed up repeated requests.
  
- `plot_region`: Plots the geographical extent of a specified wind region and 
  can overlay a given latitude-longitude point.
  
- `elevation_to_string`: Converts a parameter (e.g., 'windspeed') and elevation 
  values (e.g., [20, 40, 120]) to the formatted strings used in the WIND Toolkit.
  
- `request_wtk_point_data`: Fetches specified wind data parameters for given 
  latitude-longitude points and years from the WIND Toolkit hindcast dataset. 
  Supports caching for faster repeated data retrieval.

Dependencies:
-------------
- rex: Library to handle renewable energy datasets.
- pandas: Data manipulation and analysis.
- os, hashlib, pickle: Used for caching functionality.
- matplotlib: Used for plotting.

Notes:
------
- To access the WIND Toolkit hindcast data, users need to configure `h5pyd` 
  for data access on HSDS (see the metocean_example or WPTO_hindcast_example
  notebook for more details).
  
- While some functions perform basic checks (e.g., verifying that latitude 
  and longitude are within a predefined region), it's essential to understand 
  the boundaries of each region and the available parameters and elevations in the dataset.

Author: 
-------
akeeste
ssolson

Date:
-----
2023-09-26

"""

import os
import hashlib
import pickle
import pandas as pd

from rex import MultiYearWindX
import matplotlib.pyplot as plt
from mhkit.utils.cache import handle_caching
from mhkit.utils.type_handling import convert_to_dataset


def region_selection(lat_lon, preferred_region=""):
    """
    Returns the name of the predefined region in which the given
    coordinates reside. Can be used to check if the passed lat/lon
    pair is within the WIND Toolkit hindcast dataset.

    Parameters
    ----------
    lat_lon : tuple
        Latitude and longitude coordinates as floats or integers
    preferred_region : string (optional)
        Region to return ('Offshore_CA' or 'NW_Pacific') when lat_lon
        falls inside the area where those two regions overlap. Ignored
        for coordinates outside the overlap. Default = ''

    Returns
    -------
    region : string
        Name of predefined region for given coordinates

    Raises
    ------
    TypeError
        If an argument has the wrong type, if the coordinates are outside
        every predefined region, or if the coordinates fall in the overlap
        area and preferred_region is not 'Offshore_CA' or 'NW_Pacific'.
    ValueError
        If lat_lon does not contain exactly two values.
    """
    if not isinstance(lat_lon, tuple):
        raise TypeError(f"lat_lon must be of type tuple, got {type(lat_lon).__name__}")
    if len(lat_lon) != 2:
        raise ValueError(f"lat_lon must be of length 2, got length {len(lat_lon)}")
    if not isinstance(lat_lon[0], (float, int)):
        raise TypeError(
            f"lat_lon values must be floats or ints, got {type(lat_lon[0]).__name__}"
        )
    if not isinstance(lat_lon[1], (float, int)):
        raise TypeError(
            f"lat_lon values must be floats or ints, got {type(lat_lon[1]).__name__}"
        )
    if not isinstance(preferred_region, str):
        raise TypeError(
            f"preferred_region must be a string, got {type(preferred_region).__name__}"
        )

    # Note that this check is fast, but not robust because regions are not
    # rectangular on a lat-lon grid.
    # "CA_NWP_overlap" must stay first: it is the intersection of the
    # Offshore_CA and NW_Pacific boxes and has to win the lookup below.
    rDict = {
        "CA_NWP_overlap": {"lat": [41.213, 42.642], "lon": [-129.090, -121.672]},
        "Offshore_CA": {"lat": [31.932, 42.642], "lon": [-129.090, -115.806]},
        "Hawaii": {"lat": [15.565, 26.221], "lon": [-164.451, -151.278]},
        "NW_Pacific": {"lat": [41.213, 49.579], "lon": [-130.831, -121.672]},
        "Mid_Atlantic": {"lat": [37.273, 42.211], "lon": [-76.427, -64.800]},
    }

    lat, lon = lat_lon
    matches = [
        name
        for name, box in rDict.items()
        if box["lat"][0] <= lat <= box["lat"][1]
        and box["lon"][0] <= lon <= box["lon"][1]
    ]

    # The emptiness check has to come before any indexing; otherwise
    # out-of-bounds coordinates surface as a bare IndexError.
    if not matches:
        raise TypeError(f"Coordinates {lat_lon} out of bounds. Must be within {rDict}")

    if matches[0] == "CA_NWP_overlap":
        if preferred_region in ("Offshore_CA", "NW_Pacific"):
            return preferred_region
        raise TypeError(
            f"Preferred_region ({preferred_region}) must be 'Offshore_CA' or 'NW_Pacific' when lat_lon {lat_lon} falls in the overlap region"
        )

    return matches[0]
def get_region_data(region):
    """
    Retrieves the latitude and longitude data points for the specified region
    from the cache if available; otherwise, fetches the data and caches it for
    subsequent calls.

    The cache file lives in ``~/.cache/mhkit/hindcast`` and is named after the
    MD5 hex digest of `region`. If that file exists, its contents are loaded
    and returned. If not, the coordinates are read from the 2019 file of the
    region through rex, written to the cache, and returned.

    Parameters
    ----------
    region : str
        Name of the predefined region in the WIND Toolkit for which to
        retrieve latitude and longitude data points. It is case-sensitive.
        Examples: 'Offshore_CA','Hawaii','Mid_Atlantic','NW_Pacific'

    Returns
    -------
    lats : numpy.ndarray
        A 1D array containing the latitude coordinates of data points
        in the specified region.
    lons : numpy.ndarray
        A 1D array containing the longitude coordinates of data points
        in the specified region.

    Raises
    ------
    TypeError
        If region is not a string.

    Example
    -------
    >>> lats, lons = get_region_data('Offshore_CA')
    """
    if not isinstance(region, str):
        raise TypeError("region must be of type string")

    # Locate (and create if needed) the cache directory
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "mhkit", "hindcast")
    os.makedirs(cache_dir, exist_ok=True)

    # One cache file per region, keyed by the hash of the region name
    hash_id = hashlib.md5(region.encode()).hexdigest()
    cache_file = os.path.join(cache_dir, f"{hash_id}.pkl")

    if os.path.isfile(cache_file):
        # NOTE: pickle.load executes arbitrary code from the file. This is
        # acceptable only because the file sits in the user's own cache
        # directory and is written by this function.
        with open(cache_file, "rb") as f:
            lats, lons = pickle.load(f)
        return lats, lons

    wind_path = "/nrel/wtk/" + region.lower() + "/" + region + "_*.h5"
    windKwargs = {
        "tree": None,
        "unscale": True,
        "str_decode": True,
        "hsds": True,
        "years": [2019],
    }

    # Use the handler as a context manager so it is closed once the
    # coordinates have been read, instead of being left open.
    with MultiYearWindX(wind_path, **windKwargs) as rex_wind:
        coordinates = rex_wind.lat_lon
        lats = coordinates[:, 0]
        lons = coordinates[:, 1]

    # Save data to cache
    with open(cache_file, "wb") as f:
        pickle.dump((lats, lons), f)

    return lats, lons
def plot_region(region, lat_lon=None, ax=None):
    """
    Draws the grid points of a WIND Toolkit region so its extent can be
    inspected. Regions are not all rectangular, so this helps when choosing
    coordinates for a data request.

    Parameters
    ----------
    region : string
        Name of predefined region in the WIND Toolkit
        Options: 'Offshore_CA','Hawaii','Mid_Atlantic','NW_Pacific'
    lat_lon : tuple (optional)
        Latitude and longitude pair to draw on top of the region, indexed
        as (latitude, longitude).
    ax : matplotlib axes object (optional)
        Axes for plotting. If None, then a new figure is created.

    Returns
    ---------
    ax : matplotlib pyplot axes
        The axes that were drawn on.

    Raises
    ------
    TypeError
        If region is not a string.
    ValueError
        If region is not one of the supported regions.
    """
    if not isinstance(region, str):
        raise TypeError("region must be of type string")

    supported_regions = ["Offshore_CA", "Hawaii", "Mid_Atlantic", "NW_Pacific"]
    region_is_supported = region in supported_regions
    if not region_is_supported:
        raise ValueError(
            f'{region} not in list of supported regions: {", ".join(supported_regions)}'
        )

    region_lats, region_lons = get_region_data(region)

    # Only create a figure when the caller did not supply axes
    if ax is None:
        _, ax = plt.subplots()

    # Longitude on x, latitude on y
    ax.plot(region_lons, region_lats, "o", label=f"{region} region")

    if lat_lon is not None:
        point_lon = lat_lon[1]
        point_lat = lat_lon[0]
        ax.plot(point_lon, point_lat, "o", label="Specified lat-lon point")

    ax.set_xlabel("Longitude (deg)")
    ax.set_ylabel("Latitude (deg)")
    ax.grid()
    ax.set_title(f"Extent of the WIND Toolkit {region} region")
    ax.legend()

    return ax
def elevation_to_string(parameter, elevations):
    """
    Takes in a parameter (e.g. 'windspeed') and elevations (e.g. [20, 40, 120])
    and returns the formatted strings that are input to WIND Toolkit
    (e.g. ['windspeed_20m', 'windspeed_40m', 'windspeed_120m']).
    Does not check parameter against the elevation levels. This is done in
    request_wtk_point_data.

    Parameters
    ----------
    parameter: string
        Name of the WIND toolkit parameter.
        Options: 'windspeed', 'winddirection', 'temperature', 'pressure'
    elevations : float, int, or list
        A single elevation or a list of elevations. Values can range from
        approximately 20 to 200 in increments of 20, depending on the
        parameter in question. See Documentation for request_wtk_point_data
        for the full list of available parameters.

    Returns
    ---------
    parameter_list: list
        Formatted List of WIND Toolkit parameter strings

    Raises
    ------
    TypeError
        If parameter is not a string or elevations is not a float, int or list.
    ValueError
        If parameter is not one of the supported options.
    """
    if not isinstance(parameter, str):
        raise TypeError(f"parameter must be a string, got {type(parameter)}")
    if not isinstance(elevations, (float, int, list)):
        raise TypeError(
            f"elevations must be a float, int, or list, got {type(elevations)}"
        )
    if parameter not in ["windspeed", "winddirection", "temperature", "pressure"]:
        raise ValueError(f"Invalid parameter: {parameter}")

    # A scalar passes the type check above, so it must not be iterated directly
    if not isinstance(elevations, list):
        elevations = [elevations]

    parameter_list = []
    for elevation in elevations:
        # Dataset names use whole-number heights ("windspeed_20m"), so a
        # whole-number float such as 20.0 must not be rendered as "20.0".
        if isinstance(elevation, float) and elevation.is_integer():
            elevation = int(elevation)
        parameter_list.append(f"{parameter}_{elevation}m")

    return parameter_list
def _resolve_wtk_region(lat_lon, preferred_region):
    """Return the single WIND Toolkit region that contains every requested point."""
    # A bare (lat, lon) pair may hold ints as well as floats
    if isinstance(lat_lon[0], (float, int)):
        return region_selection(lat_lon, preferred_region)

    # Forward preferred_region so points in the Offshore_CA / NW_Pacific
    # overlap can be resolved for multi-point requests as well.
    regions = [region_selection(loc, preferred_region) for loc in lat_lon]
    if regions.count(regions[0]) != len(lat_lon):
        raise TypeError("Coordinates must be within the same region!")
    return regions[0]


def _format_wtk_output(data, meta, to_pandas):
    """Return (data, meta), converting data to an xarray Dataset when requested."""
    if not to_pandas:
        data = convert_to_dataset(data)
        data.attrs = meta
    return data, meta


def request_wtk_point_data(
    time_interval,
    parameter,
    lat_lon,
    years,
    preferred_region="",
    tree=None,
    unscale=True,
    str_decode=True,
    hsds=True,
    clear_cache=False,
    to_pandas=True,
):
    """
    Returns data from the WIND Toolkit offshore wind hindcast hosted on
    AWS at the specified latitude and longitude point(s), or the closest
    available point(s). Visit https://registry.opendata.aws/nrel-pds-wtk/
    for more information about the dataset and available locations and years.

    Calls with multiple parameters must have the same time interval. Calls
    with multiple locations must use the same region (use the plot_region
    function).

    Note: To access the WIND Toolkit hindcast data, you will need to
    configure h5pyd for data access on HSDS. Please see the
    metocean_example or WPTO_hindcast_example notebook for more information.

    Parameters
    ----------
    time_interval : string
        Data set type of interest
        Options: '1-hour' '5-minute'
    parameter : string or list of strings
        Dataset parameter to be downloaded. Other parameters may be available.
        This list is limited to those available at both 5-minute and 1-hour
        time intervals for all regions.
        Options:
        'precipitationrate_0m', 'inversemoninobukhovlength_2m',
        'relativehumidity_2m', 'surface_sea_temperature',
        'pressure_0m', 'pressure_100m', 'pressure_200m',
        'temperature_10m', 'temperature_20m', 'temperature_40m',
        'temperature_60m', 'temperature_80m', 'temperature_100m',
        'temperature_120m', 'temperature_140m', 'temperature_160m',
        'temperature_180m', 'temperature_200m',
        'winddirection_10m', 'winddirection_20m', 'winddirection_40m',
        'winddirection_60m', 'winddirection_80m', 'winddirection_100m',
        'winddirection_120m', 'winddirection_140m', 'winddirection_160m',
        'winddirection_180m', 'winddirection_200m',
        'windspeed_10m', 'windspeed_20m', 'windspeed_40m',
        'windspeed_60m', 'windspeed_80m', 'windspeed_100m',
        'windspeed_120m', 'windspeed_140m', 'windspeed_160m',
        'windspeed_180m', 'windspeed_200m'
    lat_lon : tuple or list of tuples
        Latitude longitude pairs at which to extract data. Use plot_region() or
        region_selection() to see the corresponding region for a given location.
    years : list
        Year(s) to be accessed. The years 2000-2019 available (up to 2020
        for Mid-Atlantic). Examples: [2015] or [2004,2006,2007]
    preferred_region : string (optional)
        Region that the lat_lon belongs to ('Offshore_CA' or 'NW_Pacific').
        Required when a lat_lon point falls in both the Offshore California
        and NW Pacific regions. Overlap region defined by
        latitude = (41.213, 42.642) and longitude = (-129.090, -121.672).
        Default = ''
    tree : str | cKDTree (optional)
        cKDTree or path to .pkl file containing pre-computed tree
        of lat, lon coordinates, default = None
        NOTE(review): the type check below only admits str or None, so a
        cKDTree instance is currently rejected -- confirm which is intended.
    unscale : bool (optional)
        Boolean flag to automatically unscale variables on extraction
        Default = True
    str_decode : bool (optional)
        Boolean flag to decode the bytestring meta data into normal
        strings. Setting this to False will speed up the meta data read.
        Default = True
    hsds : bool (optional)
        Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
        behind HSDS. Setting to False will indicate to look for files on
        local machine, not AWS. Default = True
    clear_cache : bool (optional)
        Boolean flag to clear the cache related to this specific request.
        Default is False.
    to_pandas: bool (optional)
        Flag to output pandas instead of xarray. Default = True.

    Returns
    ---------
    data: DataFrame or Dataset
        Data indexed by datetime with columns named for parameter and
        corresponding metadata index. A pandas DataFrame when to_pandas is
        True, otherwise the result of convert_to_dataset with meta stored
        in its attrs.
    meta: DataFrame
        Location metadata for the requested data location

    Raises
    ------
    TypeError
        If an argument has the wrong type, if time_interval is not
        '1-hour' or '5-minute', or if the requested points do not all fall
        in the same region.
    ValueError
        If lat_lon or a parameter list is empty.
    """
    if not isinstance(parameter, (str, list)):
        raise TypeError("parameter must be of type string or list")
    if not isinstance(lat_lon, (list, tuple)):
        raise TypeError("lat_lon must be of type list or tuple")
    if not isinstance(time_interval, str):
        raise TypeError("time_interval must be a string")
    if not isinstance(years, list):
        raise TypeError("years must be a list")
    if not isinstance(preferred_region, str):
        raise TypeError("preferred_region must be a string")
    if not isinstance(tree, (str, type(None))):
        raise TypeError("tree must be a string or None")
    if not isinstance(unscale, bool):
        raise TypeError("unscale must be bool type")
    if not isinstance(str_decode, bool):
        raise TypeError("str_decode must be bool type")
    if not isinstance(hsds, bool):
        raise TypeError("hsds must be bool type")
    if not isinstance(clear_cache, bool):
        raise TypeError("clear_cache must be of type bool")
    if not isinstance(to_pandas, bool):
        raise TypeError("to_pandas must be of type bool")

    # Reject unusable requests up front, before touching the cache or network
    if time_interval not in ("1-hour", "5-minute"):
        raise TypeError(
            f"Invalid time_interval '{time_interval}', must be '1-hour' or '5-minute'"
        )
    if len(lat_lon) == 0:
        raise ValueError("lat_lon must contain at least one latitude-longitude pair")
    if isinstance(parameter, list) and len(parameter) == 0:
        raise ValueError("parameter must contain at least one parameter name")

    # Define the path to the cache directory
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "mhkit", "hindcast")

    # String representation of the request; it is the cache key, so its
    # format must not change or existing cache entries stop matching.
    hash_params = f"{time_interval}_{parameter}_{lat_lon}_{years}_{preferred_region}_{tree}_{unscale}_{str_decode}_{hsds}"

    # Use handle_caching to manage caching; non-None data and meta are
    # treated as a cache hit.
    data, meta, _ = handle_caching(hash_params, cache_dir, clear_cache_file=clear_cache)
    if data is not None and meta is not None:
        return _format_wtk_output(data, meta, to_pandas)

    region = _resolve_wtk_region(lat_lon, preferred_region)

    # The 5-minute files live in a sibling directory with a "-5min" suffix
    interval_suffix = "" if time_interval == "1-hour" else "-5min"
    wind_path = f"/nrel/wtk/{region.lower()}{interval_suffix}/{region}_*.h5"

    windKwargs = {
        "tree": tree,
        "unscale": unscale,
        "str_decode": str_decode,
        "hsds": hsds,
        "years": years,
    }

    parameter_names = [parameter] if isinstance(parameter, str) else parameter
    data_list = []

    with MultiYearWindX(wind_path, **windKwargs) as rex_wind:
        for name in parameter_names:
            temp_data = rex_wind.get_lat_lon_df(name, lat_lon)
            # Keep the original column labels: they select the meta rows below
            col = temp_data.columns[:]
            # Name columns by position so they line up with the rows of
            # meta, which is re-indexed from 0 below.
            temp_data.columns = [f"{name}_{i}" for i in range(len(col))]
            data_list.append(temp_data)

        data = data_list[0] if len(data_list) == 1 else pd.concat(data_list, axis=1)

        meta = rex_wind.meta.loc[col, :]
        meta = meta.reset_index(drop=True)

    # Save the retrieved data and metadata to cache.
    handle_caching(hash_params, cache_dir, data=data, metadata=meta)

    return _format_wtk_output(data, meta, to_pandas)