"""Source code for mhkit.river.io.usgs"""

import os
import json
import requests
import shutil
import pandas as pd
from mhkit.utils.cache import handle_caching


def _read_usgs_json(text, to_pandas=True):
    # Merge every time series in the USGS JSON response into a single
    # DataFrame: one column per variable description, indexed by UTC datetime.
    data = pd.DataFrame()
    for timeseries in text["value"]["timeSeries"]:
        try:
            site_name = timeseries["variable"]["variableDescription"]
            site_data = pd.DataFrame(timeseries["values"][0]["value"])
            site_data.set_index("dateTime", drop=True, inplace=True)
            site_data.index = pd.to_datetime(site_data.index, utc=True)
            site_data.rename(columns={"value": site_name}, inplace=True)
            site_data[site_name] = pd.to_numeric(site_data[site_name])
            site_data.index.name = None
            del site_data["qualifiers"]
            data = data.combine_first(site_data)
        except (KeyError, IndexError, ValueError):
            # Skip time series that are empty or missing the expected fields
            pass

    if not to_pandas:
        data = data.to_xarray()

    return data
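

# Illustrative sketch of the JSON layout that _read_usgs_json parses. The
# keys below mirror the response schema accessed above; the variable
# description, dates, and values are made up for illustration.
#
#   example_json = {
#       "value": {
#           "timeSeries": [
#               {
#                   "variable": {
#                       "variableDescription": "Discharge, cubic feet per second"
#                   },
#                   "values": [
#                       {
#                           "value": [
#                               {
#                                   "dateTime": "2018-01-01T00:00:00.000-05:00",
#                                   "value": "1230",
#                                   "qualifiers": ["A"],
#                               },
#                           ]
#                       }
#                   ],
#               }
#           ]
#       }
#   }
#   data = _read_usgs_json(example_json)  # one column, UTC datetime index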


def read_usgs_file(file_name, to_pandas=True):
    """
    Reads a USGS JSON data file (from https://waterdata.usgs.gov/nwis)

    Parameters
    ----------
    file_name : str
        Name of USGS JSON data file
    to_pandas : bool (optional)
        Flag to output pandas instead of xarray. Default = True.

    Returns
    -------
    data : pandas DataFrame or xarray Dataset
        Data indexed by datetime with columns named according to the
        parameter's variable description
    """
    if not isinstance(to_pandas, bool):
        raise TypeError(f"to_pandas must be of type bool. Got: {type(to_pandas)}")

    with open(file_name) as json_file:
        text = json.load(json_file)

    data = _read_usgs_json(text, to_pandas)

    return data
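

# Usage sketch for read_usgs_file. The file name is hypothetical; any JSON
# file saved from https://waterdata.usgs.gov/nwis (or written via the
# write_json argument of request_usgs_data below) should work.
#
#   data = read_usgs_file("USGS_08313000_discharge.json")
#   print(data.head())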
def request_usgs_data(
    station,
    parameter,
    start_date,
    end_date,
    data_type="Daily",
    proxy=None,
    write_json=None,
    clear_cache=False,
    to_pandas=True,
):
    """
    Loads USGS data directly from https://waterdata.usgs.gov/nwis using a
    GET request. The request URL prints to the screen.

    Parameters
    ----------
    station : str
        USGS station number (e.g. '08313000')
    parameter : str
        USGS parameter ID (e.g. '00060' for Discharge, cubic feet per second)
    start_date : str
        Start date in the format 'YYYY-MM-DD' (e.g. '2018-01-01')
    end_date : str
        End date in the format 'YYYY-MM-DD' (e.g. '2018-12-31')
    data_type : str
        Data type, options include 'Daily' (return the mean daily value) and
        'Instantaneous'.
    proxy : dict or None
        To request data from behind a firewall, define a dictionary of proxy
        settings, for example {"http": 'localhost:8080'}
    write_json : str or None
        Name of json file to write data
    clear_cache : bool
        If True, the cache for this specific request will be cleared.
    to_pandas : bool (optional)
        Flag to output pandas instead of xarray. Default = True.

    Returns
    -------
    data : pandas DataFrame or xarray Dataset
        Data indexed by datetime with columns named according to the
        parameter's variable description
    """
    if data_type not in ["Daily", "Instantaneous"]:
        raise ValueError(f"data_type must be Daily or Instantaneous. Got: {data_type}")
    if not isinstance(to_pandas, bool):
        raise TypeError(f"to_pandas must be of type bool. Got: {type(to_pandas)}")

    # Define the path to the cache directory
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "mhkit", "usgs")

    # Create a unique filename based on the function parameters
    hash_params = f"{station}_{parameter}_{start_date}_{end_date}_{data_type}"

    # Use handle_caching to manage the cache
    cached_data, metadata, cache_filepath = handle_caching(
        hash_params,
        cache_dir,
        cache_content={"data": None, "metadata": None, "write_json": write_json},
        clear_cache_file=clear_cache,
    )

    if cached_data is not None:
        return cached_data

    # If no cached data, proceed with the API request
    if data_type == "Daily":
        data_url = "https://waterservices.usgs.gov/nwis/dv"
        api_query = (
            "/?format=json&sites="
            + station
            + "&startDT="
            + start_date
            + "&endDT="
            + end_date
            + "&statCd=00003"
            + "&parameterCd="
            + parameter
            + "&siteStatus=all"
        )
    else:
        data_url = "https://waterservices.usgs.gov/nwis/iv"
        api_query = (
            "/?format=json&sites="
            + station
            + "&startDT="
            + start_date
            + "&endDT="
            + end_date
            + "&parameterCd="
            + parameter
            + "&siteStatus=all"
        )

    print("Data request URL: ", data_url + api_query)

    response = requests.get(url=data_url + api_query, proxies=proxy)
    text = json.loads(response.text)

    # handle_caching is only set up for pandas, so force this data to output
    # as pandas for now
    data = _read_usgs_json(text, True)

    # After making the API request and processing the response, write the
    # response to a cache file
    handle_caching(
        hash_params,
        cache_dir,
        cache_content={"data": data, "metadata": None, "write_json": None},
        clear_cache_file=clear_cache,
    )

    if write_json:
        shutil.copy(cache_filepath, write_json)

    if not to_pandas:
        data = data.to_xarray()

    return data
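

# Usage sketch for request_usgs_data, using the example station and parameter
# from the docstring above. Note this performs a live GET request against the
# USGS water services API, so it requires network access.
#
#   data = request_usgs_data(
#       station="08313000",
#       parameter="00060",
#       start_date="2018-01-01",
#       end_date="2018-12-31",
#       data_type="Daily",
#   )
#   print(data.head())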