Source code for mhkit.utils.cache

"""
This module provides functionality for managing cache files to optimize
network requests and computations for handling data. The module focuses
on enabling users to read from and write to cache files, as well as 
perform cache clearing operations. Cache files are utilized to store data 
temporarily, mitigating the need to re-fetch or recompute the same data multiple 
times, which can be especially useful in network-dependent tasks.

The module consists of two main functions:

1. `handle_caching`:
   This function manages the caching of data. It provides options to read from 
   and write to cache files, depending on whether the data is already provided 
   or if it needs to be fetched from the cache. If a cache file corresponding 
   to the given parameters already exists, the function can either load data 
   from it or clear it based on the parameters passed. It also offers the ability 
   to store associated metadata along with the data and supports both JSON and 
   pickle file formats for caching. This function returns the loaded data and 
   metadata from the cache file, along with the cache file path.

2. `clear_cache`:
   This function enables the clearing of either specific sub-directories or the 
   entire cache directory, depending on the parameter passed. It removes the 
   specified directory and then recreates it to ensure future caching tasks can 
   be executed without any issues. If the specified directory does not exist, 
   the function prints an indicative message.

Module Dependencies:
--------------------
    - hashlib: For creating unique filenames based on hashed parameters.
    - json: For reading and writing JSON formatted cache files.
    - os: For performing operating system dependent tasks like directory creation.
    - re: For regular expression operations to match datetime formatted strings.
    - shutil: For performing high-level file operations like copying and removal.
    - pickle: For reading and writing pickle formatted cache files.
    - pandas: For handling data in DataFrame format.

Author: ssolson
Date: 2023-09-26
"""

from typing import Optional, Tuple, Dict, Any
import hashlib
import json
import os
import shutil
import pickle
import pandas as pd


def handle_caching(
    hash_params: str,
    cache_dir: str,
    cache_content: Optional[Dict[str, Any]] = None,
    clear_cache_file: bool = False,
) -> Tuple[Optional[pd.DataFrame], Optional[Dict[str, Any]], str]:
    """
    Handles caching of data to avoid redundant network requests or
    computations.

    The cache file name is the MD5 hex digest of `hash_params`, placed in
    `cache_dir`. The steps are performed in this order:

    1. `cache_dir` is created if it does not exist.
    2. If `clear_cache_file` is `True` and the cache file exists, it is
       deleted and a message is printed.
    3. If the cache file exists and no data was supplied in
       `cache_content`, the cached data and metadata are loaded and
       returned.
    4. If data was supplied in `cache_content`, it is written to the cache
       file (and optionally copied to `cache_content["write_json"]`) and
       returned unchanged.
    5. Otherwise `(None, None, cache_filepath)` is returned.

    Parameters
    ----------
    hash_params : str
        Parameters to generate the cache file hash.
    cache_dir : str
        Directory where cache files are stored. If this path contains
        "cdip", "hindcast" or "ndbc" anywhere in it, the cache file is a
        pickle (".pkl"); otherwise it is JSON (".json").
    cache_content : Optional[Dict[str, Any]], optional
        Content to be cached. Recognised keys are 'data', 'metadata', and
        'write_json'. Missing keys are treated as `None`. When
        'write_json' is truthy, the cache file is copied to that path
        after it has been written.
    clear_cache_file : bool
        Whether to clear the existing cache.

    Returns
    -------
    Tuple[Optional[pd.DataFrame], Optional[Dict[str, Any]], str]
        Cached data, metadata, and cache file path.

    Notes
    -----
    The JSON loader always converts the stored index with
    `pd.to_datetime`, so data written with a non-datetime index is not
    restored with its original index type.
    """
    # Read the optional payload through .get() so that a partial dict
    # (e.g. only {"data": ...}) or an empty dict does not raise KeyError.
    content = cache_content or {}
    new_data = content.get("data")
    new_metadata = content.get("metadata")
    json_copy_path = content.get("write_json")

    def _generate_cache_filepath():
        """Generates the cache file path based on the hashed parameters."""
        # Substring match against the whole directory path, not just its
        # last component.
        uses_pickle = any(
            tag in cache_dir for tag in ("cdip", "hindcast", "ndbc")
        )
        file_extension = ".pkl" if uses_pickle else ".json"
        # MD5 is used only to derive a stable file name from the parameters.
        digest = hashlib.md5(hash_params.encode("utf-8")).hexdigest()
        return os.path.join(cache_dir, digest + file_extension), file_extension

    def _clear_cache(cache_filepath):
        """Clear the cache file if requested."""
        if clear_cache_file and os.path.isfile(cache_filepath):
            os.remove(cache_filepath)
            print(f"Cleared cache for {cache_filepath}")

    def _load_cache(file_extension, cache_filepath):
        """Load data from the cache file based on its extension."""
        loaded_data = None
        loaded_metadata = None
        if file_extension == ".json":
            with open(cache_filepath, encoding="utf-8") as f:
                json_data = json.load(f)
            loaded_metadata = json_data.pop("metadata", None)
            loaded_data = pd.DataFrame(
                json_data["data"],
                index=pd.to_datetime(json_data["index"]),
                columns=json_data["columns"],
            )
        elif file_extension == ".pkl":
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # cache files from a trusted cache directory should be loaded.
            with open(cache_filepath, "rb") as f:
                loaded_data, loaded_metadata = pickle.load(f)
        return loaded_data, loaded_metadata

    def _write_cache(data, metadata, file_extension, cache_filepath):
        """Store data in the cache file based on the extension."""
        if file_extension == ".json":
            py_data = data.to_dict(orient="split")
            py_data["metadata"] = metadata
            if isinstance(data.index, pd.DatetimeIndex):
                # Timestamps are not JSON serialisable; store them as text.
                py_data["index"] = [
                    dt.strftime("%Y-%m-%d %H:%M:%S") for dt in py_data["index"]
                ]
            else:
                py_data["index"] = list(data.index)
            with open(cache_filepath, "w", encoding="utf-8") as f:
                json.dump(py_data, f)
        elif file_extension == ".pkl":
            with open(cache_filepath, "wb") as f:
                pickle.dump((data, metadata), f)

    # Create the cache directory if it doesn't exist. exist_ok avoids a
    # failure when another process creates it between a check and the call.
    os.makedirs(cache_dir, exist_ok=True)

    # Generate cache filepath and extension
    cache_filepath, file_extension = _generate_cache_filepath()

    # Clear cache if requested
    _clear_cache(cache_filepath)

    # If cache file exists and no new data was supplied, load from cache
    if os.path.isfile(cache_filepath) and new_data is None:
        return _load_cache(file_extension, cache_filepath) + (cache_filepath,)

    # Store data in cache if provided
    if new_data is not None:
        _write_cache(new_data, new_metadata, file_extension, cache_filepath)
        if json_copy_path:
            shutil.copy(cache_filepath, json_copy_path)
        return new_data, new_metadata, cache_filepath

    return None, None, cache_filepath
def clear_cache(specific_dir: Optional[str] = None) -> None:
    """
    Clears the cache.

    The cache root is `~/.cache/mhkit`. If `specific_dir` is one of the
    known short names ("river", "tidal", "wave", "usgs", "noaa", "ndbc",
    "cdip", "hindcast") it is mapped to the corresponding sub-directory;
    any other value is used as a path relative to the cache root. If the
    resulting directory exists it is removed and recreated empty. If it
    does not exist, a message saying so is printed.

    Parameters
    ----------
    specific_dir : str or None, optional
        Specific sub-directory to clear. If None, the entire cache is
        cleared. Default is None.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `specific_dir` resolves to a location outside the cache root
        (for example an absolute path or one containing ".."). Nothing is
        deleted in that case.
    """
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "mhkit")

    # Consider generating this from a system folder search
    folders = {
        "river": "river",
        "tidal": "tidal",
        "wave": "wave",
        "usgs": os.path.join("river", "usgs"),
        "noaa": os.path.join("tidal", "noaa"),
        "ndbc": os.path.join("wave", "ndbc"),
        "cdip": os.path.join("wave", "cdip"),
        "hindcast": os.path.join("wave", "hindcast"),
    }

    # Construct the path to the directory to be cleared. A known short
    # name is translated; anything else is taken as given.
    if specific_dir:
        relative_dir = folders.get(specific_dir, specific_dir)
        path_to_clear = os.path.join(cache_dir, relative_dir)
    else:
        path_to_clear = cache_dir

    # os.path.join discards cache_dir when given an absolute path, and ".."
    # segments can climb out of it. Refuse to delete anything that does not
    # resolve to the cache root or a location beneath it.
    cache_root = os.path.normcase(os.path.realpath(cache_dir))
    resolved_target = os.path.normcase(os.path.realpath(path_to_clear))
    inside_cache = resolved_target == cache_root or resolved_target.startswith(
        cache_root + os.sep
    )
    if not inside_cache:
        raise ValueError(
            f"Refusing to clear {path_to_clear}: it is outside the cache "
            f"directory {cache_dir}."
        )

    # Check if the directory exists
    if os.path.exists(path_to_clear):
        # Clear the directory
        shutil.rmtree(path_to_clear)
        # Recreate the directory after deletion
        os.makedirs(path_to_clear, exist_ok=True)
    else:
        print(f"The directory {path_to_clear} does not exist.")