"""
Remote meteorological data sources for NOAA ARL archives.
Each source class encodes the filename convention, S3 path layout, and
approximate spatial extent for one ARL-formatted met product hosted on
the NOAA ARL public archives.
Storage backends
----------------
"s3" : AWS S3 (noaa-oar-arl-hysplit-pds, anonymous) — recommended
"ftp" : NOAA ARL FTP (ftp.arl.noaa.gov, anonymous, 2-connection limit)
"http" : NOAA READY web (www.ready.noaa.gov/data/archives)
Example
-------
>>> from arlmet.sources import HRRRSource
>>> source = HRRRSource()
>>> files = source.fetch("2024-07-18", "2024-07-19", local_dir="./met/")
>>> # Crop to domain on download (recommended due to large file sizes)
>>> files = source.fetch(
... "2024-07-18",
... "2024-07-19",
... local_dir="./met/",
... bbox=(-114.0, 39.0, -110.0, 42.0),
... )
Requires ``fsspec`` (and ``s3fs`` for the S3 backend).
Install with: ``pip install arlmet[sources]``
"""
from __future__ import annotations
import logging
import shutil
import tempfile
from abc import ABC, abstractmethod
from pathlib import Path
from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar, cast
if TYPE_CHECKING:
from typing_extensions import override
else:
def override(f: object) -> object:
return f
import pandas as pd
from arlmet._time import ensure_timestamp
logger = logging.getLogger(__name__)
_MONTH_CODES: tuple[str, ...] = (
"jan",
"feb",
"mar",
"apr",
"may",
"jun",
"jul",
"aug",
"sep",
"oct",
"nov",
"dec",
)
[docs]
class MeteorologySource(ABC):
"""
Abstract base class for NOAA ARL meteorological archive sources.
Subclasses set class-level metadata and implement ``_s3_key()`` to
encode the filename convention for their product.
Attributes
----------
name : str
Short source identifier used by callers.
description : str
Human-readable product description.
start_date : pandas.Timestamp
Earliest archive date supported by the source.
Methods
-------
keys_for_range(start, end)
Return archive keys covering the requested inclusive time range.
fetch(start, end, ...)
Download or crop local ARL files for the requested time range.
"""
# Subclasses must define these
name: ClassVar[str]
description: ClassVar[str]
#: Earliest date available in the NOAA archive.
start_date: ClassVar[pd.Timestamp]
S3_BUCKET: ClassVar[str] = "noaa-oar-arl-hysplit-pds"
FTP_HOST: ClassVar[str] = "ftp.arl.noaa.gov"
HTTP_BASE: ClassVar[str] = "https://www.ready.noaa.gov/data/archives"
# ------------------------------------------------------------------
# Subclass interface
# ------------------------------------------------------------------
@abstractmethod
def _s3_key(self, time: pd.Timestamp) -> str:
"""S3 key (no leading slash) for the ARL file containing *time*."""
# ------------------------------------------------------------------
# Public interface
# ------------------------------------------------------------------
[docs]
def keys_for_range(
self,
start: pd.Timestamp | str,
end: pd.Timestamp | str,
) -> list[str]:
"""
Return deduplicated, sorted S3 keys covering ``[start, end]``.
Handles backward trajectories (``start > end``) by normalizing to
chronological order before scanning.
Parameters
----------
start, end : pandas.Timestamp or str
Inclusive time range to cover.
Returns
-------
list[str]
Unique archive keys in chronological order.
"""
t0 = ensure_timestamp(start, floor="h")
t1 = ensure_timestamp(end, floor="h")
if t0 > t1:
t0, t1 = t1, t0
seen: set[str] = set()
keys: list[str] = []
t = t0
while t <= t1:
key = self._s3_key(t)
if key not in seen:
seen.add(key)
keys.append(key)
t = ensure_timestamp(t + pd.Timedelta(hours=1))
return keys
[docs]
def fetch(
self,
start: pd.Timestamp | str,
end: pd.Timestamp | str,
*,
local_dir: Path | str,
backend: str = "s3",
bbox: tuple[float, float, float, float] | None = None,
overwrite: bool = False,
) -> list[Path]:
"""
Download ARL files covering ``[start, end]`` to *local_dir*.
Parameters
----------
start, end :
Time range (inclusive). Backward trajectories (start > end)
are handled automatically.
local_dir :
Directory to save downloaded files. Created if absent.
backend :
Storage backend — ``"s3"`` (default), ``"ftp"``, or ``"http"``.
bbox :
``(west, south, east, north)`` in degrees. When provided, each
file is cropped with :func:`arlmet.extract_subset` before
caching. Strongly recommended for global products (GFS, GDAS).
overwrite :
Re-download even if a matching local file already exists.
Returns
-------
list[Path]
Local paths to the downloaded (and optionally cropped) files,
in chronological order.
Raises
------
ImportError
If ``fsspec`` is not installed.
Examples
--------
>>> from arlmet.sources import HRRRSource
>>> source = HRRRSource()
>>> source.fetch("2024-07-18", "2024-07-19", local_dir="./met")
"""
try:
import fsspec # noqa: F401
except ImportError:
raise ImportError(
"fsspec is required for MeteorologySource.fetch(). "
"Install with: pip install arlmet[sources]"
) from None
local_dir = Path(local_dir)
local_dir.mkdir(parents=True, exist_ok=True)
results: list[Path] = []
for key in self.keys_for_range(start, end):
filename = Path(key).name
dest = self._dest_path(local_dir, filename, bbox)
if not overwrite and dest.exists():
logger.debug("Using cached %s", dest.name)
results.append(dest)
continue
url = self._url(key, backend)
opts = self._storage_options(backend)
logger.info("Fetching %s → %s", url, dest.name)
if bbox is not None:
self._fetch_and_crop(url, dest, bbox, opts)
else:
self._download(url, dest, opts)
results.append(dest)
return results
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _dest_path(
self,
local_dir: Path,
filename: str,
bbox: tuple[float, float, float, float] | None,
) -> Path:
"""Return the local cache path for a downloaded file and optional crop."""
if bbox is None:
return local_dir / filename
w, s, e, n = bbox
tag = f".crop_{w:.2f}_{s:.2f}_{e:.2f}_{n:.2f}"
return local_dir / f"{filename}{tag}"
def _url(self, key: str, backend: str) -> str:
"""Return the fully qualified remote URL for an archive key."""
if backend == "s3":
return f"s3://{self.S3_BUCKET}/{key}"
if backend == "ftp":
# FTP path mirrors S3 key structure under /archives/
return f"ftp://anonymous@{self.FTP_HOST}/archives/{key}"
if backend == "http":
return f"{self.HTTP_BASE}/{key}"
raise ValueError(f"Unknown backend {backend!r}. Choose 's3', 'ftp', or 'http'.")
def _storage_options(self, backend: str) -> dict[str, Any]:
"""Return fsspec storage options for the selected backend."""
if backend == "s3":
return {"anon": True}
return {}
def _download(self, url: str, dest: Path, opts: dict[str, Any]) -> None:
"""Download one ARL file to a temporary path and atomically move it into place."""
import fsspec
tmp = dest.with_suffix(dest.suffix + ".tmp")
try:
with fsspec.open(url, "rb", **opts) as src, open(tmp, "wb") as dst:
# fsspec.open() stubs return IO[Any]; "rb"/"wb" mode guarantees BinaryIO.
shutil.copyfileobj(
cast(BinaryIO, src),
cast(BinaryIO, dst),
length=8 * 1024 * 1024,
)
tmp.rename(dest)
except Exception:
tmp.unlink(missing_ok=True)
raise
def _fetch_and_crop(
self,
url: str,
dest: Path,
bbox: tuple[float, float, float, float],
opts: dict[str, Any],
) -> None:
"""Download one ARL file, crop it to *bbox*, and write the cropped copy."""
from arlmet.subset import extract_subset
with tempfile.NamedTemporaryFile(suffix=".arl", delete=False) as f:
tmp = Path(f.name)
try:
self._download(url, tmp, opts)
extract_subset(tmp, dest, bbox=bbox)
finally:
tmp.unlink(missing_ok=True)
@override
def __repr__(self) -> str:
return f"{type(self).__name__}()"
# ---------------------------------------------------------------------------
# Concrete source implementations
# ---------------------------------------------------------------------------
[docs]
class HRRRSource(MeteorologySource):
"""
HRRR 3 km analysis (CONUS, June 2019–present).
Files cover 6-hour UTC blocks (00–05, 06–11, 12–17, 18–23),
approximately 3.2 GB each.
S3: ``s3://noaa-oar-arl-hysplit-pds/hrrr/{year}/{month:02d}/{YYYYMMDD}_{HH}-{HH}_hrrr``
Note
----
The earliest June 2019 files may exist at the bucket root rather than
under the year subdirectory. This edge case is not currently handled.
"""
name = "hrrr"
description = "HRRR 3 km analysis"
start_date = ensure_timestamp("2019-06-12")
_HOURS_PER_FILE: ClassVar[int] = 6
def _filename(self, time: pd.Timestamp) -> str:
"""Return the HRRR archive filename covering *time*."""
start_h = (time.hour // self._HOURS_PER_FILE) * self._HOURS_PER_FILE
end_h = start_h + self._HOURS_PER_FILE - 1
return f"{time.strftime('%Y%m%d')}_{start_h:02d}-{end_h:02d}_hrrr"
@override
def _s3_key(self, time: pd.Timestamp) -> str:
"""Return the NOAA ARL S3 object key for the HRRR file covering *time*."""
return f"hrrr/{time.year}/{time.month:02d}/{self._filename(time)}"
[docs]
class NAMSource(MeteorologySource):
"""
NAM 12 km analysis (North America, May 2007–present).
One file per calendar day.
S3: ``s3://noaa-oar-arl-hysplit-pds/nam12/{year}/{month:02d}/{YYYYMMDD}_nam12``
"""
name = "nam12"
description = "NAM 12 km analysis"
start_date = ensure_timestamp("2007-05-01")
def _filename(self, time: pd.Timestamp) -> str:
"""Return the daily NAM archive filename for *time*."""
return f"{time.strftime('%Y%m%d')}_nam12"
@override
def _s3_key(self, time: pd.Timestamp) -> str:
"""Return the NOAA ARL S3 object key for the NAM file covering *time*."""
return f"nam12/{time.year}/{time.month:02d}/{self._filename(time)}"
[docs]
class GDASSource(MeteorologySource):
"""
GDAS 1-degree global analysis (December 2004–present).
Weekly files (~571 MB each). Week boundaries are fixed per month:
w1 = days 1–7, w2 = days 8–14, w3 = days 15–21,
w4 = days 22–28, w5 = days 29–end.
S3: ``s3://noaa-oar-arl-hysplit-pds/gdas1/{year}/gdas1.{mon}{YY}.w{N}``
"""
name = "gdas1"
description = "GDAS 1-degree global analysis"
start_date = ensure_timestamp("2004-12-01")
def _week(self, time: pd.Timestamp) -> int:
"""Return the 1-based archive week within the month for *time*."""
return (time.day - 1) // 7 + 1
def _filename(self, time: pd.Timestamp) -> str:
"""Return the weekly GDAS archive filename for *time*."""
month = _MONTH_CODES[time.month - 1]
year_2d = time.strftime("%y")
return f"gdas1.{month}{year_2d}.w{self._week(time)}"
@override
def _s3_key(self, time: pd.Timestamp) -> str:
"""Return the NOAA ARL S3 object key for the GDAS file covering *time*."""
return f"gdas1/{time.year}/{self._filename(time)}"
[docs]
class GFSSource(MeteorologySource):
"""
GFS 0.25-degree global analysis (June 2019–present).
One file per calendar day, approximately 2.7 GB each.
Cropping with ``bbox=`` on fetch is strongly recommended.
S3: ``s3://noaa-oar-arl-hysplit-pds/gfs0p25/{year}/{month:02d}/{YYYYMMDD}_gfs0p25``
"""
name = "gfs0p25"
description = "GFS 0.25-degree global analysis"
start_date = ensure_timestamp("2019-06-01")
def _filename(self, time: pd.Timestamp) -> str:
"""Return the daily GFS archive filename for *time*."""
return f"{time.strftime('%Y%m%d')}_gfs0p25"
@override
def _s3_key(self, time: pd.Timestamp) -> str:
"""Return the NOAA ARL S3 object key for the GFS file covering *time*."""
return f"gfs0p25/{time.year}/{time.month:02d}/{self._filename(time)}"
[docs]
class NAMSSource(MeteorologySource):
"""
NAMS hybrid sigma-pressure analysis (CONUS/Alaska/Hawaii, 2010–present).
One file per calendar day. Uses hybrid sigma-pressure vertical coordinates
(flag=4), making it suitable for high-accuracy boundary-layer transport.
Parameters
----------
domain : {"conus", "ak", "hi"}
Regional domain — CONUS (default), Alaska, or Hawaii.
S3: ``s3://noaa-oar-arl-hysplit-pds/nams/{year}/{month:02d}/{YYYYMMDD}_hysplit.t00z.namsa[.AK|.HI]``
"""
name = "nams"
description = "NAMS hybrid sigma-pressure analysis"
start_date = ensure_timestamp("2010-01-01")
_DOMAIN_SUFFIXES: ClassVar[dict[str, str]] = {
"conus": "",
"ak": ".AK",
"hi": ".HI",
}
def __init__(self, domain: str = "conus") -> None:
if domain not in self._DOMAIN_SUFFIXES:
raise ValueError(
f"domain must be one of {list(self._DOMAIN_SUFFIXES)!r}, got {domain!r}"
)
self.domain = domain
def _filename(self, time: pd.Timestamp) -> str:
"""Return the daily NAMS archive filename for *time* and the selected domain."""
suffix = self._DOMAIN_SUFFIXES[self.domain]
return f"{time.strftime('%Y%m%d')}_hysplit.t00z.namsa{suffix}"
@override
def _s3_key(self, time: pd.Timestamp) -> str:
"""Return the NOAA ARL S3 object key for the NAMS file covering *time*."""
return f"nams/{time.year}/{time.month:02d}/{self._filename(time)}"
@override
def __repr__(self) -> str:
return f"NAMSSource(domain={self.domain!r})"
[docs]
class ReanalysisSource(MeteorologySource):
"""
NCEP/NCAR Reanalysis 2.5-degree global (1948–present).
Monthly files (~500 MB each). Covers the full globe at 2.5-degree
resolution. Useful for long climatological back-trajectory studies.
Cropping with ``bbox=`` on fetch is strongly recommended.
S3: ``s3://noaa-oar-arl-hysplit-pds/reanalysis/{year}/RP{YYYYMM}.gbl``
"""
name = "reanalysis"
description = "NCEP/NCAR Reanalysis 2.5-degree global"
start_date = ensure_timestamp("1948-01-01")
def _filename(self, time: pd.Timestamp) -> str:
"""Return the monthly reanalysis archive filename for *time*."""
return f"RP{time.strftime('%Y%m')}.gbl"
@override
def _s3_key(self, time: pd.Timestamp) -> str:
"""Return the NOAA ARL S3 object key for the reanalysis file covering *time*."""
return f"reanalysis/{time.year}/{self._filename(time)}"
[docs]
class HRRRv1Source(MeteorologySource):
"""
HRRR 3 km analysis, version 1 (CONUS, June 2015–2019).
Files cover 6-hour UTC blocks (00z, 06z, 12z, 18z).
Superseded by :class:`HRRRSource` from June 2019 onward.
S3: ``s3://noaa-oar-arl-hysplit-pds/hrrr.v1/{year}/{month:02d}/hysplit.{YYYYMMDD}.{HH}z.hrrra``
"""
name = "hrrr.v1"
description = "HRRR 3 km analysis v1"
start_date = ensure_timestamp("2015-06-01")
_HOURS_PER_FILE: ClassVar[int] = 6
def _filename(self, time: pd.Timestamp) -> str:
"""Return the legacy HRRR v1 archive filename covering *time*."""
start_h = (time.hour // self._HOURS_PER_FILE) * self._HOURS_PER_FILE
return f"hysplit.{time.strftime('%Y%m%d')}.{start_h:02d}z.hrrra"
@override
def _s3_key(self, time: pd.Timestamp) -> str:
"""Return the NOAA ARL S3 object key for the HRRR v1 file covering *time*."""
return f"hrrr.v1/{time.year}/{time.month:02d}/{self._filename(time)}"
[docs]
class GDAS0p5Source(MeteorologySource):
"""
GDAS 0.5-degree global analysis (September 2007–mid 2019).
One file per calendar day. Higher resolution than :class:`GDASSource`
(1-degree). Cropping with ``bbox=`` on fetch is strongly recommended.
S3: ``s3://noaa-oar-arl-hysplit-pds/gdas0p5/{year}/{month:02d}/{YYYYMMDD}_gdas0p5``
"""
name = "gdas0p5"
description = "GDAS 0.5-degree global analysis"
start_date = ensure_timestamp("2007-09-01")
def _filename(self, time: pd.Timestamp) -> str:
"""Return the daily GDAS 0.5-degree archive filename for *time*."""
return f"{time.strftime('%Y%m%d')}_gdas0p5"
@override
def _s3_key(self, time: pd.Timestamp) -> str:
"""Return the NOAA ARL S3 object key for the GDAS 0.5-degree file covering *time*."""
return f"gdas0p5/{time.year}/{time.month:02d}/{self._filename(time)}"
[docs]
class NARRSource(MeteorologySource):
"""
NCEP North American Regional Reanalysis (January 1979–2019).
Monthly files at 32 km resolution over North America. Useful for
long climatological back-trajectory studies over the continent.
No file extension.
S3: ``s3://noaa-oar-arl-hysplit-pds/narr/{year}/NARR{YYYYMM}``
"""
name = "narr"
description = "NCEP North American Regional Reanalysis 32 km"
start_date = ensure_timestamp("1979-01-01")
def _filename(self, time: pd.Timestamp) -> str:
"""Return the monthly NARR archive filename for *time*."""
return f"NARR{time.strftime('%Y%m')}"
@override
def _s3_key(self, time: pd.Timestamp) -> str:
"""Return the NOAA ARL S3 object key for the NARR file covering *time*."""
return f"narr/{time.year}/{self._filename(time)}"
__all__ = [
"MeteorologySource",
"HRRRSource",
"HRRRv1Source",
"NAMSource",
"NAMSSource",
"GDASSource",
"GDAS0p5Source",
"GFSSource",
"NARRSource",
"ReanalysisSource",
]