#!/usr/bin/env python

r"""
Global Historical Climatology Network (GHCN) hourly data access script.

This script is designed to work with GHCNh (Global Historical Climatology Network hourly) data.

Station ID Generation:
1. Use the station's 5-digit WBAN, 5-digit WMO, or 6-digit USAF code to generate the GHCN ID
   a. GHCN ID for sites with WBAN codes: <COUNTRY CODE>W000<WBAN>
      Example: Naples, Italy with WBAN = 34113 → ITW00034113
   b. GHCN ID for sites without WBAN codes but with WMO or USAF codes: <COUNTRY CODE>0000<WMO>
      Example: Cagliari, Italy with WMO 16560 → IT000016560

2. Convert NOAA NCEI weather data from GHCNh PSV to CSV

Example usage (in the Windows command prompt, use the caret character ^ for line continuation):

python z_GHCN_Retrieval_PST_to_CSV.py ^
  --station-id USW00014771 ^
  --start-year 2024 ^
  --end-year 2025 ^
  --out-dir "C:\iTree\WeatherPrep\Outputs\Syracuse_NY" ^
  --combine

python z_GHCN_Retrieval_PST_to_CSV.py --station-id USW00014771 --start-year 2024 --end-year 2025 --out-dir "C:\iTree\WeatherPrep\Outputs\Syracuse_NY" --combine
python z_GHCN_Retrieval_PST_to_CSV.py --station-id ITW00034113 --start-year 2024 --end-year 2024 --out-dir "C:\iTree\WeatherPrep\Outputs\Naples_IT"

Simple GHCN-Hourly downloader + converter for WeatherPrep.

Features:
- Build GHCN-H station ID from country code + WBAN/WMO/USAF, OR accept a full GHCN ID
- Download yearly GHCNh .psv files from NCEI via HTTPS
- Convert .psv to WeatherPrep-style CSV with columns:

    YR--MODAHRMN
    temperature_C
    dew_point_temperature_C
    station_level_pressure_hPa
    sea_level_pressure_hPa
    altimeter_hPa
    wind_direction_deg
    wind_speed_mps
    wind_gust_mps
    precipitation_mm
    precipitation_6_hour_mm
    precipitation_24_hour_mm
    sky_cover_baseht_1_m
    sky_cover_1_code
    sky_cover_baseht_2_m
    sky_cover_2_code
    sky_cover_baseht_3_m
    sky_cover_3_code

- Optionally combine multiple years into a single CSV:
    <station_id>-<start_year>-<end_year>.csv

You can then reference that CSV in WeatherPrepConfig.xml as SurfaceWeatherDataFile.
"""
import os
import argparse
import requests
import pandas as pd


# Root of the NCEI GHCN-Hourly "access" tree; download_ghcnh_psv() appends
# the by-year/<year>/psv/<file> path to it.
BASE_URL = (
    "https://www.ncei.noaa.gov/oa/"
    "global-historical-climatology-network/hourly/access"
)


def build_station_id(country_code, wban=None, wmo=None, usaf=None):
    """
    Build a GHCN-Hourly station ID from a country code and a station code.

    The first usable identifier wins, in this order:

    - WBAN (anything except the 99999 placeholder):
        <CC>W000<WBAN>
      e.g., Italy + WBAN 34113 -> ITW00034113

    - WMO:
        <CC>0000<WMO>
      e.g., Italy + WMO 16560 -> IT000016560

    - USAF (6-digit, often WMO*10); its first 5 digits stand in for the WMO:
        <CC>0000<USAF[:5]>

    country_code: two-letter ISO-ish code (e.g., 'IT', 'US'); case and
        surrounding whitespace are ignored.
    wban, wmo, usaf: str or int. Surrounding whitespace is ignored and a
        blank string is treated the same as None. Short codes are
        left-padded with zeros (WBAN/WMO to 5 digits, USAF to 6).

    Returns the station ID string.
    Raises ValueError if no usable WBAN, WMO, or USAF is supplied.
    """

    def clean(code):
        # Normalise to stripped text; None and blank input both mean "absent"
        # so that e.g. an empty command-line value cannot yield "...00000".
        if code is None:
            return None
        text = str(code).strip()
        return text or None

    cc = country_code.strip().upper()

    wban_str = clean(wban)
    if wban_str is not None:
        wban_str = wban_str.zfill(5)
        # 99999 marks "no WBAN assigned": fall through to WMO/USAF.
        if wban_str != "99999":
            return f"{cc}W000{wban_str}"

    wmo_str = clean(wmo)
    if wmo_str is not None:
        return f"{cc}0000{wmo_str.zfill(5)}"

    usaf_str = clean(usaf)
    if usaf_str is not None:
        # Pad to 6 digits first so the 5-digit prefix lines up with the WMO.
        return f"{cc}0000{usaf_str.zfill(6)[:5]}"

    raise ValueError(
        "Must provide at least WBAN, WMO, or USAF if station_id is not given."
    )


def download_ghcnh_psv(station_id, year, out_dir="."):
    """
    Fetch one station-year of GHCN-Hourly data and store it in out_dir.

    The remote file GHCNh_{station_id}_{year}.psv is requested from
    BASE_URL/by-year/{year}/psv/ and written under the same file name
    inside out_dir (the directory is not created here).

    Returns the path of the saved .psv file.
    An HTTP error response is turned into an exception by
    raise_for_status() before anything is written.
    """
    file_name = f"GHCNh_{station_id}_{year}.psv"
    remote = f"{BASE_URL}/by-year/{year}/psv/{file_name}"
    destination = os.path.join(out_dir, file_name)

    print(f"  Downloading {remote}")
    response = requests.get(remote, timeout=60)
    response.raise_for_status()

    # Binary mode: the payload is stored exactly as received.
    with open(destination, "wb") as handle:
        handle.write(response.content)

    print(f"  Saved {destination}")
    return destination


def psv_to_weatherprep_csv(station_id, year, in_dir=".", out_dir="."):
    """
    Turn one yearly GHCN-Hourly PSV file into a WeatherPrep-style CSV.

    Reads  GHCNh_{station_id}_{year}.psv from in_dir
    Writes {station_id}-{year}.csv       into out_dir

    The output starts with a YR--MODAHRMN timestamp column (YYYYMMDDHHMM)
    followed by a fixed set of renamed weather columns. Values are copied
    unchanged (no unit conversion); a source column that is absent from the
    PSV becomes an empty column in the CSV.

    Returns the path of the CSV that was written.
    Raises FileNotFoundError if the PSV is missing, and ValueError if none
    of the known datetime column names is present.
    """
    # (output column, source PSV column), in output order.
    column_map = (
        ("temperature_C", "temperature"),
        ("dew_point_temperature_C", "dew_point_temperature"),
        ("station_level_pressure_hPa", "station_level_pressure"),
        ("sea_level_pressure_hPa", "sea_level_pressure"),
        ("altimeter_hPa", "altimeter"),
        ("wind_direction_deg", "wind_direction"),
        ("wind_speed_mps", "wind_speed"),
        ("wind_gust_mps", "wind_gust"),
        ("precipitation_mm", "precipitation"),
        ("precipitation_6_hour_mm", "precipitation_6_hour"),
        ("precipitation_24_hour_mm", "precipitation_24_hour"),
        ("sky_cover_baseht_1_m", "sky_cover_baseht_1"),
        ("sky_cover_1_code", "sky_cover_1"),
        ("sky_cover_baseht_2_m", "sky_cover_baseht_2"),
        ("sky_cover_2_code", "sky_cover_2"),
        ("sky_cover_baseht_3_m", "sky_cover_baseht_3"),
        ("sky_cover_3_code", "sky_cover_3"),
    )

    psv_name = os.path.join(in_dir, f"GHCNh_{station_id}_{year}.psv")
    if not os.path.exists(psv_name):
        raise FileNotFoundError(psv_name)

    print(f"  Converting {psv_name} to CSV...")

    raw = pd.read_csv(psv_name, sep="|", low_memory=False)

    # Use the first datetime-like column name that the file actually has.
    stamp_col = next(
        (
            name
            for name in ("date_time", "DATE", "date", "time")
            if name in raw.columns
        ),
        None,
    )
    if stamp_col is None:
        raise ValueError(
            f"No datetime column found in {psv_name}. "
            "Check column names and adjust script."
        )

    stamps = pd.to_datetime(raw[stamp_col])

    result = pd.DataFrame()
    # The timestamp goes in first so it fixes the row index that the
    # scalar pd.NA fill-ins below are broadcast against.
    result["YR--MODAHRMN"] = stamps.dt.strftime("%Y%m%d%H%M")
    for target, source in column_map:
        result[target] = raw[source] if source in raw.columns else pd.NA

    out_name = os.path.join(out_dir, f"{station_id}-{year}.csv")
    result.to_csv(out_name, index=False)
    print(f"  Wrote {out_name}")
    return out_name


def combine_years(station_id, years, out_dir="."):
    """
    Combine multiple yearly CSVs into a single file:
        <station_id>-<start_year>-<end_year>.csv

    Assumes each yearly file is named:
        {station_id}-{year}.csv  in out_dir

    and has identical headers (only the header of the first file found is
    kept; the headers of later files are dropped without being compared).

    years: iterable of years; it is processed in ascending order and its
        smallest/largest values name the combined file.
    Yearly files that do not exist are reported and skipped; empty files
    are skipped silently.

    Returns the path of the combined CSV.
    """
    years_sorted = sorted(years)
    start_year = years_sorted[0]
    end_year = years_sorted[-1]

    combined_name = os.path.join(
        out_dir, f"{station_id}-{start_year}-{end_year}.csv"
    )
    print(f"Combining years into {combined_name}")

    def terminated(line):
        # A file whose final line lacks a newline would otherwise have that
        # row fused with the first row of the next yearly file.
        return line if line.endswith("\n") else line + "\n"

    with open(combined_name, "w", newline="") as fout:
        wrote_header = False
        for y in years_sorted:
            fname = os.path.join(out_dir, f"{station_id}-{y}.csv")
            if not os.path.exists(fname):
                print(f"  WARNING: {fname} not found, skipping.")
                continue

            with open(fname, "r") as fin:
                header = fin.readline()
                if not header:
                    # Completely empty file: nothing to copy.
                    continue

                if not wrote_header:
                    fout.write(terminated(header))
                    wrote_header = True

                # Stream the data rows instead of loading the whole year
                # into memory.
                fout.writelines(terminated(line) for line in fin)

    print("Done combining.")
    return combined_name


def main(argv=None):
    """
    Command-line entry point: download, convert and optionally combine years.

    argv: optional list of argument strings. None (the default) lets
        argparse read the process command line, so main() behaves as before.

    Steps:
    1. Resolve the station ID: --station-id if given, otherwise built from
       --country plus --wban/--wmo/--usaf.
    2. For every year from --start-year to --end-year (inclusive), download
       the PSV file into --out-dir and convert it to CSV there.
    3. With --combine and more than one year, merge the yearly CSVs.

    Invalid arguments end the program through SystemExit with a message
    instead of a traceback.
    """
    parser = argparse.ArgumentParser(
        description="Download and convert GHCN-Hourly data to WeatherPrep-ready CSV."
    )

    # Station identification
    parser.add_argument(
        "--station-id",
        help="Full GHCN-H station ID (e.g., ITW00034113). "
             "If provided, overrides country/WBAN/WMO/USAF.",
    )
    parser.add_argument(
        "--country",
        help="Two-letter country code (e.g., IT, US) for building station ID.",
    )
    parser.add_argument("--wban", help="5-digit WBAN code (e.g., 34113).")
    parser.add_argument("--wmo", help="5-digit WMO code (e.g., 16560).")
    parser.add_argument("--usaf", help="6-digit USAF code (e.g., 162890).")

    # Years
    parser.add_argument("--start-year", type=int, required=True)
    parser.add_argument("--end-year", type=int, required=True)

    # Output directory
    parser.add_argument(
        "--out-dir",
        default=".",
        help="Directory to store downloaded PSV and output CSV files (default: current directory).",
    )

    # Combine flag
    parser.add_argument(
        "--combine",
        action="store_true",
        help="If set, combine yearly CSVs into a single <station>-<start>-<end>.csv",
    )

    args = parser.parse_args(argv)

    # A reversed range would make the year loop a silent no-op and, with
    # --combine, hand combine_years() an empty sequence.
    if args.start_year > args.end_year:
        parser.error(
            f"--start-year ({args.start_year}) must not be greater than "
            f"--end-year ({args.end_year})."
        )

    # Determine station_id
    if args.station_id:
        station_id = args.station_id.strip()
    elif not args.country:
        raise SystemExit(
            "Error: if --station-id is not provided, "
            "--country plus at least one of --wban/--wmo/--usaf is required."
        )
    else:
        try:
            station_id = build_station_id(
                country_code=args.country,
                wban=args.wban,
                wmo=args.wmo,
                usaf=args.usaf,
            )
        except ValueError as exc:
            # --country given without any usable station code: report it the
            # same way as the missing-country case above.
            raise SystemExit(f"Error: {exc}") from None

    print(f"Using GHCN-Hourly station ID: {station_id}")

    os.makedirs(args.out_dir, exist_ok=True)

    years = range(args.start_year, args.end_year + 1)

    # Download + convert per year. The downloaded .psv files are left in
    # out_dir next to the generated CSVs.
    for y in years:
        download_ghcnh_psv(station_id, y, out_dir=args.out_dir)
        psv_to_weatherprep_csv(
            station_id, y, in_dir=args.out_dir, out_dir=args.out_dir
        )

    # Optional: combine yearly CSVs (pointless for a single year, whose
    # yearly CSV already is the complete result).
    if args.combine and args.start_year != args.end_year:
        combine_years(station_id, years, out_dir=args.out_dir)

    print("All done.")


if __name__ == "__main__":
    main()
