#!/usr/bin/env python3

"""
Build ghcnh-station-attributes.csv from:

1) ghcnh-inventory_unique.csv (columns: GHCNh_ID, YEAR; from
   https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/doc/ghcnh-inventory.txt)
2) emshr_lite.txt (huge fixed-width metadata file;
   https://www.ncei.noaa.gov/access/homr/file/emshr_lite.txt)

For each GHCNh_ID in the inventory file, this script:

- finds matching rows in emshr_lite.txt by the GHCNH column
- if duplicates exist, keeps the row with the latest END_DT
  (e.g., 99991231 beats 19880501)
- writes a CSV with:
  GHCNH, NCDC, BEG_DT, END_DT, COOP, WBAN, WMO, STATION_NAME,
  CC, CTRY_NAME, ST, COUNTY, CD, UTC, LAT_DEC, LON_DEC, EL_GR_M
"""

import csv

# Input/output locations. The paths are relative, so they are resolved
# against the current working directory when the script runs.
INVENTORY_CSV = "ghcnh-inventory_unique.csv"  # inventory CSV supplying the GHCNh IDs
EMSHR_TXT = "emshr_lite.txt"  # fixed-width station metadata file that gets scanned
OUTPUT_CSV = "ghcnh-station-attributes.csv"  # CSV written by build_station_attributes()


def read_inventory_ids(filename):
    """Read the station IDs from the inventory CSV.

    The ID column is located by header name. ``GHCNh_ID``, ``GHCNH_ID`` and
    ``ghcnh_id`` are tried first (in that order); failing that, any header
    equal to ``ghcnh_id`` once surrounding whitespace and letter case are
    ignored is accepted.

    Args:
        filename: Path of the inventory CSV. The first row must be a header.

    Returns:
        A ``(ids_list, ids_set)`` tuple. ``ids_list`` holds every non-blank
        ID in file order (duplicates are kept); ``ids_set`` holds the
        distinct IDs.

    Raises:
        ValueError: If no GHCNh_ID column can be found, which includes the
            case of an empty file that has no header row at all.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise become part of the first header name and hide the column.
    with open(filename, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)

        # fieldnames is None for an empty file; normalise it to an empty list
        # so the lookup below ends in the ValueError rather than a TypeError.
        fieldnames = reader.fieldnames or []

        ghcnh_field = next(
            (
                cand
                for cand in ("GHCNh_ID", "GHCNH_ID", "ghcnh_id")
                if cand in fieldnames
            ),
            None,
        )
        if ghcnh_field is None:
            # Fallback: tolerate padding and other capitalisations.
            ghcnh_field = next(
                (
                    name
                    for name in fieldnames
                    if name is not None and name.strip().lower() == "ghcnh_id"
                ),
                None,
            )

        if ghcnh_field is None:
            raise ValueError(
                f"Could not find a GHCNh_ID column in {filename}. "
                f"Found columns: {reader.fieldnames}"
            )

        ids_list = []
        ids_set = set()
        for row in reader:
            # DictReader fills cells missing from a short row with None.
            station_id = (row.get(ghcnh_field) or "").strip()
            if not station_id:
                continue
            ids_list.append(station_id)
            ids_set.add(station_id)

    return ids_list, ids_set


def build_column_indexes(header_line):
    """Derive fixed-width column slices from the metadata file's header line.

    Every label of the full header sequence is located left to right. A
    column starts where its own label starts and ends where the NEXT label
    of the full sequence starts, so for example WBAN spans exactly from
    'WBAN' to 'ICAO', NOT all the way to 'WMO'.

    The last column (GHCNH) has no following label, so its end is ``None``
    ("to the end of the line"). Bounding it by the header's length would
    truncate values wider than the label whenever the header carries no
    trailing padding.

    Args:
        header_line: The first line of the fixed-width file, holding the
            column labels. A trailing newline is harmless.

    Returns:
        A dict mapping each column needed for the output to a
        ``(start, end)`` pair usable as ``line[start:end]``; ``end`` is
        ``None`` for GHCNH.

    Raises:
        ValueError: If a label is missing or out of the expected order.
    """
    # Full ordered list of header names in this file
    header_names = (
        "NCDC",
        "BEG_DT",
        "END_DT",
        "COOP",
        "WBAN",
        "ICAO",
        "FAA",
        "NWSLI",
        "WMO",
        "TRANS",
        "GHCND",
        "STATION_NAME",
        "CC",
        "CTRY_NAME",
        "ST",          # state/province
        "COUNTY",
        "CD",
        "UTC",
        "LAT_DEC",
        "LON_DEC",
        "LOC_PREC",
        "LAT_DMS",
        "LON_DMS",
        "EL_GR_FT",
        "EL_GR_M",
        "EL_AP_FT",
        "EL_AP_M",
        "TYPE",
        "RELOCATION",
        "GHCNMLT",
        "IGRA",
        "HPD",
        "GHCNH",
    )

    # Subset of columns the caller actually needs
    wanted = (
        "NCDC",
        "BEG_DT",
        "END_DT",
        "COOP",
        "WBAN",
        "WMO",
        "STATION_NAME",
        "CC",
        "CTRY_NAME",
        "ST",
        "COUNTY",
        "CD",
        "UTC",
        "LAT_DEC",
        "LON_DEC",
        "EL_GR_M",
        "GHCNH",
    )

    # Find the starting index of each label in order, scanning left to right.
    starts = []
    search_pos = 0
    for name in header_names:
        idx = header_line.find(name, search_pos)
        if idx == -1:
            raise ValueError(
                f"Header label {name!r} not found at or after offset "
                f"{search_pos}; the file layout may have changed."
            )
        starts.append(idx)
        # Resume after the whole label so the next (short) label can never
        # match inside the text of the one just found.
        search_pos = idx + len(name)

    # Each column ends where the next one starts; the last one is open-ended.
    ends = starts[1:] + [None]
    spans = dict(zip(header_names, zip(starts, ends)))

    return {name: spans[name] for name in wanted}


def slice_field(line, start, end):
    """Return columns ``start`` up to (not including) ``end`` of *line*, trimmed.

    A line too short to reach ``start`` yields an empty string; surrounding
    whitespace (including a trailing newline) is removed from the result.
    """
    raw = line[start:end] if start <= len(line) else ""
    return raw.strip()


def build_station_attributes():
    """Join the inventory's station IDs with their metadata and write the CSV.

    Steps:
      1. Read the GHCNh IDs from INVENTORY_CSV.
      2. Stream EMSHR_TXT once and keep, for every inventory ID, the record
         with the largest END_DT (a blank or non-numeric END_DT counts as 0;
         on a tie the record seen first is kept).
      3. Write one row per distinct inventory ID, in order of first
         appearance in the inventory, to OUTPUT_CSV, and report how many
         IDs had no metadata record.

    Progress messages are printed to standard output.

    Raises:
        RuntimeError: If EMSHR_TXT is empty.

    Errors raised by read_inventory_ids() and build_column_indexes() are
    not caught here and propagate to the caller.
    """
    # Output columns, in the order they appear in the CSV.
    fieldnames = [
        "GHCNH",
        "NCDC",
        "BEG_DT",
        "END_DT",
        "COOP",
        "WBAN",
        "WMO",
        "STATION_NAME",
        "CC",
        "CTRY_NAME",
        "ST",
        "COUNTY",
        "CD",
        "UTC",
        "LAT_DEC",
        "LON_DEC",
        "EL_GR_M",
    ]
    # GHCNH and END_DT are sliced up front for matching/ranking; the rest are
    # only extracted once a record is known to be the best one so far.
    deferred_fields = [
        name for name in fieldnames if name not in ("GHCNH", "END_DT")
    ]

    # 1. Read inventory IDs
    print(f"Reading inventory from {INVENTORY_CSV} ...")
    ids_list, ids_set = read_inventory_ids(INVENTORY_CSV)
    print(f"  Found {len(ids_list)} GHCNh_ID entries (unique: {len(ids_set)})")

    # 2. Stream through the metadata file and pick the best row per GHCNH
    best_rows = {}  # GHCNH -> (END_DT_int, row_data_dict)

    print(f"Scanning metadata file {EMSHR_TXT} ...")
    with open(EMSHR_TXT, encoding="utf-8") as f:
        header_line = f.readline()
        if not header_line:
            raise RuntimeError(f"{EMSHR_TXT} appears to be empty.")

        cols = build_column_indexes(header_line)

        # The dashed separator that usually follows the header is dropped by
        # the startswith("-") test below, so line 2 is not skipped blindly:
        # if it is a data record it must not be lost.
        for line in f:
            stripped = line.strip()
            if not stripped or stripped.startswith("-"):
                continue  # blank or separator line

            ghcnh = slice_field(line, *cols["GHCNH"])
            if not ghcnh or ghcnh not in ids_set:
                continue  # we only care about GHCNh IDs from the inventory

            # Extract END_DT and convert to int (0 if blank or bad)
            end_dt_str = slice_field(line, *cols["END_DT"])
            try:
                end_dt = int(end_dt_str) if end_dt_str else 0
            except ValueError:
                end_dt = 0

            # Keep only the row with the latest END_DT for each GHCNH;
            # strictly-greater comparison means the first record wins ties.
            previous = best_rows.get(ghcnh)
            if previous is not None and end_dt <= previous[0]:
                continue

            row = {"GHCNH": ghcnh, "END_DT": end_dt_str}
            for name in deferred_fields:
                row[name] = slice_field(line, *cols[name])
            best_rows[ghcnh] = (end_dt, row)

    print(f"  Collected metadata for {len(best_rows)} stations.")

    # 3. Write output CSV
    print(f"Writing output CSV to {OUTPUT_CSV} ...")

    # An ID listed several times in the inventory must still yield a single
    # output row; dict.fromkeys de-duplicates while keeping file order.
    unique_ids = list(dict.fromkeys(ids_list))

    written = 0
    missing = 0
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as out_f:
        writer = csv.DictWriter(out_f, fieldnames=fieldnames)
        writer.writeheader()

        for ghcnh_id in unique_ids:
            entry = best_rows.get(ghcnh_id)
            if entry is None:
                missing += 1
                continue
            writer.writerow(entry[1])
            written += 1

    # Report the number of rows actually written to the file.
    print(f"Done. Wrote {written} station rows.")
    if missing:
        print(f"Warning: {missing} inventory GHCNh_IDs had no matching metadata rows.")


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    build_station_attributes()
