#!/usr/bin/env python
"""
Process GHCNh inventory file to keep unique station IDs with greatest value in 2nd column.
Specifically designed for whitespace-separated GHCN inventory format.
"""

import pandas as pd
import numpy as np
import os
import sys
from io import StringIO

def _parse_inventory_lines(lines):
    """
    Turn raw inventory lines into one record per usable row.

    Blank lines are ignored. Lines with fewer than two whitespace-separated
    columns are reported and dropped. Rows whose second column is not a usable
    number (unparseable text, or NaN) are reported and kept with a value of 0.

    Args:
        lines: Sequence of text lines; surrounding whitespace is stripped.

    Returns:
        List of dicts with the keys 'station_id' (first column), 'value'
        (second column as float, or 0 when not usable), 'line_num' (1-based
        position in ``lines``) and 'full_line' (the stripped line).
    """
    parsed_rows = []
    for line_num, raw_line in enumerate(lines, 1):
        line = raw_line.strip()
        if not line:
            continue

        # str.split() with no argument splits on any run of whitespace, which
        # covers both tab-separated and space-aligned inventory files.
        parts = line.split()
        if len(parts) < 2:
            print(f"  Warning line {line_num}: Only {len(parts)} column(s)")
            continue

        try:
            value = float(parts[1])
        except ValueError:
            value = None
        else:
            # float() happily parses "nan". A station whose values are all NaN
            # has no maximum, which would break the per-station idxmax lookup
            # downstream, so NaN is handled like any other non-numeric field.
            if pd.isna(value):
                value = None

        if value is None:
            print(f"  Warning line {line_num}: Cannot convert '{parts[1]}' to number")
            # The row is kept (not skipped) with a default of 0, so the station
            # still appears in the output when it has no numeric rows.
            value = 0

        parsed_rows.append({
            'station_id': parts[0],
            'value': value,
            'line_num': line_num,
            'full_line': line,
        })
    return parsed_rows


def process_ghcnh_inventory_robust(input_file, output_file=None):
    """
    Reduce a whitespace-separated GHCNh inventory file to one row per station.

    For each station ID (first column) the row with the greatest numeric value
    in the second column is kept; on ties the earliest row in the file wins.
    The kept rows are written, sorted by station ID, as their original
    (stripped) text lines. Progress, warnings and summary statistics are
    printed to stdout.

    Args:
        input_file: Path of the inventory file to read.
        output_file: Path of the file to write. Defaults to the input path with
            its extension replaced by ``_unique.csv``.

    Returns:
        A pandas DataFrame with columns 'station_id', 'value', 'line_num' and
        'full_line' (one row per station, sorted by station ID), or None when
        the file holds no usable rows or any error occurs.
    """
    if output_file is None:
        base_name = os.path.splitext(input_file)[0]
        output_file = f"{base_name}_unique.csv"

    print(f"Processing: {input_file}")
    print("Reading file...")

    try:
        with open(input_file, 'r') as f:
            content = f.read()

        # Show the first few lines so format problems are easy to spot.
        lines = content.strip().split('\n')
        print("\nFirst 3 lines of file:")
        for shown_num, preview in enumerate(lines[:3], 1):
            print(f"  Line {shown_num}: {preview[:100]}...")

        parsed_data = _parse_inventory_lines(lines)
        if not parsed_data:
            print("Error: No valid data found in file")
            return None

        print(f"\nParsed {len(parsed_data)} rows")

        df = pd.DataFrame(parsed_data)
        print(f"Unique station IDs before: {df['station_id'].nunique()}")

        # idxmax returns the label of the first occurrence of each group's
        # maximum, so ties resolve to the earliest row in the file.
        idx_max = df.groupby('station_id')['value'].idxmax()
        result_df = df.loc[idx_max].sort_values('station_id').reset_index(drop=True)

        print(f"Unique station IDs after: {len(result_df)}")

        # Write the original text of each kept row so the input format is preserved.
        with open(output_file, 'w') as f:
            f.writelines(f"{kept_line}\n" for kept_line in result_df['full_line'])

        print(f"\nSaved {len(result_df)} unique stations to: {output_file}")

        removed = len(parsed_data) - len(result_df)
        print("\n--- Statistics ---")
        print(f"Total lines in file: {len(lines)}")
        print(f"Valid data rows: {len(parsed_data)}")
        print(f"Unique stations kept: {len(result_df)}")
        print(f"Duplicates removed: {removed}")
        # parsed_data is guaranteed non-empty here, so the division is safe.
        reduction_pct = (removed / len(parsed_data)) * 100
        print(f"Reduction: {reduction_pct:.1f}%")

        print("\n--- Sample Results ---")
        print("First 5 stations (ID, value):")
        for row in result_df.head().itertuples(index=False):
            print(f"  {row.station_id}: {row.value}")

        print("\nTop 5 stations by value:")
        for row in result_df.nlargest(5, 'value').itertuples(index=False):
            print(f"  {row.station_id}: {row.value}")

        return result_df

    except Exception as e:
        # Deliberate catch-all: callers get None for any failure (unreadable
        # input, unwritable output, ...) and the traceback is printed so the
        # cause is not lost.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None

def main():
    """
    Command-line entry point.

    Parses ``--input``/``-i`` (default ``ghcnh-inventory.txt``) and
    ``--output``/``-o``, then runs the inventory processor. Exits with status 1
    when the input path does not exist or when processing fails.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Process GHCNh inventory - keep unique stations with max value in 2nd column'
    )
    parser.add_argument('--input', '-i', default='ghcnh-inventory.txt',
                        help='Input inventory file (default: ghcnh-inventory.txt)')
    parser.add_argument('--output', '-o',
                        help='Output file (default: <input>_unique.csv)')

    args = parser.parse_args()

    # Fail early with a helpful message instead of a traceback from open().
    if not os.path.exists(args.input):
        print(f"Error: File '{args.input}' not found.")
        print(f"Current directory: {os.getcwd()}")
        sys.exit(1)

    result = process_ghcnh_inventory_robust(args.input, args.output)
    # The processor reports failure by returning None; surface that in the
    # exit status so shell scripts and pipelines can detect it.
    if result is None:
        sys.exit(1)

if __name__ == "__main__":
    main()