# Created by Li Zhang (SUNY-ESF).

# This script resamples weather data to a sub-hourly time step and trims the input to the user-provided start date and end date.
# This revision addresses the data gap between the total precipitation in the original dataset and that in the interpolated dataset. 
# By evenly distributing this gap across the resampled data points, we ensure that the sum of precipitation values matches that of the original data.
# Before running this script, open and edit the following parameters:
# - root_path: The input path containing the raw weather data.
# - start_date: The start date for data processing.
# - end_date: The end date for data processing.
# - rawdata_interval_minutes: The time step of the raw input data, in minutes.
# - interval_minutes: The desired time step of the resampled output, in minutes.

# To run the script, execute: python z_SubHourResampleMeteorologicalData.py

import pandas as pd
import os
import sys

# Input and output directories. To take them from the command line instead,
# assign sys.argv[1] to root_path and sys.argv[2] to output_path.
root_path = "C:/iTree/WeatherPrep/TestingFilesAndScript/TestCases/z_SubHourResample/input"
output_path = "C:/iTree/WeatherPrep/TestingFilesAndScript/TestCases/z_SubHourResample/output"

# CSV files to process, resolved relative to root_path
file_names = ["Weather.csv", "Radiation.csv", "Evaporation.csv"]
file_paths = [os.path.join(root_path, name) for name in file_names]

# Date range to process
start_date = "2020/01/01"
end_date = "2020/01/31"

# Time steps in minutes: the raw input data first, then the resampled output
rawdata_interval_minutes = 60
interval_minutes = 2


def preprocess_weather_data(file_path, start_date, end_date, interval_minutes,
                            raw_interval_minutes=None):
    """Resample one weather CSV to a finer time step within a date range.

    The CSV must contain the columns 'YYYYMMDD' and 'HH:MM:SS'; every other
    column is treated as a data column that is averaged per resampling bin
    and then linearly interpolated. Columns whose name contains
    'Precipitation' are additionally shifted by a constant so that their
    rescaled total matches the total of the filtered original data.

    Args:
        file_path: Path of the CSV file to read.
        start_date: First date to keep (anything ``pd.to_datetime`` accepts).
        end_date: Last date to keep; rows up to and including midnight of
            the following day are retained.
        interval_minutes: Time step of the resampled output, in minutes.
        raw_interval_minutes: Time step of the raw data, in minutes. When
            omitted, the module-level ``rawdata_interval_minutes`` is used.

    Returns:
        A DataFrame with 'YYYYMMDD' and 'HH:MM:SS' string columns followed by
        the data columns in their original order.
    """
    if raw_interval_minutes is None:
        # Fall back to the module-level setting so existing callers keep working
        raw_interval_minutes = rawdata_interval_minutes

    date_col, time_col = 'YYYYMMDD', 'HH:MM:SS'

    # Read the date and time columns as text and combine them explicitly;
    # the dict form of ``parse_dates`` is deprecated in recent pandas.
    df = pd.read_csv(file_path, dtype={date_col: str, time_col: str})
    value_columns = [col for col in df.columns if col not in (date_col, time_col)]
    timestamps = pd.to_datetime(
        df[date_col].str.strip() + ' ' + df[time_col].str.strip(),
        format='%Y%m%d %H:%M:%S',
    )

    # Work on an independent copy indexed by timestamp (no chained assignment)
    df = df[value_columns].copy()
    df.index = pd.DatetimeIndex(timestamps, name='Datetime')

    # Filter data based on the user-provided date range; the upper bound is
    # midnight after end_date so the whole last day is covered.
    range_start = pd.to_datetime(start_date)
    range_end = pd.to_datetime(end_date) + pd.Timedelta(days=1)
    df = df[(df.index >= range_start) & (df.index <= range_end)]

    # Resample to the requested interval ('min' replaces the deprecated 'T'
    # alias), then fill the newly created empty bins by linear interpolation.
    df_resampled = df.resample(f'{interval_minutes}min').mean()
    df_resampled = df_resampled.interpolate(method='linear')

    #******************************************
    # Adjust columns containing 'Precipitation' in the name
    precipitation_cols = [col for col in df_resampled.columns if 'Precipitation' in col]
    resampled_data_length = len(df_resampled)
    # Factor that converts a sum over resampled points to the original time step
    step_ratio = interval_minutes / raw_interval_minutes
    if resampled_data_length > 0:  # nothing to distribute over when no rows remain
        for col in precipitation_cols:
            total_original = df[col].sum()
            total_resampled = df_resampled[col].sum() * step_ratio  # adjust based on original time step
            data_gap = total_original - total_resampled
            print(f"Original total {col}: {total_original}")
            print(f"Resampled total {col}: {total_resampled}")
            print(f"Data gap for {col}: {data_gap}")
            print(f"Total number of resampled data points: {resampled_data_length}")
            print(f"Adjustment per data point for {col}: {data_gap / resampled_data_length}")
            # Spread the gap evenly over all points, expressed in resampled units.
            # NOTE(review): a negative gap lowers every point by the same amount,
            # so points that were 0 become negative - confirm this is acceptable.
            df_resampled[col] = df_resampled[col] + data_gap / step_ratio / resampled_data_length
            print(f"Adjusted resampled total {col}: {df_resampled[col].sum() * step_ratio}")
    #******************************************

    # Turn the timestamp index back into separate 'YYYYMMDD' and 'HH:MM:SS' columns
    df_resampled = df_resampled.reset_index()
    df_resampled[date_col] = df_resampled['Datetime'].dt.strftime('%Y%m%d')
    df_resampled[time_col] = df_resampled['Datetime'].dt.strftime('%H:%M:%S')

    # Restore the original column order, leaving out the helper 'Datetime' column
    return df_resampled[[date_col, time_col] + value_columns]

def process_multiple_files(file_paths, start_date, end_date, interval_minutes, output_path):
    """Preprocess each input CSV and write the result into ``output_path``.

    Every file in ``file_paths`` is passed to ``preprocess_weather_data`` and
    the returned DataFrame is saved as a CSV that keeps the input's base name.

    Returns:
        The list of processed DataFrames, in the same order as ``file_paths``.
    """
    # exist_ok avoids an error when the output directory is already present
    os.makedirs(output_path, exist_ok=True)

    processed_frames = []
    for source_path in file_paths:
        frame = preprocess_weather_data(source_path, start_date, end_date, interval_minutes)
        processed_frames.append(frame)

        # Write next to the other outputs under the input's base name
        stem, _ = os.path.splitext(os.path.basename(source_path))
        destination = os.path.join(output_path, stem + '.csv')
        frame.to_csv(destination, index=False, header=True, date_format='%Y%m%d %H:%M:%S')

    return processed_frames

# Run the pipeline with the settings configured at the top of the file
result_dfs = process_multiple_files(
    file_paths,
    start_date,
    end_date,
    interval_minutes,
    output_path,
)


# Report the output location of every processed file
for source_path, _ in zip(file_paths, result_dfs):
    saved_name = f'{os.path.splitext(os.path.basename(source_path))[0]}.csv'
    print(f"Saved to {os.path.join(output_path, saved_name)}\n")
