#z_VaryingTimeStepResampleMeteorologicalData.py was created by T. Endreny (SUNY ESF) January 2025 for i-Tree meteorological processing

# This script is used for variable time step adjustment of i-Tree meteorological data files Weather.csv, Radiation.csv, and Evaporation.csv
# Before running this script, change the following parameters:
# - DateRange_Resampled provides pairs of dates (YYYYMMDD) for resampling a distinct timestep
# - TimeRange_Resampled provides pairs of times (HH:MM:SS) for resampling a distinct timestep, corresponding with DateRange_Resampled
# - TimeStep_Resampled provides scalars of durations (sec) for resampling a distinct timestep, corresponding with DateRange_Resampled

# To run the script, execute: python z_VaryingTimeStepResampleMeteorologicalData.py Path_to_MetData

import math
import os
import sys
from datetime import datetime, timedelta

#Change DateRange_Resampled to match project time frame
#DateRange_Resampled provides pairs of inclusive dates (YYYYMMDD) for resampling; each pair corresponds by position with one entry in TimeRange_Resampled and one entry in TimeStep_Resampled
#Note: rows whose date falls inside no pair are not resampled, so consecutive pairs should leave no gap between them
#Note: the 2016 winter pair ends on 20160229 so the leap day is not left outside every pair
DateRange_Resampled = [
    (20150101, 20150228), (20150301, 20150930), (20151001, 20151231),
    (20160101, 20160229), (20160301, 20160930), (20161001, 20161231),
    (20170101, 20170228), (20170301, 20170930), (20171001, 20171231),
    (20180101, 20180228), (20180301, 20180930), (20181001, 20181231),
    (20190101, 20190228), (20190301, 20190930), (20191001, 20191231),
]

#Change TimeRange_Resampled to correspond with DateRange_Resampled
#TimeRange_Resampled provides pairs of inclusive times (HH:MM:SS) for resampling, one entry per pair in DateRange_Resampled
#Note: an entry is either a single (start, end) tuple or a list of (start, end) tuples, which allows multiple time blocks in any one date range
TimeRange_Resampled = [
    ("00:00:00", "23:00:00"), [("00:00:00", "06:00:00"), ("21:00:00", "23:00:00")], ("00:00:00", "23:00:00"),
    ("00:00:00", "23:00:00"), [("00:00:00", "06:00:00"), ("21:00:00", "23:00:00")], ("00:00:00", "23:00:00"),
    ("00:00:00", "23:00:00"), [("00:00:00", "06:00:00"), ("21:00:00", "23:00:00")], ("00:00:00", "23:00:00"),
    ("00:00:00", "23:00:00"), [("00:00:00", "06:00:00"), ("21:00:00", "23:00:00")], ("00:00:00", "23:00:00"),
    ("00:00:00", "23:00:00"), [("00:00:00", "06:00:00"), ("21:00:00", "23:00:00")], ("00:00:00", "23:00:00"),
]

#Change TimeStep_Resampled to correspond with TimeRange_Resampled
#TimeStep_Resampled provides integers of time step duration (sec) for resampling, one time step per pair of times in TimeRange_Resampled
#Note: where the TimeRange_Resampled entry is a list of pairs, the TimeStep_Resampled entry is a list with one time step per pair
TimeStep_Resampled = [
    21600, [10800, 10800], 21600,
    21600, [10800, 10800], 21600,
    21600, [10800, 10800], 21600,
    21600, [10800, 10800], 21600,
    21600, [10800, 10800], 21600,
]

#_AGGREGATION_RULES pairs each aggregation method with the unit tags that select it; rules are tested in order and the first match wins
#Note: precipitation rates (m/h) are averaged and not summed when assigning a new time step; summing intensifies precipitation rates over longer durations
#Note: 'circular' is for wind direction, which has a circular structure, where you can't average 10 deg and 350 deg to get 180 deg
#Note: "(yet to be defined)" is a placeholder tag for both 'sum' and 'circular'; because 'sum' is tested first, 'circular' cannot be selected until it is given its own tag
_AGGREGATION_RULES = (
    ('sum', ("(yet to be defined)",)),
    ('mean', ("(F)", "(m/h)", "(m/s)", "(kPa)", "(W/m^2)")),
    ('last', ("(m)", "(Radian)")),
    ('circular', ("(yet to be defined)",)),
)


#_select_aggregation_method function returns the aggregation method name for one header column
def _select_aggregation_method(column_name):
    """Return 'sum', 'mean', 'last' or 'circular' for the header text column_name.

    A column whose header contains none of the known unit tags falls back to
    'last', so that every column of the header is also present in every
    aggregated row; a notice is printed when the fallback is used.
    """
    for aggregation_method, unit_tags in _AGGREGATION_RULES:
        if any(unit_tag in column_name for unit_tag in unit_tags):
            return aggregation_method
    #'last' keeps an observed value and makes no arithmetic assumption about an unknown column
    print(f"No aggregation rule matches column '{column_name}'; using the last value of each interval")
    return 'last'


#_aggregate_values function reduces the values collected over one interval to a single number
def _aggregate_values(values, aggregation_method):
    """Return the aggregate of the non-empty list values using aggregation_method.

    Raises:
        ValueError: if aggregation_method is not one of the four known names.
    """
    if aggregation_method == 'sum':
        return sum(values)
    if aggregation_method == 'mean':
        return sum(values) / len(values)
    if aggregation_method == 'last':
        return values[-1]
    if aggregation_method == 'circular':
        #Circular mean: sum the unit vectors of the angles (deg) and take the angle of the resultant
        angles_rad = [math.radians(value) for value in values]
        x_component = sum(math.cos(angle_rad) for angle_rad in angles_rad)
        y_component = sum(math.sin(angle_rad) for angle_rad in angles_rad)
        mean_angle_rad = math.atan2(y_component, x_component)
        #Convert back to degrees and ensure the result is between 0 and 360
        return (math.degrees(mean_angle_rad) + 360) % 360
    raise ValueError(f"Unknown aggregation method: {aggregation_method}")


#_find_time_step function returns the resampling time step (sec) that applies to one row, or None
def _find_time_step(date_str, time_str, date_ranges, time_ranges, time_steps):
    """Return the time step of the first date and time window containing the row.

    Dates are compared as integers and times as HH:MM:SS strings, both
    inclusive. An entry of time_ranges may be one (start, end) tuple paired
    with a scalar time step, or a list of tuples paired with a list of time
    steps. None is returned when no window contains the row.
    """
    date_value = int(date_str)
    for (start_date, end_date), time_range, time_step in zip(date_ranges, time_ranges, time_steps):
        if not start_date <= date_value <= end_date:
            continue
        if isinstance(time_range, list):
            for (start_time, end_time), sub_time_step in zip(time_range, time_step):
                if start_time <= time_str <= end_time:
                    return sub_time_step
        elif time_range[0] <= time_str <= time_range[1]:
            return time_step
    return None


#_format_aggregated_row function builds one output line from the values collected over an interval
def _format_aggregated_row(date_str, time_str, column_values, column_methods):
    """Return 'date,time,v1,v2,...' plus a line return, each value with 8 decimals."""
    aggregated_values = (
        _aggregate_values(values, aggregation_method)
        for values, aggregation_method in zip(column_values, column_methods)
    )
    return f"{date_str},{time_str}," + ",".join(f"{value:.8f}" for value in aggregated_values) + "\n"


#resample_timeStep_function function undertakes the resampling of meteorological files, called from main
def resample_timeStep_function(input_file, output_file, date_ranges, time_ranges, time_steps):
    """Copy input_file to output_file, aggregating rows inside the resampling windows.

    The first line is the header and is copied to the output. The first data
    row is copied unchanged and marks the start of the first interval. After
    that, a row outside every window is copied unchanged, while a row inside
    a window is accumulated; once the seconds elapsed since the interval
    start reach the time step of the window, one aggregated row is written
    with the date and time of the current row and a new interval starts.
    Rows whose field count differs from the header are reported and skipped.

    Args:
        input_file: path of the CSV to read; columns 1 and 2 are YYYYMMDD and HH:MM:SS.
        output_file: path of the CSV to write.
        date_ranges: list of (start, end) integer dates, YYYYMMDD, inclusive.
        time_ranges: per date range, one (start, end) tuple of HH:MM:SS strings or a list of such tuples.
        time_steps: per date range, one time step (sec) or a list with one time step per time tuple.

    Raises:
        ValueError: if a date, time, or data value of a row cannot be parsed.
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        #header_values_vector is the first line, stripped of white space, split by commas, and written back out
        header_values_vector = infile.readline().strip().split(',')
        outfile.write(','.join(header_values_vector) + '\n')

        #column_methods holds one aggregation method per data column, by position after the date and time columns
        #Note: working by position keeps aggregated rows aligned with the header, also for repeated or unrecognised column names
        column_methods = [_select_aggregation_method(column_name) for column_name in header_values_vector[2:]]
        #column_values holds, per data column, the values collected in the current interval
        column_values = [[] for _ in column_methods]

        #start_datetime is the date and time of the first row or of the last aggregated row written
        start_datetime = None
        current_datetime = None
        time_duration_sec = 0
        first_row = True

        for infile_row_values in infile:
            row_values_vector = infile_row_values.strip().split(',')
            if len(row_values_vector) != len(header_values_vector):
                print(f"Skipping malformed infile_row_values: {infile_row_values.strip()}")
                continue

            date_str, time_str = row_values_vector[0], row_values_vector[1]
            current_datetime = datetime.strptime(f"{date_str} {time_str}", "%Y%m%d %H:%M:%S")
            row_data_values = [float(value) for value in row_values_vector[2:]]

            #target_time_step is None when the row is outside every resampling window
            target_time_step = _find_time_step(date_str, time_str, date_ranges, time_ranges, time_steps)

            #The first data row is written as read and only anchors the first interval; its values are not accumulated
            if first_row:
                outfile.write(infile_row_values)
                first_row = False
                start_datetime = current_datetime
                continue

            #A row outside every window is written as read; collected values and the interval start are left untouched
            if target_time_step is None:
                outfile.write(infile_row_values)
                continue

            time_duration_sec = (current_datetime - start_datetime).total_seconds()
            for values, value in zip(column_values, row_data_values):
                values.append(value)

            #Once the interval is at least as long as the time step, write the aggregate and start a new interval
            if time_duration_sec >= target_time_step:
                outfile.write(_format_aggregated_row(date_str, time_str, column_values, column_methods))
                column_values = [[] for _ in column_methods]
                start_datetime = current_datetime
                time_duration_sec = 0

        #After the loop, time_duration_sec > 0 means values were collected that have not been written
        #NOTE(review): start_datetime equals the last written date and time, so for chronologically ordered input
        #this comparison appears never to be true and a trailing partial interval is not written - confirm intent
        if time_duration_sec > 0 and start_datetime >= current_datetime:
            outfile.write(_format_aggregated_row(date_str, time_str, column_values, column_methods))

#main function resamples each backed-up meteorological file found in the folder named on the command line
def main():
    #Exactly one command-line argument, the folder path, is expected; otherwise print usage and exit with status 1
    if len(sys.argv) != 2:
        print("Usage: python script.py <folder_path>")
        sys.exit(1)

    #folder_path is the single command-line argument
    folder_path = sys.argv[1]

    #Note: rename original files with trailing .bak.csv; the resampled output takes the name without .bak
    backup_suffix, resampled_suffix = '.bak.csv', '.csv'

    #For each backed-up file name, resample it when present, else alert user there is no file
    for backup_name in ('Weather.bak.csv', 'Radiation.bak.csv', 'Evaporation.bak.csv'):
        input_file = os.path.join(folder_path, backup_name)
        if not os.path.exists(input_file):
            print(f"File {backup_name} not found in {folder_path}")
            continue

        #output_file sits in the same folder, with .bak.csv replaced by .csv
        output_file = os.path.join(folder_path, backup_name.replace(backup_suffix, resampled_suffix))
        print(f"Processing {input_file}...")
        #resample_timeStep_function sent input_file, output_file, and the three module-level resampling settings
        resample_timeStep_function(input_file, output_file, DateRange_Resampled, TimeRange_Resampled, TimeStep_Resampled)
        print(f"Resampled variable_values_vector saved to {output_file}")


#Run main only when the file is executed as a script
if __name__ == "__main__":
    main()
