@@ -37,6 +37,17 @@ def combine_datasets(lma_data):
3737 pyxlma.lmalib.io.cf_netcdf.new_dataset or
3838 pyxlma.lmalib.io.read.to_dataset
3939 """
    def reorder_stations(dataset, index_mapping):
        """Permute every station-dimensioned variable of ``dataset``.

        For each data variable carrying the ``number_of_stations`` dimension,
        position ``i`` of the result holds the data taken from position
        ``index_mapping[i]`` of the input.

        Parameters
        ----------
        dataset : xarray.Dataset
            Dataset whose station-dimensioned variables are to be reordered.
            NOTE(review): variables other than ``station_code`` are rewritten
            *in place* (``dataset[var].data = ...``), so the caller's object
            is mutated as a side effect — confirm callers expect this.
        index_mapping : sequence of int
            Positional indices used directly with ``isel``; they are not
            inverted here. NOTE(review): this only realigns correctly when
            the mapping is its own inverse (identity plus pairwise swaps),
            which the caller's swap construction appears to guarantee —
            verify against the calling loop.

        Returns
        -------
        xarray.Dataset
            The dataset with station-dimensioned variables permuted.
        """
        for var in dataset.data_vars:
            if 'number_of_stations' in dataset[var].dims:
                if var == 'station_code':
                    # station_code is replaced wholesale via assign() on a
                    # permuted copy of its backing array, rather than isel().
                    new_station_codes = dataset[var].data.copy()
                    new_station_codes = new_station_codes[index_mapping]
                    dataset = dataset.assign(station_code=('number_of_stations', new_station_codes))
                else:
                    # Select the permuted rows along the station dimension,
                    # then write the raw values back into the existing
                    # DataArray (in-place mutation of the input dataset).
                    reordered_data = dataset[var].isel(number_of_stations=index_mapping)
                    dataset[var].data = reordered_data.values
        return dataset
4051 # Get a list of all the global attributes from each dataset
4152 attrs = [d .attrs for d in lma_data ]
4253 # Create a dict of {attr_name: [list of values from each dataset]}
@@ -53,46 +64,153 @@ def combine_datasets(lma_data):
5364 final_attrs [k ] = tuple (set_of_values )[0 ]
5465 else :
5566 final_attrs [k ] = '; ' .join (attr_vals )
56- # print(final_attrs)
57-
58- # Get just the pure-station variables
59- lma_station_data = xr .concat (
60- [d .drop_dims (['number_of_events' ]) for d in lma_data ],
61- dim = 'number_of_files'
62- )
63- # print(lma_station_data)
64-
65- # Get just the pure-event variables
66- lma_event_data = xr .concat (
67- [d .drop_dims (['number_of_stations' ]).drop (
68- ['network_center_latitude' ,
69- 'network_center_longitude' ,
70- 'network_center_altitude' ,]
71- ) for d in lma_data ],
72- dim = 'number_of_events'
73- )
74- # print(lma_event_data)
75-
76- # Get any varaibles with joint dimensions,
77- # i.e., ('number_of_events', 'number_of_stations')
78- event_contributing_stations = xr .concat (
79- [d ['event_contributing_stations' ] for d in lma_data ],
80- dim = 'number_of_events'
81- )
82- # print(event_contributing_stations)
83-
84- # Find the mean of the station data, and then add back the event data
85- ds = xr .merge ([lma_station_data .mean (dim = 'number_of_files' ),
86- lma_event_data ],
87- compat = 'equals' )
88- # ... and then restore the contributing stations.
89- # Note the coordinate variables are being used as labels to ensure data remain aligned.
90- ds ['event_contributing_stations' ] = event_contributing_stations
91-
92- # Restore the global attributes
93- ds .attrs .update (final_attrs )
94-
95- return ds
67+ # Get the station data from the first dataset and assign each station a unique index
68+ lma_data [0 ]['station_code' ] = lma_data [0 ].number_of_stations
69+ lma_data [0 ]['number_of_stations' ] = np .arange (len (lma_data [0 ].number_of_stations ))
70+ all_data = lma_data [0 ]
71+
72+ # Set up a few arrays to keep track of all known stations
73+ station_networks = all_data .station_network .data .copy ()
74+ station_codes = all_data .station_code .data .copy ()
75+ station_lats = all_data .station_latitude .data .copy ()
76+ station_lons = all_data .station_longitude .data .copy ()
77+ station_alts = all_data .station_altitude .data .copy ()
78+
79+ # Check each subsequent dataset for new stations
80+ for new_file_num in range (1 , len (lma_data )):
81+ new_file = lma_data [new_file_num ]
82+ # Demote station code to a data variable and assign an index to each station in the new file
83+ new_file ['station_code' ] = new_file .number_of_stations
84+ new_file ['number_of_stations' ] = np .arange (len (new_file .number_of_stations ))
85+ stations_in_file = []
86+ # Check each station in the new file against all known stations
87+ for station_num in range (len (new_file .number_of_stations .data )):
88+ station = new_file .isel (number_of_stations = station_num )
89+ matching_networks = station_networks == station .station_network .data
90+ matching_station_code = station_codes == station .station_code .data
91+ matching_lat = station_lats == station .station_latitude .data
92+ matching_lon = station_lons == station .station_longitude .data
93+ matching_alt = station_alts == station .station_altitude .data
94+ # If the lat/lon/alt are the same, then the station is in the same location
95+ matching_loc = matching_lat * matching_lon * matching_alt
96+ matching_id = matching_networks * matching_station_code
97+ matching = matching_id * matching_loc
98+ # check for a single match in the previously-known stations
99+ if sum (matching ) == 1 :
100+ # station completely matches previous
101+ previous_index = np .argmax (matching )
102+ stations_in_file .append (previous_index )
103+ elif sum (matching_id ) == 1 :
104+ # station has moved geographically since previous
105+ previous_index = np .argmax (matching_id )
106+ # Update the global list of lat/lon/alts to reflect the move
107+ station_lats [previous_index ] = station .station_latitude .data
108+ station_lons [previous_index ] = station .station_longitude .data
109+ station_alts [previous_index ] = station .station_altitude .data
110+ stations_in_file .append (previous_index )
111+ elif sum (matching_loc ) == 1 :
112+ # station was renamed since previous
113+ previous_index = np .argmax (matching_loc )
114+ # Update the global list of station identifying information to reflect the rename
115+ station_networks [previous_index ] = station .station_network .data
116+ station_codes [previous_index ] = station .station_code .data
117+ stations_in_file .append (previous_index )
118+ else :
119+ # station is new
120+ previous_index = - 1
121+ stations_in_file .append (previous_index )
122+ # Find the indices of any newly-discovered stations
123+ indices_of_new_stations = np .where (np .array (stations_in_file ) == - 1 )[0 ]
124+ # Add the new stations to the known stations
125+ for idx_of_new_station in indices_of_new_stations :
126+ new_station = new_file .isel (number_of_stations = idx_of_new_station )
127+ # The new station is appended to the end of the known stations
128+ new_station_index = len (all_data .number_of_stations )
129+ # Expand all of the previous data to contain the new station. Fill with nan values for all previous times.
130+ all_data = all_data .reindex ({'number_of_stations' : np .arange (new_station_index + 1 )}, fill_value = np .nan )
131+ # Update the dimension coordinate to reflect the new station's addition
132+ all_data ['number_of_stations' ] = ('number_of_stations' , np .arange (new_station_index + 1 ))
133+ # Add the new station's information to the global list of known stations
134+ station_networks = np .append (station_networks , new_station .station_network .data )
135+ station_codes = np .append (station_codes , new_station .station_code .data )
136+ station_lats = np .append (station_lats , new_station .station_latitude .data )
137+ station_lons = np .append (station_lons , new_station .station_longitude .data )
138+ station_alts = np .append (station_alts , new_station .station_altitude .data )
139+ # Update the station index for the new station
140+ stations_in_file [idx_of_new_station ] = new_station_index
141+ # Check for any previously-known stations that are no longer in the new file
142+ for old_station_id in all_data .number_of_stations .data :
143+ if old_station_id not in stations_in_file :
144+ # a station has been removed, create a row of nan values for this file to indicate that the station is no longer present
145+ # first create a temporary index for the dead station
146+ temp_station_id = len (new_file .number_of_stations )
147+ # fill the new row with nan values
148+ new_file = new_file .reindex ({'number_of_stations' : np .arange (temp_station_id + 1 )}, fill_value = np .nan )
149+ new_file ['number_of_stations' ] = ('number_of_stations' , np .arange (temp_station_id + 1 ))
150+ # add the dead station to the list of stations from this file so that it is the same length as the all_data dataset
151+ stations_in_file .append (temp_station_id )
152+ # update the mapping of the dead station to the temporary index so that the data can be re-ordered
153+ stations_in_file [old_station_id ] = temp_station_id
154+ stations_in_file [temp_station_id ] = old_station_id
155+ # Re-order the station data to match the order of the stations in the new file
156+ # This can happen if lma_analysis decides to change the order of the stations, or if new stations are added or removed
157+ new_file = reorder_stations (new_file , stations_in_file )
158+ # Concatenate the new file's station information with the previously-known station information
159+ lma_station_data = xr .concat (
160+ [d .drop_dims (['number_of_events' ]) for d in [all_data , new_file ]],
161+ dim = 'number_of_files'
162+ )
163+ # Rebuild the event_contributing_stations array
164+ event_contributing_stations = xr .concat (
165+ [d ['event_contributing_stations' ] for d in [all_data , new_file ]],
166+ dim = 'number_of_events'
167+ )
168+ # Attach the event_contributing_stations array to the station dataset
169+ lma_station_data ['event_contributing_stations' ] = event_contributing_stations
170+
171+ # Combining the events is a simple concatenation. If only the stations had been this easy...
172+ lma_event_data = xr .concat (
173+ [d .drop_dims (['number_of_stations' ]).drop_vars (
174+ ['network_center_latitude' ,
175+ 'network_center_longitude' ,
176+ 'network_center_altitude' ,
177+ 'file_start_time' ]
178+ ) for d in [all_data , new_file ]],
179+ dim = 'number_of_events'
180+ )
181+ # merge all the data from this file with the previously-known data
182+ combined_event_data = xr .merge ([lma_station_data , lma_event_data ])
183+ all_data = combined_event_data
184+ # Sort the data by file_start_time
185+ all_data = all_data .sortby ('file_start_time' )
186+ # Update the global attributes
187+ all_data .attrs .update (final_attrs )
188+ # To reduce complexity and resource usage, if the 'number_of_files' dimension is the same for all variables, then the dimension is unnecessary
189+ all_the_same = True
190+ # These variables are expected to change between files, so they are not checked for equality
191+ expected_to_change = ['station_event_fraction' , 'station_power_ratio' , 'file_start_time' ]
192+ for var in all_data .data_vars :
193+ if var in expected_to_change :
194+ continue
195+ if 'number_of_files' in all_data [var ].dims :
196+ if (all_data [var ] == all_data [var ].isel (number_of_files = 0 )).all ():
197+ pass
198+ else :
199+ all_the_same = False
200+ # If all of the variables are the same for all files, remove the 'number_of_files' dimension.
201+ if all_the_same :
202+ # Some variables need to be averaged
203+ mean_data = all_data .mean (dim = 'number_of_files' )
204+ # ...but most can just be copied over (this is necessary because some are strings which can't be averaged)
205+ all_data = all_data .isel (number_of_files = 0 )
206+ # the file_start_time variable is not needed in the reduced dataset
207+ all_data = all_data .drop_vars (['file_start_time' ])
208+ # replace the copied variables with the averaged variables where necessary
209+ for var in expected_to_change :
210+ if var == 'file_start_time' :
211+ continue
212+ all_data [var ] = mean_data [var ]
213+ return all_data
96214
97215def dataset (filenames , sort_time = True ):
98216 """ Create an xarray dataset of the type returned by
@@ -127,15 +245,13 @@ def dataset(filenames, sort_time=True):
127245 # the dimension name, too.
128246 resetted = ('number_of_events_' , 'number_of_stations_' )
129247 ds = ds .reset_coords (resetted )
130- ds = ds .rename ({resetted [0 ]:'event_id' ,
131- resetted [1 ]:'station_code' })
248+ ds = ds .rename ({resetted [0 ]:'event_id' })
132249 else :
133250 # The approach for newer xarray versions requires we explicitly rename only the variables.
134251 # The generic "rename" in the block above renames vars, coords, and dims.
135252 resetted = ('number_of_events' , 'number_of_stations' )
136253 ds = ds .reset_coords (resetted )
137- ds = ds .rename_vars ({resetted [0 ]:'event_id' ,
138- resetted [1 ]:'station_code' })
254+ ds = ds .rename_vars ({resetted [0 ]:'event_id' })
139255 if sort_time :
140256 ds = ds .sortby ('event_time' )
141257 return ds , starttime
@@ -203,7 +319,7 @@ def to_dataset(lma_file, event_id_start=0):
203319 ds .attrs ['event_algorithm_name' ] = lma_file .analysis_program
204320 ds .attrs ['event_algorithm_version' ] = lma_file .analysis_program_version
205321 ds .attrs ['original_filename' ] = path .basename (lma_file .file )
206-
322+ ds [ 'file_start_time' ] = starttime
207323 # -- Populate the station mask information --
208324 # int, because NetCDF doesn't have booleans
209325 station_mask_bools = np .zeros ((N_events , N_stations ), dtype = 'int8' )
0 commit comments