Skip to content

Commit fe657e4

Browse files
committed
properly support stations appearing/disappearing/moving
1 parent f5bfa96 commit fe657e4

1 file changed

Lines changed: 161 additions & 45 deletions

File tree

pyxlma/lmalib/io/read.py

Lines changed: 161 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,17 @@ def combine_datasets(lma_data):
3737
pyxlma.lmalib.io.cf_netcdf.new_dataset or
3838
pyxlma.lmalib.io.read.to_dataset
3939
"""
40+
def reorder_stations(dataset, index_mapping):
    """Permute the station dimension of *dataset* according to *index_mapping*.

    Parameters
    ----------
    dataset : xarray.Dataset
        Dataset whose variables may carry a 'number_of_stations' dimension.
    index_mapping : sequence of int
        Positional indices to take along 'number_of_stations'; entry i gives
        the source position of the station that ends up at position i.

    Returns
    -------
    xarray.Dataset
        The dataset with every station-dimensioned variable reordered.
    """
    for name in dataset.data_vars:
        # Only variables carrying the station dimension need reordering.
        if 'number_of_stations' not in dataset[name].dims:
            continue
        if name == 'station_code':
            # station_code is rebuilt via assign (which returns a new
            # Dataset) rather than mutated in place.
            codes = dataset[name].data.copy()[index_mapping]
            dataset = dataset.assign(
                station_code=('number_of_stations', codes))
        else:
            # All other station variables are permuted along the station
            # axis and written back into the existing array.
            permuted = dataset[name].isel(number_of_stations=index_mapping)
            dataset[name].data = permuted.values
    return dataset
4051
# Get a list of all the global attributes from each dataset
4152
attrs = [d.attrs for d in lma_data]
4253
# Create a dict of {attr_name: [list of values from each dataset]}
@@ -53,46 +64,153 @@ def combine_datasets(lma_data):
5364
final_attrs[k] = tuple(set_of_values)[0]
5465
else:
5566
final_attrs[k] = '; '.join(attr_vals)
56-
# print(final_attrs)
57-
58-
# Get just the pure-station variables
59-
lma_station_data = xr.concat(
60-
[d.drop_dims(['number_of_events']) for d in lma_data],
61-
dim='number_of_files'
62-
)
63-
# print(lma_station_data)
64-
65-
# Get just the pure-event variables
66-
lma_event_data = xr.concat(
67-
[d.drop_dims(['number_of_stations']).drop(
68-
['network_center_latitude',
69-
'network_center_longitude',
70-
'network_center_altitude',]
71-
) for d in lma_data],
72-
dim='number_of_events'
73-
)
74-
# print(lma_event_data)
75-
76-
# Get any variables with joint dimensions,
77-
# i.e., ('number_of_events', 'number_of_stations')
78-
event_contributing_stations = xr.concat(
79-
[d['event_contributing_stations'] for d in lma_data],
80-
dim='number_of_events'
81-
)
82-
# print(event_contributing_stations)
83-
84-
# Find the mean of the station data, and then add back the event data
85-
ds = xr.merge([lma_station_data.mean(dim='number_of_files'),
86-
lma_event_data],
87-
compat='equals')
88-
# ... and then restore the contributing stations.
89-
# Note the coordinate variables are being used as labels to ensure data remain aligned.
90-
ds['event_contributing_stations'] = event_contributing_stations
91-
92-
# Restore the global attributes
93-
ds.attrs.update(final_attrs)
94-
95-
return ds
67+
# Get the station data from the first dataset and assign each station a unique index
68+
lma_data[0]['station_code'] = lma_data[0].number_of_stations
69+
lma_data[0]['number_of_stations'] = np.arange(len(lma_data[0].number_of_stations))
70+
all_data = lma_data[0]
71+
72+
# Set up a few arrays to keep track of all known stations
73+
station_networks = all_data.station_network.data.copy()
74+
station_codes = all_data.station_code.data.copy()
75+
station_lats = all_data.station_latitude.data.copy()
76+
station_lons = all_data.station_longitude.data.copy()
77+
station_alts = all_data.station_altitude.data.copy()
78+
79+
# Check each subsequent dataset for new stations
80+
for new_file_num in range(1, len(lma_data)):
81+
new_file = lma_data[new_file_num]
82+
# Demote station code to a data variable and assign an index to each station in the new file
83+
new_file['station_code'] = new_file.number_of_stations
84+
new_file['number_of_stations'] = np.arange(len(new_file.number_of_stations))
85+
stations_in_file = []
86+
# Check each station in the new file against all known stations
87+
for station_num in range(len(new_file.number_of_stations.data)):
88+
station = new_file.isel(number_of_stations=station_num)
89+
matching_networks = station_networks == station.station_network.data
90+
matching_station_code = station_codes == station.station_code.data
91+
matching_lat = station_lats == station.station_latitude.data
92+
matching_lon = station_lons == station.station_longitude.data
93+
matching_alt = station_alts == station.station_altitude.data
94+
# If the lat/lon/alt are the same, then the station is in the same location
95+
matching_loc = matching_lat * matching_lon * matching_alt
96+
matching_id = matching_networks * matching_station_code
97+
matching = matching_id * matching_loc
98+
# check for a single match in the previously-known stations
99+
if sum(matching) == 1:
100+
# station completely matches previous
101+
previous_index = np.argmax(matching)
102+
stations_in_file.append(previous_index)
103+
elif sum(matching_id) == 1:
104+
# station has moved geographically since previous
105+
previous_index = np.argmax(matching_id)
106+
# Update the global list of lat/lon/alts to reflect the move
107+
station_lats[previous_index] = station.station_latitude.data
108+
station_lons[previous_index] = station.station_longitude.data
109+
station_alts[previous_index] = station.station_altitude.data
110+
stations_in_file.append(previous_index)
111+
elif sum(matching_loc) == 1:
112+
# station was renamed since previous
113+
previous_index = np.argmax(matching_loc)
114+
# Update the global list of station identifying information to reflect the rename
115+
station_networks[previous_index] = station.station_network.data
116+
station_codes[previous_index] = station.station_code.data
117+
stations_in_file.append(previous_index)
118+
else:
119+
# station is new
120+
previous_index = -1
121+
stations_in_file.append(previous_index)
122+
# Find the indices of any newly-discovered stations
123+
indices_of_new_stations = np.where(np.array(stations_in_file) == -1)[0]
124+
# Add the new stations to the known stations
125+
for idx_of_new_station in indices_of_new_stations:
126+
new_station = new_file.isel(number_of_stations=idx_of_new_station)
127+
# The new station is appended to the end of the known stations
128+
new_station_index = len(all_data.number_of_stations)
129+
# Expand all of the previous data to contain the new station. Fill with nan values for all previous times.
130+
all_data = all_data.reindex({'number_of_stations': np.arange(new_station_index+1)}, fill_value=np.nan)
131+
# Update the dimension coordinate to reflect the new station's addition
132+
all_data['number_of_stations'] = ('number_of_stations', np.arange(new_station_index+1))
133+
# Add the new station's information to the global list of known stations
134+
station_networks = np.append(station_networks, new_station.station_network.data)
135+
station_codes = np.append(station_codes, new_station.station_code.data)
136+
station_lats = np.append(station_lats, new_station.station_latitude.data)
137+
station_lons = np.append(station_lons, new_station.station_longitude.data)
138+
station_alts = np.append(station_alts, new_station.station_altitude.data)
139+
# Update the station index for the new station
140+
stations_in_file[idx_of_new_station] = new_station_index
141+
# Check for any previously-known stations that are no longer in the new file
142+
for old_station_id in all_data.number_of_stations.data:
143+
if old_station_id not in stations_in_file:
144+
# a station has been removed, create a row of nan values for this file to indicate that the station is no longer present
145+
# first create a temporary index for the dead station
146+
temp_station_id = len(new_file.number_of_stations)
147+
# fill the new row with nan values
148+
new_file = new_file.reindex({'number_of_stations': np.arange(temp_station_id+1)}, fill_value=np.nan)
149+
new_file['number_of_stations'] = ('number_of_stations', np.arange(temp_station_id+1))
150+
# add the dead station to the list of stations from this file so that it is the same length as the all_data dataset
151+
stations_in_file.append(temp_station_id)
152+
# update the mapping of the dead station to the temporary index so that the data can be re-ordered
153+
stations_in_file[old_station_id] = temp_station_id
154+
stations_in_file[temp_station_id] = old_station_id
155+
# Re-order the station data to match the order of the stations in the new file
156+
# This can happen if lma_analysis decides to change the order of the stations, or if new stations are added or removed
157+
new_file = reorder_stations(new_file, stations_in_file)
158+
# Concatenate the new file's station information with the previously-known station information
159+
lma_station_data = xr.concat(
160+
[d.drop_dims(['number_of_events']) for d in [all_data, new_file]],
161+
dim='number_of_files'
162+
)
163+
# Rebuild the event_contributing_stations array
164+
event_contributing_stations = xr.concat(
165+
[d['event_contributing_stations'] for d in [all_data, new_file]],
166+
dim='number_of_events'
167+
)
168+
# Attach the event_contributing_stations array to the station dataset
169+
lma_station_data['event_contributing_stations'] = event_contributing_stations
170+
171+
# Combining the events is a simple concatenation. If only the stations had been this easy...
172+
lma_event_data = xr.concat(
173+
[d.drop_dims(['number_of_stations']).drop_vars(
174+
['network_center_latitude',
175+
'network_center_longitude',
176+
'network_center_altitude',
177+
'file_start_time']
178+
) for d in [all_data, new_file]],
179+
dim='number_of_events'
180+
)
181+
# merge all the data from this file with the previously-known data
182+
combined_event_data = xr.merge([lma_station_data, lma_event_data])
183+
all_data = combined_event_data
184+
# Sort the data by file_start_time
185+
all_data = all_data.sortby('file_start_time')
186+
# Update the global attributes
187+
all_data.attrs.update(final_attrs)
188+
# To reduce complexity and resource usage, if the 'number_of_files' dimension is the same for all variables, then the dimension is unnecessary
189+
all_the_same = True
190+
# These variables are expected to change between files, so they are not checked for equality
191+
expected_to_change = ['station_event_fraction', 'station_power_ratio', 'file_start_time']
192+
for var in all_data.data_vars:
193+
if var in expected_to_change:
194+
continue
195+
if 'number_of_files' in all_data[var].dims:
196+
if (all_data[var] == all_data[var].isel(number_of_files=0)).all():
197+
pass
198+
else:
199+
all_the_same = False
200+
# If all of the variables are the same for all files, remove the 'number_of_files' dimension.
201+
if all_the_same:
202+
# Some variables need to be averaged
203+
mean_data = all_data.mean(dim='number_of_files')
204+
# ...but most can just be copied over (this is necessary because some are strings which can't be averaged)
205+
all_data = all_data.isel(number_of_files=0)
206+
# the file_start_time variable is not needed in the reduced dataset
207+
all_data = all_data.drop_vars(['file_start_time'])
208+
# replace the copied variables with the averaged variables where necessary
209+
for var in expected_to_change:
210+
if var == 'file_start_time':
211+
continue
212+
all_data[var] = mean_data[var]
213+
return all_data
96214

97215
def dataset(filenames, sort_time=True):
98216
""" Create an xarray dataset of the type returned by
@@ -127,15 +245,13 @@ def dataset(filenames, sort_time=True):
127245
# the dimension name, too.
128246
resetted = ('number_of_events_', 'number_of_stations_')
129247
ds = ds.reset_coords(resetted)
130-
ds = ds.rename({resetted[0]:'event_id',
131-
resetted[1]:'station_code'})
248+
ds = ds.rename({resetted[0]:'event_id'})
132249
else:
133250
# The approach for newer xarray versions requires we explicitly rename only the variables.
134251
# The generic "rename" in the block above renames vars, coords, and dims.
135252
resetted = ('number_of_events', 'number_of_stations')
136253
ds = ds.reset_coords(resetted)
137-
ds = ds.rename_vars({resetted[0]:'event_id',
138-
resetted[1]:'station_code'})
254+
ds = ds.rename_vars({resetted[0]:'event_id'})
139255
if sort_time:
140256
ds = ds.sortby('event_time')
141257
return ds, starttime
@@ -203,7 +319,7 @@ def to_dataset(lma_file, event_id_start=0):
203319
ds.attrs['event_algorithm_name'] = lma_file.analysis_program
204320
ds.attrs['event_algorithm_version'] = lma_file.analysis_program_version
205321
ds.attrs['original_filename'] = path.basename(lma_file.file)
206-
322+
ds['file_start_time'] = starttime
207323
# -- Populate the station mask information --
208324
# int, because NetCDF doesn't have booleans
209325
station_mask_bools = np.zeros((N_events, N_stations), dtype='int8')

0 commit comments

Comments
 (0)