% co2usa_dataset_summary_info.m
clearvars % Reset the workspace ("clearvars" is preferred over "clear all", which also clears loaded functions)
close all
set(0,'DefaultFigureWindowStyle','docked') % Dock figures for easier browsing
cities = { % Cities included in the CO2-USA synthesis data set
'boston'
'indianapolis'
'los_angeles'
'portland'
'salt_lake_city'
'san_francisco_beacon'
'san_francisco_baaqmd'
'toronto'
'washington_dc_baltimore'
};
species_to_load = {'co2' % Greenhouse gas species to load from the synthesis files
'ch4'
'co'
};
currentFolder = pwd;
% The +14 offset extends the index to the end of the 15-character 'gcloud.utah.edu' match.
readFolder = fullfile(currentFolder(1:regexp(currentFolder,'gcloud.utah.edu')+14),'data','co2-usa','synthesis_output_ornl_new','netCDF_formatted_files');
save_overview_image = 'n'; % 'n' = do not save the overview figure
co2_usa = co2usa_load_netCDF(cities,species_to_load,readFolder,save_overview_image);
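% Quick sanity check (an addition, not in the original workflow): confirm which
% cities and how many site-level fields were actually loaded before computing
% the summary statistics below.
loaded_cities = fieldnames(co2_usa);
for ci = 1:length(loaded_cities)
    fprintf('%s: %d fields loaded.\n',loaded_cities{ci},numel(fieldnames(co2_usa.(loaded_cities{ci}))))
end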
species = 'co2'; % Species used for the author list and spatial/temporal summaries below
%% Author List
authors = cell(0,5); % Columns: last name, full name, affiliation, email, ORCID
count = 1;
for ii = 1:size(cities,1)
city = cities{ii,1}; %if ~isfield(co2_usa,city); continue; end
site_codes = fieldnames(co2_usa.(city)); site_codes = site_codes(contains(site_codes,[species,'_']));
providers_total_listed = str2double(co2_usa.(city).(site_codes{1}).global_attributes.provider_total_listed);
for j = 1:providers_total_listed % Pull each provider's name, affiliation, email, and ORCID from the global attributes
authors{count,2} = co2_usa.(city).(site_codes{1}).global_attributes.(['provider_',num2str(j),'_name']);
name_parts = strsplit(authors{count,2});
authors{count,1} = name_parts{end};
authors{count,3} = co2_usa.(city).(site_codes{1}).global_attributes.(['provider_',num2str(j),'_affiliation']);
authors{count,4} = co2_usa.(city).(site_codes{1}).global_attributes.(['provider_',num2str(j),'_email']);
authors{count,5} = co2_usa.(city).(site_codes{1}).global_attributes.(['provider_',num2str(j),'_orcid']);
count = count+1;
end
end
% List of all CO2-USA authors/affiliation/email alphabetized by last name
authors = sortrows(authors,1); % Sort alphabetically
authors = [{'Mitchell','Logan E. Mitchell','University of Utah','[email protected]','https://orcid.org/0000-0002-8749-954X'};...
{'Lin','John C. Lin','University of Utah','[email protected]','https://orcid.org/0000-0003-2794-184X'};...
{'Hutyra','Lucy R. Hutyra','Boston University','[email protected]',''}; authors]; % Add 3 primary authors
[~,ia,~] = unique(authors(:,1)); % Index of the first occurrence of each last name
authors = authors(sort(ia),:); % Remove duplicates while preserving author order (3 primary authors first, alphabetical after that)
fprintf('Data set author list:\n')
for ii = 1:size(authors,1)
fprintf('%-25s %-65s %-35s %s\n',authors{ii,2},authors{ii,3},authors{ii,4},authors{ii,5})
end
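% Optional export sketch (an addition; the file name is illustrative): write
% the author table (name, affiliation, email, ORCID) to a CSV file for use in
% the data set documentation. writecell requires R2019a or later.
writecell(authors(:,2:5),'co2usa_author_list.csv')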
%% Overall dataset start and stop date
city_dataset_start_date = repmat(datetime(2050,1,1),length(cities),1); % Set dummy values that will be replaced.
city_dataset_stop_date = repmat(datetime(1970,1,1),length(cities),1); % Set dummy values that will be replaced.
for ii = 1:size(cities,1)
city = cities{ii,1};
t_city = tic;
fprintf('Working on %s:\n',city)
site_codes = fieldnames(co2_usa.(city)); site_codes = site_codes(contains(site_codes,[species,'_']));
for usc_i = 1:length(site_codes) % Loops through each site
site = site_codes{usc_i}; if strcmp(site_codes{usc_i},[species,'_background']); continue; end % Skip the background
city_dataset_start_date(ii,1) = min([city_dataset_start_date(ii,1),...
    datetime(co2_usa.(city).(site).global_attributes.('dataset_start_date'),'InputFormat','yyyy-MM-dd''T''HH:mm:ss''Z''')]);
city_dataset_stop_date(ii,1) = max([city_dataset_stop_date(ii,1),...
    datetime(co2_usa.(city).(site).global_attributes.('dataset_stop_date'),'InputFormat','yyyy-MM-dd''T''HH:mm:ss''Z''')]);
end
end
co2usa_dataset_start_date = min(city_dataset_start_date);
co2usa_dataset_stop_date = max(city_dataset_stop_date);
fprintf('Done.\n')
fprintf('Overall dataset start date: %s\n',datestr(co2usa_dataset_start_date,'yyyy-mm-dd'))
fprintf('Overall dataset stop date: %s\n',datestr(co2usa_dataset_stop_date,'yyyy-mm-dd'))
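% Record-length sketch (an addition, not in the original script): datetime
% subtraction yields durations, which "years" converts to fractional years.
city_record_years = years(city_dataset_stop_date - city_dataset_start_date);
for ii = 1:size(cities,1)
    fprintf('%s record length: %0.1f years.\n',cities{ii,1},city_record_years(ii))
end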
%% Bounding Box
% Overall bounding-box limits; max/min ignore NaNs, so NaN is a safe starting value.
co2usa_dataset_n_limit = nan;
co2usa_dataset_s_limit = nan;
co2usa_dataset_e_limit = nan;
co2usa_dataset_w_limit = nan;
for ii = 1:size(cities,1)
city = cities{ii,1};
site_codes = fieldnames(co2_usa.(city)); site_codes = site_codes(contains(site_codes,[species,'_']));
city_lats = nan(size(site_codes,1),1);
city_lons = nan(size(site_codes,1),1);
for usc_i = 1:length(site_codes) % Loops through each site
if strcmp(site_codes{usc_i},[species,'_background']); continue; end % Skip the background
site = site_codes{usc_i};
city_lats(usc_i,1) = str2double(co2_usa.(city).(site).global_attributes.('site_latitude'));
city_lons(usc_i,1) = str2double(co2_usa.(city).(site).global_attributes.('site_longitude'));
fprintf('%s-%s: %0.4f %0.4f\n',city,site,city_lats(usc_i,1),city_lons(usc_i,1))
end
fprintf('%s overall average: %0.4f %0.4f\n',city,mean(city_lats,'omitnan'),mean(city_lons,'omitnan'))
fprintf('Bounding box: N=%0.4f, S=%0.4f, W=%0.4f, E=%0.4f.\n',max(city_lats),min(city_lats),min(city_lons),max(city_lons))
co2usa_dataset_n_limit = max([co2usa_dataset_n_limit,max(city_lats)]);
co2usa_dataset_s_limit = min([co2usa_dataset_s_limit,min(city_lats)]);
co2usa_dataset_w_limit = min([co2usa_dataset_w_limit,min(city_lons)]);
co2usa_dataset_e_limit = max([co2usa_dataset_e_limit,max(city_lons)]);
end
fprintf('Done.\n')
fprintf('Overall CO2-USA Bounding box: N=%0.4f, S=%0.4f, W=%0.4f, E=%0.4f.\n',co2usa_dataset_n_limit,co2usa_dataset_s_limit,co2usa_dataset_w_limit,co2usa_dataset_e_limit)
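% Optional visualization sketch (an addition, not part of the original
% workflow): draw the overall bounding box on a plain longitude/latitude plot
% as a sanity check of the computed limits.
figure;
plot([co2usa_dataset_w_limit co2usa_dataset_e_limit co2usa_dataset_e_limit co2usa_dataset_w_limit co2usa_dataset_w_limit],...
    [co2usa_dataset_s_limit co2usa_dataset_s_limit co2usa_dataset_n_limit co2usa_dataset_n_limit co2usa_dataset_s_limit],'k-')
xlabel('Longitude'); ylabel('Latitude'); title('CO2-USA data set bounding box')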
%% File number & size
co2usa_nc_file_size = 0;
co2usa_text_file_size = 0;
co2usa_total_files = 0;
readFolder = fullfile(currentFolder(1:regexp(currentFolder,'gcloud.utah.edu')+14),'data','co2-usa','synthesis_output_ornl_new');
for ii = 1:size(cities,1)
city = cities{ii,1};
fn_nc = dir(fullfile(readFolder,'netCDF_formatted_files',[city,'*.nc']));
co2usa_nc_file_size = co2usa_nc_file_size+sum([fn_nc.bytes]);
fn_text = dir(fullfile(readFolder,'txt_formatted_files',[city,'*.txt']));
co2usa_text_file_size = co2usa_text_file_size+sum([fn_text.bytes]);
co2usa_total_files = co2usa_total_files+length(fn_nc)+length(fn_text);
end
fprintf('Total number of files in the dataset is %0.0f.\n',co2usa_total_files)
fprintf('Total dataset file size is %0.0f MB.\n',(co2usa_nc_file_size+co2usa_text_file_size)/1000000)
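% Breakdown sketch (an addition): report the two file formats separately, which
% can be useful for the data archive description.
fprintf('netCDF files: %0.1f MB; text files: %0.1f MB.\n',...
    co2usa_nc_file_size/1e6,co2usa_text_file_size/1e6)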
%% Check the data set for duplicate times
for kk = 1:length(species_to_load)
species = species_to_load{kk};
for ii = 1:size(cities,1)
city = cities{ii,1};
site_codes = fieldnames(co2_usa.(city)); site_codes = site_codes(contains(site_codes,[species,'_']));
for usc_i = 1:length(site_codes) % Loops through each site
site = site_codes{usc_i};
[~,ia,~] = unique(co2_usa.(city).(site).time); % Indices of the first occurrence of each time stamp
f2 = setdiff(1:numel(co2_usa.(city).(site).time),ia); % Indices of any repeated time stamps
if ~isempty(f2)
fprintf('Duplicate times found in %s at %s on:\n',city,site)
for jj = 1:length(f2)
fprintf('%s\n',char(co2_usa.(city).(site).time(f2(jj)))) % char() conversion lets fprintf print the datetime
end
end
end
end
end
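% De-duplication sketch (an assumption about how duplicates might be handled;
% the loop above only reports them). Keeping the first record at each repeated
% time stamp for a given city/site would look like:
% [~,ia] = unique(co2_usa.(city).(site).time); % first occurrence of each time stamp
% co2_usa.(city).(site).time = co2_usa.(city).(site).time(ia); % any co-indexed data variables need the same ia subsetting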