Subject | Hash | Author | Date (UTC) |
---|---|---|---|
formatting | f82ed6a3f1c0dccfc50aaa2a0fa3e866f5741acf | dleucas | 2021-12-18 23:18:45 |
update readme for GeoJSON command | e5c1a6f49ea17a8860cecffdcf280d1d2fdb3ac2 | dleucas | 2021-12-18 23:13:31 |
add species names to GeoJSON and World Map | 86af1d9597c3cb3c58f71c777902148d2ce333f8 | dleucas | 2021-12-18 22:51:11 |
re-worked GeoJSON from transformed JSON. identical output. added license | 8ec5e1678966c6b1c441d035a6833f55586325ec | dleucas | 2021-12-18 22:01:19 |
remove genus, add species common and scientific names, display names in record details | e2025219ade78f9ac2e6dfd81c479d242b9db24b | dleucas | 2021-12-18 17:18:28 |
add common and scientific names to schema | b4c7dcb4042b9d1cd3da24123f4506113a876f02 | dleucas | 2021-12-18 17:17:17 |
remove genus, add species and type to type_of namespace change | 2a7757a6c3e6dbb784b41d7a4b29381bc9105803 | dleucas | 2021-12-18 17:16:04 |
converted all transforms to functions | 9d37303b24abc225f11f9c3d3b622c5167ed43e3 | dleucas | 2021-12-18 16:00:04 |
more conversion to functions. match old output for now | fc4a8157a6902f4571b54c6ab84174f005adbe0d | dleucas | 2021-12-18 10:26:23 |
WIP convert filters to functions | 32badc3512dd9094d51ba2cc2ef8112eba2698bf | dleucas | 2021-12-16 18:33:21 |
convert html only once. extract species names as json. formatting and lint. | e755dc7f4fe2d7c9b97826a0f3f2cf5385e90ef9 | dleucas | 2021-12-16 13:35:31 |
download once. use wget only. get species names. test for commands. formatting | 572dbf1eaffe17c43a4a01dc9675737628c5a234 | dleucas | 2021-12-16 12:14:26 |
File | Lines added | Lines deleted |
---|---|---|
README.md | 2 | 2 |
File README.md changed (mode: 100644) (index 328ed11..843ceb9) | |||
... | ... | A configuration for ElasticSearch is provided in `srv/elasticsearch.yml` | |
38 | 38 | ||
39 | 39 | # Transform to GeoJSON for use with World Map | # Transform to GeoJSON for use with World Map |
40 | 40 | ||
41 | - ./GeoJSON.jq data/transformed.json > wmmsdb.geojson | ||
41 | - `./GeoJSON.jq data/transformed.json > wmmsdb.geojson` | ||
42 | 42 | ||
43 | 43 | # Search Interface Setup | # Search Interface Setup |
44 | 44 | ||
45 | TODO | ||
45 | TODO |
File | Lines added | Lines deleted |
---|---|---|
README.md | 21 | 10 |
File README.md changed (mode: 100644) (index e008281..328ed11) | |||
1 | 1 | # Watkins Marine Mammal Sound Database, Woods Hole Oceanographic Institution | # Watkins Marine Mammal Sound Database, Woods Hole Oceanographic Institution |
2 | 2 | ||
3 | ## Deluxe Remaster Edition 2018 | ||
3 | ## Remastered Deluxe Edition | ||
4 | 4 | ||
5 | 5 | - [Data Source](http://cis.whoi.edu/science/B/whalesounds/fullCuts.cfm) | - [Data Source](http://cis.whoi.edu/science/B/whalesounds/fullCuts.cfm) |
6 | - [Source Code](https://rocketgit.com/user/dleucas/wmmsdb) | ||
6 | - [Source Code](https://codeberg.org/dleucas/wmmsdb) | ||
7 | - [Source Code (Mirror)](https://rocketgit.com/user/dleucas/wmmsdb) | ||
7 | 8 | - [Search Interface](https://marine-mammal.soundwave.cl) | - [Search Interface](https://marine-mammal.soundwave.cl) |
8 | 9 | ||
9 | 10 | # Overall Goal | # Overall Goal |
... | ... | Make the Sound Database more accessible and useful for researchers | |
14 | 15 | - Transform metadata to a descriptive modern JSON schema | - Transform metadata to a descriptive modern JSON schema |
15 | 16 | - Index metadata into ElasticSearch for easy exploration and search | - Index metadata into ElasticSearch for easy exploration and search |
16 | 17 | - Import metadata into SQLite for advanced queries | - Import metadata into SQLite for advanced queries |
17 | - Count coverage of data before and after transformation | ||
18 | - Document metadata before and after transformation | ||
18 | 19 | ||
19 | 20 | # Install | # Install |
20 | 21 | ||
21 | TODO | ||
22 | Download or clone this repository and install the following tools: | ||
22 | 23 | ||
23 | Tools used: bash, curl, wget, jq, xidel, xpath, regex, ElasticSearch, sqlite | ||
24 | - bash | ||
25 | - curl | ||
26 | - wget | ||
27 | - [jq](https://stedolan.github.io/jq/) | ||
28 | - [Xidel](https://www.videlibri.de/xidel.html) | ||
29 | - elasticsearch-1.7.6 | ||
30 | |||
31 | A configuration for ElasticSearch is provided in `srv/elasticsearch.yml` | ||
24 | 32 | ||
25 | 33 | # Usage | # Usage |
26 | 34 | ||
27 | - ./download.sh | ||
28 | - ./transform.sh | ||
29 | - ./index.sh | ||
35 | - `./download.sh` | ||
36 | - `./transform.sh` | ||
37 | - `./index.sh` | ||
38 | |||
39 | # Transform to GeoJSON for use with World Map | ||
30 | 40 | ||
31 | # Transform to GeoJSON | ||
41 | - ./GeoJSON.jq data/transformed.json > wmmsdb.geojson | ||
32 | 42 | ||
33 | - ./GeoJSON.jq data/rn/*json > wmmsdb.geojson | ||
43 | # Search Interface Setup | ||
34 | 44 | ||
45 | TODO |
File | Lines added | Lines deleted |
---|---|---|
GeoJSON.jq | 3 | 2 |
webroot/map.arcgis.html | 8 | 4 |
File GeoJSON.jq changed (mode: 100755) (index a9d3671..76074ba) | |||
33 | 33 | "id": .record_number, | "id": .record_number, |
34 | 34 | "note": .note, | "note": .note, |
35 | 35 | "location_name": (.location.name | join(" ")), | "location_name": (.location.name | join(" ")), |
36 | "observation_date": (if .observation_date != null then .observation_date else "" end), | ||
37 | # "species": (.animal.species[].common_name ), | ||
36 | "observation_date": .observation_date, | ||
37 | "species_common_name": .animal.species[].common_name, | ||
38 | "species_scientific_name": .animal.species[].scientific_name, | ||
38 | 39 | } | } |
39 | 40 | } | } |
40 | 41 | ] | ] |
File webroot/map.arcgis.html changed (mode: 100644) (index f1206a4..b6c2ec1) | |||
20 | 20 | height: 100%; | height: 100%; |
21 | 21 | width: 100%; | width: 100%; |
22 | 22 | } | } |
23 | ul.metadata > li > span { font-weight: bold; } | ||
23 | 24 | </style> | </style> |
24 | 25 | ||
25 | 26 | <link | <link |
40 | 41 | "wmmsdb.geojson"; | "wmmsdb.geojson"; |
41 | 42 | ||
42 | 43 | const template = { | const template = { |
43 | title: "Record Number: {id}", | ||
44 | title: "Recording {id}: {species_scientific_name}", | ||
44 | 45 | content: ` | content: ` |
45 | <p>Observation Date: {observation_date}</p> | ||
46 | <p>{note}</p> | ||
47 | <p>Location: {location_name}</p> | ||
46 | <p class="note">{note}</p> | ||
47 | <ul class="metadata"> | ||
48 | <li><span>Species:</span> {species_scientific_name} ({species_common_name})</li> | ||
49 | <li><span>Location:</span> {location_name}</li> | ||
50 | <li><span>Observation Date:</span> {observation_date}</li> | ||
51 | </ul> | ||
48 | 52 | <audio controls preload='none'><source type='audio/wav' src='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/{id}.wav'> | <audio controls preload='none'><source type='audio/wav' src='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/{id}.wav'> |
49 | 53 | [ Audio Player ] | [ Audio Player ] |
50 | 54 | </audio> | </audio> |
File | Lines added | Lines deleted |
---|---|---|
GeoJSON.jq | 30 | 44 |
File GeoJSON.jq changed (mode: 100755) (index 77e9f5c..a9d3671) | |||
1 | 1 | #!/usr/bin/jq -fsc | #!/usr/bin/jq -fsc |
2 | |||
3 | def as_coord: | ||
4 | # Example W073 or W70, degree only, negate | ||
5 | if startswith("W") and length <= 4 then | ||
6 | -(.[1:] | tonumber) | ||
7 | # Example W12404 degree with minutes, negate | ||
8 | # Negate after addition | ||
9 | elif startswith("W") and length == 6 then | ||
10 | -((.[1:4] | tonumber) + (.[4:] | tonumber / 60)) | ||
11 | # Example S38 degree only, negate | ||
12 | elif startswith("S") and length == 3 then | ||
13 | -(.[1:] | tonumber) | ||
14 | # Degree with minutes, negate | ||
15 | elif startswith("S") and length == 5 then | ||
16 | -((.[1:3] | tonumber) + (.[3:] | tonumber / 60)) | ||
17 | # Degree only | ||
18 | elif startswith("N") and length == 3 then | ||
19 | (.[1:] | tonumber) | ||
20 | # Degree with minutes N4439 | ||
21 | elif startswith("N") and length == 5 then | ||
22 | ((.[1:3] | tonumber) + (.[3:] | tonumber / 60)) | ||
23 | # Degree only | ||
24 | elif startswith("E") and length <= 4 then | ||
25 | (.[1:] | tonumber) | ||
26 | # Degree with minutes | ||
27 | elif startswith("E") and length == 5 then | ||
28 | ((.[1:3] | tonumber) + (.[3:] | tonumber / 60)) | ||
29 | else | ||
30 | null | ||
31 | end; | ||
32 | |||
2 | # Build GeoJSON object from transformed JSON data | ||
3 | # Loaded by webroot/map.arcgis.html | ||
4 | # | ||
5 | # Usage: ./GeoJSON.jq data/transformed.json > data/wmmsdb.geojson | ||
6 | # | ||
7 | # | ||
8 | # SPDX-License-Identifier: GPL-3.0-or-later | ||
9 | # | ||
10 | # Copyright (C) 2018-2022 leuc | ||
11 | # | ||
12 | # This program is free software: you can redistribute it and/or modify it under the | ||
13 | # terms of the GNU Affero General Public License as published by the Free Software | ||
14 | # Foundation, either version 3 of the License, or (at your option) any later | ||
15 | # version. | ||
16 | # | ||
17 | # This program is distributed in the hope that it will be useful, but WITHOUT ANY | ||
18 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A | ||
19 | # PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. | ||
20 | # | ||
21 | # You should have received a copy of the GNU Affero General Public License along | ||
22 | # with this program. If not, see <https://www.gnu.org/licenses/>. | ||
33 | 23 | { | { |
34 | 24 | "type": "FeatureCollection", | "type": "FeatureCollection", |
35 | "features": [ .[] | if (.GC|length == 0) then empty else . end | | ||
36 | { | ||
25 | "features": [ | ||
26 | .[] | select(.location.coordinates | length > 0) | { | ||
37 | 27 | "type": "Feature", | "type": "Feature", |
38 | 28 | "geometry": { | "geometry": { |
39 | "type": "MultiPoint", | ||
40 | "coordinates": | ||
41 | .GC | split("|") | ||
42 | | map(capture("(?<lat>[NS]{1}\\d{1,4})[A-Z]{1,2}(\\d{1,2})?([A-Z]{1})?\\s+(?<lon>[EW]{1}\\d{1,5})")) | ||
43 | | map([(.lon | as_coord), (.lat | as_coord)]) | ||
29 | "type": "MultiPoint", | ||
30 | "coordinates": .location.coordinates | map([.lon, .lat]) | ||
44 | 31 | }, | }, |
45 | 32 | "properties": { | "properties": { |
46 | "id": .RN, | ||
47 | "note": .NT, | ||
48 | "location_name": .GB | split("|") | map(gsub("(\\s+)?[A-D][A-Z]\\d+[A-Z](\\s+)?|(X$)"; ""; "gm")) | join(" "), | ||
49 | "observation_date": [ | ||
50 | .OD | capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") | .date | strptime("%d-%B-%Y") | todateiso8601 | ||
51 | ] | .[0] | ||
33 | "id": .record_number, | ||
34 | "note": .note, | ||
35 | "location_name": (.location.name | join(" ")), | ||
36 | "observation_date": (if .observation_date != null then .observation_date else "" end), | ||
37 | # "species": (.animal.species[].common_name ), | ||
52 | 38 | } | } |
53 | 39 | } | } |
54 | 40 | ] | ] |
File | Lines added | Lines deleted |
---|---|---|
webroot/index.html | 63 | 49 |
File webroot/index.html changed (mode: 100644) (index 0942281..76cc113) | |||
11 | 11 | <script type="text/javascript" src="vendor/jquery/1.7.1/jquery-1.7.1.min.js"></script> | <script type="text/javascript" src="vendor/jquery/1.7.1/jquery-1.7.1.min.js"></script> |
12 | 12 | ||
13 | 13 | <link rel="stylesheet" href="vendor/bootstrap/css/bootstrap.min.css"> | <link rel="stylesheet" href="vendor/bootstrap/css/bootstrap.min.css"> |
14 | <script type="text/javascript" src="vendor/bootstrap/js/bootstrap.min.js"></script> | ||
14 | <script type="text/javascript" src="vendor/bootstrap/js/bootstrap.min.js"></script> | ||
15 | 15 | ||
16 | 16 | <link rel="stylesheet" href="vendor/jquery-ui-1.8.18.custom/jquery-ui-1.8.18.custom.css"> | <link rel="stylesheet" href="vendor/jquery-ui-1.8.18.custom/jquery-ui-1.8.18.custom.css"> |
17 | 17 | <script type="text/javascript" src="vendor/jquery-ui-1.8.18.custom/jquery-ui-1.8.18.custom.min.js"></script> | <script type="text/javascript" src="vendor/jquery-ui-1.8.18.custom/jquery-ui-1.8.18.custom.min.js"></script> |
... | ... | jQuery(document).ready(function($) { | |
42 | 42 | searchfield: "_all", | searchfield: "_all", |
43 | 43 | q : "", | q : "", |
44 | 44 | facets: [ | facets: [ |
45 | {'field': 'animal.genus.name', | ||
46 | 'display': 'Genus', | ||
45 | {'field': 'animal.species.common_name', | ||
46 | 'display': 'Species Common Name', | ||
47 | 47 | 'open' : false, 'size': 50}, | 'open' : false, 'size': 50}, |
48 | {'field': 'animal.genus.species_code', | ||
49 | 'display': 'Species Code', | ||
48 | {'field': 'animal.species.scientific_name', | ||
49 | 'display': 'Species Scientific Name', | ||
50 | 'open' : false, 'size': 50}, | ||
51 | {'field': 'animal.species.species_code', | ||
52 | 'display': 'Species Code', | ||
53 | 'open' : false}, | ||
54 | {'field': 'animal.vocal.common_name', | ||
55 | 'display': 'Vocal Species Common Name', | ||
56 | 'open' : false}, | ||
57 | {'field': 'animal.vocal.scientific_name', | ||
58 | 'display': 'Vocal Species Scientific Name', | ||
50 | 59 | 'open' : false}, | 'open' : false}, |
51 | 60 | {'field': 'animal.vocal.species_code', | {'field': 'animal.vocal.species_code', |
52 | 'display': 'Vocal Species Code', | ||
61 | 'display': 'Vocal Species Code', | ||
53 | 62 | 'open' : false}, | 'open' : false}, |
54 | 63 | {'field': 'animal.behavior.type_of', | {'field': 'animal.behavior.type_of', |
55 | 64 | 'display': 'Behavior Type', | 'display': 'Behavior Type', |
56 | 65 | 'open' : false}, | 'open' : false}, |
57 | 66 | {'field': 'animal.vocal.animal_id', | {'field': 'animal.vocal.animal_id', |
58 | 'display': 'Vocal ID', | ||
67 | 'display': 'Vocal ID', | ||
59 | 68 | 'open' : false}, | 'open' : false}, |
60 | {'field': 'animal.interaction.type', | ||
61 | 'display': 'Interaction Type', | ||
69 | {'field': 'animal.interaction.type_of', | ||
70 | 'display': 'Interaction Type', | ||
62 | 71 | 'open' : false}, | 'open' : false}, |
63 | {'field': 'animal.profile.animal_id', | ||
64 | 'display': 'ID', | ||
72 | {'field': 'animal.profile.animal_id', | ||
73 | 'display': 'ID', | ||
65 | 74 | 'open' : false}, | 'open' : false}, |
66 | {'field': 'animal.profile.age', | ||
67 | 'display': 'Age', | ||
75 | {'field': 'animal.profile.age', | ||
76 | 'display': 'Age', | ||
68 | 77 | 'open' : false}, | 'open' : false}, |
69 | {'field': 'animal.profile.sex', | ||
70 | 'display': 'Sex', | ||
78 | {'field': 'animal.profile.sex', | ||
79 | 'display': 'Sex', | ||
71 | 80 | 'open' : false}, | 'open' : false}, |
72 | {'field': 'animal.profile.birth_year', | ||
73 | 'display': 'Birth Year', | ||
81 | {'field': 'animal.profile.birth_year', | ||
82 | 'display': 'Birth Year', | ||
74 | 83 | 'open' : false}, | 'open' : false}, |
75 | 84 | ||
76 | 85 | {'field': 'observation_date', | {'field': 'observation_date', |
... | ... | jQuery(document).ready(function($) { | |
83 | 92 | 'value_function': function(v) { return new Date(v).getFullYear() } | 'value_function': function(v) { return new Date(v).getFullYear() } |
84 | 93 | }, | }, |
85 | 94 | ||
86 | {'field': 'location.name', | ||
87 | 'display': 'Geo Location', | ||
95 | {'field': 'location.name', | ||
96 | 'display': 'Geo Location', | ||
88 | 97 | 'open' : false}, | 'open' : false}, |
89 | 98 | /* | /* |
90 | 99 | {'field' : 'location.coordinates', | {'field' : 'location.coordinates', |
... | ... | jQuery(document).ready(function($) { | |
105 | 114 | ] | ] |
106 | 115 | }, | }, |
107 | 116 | */ | */ |
108 | {'field': 'signal.class', | ||
109 | 'display': 'Class', | ||
117 | {'field': 'signal.class', | ||
118 | 'display': 'Class', | ||
110 | 119 | 'open' : false}, | 'open' : false}, |
111 | {'field': 'signal.overlap', | ||
112 | 'display': 'Overlap', | ||
120 | {'field': 'signal.overlap', | ||
121 | 'display': 'Overlap', | ||
113 | 122 | 'open' : false}, | 'open' : false}, |
114 | {'field': 'signal.quality', | ||
115 | 'display': 'Quality', | ||
123 | {'field': 'signal.quality', | ||
124 | 'display': 'Quality', | ||
116 | 125 | 'open' : false}, | 'open' : false}, |
117 | {'field': 'signal.source.name', | ||
118 | 'display': 'Source', | ||
126 | {'field': 'signal.source.name', | ||
127 | 'display': 'Source', | ||
119 | 128 | 'open' : false}, | 'open' : false}, |
120 | {'field': 'signal.source.order', | ||
121 | 'display': 'Source Type', | ||
129 | {'field': 'signal.source.order', | ||
130 | 'display': 'Source Type', | ||
122 | 131 | 'open' : false}, | 'open' : false}, |
123 | 132 | ||
124 | {'field': 'sound.sample_rate', | ||
125 | 'display': 'Sample Rate', | ||
133 | {'field': 'sound.sample_rate', | ||
134 | 'display': 'Sample Rate', | ||
126 | 135 | 'open' : false}, | 'open' : false}, |
127 | {'field': 'sound.channel.recorded', | ||
128 | 'display': 'Channels Recorded', | ||
136 | {'field': 'sound.channel.recorded', | ||
137 | 'display': 'Channels Recorded', | ||
129 | 138 | 'open' : false}, | 'open' : false}, |
130 | 139 | /* | /* |
131 | {'field': 'sound.freq.P1', | ||
132 | 'display': 'Freq Initial Percentile', | ||
140 | {'field': 'sound.freq.P1', | ||
141 | 'display': 'Freq Initial Percentile', | ||
133 | 142 | 'type': 'range', | 'type': 'range', |
134 | 143 | 'size': false, | 'size': false, |
135 | 144 | 'hide_empty_range': true, | 'hide_empty_range': true, |
... | ... | jQuery(document).ready(function($) { | |
151 | 160 | ], | ], |
152 | 161 | }, | }, |
153 | 162 | */ | */ |
154 | {'field': 'sound.freq.IPR', | ||
155 | 'display': 'Interpercentile Range (Frequency)', | ||
163 | {'field': 'sound.freq.IPR', | ||
164 | 'display': 'Interpercentile Range (Frequency)', | ||
156 | 165 | 'type': 'range', | 'type': 'range', |
157 | 166 | 'size': false, | 'size': false, |
158 | 167 | 'hide_empty_range': true, | 'hide_empty_range': true, |
... | ... | jQuery(document).ready(function($) { | |
170 | 179 | {"from" : 90.0, "display" : ">=90.0"} | {"from" : 90.0, "display" : ">=90.0"} |
171 | 180 | ], | ], |
172 | 181 | }, | }, |
173 | {'field': 'sound.time.IPR', | ||
174 | 'display': 'Interpercentile Range (Time)', | ||
182 | {'field': 'sound.time.IPR', | ||
183 | 'display': 'Interpercentile Range (Time)', | ||
175 | 184 | 'type': 'range', | 'type': 'range', |
176 | 185 | 'size': false, | 'size': false, |
177 | 186 | 'hide_empty_range': true, | 'hide_empty_range': true, |
... | ... | jQuery(document).ready(function($) { | |
217 | 226 | debug: false, | debug: false, |
218 | 227 | //fields: ["_id", "animal.profile.animal_id"], | //fields: ["_id", "animal.profile.animal_id"], |
219 | 228 | render_results_metadata: pageSlider, | render_results_metadata: pageSlider, |
220 | "result_display" : [ | ||
229 | "result_display" : [ | ||
221 | 230 | [ {"pre" : "<h4>Record Number: ", "field": "record_number", "post" : "</h4>"} ], | [ {"pre" : "<h4>Record Number: ", "field": "record_number", "post" : "</h4>"} ], |
222 | 231 | [ {"pre" : "<div><img class='' src='/spectro/", "field": "record_number", "post" : ".sox.png'/ >"} ], | [ {"pre" : "<div><img class='' src='/spectro/", "field": "record_number", "post" : ".sox.png'/ >"} ], |
223 | 232 | [ {"pre" : "<img class='' src='/spectro/", "field": "record_number", "post" : ".seewave.png'/ >"} ], | [ {"pre" : "<img class='' src='/spectro/", "field": "record_number", "post" : ".seewave.png'/ >"} ], |
224 | 233 | [ {"pre" : "<img class='' src='/spectro/", "field": "record_number", "post" : ".acoustat.png'/ ></div>"} ], | [ {"pre" : "<img class='' src='/spectro/", "field": "record_number", "post" : ".acoustat.png'/ ></div>"} ], |
225 | [ {"pre" : "<div><audio controls preload='none'><source type='audio/wav' src='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/", | ||
226 | "field": "record_number", | ||
234 | [ {"pre" : "<div><audio controls preload='none'><source type='audio/wav' src='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/", | ||
235 | "field": "record_number", | ||
227 | 236 | "post" : ".wav'>[ Audio Player ]</audio></div><table><tbody>"} ], | "post" : ".wav'>[ Audio Player ]</audio></div><table><tbody>"} ], |
228 | 237 | [ {"pre" : "<tr><th>Note</th><td>", "field": "note", "post" : "</td></tr>"} ], | [ {"pre" : "<tr><th>Note</th><td>", "field": "note", "post" : "</td></tr>"} ], |
238 | [ {"pre" : "<tr><th>Species Common Name</th><td>", "field": "animal.species.0.common_name", "post" : "</td></tr>"} ], | ||
239 | [ {"pre" : "<tr><th>Species Scientific Name</th><td>", "field": "animal.species.0.scientific_name", "post" : "</td></tr>"} ], | ||
229 | 240 | [ {"pre" : "<tr><th>Observation Date</th><td>", "field": "observation_date", "post" : "</td></tr>"} ], | [ {"pre" : "<tr><th>Observation Date</th><td>", "field": "observation_date", "post" : "</td></tr>"} ], |
230 | 241 | [ {"pre" : "<tr><th>Last modified Date</th><td>", "field": "last_modified_date", "post" : "</td></tr>"} ], | [ {"pre" : "<tr><th>Last modified Date</th><td>", "field": "last_modified_date", "post" : "</td></tr>"} ], |
231 | 242 | [ {"pre" : "<tr><th>Geographic location area name</th><td>", "field": "location.name", "post" : "</td></tr>"} ], | [ {"pre" : "<tr><th>Geographic location area name</th><td>", "field": "location.name", "post" : "</td></tr>"} ], |
... | ... | jQuery(document).ready(function($) { | |
243 | 254 | [ {"pre" : "<tr><th>Terminal Percentile (Frequency)</th><td>", "field": "sound.freq.P2", "post" : "</td></tr>"} ], | [ {"pre" : "<tr><th>Terminal Percentile (Frequency)</th><td>", "field": "sound.freq.P2", "post" : "</td></tr>"} ], |
244 | 255 | [ {"pre" : "<tr><th>Frequency Median</th><td>", "field": "sound.freq.M", "post" : "</td></tr> "} ], | [ {"pre" : "<tr><th>Frequency Median</th><td>", "field": "sound.freq.M", "post" : "</td></tr> "} ], |
245 | 256 | [ {"pre" : "<tr><th>Interpercentile Range (Frequency)</th><td>", "field": "sound.freq.IPR", "post" : "</td></tr>"} ], | [ {"pre" : "<tr><th>Interpercentile Range (Frequency)</th><td>", "field": "sound.freq.IPR", "post" : "</td></tr>"} ], |
246 | [ {"pre" : "</tbody></table><p><a href='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/", | ||
247 | "field": "record_number", | ||
257 | [ {"pre" : "</tbody></table><p><a href='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/", | ||
258 | "field": "record_number", | ||
248 | 259 | "post" : ".wav'>Wave Audio File (.wav) from whoi.edu</a></p>"} ], | "post" : ".wav'>Wave Audio File (.wav) from whoi.edu</a></p>"} ], |
249 | 260 | ], | ], |
250 | 261 | selected_filters_in_facet: false, | selected_filters_in_facet: false, |
... | ... | jQuery(document).ready(function($) { | |
253 | 264 | }); | }); |
254 | 265 | }); | }); |
255 | 266 | ||
256 | $('#facetview_filter_animal_genus_name').each(function() { | ||
267 | $('#facetview_filter_animal_species_common_name').each(function() { | ||
257 | 268 | $(this).before($('<h4>').text("Animal")); | $(this).before($('<h4>').text("Animal")); |
258 | 269 | }); | }); |
259 | 270 | $('#facetview_filter_observation_date').each(function() { | $('#facetview_filter_observation_date').each(function() { |
... | ... | header { | |
318 | 329 | margin-bottom: .5em; | margin-bottom: .5em; |
319 | 330 | } | } |
320 | 331 | ||
321 | #facetview_filter_animal_genus_name, | ||
322 | #facetview_filter_animal_genus_species_code, | ||
332 | #facetview_filter_animal_species_common_name, | ||
333 | #facetview_filter_animal_species_scientific_name, | ||
334 | #facetview_filter_animal_species_species_code, | ||
323 | 335 | #facetview_filter_animal_vocal_animal_id, | #facetview_filter_animal_vocal_animal_id, |
324 | #facetview_filter_animal_interaction_type, | ||
336 | #facetview_filter_animal_vocal_common_name, | ||
337 | #facetview_filter_animal_vocal_scientific_name, | ||
338 | #facetview_filter_animal_interaction_type_of, | ||
325 | 339 | #facetview_filter_animal_behavior_type_of, | #facetview_filter_animal_behavior_type_of, |
326 | 340 | #facetview_filter_animal_vocal_species_code, | #facetview_filter_animal_vocal_species_code, |
327 | 341 | #facetview_filter_animal_profile_animal_id, | #facetview_filter_animal_profile_animal_id, |
... | ... | header { | |
340 | 354 | #facetview_filter_signal_overlap, | #facetview_filter_signal_overlap, |
341 | 355 | #facetview_filter_signal_quality, | #facetview_filter_signal_quality, |
342 | 356 | #facetview_filter_signal_source_name, | #facetview_filter_signal_source_name, |
343 | #facetview_filter_signal_source_order | ||
357 | #facetview_filter_signal_source_order | ||
344 | 358 | { | { |
345 | 359 | border-color: blue; | border-color: blue; |
346 | 360 | } | } |
File | Lines added | Lines deleted |
---|---|---|
index.mapping.json | 29 | 1 |
File index.mapping.json changed (mode: 100644) (index e3d62ee..9c20421) | |||
31 | 31 | } | } |
32 | 32 | } | } |
33 | 33 | }, | }, |
34 | "species": { | ||
35 | "properties": { | ||
36 | "_as_entered": { | ||
37 | "index": "not_analyzed", | ||
38 | "type": "string" | ||
39 | }, | ||
40 | "common_name": { | ||
41 | "index": "not_analyzed", | ||
42 | "type": "string" | ||
43 | }, | ||
44 | "scientific_name": { | ||
45 | "index": "not_analyzed", | ||
46 | "type": "string" | ||
47 | }, | ||
48 | "species_code": { | ||
49 | "index": "not_analyzed", | ||
50 | "type": "string" | ||
51 | } | ||
52 | } | ||
53 | }, | ||
34 | 54 | "interaction": { | "interaction": { |
35 | 55 | "properties": { | "properties": { |
36 | 56 | "animal_id": { | "animal_id": { |
37 | 57 | "index": "not_analyzed", | "index": "not_analyzed", |
38 | 58 | "type": "string" | "type": "string" |
39 | 59 | }, | }, |
40 | "type": { | ||
60 | "type_of": { | ||
41 | 61 | "index": "not_analyzed", | "index": "not_analyzed", |
42 | 62 | "type": "string" | "type": "string" |
43 | 63 | } | } |
70 | 90 | "species_code": { | "species_code": { |
71 | 91 | "index": "not_analyzed", | "index": "not_analyzed", |
72 | 92 | "type": "string" | "type": "string" |
93 | }, | ||
94 | "common_name": { | ||
95 | "index": "not_analyzed", | ||
96 | "type": "string" | ||
97 | }, | ||
98 | "scientific_name": { | ||
99 | "index": "not_analyzed", | ||
100 | "type": "string" | ||
73 | 101 | } | } |
74 | 102 | } | } |
75 | 103 | } | } |
File | Lines added | Lines deleted |
---|---|---|
transform.jq | 6 | 4 |
File transform.jq changed (mode: 100755) (index 3da8644..3079e97) | |||
... | ... | def as_animal_interaction: | |
110 | 110 | # FCFB153 FCFB150 | # FCFB153 FCFB150 |
111 | 111 | # FCFB5 FCFB55 | # FCFB5 FCFB55 |
112 | 112 | # FCFB73 FCFB34 | # FCFB73 FCFB34 |
113 | capture("(?<type>[FMC]{2})(?<animal_id>FB\\d+)"; "g")//null; | ||
113 | capture("(?<type_of>[FMC]{2})(?<animal_id>FB\\d+)"; "g")//null; | ||
114 | 114 | ||
115 | 115 | def as_animal_profile: | def as_animal_profile: |
116 | 116 | # age, sex and id, an animal profile | # age, sex and id, an animal profile |
... | ... | def as_animal_vocal: | |
198 | 198 | # also trim space from resulting string | # also trim space from resulting string |
199 | 199 | { | { |
200 | 200 | animal_id: $s[0:.offset] | trim, | animal_id: $s[0:.offset] | trim, |
201 | species_code: .string | trim | ||
201 | species_code: .string | trim, | ||
202 | scientific_name: $s | as_species_sci_name, | ||
203 | common_name: $s | as_species_common_name, | ||
202 | 204 | }) | | }) | |
203 | 205 | # if no object was created, use input as fallback | # if no object was created, use input as fallback |
204 | 206 | # this is for entries without a species code like "Keiko" | # this is for entries without a species code like "Keiko" |
... | ... | def as_sound_sample_rate: | |
430 | 432 | # species code not always present, use input as fallback | # species code not always present, use input as fallback |
431 | 433 | behavior: .BH | split("|") | map(as_animal_behavior), | behavior: .BH | split("|") | map(as_animal_behavior), |
432 | 434 | # Genus name and species code | # Genus name and species code |
433 | genus: .GS | split("|") | as_animal_genus, | ||
435 | # genus: .GS | split("|") | as_animal_genus, | ||
434 | 436 | # Species | # Species |
435 | # species: .GS | split("|") | as_animal_species, | ||
437 | species: .GS | split("|") | as_animal_species, | ||
436 | 438 | } | } |
437 | 439 | } | } |
File | Lines added | Lines deleted |
---|---|---|
transform.jq | 254 | 156 |
File transform.jq changed (mode: 100755) (index cd2aa89..3da8644) | |||
1 | 1 | #!/usr/bin/jq -fr | #!/usr/bin/jq -fr |
2 | 2 | ||
3 | 3 | # jq filter chain to transform flat source metadata into object structure. | # jq filter chain to transform flat source metadata into object structure. |
4 | # Source data combines multiple values into one field, so split that up | ||
5 | # also use native data types if possible. | ||
4 | # | ||
5 | # Each transformation is its own function with documented input examples. | ||
6 | # | ||
7 | # | ||
8 | # - Source data combines multiple values into one field with "|" | ||
9 | # - Use native JSON data types if possible | ||
10 | # - Clean-up whitespace and normalize value formats | ||
6 | 11 | ||
12 | # Mapping of species code to names | ||
13 | # Extracted from WHOI website by `download.sh` | ||
7 | 14 | import "./data/species.sci.names" as $species_sci_names; | import "./data/species.sci.names" as $species_sci_names; |
8 | 15 | import "./data/species.common.names" as $species_common_names; | import "./data/species.common.names" as $species_common_names; |
9 | 16 | ||
17 | # | ||
18 | # helper functions | ||
19 | # | ||
20 | |||
21 | def trim: | ||
22 | # remove leading and trailing whitespace | ||
23 | gsub("^\\s+|\\s+$";""); | ||
24 | |||
25 | # | ||
26 | # transform functions | ||
27 | # | ||
28 | |||
10 | 29 | # Convert Degree.Minute coordinates into decimal notation | # Convert Degree.Minute coordinates into decimal notation |
11 | 30 | def as_coord: | def as_coord: |
12 | 31 | # Example W073 or W70, degree only, negate | # Example W073 or W70, degree only, negate |
... | ... | def as_coord: | |
41 | 60 | def as_date: | def as_date: |
42 | 61 | (capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") | .date | strptime("%d-%B-%Y") | todateiso8601)//null; | (capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") | .date | strptime("%d-%B-%Y") | todateiso8601)//null; |
43 | 62 | ||
44 | def as_signal_overlap: | ||
45 | ({ | ||
46 | "OF": "Frequency", | ||
47 | "OT": "Time", | ||
48 | "OTF": "Time and Frequency", | ||
49 | "N": "No" | ||
50 | } as $overlap_type | capture("(?<o>O[TF]{1,2}|N)") | $overlap_type[.o]?)//null ; | ||
63 | def as_location_name: | ||
64 | # Location Name | ||
65 | # Remove species code and whitespace | ||
66 | # | ||
67 | # Example source data: | ||
68 | # 2.25 mi. west of Castle Rock, McMurdo Sound, Antarctica CC5A | ||
69 | # 20 mi. NW Gambell, St. Lawrence Island, Alaska CC2A X | ||
70 | # Castle Harbour, Bermuda AC2A | ||
71 | # | ||
72 | # TODO improve clean-up | ||
73 | # jq -r '.GB | split("|")[]' data/rn/*json| sort -u | grep -P '(\s+)?[A-D][A-Z]\d+[A-Z](\s+)?|([\sXO]*$)' | ||
74 | gsub("(\\s+)?[A-D][A-Z]\\d+[A-Z](\\s+)?|(X$)"; ""; "gm"); | ||
75 | |||
76 | def as_location_coordinates: | ||
77 | # Example source data | ||
78 | # N10BD15A W086BD15A | ||
79 | # N13BA2A W061BA2A | ||
80 | # N13BA2A W061BA2A | ||
81 | # N13BD15B W061BD15B | ||
82 | # N14X W061X | ||
83 | # N75BB2A W075BB2A approx | ||
84 | # S52BD1A W070BD1A | ||
85 | # S71CC14A E170CC14A | ||
86 | map(capture("(?<lat>[NS]{1}\\d{1,4})[A-Z]{1,2}(\\d{1,2})?([A-Z]{1})?\\s+(?<lon>[EW]{1}\\d{1,5})")) | | ||
87 | map({ lat: (.lat | as_coord), lon: (.lon | as_coord) }); | ||
51 | 88 | ||
52 | 89 | def as_species_code: | def as_species_code: |
53 | 90 | (capture("(?<code>[A-C][A-Z]\\d+[A-Z])") | .code)//null; | (capture("(?<code>[A-C][A-Z]\\d+[A-Z])") | .code)//null; |
... | ... | def as_species_common_name: | |
58 | 95 | def as_species_sci_name: | def as_species_sci_name: |
59 | 96 | as_species_code | $species_sci_names[0][.?]; | as_species_code | $species_sci_names[0][.?]; |
60 | 97 | ||
98 | # | ||
99 | # Animal | ||
100 | # | ||
101 | |||
61 | 102 | def as_animal_interaction: | def as_animal_interaction: |
62 | 103 | # interaction between animals | # interaction between animals |
63 | 104 | # always a pair, and multiple sets of pairs are possible | # always a pair, and multiple sets of pairs are possible |
... | ... | def as_animal_behavior: | |
119 | 160 | . as $b | match("[A-C][A-Z]\\d+[A-Z]([\\s\\.]+)?$"; "m")//false | | . as $b | match("[A-C][A-Z]\\d+[A-Z]([\\s\\.]+)?$"; "m")//false | |
120 | 161 | if . then | if . then |
121 | 162 | { | { |
122 | type_of: ($b[0:.offset] | gsub("^\\s+|\\s+$";"") | if (.|length) > 0 then . else null end), | ||
123 | species_code: .string | gsub("^\\s+|\\s+$";"") | ||
163 | type_of: ($b[0:.offset] | trim | if (.|length) > 0 then . else null end), | ||
164 | species_code: .string | trim | ||
124 | 165 | } | } |
125 | 166 | # fallback without species code | # fallback without species code |
126 | 167 | else | else |
127 | { type_of: $b | gsub("^\\s+|\\s+$";"") } | ||
168 | { type_of: $b | trim } | ||
128 | 169 | end; | end; |
129 | |||
170 | |||
171 | def as_animal_vocal: | ||
172 | # List of vocal animals, name and species code | ||
173 | # All existing entries: | ||
174 | # FB145 #?? BD19D | ||
175 | # FB147 #?? BD19D | ||
176 | # FB150 #?? BD19D | ||
177 | # FB153 #50 Blacktip Doubledip BD19D | ||
178 | # FB34 #30 Wee Willie BD19D | ||
179 | # FB55 #159 BD19D | ||
180 | # FB5 #5 BD19D | ||
181 | # FB73 #35 BD19D | ||
182 | # Keiko | ||
183 | # Keiko BE7A | ||
184 | # Minks BF2A | Jinks BF2A | ||
185 | # Moby Doll | ||
186 | # Moby Doll BE7A | ||
187 | # Olaf CB1A | ||
188 | # Snoopy BA2A | ||
189 | # The lark BE3B | ||
190 | # Wolfie CB1A | Farouk CB1A | ||
191 | # | ||
192 | # create array with objects for each animal | ||
193 | # save input as fallback and split by | | ||
194 | . as $input | $input | split("|") | | ||
195 | # try to match species code | ||
196 | map(. as $s | match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") | | ||
197 | # create object, anything before matched species code is id | ||
198 | # also trim space from resulting string | ||
199 | { | ||
200 | animal_id: $s[0:.offset] | trim, | ||
201 | species_code: .string | trim | ||
202 | }) | | ||
203 | # if no object was created, use input as fallback | ||
204 | # this is for entries without a species code like "Keiko" | ||
205 | if (. == [] and ($input|length)>0 ) then [{animal_id: $input}] else . end; | ||
206 | |||
207 | def as_animal_genus: | ||
208 | map(. as $s | match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") | | ||
209 | { | ||
210 | name: $s[0:.offset] | trim, | ||
211 | species_code: .string | trim | ||
212 | }); | ||
213 | |||
214 | def as_animal_species: | ||
215 | map(. as $s | | ||
216 | { | ||
217 | _as_noted: $s | trim, | ||
218 | species_code: $s | as_species_code, | ||
219 | scientific_name: $s | as_species_sci_name, | ||
220 | common_name: $s | as_species_common_name, | ||
221 | }); | ||
222 | |||
223 | # | ||
224 | # Signal | ||
225 | # | ||
226 | |||
227 | # Cue field contains 3 values describing the postion on tape | ||
228 | # Example input from the docu | ||
229 | # 542 B2:8 8.130 | ||
230 | # 1:03:12 B2:8 8.130 | ||
231 | # however, following formats are also found | ||
232 | # 0:00:00 B30:00 10:20.602 | ||
233 | # 995 B11:28.497 5:20.426 | ||
234 | # 96 B4.00 1.525 | ||
235 | # 93 B23.7 9.164 | ||
236 | # 93 B3:00 2:13.828 | ||
237 | # 01:52:52:04 | ||
238 | # 09:11:00 20:00 951.50 | ||
239 | # 0 B2:00:00 | ||
240 | |||
241 | def as_signal_position_cue: | ||
242 | # "cue" as in a first matched single integer, | ||
243 | # without dot or colon followed by space or end of string | ||
244 | # do not use \b because of the colon in 00:00 values | ||
245 | capture("(?<c>^\\d+(\\s|$))") | {"cue": (.c | tonumber)}; | ||
246 | |||
247 | def as_signal_position_time: | ||
248 | # "time" as in first matched integer with 2 or 3 colons | ||
249 | # followed by space or end of string | ||
250 | capture("(?<time>^\\d+:\\d+:\\d+(:\\d+)?(\\s|$))"); | ||
251 | |||
252 | def as_signal_position_analyzer_buffer_size: | ||
253 | # buffer size, B followed by integer with colon or dot, | ||
254 | # also remove B prefix | ||
255 | # TODO match 2 colon version | ||
256 | capture("(?<analyzer_buffer_size>(?<=B)\\d+[:\\.]\\d+(\\.\\d+)?)"); | ||
257 | |||
258 | # Signal class encodes multiple values, quality, overlap and class | ||
259 | # | ||
260 | # it's only been used 123 times | ||
261 | # | ||
262 | # Example source data: | ||
263 | # 3 OT | ||
264 | # 3 OTF | ||
265 | # C 4 OF | ||
266 | # D | ||
267 | # M | ||
268 | # N | ||
269 | # No | ||
270 | # NO | ||
271 | # OF | ||
272 | # OT | ||
273 | # OTF | ||
274 | # OTF 3 | ||
275 | # OTF 4 | ||
276 | # S | ||
277 | # S 5 | ||
278 | # U | ||
279 | # V | ||
280 | |||
130 | 281 | def as_signal_quality: | def as_signal_quality: |
282 | # any digit in the signal class indicates quality | ||
131 | 283 | (capture("(?<q>\\d+)") | .q | tonumber)//null; | (capture("(?<q>\\d+)") | .q | tonumber)//null; |
132 | 284 | ||
133 | 285 | def as_signal_class: | def as_signal_class: |
... | ... | def as_signal_class: | |
141 | 293 | "C": "Calf" | "C": "Calf" |
142 | 294 | } as $class_names | capture("(?<c>[SMVDUC]{1})") | $class_names[.c]?)//null; | } as $class_names | capture("(?<c>[SMVDUC]{1})") | $class_names[.c]?)//null; |
143 | 295 | ||
296 | def as_signal_overlap: | ||
297 | ({ | ||
298 | "OF": "Frequency", | ||
299 | "OT": "Time", | ||
300 | "OTF": "Time and Frequency", | ||
301 | "N": "No" | ||
302 | } as $overlap_type | capture("(?<o>O[TF]{1,2}|N)") | $overlap_type[.o]?)//null ; | ||
303 | |||
304 | def as_signal_cut_size: | ||
305 | # Signal cut size | ||
306 | # | ||
307 | # Example source data: | ||
308 | # 3.36 | ||
309 | # 9.411 | ||
310 | # 16.564 | ||
311 | # 20.35 | ||
312 | # etc | ||
313 | # only 210 records use a different format, ignored for now | ||
314 | # 2:00.000 | ||
315 | # 1:00.030 | ||
316 | # 10:25.540 | ||
317 | # 1:25.158 | ||
318 | # etc. | ||
319 | # set to null if empty or contains a colon | ||
320 | if (. | contains(":") or (length == 0)) then | ||
321 | null | ||
322 | else | ||
323 | # cast as number and handle a few remaining badly formated | ||
324 | # records like "0.2.95" | ||
325 | (try (. | tonumber) catch null) | ||
326 | end; | ||
327 | |||
328 | def as_signal_source: | ||
329 | # Other general sound producing sources listed in genus field | ||
330 | # | ||
331 | # Example source data: | ||
332 | # Transient ship noise X | ||
333 | # Ship electrical noise X | ||
334 | # Rain X | ||
335 | # Homo sapiens E | ||
336 | # Crustacea O | ||
337 | map(. as $s | match("\\s+[E-Z]{1}(\\s+)?$"; "m") | | ||
338 | { | ||
339 | "E": "Primates", | ||
340 | "O": "Crustacea", | ||
341 | "T": "Fossils", | ||
342 | "U": "Uncertain", | ||
343 | "V": "General pinniped", | ||
344 | "W": "General cetacean", | ||
345 | "X": "Ambient noise" | ||
346 | } as $order | | ||
347 | { | ||
348 | name: $s[0:.offset] | trim, | ||
349 | # not sort order | ||
350 | order: $order[.string | trim] | ||
351 | }); | ||
352 | |||
144 | 353 | def as_sound_channel: | def as_sound_channel: |
145 | 354 | # numbers of channels | # numbers of channels |
146 | 355 | # input data mostly follows the documentation: | # input data mostly follows the documentation: |
... | ... | def as_sound_channel: | |
152 | 361 | # not clear what other input values mean exactly | # not clear what other input values mean exactly |
153 | 362 | # 211 | # 211 |
154 | 363 | (capture("^(?<r>\\d)(?<m>\\d)(?<s>[A-L]$)") | | (capture("^(?<r>\\d)(?<m>\\d)(?<s>[A-L]$)") | |
155 | { | ||
156 | recorded: .r | tonumber, | ||
364 | { | ||
365 | recorded: .r | tonumber, | ||
157 | 366 | multiplexed: .m | tonumber, | multiplexed: .m | tonumber, |
158 | 367 | side: .s | side: .s |
159 | 368 | })//null; | })//null; |
160 | 369 | ||
370 | |||
371 | def as_sound_sample_rate: | ||
372 | # plain sample rate as number | ||
373 | # remove dot or colon, and ignore empty strings | ||
374 | # | ||
375 | # Example source data: | ||
376 | # 1000 | ||
377 | # 10,000 | ||
378 | # 10000 | ||
379 | # 100000 | ||
380 | # 10200 | ||
381 | if (. | length > 0) then . | sub("[\\.,]"; "") | tonumber else null end; | ||
382 | |||
383 | # | ||
384 | # Assemble the object tree | ||
385 | # | ||
386 | |||
161 | 387 | # root | # root |
162 | 388 | { | { |
163 | 389 | # record number is unique, can be used as _id | # record number is unique, can be used as _id |
164 | 390 | record_number: .RN, | record_number: .RN, |
165 | 391 | note: .NT, | note: .NT, |
166 | # a lot of noise in the original field, only parsing date | ||
392 | # a lot of noise in the "OD" original field, only parsing date | ||
167 | 393 | observation_date: .OD | as_date, | observation_date: .OD | as_date, |
168 | 394 | last_modified_date: .DA | as_date, | last_modified_date: .DA | as_date, |
169 | 395 | location: { | location: { |
170 | name: .GB | split("|") | map(gsub("(\\s+)?[A-D][A-Z]\\d+[A-Z](\\s+)?|(X$)"; ""; "gm")), | ||
171 | coordinates: .GC | split("|") | ||
172 | | map(capture("(?<lat>[NS]{1}\\d{1,4})[A-Z]{1,2}(\\d{1,2})?([A-Z]{1})?\\s+(?<lon>[EW]{1}\\d{1,5})")) | ||
173 | | map({ lat: (.lat | as_coord), lon: (.lon | as_coord) }) | ||
396 | name: .GB | split("|") | map(as_location_name), | ||
397 | coordinates: .GC | split("|") | as_location_coordinates | ||
174 | 398 | }, | }, |
175 | 399 | # object contains properties of the captured signal | # object contains properties of the captured signal |
176 | 400 | signal: { | signal: { |
177 | # create a list of JSON objects and add them together | ||
178 | |||
179 | # Cue field contains 3 values describing the postion on tape | ||
180 | # Example input from the docu | ||
181 | # 542 B2:8 8.130 | ||
182 | # 1:03:12 B2:8 8.130 | ||
183 | # however, following formats are also found | ||
184 | # 0:00:00 B30:00 10:20.602 | ||
185 | # 995 B11:28.497 5:20.426 | ||
186 | # 96 B4.00 1.525 | ||
187 | # 93 B23.7 9.164 | ||
188 | # 93 B3:00 2:13.828 | ||
189 | # 01:52:52:04 | ||
190 | # 09:11:00 20:00 951.50 | ||
191 | # 0 B2:00:00 | ||
192 | 401 | position: [ | position: [ |
193 | 402 | # keep the source string as reference? | # keep the source string as reference? |
194 | 403 | {_source_cu: .CU}, | {_source_cu: .CU}, |
195 | |||
196 | # "cue" as in a first matched single integer, | ||
197 | # without dot or colon followed by space or end of string | ||
198 | # do not use \b because of the colon in 00:00 values | ||
199 | (.CU | capture( "(?<c>^\\d+(\\s|$))" ) | {cue: .c|tonumber } ), | ||
200 | |||
201 | # "time" as in first matched integer with 2 or 3 colons | ||
202 | # followed by space or end of string | ||
203 | (.CU | capture( "(?<time>^\\d+:\\d+:\\d+(:\\d+)?(\\s|$))" ) ), | ||
204 | |||
205 | # buffer size, B followed by integer with colon or dot, | ||
206 | # also remove B prefix | ||
207 | # TODO match 2 colon version | ||
208 | (.CU | capture("(?<analyzer_buffer_size>(?<=B)\\d+[:\\.]\\d+(\\.\\d+)?)") ) | ||
404 | (.CU | as_signal_position_cue), | ||
405 | (.CU | as_signal_position_time), | ||
406 | (.CU | as_signal_position_analyzer_buffer_size ) | ||
209 | 407 | ] | add, | ] | add, |
210 | # cut size | ||
211 | # 3.36 | ||
212 | # 9.411 | ||
213 | # 16.564 | ||
214 | # 20.35 | ||
215 | # etc | ||
216 | # only 210 records use a different format, ignored for now | ||
217 | # 2:00.000 | ||
218 | # 1:00.030 | ||
219 | # 10:25.540 | ||
220 | # 1:25.158 | ||
221 | # etc. | ||
222 | cut_size: ( | ||
223 | # set to null if empty or contains a colon | ||
224 | if (.CS | contains(":") or (length == 0)) then | ||
225 | null | ||
226 | else | ||
227 | # cast as number and handle a few remaining badly formated | ||
228 | # records like "0.2.95" | ||
229 | (try (.CS | tonumber) catch null) | ||
230 | end | ||
231 | ), | ||
232 | # any digit in the signal class indicates quality | ||
233 | # it's only been used 123 times | ||
408 | cut_size: .CS | as_signal_cut_size, | ||
234 | 409 | _source_sc: .SC, | _source_sc: .SC, |
235 | 410 | quality: .SC | as_signal_quality, | quality: .SC | as_signal_quality, |
236 | 411 | class: .SC | as_signal_class, | class: .SC | as_signal_class, |
237 | 412 | overlap: .SC | as_signal_overlap, | overlap: .SC | as_signal_overlap, |
238 | # other general sound producing sources listed in genus field | ||
239 | source: ( .GS | split("|") | | ||
240 | map(. as $s | match("\\s+[E-Z]{1}(\\s+)?$"; "m") | | ||
241 | { | ||
242 | "E": "Primates", | ||
243 | "O": "Crustacea", | ||
244 | "T": "Fossils", | ||
245 | "U": "Uncertain", | ||
246 | "V": "General pinniped", | ||
247 | "W": "General cetacean", | ||
248 | "X": "Ambient noise" | ||
249 | } as $order | | ||
250 | { | ||
251 | name: $s[0:.offset] | gsub("^\\s+|\\s+$";""), | ||
252 | # not sort order | ||
253 | order: $order[.string | gsub("^\\s+|\\s+$";"")] | ||
254 | }) | ||
255 | ) | ||
413 | source: .GS | split("|") | as_signal_source, | ||
256 | 414 | }, | }, |
257 | 415 | sound: { | sound: { |
258 | # plain sample rate as number, however not normalized in digit length | ||
259 | # remove dot or colon, and ignore empty strings | ||
260 | # a bit difficult to tell what is hz and what khz | ||
261 | sample_rate: ( | ||
262 | if (.SR | length > 0) then | ||
263 | .SR | sub("[\\.,]"; "") | tonumber | ||
264 | else | ||
265 | null | ||
266 | end | ||
267 | ), | ||
416 | sample_rate: .SR | as_sound_sample_rate, | ||
268 | 417 | channel: [ | channel: [ |
269 | {"_source_nc": .NC }, | ||
418 | {"_source_nc": .NC }, | ||
270 | 419 | (.NC | as_sound_channel) | (.NC | as_sound_channel) |
271 | 420 | ] | add | ] | add |
272 | 421 | }, | }, |
273 | 422 | animal: { | animal: { |
274 | 423 | _source_id: .ID, | _source_id: .ID, |
275 | 424 | # List of vocal animals, name and species code | # List of vocal animals, name and species code |
276 | # All existing entries: | ||
277 | # FB145 #?? BD19D | ||
278 | # FB147 #?? BD19D | ||
279 | # FB150 #?? BD19D | ||
280 | # FB153 #50 Blacktip Doubledip BD19D | ||
281 | # FB34 #30 Wee Willie BD19D | ||
282 | # FB55 #159 BD19D | ||
283 | # FB5 #5 BD19D | ||
284 | # FB73 #35 BD19D | ||
285 | # Keiko | ||
286 | # Keiko BE7A | ||
287 | # Minks BF2A | Jinks BF2A | ||
288 | # Moby Doll | ||
289 | # Moby Doll BE7A | ||
290 | # Olaf CB1A | ||
291 | # Snoopy BA2A | ||
292 | # The lark BE3B | ||
293 | # Wolfie CB1A | Farouk CB1A | ||
294 | # | ||
295 | # create array with objects for each animal | ||
296 | # save input as fallback and split by | | ||
297 | vocal: ( .ID as $input | $input | split("|") | | ||
298 | # try to match species code | ||
299 | map(. as $s | match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") | | ||
300 | # create object, anything before matched species code is id | ||
301 | # also trim space from resulting string | ||
302 | { | ||
303 | animal_id: $s[0:.offset] | gsub("^\\s+|\\s+$";""), | ||
304 | species_code: .string | gsub("^\\s+|\\s+$";"") | ||
305 | }) | | ||
306 | # if no object was created, use input as fallback | ||
307 | # this is for entries without a species code like "Keiko" | ||
308 | if (. == [] and ($input|length)>0 ) then | ||
309 | [{animal_id: $input}] | ||
310 | else | ||
311 | . | ||
312 | end | ||
313 | ), | ||
425 | vocal: .ID | as_animal_vocal, | ||
314 | 426 | # age, sex and id, a animal profile | # age, sex and id, a animal profile |
315 | 427 | profile: .AG | as_animal_profile, | profile: .AG | as_animal_profile, |
316 | 428 | # interaction between animals | # interaction between animals |
317 | 429 | interaction: .IA | split("|") | map([as_animal_interaction]), | interaction: .IA | split("|") | map([as_animal_interaction]), |
318 | # behavior type and species code | ||
319 | 430 | # species code not always present, use input as fallback | # species code not always present, use input as fallback |
320 | 431 | behavior: .BH | split("|") | map(as_animal_behavior), | behavior: .BH | split("|") | map(as_animal_behavior), |
321 | 432 | # Genus name and species code | # Genus name and species code |
322 | genus: ( .GS | split("|") | | ||
323 | map(. as $s | match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") | | ||
324 | { | ||
325 | name: $s[0:.offset] | gsub("^\\s+|\\s+$";""), | ||
326 | species_code: .string | gsub("^\\s+|\\s+$";"") | ||
327 | }) | ||
328 | ) | ||
433 | genus: .GS | split("|") | as_animal_genus, | ||
329 | 434 | # Species | # Species |
330 | #species: .GS | split("|") | | ||
331 | # map(. as $s | | ||
332 | # { | ||
333 | # _as_noted: $s | gsub("^\\s+|\\s+$";""), | ||
334 | # species_code: $s | as_species_code, | ||
335 | # scientific_name: $s | as_species_sci_name, | ||
336 | # common_name: $s | as_species_common_name, | ||
337 | # }) | ||
435 | # species: .GS | split("|") | as_animal_species, | ||
338 | 436 | } | } |
339 | 437 | } | } |
File | Lines added | Lines deleted |
---|---|---|
transform.jq | 125 | 98 |
File transform.jq changed (mode: 100755) (index 16bc0b2..cd2aa89) | |||
... | ... | def as_coord: | |
39 | 39 | end; | end; |
40 | 40 | ||
41 | 41 | def as_date: | def as_date: |
42 | capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") | .date | strptime("%d-%B-%Y") | todateiso8601; | ||
42 | (capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") | .date | strptime("%d-%B-%Y") | todateiso8601)//null; | ||
43 | 43 | ||
44 | 44 | def as_signal_overlap: | def as_signal_overlap: |
45 | { | ||
45 | ({ | ||
46 | 46 | "OF": "Frequency", | "OF": "Frequency", |
47 | 47 | "OT": "Time", | "OT": "Time", |
48 | 48 | "OTF": "Time and Frequency", | "OTF": "Time and Frequency", |
49 | 49 | "N": "No" | "N": "No" |
50 | } as $overlap_type | capture("(?<o>O[TF]{1,2}|N)") | $overlap_type[.o]?; | ||
50 | } as $overlap_type | capture("(?<o>O[TF]{1,2}|N)") | $overlap_type[.o]?)//null ; | ||
51 | 51 | ||
52 | 52 | def as_species_code: | def as_species_code: |
53 | capture("(?<code>[A-C][A-Z]\\d+[A-Z])") | .code; | ||
53 | (capture("(?<code>[A-C][A-Z]\\d+[A-Z])") | .code)//null; | ||
54 | 54 | ||
55 | 55 | def as_species_common_name: | def as_species_common_name: |
56 | 56 | as_species_code | $species_common_names[0][.?]; | as_species_code | $species_common_names[0][.?]; |
... | ... | def as_species_common_name: | |
58 | 58 | def as_species_sci_name: | def as_species_sci_name: |
59 | 59 | as_species_code | $species_sci_names[0][.?]; | as_species_code | $species_sci_names[0][.?]; |
60 | 60 | ||
61 | def as_animal_interaction: | ||
62 | # interaction between animals | ||
63 | # always a pair, and multiple sets of pairs are possible | ||
64 | # [[{}, {}]] or [[{} {}], [{} {}], ...] | ||
65 | # | ||
66 | # Example source data: | ||
67 | # FCFB147 FCFB145 | ||
68 | # FCFB147 FCFB145 | FFFB147 FFFB149 | FFFB145 FFFB149 | FCFB153 FCFB150 | ||
69 | # FCFB153 FCFB150 | ||
70 | # FCFB5 FCFB55 | ||
71 | # FCFB73 FCFB34 | ||
72 | capture("(?<type>[FMC]{2})(?<animal_id>FB\\d+)"; "g")//null; | ||
73 | |||
74 | def as_animal_profile: | ||
75 | # age, sex and id, a animal profile | ||
76 | # ignoring species code | ||
77 | # | ||
78 | # Example source data: | ||
79 | # F03FB55 F1986FB55 | ||
80 | # F26FB5 F1963FB5 | ||
81 | # F??FB145 F????FB145 | ||
82 | # F??FB147 F????FB147 | ||
83 | # F??FB153 F????FB153 | ||
84 | # F??FB73 F????FB73 | ||
85 | # F??FB73 F????FB7370 | ||
86 | # M05FB150 M1984FB150 | ||
87 | # M17Keiko M1975Keiko BE7A | ||
88 | # M17Keiko M1975Keiko BE7A | ||
89 | # M??FB34 M????FB34 | ||
90 | # M??FB73 M????FB73 | ||
91 | capture("^(?<sex>[FM])" + | ||
92 | "(?<age>[\\?\\d]{2})" + | ||
93 | "(?<animal_id>(FB\\d+|\\w+))" + | ||
94 | "\\s+" + | ||
95 | "[FM](?<birth_year>[\\d\\?]{4})")//null | | ||
96 | {"F": "Female", "M": "Male"} as $sex | | ||
97 | { | ||
98 | sex: (if (.sex != null) then $sex[.sex] else null end), | ||
99 | age: (try (.age | tonumber) catch null), | ||
100 | animal_id: .animal_id, | ||
101 | birth_year: (try (.birth_year | tonumber) catch null) | ||
102 | }; | ||
103 | |||
104 | def as_animal_behavior: | ||
105 | # Behavior of the recorded animal with species code | ||
106 | # species code not always present, use input as fallback | ||
107 | # | ||
108 | # Example source data: | ||
109 | # Approaching ship BA2A | ||
110 | # BA2A A few larger whales seen mixed with others | ||
111 | # BE7A | ||
112 | # Bow riding BD17A | ||
113 | # Courtship CB1A | ||
114 | # Dive BA2A | ||
115 | # Feeding AA3A | ||
116 | |||
117 | # find the species code position and use the text before as behavior | ||
118 | # match() returns "empty" which we can not test with if | ||
119 | . as $b | match("[A-C][A-Z]\\d+[A-Z]([\\s\\.]+)?$"; "m")//false | | ||
120 | if . then | ||
121 | { | ||
122 | type_of: ($b[0:.offset] | gsub("^\\s+|\\s+$";"") | if (.|length) > 0 then . else null end), | ||
123 | species_code: .string | gsub("^\\s+|\\s+$";"") | ||
124 | } | ||
125 | # fallback without species code | ||
126 | else | ||
127 | { type_of: $b | gsub("^\\s+|\\s+$";"") } | ||
128 | end; | ||
129 | |||
130 | def as_signal_quality: | ||
131 | (capture("(?<q>\\d+)") | .q | tonumber)//null; | ||
132 | |||
133 | def as_signal_class: | ||
134 | # class name lookup table | ||
135 | ({ | ||
136 | "S": "Signature", | ||
137 | "M": "Mimic", | ||
138 | "V": "Variant", | ||
139 | "D": "Deletion", | ||
140 | "U": "Uncharacteristic", | ||
141 | "C": "Calf" | ||
142 | } as $class_names | capture("(?<c>[SMVDUC]{1})") | $class_names[.c]?)//null; | ||
143 | |||
144 | def as_sound_channel: | ||
145 | # numbers of channels | ||
146 | # input data mostly follows the documentation: | ||
147 | # 11A | ||
148 | # 11B | ||
149 | # 41D | ||
150 | # 21L | ||
151 | # regex will match only those | ||
152 | # not clear what other input values mean exactly | ||
153 | # 211 | ||
154 | (capture("^(?<r>\\d)(?<m>\\d)(?<s>[A-L]$)") | | ||
155 | { | ||
156 | recorded: .r | tonumber, | ||
157 | multiplexed: .m | tonumber, | ||
158 | side: .s | ||
159 | })//null; | ||
160 | |||
61 | 161 | # root | # root |
62 | 162 | { | { |
63 | 163 | # record number is unique, can be used as _id | # record number is unique, can be used as _id |
... | ... | def as_species_sci_name: | |
131 | 231 | ), | ), |
132 | 232 | # any digit in the signal class indicates quality | # any digit in the signal class indicates quality |
133 | 233 | # it's only been used 123 times | # it's only been used 123 times |
134 | # note enclosing [] instead of (), otherwise capture() will remove | ||
135 | # non-matching items | ||
136 | 234 | _source_sc: .SC, | _source_sc: .SC, |
137 | quality: [ .SC | capture("(?<q>\\d+)") | .q | tonumber ] | .[0], | ||
138 | class: [ | ||
139 | # class name lookup table | ||
140 | { | ||
141 | "S": "Signature", | ||
142 | "M": "Mimic", | ||
143 | "V": "Variant", | ||
144 | "D": "Deletion", | ||
145 | "U": "Uncharacteristic", | ||
146 | "C": "Calf" | ||
147 | } as $class_names | | ||
148 | [ .SC | capture("(?<c>[SMVDUC]{1})") ] | $class_names[.[0].c]? | ||
149 | ] | .[0], | ||
235 | quality: .SC | as_signal_quality, | ||
236 | class: .SC | as_signal_class, | ||
150 | 237 | overlap: .SC | as_signal_overlap, | overlap: .SC | as_signal_overlap, |
151 | 238 | # other general sound producing sources listed in genus field | # other general sound producing sources listed in genus field |
152 | 239 | source: ( .GS | split("|") | | source: ( .GS | split("|") | |
... | ... | def as_species_sci_name: | |
178 | 265 | null | null |
179 | 266 | end | end |
180 | 267 | ), | ), |
181 | # numbers of channels | ||
182 | # input data mostly follows the documentation: | ||
183 | # 11A | ||
184 | # 11B | ||
185 | # 41D | ||
186 | # 21L | ||
187 | # regex will match only those | ||
188 | # not clear what other input values mean exactly | ||
189 | # 211 | ||
190 | 268 | channel: [ | channel: [ |
191 | {_source_nc: .NC}, | ||
192 | ( | ||
193 | .NC | | ||
194 | capture("^(?<r>\\d)(?<m>\\d)(?<s>[A-L]$)") | | ||
195 | { | ||
196 | recorded: .r | tonumber, | ||
197 | multiplexed: .m | tonumber, | ||
198 | side: .s | ||
199 | } | ||
200 | ) | ||
269 | {"_source_nc": .NC }, | ||
270 | (.NC | as_sound_channel) | ||
201 | 271 | ] | add | ] | add |
202 | 272 | }, | }, |
203 | 273 | animal: { | animal: { |
... | ... | def as_species_sci_name: | |
242 | 312 | end | end |
243 | 313 | ), | ), |
244 | 314 | # age, sex and id, a animal profile | # age, sex and id, a animal profile |
245 | # source data, ignoring species code | ||
246 | # F03FB55 F1986FB55 | ||
247 | # F26FB5 F1963FB5 | ||
248 | # F??FB145 F????FB145 | ||
249 | # F??FB147 F????FB147 | ||
250 | # F??FB153 F????FB153 | ||
251 | # F??FB73 F????FB73 | ||
252 | # F??FB73 F????FB7370 | ||
253 | # M05FB150 M1984FB150 | ||
254 | # M17Keiko M1975Keiko BE7A | ||
255 | # M17Keiko M1975Keiko BE7A | ||
256 | # M??FB34 M????FB34 | ||
257 | # M??FB73 M????FB73 | ||
258 | profile: [.AG | | ||
259 | capture("^(?<sex>[FM])" + | ||
260 | "(?<age>[\\?\\d]{2})" + | ||
261 | "(?<animal_id>(FB\\d+|\\w+))" + | ||
262 | "\\s+" + | ||
263 | "[FM](?<birth_year>[\\d\\?]{4})") | ||
264 | ] | .[0] | | ||
265 | ( | ||
266 | {"F": "Female", "M": "Male"} as $sex | | ||
267 | { | ||
268 | sex: (if (.sex != null) then $sex[.sex] else null end), | ||
269 | age: (try (.age | tonumber) catch null), | ||
270 | animal_id: .animal_id, | ||
271 | birth_year: (try (.birth_year | tonumber) catch null) | ||
272 | } | ||
273 | ), | ||
315 | profile: .AG | as_animal_profile, | ||
274 | 316 | # interaction between animals | # interaction between animals |
275 | # always a pair, and multiple sets of pairs are possible | ||
276 | # [[{}, {}]] or [[{} {}], [{} {}], ...] | ||
277 | # source data | ||
278 | # FCFB147 FCFB145 | ||
279 | # FCFB147 FCFB145 | FFFB147 FFFB149 | FFFB145 FFFB149 | FCFB153 FCFB150 | ||
280 | # FCFB153 FCFB150 | ||
281 | # FCFB5 FCFB55 | ||
282 | # FCFB73 FCFB34 | ||
283 | interaction: ( .IA | split("|") | | ||
284 | map([capture("(?<type>[FMC]{2})(?<animal_id>FB\\d+)"; "g")]) | ||
285 | ), | ||
317 | interaction: .IA | split("|") | map([as_animal_interaction]), | ||
286 | 318 | # behavior type and species code | # behavior type and species code |
287 | 319 | # species code not always present, use input as fallback | # species code not always present, use input as fallback |
288 | behavior: ( .BH | split("|") | | ||
289 | # match() returns "empty" which we can not test with if | ||
290 | map(. as $b | match("[A-C][A-Z]\\d+[A-Z]([\\s\\.]+)?$"; "m")//false | | ||
291 | if . then | ||
320 | behavior: .BH | split("|") | map(as_animal_behavior), | ||
321 | # Genus name and species code | ||
322 | genus: ( .GS | split("|") | | ||
323 | map(. as $s | match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") | | ||
292 | 324 | { | { |
293 | type_of: ($b[0:.offset] | gsub("^\\s+|\\s+$";"") | if (.|length) > 0 then . else null end), | ||
325 | name: $s[0:.offset] | gsub("^\\s+|\\s+$";""), | ||
294 | 326 | species_code: .string | gsub("^\\s+|\\s+$";"") | species_code: .string | gsub("^\\s+|\\s+$";"") |
295 | } | ||
296 | # fallback without species code | ||
297 | else | ||
298 | { type_of: $b | gsub("^\\s+|\\s+$";"") } | ||
299 | end | ||
300 | ) | ||
301 | ), | ||
302 | # Genus | ||
303 | species: .GS | split("|") | | ||
304 | map(. as $s | | ||
305 | { | ||
306 | _as_noted: $s | gsub("^\\s+|\\s+$";""), | ||
307 | species_code: $s | as_species_code, | ||
308 | scientific_name: $s | as_species_sci_name, | ||
309 | common_name: $s | as_species_common_name, | ||
310 | 327 | }) | }) |
328 | ) | ||
329 | # Species | ||
330 | #species: .GS | split("|") | | ||
331 | # map(. as $s | | ||
332 | # { | ||
333 | # _as_noted: $s | gsub("^\\s+|\\s+$";""), | ||
334 | # species_code: $s | as_species_code, | ||
335 | # scientific_name: $s | as_species_sci_name, | ||
336 | # common_name: $s | as_species_common_name, | ||
337 | # }) | ||
311 | 338 | } | } |
312 | 339 | } | } |
File | Lines added | Lines deleted |
---|---|---|
transform.jq | 33 | 24 |
File transform.jq changed (mode: 100755) (index 0016ceb..16bc0b2) | |||
4 | 4 | # Source data combines multiple values into one field, so split that up | # Source data combines multiple values into one field, so split that up |
5 | 5 | # also use native data types if possible. | # also use native data types if possible. |
6 | 6 | ||
7 | import "./data/species.sci.names" as $species_sci_names; | ||
8 | import "./data/species.common.names" as $species_common_names; | ||
9 | |||
7 | 10 | # Convert Degree.Minute coordinates into decimal notation | # Convert Degree.Minute coordinates into decimal notation |
8 | 11 | def as_coord: | def as_coord: |
9 | 12 | # Example W073 or W70, degree only, negate | # Example W073 or W70, degree only, negate |
... | ... | def as_coord: | |
35 | 38 | null | null |
36 | 39 | end; | end; |
37 | 40 | ||
41 | def as_date: | ||
42 | capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") | .date | strptime("%d-%B-%Y") | todateiso8601; | ||
43 | |||
44 | def as_signal_overlap: | ||
45 | { | ||
46 | "OF": "Frequency", | ||
47 | "OT": "Time", | ||
48 | "OTF": "Time and Frequency", | ||
49 | "N": "No" | ||
50 | } as $overlap_type | capture("(?<o>O[TF]{1,2}|N)") | $overlap_type[.o]?; | ||
51 | |||
52 | def as_species_code: | ||
53 | capture("(?<code>[A-C][A-Z]\\d+[A-Z])") | .code; | ||
54 | |||
55 | def as_species_common_name: | ||
56 | as_species_code | $species_common_names[0][.?]; | ||
57 | |||
58 | def as_species_sci_name: | ||
59 | as_species_code | $species_sci_names[0][.?]; | ||
60 | |||
38 | 61 | # root | # root |
39 | 62 | { | { |
40 | 63 | # record number is unique, can be used as _id | # record number is unique, can be used as _id |
41 | 64 | record_number: .RN, | record_number: .RN, |
42 | 65 | note: .NT, | note: .NT, |
43 | 66 | # a lot of noise in the original field, only parsing date | # a lot of noise in the original field, only parsing date |
44 | observation_date: [ | ||
45 | .OD | capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") | .date | | ||
46 | strptime("%d-%B-%Y") | todateiso8601 | ||
47 | ] | .[0], | ||
48 | last_modified_date: [ | ||
49 | .DA | capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") | .date | | ||
50 | strptime("%d-%B-%Y") | todateiso8601 | ||
51 | ] | .[0], | ||
67 | observation_date: .OD | as_date, | ||
68 | last_modified_date: .DA | as_date, | ||
52 | 69 | location: { | location: { |
53 | 70 | name: .GB | split("|") | map(gsub("(\\s+)?[A-D][A-Z]\\d+[A-Z](\\s+)?|(X$)"; ""; "gm")), | name: .GB | split("|") | map(gsub("(\\s+)?[A-D][A-Z]\\d+[A-Z](\\s+)?|(X$)"; ""; "gm")), |
54 | 71 | coordinates: .GC | split("|") | coordinates: .GC | split("|") |
... | ... | def as_coord: | |
130 | 147 | } as $class_names | | } as $class_names | |
131 | 148 | [ .SC | capture("(?<c>[SMVDUC]{1})") ] | $class_names[.[0].c]? | [ .SC | capture("(?<c>[SMVDUC]{1})") ] | $class_names[.[0].c]? |
132 | 149 | ] | .[0], | ] | .[0], |
133 | overlap: [ | ||
134 | # overlap lookup table | ||
135 | { | ||
136 | "OF": "Frequency", | ||
137 | "OT": "Time", | ||
138 | "OTF": "Time and Frequency", | ||
139 | "N": "No" | ||
140 | } as $overlap_type | | ||
141 | [ .SC | capture("(?<o>O[TF]{1,2}|N)") ] | $overlap_type[.[0].o]? | ||
142 | ] | .[0], | ||
150 | overlap: .SC | as_signal_overlap, | ||
143 | 151 | # other general sound producing sources listed in genus field | # other general sound producing sources listed in genus field |
144 | 152 | source: ( .GS | split("|") | | source: ( .GS | split("|") | |
145 | 153 | map(. as $s | match("\\s+[E-Z]{1}(\\s+)?$"; "m") | | map(. as $s | match("\\s+[E-Z]{1}(\\s+)?$"; "m") | |
... | ... | def as_coord: | |
291 | 299 | end | end |
292 | 300 | ) | ) |
293 | 301 | ), | ), |
294 | # Genus name and species code | ||
295 | genus: ( .GS | split("|") | | ||
296 | map(. as $s | match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") | | ||
302 | # Genus | ||
303 | species: .GS | split("|") | | ||
304 | map(. as $s | | ||
297 | 305 | { | { |
298 | name: $s[0:.offset] | gsub("^\\s+|\\s+$";""), | ||
299 | species_code: .string | gsub("^\\s+|\\s+$";"") | ||
306 | _as_noted: $s | gsub("^\\s+|\\s+$";""), | ||
307 | species_code: $s | as_species_code, | ||
308 | scientific_name: $s | as_species_sci_name, | ||
309 | common_name: $s | as_species_common_name, | ||
300 | 310 | }) | }) |
301 | ), | ||
302 | 311 | } | } |
303 | 312 | } | } |
File | Lines added | Lines deleted |
---|---|---|
transform.sh | 29 | 8 |
File transform.sh changed (mode: 100755) (index 64f7fc5..1ea333d) | |||
1 | 1 | #!/bin/bash | #!/bin/bash |
2 | set -e # abort on any errors | ||
2 | set -eo pipefail | ||
3 | # set -x | ||
4 | |||
5 | test -e "$(command -v xidel)" || ( | ||
6 | echo "ERR: Need xidel from https://www.videlibri.de/xidel.html" | ||
7 | exit 1 | ||
8 | ) | ||
9 | test -e "$(command -v jq)" || ( | ||
10 | echo "ERR: Need jq from https://stedolan.github.io/jq/" | ||
11 | exit 1 | ||
12 | ) | ||
13 | |||
14 | # Mapping of species id to common and scientific name | ||
15 | |||
16 | tail -n+56 data/species.map | jq -cR 'split("\t") as $row | {($row[0]): ($row[1])}' | jq -cs add >data/species.sci.names.json | ||
17 | head -n 55 data/species.map | jq -cR 'split("\t") as $row | {($row[0]): ($row[1])}' | jq -cs add >data/species.common.names.json | ||
3 | 18 | ||
4 | 19 | # Transform HTML metadata from source site into JSON | # Transform HTML metadata from source site into JSON |
5 | 20 | ||
6 | # for xpath | ||
21 | # for xpath | ||
7 | 22 | XIDEL='xidel -s --input-format=html --output-format=json-wrapped' | XIDEL='xidel -s --input-format=html --output-format=json-wrapped' |
8 | 23 | ||
9 | 24 | # select all rows from the 2nd table element | # select all rows from the 2nd table element |
... | ... | XPATH_ENTRY='/html/body/table[2]/tbody/tr/td' | |
20 | 35 | # "SR:": "3400", | # "SR:": "3400", |
21 | 36 | # "CS:": "3.388", | # "CS:": "3.388", |
22 | 37 | # ... | # ... |
23 | #} | ||
38 | #} | ||
24 | 39 | # The jq filter explained | # The jq filter explained |
25 | 40 | # 1. assign the whole array to $row | # 1. assign the whole array to $row |
26 | 41 | # 2. create a range with a step of 2 over the length of the array, 0,2,4,... | # 2. create a range with a step of 2 over the length of the array, 0,2,4,... |
27 | 42 | # 3. create an object and use the range as index for the $row elements | # 3. create an object and use the range as index for the $row elements |
28 | 43 | # 3.5 remove right most colon from key | # 3.5 remove right most colon from key |
29 | 44 | # 4. combine the list of objects into a single object with "add" | # 4. combine the list of objects into a single object with "add" |
45 | |||
46 | # shellcheck disable=SC2016 | ||
30 | 47 | JQ_ARR2OBJ='[ .[] as $row | range(0; $row|length; 2) | {( $row[.] | rtrimstr(":")): ($row[.+1]) } ] | add' | JQ_ARR2OBJ='[ .[] as $row | range(0; $row|length; 2) | {( $row[.] | rtrimstr(":")): ($row[.+1]) } ] | add' |
31 | 48 | ||
49 | test -d data/rn || mkdir -p data/rn | ||
32 | 50 | ||
33 | while read RN | ||
34 | do | ||
35 | $XIDEL --xpath "$XPATH_ENTRY" "raw/rn/metaData.cfm?RN=$RN" | jq "$JQ_ARR2OBJ" > "data/rn/$RN.json" | ||
36 | done < data/retrieval.numbers | ||
51 | while read -r RN; do | ||
52 | # input should exist | ||
53 | test -f "raw/rn/metaData.cfm?RN=$RN" || continue | ||
54 | # output should not exist | ||
55 | test -f "data/rn/$RN.json" && continue | ||
56 | $XIDEL --xpath "$XPATH_ENTRY" "raw/rn/metaData.cfm?RN=$RN" | jq -c "$JQ_ARR2OBJ" >"data/rn/$RN.json" | ||
57 | done <data/retrieval.numbers | ||
37 | 58 | ||
38 | 59 | # transform all records with jq, this is where the magic happens | # transform all records with jq, this is where the magic happens |
39 | ./transform.jq data/rn/*json > data/transformed.json | ||
60 | ./transform.jq data/rn/*json >data/transformed.json |
File | Lines added | Lines deleted |
---|---|---|
download.sh | 77 | 31 |
File download.sh changed (mode: 100755) (index d154c61..321faae) | |||
1 | #!/bin/bash | ||
1 | #!/bin/bash | ||
2 | # | ||
3 | # Scrape metadata from Woods Hole Oceanographic Institution | ||
4 | # | ||
5 | # Download of all audio files is left as exercise to the reader | ||
6 | # | ||
7 | # - Grab species index from "All Cuts" page | ||
8 | # - Grab all audio cuts for every species for all listed years | ||
9 | # - Grab metadata pop-up for every audio cut | |
10 | # | ||
11 | # https://whoicf2.whoi.edu/science/B/whalesounds/metaData.cfm?RN=91008005 | ||
12 | # | ||
13 | # Using xmllint XPath 1.0 for parsing, because it continues on broken HTML. | ||
14 | # | ||
15 | # | ||
16 | # SPDX-License-Identifier: GPL-3.0-or-later | ||
17 | # | ||
18 | # Copyright (C) 2021 leuc | ||
19 | # | ||
20 | # This program is free software: you can redistribute it and/or modify it under the | ||
21 | # terms of the GNU Affero General Public License as published by the Free Software | ||
22 | # Foundation, either version 3 of the License, or (at your option) any later | ||
23 | # version. | ||
24 | # | ||
25 | # This program is distributed in the hope that it will be useful, but WITHOUT ANY | ||
26 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A | ||
27 | # PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. | ||
28 | # | ||
29 | # You should have received a copy of the GNU Affero General Public License along | ||
30 | # with this program. If not, see <https://www.gnu.org/licenses/>. | ||
2 | 31 | ||
3 | 32 | set -e | set -e |
4 | # set -x | ||
5 | 33 | ||
6 | URL='http://cis.whoi.edu/science/B/whalesounds' | ||
7 | CURL='curl -s' | ||
34 | test -e "$(command -v wget)" || ( | ||
35 | echo "ERR: Please install wget" | ||
36 | exit 1 | ||
37 | ) | ||
38 | test -e "$(command -v xmllint)" || ( | ||
39 | echo "ERR: Please install xmllint from libxml2-utils" | ||
40 | exit 1 | ||
41 | ) | ||
42 | |||
43 | URL='https://cis.whoi.edu/science/B/whalesounds' | ||
44 | WGET='wget -nv -nc' | ||
8 | 45 | XMLLINT='xmllint --recover --html' | XMLLINT='xmllint --recover --html' |
9 | 46 | ||
47 | # Ensure target dir | ||
48 | test -d raw || mkdir raw | ||
49 | test -d data || mkdir data | ||
50 | |||
10 | 51 | # Download Index page listing all search options | # Download Index page listing all search options |
11 | # $CURL "$URL/fullCuts.cfm" > raw/fullCuts.cfm | ||
52 | $WGET "$URL/fullCuts.cfm" -O raw/fullCuts.cfm || true | ||
53 | |||
54 | # Extract catalog IDs for each mammal from HTML drop down | ||
55 | XPATH_SP='//select[@id="getSpecies"]/option[not(contains(text(),"Select"))]/@value' | ||
56 | $XMLLINT -xpath "${XPATH_SP}" raw/fullCuts.cfm 2>/dev/null | grep -oP 'SP=\K(\w+)' >data/species.list | ||
12 | 57 | ||
13 | # Extract catalog IDs for each mammal | ||
14 | XPATH_SP='//select[@id="getSpecies"]/option/@value' | ||
15 | $XMLLINT -xpath $XPATH_SP raw/fullCuts.cfm 2>/dev/null | grep -oP 'SP=\K(\w+)' | sort -u > data/species.ids | ||
58 | # Extract mammal names from HTML drop down | ||
59 | XPATH_NAME='//select[@id="getSpecies"]/option[not(contains(text(),"Select"))]/text()' | ||
60 | $XMLLINT --xpath "${XPATH_NAME}" raw/fullCuts.cfm 2>/dev/null | sed '/^\s*$/d' | sed 's/^\s*//g' | sed 's/\s*$//g' >data/species.names | ||
61 | |||
62 | # Make unique list of IDs | ||
63 | sort -u data/species.list >data/species.ids | ||
64 | |||
65 | # Merge ids and names for later mapping in transform.sh | ||
66 | paste data/species.list data/species.names >data/species.map | ||
16 | 67 | ||
17 | 68 | # create a list of pages to download | # create a list of pages to download |
18 | while read SP | ||
19 | do | ||
20 | echo "$URL/fullCuts.cfm?SP=$SP&YR=-1" | ||
21 | done < data/species.ids > data/species.urls | ||
69 | while read -r SP; do | ||
70 | echo "$URL/fullCuts.cfm?SP=$SP&YR=-1" | ||
71 | done <data/species.ids >data/species.urls | ||
22 | 72 | ||
23 | 73 | # Download all pages | # Download all pages |
24 | # wget -P raw/sp/ -i data/species.urls | ||
74 | $WGET -P raw/sp/ -i data/species.urls | ||
25 | 75 | ||
26 | 76 | # create a list of pages for each year and species | # create a list of pages for each year and species |
27 | 77 | XPATH_YR='//select[@id="pickYear"]/option/@value' | XPATH_YR='//select[@id="pickYear"]/option/@value' |
28 | while read SP | ||
29 | do | ||
30 | YEARS=$($XMLLINT -xpath "$XPATH_YR" "raw/sp/fullCuts.cfm?SP=$SP&YR=-1" 2>/dev/null | grep -oP 'YR=\K(\d+)' | sort -u) | ||
31 | for YEAR in $YEARS; do | ||
32 | echo "$URL/fullCuts.cfm?SP=$SP&YR=$YEAR" | ||
33 | done | ||
34 | done < data/species.ids > data/species.year.urls | ||
78 | while read -r SP; do | ||
79 | YEARS=$($XMLLINT -xpath "$XPATH_YR" "raw/sp/fullCuts.cfm?SP=$SP&YR=-1" 2>/dev/null | grep -oP 'YR=\K(\d+)' | sort -u) | ||
80 | for YEAR in $YEARS; do | ||
81 | echo "$URL/fullCuts.cfm?SP=$SP&YR=$YEAR" | ||
82 | done | ||
83 | done <data/species.ids >data/species.year.urls | ||
35 | 84 | ||
36 | # wget -P raw/spyr/ -i data/species.year.urls | ||
85 | $WGET -P raw/spyr/ -i data/species.year.urls | ||
37 | 86 | ||
38 | 87 | # Extract retrieval number from all sp/year pages | # Extract retrieval number from all sp/year pages |
39 | 88 | XPATH_RN='//table//tr//td[5]/a/@href' | XPATH_RN='//table//tr//td[5]/a/@href' |
40 | for F in raw/spyr/fullCuts.cfm* | ||
41 | do | ||
42 | $XMLLINT -xpath "$XPATH_RN" "$F" 2>/dev/null | grep -oP 'WhaleSounds/\K([\da-zA-Z]+)' | ||
43 | done | sort -u > data/retrieval.numbers | ||
89 | for F in raw/spyr/fullCuts.cfm*; do | ||
90 | $XMLLINT -xpath "$XPATH_RN" "$F" 2>/dev/null | grep -oP 'WhaleSounds/\K([\da-zA-Z]+)' | ||
91 | done | sort -u >data/retrieval.numbers | ||
44 | 92 | ||
45 | 93 | # Create list of URLs to download | # Create list of URLs to download |
46 | while read RN | ||
47 | do | ||
48 | echo "$URL/metaData.cfm?RN=$RN" | ||
49 | done < data/retrieval.numbers > data/retrieval.urls | ||
50 | |||
51 | # wget -P raw/rn/ -i data/retrieval.urls | ||
94 | while read -r RN; do | ||
95 | echo "$URL/metaData.cfm?RN=$RN" | ||
96 | done <data/retrieval.numbers >data/retrieval.urls | ||
52 | 97 | ||
98 | $WGET -P raw/rn/ -i data/retrieval.urls |