RocketGit

Subject	Hash	Author	Date (UTC)
formating	f82ed6a3f1c0dccfc50aaa2a0fa3e866f5741acf	dleucas	2021-12-18 23:18:45
update readme for GeoJSON command	e5c1a6f49ea17a8860cecffdcf280d1d2fdb3ac2	dleucas	2021-12-18 23:13:31
add species names to GeoJSON and World Map	86af1d9597c3cb3c58f71c777902148d2ce333f8	dleucas	2021-12-18 22:51:11
re-worked GeoJSON from transformed JSON. identical output. added license	8ec5e1678966c6b1c441d035a6833f55586325ec	dleucas	2021-12-18 22:01:19
remove genus, add species common and scientific names, display names in record details	e2025219ade78f9ac2e6dfd81c479d242b9db24b	dleucas	2021-12-18 17:18:28
add common and scientific names to schema	b4c7dcb4042b9d1cd3da24123f4506113a876f02	dleucas	2021-12-18 17:17:17
remove genus, add species and type to type_of namespace change	2a7757a6c3e6dbb784b41d7a4b29381bc9105803	dleucas	2021-12-18 17:16:04
converted all transforms to functions	9d37303b24abc225f11f9c3d3b622c5167ed43e3	dleucas	2021-12-18 16:00:04
more conversion to functions. match old output for now	fc4a8157a6902f4571b54c6ab84174f005adbe0d	dleucas	2021-12-18 10:26:23
WIP convert filters to functions	32badc3512dd9094d51ba2cc2ef8112eba2698bf	dleucas	2021-12-16 18:33:21
convert html only once. extract species names as json. formating and lint.	e755dc7f4fe2d7c9b97826a0f3f2cf5385e90ef9	dleucas	2021-12-16 13:35:31
download once. use wget only. get species names. test for commands. formating	572dbf1eaffe17c43a4a01dc9675737628c5a234	dleucas	2021-12-16 12:14:26

Commit f82ed6a3f1c0dccfc50aaa2a0fa3e866f5741acf - formating
Author: dleucas
Author date (UTC): 2021-12-18 23:18
Committer name: dleucas
Committer date (UTC): 2021-12-18 23:18
Parent(s): e5c1a6f49ea17a8860cecffdcf280d1d2fdb3ac2
Signing key:
Tree: a240a2c7b4d39eb0b8c3300ba93a3d8bb786ef1f

File	Lines added	Lines deleted
README.md	2	2

File README.md changed (mode: 100644) (index 328ed11..843ceb9)
...	...	A configuration for ElasticSearch is provided in `srv/elasticsearch.yml`
38	38
39	39	# Transform to GeoJSON for use with World Map	# Transform to GeoJSON for use with World Map
40	40
41		- ./GeoJSON.jq data/transformed.json > wmmsdb.geojson
	41		- `./GeoJSON.jq data/transformed.json > wmmsdb.geojson`
42	42
43	43	# Search Interface Setup	# Search Interface Setup
44	44
45		TODO
	45		TODO

Commit e5c1a6f49ea17a8860cecffdcf280d1d2fdb3ac2 - update readme for GeoJSON command
Author: dleucas
Author date (UTC): 2021-12-18 23:13
Committer name: dleucas
Committer date (UTC): 2021-12-18 23:13
Parent(s): 86af1d9597c3cb3c58f71c777902148d2ce333f8
Signing key:
Tree: 9a25d6fc99ae1345668dd61f133d32138044b0b1

File	Lines added	Lines deleted
README.md	21	10

File README.md changed (mode: 100644) (index e008281..328ed11)
1	1	# Watkins Marine Mammal Sound Database, Woods Hole Oceanographic Institution	# Watkins Marine Mammal Sound Database, Woods Hole Oceanographic Institution
2	2
3		## Deluxe Remaster Edition 2018
	3		## Remastered Deluxe Edition
4	4
5	5	- [Data Source](http://cis.whoi.edu/science/B/whalesounds/fullCuts.cfm)	- [Data Source](http://cis.whoi.edu/science/B/whalesounds/fullCuts.cfm)
6		- [Source Code](https://rocketgit.com/user/dleucas/wmmsdb)
	6		- [Source Code](https://codeberg.org/dleucas/wmmsdb)
	7		- [Source Code (Mirror)](https://rocketgit.com/user/dleucas/wmmsdb)
7	8	- [Search Interface](https://marine-mammal.soundwave.cl)	- [Search Interface](https://marine-mammal.soundwave.cl)
8	9
9	10	# Overall Goal	# Overall Goal

...	...	Make the Sound Database more accessible and useful for researchers
14	15	- Transform metadata to a descriptive modern JSON schema	- Transform metadata to a descriptive modern JSON schema
15	16	- Index metadata into ElasticSearch for easy exploration and search	- Index metadata into ElasticSearch for easy exploration and search
16	17	- Import metadata into SQLite for advanced queries	- Import metadata into SQLite for advanced queries
17		- Count coverage of data before and after transformation
	18		- Document metadata before and after transformation
18	19
19	20	# Install	# Install
20	21
21		TODO
	22		Download or clone this repository and install the following tools:
22	23
23		Tools used: bash, curl, wget, jq, xidel, xpath, regex, ElasticSearch, sqlite
	24		- bash
	25		- curl
	26		- wget
	27		- [jq](https://stedolan.github.io/jq/)
	28		- [Xidel](https://www.videlibri.de/xidel.html)
	29		- elasticsearch-1.7.6
	30
	31		A configuration for ElasticSearch is provided in `srv/elasticsearch.yml`
24	32
25	33	# Usage	# Usage
26	34
27		- ./download.sh
28		- ./transform.sh
29		- ./index.sh
	35		- `./download.sh`
	36		- `./transform.sh`
	37		- `./index.sh`
	38
	39		# Transform to GeoJSON for use with World Map
30	40
31		# Transform to GeoJSON
	41		- ./GeoJSON.jq data/transformed.json > wmmsdb.geojson
32	42
33		- ./GeoJSON.jq data/rn/*json > wmmsdb.geojson
	43		# Search Interface Setup
34	44
	45		TODO

Commit 86af1d9597c3cb3c58f71c777902148d2ce333f8 - add species names to GeoJSON and World Map
Author: dleucas
Author date (UTC): 2021-12-18 22:51
Committer name: dleucas
Committer date (UTC): 2021-12-18 22:51
Parent(s): 8ec5e1678966c6b1c441d035a6833f55586325ec
Signing key:
Tree: 7aada6946266fa9ee719864b5f55aaa268ba8639

File	Lines added	Lines deleted
GeoJSON.jq	3	2
webroot/map.arcgis.html	8	4

File GeoJSON.jq changed (mode: 100755) (index a9d3671..76074ba)
33	33	"id": .record_number,	"id": .record_number,
34	34	"note": .note,	"note": .note,
35	35	"location_name": (.location.name \| join(" ")),	"location_name": (.location.name \| join(" ")),
36		"observation_date": (if .observation_date != null then .observation_date else "" end),
37		# "species": (.animal.species[].common_name ),
	36		"observation_date": .observation_date,
	37		"species_common_name": .animal.species[].common_name,
	38		"species_scientific_name": .animal.species[].scientific_name,
38	39	}	}
39	40	}	}
40	41	]	]

File webroot/map.arcgis.html changed (mode: 100644) (index f1206a4..b6c2ec1)
20	20	height: 100%;	height: 100%;
21	21	width: 100%;	width: 100%;
22	22	}	}
	23		ul.metadata > li > span { font-weight: bold; }
23	24	</style>	</style>
24	25
25	26	<link	<link

40	41	"wmmsdb.geojson";	"wmmsdb.geojson";
41	42
42	43	const template = {	const template = {
43		title: "Record Number: {id}",
	44		title: "Recording {id}: {species_scientific_name}",
44	45	content: `	content: `
45		<p>Observation Date: {observation_date}</p>
46		<p>{note}</p>
47		<p>Location: {location_name}</p>
	46		<p class="note">{note}</p>
	47		<ul class="metadata">
	48		<li><span>Species:</span> {species_scientific_name} ({species_common_name})</li>
	49		<li><span>Location:</span> {location_name}</li>
	50		<li><span>Observation Date:</span> {observation_date}</li>
	51		</ul>
48	52	<audio controls preload='none'><source type='audio/wav' src='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/{id}.wav'>	<audio controls preload='none'><source type='audio/wav' src='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/{id}.wav'>
49	53	[ Audio Player ]	[ Audio Player ]
50	54	</audio>	</audio>

Commit 8ec5e1678966c6b1c441d035a6833f55586325ec - re-worked GeoJSON from transformed JSON. identical output. added license
Author: dleucas
Author date (UTC): 2021-12-18 22:01
Committer name: dleucas
Committer date (UTC): 2021-12-18 22:01
Parent(s): e2025219ade78f9ac2e6dfd81c479d242b9db24b
Signing key:
Tree: e14b6a0b01943b3c3e15dd003c2314cdff2a0764

File	Lines added	Lines deleted
GeoJSON.jq	30	44

File GeoJSON.jq changed (mode: 100755) (index 77e9f5c..a9d3671)
1	1	#!/usr/bin/jq -fsc	#!/usr/bin/jq -fsc
2
3		def as_coord:
4		# Example W073 or W70, degree only, negate
5		if startswith("W") and length <= 4 then
6		-(.[1:] \| tonumber)
7		# Example W12404 degree with minutes, negate
8		# Negate after addition
9		elif startswith("W") and length == 6 then
10		-((.[1:4] \| tonumber) + (.[4:] \| tonumber / 60))
11		# Example S38 degree only, negate
12		elif startswith("S") and length == 3 then
13		-(.[1:] \| tonumber)
14		# Degree with minutes, negate
15		elif startswith("S") and length == 5 then
16		-((.[1:3] \| tonumber) + (.[3:] \| tonumber / 60))
17		# Degree only
18		elif startswith("N") and length == 3 then
19		(.[1:] \| tonumber)
20		# Degree with minutes N4439
21		elif startswith("N") and length == 5 then
22		((.[1:3] \| tonumber) + (.[3:] \| tonumber / 60))
23		# Degree only
24		elif startswith("E") and length <= 4 then
25		(.[1:] \| tonumber)
26		# Degree with minutes
27		elif startswith("E") and length == 5 then
28		((.[1:3] \| tonumber) + (.[3:] \| tonumber / 60))
29		else
30		null
31		end;
32
	2		# Build GeoJSON object from transformed JSON data
	3		# Loaded by webroot/map.arcgis.html
	4		#
	5		# Usage: ./GeoJSON.jq data/transformed.json > data/wmmsdb.geojson
	6		#
	7		#
	8		# SPDX-License-Identifier: GPL-3.0-or-later
	9		#
	10		# Copyright (C) 2018-2022 leuc
	11		#
	12		# This program is free software: you can redistribute it and/or modify it under the
	13		# terms of the GNU Affero General Public License as published by the Free Software
	14		# Foundation, either version 3 of the License, or (at your option) any later
	15		# version.
	16		#
	17		# This program is distributed in the hope that it will be useful, but WITHOUT ANY
	18		# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
	19		# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
	20		#
	21		# You should have received a copy of the GNU Affero General Public License along
	22		# with this program. If not, see <https://www.gnu.org/licenses/>.
33	23	{	{
34	24	"type": "FeatureCollection",	"type": "FeatureCollection",
35		"features": [ .[] \| if (.GC\|length == 0) then empty else . end \|
36		{
	25		"features": [
	26		.[] \| select(.location.coordinates \| length > 0) \| {
37	27	"type": "Feature",	"type": "Feature",
38	28	"geometry": {	"geometry": {
39		"type": "MultiPoint",
40		"coordinates":
41		.GC \| split("\|")
42		\| map(capture("(?<lat>[NS]{1}\\d{1,4})[A-Z]{1,2}(\\d{1,2})?([A-Z]{1})?\\s+(?<lon>[EW]{1}\\d{1,5})"))
43		\| map([(.lon \| as_coord), (.lat \| as_coord)])
	29		"type": "MultiPoint",
	30		"coordinates": .location.coordinates \| map([.lon, .lat])
44	31	},	},
45	32	"properties": {	"properties": {
46		"id": .RN,
47		"note": .NT,
48		"location_name": .GB \| split("\|") \| map(gsub("(\\s+)?[A-D][A-Z]\\d+[A-Z](\\s+)?\|(X$)"; ""; "gm")) \| join(" "),
49		"observation_date": [
50		.OD \| capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") \| .date \| strptime("%d-%B-%Y") \| todateiso8601
51		] \| .[0]
	33		"id": .record_number,
	34		"note": .note,
	35		"location_name": (.location.name \| join(" ")),
	36		"observation_date": (if .observation_date != null then .observation_date else "" end),
	37		# "species": (.animal.species[].common_name ),
52	38	}	}
53	39	}	}
54	40	]	]

Commit e2025219ade78f9ac2e6dfd81c479d242b9db24b - remove genus, add species common and scientific names, display names in record details
Author: dleucas
Author date (UTC): 2021-12-18 17:18
Committer name: dleucas
Committer date (UTC): 2021-12-18 17:18
Parent(s): b4c7dcb4042b9d1cd3da24123f4506113a876f02
Signing key:
Tree: fa27c6711430a4f07c3c7dc798308ff08f1d7459

File	Lines added	Lines deleted
webroot/index.html	63	49

File webroot/index.html changed (mode: 100644) (index 0942281..76cc113)
11	11	<script type="text/javascript" src="vendor/jquery/1.7.1/jquery-1.7.1.min.js"></script>	<script type="text/javascript" src="vendor/jquery/1.7.1/jquery-1.7.1.min.js"></script>
12	12
13	13	<link rel="stylesheet" href="vendor/bootstrap/css/bootstrap.min.css">	<link rel="stylesheet" href="vendor/bootstrap/css/bootstrap.min.css">
14		<script type="text/javascript" src="vendor/bootstrap/js/bootstrap.min.js"></script>
	14		<script type="text/javascript" src="vendor/bootstrap/js/bootstrap.min.js"></script>
15	15
16	16	<link rel="stylesheet" href="vendor/jquery-ui-1.8.18.custom/jquery-ui-1.8.18.custom.css">	<link rel="stylesheet" href="vendor/jquery-ui-1.8.18.custom/jquery-ui-1.8.18.custom.css">
17	17	<script type="text/javascript" src="vendor/jquery-ui-1.8.18.custom/jquery-ui-1.8.18.custom.min.js"></script>	<script type="text/javascript" src="vendor/jquery-ui-1.8.18.custom/jquery-ui-1.8.18.custom.min.js"></script>

...	...	jQuery(document).ready(function($) {
42	42	searchfield: "_all",	searchfield: "_all",
43	43	q : "",	q : "",
44	44	facets: [	facets: [
45		{'field': 'animal.genus.name',
46		'display': 'Genus',
	45		{'field': 'animal.species.common_name',
	46		'display': 'Species Common Name',
47	47	'open' : false, 'size': 50},	'open' : false, 'size': 50},
48		{'field': 'animal.genus.species_code',
49		'display': 'Species Code',
	48		{'field': 'animal.species.scientific_name',
	49		'display': 'Species Scientific Name',
	50		'open' : false, 'size': 50},
	51		{'field': 'animal.species.species_code',
	52		'display': 'Species Code',
	53		'open' : false},
	54		{'field': 'animal.vocal.common_name',
	55		'display': 'Vocal Species Common Name',
	56		'open' : false},
	57		{'field': 'animal.vocal.scientific_name',
	58		'display': 'Vocal Species Scientific Name',
50	59	'open' : false},	'open' : false},
51	60	{'field': 'animal.vocal.species_code',	{'field': 'animal.vocal.species_code',
52		'display': 'Vocal Species Code',
	61		'display': 'Vocal Species Code',
53	62	'open' : false},	'open' : false},
54	63	{'field': 'animal.behavior.type_of',	{'field': 'animal.behavior.type_of',
55	64	'display': 'Behavior Type',	'display': 'Behavior Type',
56	65	'open' : false},	'open' : false},
57	66	{'field': 'animal.vocal.animal_id',	{'field': 'animal.vocal.animal_id',
58		'display': 'Vocal ID',
	67		'display': 'Vocal ID',
59	68	'open' : false},	'open' : false},
60		{'field': 'animal.interaction.type',
61		'display': 'Interaction Type',
	69		{'field': 'animal.interaction.type_of',
	70		'display': 'Interaction Type',
62	71	'open' : false},	'open' : false},
63		{'field': 'animal.profile.animal_id',
64		'display': 'ID',
	72		{'field': 'animal.profile.animal_id',
	73		'display': 'ID',
65	74	'open' : false},	'open' : false},
66		{'field': 'animal.profile.age',
67		'display': 'Age',
	75		{'field': 'animal.profile.age',
	76		'display': 'Age',
68	77	'open' : false},	'open' : false},
69		{'field': 'animal.profile.sex',
70		'display': 'Sex',
	78		{'field': 'animal.profile.sex',
	79		'display': 'Sex',
71	80	'open' : false},	'open' : false},
72		{'field': 'animal.profile.birth_year',
73		'display': 'Birth Year',
	81		{'field': 'animal.profile.birth_year',
	82		'display': 'Birth Year',
74	83	'open' : false},	'open' : false},
75	84
76	85	{'field': 'observation_date',	{'field': 'observation_date',

...	...	jQuery(document).ready(function($) {
83	92	'value_function': function(v) { return new Date(v).getFullYear() }	'value_function': function(v) { return new Date(v).getFullYear() }
84	93	},	},
85	94
86		{'field': 'location.name',
87		'display': 'Geo Location',
	95		{'field': 'location.name',
	96		'display': 'Geo Location',
88	97	'open' : false},	'open' : false},
89	98	/*	/*
90	99	{'field' : 'location.coordinates',	{'field' : 'location.coordinates',

...	...	jQuery(document).ready(function($) {
105	114	]	]
106	115	},	},
107	116	*/	*/
108		{'field': 'signal.class',
109		'display': 'Class',
	117		{'field': 'signal.class',
	118		'display': 'Class',
110	119	'open' : false},	'open' : false},
111		{'field': 'signal.overlap',
112		'display': 'Overlap',
	120		{'field': 'signal.overlap',
	121		'display': 'Overlap',
113	122	'open' : false},	'open' : false},
114		{'field': 'signal.quality',
115		'display': 'Quality',
	123		{'field': 'signal.quality',
	124		'display': 'Quality',
116	125	'open' : false},	'open' : false},
117		{'field': 'signal.source.name',
118		'display': 'Source',
	126		{'field': 'signal.source.name',
	127		'display': 'Source',
119	128	'open' : false},	'open' : false},
120		{'field': 'signal.source.order',
121		'display': 'Source Type',
	129		{'field': 'signal.source.order',
	130		'display': 'Source Type',
122	131	'open' : false},	'open' : false},
123	132
124		{'field': 'sound.sample_rate',
125		'display': 'Sample Rate',
	133		{'field': 'sound.sample_rate',
	134		'display': 'Sample Rate',
126	135	'open' : false},	'open' : false},
127		{'field': 'sound.channel.recorded',
128		'display': 'Channels Recorded',
	136		{'field': 'sound.channel.recorded',
	137		'display': 'Channels Recorded',
129	138	'open' : false},	'open' : false},
130	139	/*	/*
131		{'field': 'sound.freq.P1',
132		'display': 'Freq Initial Percentile',
	140		{'field': 'sound.freq.P1',
	141		'display': 'Freq Initial Percentile',
133	142	'type': 'range',	'type': 'range',
134	143	'size': false,	'size': false,
135	144	'hide_empty_range': true,	'hide_empty_range': true,

...	...	jQuery(document).ready(function($) {
151	160	],	],
152	161	},	},
153	162	*/	*/
154		{'field': 'sound.freq.IPR',
155		'display': 'Interpercentile Range (Frequency)',
	163		{'field': 'sound.freq.IPR',
	164		'display': 'Interpercentile Range (Frequency)',
156	165	'type': 'range',	'type': 'range',
157	166	'size': false,	'size': false,
158	167	'hide_empty_range': true,	'hide_empty_range': true,

...	...	jQuery(document).ready(function($) {
170	179	{"from" : 90.0, "display" : ">=90.0"}	{"from" : 90.0, "display" : ">=90.0"}
171	180	],	],
172	181	},	},
173		{'field': 'sound.time.IPR',
174		'display': 'Interpercentile Range (Time)',
	182		{'field': 'sound.time.IPR',
	183		'display': 'Interpercentile Range (Time)',
175	184	'type': 'range',	'type': 'range',
176	185	'size': false,	'size': false,
177	186	'hide_empty_range': true,	'hide_empty_range': true,

...	...	jQuery(document).ready(function($) {
217	226	debug: false,	debug: false,
218	227	//fields: ["_id", "animal.profile.animal_id"],	//fields: ["_id", "animal.profile.animal_id"],
219	228	render_results_metadata: pageSlider,	render_results_metadata: pageSlider,
220		"result_display" : [
	229		"result_display" : [
221	230	[ {"pre" : "<h4>Record Number: ", "field": "record_number", "post" : "</h4>"} ],	[ {"pre" : "<h4>Record Number: ", "field": "record_number", "post" : "</h4>"} ],
222	231	[ {"pre" : "<div><img class='' src='/spectro/", "field": "record_number", "post" : ".sox.png'/ >"} ],	[ {"pre" : "<div><img class='' src='/spectro/", "field": "record_number", "post" : ".sox.png'/ >"} ],
223	232	[ {"pre" : "<img class='' src='/spectro/", "field": "record_number", "post" : ".seewave.png'/ >"} ],	[ {"pre" : "<img class='' src='/spectro/", "field": "record_number", "post" : ".seewave.png'/ >"} ],
224	233	[ {"pre" : "<img class='' src='/spectro/", "field": "record_number", "post" : ".acoustat.png'/ ></div>"} ],	[ {"pre" : "<img class='' src='/spectro/", "field": "record_number", "post" : ".acoustat.png'/ ></div>"} ],
225		[ {"pre" : "<div><audio controls preload='none'><source type='audio/wav' src='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/",
226		"field": "record_number",
	234		[ {"pre" : "<div><audio controls preload='none'><source type='audio/wav' src='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/",
	235		"field": "record_number",
227	236	"post" : ".wav'>[ Audio Player ]</audio></div><table><tbody>"} ],	"post" : ".wav'>[ Audio Player ]</audio></div><table><tbody>"} ],
228	237	[ {"pre" : "<tr><th>Note</th><td>", "field": "note", "post" : "</td></tr>"} ],	[ {"pre" : "<tr><th>Note</th><td>", "field": "note", "post" : "</td></tr>"} ],
	238		[ {"pre" : "<tr><th>Species Common Name</th><td>", "field": "animal.species.0.common_name", "post" : "</td></tr>"} ],
	239		[ {"pre" : "<tr><th>Species Scientific Name</th><td>", "field": "animal.species.0.scientific_name", "post" : "</td></tr>"} ],
229	240	[ {"pre" : "<tr><th>Observation Date</th><td>", "field": "observation_date", "post" : "</td></tr>"} ],	[ {"pre" : "<tr><th>Observation Date</th><td>", "field": "observation_date", "post" : "</td></tr>"} ],
230	241	[ {"pre" : "<tr><th>Last modified Date</th><td>", "field": "last_modified_date", "post" : "</td></tr>"} ],	[ {"pre" : "<tr><th>Last modified Date</th><td>", "field": "last_modified_date", "post" : "</td></tr>"} ],
231	242	[ {"pre" : "<tr><th>Geographic location area name</th><td>", "field": "location.name", "post" : "</td></tr>"} ],	[ {"pre" : "<tr><th>Geographic location area name</th><td>", "field": "location.name", "post" : "</td></tr>"} ],

...	...	jQuery(document).ready(function($) {
243	254	[ {"pre" : "<tr><th>Terminal Percentile (Frequency)</th><td>", "field": "sound.freq.P2", "post" : "</td></tr>"} ],	[ {"pre" : "<tr><th>Terminal Percentile (Frequency)</th><td>", "field": "sound.freq.P2", "post" : "</td></tr>"} ],
244	255	[ {"pre" : "<tr><th>Frequency Median</th><td>", "field": "sound.freq.M", "post" : "</td></tr> "} ],	[ {"pre" : "<tr><th>Frequency Median</th><td>", "field": "sound.freq.M", "post" : "</td></tr> "} ],
245	256	[ {"pre" : "<tr><th>Interpercentile Range (Frequency)</th><td>", "field": "sound.freq.IPR", "post" : "</td></tr>"} ],	[ {"pre" : "<tr><th>Interpercentile Range (Frequency)</th><td>", "field": "sound.freq.IPR", "post" : "</td></tr>"} ],
246		[ {"pre" : "</tbody></table><p><a href='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/",
247		"field": "record_number",
	257		[ {"pre" : "</tbody></table><p><a href='https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/",
	258		"field": "record_number",
248	259	"post" : ".wav'>Wave Audio File (.wav) from whoi.edu</a></p>"} ],	"post" : ".wav'>Wave Audio File (.wav) from whoi.edu</a></p>"} ],
249	260	],	],
250	261	selected_filters_in_facet: false,	selected_filters_in_facet: false,

...	...	jQuery(document).ready(function($) {
253	264	});	});
254	265	});	});
255	266
256		$('#facetview_filter_animal_genus_name').each(function() {
	267		$('#facetview_filter_animal_species_common_name').each(function() {
257	268	$(this).before($('<h4>').text("Animal"));	$(this).before($('<h4>').text("Animal"));
258	269	});	});
259	270	$('#facetview_filter_observation_date').each(function() {	$('#facetview_filter_observation_date').each(function() {

...	...	header {
318	329	margin-bottom: .5em;	margin-bottom: .5em;
319	330	}	}
320	331
321		#facetview_filter_animal_genus_name,
322		#facetview_filter_animal_genus_species_code,
	332		#facetview_filter_animal_species_common_name,
	333		#facetview_filter_animal_species_scientific_name,
	334		#facetview_filter_animal_species_species_code,
323	335	#facetview_filter_animal_vocal_animal_id,	#facetview_filter_animal_vocal_animal_id,
324		#facetview_filter_animal_interaction_type,
	336		#facetview_filter_animal_vocal_common_name,
	337		#facetview_filter_animal_vocal_scientific_name,
	338		#facetview_filter_animal_interaction_type_of,
325	339	#facetview_filter_animal_behavior_type_of,	#facetview_filter_animal_behavior_type_of,
326	340	#facetview_filter_animal_vocal_species_code,	#facetview_filter_animal_vocal_species_code,
327	341	#facetview_filter_animal_profile_animal_id,	#facetview_filter_animal_profile_animal_id,

...	...	header {
340	354	#facetview_filter_signal_overlap,	#facetview_filter_signal_overlap,
341	355	#facetview_filter_signal_quality,	#facetview_filter_signal_quality,
342	356	#facetview_filter_signal_source_name,	#facetview_filter_signal_source_name,
343		#facetview_filter_signal_source_order
	357		#facetview_filter_signal_source_order
344	358	{	{
345	359	border-color: blue;	border-color: blue;
346	360	}	}

Commit b4c7dcb4042b9d1cd3da24123f4506113a876f02 - add common and scientific names to schema
Author: dleucas
Author date (UTC): 2021-12-18 17:17
Committer name: dleucas
Committer date (UTC): 2021-12-18 17:17
Parent(s): 2a7757a6c3e6dbb784b41d7a4b29381bc9105803
Signing key:
Tree: 0d95c74cf769d6887a87d0899fdcadb5a0986ab6

File	Lines added	Lines deleted
index.mapping.json	29	1

File index.mapping.json changed (mode: 100644) (index e3d62ee..9c20421)
31	31	}	}
32	32	}	}
33	33	},	},
	34		"species": {
	35		"properties": {
	36		"_as_entered": {
	37		"index": "not_analyzed",
	38		"type": "string"
	39		},
	40		"common_name": {
	41		"index": "not_analyzed",
	42		"type": "string"
	43		},
	44		"scientific_name": {
	45		"index": "not_analyzed",
	46		"type": "string"
	47		},
	48		"species_code": {
	49		"index": "not_analyzed",
	50		"type": "string"
	51		}
	52		}
	53		},
34	54	"interaction": {	"interaction": {
35	55	"properties": {	"properties": {
36	56	"animal_id": {	"animal_id": {
37	57	"index": "not_analyzed",	"index": "not_analyzed",
38	58	"type": "string"	"type": "string"
39	59	},	},
40		"type": {
	60		"type_of": {
41	61	"index": "not_analyzed",	"index": "not_analyzed",
42	62	"type": "string"	"type": "string"
43	63	}	}

70	90	"species_code": {	"species_code": {
71	91	"index": "not_analyzed",	"index": "not_analyzed",
72	92	"type": "string"	"type": "string"
	93		},
	94		"common_name": {
	95		"index": "not_analyzed",
	96		"type": "string"
	97		},
	98		"scientific_name": {
	99		"index": "not_analyzed",
	100		"type": "string"
73	101	}	}
74	102	}	}
75	103	}	}

Commit 2a7757a6c3e6dbb784b41d7a4b29381bc9105803 - remove genus, add species and type to type_of namespace change
Author: dleucas
Author date (UTC): 2021-12-18 17:16
Committer name: dleucas
Committer date (UTC): 2021-12-18 17:16
Parent(s): 9d37303b24abc225f11f9c3d3b622c5167ed43e3
Signing key:
Tree: b20592b03acf471eca6a562241b00d19d5f9f935

File	Lines added	Lines deleted
transform.jq	6	4

File transform.jq changed (mode: 100755) (index 3da8644..3079e97)
...	...	def as_animal_interaction:
110	110	# FCFB153 FCFB150	# FCFB153 FCFB150
111	111	# FCFB5 FCFB55	# FCFB5 FCFB55
112	112	# FCFB73 FCFB34	# FCFB73 FCFB34
113		capture("(?<type>[FMC]{2})(?<animal_id>FB\\d+)"; "g")//null;
	113		capture("(?<type_of>[FMC]{2})(?<animal_id>FB\\d+)"; "g")//null;
114	114
115	115	def as_animal_profile:	def as_animal_profile:
116	116	# age, sex and id, a animal profile	# age, sex and id, a animal profile

...	...	def as_animal_vocal:
198	198	# also trim space from resulting string	# also trim space from resulting string
199	199	{	{
200	200	animal_id: $s[0:.offset] \| trim,	animal_id: $s[0:.offset] \| trim,
201		species_code: .string \| trim
	201		species_code: .string \| trim,
	202		scientific_name: $s \| as_species_sci_name,
	203		common_name: $s \| as_species_common_name,
202	204	}) \|	}) \|
203	205	# if no object was created, use input as fallback	# if no object was created, use input as fallback
204	206	# this is for entries without a species code like "Keiko"	# this is for entries without a species code like "Keiko"

...	...	def as_sound_sample_rate:
430	432	# species code not always present, use input as fallback	# species code not always present, use input as fallback
431	433	behavior: .BH \| split("\|") \| map(as_animal_behavior),	behavior: .BH \| split("\|") \| map(as_animal_behavior),
432	434	# Genus name and species code	# Genus name and species code
433		genus: .GS \| split("\|") \| as_animal_genus,
	435		# genus: .GS \| split("\|") \| as_animal_genus,
434	436	# Species	# Species
435		# species: .GS \| split("\|") \| as_animal_species,
	437		species: .GS \| split("\|") \| as_animal_species,
436	438	}	}
437	439	}	}

Commit 9d37303b24abc225f11f9c3d3b622c5167ed43e3 - converted all transforms to functions
Author: dleucas
Author date (UTC): 2021-12-18 16:00
Committer name: dleucas
Committer date (UTC): 2021-12-18 16:00
Parent(s): fc4a8157a6902f4571b54c6ab84174f005adbe0d
Signing key:
Tree: 02e257217623d10d851fa1a9e8f7db8398ac5c3b

File	Lines added	Lines deleted
transform.jq	254	156

File transform.jq changed (mode: 100755) (index cd2aa89..3da8644)
1	1	#!/usr/bin/jq -fr	#!/usr/bin/jq -fr
2	2
3	3	# jq filter chain to transform flat source metadata into object structure.	# jq filter chain to transform flat source metadata into object structure.
4		# Source data combines multiple values into one field, so split that up
5		# also use native data types if possible.
	4		#
	5		# Each transformation is its own function with documented input examples.
	6		#
	7		#
	8		# - Source data combines multiple values into one field with "\|"
	9		# - Use native JSON data types if possible
	10		# - Clean-up whitespace and normalize value formats
6	11
	12		# Mapping of species code to names
	13		# Extracted from WHOI website by `download.sh`
7	14	import "./data/species.sci.names" as $species_sci_names;	import "./data/species.sci.names" as $species_sci_names;
8	15	import "./data/species.common.names" as $species_common_names;	import "./data/species.common.names" as $species_common_names;
9	16
	17		#
	18		# helper functions
	19		#
	20
	21		def trim:
	22		# remove leading and trailing whitespace
	23		gsub("^\\s+\|\\s+$";"");
	24
	25		#
	26		# transform functions
	27		#
	28
10	29	# Convert Degree.Minute coordinates into decimal notation	# Convert Degree.Minute coordinates into decimal notation
11	30	def as_coord:	def as_coord:
12	31	# Example W073 or W70, degree only, negate	# Example W073 or W70, degree only, negate

...	...	def as_coord:
41	60	def as_date:	def as_date:
42	61	(capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") \| .date \| strptime("%d-%B-%Y") \| todateiso8601)//null;	(capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") \| .date \| strptime("%d-%B-%Y") \| todateiso8601)//null;
43	62
44		def as_signal_overlap:
45		({
46		"OF": "Frequency",
47		"OT": "Time",
48		"OTF": "Time and Frequency",
49		"N": "No"
50		} as $overlap_type \| capture("(?<o>O[TF]{1,2}\|N)") \| $overlap_type[.o]?)//null ;
	63		def as_location_name:
	64		# Location Name
	65		# Remove species code and whitespace
	66		#
	67		# Example source data:
	68		# 2.25 mi. west of Castle Rock, McMurdo Sound, Antarctica CC5A
	69		# 20 mi. NW Gambell, St. Lawrence Island, Alaska CC2A X
	70		# Castle Harbour, Bermuda AC2A
	71		#
	72		# TODO improve clean-up
	73		# jq -r '.GB \| split("\|")[]' data/rn/json\| sort -u \| grep -P '(\s+)?[A-D][A-Z]\d+[A-Z](\s+)?\|([\sXO]$)'
	74		gsub("(\\s+)?[A-D][A-Z]\\d+[A-Z](\\s+)?\|(X$)"; ""; "gm");
	75
	76		def as_location_coordinates:
	77		# Example source data
	78		# N10BD15A W086BD15A
	79		# N13BA2A W061BA2A
	80		# N13BA2A W061BA2A
	81		# N13BD15B W061BD15B
	82		# N14X W061X
	83		# N75BB2A W075BB2A approx
	84		# S52BD1A W070BD1A
	85		# S71CC14A E170CC14A
	86		map(capture("(?<lat>[NS]{1}\\d{1,4})[A-Z]{1,2}(\\d{1,2})?([A-Z]{1})?\\s+(?<lon>[EW]{1}\\d{1,5})")) \|
	87		map({ lat: (.lat \| as_coord), lon: (.lon \| as_coord) });
51	88
52	89	def as_species_code:	def as_species_code:
53	90	(capture("(?<code>[A-C][A-Z]\\d+[A-Z])") \| .code)//null;	(capture("(?<code>[A-C][A-Z]\\d+[A-Z])") \| .code)//null;

...	...	def as_species_common_name:
58	95	def as_species_sci_name:	def as_species_sci_name:
59	96	as_species_code \| $species_sci_names[0][.?];	as_species_code \| $species_sci_names[0][.?];
60	97
	98		#
	99		# Animal
	100		#
	101
61	102	def as_animal_interaction:	def as_animal_interaction:
62	103	# interaction between animals	# interaction between animals
63	104	# always a pair, and multiple sets of pairs are possible	# always a pair, and multiple sets of pairs are possible

...	...	def as_animal_behavior:
119	160	. as $b \| match("[A-C][A-Z]\\d+[A-Z]([\\s\\.]+)?$"; "m")//false \|	. as $b \| match("[A-C][A-Z]\\d+[A-Z]([\\s\\.]+)?$"; "m")//false \|
120	161	if . then	if . then
121	162	{	{
122		type_of: ($b[0:.offset] \| gsub("^\\s+\|\\s+$";"") \| if (.\|length) > 0 then . else null end),
123		species_code: .string \| gsub("^\\s+\|\\s+$";"")
	163		type_of: ($b[0:.offset] \| trim \| if (.\|length) > 0 then . else null end),
	164		species_code: .string \| trim
124	165	}	}
125	166	# fallback without species code	# fallback without species code
126	167	else	else
127		{ type_of: $b \| gsub("^\\s+\|\\s+$";"") }
	168		{ type_of: $b \| trim }
128	169	end;	end;
129
	170
	171		def as_animal_vocal:
	172		# List of vocal animals, name and species code
	173		# All existing entries:
	174		# FB145 #?? BD19D
	175		# FB147 #?? BD19D
	176		# FB150 #?? BD19D
	177		# FB153 #50 Blacktip Doubledip BD19D
	178		# FB34 #30 Wee Willie BD19D
	179		# FB55 #159 BD19D
	180		# FB5 #5 BD19D
	181		# FB73 #35 BD19D
	182		# Keiko
	183		# Keiko BE7A
	184		# Minks BF2A \| Jinks BF2A
	185		# Moby Doll
	186		# Moby Doll BE7A
	187		# Olaf CB1A
	188		# Snoopy BA2A
	189		# The lark BE3B
	190		# Wolfie CB1A \| Farouk CB1A
	191		#
	192		# create array with objects for each animal
	193		# save input as fallback and split by \|
	194		. as $input \| $input \| split("\|") \|
	195		# try to match species code
	196		map(. as $s \| match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") \|
	197		# create object, anything before matched species code is id
	198		# also trim space from resulting string
	199		{
	200		animal_id: $s[0:.offset] \| trim,
	201		species_code: .string \| trim
	202		}) \|
	203		# if no object was created, use input as fallback
	204		# this is for entries without a species code like "Keiko"
	205		if (. == [] and ($input\|length)>0 ) then [{animal_id: $input}] else . end;
	206
	207		def as_animal_genus:
	208		map(. as $s \| match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") \|
	209		{
	210		name: $s[0:.offset] \| trim,
	211		species_code: .string \| trim
	212		});
	213
	214		def as_animal_species:
	215		map(. as $s \|
	216		{
	217		_as_noted: $s \| trim,
	218		species_code: $s \| as_species_code,
	219		scientific_name: $s \| as_species_sci_name,
	220		common_name: $s \| as_species_common_name,
	221		});
	222
	223		#
	224		# Signal
	225		#
	226
	227		# Cue field contains 3 values describing the position on tape
	228		# Example input from the docu
	229		# 542 B2:8 8.130
	230		# 1:03:12 B2:8 8.130
	231		# however, following formats are also found
	232		# 0:00:00 B30:00 10:20.602
	233		# 995 B11:28.497 5:20.426
	234		# 96 B4.00 1.525
	235		# 93 B23.7 9.164
	236		# 93 B3:00 2:13.828
	237		# 01:52:52:04
	238		# 09:11:00 20:00 951.50
	239		# 0 B2:00:00
	240
	241		def as_signal_position_cue:
	242		# "cue" as in a first matched single integer,
	243		# without dot or colon followed by space or end of string
	244		# do not use \b because of the colon in 00:00 values
	245		capture("(?<c>^\\d+(\\s\|$))") \| {"cue": (.c \| tonumber)};
	246
	247		def as_signal_position_time:
	248		# "time" as in first matched integer with 2 or 3 colons
	249		# followed by space or end of string
	250		capture("(?<time>^\\d+:\\d+:\\d+(:\\d+)?(\\s\|$))");
	251
	252		def as_signal_position_analyzer_buffer_size:
	253		# buffer size, B followed by integer with colon or dot,
	254		# also remove B prefix
	255		# TODO match 2 colon version
	256		capture("(?<analyzer_buffer_size>(?<=B)\\d+[:\\.]\\d+(\\.\\d+)?)");
	257
	258		# Signal class encodes multiple values, quality, overlap and class
	259		#
	260		# it's only been used 123 times
	261		#
	262		# Example source data:
	263		# 3 OT
	264		# 3 OTF
	265		# C 4 OF
	266		# D
	267		# M
	268		# N
	269		# No
	270		# NO
	271		# OF
	272		# OT
	273		# OTF
	274		# OTF 3
	275		# OTF 4
	276		# S
	277		# S 5
	278		# U
	279		# V
	280
130	281	def as_signal_quality:	def as_signal_quality:
	282		# any digit in the signal class indicates quality
131	283	(capture("(?<q>\\d+)") \| .q \| tonumber)//null;	(capture("(?<q>\\d+)") \| .q \| tonumber)//null;
132	284
133	285	def as_signal_class:	def as_signal_class:

...	...	def as_signal_class:
141	293	"C": "Calf"	"C": "Calf"
142	294	} as $class_names \| capture("(?<c>[SMVDUC]{1})") \| $class_names[.c]?)//null;	} as $class_names \| capture("(?<c>[SMVDUC]{1})") \| $class_names[.c]?)//null;
143	295
	296		def as_signal_overlap:
	297		({
	298		"OF": "Frequency",
	299		"OT": "Time",
	300		"OTF": "Time and Frequency",
	301		"N": "No"
	302		} as $overlap_type \| capture("(?<o>O[TF]{1,2}\|N)") \| $overlap_type[.o]?)//null ;
	303
	304		def as_signal_cut_size:
	305		# Signal cut size
	306		#
	307		# Example source data:
	308		# 3.36
	309		# 9.411
	310		# 16.564
	311		# 20.35
	312		# etc
	313		# only 210 records use a different format, ignored for now
	314		# 2:00.000
	315		# 1:00.030
	316		# 10:25.540
	317		# 1:25.158
	318		# etc.
	319		# set to null if empty or contains a colon
	320		if (. \| contains(":") or (length == 0)) then
	321		null
	322		else
	323		# cast as number and handle a few remaining badly formatted
	324		# records like "0.2.95"
	325		(try (. \| tonumber) catch null)
	326		end;
	327
	328		def as_signal_source:
	329		# Other general sound producing sources listed in genus field
	330		#
	331		# Example source data:
	332		# Transient ship noise X
	333		# Ship electrical noise X
	334		# Rain X
	335		# Homo sapiens E
	336		# Crustacea O
	337		map(. as $s \| match("\\s+[E-Z]{1}(\\s+)?$"; "m") \|
	338		{
	339		"E": "Primates",
	340		"O": "Crustacea",
	341		"T": "Fossils",
	342		"U": "Uncertain",
	343		"V": "General pinniped",
	344		"W": "General cetacean",
	345		"X": "Ambient noise"
	346		} as $order \|
	347		{
	348		name: $s[0:.offset] \| trim,
	349		# not sort order
	350		order: $order[.string \| trim]
	351		});
	352
144	353	def as_sound_channel:	def as_sound_channel:
145	354	# numbers of channels	# numbers of channels
146	355	# input data mostly follows the documentation:	# input data mostly follows the documentation:

...	...	def as_sound_channel:
152	361	# not clear what other input values mean exactly	# not clear what other input values mean exactly
153	362	# 211	# 211
154	363	(capture("^(?<r>\\d)(?<m>\\d)(?<s>[A-L]$)") \|	(capture("^(?<r>\\d)(?<m>\\d)(?<s>[A-L]$)") \|
155		{
156		recorded: .r \| tonumber,
	364		{
	365		recorded: .r \| tonumber,
157	366	multiplexed: .m \| tonumber,	multiplexed: .m \| tonumber,
158	367	side: .s	side: .s
159	368	})//null;	})//null;
160	369
	370
	371		def as_sound_sample_rate:
	372		# plain sample rate as number
	373		# remove the first dot or comma, and ignore empty strings
	374		#
	375		# Example source data:
	376		# 1000
	377		# 10,000
	378		# 10000
	379		# 100000
	380		# 10200
	381		if (. \| length > 0) then . \| sub("[\\.,]"; "") \| tonumber else null end;
	382
	383		#
	384		# Assemble the object tree
	385		#
	386
161	387	# root	# root
162	388	{	{
163	389	# record number is unique, can be used as _id	# record number is unique, can be used as _id
164	390	record_number: .RN,	record_number: .RN,
165	391	note: .NT,	note: .NT,
166		# a lot of noise in the original field, only parsing date
	392		# a lot of noise in the "OD" original field, only parsing date
167	393	observation_date: .OD \| as_date,	observation_date: .OD \| as_date,
168	394	last_modified_date: .DA \| as_date,	last_modified_date: .DA \| as_date,
169	395	location: {	location: {
170		name: .GB \| split("\|") \| map(gsub("(\\s+)?[A-D][A-Z]\\d+[A-Z](\\s+)?\|(X$)"; ""; "gm")),
171		coordinates: .GC \| split("\|")
172		\| map(capture("(?<lat>[NS]{1}\\d{1,4})[A-Z]{1,2}(\\d{1,2})?([A-Z]{1})?\\s+(?<lon>[EW]{1}\\d{1,5})"))
173		\| map({ lat: (.lat \| as_coord), lon: (.lon \| as_coord) })
	396		name: .GB \| split("\|") \| map(as_location_name),
	397		coordinates: .GC \| split("\|") \| as_location_coordinates
174	398	},	},
175	399	# object contains properties of the captured signal	# object contains properties of the captured signal
176	400	signal: {	signal: {
177		# create a list of JSON objects and add them together
178
179		# Cue field contains 3 values describing the postion on tape
180		# Example input from the docu
181		# 542 B2:8 8.130
182		# 1:03:12 B2:8 8.130
183		# however, following formats are also found
184		# 0:00:00 B30:00 10:20.602
185		# 995 B11:28.497 5:20.426
186		# 96 B4.00 1.525
187		# 93 B23.7 9.164
188		# 93 B3:00 2:13.828
189		# 01:52:52:04
190		# 09:11:00 20:00 951.50
191		# 0 B2:00:00
192	401	position: [	position: [
193	402	# keep the source string as reference?	# keep the source string as reference?
194	403	{_source_cu: .CU},	{_source_cu: .CU},
195
196		# "cue" as in a first matched single integer,
197		# without dot or colon followed by space or end of string
198		# do not use \b because of the colon in 00:00 values
199		(.CU \| capture( "(?<c>^\\d+(\\s\|$))" ) \| {cue: .c\|tonumber } ),
200
201		# "time" as in first matched integer with 2 or 3 colons
202		# followed by space or end of string
203		(.CU \| capture( "(?<time>^\\d+:\\d+:\\d+(:\\d+)?(\\s\|$))" ) ),
204
205		# buffer size, B followed by integer with colon or dot,
206		# also remove B prefix
207		# TODO match 2 colon version
208		(.CU \| capture("(?<analyzer_buffer_size>(?<=B)\\d+[:\\.]\\d+(\\.\\d+)?)") )
	404		(.CU \| as_signal_position_cue),
	405		(.CU \| as_signal_position_time),
	406		(.CU \| as_signal_position_analyzer_buffer_size )
209	407	] \| add,	] \| add,
210		# cut size
211		# 3.36
212		# 9.411
213		# 16.564
214		# 20.35
215		# etc
216		# only 210 records use a different format, ignored for now
217		# 2:00.000
218		# 1:00.030
219		# 10:25.540
220		# 1:25.158
221		# etc.
222		cut_size: (
223		# set to null if empty or contains a colon
224		if (.CS \| contains(":") or (length == 0)) then
225		null
226		else
227		# cast as number and handle a few remaining badly formated
228		# records like "0.2.95"
229		(try (.CS \| tonumber) catch null)
230		end
231		),
232		# any digit in the signal class indicates quality
233		# it's only been used 123 times
	408		cut_size: .CS \| as_signal_cut_size,
234	409	_source_sc: .SC,	_source_sc: .SC,
235	410	quality: .SC \| as_signal_quality,	quality: .SC \| as_signal_quality,
236	411	class: .SC \| as_signal_class,	class: .SC \| as_signal_class,
237	412	overlap: .SC \| as_signal_overlap,	overlap: .SC \| as_signal_overlap,
238		# other general sound producing sources listed in genus field
239		source: ( .GS \| split("\|") \|
240		map(. as $s \| match("\\s+[E-Z]{1}(\\s+)?$"; "m") \|
241		{
242		"E": "Primates",
243		"O": "Crustacea",
244		"T": "Fossils",
245		"U": "Uncertain",
246		"V": "General pinniped",
247		"W": "General cetacean",
248		"X": "Ambient noise"
249		} as $order \|
250		{
251		name: $s[0:.offset] \| gsub("^\\s+\|\\s+$";""),
252		# not sort order
253		order: $order[.string \| gsub("^\\s+\|\\s+$";"")]
254		})
255		)
	413		source: .GS \| split("\|") \| as_signal_source,
256	414	},	},
257	415	sound: {	sound: {
258		# plain sample rate as number, however not normalized in digit length
259		# remove dot or colon, and ignore empty strings
260		# a bit difficult to tell what is hz and what khz
261		sample_rate: (
262		if (.SR \| length > 0) then
263		.SR \| sub("[\\.,]"; "") \| tonumber
264		else
265		null
266		end
267		),
	416		sample_rate: .SR \| as_sound_sample_rate,
268	417	channel: [	channel: [
269		{"_source_nc": .NC },
	418		{"_source_nc": .NC },
270	419	(.NC \| as_sound_channel)	(.NC \| as_sound_channel)
271	420	] \| add	] \| add
272	421	},	},
273	422	animal: {	animal: {
274	423	_source_id: .ID,	_source_id: .ID,
275	424	# List of vocal animals, name and species code	# List of vocal animals, name and species code
276		# All existing entries:
277		# FB145 #?? BD19D
278		# FB147 #?? BD19D
279		# FB150 #?? BD19D
280		# FB153 #50 Blacktip Doubledip BD19D
281		# FB34 #30 Wee Willie BD19D
282		# FB55 #159 BD19D
283		# FB5 #5 BD19D
284		# FB73 #35 BD19D
285		# Keiko
286		# Keiko BE7A
287		# Minks BF2A \| Jinks BF2A
288		# Moby Doll
289		# Moby Doll BE7A
290		# Olaf CB1A
291		# Snoopy BA2A
292		# The lark BE3B
293		# Wolfie CB1A \| Farouk CB1A
294		#
295		# create array with objects for each animal
296		# save input as fallback and split by \|
297		vocal: ( .ID as $input \| $input \| split("\|") \|
298		# try to match species code
299		map(. as $s \| match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") \|
300		# create object, anything before matched species code is id
301		# also trim space from resulting string
302		{
303		animal_id: $s[0:.offset] \| gsub("^\\s+\|\\s+$";""),
304		species_code: .string \| gsub("^\\s+\|\\s+$";"")
305		}) \|
306		# if no object was created, use input as fallback
307		# this is for entries without a species code like "Keiko"
308		if (. == [] and ($input\|length)>0 ) then
309		[{animal_id: $input}]
310		else
311		.
312		end
313		),
	425		vocal: .ID \| as_animal_vocal,
314	426	# age, sex and id, a animal profile	# age, sex and id, a animal profile
315	427	profile: .AG \| as_animal_profile,	profile: .AG \| as_animal_profile,
316	428	# interaction between animals	# interaction between animals
317	429	interaction: .IA \| split("\|") \| map([as_animal_interaction]),	interaction: .IA \| split("\|") \| map([as_animal_interaction]),
318		# behavior type and species code
319	430	# species code not always present, use input as fallback	# species code not always present, use input as fallback
320	431	behavior: .BH \| split("\|") \| map(as_animal_behavior),	behavior: .BH \| split("\|") \| map(as_animal_behavior),
321	432	# Genus name and species code	# Genus name and species code
322		genus: ( .GS \| split("\|") \|
323		map(. as $s \| match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") \|
324		{
325		name: $s[0:.offset] \| gsub("^\\s+\|\\s+$";""),
326		species_code: .string \| gsub("^\\s+\|\\s+$";"")
327		})
328		)
	433		genus: .GS \| split("\|") \| as_animal_genus,
329	434	# Species	# Species
330		#species: .GS \| split("\|") \|
331		# map(. as $s \|
332		# {
333		# _as_noted: $s \| gsub("^\\s+\|\\s+$";""),
334		# species_code: $s \| as_species_code,
335		# scientific_name: $s \| as_species_sci_name,
336		# common_name: $s \| as_species_common_name,
337		# })
	435		# species: .GS \| split("\|") \| as_animal_species,
338	436	}	}
339	437	}	}

Commit fc4a8157a6902f4571b54c6ab84174f005adbe0d - more conversion to functions. match old output for now
Author: dleucas
Author date (UTC): 2021-12-18 10:26
Committer name: dleucas
Committer date (UTC): 2021-12-18 10:26
Parent(s): 32badc3512dd9094d51ba2cc2ef8112eba2698bf
Signing key:
Tree: 4670619fe3d8629ba4bccf529c8e2055ce108550

File	Lines added	Lines deleted
transform.jq	125	98

File transform.jq changed (mode: 100755) (index 16bc0b2..cd2aa89)
...	...	def as_coord:
39	39	end;	end;
40	40
41	41	def as_date:	def as_date:
42		capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") \| .date \| strptime("%d-%B-%Y") \| todateiso8601;
	42		(capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") \| .date \| strptime("%d-%B-%Y") \| todateiso8601)//null;
43	43
44	44	def as_signal_overlap:	def as_signal_overlap:
45		{
	45		({
46	46	"OF": "Frequency",	"OF": "Frequency",
47	47	"OT": "Time",	"OT": "Time",
48	48	"OTF": "Time and Frequency",	"OTF": "Time and Frequency",
49	49	"N": "No"	"N": "No"
50		} as $overlap_type \| capture("(?<o>O[TF]{1,2}\|N)") \| $overlap_type[.o]?;
	50		} as $overlap_type \| capture("(?<o>O[TF]{1,2}\|N)") \| $overlap_type[.o]?)//null ;
51	51
52	52	def as_species_code:	def as_species_code:
53		capture("(?<code>[A-C][A-Z]\\d+[A-Z])") \| .code;
	53		(capture("(?<code>[A-C][A-Z]\\d+[A-Z])") \| .code)//null;
54	54
55	55	def as_species_common_name:	def as_species_common_name:
56	56	as_species_code \| $species_common_names[0][.?];	as_species_code \| $species_common_names[0][.?];

...	...	def as_species_common_name:
58	58	def as_species_sci_name:	def as_species_sci_name:
59	59	as_species_code \| $species_sci_names[0][.?];	as_species_code \| $species_sci_names[0][.?];
60	60
	61		def as_animal_interaction:
	62		# interaction between animals
	63		# always a pair, and multiple sets of pairs are possible
	64		# [[{}, {}]] or [[{} {}], [{} {}], ...]
	65		#
	66		# Example source data:
	67		# FCFB147 FCFB145
	68		# FCFB147 FCFB145 \| FFFB147 FFFB149 \| FFFB145 FFFB149 \| FCFB153 FCFB150
	69		# FCFB153 FCFB150
	70		# FCFB5 FCFB55
	71		# FCFB73 FCFB34
	72		capture("(?<type>[FMC]{2})(?<animal_id>FB\\d+)"; "g")//null;
	73
	74		def as_animal_profile:
	75		# age, sex and id, a animal profile
	76		# ignoring species code
	77		#
	78		# Example source data:
	79		# F03FB55 F1986FB55
	80		# F26FB5 F1963FB5
	81		# F??FB145 F????FB145
	82		# F??FB147 F????FB147
	83		# F??FB153 F????FB153
	84		# F??FB73 F????FB73
	85		# F??FB73 F????FB7370
	86		# M05FB150 M1984FB150
	87		# M17Keiko M1975Keiko BE7A
	88		# M17Keiko M1975Keiko BE7A
	89		# M??FB34 M????FB34
	90		# M??FB73 M????FB73
	91		capture("^(?<sex>[FM])" +
	92		"(?<age>[\\?\\d]{2})" +
	93		"(?<animal_id>(FB\\d+\|\\w+))" +
	94		"\\s+" +
	95		"[FM](?<birth_year>[\\d\\?]{4})")//null \|
	96		{"F": "Female", "M": "Male"} as $sex \|
	97		{
	98		sex: (if (.sex != null) then $sex[.sex] else null end),
	99		age: (try (.age \| tonumber) catch null),
	100		animal_id: .animal_id,
	101		birth_year: (try (.birth_year \| tonumber) catch null)
	102		};
	103
	104		def as_animal_behavior:
	105		# Behavior of the recorded animal with species code
	106		# species code not always present, use input as fallback
	107		#
	108		# Example source data:
	109		# Approaching ship BA2A
	110		# BA2A A few larger whales seen mixed with others
	111		# BE7A
	112		# Bow riding BD17A
	113		# Courtship CB1A
	114		# Dive BA2A
	115		# Feeding AA3A
	116
	117		# find the species code position and use the text before as behavior
	118		# match() returns "empty" which we can not test with if
	119		. as $b \| match("[A-C][A-Z]\\d+[A-Z]([\\s\\.]+)?$"; "m")//false \|
	120		if . then
	121		{
	122		type_of: ($b[0:.offset] \| gsub("^\\s+\|\\s+$";"") \| if (.\|length) > 0 then . else null end),
	123		species_code: .string \| gsub("^\\s+\|\\s+$";"")
	124		}
	125		# fallback without species code
	126		else
	127		{ type_of: $b \| gsub("^\\s+\|\\s+$";"") }
	128		end;
	129
	130		def as_signal_quality:
	131		(capture("(?<q>\\d+)") \| .q \| tonumber)//null;
	132
	133		def as_signal_class:
	134		# class name lookup table
	135		({
	136		"S": "Signature",
	137		"M": "Mimic",
	138		"V": "Variant",
	139		"D": "Deletion",
	140		"U": "Uncharacteristic",
	141		"C": "Calf"
	142		} as $class_names \| capture("(?<c>[SMVDUC]{1})") \| $class_names[.c]?)//null;
	143
	144		def as_sound_channel:
	145		# numbers of channels
	146		# input data mostly follows the documentation:
	147		# 11A
	148		# 11B
	149		# 41D
	150		# 21L
	151		# regex will match only those
	152		# not clear what other input values mean exactly
	153		# 211
	154		(capture("^(?<r>\\d)(?<m>\\d)(?<s>[A-L]$)") \|
	155		{
	156		recorded: .r \| tonumber,
	157		multiplexed: .m \| tonumber,
	158		side: .s
	159		})//null;
	160
61	161	# root	# root
62	162	{	{
63	163	# record number is unique, can be used as _id	# record number is unique, can be used as _id

...	...	def as_species_sci_name:
131	231	),	),
132	232	# any digit in the signal class indicates quality	# any digit in the signal class indicates quality
133	233	# it's only been used 123 times	# it's only been used 123 times
134		# note enclosing [] instead of (), otherwise capture() will remove
135		# non-matching items
136	234	_source_sc: .SC,	_source_sc: .SC,
137		quality: [ .SC \| capture("(?<q>\\d+)") \| .q \| tonumber ] \| .[0],
138		class: [
139		# class name lookup table
140		{
141		"S": "Signature",
142		"M": "Mimic",
143		"V": "Variant",
144		"D": "Deletion",
145		"U": "Uncharacteristic",
146		"C": "Calf"
147		} as $class_names \|
148		[ .SC \| capture("(?<c>[SMVDUC]{1})") ] \| $class_names[.[0].c]?
149		] \| .[0],
	235		quality: .SC \| as_signal_quality,
	236		class: .SC \| as_signal_class,
150	237	overlap: .SC \| as_signal_overlap,	overlap: .SC \| as_signal_overlap,
151	238	# other general sound producing sources listed in genus field	# other general sound producing sources listed in genus field
152	239	source: ( .GS \| split("\|") \|	source: ( .GS \| split("\|") \|

...	...	def as_species_sci_name:
178	265	null	null
179	266	end	end
180	267	),	),
181		# numbers of channels
182		# input data mostly follows the documentation:
183		# 11A
184		# 11B
185		# 41D
186		# 21L
187		# regex will match only those
188		# not clear what other input values mean exactly
189		# 211
190	268	channel: [	channel: [
191		{_source_nc: .NC},
192		(
193		.NC \|
194		capture("^(?<r>\\d)(?<m>\\d)(?<s>[A-L]$)") \|
195		{
196		recorded: .r \| tonumber,
197		multiplexed: .m \| tonumber,
198		side: .s
199		}
200		)
	269		{"_source_nc": .NC },
	270		(.NC \| as_sound_channel)
201	271	] \| add	] \| add
202	272	},	},
203	273	animal: {	animal: {

...	...	def as_species_sci_name:
242	312	end	end
243	313	),	),
244	314	# age, sex and id, a animal profile	# age, sex and id, a animal profile
245		# source data, ignoring species code
246		# F03FB55 F1986FB55
247		# F26FB5 F1963FB5
248		# F??FB145 F????FB145
249		# F??FB147 F????FB147
250		# F??FB153 F????FB153
251		# F??FB73 F????FB73
252		# F??FB73 F????FB7370
253		# M05FB150 M1984FB150
254		# M17Keiko M1975Keiko BE7A
255		# M17Keiko M1975Keiko BE7A
256		# M??FB34 M????FB34
257		# M??FB73 M????FB73
258		profile: [.AG \|
259		capture("^(?<sex>[FM])" +
260		"(?<age>[\\?\\d]{2})" +
261		"(?<animal_id>(FB\\d+\|\\w+))" +
262		"\\s+" +
263		"[FM](?<birth_year>[\\d\\?]{4})")
264		] \| .[0] \|
265		(
266		{"F": "Female", "M": "Male"} as $sex \|
267		{
268		sex: (if (.sex != null) then $sex[.sex] else null end),
269		age: (try (.age \| tonumber) catch null),
270		animal_id: .animal_id,
271		birth_year: (try (.birth_year \| tonumber) catch null)
272		}
273		),
	315		profile: .AG \| as_animal_profile,
274	316	# interaction between animals	# interaction between animals
275		# always a pair, and multiple sets of pairs are possible
276		# [[{}, {}]] or [[{} {}], [{} {}], ...]
277		# source data
278		# FCFB147 FCFB145
279		# FCFB147 FCFB145 \| FFFB147 FFFB149 \| FFFB145 FFFB149 \| FCFB153 FCFB150
280		# FCFB153 FCFB150
281		# FCFB5 FCFB55
282		# FCFB73 FCFB34
283		interaction: ( .IA \| split("\|") \|
284		map([capture("(?<type>[FMC]{2})(?<animal_id>FB\\d+)"; "g")])
285		),
	317		interaction: .IA \| split("\|") \| map([as_animal_interaction]),
286	318	# behavior type and species code	# behavior type and species code
287	319	# species code not always present, use input as fallback	# species code not always present, use input as fallback
288		behavior: ( .BH \| split("\|") \|
289		# match() returns "empty" which we can not test with if
290		map(. as $b \| match("[A-C][A-Z]\\d+[A-Z]([\\s\\.]+)?$"; "m")//false \|
291		if . then
	320		behavior: .BH \| split("\|") \| map(as_animal_behavior),
	321		# Genus name and species code
	322		genus: ( .GS \| split("\|") \|
	323		map(. as $s \| match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") \|
292	324	{	{
293		type_of: ($b[0:.offset] \| gsub("^\\s+\|\\s+$";"") \| if (.\|length) > 0 then . else null end),
	325		name: $s[0:.offset] \| gsub("^\\s+\|\\s+$";""),
294	326	species_code: .string \| gsub("^\\s+\|\\s+$";"")	species_code: .string \| gsub("^\\s+\|\\s+$";"")
295		}
296		# fallback without species code
297		else
298		{ type_of: $b \| gsub("^\\s+\|\\s+$";"") }
299		end
300		)
301		),
302		# Genus
303		species: .GS \| split("\|") \|
304		map(. as $s \|
305		{
306		_as_noted: $s \| gsub("^\\s+\|\\s+$";""),
307		species_code: $s \| as_species_code,
308		scientific_name: $s \| as_species_sci_name,
309		common_name: $s \| as_species_common_name,
310	327	})	})
	328		)
	329		# Species
	330		#species: .GS \| split("\|") \|
	331		# map(. as $s \|
	332		# {
	333		# _as_noted: $s \| gsub("^\\s+\|\\s+$";""),
	334		# species_code: $s \| as_species_code,
	335		# scientific_name: $s \| as_species_sci_name,
	336		# common_name: $s \| as_species_common_name,
	337		# })
311	338	}	}
312	339	}	}

Commit 32badc3512dd9094d51ba2cc2ef8112eba2698bf - WIP convert filters to functions
Author: dleucas
Author date (UTC): 2021-12-16 18:33
Committer name: dleucas
Committer date (UTC): 2021-12-16 18:33
Parent(s): e755dc7f4fe2d7c9b97826a0f3f2cf5385e90ef9
Signing key:
Tree: 48b4fd03c27e16ff57d5ab82c8e15f4616703415

File	Lines added	Lines deleted
transform.jq	33	24

File transform.jq changed (mode: 100755) (index 0016ceb..16bc0b2)
4	4	# Source data combines multiple values into one field, so split that up	# Source data combines multiple values into one field, so split that up
5	5	# also use native data types if possible.	# also use native data types if possible.
6	6
	7		import "./data/species.sci.names" as $species_sci_names;
	8		import "./data/species.common.names" as $species_common_names;
	9
7	10	# Convert Degree.Minute coordinates into decimal notation	# Convert Degree.Minute coordinates into decimal notation
8	11	def as_coord:	def as_coord:
9	12	# Example W073 or W70, degree only, negate	# Example W073 or W70, degree only, negate

...	...	def as_coord:
35	38	null	null
36	39	end;	end;
37	40
	41		def as_date:
	42		capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") \| .date \| strptime("%d-%B-%Y") \| todateiso8601;
	43
	44		def as_signal_overlap:
	45		{
	46		"OF": "Frequency",
	47		"OT": "Time",
	48		"OTF": "Time and Frequency",
	49		"N": "No"
	50		} as $overlap_type \| capture("(?<o>O[TF]{1,2}\|N)") \| $overlap_type[.o]?;
	51
	52		def as_species_code:
	53		capture("(?<code>[A-C][A-Z]\\d+[A-Z])") \| .code;
	54
	55		def as_species_common_name:
	56		as_species_code \| $species_common_names[0][.?];
	57
	58		def as_species_sci_name:
	59		as_species_code \| $species_sci_names[0][.?];
	60
38	61	# root	# root
39	62	{	{
40	63	# record number is unique, can be used as _id	# record number is unique, can be used as _id
41	64	record_number: .RN,	record_number: .RN,
42	65	note: .NT,	note: .NT,
43	66	# a lot of noise in the original field, only parsing date	# a lot of noise in the original field, only parsing date
44		observation_date: [
45		.OD \| capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") \| .date \|
46		strptime("%d-%B-%Y") \| todateiso8601
47		] \| .[0],
48		last_modified_date: [
49		.DA \| capture("^(?<date>\\d{1,2}-\\w{3}-\\d{4})") \| .date \|
50		strptime("%d-%B-%Y") \| todateiso8601
51		] \| .[0],
	67		observation_date: .OD \| as_date,
	68		last_modified_date: .DA \| as_date,
52	69	location: {	location: {
53	70	name: .GB \| split("\|") \| map(gsub("(\\s+)?[A-D][A-Z]\\d+[A-Z](\\s+)?\|(X$)"; ""; "gm")),	name: .GB \| split("\|") \| map(gsub("(\\s+)?[A-D][A-Z]\\d+[A-Z](\\s+)?\|(X$)"; ""; "gm")),
54	71	coordinates: .GC \| split("\|")	coordinates: .GC \| split("\|")

...	...	def as_coord:
130	147	} as $class_names \|	} as $class_names \|
131	148	[ .SC \| capture("(?<c>[SMVDUC]{1})") ] \| $class_names[.[0].c]?	[ .SC \| capture("(?<c>[SMVDUC]{1})") ] \| $class_names[.[0].c]?
132	149	] \| .[0],	] \| .[0],
133		overlap: [
134		# overlap lookup table
135		{
136		"OF": "Frequency",
137		"OT": "Time",
138		"OTF": "Time and Frequency",
139		"N": "No"
140		} as $overlap_type \|
141		[ .SC \| capture("(?<o>O[TF]{1,2}\|N)") ] \| $overlap_type[.[0].o]?
142		] \| .[0],
	150		overlap: .SC \| as_signal_overlap,
143	151	# other general sound producing sources listed in genus field	# other general sound producing sources listed in genus field
144	152	source: ( .GS \| split("\|") \|	source: ( .GS \| split("\|") \|
145	153	map(. as $s \| match("\\s+[E-Z]{1}(\\s+)?$"; "m") \|	map(. as $s \| match("\\s+[E-Z]{1}(\\s+)?$"; "m") \|

...	...	def as_coord:
291	299	end	end
292	300	)	)
293	301	),	),
294		# Genus name and species code
295		genus: ( .GS \| split("\|") \|
296		map(. as $s \| match("[A-C][A-Z]\\d+[A-Z](\\s+)?$"; "m") \|
	302		# Genus
	303		species: .GS \| split("\|") \|
	304		map(. as $s \|
297	305	{	{
298		name: $s[0:.offset] \| gsub("^\\s+\|\\s+$";""),
299		species_code: .string \| gsub("^\\s+\|\\s+$";"")
	306		_as_noted: $s \| gsub("^\\s+\|\\s+$";""),
	307		species_code: $s \| as_species_code,
	308		scientific_name: $s \| as_species_sci_name,
	309		common_name: $s \| as_species_common_name,
300	310	})	})
301		),
302	311	}	}
303	312	}	}

Commit e755dc7f4fe2d7c9b97826a0f3f2cf5385e90ef9 - convert html only once. extract species names as json. formating and lint.
Author: dleucas
Author date (UTC): 2021-12-16 13:35
Committer name: dleucas
Committer date (UTC): 2021-12-16 13:35
Parent(s): 572dbf1eaffe17c43a4a01dc9675737628c5a234
Signing key:
Tree: d38fbed1887eb4311ba66514c5cdd48665677183

File	Lines added	Lines deleted
transform.sh	29	8

File transform.sh changed (mode: 100755) (index 64f7fc5..1ea333d)
1	1	#!/bin/bash	#!/bin/bash
2		set -e # abort on any errors
	2		set -eo pipefail
	3		# set -x
	4
	5		test -e "$(command -v xidel)" \|\| (
	6		echo "ERR: Need xidel from https://www.videlibri.de/xidel.html"
	7		exit 1
	8		)
	9		test -e "$(command -v jq)" \|\| (
	10		echo "ERR: Need jq from https://stedolan.github.io/jq/"
	11		exit 1
	12		)
	13
	14		# Mapping of species id to common and scientific name
	15
	16		tail -n+56 data/species.map \| jq -cR 'split("\t") as $row \| {($row[0]): ($row[1])}' \| jq -cs add >data/species.sci.names.json
	17		head -n 55 data/species.map \| jq -cR 'split("\t") as $row \| {($row[0]): ($row[1])}' \| jq -cs add >data/species.common.names.json
3	18
4	19	# Transform HTML metadata from source site into JSON	# Transform HTML metadata from source site into JSON
5	20
6		# for xpath
	21		# for xpath
7	22	XIDEL='xidel -s --input-format=html --output-format=json-wrapped'	XIDEL='xidel -s --input-format=html --output-format=json-wrapped'
8	23
9	24	# select all rows from the 2nd table element	# select all rows from the 2nd table element

...	...	XPATH_ENTRY='/html/body/table[2]/tbody/tr/td'
20	35	# "SR:": "3400",	# "SR:": "3400",
21	36	# "CS:": "3.388",	# "CS:": "3.388",
22	37	# ...	# ...
23		#}
	38		#}
24	39	# The jq filter explained	# The jq filter explained
25	40	# 1. assign the whole array to $row	# 1. assign the whole array to $row
26	41	# 2. create a range with a step of 2 over the length of the array, 0,2,4,...	# 2. create a range with a step of 2 over the length of the array, 0,2,4,...
27	42	# 3. create a object and use the range as index for the $row elements	# 3. create a object and use the range as index for the $row elements
28	43	# 3.5 remove right most colon from key	# 3.5 remove right most colon from key
29	44	# 4. combine the list of objects into a single object with "add"	# 4. combine the list of objects into a single object with "add"
	45
	46		# shellcheck disable=SC2016
30	47	JQ_ARR2OBJ='[ .[] as $row \| range(0; $row\|length; 2) \| {( $row[.] \| rtrimstr(":")): ($row[.+1]) } ] \| add'	JQ_ARR2OBJ='[ .[] as $row \| range(0; $row\|length; 2) \| {( $row[.] \| rtrimstr(":")): ($row[.+1]) } ] \| add'
31	48
	49		test -d data/rn \|\| mkdir -p data/rn
32	50
33		while read RN
34		do
35		$XIDEL --xpath "$XPATH_ENTRY" "raw/rn/metaData.cfm?RN=$RN" \| jq "$JQ_ARR2OBJ" > "data/rn/$RN.json"
36		done < data/retrieval.numbers
	51		while read -r RN; do
	52		# input should exist
	53		test -f "raw/rn/metaData.cfm?RN=$RN" \|\| continue
	54		# output should not exist
	55		test -f "data/rn/$RN.json" && continue
	56		$XIDEL --xpath "$XPATH_ENTRY" "raw/rn/metaData.cfm?RN=$RN" \| jq -c "$JQ_ARR2OBJ" >"data/rn/$RN.json"
	57		done <data/retrieval.numbers
37	58
38	59	# transform all records with jq, this is where the magic happens	# transform all records with jq, this is where the magic happens
39		./transform.jq data/rn/*json > data/transformed.json
	60		./transform.jq data/rn/*json >data/transformed.json

Commit 572dbf1eaffe17c43a4a01dc9675737628c5a234 - download once. use wget only. get species names. test for commands. formating
Author: dleucas
Author date (UTC): 2021-12-16 12:14
Committer name: dleucas
Committer date (UTC): 2021-12-16 12:14
Parent(s): c3f9f9f9d9501e714117af7fff573e7f3fa4052b
Signing key:
Tree: 7e60b812e2b8cd029693b78a7fb14fad3add2e04

File	Lines added	Lines deleted
download.sh	77	31

File download.sh changed (mode: 100755) (index d154c61..321faae)
1		#!/bin/bash
	1		#!/bin/bash
	2		#
	3		# Scrape metadata from Woods Hole Oceanographic Institution
	4		#
	5		# Download of all audio files is left as exercise to the reader
	6		#
	7		# - Grab species index from "All Cuts" page
	8		# - Grab all audio cuts for every species for all listed years
	9		# - Grab metadata pop-up for every audio cut
	10		#
	11		# https://whoicf2.whoi.edu/science/B/whalesounds/metaData.cfm?RN=91008005
	12		#
	13		# Using xmllint XPath 1.0 for parsing, because it continues on broken HTML.
	14		#
	15		#
	16		# SPDX-License-Identifier: GPL-3.0-or-later
	17		#
	18		# Copyright (C) 2021 leuc
	19		#
	20		# This program is free software: you can redistribute it and/or modify it under the
	21		# terms of the GNU Affero General Public License as published by the Free Software
	22		# Foundation, either version 3 of the License, or (at your option) any later
	23		# version.
	24		#
	25		# This program is distributed in the hope that it will be useful, but WITHOUT ANY
	26		# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
	27		# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
	28		#
	29		# You should have received a copy of the GNU Affero General Public License along
	30		# with this program. If not, see <https://www.gnu.org/licenses/>.
2	31
3	32	set -e	set -e
4		# set -x
5	33
6		URL='http://cis.whoi.edu/science/B/whalesounds'
7		CURL='curl -s'
	34		test -e "$(command -v wget)" || (
	35		echo "ERR: Please install wget"
	36		exit 1
	37		)
	38		test -e "$(command -v xmllint)" || (
	39		echo "ERR: Please install xmllint from libxml2-utils"
	40		exit 1
	41		)
	42
	43		URL='https://cis.whoi.edu/science/B/whalesounds'
	44		WGET='wget -nv -nc'
8	45	XMLLINT='xmllint --recover --html'	XMLLINT='xmllint --recover --html'
9	46
	47		# Ensure target dir
	48		test -d raw || mkdir raw
	49		test -d data || mkdir data
	50
10	51	# Download Index page listing all search options	# Download Index page listing all search options
11		# $CURL "$URL/fullCuts.cfm" > raw/fullCuts.cfm
	52		$WGET "$URL/fullCuts.cfm" -O raw/fullCuts.cfm || true
	53
	54		# Extract catalog IDs for each mammal from HTML drop down
	55		XPATH_SP='//select[@id="getSpecies"]/option[not(contains(text(),"Select"))]/@value'
	56		$XMLLINT -xpath "${XPATH_SP}" raw/fullCuts.cfm 2>/dev/null | grep -oP 'SP=\K(\w+)' >data/species.list
12	57
13		# Extract catalog IDs for each mammal
14		XPATH_SP='//select[@id="getSpecies"]/option/@value'
15		$XMLLINT -xpath $XPATH_SP raw/fullCuts.cfm 2>/dev/null | grep -oP 'SP=\K(\w+)' | sort -u > data/species.ids
	58		# Extract mammal names from HTML drop down
	59		XPATH_NAME='//select[@id="getSpecies"]/option[not(contains(text(),"Select"))]/text()'
	60		$XMLLINT --xpath "${XPATH_NAME}" raw/fullCuts.cfm 2>/dev/null | sed '/^\s$/d' | sed 's/^\s//g' | sed 's/\s*$//g' >data/species.names
	61
	62		# Make unique list of IDs
	63		sort -u data/species.list >data/species.ids
	64
	65		# Merge ids and names for later mapping in transform.sh
	66		paste data/species.list data/species.names >data/species.map
16	67
17	68	# create a list of pages to download	# create a list of pages to download
18		while read SP
19		do
20		echo "$URL/fullCuts.cfm?SP=$SP&YR=-1"
21		done < data/species.ids > data/species.urls
	69		while read -r SP; do
	70		echo "$URL/fullCuts.cfm?SP=$SP&YR=-1"
	71		done <data/species.ids >data/species.urls
22	72
23	73	# Download all pages	# Download all pages
24		# wget -P raw/sp/ -i data/species.urls
	74		$WGET -P raw/sp/ -i data/species.urls
25	75
26	76	# create a list of pages for each year and species	# create a list of pages for each year and species
27	77	XPATH_YR='//select[@id="pickYear"]/option/@value'	XPATH_YR='//select[@id="pickYear"]/option/@value'
28		while read SP
29		do
30		YEARS=$($XMLLINT -xpath "$XPATH_YR" "raw/sp/fullCuts.cfm?SP=$SP&YR=-1" 2>/dev/null | grep -oP 'YR=\K(\d+)' | sort -u)
31		for YEAR in $YEARS; do
32		echo "$URL/fullCuts.cfm?SP=$SP&YR=$YEAR"
33		done
34		done < data/species.ids > data/species.year.urls
	78		while read -r SP; do
	79		YEARS=$($XMLLINT -xpath "$XPATH_YR" "raw/sp/fullCuts.cfm?SP=$SP&YR=-1" 2>/dev/null | grep -oP 'YR=\K(\d+)' | sort -u)
	80		for YEAR in $YEARS; do
	81		echo "$URL/fullCuts.cfm?SP=$SP&YR=$YEAR"
	82		done
	83		done <data/species.ids >data/species.year.urls
35	84
36		# wget -P raw/spyr/ -i data/species.year.urls
	85		$WGET -P raw/spyr/ -i data/species.year.urls
37	86
38	87	# Extract retrieval number from all sp/year pages	# Extract retrieval number from all sp/year pages
39	88	XPATH_RN='//table//tr//td[5]/a/@href'	XPATH_RN='//table//tr//td[5]/a/@href'
40		for F in raw/spyr/fullCuts.cfm*
41		do
42		$XMLLINT -xpath "$XPATH_RN" "$F" 2>/dev/null | grep -oP 'WhaleSounds/\K([\da-zA-Z]+)'
43		done | sort -u > data/retrieval.numbers
	89		for F in raw/spyr/fullCuts.cfm*; do
	90		$XMLLINT -xpath "$XPATH_RN" "$F" 2>/dev/null | grep -oP 'WhaleSounds/\K([\da-zA-Z]+)'
	91		done | sort -u >data/retrieval.numbers
44	92
45	93	# Create list of URLs to download	# Create list of URLs to download
46		while read RN
47		do
48		echo "$URL/metaData.cfm?RN=$RN"
49		done < data/retrieval.numbers > data/retrieval.urls
50
51		# wget -P raw/rn/ -i data/retrieval.urls
	94		while read -r RN; do
	95		echo "$URL/metaData.cfm?RN=$RN"
	96		done <data/retrieval.numbers >data/retrieval.urls
52	97
	98		$WGET -P raw/rn/ -i data/retrieval.urls