RocketGit

dleucas / wmmsdb (public) (License: GPLv3) (since 2018-07-08) (hash sha1)

A collection of scripts to download, transform and normalize the Watkins Marine Mammal Sound Database.

Credit:

“Watkins Marine Mammal Sound Database, Woods Hole Oceanographic Institution.”

http://cis.whoi.edu/science/B/whalesounds/index.cfm

Clone URLs: https://rocketgit.com/user/dleucas/wmmsdb ssh://rocketgit@ssh.rocketgit.com/user/dleucas/wmmsdb git://git.rocketgit.com/user/dleucas/wmmsdb

master species_names

/transform.sh (1ea333df2ffb1ce5f19b88a78db75302de126eb9) (2055 bytes) (mode 100755) (type blob)

#!/bin/bash
# Converts the downloaded metadata into JSON: builds the species name maps,
# turns each raw metadata page into data/rn/<RN>.json and finally runs
# ./transform.jq over all of those files.
set -eo pipefail
# set -x

# Abort the script when a required external command is not available.
# Arguments: $1 - command name, $2 - URL where the tool can be obtained
# Outputs:   an error message on stderr when the command is missing
# Returns:   0 when the command is found; otherwise exits the script with 1
require_tool() {
  command -v "$1" >/dev/null 2>&1 && return 0
  # Diagnostics belong on stderr, and the exit happens in the main shell so it
  # does not depend on `set -e` propagating a subshell's status.
  echo "ERR: Need $1 from $2" >&2
  exit 1
}

require_tool xidel https://www.videlibri.de/xidel.html
require_tool jq https://stedolan.github.io/jq/

# Species id -> name lookup tables, built from the tab-separated
# data/species.map: its first 55 lines feed the common-name map, everything
# from line 56 onwards feeds the scientific-name map.

# Reads "key<TAB>value" lines on stdin and prints them merged into a single
# compact JSON object on stdout.
tsv_to_json_object() {
  jq -cR 'split("\t") as $row | {($row[0]): ($row[1])}' | jq -cs add
}

tail -n+56 data/species.map | tsv_to_json_object >data/species.sci.names.json
head -n 55 data/species.map | tsv_to_json_object >data/species.common.names.json

# Transform HTML metadata from source site into JSON

# xidel invocation used for the xpath extraction; kept as an array so each
# option is passed as exactly one argument without relying on word splitting
XIDEL=(xidel -s --input-format=html --output-format=json-wrapped)

# select all cells (td) of all rows from the 2nd table element
XPATH_ENTRY='/html/body/table[2]/tbody/tr/td'

# with the xpath xidel returns a single JSON array with all table keys and values
# [["RN:","99002005","CU:","0  B2:30  1:55.149","NC:","11A","SR:","3400",... ]

# chunk the array into pairs of two and combine into a JSON object with key: value
#{
#  "RN:": "99002005",
#  "CU:": "0  B2:30  1:55.149",
#  "NC:": "11A",
#  "SR:": "3400",
#  "CS:": "3.388",
# ...
#}
# The jq filter explained
# 1. assign the whole array to $row
# 2. create a range with a step of 2 over the length of the array, 0,2,4,...
# 3. create an object and use the range as index for the $row elements
# 3.5 remove the trailing colon from the key
# 4. combine the list of objects into a single object with "add"

# shellcheck disable=SC2016
JQ_ARR2OBJ='[ .[] as $row | range(0; $row|length; 2) | {( $row[.] | rtrimstr(":")): ($row[.+1]) } ] | add'

mkdir -p data/rn

# data/retrieval.numbers holds one retrieval number (RN) per line; the
# `|| [[ -n "$RN" ]]` part keeps a final line that lacks a trailing newline.
while read -r RN || [[ -n "$RN" ]]; do
  src="raw/rn/metaData.cfm?RN=$RN"
  dst="data/rn/$RN.json"

  # input should exist
  [[ -f "$src" ]] || continue
  # output should not exist
  [[ ! -f "$dst" ]] || continue

  # Convert into a scratch file first and only move it into place on success.
  # Redirecting straight to $dst would leave an empty or truncated file behind
  # when xidel or jq fails, and the "output should not exist" check above would
  # then skip that record on every later run. The ".part" suffix keeps scratch
  # files out of the data/rn/*json glob used further down.
  if "${XIDEL[@]}" --xpath "$XPATH_ENTRY" "$src" | jq -c "$JQ_ARR2OBJ" >"$dst.part"; then
    mv -- "$dst.part" "$dst"
  else
    status=$?
    rm -f -- "$dst.part"
    echo "ERR: Could not convert $src" >&2
    exit "$status"
  fi
done <data/retrieval.numbers

# transform all records with jq, this is where the magic happens
./transform.jq data/rn/*json >data/transformed.json

Mode	Type	Size	Ref	File
100644	blob	71	0fe742dd5c99d4e4052b819da0946861ca4aa57f	.gitignore
100644	blob	2018	4b92f9633ec36c392eb6fa701c5fc18cdc1513e2	DATA.md
100755	blob	1600	76074bacbc6e3e7fa33edfb6fe8d79d9396e062f	GeoJSON.jq
100644	blob	1182	e510d9c801ec6a6f98efaf9a5a053b3d94747458	README.md
100644	blob	865	6ac29799fea3cd2dd8c0e8116a12e6da93809572	TODO.md
100755	blob	3398	321faaee956e307405652c0ba6a88045e61c1a69	download.sh
100644	blob	218	9100d4eb109a354733264a3b989d0de699db3c9c	index.jq
100644	blob	7797	5fe23ec00ab7d3de76f49d60dad3e06dabb614cd	index.mapping.json
100755	blob	482	7abb8bcf9a49d7c849ab50538e47c9033a921f28	index.sh
040000	tree	-	60e19bf3f6e7f2fba709ae362e565cf2df36ac29	snd
040000	tree	-	4f397e3702b225c714af48ecf5f1591d8458c248	srv
100755	blob	21951	e3f07b4ac26e762d889379ee3bc435fef0b2a338	transform.jq
100755	blob	2055	1ea333df2ffb1ce5f19b88a78db75302de126eb9	transform.sh
040000	tree	-	c86df441474ace769ecfa63de94d3adc1d75995a	webroot

Hints:
Before your first commit, do not forget to set up your git environment:

git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):

git clone https://rocketgit.com/user/dleucas/wmmsdb

Clone this repository using ssh (do not forget to upload a key first):

git clone ssh://rocketgit@ssh.rocketgit.com/user/dleucas/wmmsdb

Clone this repository using git:

git clone git://git.rocketgit.com/user/dleucas/wmmsdb

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a merge request:

... clone the repository ...
... make some changes and some commits ...
git push origin main