/transform.sh (1ea333df2ffb1ce5f19b88a78db75302de126eb9) (2055 bytes) (mode 100755) (type blob)
#!/bin/bash
set -eo pipefail
# set -x
# Abort with exit status 1 when a required external program is not available.
# Arguments: $1 - program name looked up with "command -v"
#            $2 - URL printed as the place to obtain the program
# Outputs:   an "ERR: ..." line on stderr when the program is missing
# The check runs in the current shell (no subshell), so "exit" ends the
# script by itself instead of depending on "set -e" to propagate the failure.
require_tool() {
  local tool=$1 url=$2
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf 'ERR: Need %s from %s\n' "$tool" "$url" >&2
    exit 1
  fi
}
require_tool xidel https://www.videlibri.de/xidel.html
require_tool jq https://stedolan.github.io/jq/
# Build the species lookup tables from the tab-separated data/species.map:
# lines 56 and later feed data/species.sci.names.json, the first 55 lines
# feed data/species.common.names.json.

# Read tab-separated lines on stdin and print a single compact JSON object
# that maps the first column of every line to its second column.
species_pairs_to_json() {
  jq -cR 'split("\t") as $row | {($row[0]): ($row[1])}' | jq -cs add
}
tail -n+56 data/species.map | species_pairs_to_json >data/species.sci.names.json
head -n 55 data/species.map | species_pairs_to_json >data/species.common.names.json
# Transform HTML metadata from source site into JSON
# xidel invocation used for the XPath extraction; stored as an array so each
# option stays its own word without relying on unquoted word splitting
XIDEL=(xidel -s --input-format=html --output-format=json-wrapped)
# select every cell (td) of every row of the 2nd table element
XPATH_ENTRY='/html/body/table[2]/tbody/tr/td'
# with the xpath xidel returns a single JSON array with all table keys and values
# [["RN:","99002005","CU:","0 B2:30 1:55.149","NC:","11A","SR:","3400",... ]
# chunk the array into pairs of two and combine into a JSON object with key: value
#{
# "RN": "99002005",
# "CU": "0 B2:30 1:55.149",
# "NC": "11A",
# "SR": "3400",
# "CS": "3.388",
# ...
#}
# The jq filter explained
# 1. bind each inner array to $row
# 2. create a range with a step of 2 over the length of the array, 0,2,4,...
# 3. create an object and use the range as index for the $row elements
# 3.5 strip a trailing colon from the key
# 4. combine the list of objects into a single object with "add"
# shellcheck disable=SC2016
JQ_ARR2OBJ='[ .[] as $row | range(0; $row|length; 2) | {( $row[.] | rtrimstr(":")): ($row[.+1]) } ] | add'
mkdir -p data/rn
# "|| [[ -n $RN ]]" also handles a last line that has no trailing newline
while read -r RN || [[ -n "$RN" ]]; do
  src="raw/rn/metaData.cfm?RN=$RN"
  out="data/rn/$RN.json"
  # input should exist
  if [[ ! -f "$src" ]]; then
    continue
  fi
  # output should not exist
  if [[ -f "$out" ]]; then
    continue
  fi
  # Write to a temporary name and rename only on success. Otherwise a failed
  # conversion would leave an empty or partial $out behind, and the
  # "output should not exist" check above would skip it on every later run.
  # The ".tmp" suffix keeps the file out of the data/rn/*json glob.
  tmp="$out.tmp"
  status=0
  "${XIDEL[@]}" --xpath "$XPATH_ENTRY" "$src" | jq -c "$JQ_ARR2OBJ" >"$tmp" || status=$?
  if ((status != 0)); then
    rm -f -- "$tmp"
    printf 'ERR: failed to transform %s\n' "$src" >&2
    exit "$status"
  fi
  mv -- "$tmp" "$out"
done <data/retrieval.numbers
# transform all records with jq, this is where the magic happens
# Expand the glob into an array first: without nullglob an unmatched pattern
# would be passed to transform.jq as the literal string "data/rn/*json".
shopt -s nullglob
records=(data/rn/*json)
shopt -u nullglob
if ((${#records[@]} == 0)); then
  printf 'ERR: no records found in data/rn\n' >&2
  exit 1
fi
# Write to a temporary file and rename on success, so a failing run does not
# truncate an existing data/transformed.json.
transformed_tmp=data/transformed.json.tmp
status=0
./transform.jq "${records[@]}" >"$transformed_tmp" || status=$?
if ((status != 0)); then
  rm -f -- "$transformed_tmp"
  printf 'ERR: transform.jq failed with status %s\n' "$status" >&2
  exit "$status"
fi
mv -- "$transformed_tmp" data/transformed.json
| Mode | Type | Size | Ref | File |
| --- | --- | --- | --- | --- |
| 100644 | blob | 71 | 0fe742dd5c99d4e4052b819da0946861ca4aa57f | .gitignore |
| 100644 | blob | 2018 | 4b92f9633ec36c392eb6fa701c5fc18cdc1513e2 | DATA.md |
| 100755 | blob | 1600 | 76074bacbc6e3e7fa33edfb6fe8d79d9396e062f | GeoJSON.jq |
| 100644 | blob | 1182 | e510d9c801ec6a6f98efaf9a5a053b3d94747458 | README.md |
| 100644 | blob | 865 | 6ac29799fea3cd2dd8c0e8116a12e6da93809572 | TODO.md |
| 100755 | blob | 3398 | 321faaee956e307405652c0ba6a88045e61c1a69 | download.sh |
| 100644 | blob | 218 | 9100d4eb109a354733264a3b989d0de699db3c9c | index.jq |
| 100644 | blob | 7797 | 5fe23ec00ab7d3de76f49d60dad3e06dabb614cd | index.mapping.json |
| 100755 | blob | 482 | 7abb8bcf9a49d7c849ab50538e47c9033a921f28 | index.sh |
| 040000 | tree | - | 60e19bf3f6e7f2fba709ae362e565cf2df36ac29 | snd |
| 040000 | tree | - | 4f397e3702b225c714af48ecf5f1591d8458c248 | srv |
| 100755 | blob | 21951 | e3f07b4ac26e762d889379ee3bc435fef0b2a338 | transform.jq |
| 100755 | blob | 2055 | 1ea333df2ffb1ce5f19b88a78db75302de126eb9 | transform.sh |
| 040000 | tree | - | c86df441474ace769ecfa63de94d3adc1d75995a | webroot |
Hints:
Before your first commit, do not forget to set up your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"
Clone this repository using HTTP(S):
git clone https://rocketgit.com/user/dleucas/wmmsdb
Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@ssh.rocketgit.com/user/dleucas/wmmsdb
Clone this repository using git:
git clone git://git.rocketgit.com/user/dleucas/wmmsdb
You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a
merge request:
... clone the repository ...
... make some changes and some commits ...
git push origin main