File transform.sh changed (mode: 100755) (index 64f7fc5..1ea333d) |
1 |
1 |
#!/bin/bash |
#!/bin/bash |
2 |
|
set -e # abort on any errors |
|
|
2 |
|
set -eo pipefail |
|
3 |
|
# set -x |
|
4 |
|
|
|
5 |
|
test -e "$(command -v xidel)" || ( |
|
6 |
|
echo "ERR: Need xidel from https://www.videlibri.de/xidel.html" |
|
7 |
|
exit 1 |
|
8 |
|
) |
|
9 |
|
test -e "$(command -v jq)" || ( |
|
10 |
|
echo "ERR: Need jq from https://stedolan.github.io/jq/" |
|
11 |
|
exit 1 |
|
12 |
|
) |
|
13 |
|
|
|
14 |
|
# Mapping of species id to common and scientific name |
|
15 |
|
|
|
16 |
|
tail -n+56 data/species.map | jq -cR 'split("\t") as $row | {($row[0]): ($row[1])}' | jq -cs add >data/species.sci.names.json |
|
17 |
|
head -n 55 data/species.map | jq -cR 'split("\t") as $row | {($row[0]): ($row[1])}' | jq -cs add >data/species.common.names.json |
3 |
18 |
|
|
4 |
19 |
# Transform HTML metadata from source site into JSON |
# Transform HTML metadata from source site into JSON |
5 |
20 |
|
|
6 |
|
# for xpath |
|
|
21 |
|
# for xpath |
7 |
22 |
XIDEL='xidel -s --input-format=html --output-format=json-wrapped' |
XIDEL='xidel -s --input-format=html --output-format=json-wrapped' |
8 |
23 |
|
|
9 |
24 |
# select all rows from the 2nd table element |
# select all rows from the 2nd table element |
|
... |
... |
XPATH_ENTRY='/html/body/table[2]/tbody/tr/td' |
20 |
35 |
# "SR:": "3400", |
# "SR:": "3400", |
21 |
36 |
# "CS:": "3.388", |
# "CS:": "3.388", |
22 |
37 |
# ... |
# ... |
23 |
|
#} |
|
|
38 |
|
#} |
24 |
39 |
# The jq filter explained |
# The jq filter explained |
25 |
40 |
# 1. assign the whole array to $row |
# 1. assign the whole array to $row |
26 |
41 |
# 2. create a range with a step of 2 over the length of the array, 0,2,4,... |
# 2. create a range with a step of 2 over the length of the array, 0,2,4,... |
27 |
42 |
# 3. create an object and use the range as index for the $row elements |
# 3. create an object and use the range as index for the $row elements |
28 |
43 |
# 3.5 remove the trailing colon from the key |
# 3.5 remove the trailing colon from the key |
29 |
44 |
# 4. combine the list of objects into a single object with "add" |
# 4. combine the list of objects into a single object with "add" |
|
45 |
|
|
|
46 |
|
# shellcheck disable=SC2016 |
30 |
47 |
JQ_ARR2OBJ='[ .[] as $row | range(0; $row|length; 2) | {( $row[.] | rtrimstr(":")): ($row[.+1]) } ] | add' |
JQ_ARR2OBJ='[ .[] as $row | range(0; $row|length; 2) | {( $row[.] | rtrimstr(":")): ($row[.+1]) } ] | add' |
31 |
48 |
|
|
|
49 |
|
test -d data/rn || mkdir -p data/rn |
32 |
50 |
|
|
33 |
|
while read RN |
|
34 |
|
do |
|
35 |
|
$XIDEL --xpath "$XPATH_ENTRY" "raw/rn/metaData.cfm?RN=$RN" | jq "$JQ_ARR2OBJ" > "data/rn/$RN.json" |
|
36 |
|
done < data/retrieval.numbers |
|
|
51 |
|
while read -r RN; do |
|
52 |
|
# input should exist |
|
53 |
|
test -f "raw/rn/metaData.cfm?RN=$RN" || continue |
|
54 |
|
# output should not exist |
|
55 |
|
test -f "data/rn/$RN.json" && continue |
|
56 |
|
$XIDEL --xpath "$XPATH_ENTRY" "raw/rn/metaData.cfm?RN=$RN" | jq -c "$JQ_ARR2OBJ" >"data/rn/$RN.json" |
|
57 |
|
done <data/retrieval.numbers |
37 |
58 |
|
|
38 |
59 |
# transform all records with jq, this is where the magic happens |
# transform all records with jq, this is where the magic happens |
39 |
|
./transform.jq data/rn/*json > data/transformed.json |
|
|
60 |
|
./transform.jq data/rn/*json >data/transformed.json |