-
Notifications
You must be signed in to change notification settings - Fork 0
/
process-data-dictionaries.sh
executable file
·74 lines (50 loc) · 3.47 KB
/
process-data-dictionaries.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env bash
#
# Runs the data dictionary processing pipeline as a fixed sequence of tool
# invocations. Each step reads the directory written by the previous step and
# writes a new numbered directory under the output directory.
#
# Usage: process-data-dictionaries.sh <input-dir> <output-dir>
#   $1  input directory containing the data dictionaries
#   $2  output directory; created if it does not already exist
#
# Exits with status 1 when fewer than two (non-empty) arguments are given.
#
# NOTE(review): `dd` and `st` are invoked with sub-commands, so they are
# presumably project CLIs that shadow coreutils `dd` on PATH - confirm the
# environment before running.

# Stop at the first failing step (every step consumes the previous step's
# output, so carrying on after a failure only produces misleading results) and
# treat use of an unset variable as an error.
set -eu

# Both arguments are required. An empty output directory would make every
# stage path resolve against the filesystem root ("/0-extracted", ...).
if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
  echo "Two arguments must be supplied: (1) The input directory containing data dictionaries, (2) The output directory." >&2
  exit 1
fi

echo "Processing data dictionaries in $1 and sending output to $2"

curated=$1
outdir=$2

# These paths are not used by the active steps below; they are kept because the
# commented-out summary steps later in this file reference some of them.
processingdir="$outdir/processing"
splitdictionaries="$outdir/splitdictionaries"
latestversions="$outdir/latestversions"
withsources="$outdir/withsources"
withdigests="$outdir/withdigests"

# -p: succeed when the directory already exists and create missing parents.
mkdir -p -- "$outdir"

dd make-csv --in "$curated" --out "$outdir/0-extracted"
dd strip --in "$outdir/0-extracted" --out "$outdir/1-stripped"

# Regexes are single-quoted so the shell passes them through untouched.
dd collapse --in "$outdir/1-stripped" --out "$outdir/2-collapsed" \
  --key-field-name question_name \
  --key-field-regex '^([^\-]+)\-\d+' \
  --key-field-regex-group 1

dd fill-down --in "$outdir/2-collapsed" --out "$outdir/3-filled" \
  --field-names 'survey_name,survey_version,Section Header'

# The mapping file is a relative path, so it is resolved against the current
# working directory of whoever runs this script.
dd replace-field-names --in "$outdir/3-filled" --out "$outdir/4-mapped" \
  --mapping-file field-name-synonyms.csv

# NOTE(review): "--delimeter" is reproduced exactly as the original script
# spells it - confirm against the tool's accepted option name.
dd collapse --in "$outdir/4-mapped" --out "$outdir/5-collapsed" \
  --key-field-name Id,Label \
  --key-field-regex '(.*?)(_+\d+$),.*' \
  --key-field-regex-group 1,0 \
  --delimeter ' $ ' \
  --separator ','

dd drop-tier-1-fields --in "$outdir/5-collapsed" --out "$outdir/6-filtered" \
  --id-field Id

dd transform --in "$outdir/6-filtered" --out "$outdir/7-normalized" \
  --lowercase --collapse-white-space \
  --field-names required,Id,Datatype,Units

dd split --in "$outdir/7-normalized" --out "$outdir/8-split" \
  --field-names survey_name

dd retain-max-rows --in "$outdir/8-split" --out "$outdir/9-deduped" \
  --field-names survey_version

dd filter --in "$outdir/9-deduped" --out "$outdir/10-non-blank-variable-names" \
  --field-value-filter 'Id=.+'

dd append-source --in "$outdir/10-non-blank-variable-names" \
  --out "$outdir/11-with-sources"

dd merge --in "$outdir/11-with-sources" \
  --out "$outdir/12-merged/merged/merged.csv" --distinct

# The join reads and writes the same merged CSV (input and output paths are
# identical), i.e. the file is rewritten in place.
st join --csv-file "$outdir/12-merged/merged/merged.csv" \
  --field-id "source_directory" \
  --out "$outdir/12-merged/merged/merged.csv"

dd drop-duplicates --in "$outdir/12-merged" --out "$outdir/13-deduped" \
  --fields Id,Label,Program

dd append-global-code-book --in "$outdir/13-deduped" --out "$outdir/14-with-gcb"

dd append-digest --in "$outdir/14-with-gcb" --out "$outdir/15-with-digests" \
  --field-names Id,source_file,source_directory
# dd retain-fields --in $latestversions --out $outdir/variable_names --field-names variable_name,label
# dd distinct --in $outdir/variable_names --out $outdir/variable_names
# dd append-source --in $outdir/variable_names --out $outdir/variable_names
# dd merge --in $outdir/variable_names --out $outdir/variable_names-summary.csv --sorted
#
#
# dd retain-fields --in $withsources --out $outdir/datatypes --field-names datatype,source_file,source_directory
# dd distinct --in $outdir/datatypes --out $outdir/datatypes
# dd merge --in $outdir/datatypes --out $outdir/datatypes-summary.csv --distinct --sorted
#
# dd retain-fields --in $latestversions --out $outdir/units --field-names units,source_file,source_directory
# dd distinct --in $outdir/units --out $outdir/units
# dd merge --in $outdir/units --out $outdir/units-summary.csv --distinct --sorted
#
# dd retain-fields --in $latestversions --out $outdir/labels --field-names label,source_file,source_directory
# dd merge --in $outdir/labels --out $outdir/labels-summary.csv --distinct --sorted
#
# dd retain-fields --in $latestversions --out $outdir/value-constraints --field-names value_constraints,source_directory
# dd merge --in $outdir/value-constraints --out $outdir/value-constraints-summary.csv --distinct --sorted