From 358d76090a9517581a52d138802c35677b54917a Mon Sep 17 00:00:00 2001 From: Andrew Owens Date: Thu, 21 Nov 2019 16:57:50 -0500 Subject: [PATCH 1/4] Add proper tests, change xlsx name --- .travis.yml | 9 +++++++++ src/allfiles_hash.json | 4 ++-- src/init.jl | 11 +++++++++-- src/use_codebook.jl | 2 +- test/Project.toml | 6 ++++++ test/runtests.jl | 39 +++++++++++++++++++++++++++++---------- 6 files changed, 56 insertions(+), 15 deletions(-) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..8da0b92 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,9 @@ +language: julia +os: + - linux + - windows +julia: + - 1.0 + - 1.2 + - 1.3 +sudo: false diff --git a/src/allfiles_hash.json b/src/allfiles_hash.json index da76dd5..2491f97 100644 --- a/src/allfiles_hash.json +++ b/src/allfiles_hash.json @@ -1,5 +1,6 @@ { "J265684_codebook.xml": "6d9a40ef8c61aa359a31aa4d04746b31511893a05e81337aa7e7a365c8452f0a", + "psid.xlsx": "9f277e239d8483a3c852b7214bee6ac7d54cb101b27b4f95f75058e186b435d2", "fam1968.zip": "38292539d020824be3ca3908c04093edf8d3ccef8dc44cb81147a2315e94cd00", "fam1969.zip": "7942f14d8c0c3f42c05efa3fd6b90b011721e280f198fe6b78a417bb28dc9335", "fam1970.zip": "07bf03d9aa9ca9258aff3546abf60e66447c02031819d5a552f4769ddb8bb90f", @@ -40,6 +41,5 @@ "fam2013er.zip": "43a527b834dc31b753881d3ab03fe1a4c4f1dde7eb5aa2d77a7c3bb79095d15c", "fam2015er.zip": "726236d3f9d25e804d2605eff6a6a11f322999530d27a2528d0d01cf31af6066", "fam2017er.zip": "5ade1a3f42ed84c892fe8ff16365b85b0dc84ac66f4d454e291af12008e9b35d", - "ind2017er.zip": "7ea5837017603841afeb0d4d0365745d1a592e5c7a77021d4e1d617b7aed486c", - "psid_crossyear.xlsx": "9f277e239d8483a3c852b7214bee6ac7d54cb101b27b4f95f75058e186b435d2" + "ind2017er.zip": "7ea5837017603841afeb0d4d0365745d1a592e5c7a77021d4e1d617b7aed486c" } diff --git a/src/init.jl b/src/init.jl index 030c180..f0ca72d 100644 --- a/src/init.jl +++ b/src/init.jl @@ -1,10 +1,17 @@ function checkhash(filename) filename |> read |> 
sha256 |> bytes2hex end -function verifyfiles(allfilesjson) +function verifyfiles(allfilesjson; skip = false) allfiles_dict = JSON3.read(read(allfilesjson, String), SortedDict{String, String}) for (f, v) in allfiles_dict - isfile(f) || error("$f not found and is required.") + if !isfile(f) + if !skip + error("$f not found and is required.") + else + @warn "$f not found, skipping" + continue + end + end fh = checkhash(f) if fh == v println("Found file $f, hash OK") diff --git a/src/use_codebook.jl b/src/use_codebook.jl index 46d94ef..e59ce04 100644 --- a/src/use_codebook.jl +++ b/src/use_codebook.jl @@ -100,7 +100,7 @@ function process_input(inputjson) j2 = jsontable(read("output/codebook.json", String)); d2 = DataFrame(j2); d2.codedict = [Dict(string(x) => y for (x, y) in dt) for dt in d2.codedict] - df = DataFrame(XLSX.readtable("psid_crossyear.xlsx", "MATRIX")...) + df = DataFrame(XLSX.readtable("psid.xlsx", "MATRIX")...) df = mapcols(x -> [xx for xx in x], df) ## Need a map from VAR to the right row df_vars = df[!, r"^Y.+"] diff --git a/test/Project.toml b/test/Project.toml index 0c36332..84e57a9 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,2 +1,8 @@ [deps] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" + +[compat] +DataDeps = "0.7.0" +JSON3 = "= 0.1.12" diff --git a/test/runtests.jl b/test/runtests.jl index b62f4a0..0531acc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,14 +1,33 @@ using Test -using PSID +using PSID, DataDeps, JSON3 + + @show pwd() -#= x = dirname(pathof(PSID)) fx = "$x/allfiles_hash.json" -@show isfile(fx) -PSID.verifyfiles(fx) -PSID.process_codebook() -PSID.process_input("user_input.json") -famdatas, inddata = PSID.unzip_data() -PSID.construct_alldata(famdatas, inddata) -=# -makePSID("user_input.json") +skipdata = try + PSID.verifyfiles(fx, skip = false) + println("Found all files, running full tests") + false +catch + 
println("Did not find data files, running partial tests") + true +end + +if skipdata + Base.download("https://raw.githubusercontent.com/aaowens/PSID.jl/master/examples/user_input.json", "user_input.json") + Base.download("https://drive.google.com/uc?authuser=0&id=1nz1UaVGcj0ur2Bp3ev7a8agJbj0A5JTF&export=download", "J265684_codebook.zip") + run(DataDeps.unpack_cmd("J265684_codebook.zip", "$(pwd())", ".zip", "")) + Base.download("https://psidonline.isr.umich.edu/help/xyr/psid.xlsx", "psid.xlsx") + userinput_json = "user_input.json" + isfile(userinput_json) || error("$userinput_json not found in current directory") + isdir("output") || mkdir("output") + isdir("datafiles") || mkdir("datafiles") + PSID.process_codebook() + PSID.process_input("user_input.json") + JSON3.read(read("output/user_output.json", String), Vector{PSID.VarInfo5}) + #famdatas, inddata = PSID.unzip_data() + #PSID.construct_alldata(famdatas, inddata) +else + makePSID("user_input.json") +end From 67a0ef13712d19a6abb63d53d85fcd430edd6bc6 Mon Sep 17 00:00:00 2001 From: Andrew Owens Date: Thu, 21 Nov 2019 16:58:34 -0500 Subject: [PATCH 2/4] Increment major version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d515435..3b6a8ec 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "PSID" uuid = "92fd0282-be9c-47fb-a489-f0d0a91db595" -version = "0.1.3" +version = "0.2.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" From 2230105b7612a1a02b6b48d670fbe9ec4333414c Mon Sep 17 00:00:00 2001 From: Andrew Owens Date: Thu, 21 Nov 2019 19:41:12 -0500 Subject: [PATCH 3/4] Drop 1.0 support --- .travis.yml | 1 - Project.toml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8da0b92..2eafd82 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,6 @@ os: - linux - windows julia: - - 1.0 - 1.2 - 1.3 sudo: false diff --git a/Project.toml b/Project.toml index 3b6a8ec..de38eb1 
100644 --- a/Project.toml +++ b/Project.toml @@ -16,7 +16,7 @@ SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0" [compat] -julia = "1" +julia = "1.1" AbstractTrees = "0.2.1" CSV = "0.5.16" DataDeps = "0.7.0" From 10e3ad36b1a429ce316f4e7539003ae796a29a7e Mon Sep 17 00:00:00 2001 From: Andrew Owens Date: Thu, 21 Nov 2019 19:54:00 -0500 Subject: [PATCH 4/4] Add Travis button --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e23ec21..6a708a0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # PSID.jl +[![Build Status](https://travis-ci.com/aaowens/PSID.jl.svg?branch=master)](https://travis-ci.com/aaowens/PSID.jl) + The Panel Study of Income Dynamics (PSID) is a longitudinal public dataset which has been following a collection of families and their descendants since 1968. It provides a breadth of information about labor supply and life-cycle dynamics. More information is available at https://psidonline.isr.umich.edu/. This package produces a labeled panel of individuals with a consistent individual ID across time. You provide a JSON file describing the variables you want. An example input file can be found at [examples/user_input.json.](https://github.com/aaowens/PSID.jl/blob/master/examples/user_input.json). Currently only variables in the family files can be added, but in the future it should be possible to support variables in the individual files or the supplements. @@ -37,9 +39,9 @@ The file passed to `makePSID` describes the variables you want. }, ``` There are three fields, `name_user`, `varID`, and `unit`. `name_user` is a name chosen by you. `varID` is one of the codes assigned by the PSID to this variable. These can be looked up in the PSID [cross-year index](https://simba.isr.umich.edu/VS/i.aspx). For example, hours above can be found in the crosswalk at ` Family Public Data Index 01>WORK 02>Hours and Weeks 03>annual in prior year 04>head 05>total:`. 
Clicking on the variable info will show the the list of years and associated IDs when that variable is available. Choose any of the IDs for `varID`, it does not matter. `PSID.jl` will look up all available years for that variable in the crosswalk. You must also indicate the unit, which can be `head`, `spouse`, or `family`. This makes sure the variable is assigned to the correct individual. - -# Features + +# Features This package provides the following features: 1. Automatically labels missing values by searching the value labels from the codebook for strings like "NA", "Inap.", or "Missing".