Skip to content

Commit

Permalink
Merge pull request #10 from pat-alt/ci-for-formatter
Browse files Browse the repository at this point in the history
formatter
  • Loading branch information
pat-alt authored Jan 19, 2024
2 parents 16ab09a + 6aeb2ff commit 9ea8230
Show file tree
Hide file tree
Showing 24 changed files with 292 additions and 160 deletions.
7 changes: 7 additions & 0 deletions .JuliaFormatter.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# One-off formatting script: installs JuliaFormatter into the active
# environment, runs it over the tree rooted at the current directory, and
# then removes the package from the environment again.
using Pkg
Pkg.add("JuliaFormatter")

# Only `format` is needed from the package.
using JuliaFormatter: format
format("."; verbose = true)

# Undo the temporary dependency added above.
Pkg.rm("JuliaFormatter")
34 changes: 34 additions & 0 deletions .github/workflows/FormatCheck.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Runs JuliaFormatter over the repository; on pull requests the resulting
# working-tree changes are handed to reviewdog's suggester action.
name: Format Check

on:
  push:
    branches:
      - 'main'
      # Branch filters are glob patterns: without the wildcard this entry would
      # only match a branch literally named "release-".
      - 'release-*'
    tags: '*'
  pull_request:

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: julia-actions/setup-julia@latest
        with:
          version: 1
      # NOTE(review): checkout@v1 is several major versions behind; consider
      # upgrading once confirmed compatible with the suggester step below.
      - uses: actions/checkout@v1
      - name: Install JuliaFormatter
        run: |
          using Pkg
          Pkg.add("JuliaFormatter")
        shell: julia --color=yes {0}
      - name: Format code
        run: |
          using JuliaFormatter
          format("."; verbose=true)
        shell: julia --color=yes {0}
      # Only meaningful on pull requests, hence the event guard.
      - name: Suggest formatting changes
        uses: reviewdog/action-suggester@v1
        if: github.event_name == 'pull_request'
        with:
          tool_name: JuliaFormatter
          fail_on_error: true
6 changes: 3 additions & 3 deletions dev/src/full_data/full_data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ include("pre_process_speech.jl")
include("pre_process_pc.jl")

# Merge:
df = vcat(df_mm, df_speech, df_pc, cols=:union)
df = vcat(df_mm, df_speech, df_pc, cols = :union)
df.date = Dates.Date.(df.date, "yyyymmdd")

# Transform label column
Expand All @@ -30,9 +30,9 @@ ALL_EVENT_TIMES = unique(select(df, :date))
include("market_data.jl")

# Merge:
df = innerjoin(df, df_combined, on=:date) |>
df =
innerjoin(df, df_combined, on = :date) |>
x -> sort!(x, [:sentence_id, :doc_id, :date, :event_type])

# Save:
CSV.write("$clean_dir/all_data.csv", df)

38 changes: 22 additions & 16 deletions dev/src/full_data/market_data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,40 @@ clean_market_data_dir = joinpath(clean_dir, "market_data")
isdir(clean_market_data_dir) || mkdir(clean_market_data_dir)

# CPI data:
df_cpi = CSV.read("$market_data_dir/CPIAUCSL.csv", DataFrame) |>
x -> rename!(x, :DATE => :date, :CPIAUCSL => :value)
df_cpi =
CSV.read("$market_data_dir/CPIAUCSL.csv", DataFrame) |>
x -> rename!(x, :DATE => :date, :CPIAUCSL => :value)
df_cpi.indicator .= "CPI"
# CPI data is monthly, so we need to fill in the missing dates:
df_cpi = sort(outerjoin(ALL_EVENT_TIMES, df_cpi, on=:date), :date)
transform!(df_cpi, names(df_cpi) .=> Impute.locf, renamecols=false)
df_cpi = sort(outerjoin(ALL_EVENT_TIMES, df_cpi, on = :date), :date)
transform!(df_cpi, names(df_cpi) .=> Impute.locf, renamecols = false)
CSV.write("$clean_market_data_dir/cpi.csv", df_cpi)

# PPI data:
df_ppi = CSV.read("$market_data_dir/PPIACO.csv", DataFrame) |>
df_ppi =
CSV.read("$market_data_dir/PPIACO.csv", DataFrame) |>
x -> rename!(x, :DATE => :date, :PPIACO => :value)
df_ppi.indicator .= "PPI"
# PPI data is monthly, so we need to fill in the missing dates:
df_ppi = sort(outerjoin(ALL_EVENT_TIMES, df_ppi, on=:date), :date)
transform!(df_ppi, names(df_ppi) .=> Impute.locf, renamecols=false)
df_ppi = sort(outerjoin(ALL_EVENT_TIMES, df_ppi, on = :date), :date)
transform!(df_ppi, names(df_ppi) .=> Impute.locf, renamecols = false)
CSV.write("$clean_market_data_dir/ppi.csv", df_ppi)

# Treasury yield data:
df_ust = CSV.read("$market_data_dir/daily-treasury-rates.csv", DataFrame) |>
x -> rename!(x, :Date => :date) |>
x -> transform(x, :date => ByRow(x -> replace(x, "/" => "-")) => :date) |>
x -> transform(x, :date => ByRow(x -> Date(x, "mm-dd-yyyy")) => :date) |>
x -> stack(x, Not(:date), variable_name=:maturity, value_name=:value)
df_ust =
CSV.read("$market_data_dir/daily-treasury-rates.csv", DataFrame) |>
x ->
rename!(x, :Date => :date) |>
x ->
transform(x, :date => ByRow(x -> replace(x, "/" => "-")) => :date) |>
x ->
transform(x, :date => ByRow(x -> Date(x, "mm-dd-yyyy")) => :date) |>
x -> stack(x, Not(:date), variable_name = :maturity, value_name = :value)
df_ust.indicator .= "UST"
# UST data is daily, but may not be available on weekends, so we need to fill in the missing dates:
df_ust = sort(outerjoin(ALL_EVENT_TIMES, df_ust, on=:date), :date)
transform!(df_ust, names(df_ust) .=> Impute.locf, renamecols=false)
df_ust = sort(outerjoin(ALL_EVENT_TIMES, df_ust, on = :date), :date)
transform!(df_ust, names(df_ust) .=> Impute.locf, renamecols = false)
CSV.write("$clean_market_data_dir/ust.csv", df_ust)

df_combined = vcat(df_cpi, df_ppi, df_ust, cols=:union)
CSV.write("$clean_market_data_dir/combined.csv", df_combined)
df_combined = vcat(df_cpi, df_ppi, df_ust, cols = :union)
CSV.write("$clean_market_data_dir/combined.csv", df_combined)
17 changes: 10 additions & 7 deletions dev/src/full_data/pre_process_mm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,28 @@ using XLSX
# Metadata (master file):
xf = XLSX.readxlsx(joinpath(raw_data_dir, "master_files/master_mm_final.xlsx"))
m = xf[1][:]
df = DataFrame(m[2:end, :], m[1, :])
df = DataFrame(m[2:end, :], m[1, :])

# Get the date from the URL:
df[!,"YYYYMMDD"] .= extract_digits.(df.Url)
df[!, "YYYYMMDD"] .= extract_digits.(df.Url)

# Get labelled data:
labeled_dir = joinpath(raw_data_dir, "filtered_data/meeting_minutes_labeled/")
df_labeled = []
for x in readdir(labeled_dir)
_df = CSV.read(joinpath(labeled_dir, x), DataFrame, drop=[1])
_df = CSV.read(joinpath(labeled_dir, x), DataFrame, drop = [1])
_df[!, "YYYYMMDD"] .= extract_digits(x)
push!(df_labeled, _df)
end
df_labeled = vcat(df_labeled...)

# Merge:
df_mm = innerjoin(df, df_labeled, on=:YYYYMMDD) |>
x -> transform!(groupby(x, :Url), groupindices => :doc_id) |>
x -> select(x, [:doc_id, :YYYYMMDD, :EventType, :label, :sentence, :score]) |>
x -> rename!(x, :YYYYMMDD => :date, :EventType => :event_type)
df_mm =
innerjoin(df, df_labeled, on = :YYYYMMDD) |>
x ->
transform!(groupby(x, :Url), groupindices => :doc_id) |>
x ->
select(x, [:doc_id, :YYYYMMDD, :EventType, :label, :sentence, :score]) |>
x -> rename!(x, :YYYYMMDD => :date, :EventType => :event_type)

df_mm.event_type .= "meeting minutes"
15 changes: 9 additions & 6 deletions dev/src/full_data/pre_process_pc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,19 @@ df[!, "YYYYMMDD"] .= extract_digits.(df.Url)
labeled_dir = joinpath(raw_data_dir, "filtered_data/press_conference_labeled/")
df_labeled = []
for x in readdir(labeled_dir)
_df = CSV.read(joinpath(labeled_dir, x), DataFrame, drop=[1])
_df = CSV.read(joinpath(labeled_dir, x), DataFrame, drop = [1])
_df[!, "YYYYMMDD"] .= extract_digits(x)
push!(df_labeled, _df)
end
df_labeled = vcat(df_labeled...)

# Merge:
df_pc = innerjoin(df, df_labeled, on=:YYYYMMDD) |>
x -> transform!(groupby(x, :Url), groupindices => :doc_id) |>
x -> select(x, [:doc_id, :YYYYMMDD, :EventType, :label, :sentence, :score]) |>
x -> rename!(x, :YYYYMMDD => :date, :EventType => :event_type)
df_pc =
innerjoin(df, df_labeled, on = :YYYYMMDD) |>
x ->
transform!(groupby(x, :Url), groupindices => :doc_id) |>
x ->
select(x, [:doc_id, :YYYYMMDD, :EventType, :label, :sentence, :score]) |>
x -> rename!(x, :YYYYMMDD => :date, :EventType => :event_type)

df_pc.event_type .= "press conference"
df_pc.event_type .= "press conference"
37 changes: 27 additions & 10 deletions dev/src/full_data/pre_process_speech.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,49 @@ using XLSX
df = CSV.read(joinpath(raw_data_dir, "master_files/master_speech_final.csv"), DataFrame)

# Get the date from the URL:
df = df[length.(extract_digits.(df.LocalPath)) .>= 8,:]
df = df[length.(extract_digits.(df.LocalPath)).>=8, :]
df[!, "YYYYMMDD"] .= [extract_digits(x)[1:8] for x in df.LocalPath]
transform!(df, :LocalPath => ByRow(x -> length(extract_digits(x)) == 8 ? "1" : extract_digits(x)[9]) => :date_suffix)
transform!(
df,
:LocalPath =>
ByRow(x -> length(extract_digits(x)) == 8 ? "1" : extract_digits(x)[9]) =>
:date_suffix,
)
df[!, "EventType"] .= "speech"

# Get labelled data:
labeled_dir = joinpath(raw_data_dir, "filtered_data/speech_labeled/")
df_labeled = []
for x in readdir(labeled_dir)
_df = CSV.read(joinpath(labeled_dir, x), DataFrame, drop=[1])
_df = CSV.read(joinpath(labeled_dir, x), DataFrame, drop = [1])
_df[!, "YYYYMMDD"] .= extract_digits(x)[1:8]
_df[!, "date_suffix"] .= length(extract_digits(x)) == 8 ? "1" : extract_digits(x)[9]
push!(df_labeled, _df)
end
df_labeled = vcat(df_labeled...)

# Subset to 'select' speeches:
selected_path = readdir("dev/data/raw/fomc-hawkish-dovish-main/data/raw_data/speech/html/select/")
selected_path =
readdir("dev/data/raw/fomc-hawkish-dovish-main/data/raw_data/speech/html/select/")
df = df[[any([contains(x, y) for y in selected_path]) for x in df.LocalPath], :]
df = unique(df, :LocalPath)

# Merge:
df_speech = innerjoin(df, df_labeled, on=[:YYYYMMDD, :date_suffix]) |>
x -> transform!(groupby(x, :LocalPath), groupindices => :doc_id) |>
x -> select(x, [:doc_id, :YYYYMMDD, :EventType, :label, :sentence, :score, :Speaker]) |>
x -> rename!(x, :YYYYMMDD => :date, :EventType => :event_type, :Speaker => :speaker) |>
unique
df_speech =
innerjoin(df, df_labeled, on = [:YYYYMMDD, :date_suffix]) |>
x ->
transform!(groupby(x, :LocalPath), groupindices => :doc_id) |>
x ->
select(
x,
[:doc_id, :YYYYMMDD, :EventType, :label, :sentence, :score, :Speaker],
) |>
x ->
rename!(
x,
:YYYYMMDD => :date,
:EventType => :event_type,
:Speaker => :speaker,
) |> unique

df_speech.event_type .= "speech"
df_speech.event_type .= "speech"
15 changes: 10 additions & 5 deletions dev/src/get_data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,18 @@ raw_data_dir = joinpath(raw_data_root, "fomc-hawkish-dovish-main/data")

if !isdir(raw_data_dir) || OVERWRITE
if isdir(raw_data_root)
rm(raw_data_root; recursive=true)
rm(raw_data_root; recursive = true)
end
tgz = Downloads.download("https://github.com/gtfintechlab/fomc-hawkish-dovish/archive/refs/heads/main.tar.gz")
tgz = Downloads.download(
"https://github.com/gtfintechlab/fomc-hawkish-dovish/archive/refs/heads/main.tar.gz",
)
open(GzipDecompressorStream, tgz) do io
Tar.extract(
x -> contains(x.path, "fomc-hawkish-dovish-main/data") || contains(x.path, "training_data"),
io, "dev/data/raw"
x ->
contains(x.path, "fomc-hawkish-dovish-main/data") ||
contains(x.path, "training_data"),
io,
"dev/data/raw",
)
end
end
end
2 changes: 1 addition & 1 deletion dev/src/model_activations/env.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Pkg
Pkg.resolve()
Pkg.instantiate()
Pkg.precompile()
Pkg.precompile()
36 changes: 22 additions & 14 deletions dev/src/model_activations/merge_outputs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,39 @@ out_dir = "dev/data/activations/intermediate"
merge_dir = "dev/data/activations/merged"
ispath(merge_dir) || mkpath(merge_dir)
n_layers = 24
layer_dirs = [joinpath(merge_dir, "layer_$i") for i in 1:n_layers]
layer_dirs = [joinpath(merge_dir, "layer_$i") for i = 1:n_layers]
ispath.(layer_dirs) .|| mkpath.(layer_dirs)
writers = [open(Arrow.Writer, joinpath(i, "activations.arrow")) for i in layer_dirs]
files_sortperm = sortperm(parse.(Int, filter.(isdigit, readdir(out_dir)[contains.(readdir(out_dir), ".csv")])))
sorted_dir_files = readdir(out_dir)[contains.(readdir(out_dir), ".csv")][files_sortperm] |> unique
files_sortperm = sortperm(
parse.(Int, filter.(isdigit, readdir(out_dir)[contains.(readdir(out_dir), ".csv")])),
)
sorted_dir_files =
readdir(out_dir)[contains.(readdir(out_dir), ".csv")][files_sortperm] |> unique
for x in sorted_dir_files
df = CSV.read(joinpath(out_dir, x), DataFrame) |>
df =
CSV.read(joinpath(out_dir, x), DataFrame) |>
x -> sort!(x, [:sentence_id, :layer, :activation_id])
for layer in 1:n_layers
for layer = 1:n_layers
writer = writers[layer]
df_layer = df[df.layer .== layer, :]
df_layer = df[df.layer.==layer, :]
Arrow.write(writer, df_layer)
end
end

# Upload:
for layer in 1:n_layers
for layer = 1:n_layers
layer_dir = layer_dirs[layer]
artifact_id = artifact_from_directory(layer_dir)
release = upload_to_release(artifact_id; tag="activations_$(today())", name="activations_layer_$layer.tar.gz")
release = upload_to_release(
artifact_id;
tag = "activations_$(today())",
name = "activations_layer_$layer.tar.gz",
)
add_artifact!(
"Artifacts.toml",
"activations_layer_$layer",
release;
force=true,
lazy=true
"Artifacts.toml",
"activations_layer_$layer",
release;
force = true,
lazy = true,
)
end
end
21 changes: 10 additions & 11 deletions dev/src/model_activations/model_outputs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ using TrillionDollarWords
out_dir = "dev/data/activations/intermediate"
last_saved = 0
if ispath(out_dir)
sentence_ids = readdir(out_dir) .|>
x -> split(replace(x, ".csv" => ""), ":")[2] .|>
x -> parse(Int, x)
sentence_ids =
readdir(out_dir) .|>
x -> split(replace(x, ".csv" => ""), ":")[2] .|> x -> parse(Int, x)
if length(sentence_ids) > 0
last_saved = maximum(sentence_ids)
end
else
mkpath(out_dir)
mkpath(out_dir)
end
@info "Last saved: $last_saved."

Expand All @@ -26,19 +26,18 @@ Transformers.enable_gpu()

# Load model and data:
@info "Loading model and data..."
mod = load_model(; load_head=false, output_hidden_states=true)
mod = load_model(; load_head = false, output_hidden_states = true)
df = load_all_sentences()

# Compute activations:
@info "Computing activations..."
bs = 50
n = size(df, 1)
@info "Time limit: $(ENV["JOB_TIME"])".
timer = Timer(60 * 10)
# Log the job's time limit (read from the JOB_TIME environment variable), then
# start a 10-minute timer. The sentence-ending period belongs inside the
# message: placed after the closing quote it is parsed as property access and
# swallows the `timer` assignment into the log statement.
@info "Time limit: $(ENV["JOB_TIME"])."
timer = Timer(60 * 10)
for (i, batch) in enumerate(partition(1:n, bs))
@info "Processing batch $i/$(ceil(Int, n/bs))..."
!all(batch .<= last_saved) || continue
queries = df[batch,:]
queries = df[batch, :]
emb = @time layerwise_activations(mod, queries)
CSV.write(joinpath(out_dir, "activations_$batch.csv"), emb)
yield()
Expand All @@ -56,6 +55,6 @@ df_activations = []
for x in readdir(out_dir)[contains.(readdir(out_dir), ".csv")]
push!(df_activations, CSV.read(joinpath(out_dir, x), DataFrame))
end
df_activations = vcat(df_activations...) |>
x -> sort!(x, [:sentence_id, :layer, :activation_id])
CSV.write("$merge_dir/activations.csv", df_activations)
df_activations =
vcat(df_activations...) |> x -> sort!(x, [:sentence_id, :layer, :activation_id])
CSV.write("$merge_dir/activations.csv", df_activations)
Loading

0 comments on commit 9ea8230

Please sign in to comment.