Skip to content

Commit

Permalink
Add custom functions to create variables Compustat
Browse files Browse the repository at this point in the history
  • Loading branch information
milonemario committed Sep 1, 2023
1 parent e156682 commit 2409eff
Showing 1 changed file with 119 additions and 85 deletions.
204 changes: 119 additions & 85 deletions src/modules/wrds/compustat/Compustat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,60 @@ using YAML

using ..Pilates: WRDS

"""
    Field(fields, fn, name)

Description of one output variable.

# Fields
- `fields::Vector`: inputs the variable is built from; entries are raw column names
  (`Symbol`) or nested `Field`s.
- `fn::Function`: transformation applied to the data to produce the variable.
- `name::Symbol`: final name of the variable.
"""
struct Field
    fields::Vector
    fn::Function
    name::Symbol
end

# Outer constructors: each accepted field specification is normalised into a `Field`.

# A `Field` is returned as is.
Field(F::Field) = F

# A single input (raw column or nested field) is wrapped in a one-element vector.
function Field(field::Union{Symbol, Field}, fn, name)
    return Field([field], fn, name)
end

# `:col` — take the raw column and keep its name.
function Field(s::Symbol)
    return Field(s, data -> data[!, s], s)
end

# `:col => :newname` — take the raw column under a new name.
function Field(p::Pair{Symbol, Symbol})
    source, newname = p
    return Field(source, data -> data[!, source], newname)
end

# `field => :newname` — take the column named after the nested field under a new name.
function Field(p::Pair{Field, Symbol})
    inner, newname = p
    return Field(inner, data -> data[!, inner.name], newname)
end

# `inputs => fn` — the variable is named after the function.
function Field(p::Pair{<:Any, <:Function})
    inputs, transform = p
    return Field(inputs, transform, nameof(transform))
end

# `inputs => (fn => :newname)` — the variable name is given explicitly.
function Field(pf::Pair{<:Any, <:Pair{<:Function, Symbol}})
    inputs, (transform, newname) = pf
    return Field(inputs, transform, newname)
end

"""
    raw_fields(field::Field)

Return the unique raw column names (`Symbol`s) that `field` depends on.

Entries of `field.fields` that are `Symbol`s are taken directly; any other entry is
expanded recursively through `raw_fields`.
"""
function raw_fields(field::Field)
    collected = Symbol[]
    for source in field.fields
        if source isa Symbol
            push!(collected, source)
        else
            # `append!` avoids splatting a collection of unknown length into varargs.
            append!(collected, raw_fields(source))
        end
    end
    return unique(collected)
end

function get_fields(wrdsuser::WRDS.WrdsUser, fields::Vector{Pair{Symbol, Symbol}};
frequency="Annual", lag=0)
# Unique raw column names required by a whole list of fields.
# `init` keeps the reduction well-defined when `fields` is empty.
function raw_fields(fields::Vector{Field})
    return unique(reduce(vcat, raw_fields.(fields); init=Symbol[]))
end

"""
    table_index(wrdsuser::WRDS.WrdsUser, frequency::String)

Return the `index` of the Compustat table matching `frequency`: `funda` for
`"Annual"`, `fundq` for `"Quarterly"`. Any other value raises an error.
"""
function table_index(wrdsuser::WRDS.WrdsUser, frequency::String)
    tablename = if frequency == "Annual"
        "funda"
    elseif frequency == "Quarterly"
        "fundq"
    else
        error("Frequency $frequency not supported. Should be 'Annual' or 'Quarterly'.")
    end
    return WRDS.WrdsTable(wrdsuser, "compustat", tablename).index
end

"""
    compute_field(data::DataFrame, field::Field, index::Vector{Symbol})

Evaluate `field` on `data` and return the result of `field.fn`.

A working data frame is built from the `index` columns and the raw columns `field`
depends on. Nested fields are computed first and stored under their own name so that
`field.fn` can read them.
"""
function compute_field(data::DataFrame, field::Field, index::Vector{Symbol})
    # `unique` guards against a raw field that is also an index column: selecting the
    # same column twice is rejected by DataFrames.
    columns = unique(vcat(index, raw_fields(field)))
    df = data[!, columns]
    # Materialise nested (non-raw) fields before applying the transformation.
    for F in field.fields
        F isa Symbol && continue
        df[!, F.name] = compute_field(df, F, index)
    end
    return field.fn(df)
end

function get_raw_fields(wrdsuser::WRDS.WrdsUser, fields::Vector{Symbol}; frequency="Annual")
# Get the original fields
tables_yml = YAML.load_file("$(@__DIR__)/files.yaml")
tables_names = [t for t in keys(tables_yml) if t ["funda", "fundq"]]
Expand All @@ -27,7 +77,7 @@ function get_fields(wrdsuser::WRDS.WrdsUser, fields::Vector{Pair{Symbol, Symbol}
fields_found = Symbol[]
fields_for_table = Dict{String, Vector{Symbol}}()
for t in tables_all
fields_t = [f for f in t.fields if f in first.(fields) && f fields_found]
fields_t = [f for f in t.fields if f in (fields) && f fields_found]
append!(fields_found, fields_t)
if length(fields_t) > 0
push!(tables, t)
Expand All @@ -47,113 +97,97 @@ function get_fields(wrdsuser::WRDS.WrdsUser, fields::Vector{Pair{Symbol, Symbol}
data = leftjoin(data, df, on=fields_on)
end
end
data
end

# Get the lags if requested
function get_fields(wrdsuser::WRDS.WrdsUser, fields::Vector{Field}; frequency="Annual", lag=0)
# Get all raw fields
fields_raw = raw_fields(fields)
if lag != 0
if frequency == "Annual"
table = WRDS.WrdsTable(wrdsuser, "compustat", "funda")
dlag = Year(lag)
add_fields!(data, wrdsuser, [:fyear])
dropmissing!(data, :fyear)
data._date_ = Date.(data.fyear)
select!(data, Not(:fyear))
:fyear fields_raw ? push!(fields_raw, :fyear) : nothing
elseif frequency == "Quarterly"
error("Get lagged field for quarterly Compustat data yet to be implemented.")
# table = WRDS.WrdsTable(wrdsuser, "compustat", "fundq")
# dlag = Quarter(lag)
end
key = [:gvkey, table.format_index...]
# If duplicate filings for a given lagby date, keep the one with the latest datadate
dfg = groupby(data, [key..., :_date_])
transform!(dfg, :datadate => maximum => :datadate_latest)
filter!([:datadate, :datadate_latest] => (x, y) -> x .== y, data)
select!(data, Not(:datadate_latest))
# Check that there are no duplicates
if length(findall(nonunique(data[!, [key..., :_date_]]))) > 0
error("Index for table $(table.table), schema $(table.schema) and vendor $(table.vendor) is non-unique.")
end
# Create lagged fields
data._date_lag_ = data._date_ .+ dlag
names_lag = [first(f) => Symbol("__$(String(first(f)))__") for f in fields]
data_lag = rename(data[!, [key..., first.(fields)..., :_date_lag_]], names_lag)
leftjoin!(data, data_lag, on=[key..., :_date_ => :_date_lag_] )
select!(data, Not([:_date_, :_date_lag_, first.(fields)...]))
rename!(data, [last(f) => first(f) for f in names_lag])
end

# Rename variables
rename!(data, fields)
data
end
data = get_raw_fields(wrdsuser, fields_raw; frequency=frequency)

function get_fields(wrdsuser::WRDS.WrdsUser, fields::Vector{Symbol}; kwargs...)
fields = [f => f for f in fields]
get_fields(wrdsuser, fields; kwargs...)
end
# Transform and rename
index = table_index(frequency)
for field in fields
# Create the dataframe required by the field and store the computed column under the field's name
data[!, field.name] .= compute_field(data, field, index)
end

function add_fields!(data::DataFrame, wrdsuser::WRDS.WrdsUser, fields::Vector{Pair{Symbol, Symbol}}; kwargs...)
# Check fields do not already exist
existing_fields = intersect(Symbol.(names(data)), last.(fields))
if length(existing_fields) > 0
error("Fields $existing_fields already exist in the data.")
fields_names = [f.name for f in fields]

# Get the lags if requested
if lag != 0
lag!(data, wrdsuser, fields_names; frequency=frequency, lag=lag)
end
# Add the fields to the data
df = get_fields(wrdsuser, fields; kwargs...)
fields_on = Symbol.([f for f names(data) if f names(df)])
leftjoin!(data, df, on=fields_on)

# Only keep the transformed fields
select!(data, [index..., fields_names...])
data
end

function add_fields!(data::DataFrame, wrdsuser::WRDS.WrdsUser, fields::Vector{Symbol}; kwargs...)
fields = [f => f for f in fields]
add_fields!(data, wrdsuser, fields; kwargs...)
"""
    get_fields(wrdsuser::WRDS.WrdsUser, fields::Vector; kwargs...)

Convert every element of `fields` with `Field` and forward to the `Vector{Field}`
method, passing `kwargs` through unchanged.
"""
function get_fields(wrdsuser::WRDS.WrdsUser, fields::Vector; kwargs...)
    # A typed comprehension always yields a `Vector{Field}`, even for empty input, so
    # the call below cannot dispatch back to this method.
    converted = Field[Field(f) for f in fields]
    return get_fields(wrdsuser, converted; kwargs...)
end

function add_field!(data::DataFrame, wrdsuser::WRDS.WrdsUser, field::Pair{<:Function, Symbol}; frequency="Annual")
function lag!(data::DataFrame, wrdsuser::WRDS.WrdsUser, fields::Vector{Symbol}; frequency="Annual", lag=0)
if frequency == "Annual"
table = WRDS.WrdsTable(wrdsuser, "compustat", "funda")
dlag = Year(lag)
if "fyear" names(data)
error("Field 'fyear' is required to compute lagged variables.")
end
df = dropmissing(data[!, [table.index..., :fyear, fields...]], :fyear)
df._date_ = Date.(df.fyear)
select!(df, Not(:fyear))
elseif frequency == "Quarterly"
table = WRDS.WrdsTable(wrdsuser, "compustat", "fundq")
error("Get lagged field for quarterly Compustat data yet to be implemented.")
# table = WRDS.WrdsTable(wrdsuser, "compustat", "fundq")
# dlag = Quarter(lag)
end
fn = first(field)
df = fn()
if String(nameof(fn)) names(df)
error("The function $fn should return a dataframe with a column $(nameof(fn)) corresponding to the new variable.")
end
select!(df, [table.index..., nameof(fn)] )
rename!(df, nameof(fn) => last.(field))
fields_on = Symbol.([f for f names(data) if f names(df)])
leftjoin!(data, df, on=fields_on)
end

function add_field!(data::DataFrame, wrdsuser::WRDS.WrdsUser, field::Function; kwargs...)
field = field => nameof(field)
add_field!(data, wrdsuser, field; kwargs...)
end

function add_fields!(data::DataFrame, wrdsuser::WRDS.WrdsUser, fields::Vector{Pair{<:Function, Symbol}}; kwargs...)
for field in fields
add_field!(data, wrdsuser, field; kwargs...)
key = [:gvkey, table.format_index...]
# If duplicate filings for a given lagby date, keep the one with the latest datadate
dfg = groupby(df, [key..., :_date_])
transform!(dfg, :datadate => maximum => :datadate_latest)
filter!([:datadate, :datadate_latest] => (x, y) -> x .== y, df)
select!(df, Not(:datadate_latest))
# Check that there are no duplicates
if length(findall(nonunique(df[!, [key..., :_date_]]))) > 0
error("Index for table $(table.table), schema $(table.schema) and vendor $(table.vendor) is non-unique.")
end
# Create lagged fields
df._date_lag_ = df._date_ .+ dlag
names_lag = [f => Symbol("__$(String(f))__") for f in fields]
dfm = rename(df[!, [key..., fields..., :_date_lag_]], names_lag)
leftjoin!(df, dfm, on=[key..., :_date_ => :_date_lag_] )
select!(df, Not([:_date_, :_date_lag_, fields...]))
leftjoin!(data, df, on=table.index)
select!(data, Not([fields...]))
rename!(data, [last(f) => first(f) for f in names_lag])
end

function add_fields!(data::DataFrame, wrdsuser::WRDS.WrdsUser, fields::Vector{<:Function}; kwargs...)
for field in fields
add_field!(data, wrdsuser, field; kwargs...)
"""
    add_fields!(data::DataFrame, wrdsuser::WRDS.WrdsUser, fields::Vector{Field}; kwargs...)

Fetch `fields` with `get_fields` and left-join them onto `data` in place, using the
columns that `data` and the fetched data have in common as join keys.

Raises an error if a column named after one of the requested fields already exists in
`data`. `kwargs` are forwarded to `get_fields`.
"""
function add_fields!(
    data::DataFrame, wrdsuser::WRDS.WrdsUser, fields::Vector{Field}; kwargs...
)
    # Check fields do not already exist
    existing_fields = intersect(Symbol.(names(data)), [f.name for f in fields])
    if !isempty(existing_fields)
        error("Fields $existing_fields already exist in the data.")
    end
    # Add the fields to the data
    df = get_fields(wrdsuser, fields; kwargs...)
    # Join keys: columns of `data` that are also present in `df`, in `data` order.
    fields_on = Symbol.(intersect(names(data), names(df)))
    return leftjoin!(data, df, on=fields_on)
end

function add_fields!(data::DataFrame, wrdsuser::WRDS.WrdsUser, fields::Vector{Any}; kwargs...)
fields_symbols = fields[typeof.(fields) .== Symbol]
fields_symbols_pairs = fields[typeof.(fields) .== Pair{Symbol, Symbol}]

fields_1 = [[f => f for f in fields_symbols]..., [first(f) => last(f) for f fields_symbols_pairs]...]
add_fields!(data, wrdsuser, fields_1; kwargs)

fields_others = fields[.~(typeof.(fields) .== Symbol) .&& .~(typeof.(fields) .== Pair{Symbol, Symbol})]
for field in fields_others
add_field!(data, wrdsuser, field; kwargs...)
end
"""
    add_fields!(data::DataFrame, wrdsuser::WRDS.WrdsUser, fields::Vector; kwargs...)

Convert every element of `fields` with `Field` and forward to the `Vector{Field}`
method, passing `kwargs` through unchanged.
"""
function add_fields!(data::DataFrame, wrdsuser::WRDS.WrdsUser, fields::Vector; kwargs...)
    # A typed comprehension always yields a `Vector{Field}`, even for empty input, so
    # the call below cannot dispatch back to this method.
    converted = Field[Field(f) for f in fields]
    return add_fields!(data, wrdsuser, converted; kwargs...)
end


end # module

0 comments on commit 2409eff

Please sign in to comment.