Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix missing level, adds some new #16

Merged
merged 1 commit into from
Aug 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "TidierCats"
uuid = "79ddc9fe-4dbf-4a56-a832-df41fb326d23"
authors = ["Daniel Rizk"]
version = "0.1.1"
version = "0.1.2"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand All @@ -10,10 +10,10 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
CategoricalArrays = "0.10"
CategoricalArrays = "0.10, 1.0"
DataFrames = "1.5"
Reexport = "0.2, 1"
julia = "1.6"
julia = "1.9"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
- `cat_collapse()`
- `cat_lump_min()`
- `cat_lump_prop()`
- `cat_recode()`
- `cat_other()`
- `cat_replace_missing()`
- `as_categorical()`

## Installation
Expand Down
3 changes: 3 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ In addition, this package includes:
- `cat_collapse()`
- `cat_lump_min()`
- `cat_lump_prop()`
- `cat_recode()`
- `cat_other()`
- `cat_replace_missing()`
- `as_categorical()`
152 changes: 147 additions & 5 deletions src/TidierCats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ using Reexport
@reexport using CategoricalArrays

export cat_rev, cat_relevel, cat_infreq, cat_lump, cat_reorder, cat_collapse, cat_lump_min, cat_lump_prop
export as_categorical, as_integer
export as_categorical, as_integer, cat_replace_missing, cat_other, cat_recode
include("catsdocstrings.jl")

"""
Expand All @@ -24,10 +24,58 @@ end
"""
$docstring_cat_relevel
"""
function cat_relevel(cat_array::CategoricalArray, levels_order::Vector{String})
ordered_levels = [x for x in levels_order if x in levels(cat_array)]
append!(ordered_levels, [x for x in levels(cat_array) if x ∉ ordered_levels])
new_cat_array = CategoricalArray([String(v) for v in cat_array], ordered=true, levels=ordered_levels)
function cat_relevel(cat_array::CategoricalArray{Union{Missing, String}}, levels_order::Vector{Union{String, Missing}})
    # Reorder the levels of a missing-aware categorical array.
    #
    # Levels named in `levels_order` come first, in the order given; every
    # remaining level follows in its current order. Names that are not existing
    # levels are skipped, and a level named twice is placed once.
    #
    # `missing` entries in `levels_order` are ignored: `levels` does not report
    # `missing` as a level by default, so there is nothing to reposition.
    #
    # Returns a relevelled copy; the input array is left untouched, matching the
    # keyword (`after`) method of `cat_relevel`.
    unwrapped_levels = unwrap.(levels(cat_array))

    ordered_levels = String[]
    for x in levels_order
        if !ismissing(x) && x in unwrapped_levels && x ∉ ordered_levels
            push!(ordered_levels, x)
        end
    end
    for x in unwrapped_levels
        if !ismissing(x) && x ∉ ordered_levels
            push!(ordered_levels, x)
        end
    end

    # Work on a copy so the caller's array keeps its original level order.
    releveled = copy(cat_array)
    levels!(releveled, ordered_levels)
    return releveled
end

# Move the levels named in `levels_order` to a new position among the levels of
# `cat_array` and return a relevelled copy (the input is not modified).
#
# - `levels_order`: levels to move, in the order they should appear. Names that
#   are not existing levels are skipped; a name listed twice is placed once.
# - `after`: how many of the *unmentioned* levels should precede the moved
#   block. `0` (the default) puts the moved levels first. Values larger than the
#   number of unmentioned levels put the moved levels last.
#
# Throws an error when `after` is negative or exceeds the number of levels.
function cat_relevel(cat_array, levels_order::Vector{String}; after::Int = 0)
    current_levels = levels(cat_array)

    if after < 0 || after > length(current_levels)
        error("'after' must be between 0 and the number of levels")
    end

    # `unique` guards against a level being requested twice, which would
    # otherwise hand duplicate levels to `levels!`.
    mentioned_levels = unique([x for x in levels_order if x in current_levels])
    unmentioned_levels = [x for x in current_levels if x ∉ mentioned_levels]

    # Splice the mentioned block into the unmentioned levels. Clamping keeps the
    # slice valid when `after` points past the last unmentioned level.
    insert_at = min(after, length(unmentioned_levels))
    new_levels = vcat(
        unmentioned_levels[1:insert_at],
        mentioned_levels,
        unmentioned_levels[(insert_at + 1):end],
    )

    # Apply the new order to a copy so the caller's array is left as it was.
    new_cat_array = copy(cat_array)
    levels!(new_cat_array, new_levels)

    return new_cat_array
end

Expand Down Expand Up @@ -188,4 +236,98 @@ function as_integer(cat_array::CategoricalArray)
return CategoricalArrays.levelcode.(cat_array)
end

"""
$docstring_cat_replace_missing
"""
function cat_replace_missing(cat_array::CategoricalArray{Union{Missing, String}}, txt::String = "missing")
    # Return a new array in which every `missing` entry is replaced by the level
    # `txt`; the input array is not modified. `txt` defaults to "missing", the
    # default given in the documented signature.
    return replace(cat_array, missing => txt)
end

"""
$docstring_cat_other
"""
function cat_other(f::Union{CategoricalArray, AbstractVector};
                   keep::Union{Nothing, Vector{String}} = nothing,
                   drop::Union{Nothing, Vector{String}} = nothing,
                   other_level::String = "Other")
    # Collapse a chosen set of levels into a single `other_level`.
    #
    # Exactly one of `keep` / `drop` must be given:
    # - `keep`: every level NOT listed is replaced by `other_level`.
    # - `drop`: every level listed is replaced by `other_level`.
    #
    # Returns a new categorical array whose levels are the surviving levels in
    # their current order, followed by `other_level`. The input is not modified.
    # Missing entries are left as they are.

    if !isnothing(keep) && !isnothing(drop)
        error("Only one of 'keep' or 'drop' should be specified, not both.")
    end

    if isnothing(keep) && isnothing(drop)
        error("Either 'keep' or 'drop' must be specified.")
    end

    # Always work on a fresh categorical array.
    new_f = f isa CategoricalArray ? copy(f) : categorical(f)

    # Snapshot the raw level values so later `levels!` calls cannot alias them.
    current_levels = unwrap.(levels(new_f))

    levels_to_change = if isnothing(keep)
        intersect(current_levels, drop)
    else
        setdiff(current_levels, keep)
    end

    # Register `other_level` explicitly before assigning it: ordered arrays
    # refuse to grow their level set implicitly on assignment.
    if other_level ∉ current_levels
        levels!(new_f, vcat(current_levels, other_level))
    end

    # Build the mask element by element so `missing` entries yield `false`
    # instead of a `missing` index, which cannot be used for indexing.
    mask = Bool[!ismissing(v) && v in levels_to_change for v in new_f]
    new_f[mask] .= other_level

    # Surviving levels keep their order; `other_level` always goes last, even if
    # it already existed as a kept level.
    kept_levels = setdiff(current_levels, levels_to_change)
    new_levels = vcat(filter(l -> l != other_level, kept_levels), other_level)
    levels!(new_f, new_levels)

    return new_f
end


"""
$docstring_cat_recode
"""
function cat_recode(f::Union{CategoricalArray, AbstractVector}; kwargs...)
    # Rename or merge levels. Each keyword is `new_level = old_levels`, where
    # `old_levels` is a collection of existing level names (a single string is
    # also accepted). Old levels that do not exist trigger a warning and are
    # skipped.
    #
    # Keyword names always arrive as `Symbol`s, so `nothing = [...]` produces a
    # level literally called "nothing"; it does not remove levels.
    #
    # Returns a new categorical array whose levels are the values actually
    # present, in order of first appearance. The input is not modified, and
    # missing entries are left as they are.

    # Always work on a fresh categorical array.
    new_f = f isa CategoricalArray ? copy(f) : categorical(f)

    for (new_level, old_levels) in kwargs
        new_level_str = String(new_level)

        # Treat a bare string as one level rather than iterating its characters.
        old_levels_str = if old_levels isa AbstractString
            [String(old_levels)]
        else
            [String(level) for level in old_levels]
        end

        for old_level in old_levels_str
            if old_level in levels(new_f)
                # Register the target level explicitly: ordered arrays refuse to
                # grow their level set implicitly on assignment.
                if new_level_str ∉ levels(new_f)
                    levels!(new_f, vcat(unwrap.(levels(new_f)), new_level_str))
                end
                # Element-wise mask so `missing` entries yield `false` instead
                # of a `missing` index.
                mask = Bool[!ismissing(v) && v == old_level for v in new_f]
                new_f[mask] .= new_level_str
            else
                @warn "Unknown level in input factor: $old_level"
            end
        end
    end

    # Drop levels that are no longer used; order follows first appearance.
    levels!(new_f, unique(skipmissing(new_f)))

    return new_f
end



end
113 changes: 102 additions & 11 deletions src/catsdocstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@ julia> cat_rev(cat_array)

const docstring_cat_relevel =
"""
cat_relevel(cat_array::CategoricalArray, levels_order::Vector{String})
cat_relevel(cat_array::CategoricalArray, levels_order::Vector{String}; after::Int=0)

Reorders the levels in a categorical array according to the provided order.

# Arguments
`cat_array`: Input categorical array.
`levels_order`: Vector of levels in the desired order.

`after`: Keyword argument. Position after which the levels in `levels_order` are inserted. The default, `0`, places them first.
# Returns
Categorical array with levels reordered according to levels_order.

Expand All @@ -59,14 +59,16 @@ julia> cat_array = CategoricalArray(["A", "B", "C", "A", "B", "B"], ordered=true
"B"
"B"

julia> cat_relevel(cat_array, ["B", "A", "C"])
6-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
"A"
"B"
"C"
"A"
"B"
"B"
julia> println(levels(cat_relevel(cat_array, ["B", "A", "C"])))
["B", "A", "C"]

julia> println(levels(cat_relevel(cat_array, ["A"], after=1)))
["B", "A", "C"]

julia> cat_array = CategoricalArray(["A", "B", "C", "A", "B", missing], ordered=true);

julia> println(levels(cat_relevel(cat_array, ["C", "A", "B", missing]), skipmissing=false))
Union{Missing, String}["C", "A", "B", missing]
```
"""

Expand Down Expand Up @@ -316,4 +318,93 @@ julia> cat_lump_prop(cat_array, 0.3)
const docstring_as_integer =
"""
Converts a CategoricalValue or CategoricalArray to an integer or vector of integers.
"""
"""
const docstring_cat_replace_missing =
"""
    cat_replace_missing(cat_array::CategoricalArray, txt::String="missing")

Replaces the missing values in a categorical array with an explicit level.

# Arguments
- `cat_array`: Categorical array whose missing values should be replaced.
- `txt`: The level name that takes the place of missing values.

# Returns
A new categorical array in which every missing value has been replaced by `txt`. The input array is not modified.

# Examples

```jldoctest
julia> cat_array = CategoricalArray(["a", "b", missing, "a", missing, "c"]);

julia> recoded = cat_replace_missing(cat_array, "unknown");

julia> count(==("unknown"), recoded)
2

julia> any(ismissing, recoded)
false

julia> "unknown" in levels(recoded)
true
```
"""

const docstring_cat_recode =
"""
    cat_recode(cat_array::Union{CategoricalArray, AbstractVector}; kwargs...)

Recodes the levels in a categorical array based on a provided mapping.

# Arguments
- `cat_array`: Categorical array to recode
- `kwargs`: Keyword arguments of the form `new_level = old_levels`. Each keyword name is the new level, and its value is a vector of the existing levels to recode into it. Levels not listed in any value are kept the same.

# Returns
Categorical array with the levels recoded.

# Examples

```jldoctest
julia> x = CategoricalArray(["apple", "tomato", "banana", "dear"]);

julia> println(levels(cat_recode(x, fruit = ["apple", "banana"], nothing = ["tomato"])))
["fruit", "nothing", "dear"]
```
"""

const docstring_cat_other =
"""
    cat_other(cat_array::Union{CategoricalArray, AbstractVector}; keep=nothing, drop=nothing, other_level::String="Other")

Replaces a chosen set of levels in a categorical array with the 'other' level.

# Arguments
- `cat_array`: Categorical array to replace levels
- `keep`: Vector of levels to preserve. Every other level is replaced by `other_level`.
- `drop`: Vector of levels to replace by `other_level`. Every other level is preserved.
- `other_level`: The level name used for the replaced levels. Default is "Other".

Exactly one of `keep` or `drop` must be specified.

# Returns
Categorical array with the selected levels replaced by the 'other' level.

# Examples

```jldoctest
julia> cat_array = CategoricalArray(["A", "B", "C", "D", "E"]);

julia> cat_other(cat_array, drop = ["A", "B"])
5-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
 "Other"
 "Other"
 "C"
 "D"
 "E"
```
"""
Loading