diff --git a/docs/src/man/multiplealleles.md b/docs/src/man/multiplealleles.md index 6bf9319c..be380a30 100644 --- a/docs/src/man/multiplealleles.md +++ b/docs/src/man/multiplealleles.md @@ -52,7 +52,7 @@ should be used: df_ind = writeTableCF(countquartetsintrees(genetrees)...) # no mapping here: so quartet CFs across individuals -CSV.write("tableCF_individuals.csv", df) # to save to a file +CSV.write("tableCF_individuals.csv", df_ind) # to save to a file df_sp = mapAllelesCFtable(mappingFile, "tableCF_individuals.csv"); -d_sp = readTableCF!(df_sp); +d_sp = readTableCF!(df_sp, mergerows=true); ``` where the mapping file can be a text (or `csv`) file with two columns named `allele` and `species`, mapping each allele name to a species name. diff --git a/src/multipleAlleles.jl b/src/multipleAlleles.jl index cf141f3b..c82ce7d7 100644 --- a/src/multipleAlleles.jl +++ b/src/multipleAlleles.jl @@ -33,8 +33,8 @@ in the example below, this file is best read later with the option ```julia mapAllelesCFtable("allele-species-map.csv", "allele-quartet-CF.csv"; filename = "quartetCF_speciesNames.csv") -df_sp = DataFrame(CSV.File("quartetCF_speciesNames.csv"); copycols=false); # DataFrame object -dataCF_specieslevel = readTableCF!(df_sp); # DataCF object +df_sp = CSV.read("quartetCF_speciesNames.csv", DataFrame); # DataFrame object +dataCF_specieslevel = readTableCF!(df_sp, mergerows=true); # DataCF object ``` """ function mapAllelesCFtable(alleleDF::AbstractString, cfDF::AbstractString; @@ -85,87 +85,40 @@ end # inside readTableCF! 
# by deleting rows that are not informative like sp1 sp1 sp1 sp2 # keepOne=true: we only keep one allele per species -function cleanAlleleDF!(newdf::DataFrame, cols::Vector{Int};keepOne=false::Bool) - withngenes = (length(cols)==8) +function cleanAlleleDF!(newdf::DataFrame, cols::Vector{<:Integer}; keepOne=false::Bool) delrows = Int[] # indices of rows to delete - repSpecies = String[] + repSpecies = Set{String}() if(isa(newdf[1,cols[1]],Integer)) #taxon names as integers: we need this to be able to add __2 - newdf[!,cols[1]] = map(string, newdf[!,cols[1]]) - newdf[!,cols[2]] = map(string, newdf[!,cols[2]]) - newdf[!,cols[3]] = map(string, newdf[!,cols[3]]) - newdf[!,cols[4]] = map(string, newdf[!,cols[4]]) + for j in 1:4 + newdf[!,cols[j]] .= map(string, newdf[!,cols[j]]) + end end row = Vector{String}(undef, 4) - for i in 1:size(newdf,1) #check all rows - @debug "row number: $i" - # fixit: check for no missing value, or error below + for i in 1:nrow(newdf) map!(j -> newdf[i,cols[j]], row, 1:4) - @debug "row $(row)" uniq = unique(row) - @debug "unique $(uniq)" - keep = false # default: used if 1 unique name, or 2 in some cases if(length(uniq) == 4) - keep = true - else - if(!keepOne) - if(length(uniq) == 3) #sp1 sp1 sp2 sp3 + continue + end + # by now, at least 1 species is repeated + keep = false # default: delete the row; must be set outside `if !keepOne` so that `keep` is defined when keepOne=true + if !keepOne # then we may choose to keep this row + # 3 options: sp1 sp1 sp2 sp3; or sp1 sp1 sp2 sp2 (keep) + # or sp1 sp1 sp1 sp2; or sp1 sp1 sp1 sp1 (do not keep) + for u in uniq + ind = row .== u # indices of taxon names matching u + if sum(ind) == 2 keep = true - for u in uniq - @debug "u $(u), typeof $(typeof(u))" - ind = row .== u #taxon names matching u - @debug "taxon names matching u $(ind)" - if(sum(ind) == 2) - push!(repSpecies,string(u)) - found = false - for k in 1:4 - if(ind[k]) - if(found) - @debug "found the second one in k $(k), will change newdf[i,cols[k]] $(newdf[i,cols[k]]), typeof $(typeof(newdf[i,cols[k]]))" - newdf[i,cols[k]] = string(u, 
repeatAlleleSuffix) - break - else - found = true - end - end - end - break - end - end - elseif(length(uniq) == 2) - # keep was initialized to false - for u in uniq - @debug "length uniq is 2, u $(u)" - ind = row .== u - if(sum(ind) == 1 || sum(ind) == 3) - @debug "ind $(ind) is 1 or 3, should not keep" - break - elseif(sum(ind) == 2) - @debug "ind $(ind) is 2, should keep" - keep = true - found = false - push!(repSpecies,string(u)) - for k in 1:4 - if(ind[k]) - if(found) - newdf[i,cols[k]] = string(u, repeatAlleleSuffix) - break - else - found = true - end - end - end - end - end + push!(repSpecies, string(u)) + # change the second instance of a repeated taxon name with suffix + k = findlast(ind) + newdf[i,cols[k]] = string(u, repeatAlleleSuffix) end - @debug "after if, keep is $(keep)" end end keep || push!(delrows, i) - @debug "" keep end - @debug "" delrows - @debug "" repSpecies nrows = size(newdf,1) nkeep = nrows - length(delrows) if nkeep < nrows @@ -175,8 +128,7 @@ function cleanAlleleDF!(newdf::DataFrame, cols::Vector{Int};keepOne=false::Bool) nkeep > 0 || @warn "All 4-taxon subsets are uninformative, so the dataframe will be left empty" deleteat!(newdf, delrows) # deleteat! requires DataFrames 1.3 end - # @show size(newdf) - return unique(repSpecies) + return collect(repSpecies) end diff --git a/src/readData.jl b/src/readData.jl index b55224e9..652e3500 100644 --- a/src/readData.jl +++ b/src/readData.jl @@ -88,17 +88,17 @@ Optional arguments: The last version modifies the input data frame, if species are represented by multiple alleles for instance (see [`readTableCF!`](@ref)(data frame, columns)). """ -function readTableCF(file::AbstractString; delim=','::Char, summaryfile=""::AbstractString) +function readTableCF(file::AbstractString; delim=','::Char, summaryfile=""::AbstractString, kwargs...) df = DataFrame(CSV.File(file, delim=delim); copycols=false) - readTableCF!(df, summaryfile=summaryfile) + readTableCF!(df; summaryfile=summaryfile, kwargs...) 
end -function readTableCF(df0::DataFrames.DataFrame; summaryfile=""::AbstractString) +function readTableCF(df0::DataFrames.DataFrame; summaryfile=""::AbstractString, kwargs...) df = deepcopy(df0) - readTableCF!(df, summaryfile=summaryfile) + readTableCF!(df; summaryfile=summaryfile, kwargs...) end -function readTableCF!(df::DataFrames.DataFrame; summaryfile=""::AbstractString) +function readTableCF!(df::DataFrames.DataFrame; summaryfile=""::AbstractString, kwargs...) @debug "assume the numbers for the taxon read from the observed CF table match the numbers given to the taxon when creating the object network" alternativecolnames = [ # obsCF12 is as exported by fittedQuartetCF() [:CF12_34, Symbol("CF12.34"), :obsCF12], @@ -124,7 +124,7 @@ function readTableCF!(df::DataFrames.DataFrame; summaryfile=""::AbstractString) columns = [[1,2,3,4]; obsCFcol] if withngenes push!(columns, ngenecol) end - d = readTableCF!(df, columns) + d = readTableCF!(df, columns; kwargs...) if withngenes # && d.numTrees == -1 m1 = minimum([q.ngenes for q in d.quartet]) @@ -141,13 +141,14 @@ end # see docstring below, for readTableCF! # takes in df and 7 or 8 column numbers (4 labels + 3 CFs + ngenes possibly) -function readTableCF!(df::DataFrames.DataFrame, co::Vector{Int}) +function readTableCF!(df::DataFrames.DataFrame, co::Vector{Int}; mergerows=false) withngenes = (length(co)==8) # true if column :ngenes exists, false ow repSpecies = cleanAlleleDF!(df,co) # removes uninformative rows from df (not df0) # fixit: cleanAlleleDF! is time consuming but many times not needed # add option to skip it, if the user knows that each tip appears once only? 
- if !isempty(repSpecies) + if mergerows || !isempty(repSpecies) df = mergeRows(df,co) # warning: this 'df' is *not* changed externally + co = collect(eachindex(co)) # 1:7 or 1:8 end # we cannot move to mapAllelesCFtable because we need repSpecies in here quartets = Quartet[] for i in 1:size(df,1) diff --git a/test/test_multipleAlleles.jl b/test/test_multipleAlleles.jl index 92c334fa..8d11f5ca 100644 --- a/test/test_multipleAlleles.jl +++ b/test/test_multipleAlleles.jl @@ -74,6 +74,18 @@ sorttaxa!(dat) end # of testset: sorttaxa! @testset "snaq on multiple alleles" begin + +df = DataFrame(t1=["6","7"], t2=["7","6"], t3=["4","4"], t4=["8","8"], + a=[true,true], # to test recognition of columns + CF12_34=[0.25, 0.15], ngenes=[10,20], + CF13_24=[0.3,0.55], b=[false,false], CF14_23=[0.45,0.3]) +@test length(readTableCF(df).quartet) == 2 +d = readTableCF(df, mergerows=true) +@test isempty(d.repSpecies) +@test length(d.quartet) == 1 +@test d.quartet[1].obsCF ≈ [0.3, 0.5, 0.2] +@test d.quartet[1].ngenes ≈ 15 + df=DataFrame(t1=["6","6","10","6","6","7","7","7","7","7", "3", "7", "7"], # rows 11 & 13 (last & third to last): non-informative t2=["7","7","7","10","7","7","7","7","7","7", "7", "7", "7"], t3=["4","10","4","4","4","8","8","8","10","10","7", "6", "7"],