Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

22.7 fb kinship comparison #503

Open
wants to merge 1 commit into
base: release22.7-SNAPSHOT
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 187 additions & 24 deletions ehr/resources/pipeline/kinship/populateKinship.r
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
##
# Copyright (c) 2013-2019 LabKey Corporation
# Copyright (c) 2012-2017 LabKey Corporation
#
# Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
##
Expand All @@ -15,7 +15,11 @@ library(methods);
library(kinship2);
library(getopt);
library(Matrix);
library(dplyr);
library(dataCompareR);
library(Rlabkey);
library(lme4);
library(parallel);


spec <- matrix(c(
#'containerPath', '-c', 1, "character",
Expand All @@ -36,38 +40,197 @@ allPed$Species <- as.character(allPed$Species)
allPed$Species[is.na(allPed$Species)] <- c('Unknown')
allPed$Species <- as.factor(allPed$Species)

##### commented out after checking data
# duplicatedId=allPed$Id[duplicated(allPed$Id)]
# allPed[allPed$Id%in%duplicatedId,]# Check for duplication: passed
# Id.Dam=unique(allPed$Dam)
# allPed[(allPed$Id%in%Id.Dam)&(allPed$Gender!=2),]#Check for Dam that is not female
# Id.Sire=unique(allPed$Sire)
# allPed[(allPed$Id%in%Id.Sire)&(allPed$Gender!=1),]#Check for Sire that is not male:1 not passed

# this function adds missing parents to the pedigree
# it is similar to add.Inds from kinship; however, we retain gender
addMissing <- function(ped, species){
if(ncol(ped)<4)stop("pedigree should have at least 4 columns")
head <- names(ped)

nsires <- match(ped[,3],ped[,1])# [Quoc] change ped,2 to ped,3
nsires <- as.character(unique(ped[is.na(nsires),3]))
nsires <- nsires[!is.na(nsires)]
if(length(nsires)){
ped <- rbind(ped, data.frame(Id=nsires, Dam=rep(NA, length(nsires)), Sire=rep(NA, length(nsires)), Gender=rep(1, length(nsires)), Species=rep(species, length(nsires))));
}

ndams <- match(ped[,2],ped[,1])# [Quoc] change ped,3 to ped,2
ndams <- as.character(unique(ped[is.na(ndams),2]))
ndams <- ndams[!is.na(ndams)];

if(length(ndams)){
ped <- rbind(ped,data.frame(Id=ndams, Dam=rep(NA, length(ndams)), Sire=rep(NA, length(ndams)), Gender=rep(2, length(ndams)), Species=rep(species, length(ndams))));
}

names(ped) <- head
return(ped)
}

print(paste0('Getting previousKinship'))
library(Rlabkey);
labkey.url.base = "http://localhost:8080/labkey/"
labkey.url.path = "/WNPRC/EHR"

#labkey.setCurlOptions(ssl.verifypeer=FALSE, ssl.verifyhost=TRUE)
#previousRecords=NULL
previousKinship <- function(bool){
print('fetching previousKinship using quarterOfCores')
previousRecords <- labkey.selectRows (
baseUrl=labkey.url.base,
#to run directly in R, uncomment this line. otherwise providing a containerPath is not necessary
folderPath=labkey.url.path,
schemaName="ehr",
queryName="kinship",
colSelect=c('Id', 'Id2','coefficient'),
showHidden = TRUE,
colNameOpt = 'fieldname', #rname
#showHidden = FALSE
)
}
numCores <- detectCores()
quarterOfCores <- numCores/2
print(quarterOfCores)
#mclapply(TRUE,previousKinship, mc.cores = quarterOfCores)

# In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately
# It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half
newRecords=NULL
for (species in unique(allPed$Species)){


calculationKinship <- function(species){
newRecords=NULL
filename=NULL
previousRecords=NULL
compareResults=NULL
summaryResults=NULL
mismatchResults=NULL
#columnKeys=NULL


print(paste0('processing species: ', species))
allRecordsForSpecies <- allPed[allPed$Species == species,]
recordsForSpecies <- allPed[allPed$Species == species,]
print(paste0('total subjects: ', nrow(recordsForSpecies)))

recordsForSpecies <- addMissing(recordsForSpecies, species)
gc()
print(paste0('total subjects after adding missing: ', nrow(recordsForSpecies)))

fami=makefamid(id=recordsForSpecies$Id,father.id=recordsForSpecies$Sire,mother.id=recordsForSpecies$Dam)
famid=unique(fami)
famid=famid[famid!=0]
gc()

print(paste0('total families: ', length(famid)))
for (fam.no in famid){
familytemp=recordsForSpecies[fami==fam.no,]
# print(paste0('family size: ',species, ' ' , nrow(familytemp)))

temp.kin=kinship(familytemp$Id,familytemp$Sire,familytemp$Dam)
rownames(temp.kin) <- colnames(temp.kin) #add rownames to make matrix symmetric, which is required downstream
sparse.kin=as(temp.kin,"dsTMatrix") #change kinship matrix to symmetric triplet sparse matrix

# convert to a sparse matrix usng S4 object from Matrix library
temp.tri=data.frame(Id=colnames(temp.kin)[sparse.kin@i+1],Id2=colnames(temp.kin)[sparse.kin@j+1],coefficient=sparse.kin@x,stringsAsFactors=FALSE)
newRecords=rbind(newRecords,temp.tri)
}

filename <- paste0("kinship_",species,".txt");

firstRun = TRUE
print(paste("value of file.exists",file.exists(filename), filename))


if (file.exists(filename)){
print(paste0("opening previouskinship for ", species))
previousRecords <- read.table(filename, header = TRUE, sep="\t", colClasses = c("character","character", "numeric"))
#colnames(previousRecords)<c('Id', 'Id2', 'coefficient')
print(paste("removing duplicates",species,"previous", nrow(previousRecords), "newRecords",nrow(newRecords)))
#trimpreviousRecords <- subset(previousRecords,previousRecords$Id==previousRecords$Id2)
#removing coefficients for animals with same IDs
trimpreviousRecords <-previousRecords[(previousRecords$Id!=previousRecords$Id2),]
trimnewRecords <- newRecords[(newRecords$Id!=newRecords$Id2),]
#trimnewRecords <- subset(newRecords,newRecords$Id==newRecords$Id2)
#print number of records after trim
print(paste("after removal",species,"previous", nrow(trimpreviousRecords), "newRecords",nrow(trimnewRecords)))

compareResults <- rCompare(trimpreviousRecords,trimnewRecords,keys = c("Id","Id2"),roundDigits = 7,mismatches = NA,trimChars = TRUE);

summaryResults <- summary(compareResults)
print(summaryResults)
print(summaryResults$nrowSomeUnequal)

print(paste0('Generate mismatches for: ',species))
mismatchResults <- generateMismatchData(compareResults,trimpreviousRecords,trimnewRecords)
#print(mismatchResults)

print(paste0("nrowInAOnly ", summaryResults$nrowInAOnly, "nrowSomeUnequal ", summaryResults$nrowSomeUnequal))
oldFile <- paste0("previous_",filename)
file.rename(filename,oldFile)
firstRun = FALSE

}
print(paste("number of new records", is.null(newRecords)))

if ((firstRun || is.null(mismatchResults)) && !is.null(newRecords)){
print("Output table:")
print(str(newRecords))
print(filename)
write.table(newRecords, file = filename, append = FALSE,row.names=F,quote=F,sep="\t")
}
else if (!is.null(mismatchResults) ){
#&& (summaryResults$nrowInAOnly>0 && summaryResults$nrowSomeUnequal>0)
print("print comparison")
detailMismatches <- colsWithUnequalValues(COEFFICIENT,mismatchResults)
#print(summaryResults$colsWithUnequalValues)
mismatchFile <- paste0("mismatchesKinship",species,".txt")
saveReport(compareResults,reportName = mismatchFile, HTMLReport = FALSE)
write.table(detailMismatches, file = 'mismatches', append = FALSE,row.names=F,quote=F,sep="\t")
#write.table(mismatchResults, file = mismatchFile, append = FALSE, row.name=F,quote=F,sep="\t")
}


# Add missing parents for accurate kinship calculations
fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender))
}
system.time({
#lapply(unique(allPed$Species), calculationKinship)
mclapply(unique(allPed$Species), calculationKinship, mc.cores = quarterOfCores)
})

# Kinship is expecting records to be sorted IAW it's own pedigree function
recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0))

temp.kin=kinship(recordsForSpecies)

# Add rownames to make matrix symmetric, which is required downstream
rownames(temp.kin) <- colnames(temp.kin)

# Convert kinship matrix to a triplet list of two ids and their coefficient
summaryDf = as.data.frame(summary(as(temp.kin, "dgCMatrix")))
idList <- rownames(temp.kin)
temp.tri = data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)
#summary(previousRecords)
#See size of previousKinship to determine to add initial ran.

#columnKeys <- c('Id','Id2')
#compareResults <- rCompare(previousRecords,newRecords,keys = columnKeys,roundDigits = NA,mismatches = NA,trimChars = FALSE);
#summaryResults <- summary(compareResults)
#print(summaryResults)
#print(summaryResults('nrowSomeUnequal'))
print(paste0('nrowCommon'))
#print(summaryResults$nrowCommon)
#print(summaryResults$nrowInAOnly)
#print(summaryResults$nrowInBOnly)
#print(summaryResults$datasetSummary)
#print(paste0('Number of rows with same results: ',nrowSomeUnequal(summaryResults)))
#print(paste0('Generate mismatches'))
#mismatchResults <- generateMismatchData(compareResults,previousKinship,newRecords)
#print(mismatchResults)

# Now filter out parents added for kinship calculation
temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE))
temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE))

newRecords=rbind(newRecords,temp.tri)
print(paste0('total subjects: ', nrow(allRecordsForSpecies)))
}

# write TSV to disk
print("Output table:");
print(str(newRecords));
write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t");
#print(str(newRecords));
#print('value of mismatches ',is.null(mismatchResults))
#if (is.null(mismatchResults)){
#write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t")
#}
#if(!is.null(mismatchResults)){
#write.table('null', file = "kinship.txt", append = FALSE, row.name=F,quote=F,sep="\t")
#}