Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify clinical data binning related SQL #10823

Merged
merged 2 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.cbioportal.model.CopyNumberCountByGene;
import org.cbioportal.model.GenomicDataCount;
import org.cbioportal.model.Sample;
import org.cbioportal.web.parameter.ClinicalDataType;
import org.cbioportal.web.parameter.StudyViewFilter;

import java.util.List;
Expand All @@ -32,6 +33,8 @@ public interface StudyViewRepository {

List<ClinicalAttribute> getClinicalAttributes();

Map<String, ClinicalDataType> getClinicalAttributeDatatypeMap();

List<CaseListDataCount> getCaseListDataCounts(StudyViewFilter studyViewFilter);

Map<String, AlterationCountByGene> getTotalProfiledCounts(StudyViewFilter studyViewFilter, String alterationType);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,6 @@ List<CopyNumberCountByGene> getCnaGenes(StudyViewFilter studyViewFilter, Categor
List<AlterationCountByGene> getStructuralVariantGenes(StudyViewFilter studyViewFilter, CategorizedClinicalDataCountFilter categorizedClinicalDataCountFilter,
boolean applyPatientIdFilters, AlterationFilterHelper alterationFilterHelper);

List<ClinicalDataCount> getPatientClinicalDataCounts(StudyViewFilter studyViewFilter, CategorizedClinicalDataCountFilter categorizedClinicalDataCountFilter,
boolean applyPatientIdFilters, List<String> attributeIds, List<String> filteredAttributeValues);

List<ClinicalDataCount> getSampleClinicalDataCounts(StudyViewFilter studyViewFilter, CategorizedClinicalDataCountFilter categorizedClinicalDataCountFilter,
boolean applyPatientIdFilters, List<String> attributeIds, List<String> filteredAttributeValues );

List<ClinicalDataCount> getClinicalDataCounts(StudyViewFilter studyViewFilter, CategorizedClinicalDataCountFilter categorizedClinicalDataCountFilter,
boolean applyPatientIdFilters, List<String> attributeIds, List<String> filteredAttributeValues);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.cbioportal.persistence.enums.ClinicalAttributeDataSource;
import org.cbioportal.persistence.helper.AlterationFilterHelper;
import org.cbioportal.web.parameter.CategorizedClinicalDataCountFilter;
import org.cbioportal.web.parameter.ClinicalDataType;
import org.cbioportal.web.parameter.StudyViewFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;
Expand Down Expand Up @@ -83,6 +84,25 @@ public List<ClinicalAttribute> getClinicalAttributes() {
return mapper.getClinicalAttributes();
}

@Override
public Map<String, ClinicalDataType> getClinicalAttributeDatatypeMap() {
if (clinicalAttributesMap.isEmpty()) {
buildClinicalAttributeNameMap();
}

Map<String, ClinicalDataType> attributeDatatypeMap = new HashMap<>();

clinicalAttributesMap
.get(ClinicalAttributeDataSource.SAMPLE)
.forEach(attribute -> attributeDatatypeMap.put(attribute.getAttrId(), ClinicalDataType.SAMPLE));

clinicalAttributesMap
.get(ClinicalAttributeDataSource.PATIENT)
.forEach(attribute -> attributeDatatypeMap.put(attribute.getAttrId(), ClinicalDataType.PATIENT));

return attributeDatatypeMap;
}

@Override
public List<CaseListDataCount> getCaseListDataCounts(StudyViewFilter studyViewFilter) {
CategorizedClinicalDataCountFilter categorizedClinicalDataCountFilter = extractClinicalDataCountFilters(studyViewFilter);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import org.cbioportal.model.GenomicDataCount;
import org.cbioportal.model.CopyNumberCountByGene;
import org.cbioportal.model.Sample;
import org.cbioportal.web.parameter.ClinicalDataType;
import org.cbioportal.web.parameter.StudyViewFilter;

import java.util.List;
import java.util.Map;

public interface StudyViewColumnarService {

Expand All @@ -19,6 +21,8 @@ public interface StudyViewColumnarService {
List<CopyNumberCountByGene> getCnaGenes(StudyViewFilter interceptedStudyViewFilter);
List<AlterationCountByGene> getStructuralVariantGenes(StudyViewFilter studyViewFilter);

Map<String, ClinicalDataType> getClinicalAttributeDatatypeMap();

List<ClinicalDataCountItem> getClinicalDataCounts(StudyViewFilter studyViewFilter, List<String> filteredAttributes);

List<CaseListDataCount> getCaseListDataCounts(StudyViewFilter studyViewFilter);
Expand All @@ -28,6 +32,5 @@ public interface StudyViewColumnarService {
List<ClinicalData> getSampleClinicalData(StudyViewFilter studyViewFilter, List<String> attributeIds);

List<GenomicDataCount> getGenomicDataCounts(StudyViewFilter studyViewFilter);



}
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
import org.cbioportal.persistence.StudyViewRepository;
import org.cbioportal.service.AlterationCountService;
import org.cbioportal.service.StudyViewColumnarService;
import org.cbioportal.web.parameter.ClinicalDataType;
import org.cbioportal.web.parameter.StudyViewFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.cache.annotation.Cacheable;
import org.springframework.stereotype.Service;

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

@Service
Expand Down Expand Up @@ -58,6 +60,11 @@ public List<AlterationCountByGene> getStructuralVariantGenes(StudyViewFilter stu
return alterationCountService.getStructuralVariantGenes(studyViewFilter);
}

@Override
public Map<String, ClinicalDataType> getClinicalAttributeDatatypeMap() {
return studyViewRepository.getClinicalAttributeDatatypeMap();
}

@Override
public List<ClinicalDataCountItem> getClinicalDataCounts(StudyViewFilter studyViewFilter, List<String> filteredAttributes) {
return studyViewRepository.getClinicalDataCounts(studyViewFilter, filteredAttributes)
Expand Down Expand Up @@ -85,6 +92,4 @@ public List<ClinicalData> getPatientClinicalData(StudyViewFilter studyViewFilter
public List<ClinicalData> getSampleClinicalData(StudyViewFilter studyViewFilter, List<String> attributeIds) {
return studyViewRepository.getSampleClinicalData(studyViewFilter, attributeIds);
}


}
71 changes: 25 additions & 46 deletions src/main/java/org/cbioportal/web/columnar/ClinicalDataBinner.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

@Component
public class ClinicalDataBinner {
Expand All @@ -27,6 +26,15 @@ public ClinicalDataBinner(
this.dataBinner = dataBinner;
}

private List<ClinicalData> convertCountsToData(List<ClinicalDataCount> clinicalDataCounts)
{
return clinicalDataCounts
.stream()
.map(NewClinicalDataBinUtil::generateClinicalDataFromClinicalDataCount)
.flatMap(Collection::stream)
.toList();
}

haynescd marked this conversation as resolved.
Show resolved Hide resolved
@Cacheable(cacheResolver = "generalRepositoryCacheResolver", condition = "@cacheEnabledConfig.getEnabled()")
public List<ClinicalDataBin> fetchClinicalDataBinCounts(
DataBinMethod dataBinMethod,
Expand All @@ -43,53 +51,30 @@ public List<ClinicalDataBin> fetchClinicalDataBinCounts(
List<String> attributeIds = attributes.stream().map(ClinicalDataBinFilter::getAttributeId).collect(Collectors.toList());

// a new StudyView filter to partially filter by study and sample ids only
StudyViewFilter partialFilter = new StudyViewFilter();
partialFilter.setStudyIds(studyViewFilter.getStudyIds());
partialFilter.setSampleIdentifiers(studyViewFilter.getSampleIdentifiers());

// filter only by study id and sample identifiers, ignore rest
// we need this additional partial filter because we always need to know the bins generated for the initial state
// which allows us to keep the number of bins and bin ranges consistent even if there are additional data filters.
// we only want to update the counts for each bin, we don't want to regenerate the bins for the filtered data.
// NOTE: partial filter is only needed when dataBinMethod == DataBinMethod.STATIC but that's always the case
// for the frontend implementation. we can't really use dataBinMethod == DataBinMethod.DYNAMIC because of the
// complication it brings to the frontend visualization and filtering
List<Sample> unfilteredSamples = studyViewColumnarService.getFilteredSamples(partialFilter);
List<Sample> filteredSamples = studyViewColumnarService.getFilteredSamples(studyViewFilter);

// TODO make sure unique sample and patient keys don't need to be distinct
List<String> unfilteredUniqueSampleKeys = unfilteredSamples.stream().map(Sample::getUniqueSampleKey).collect(Collectors.toList());
List<String> filteredUniqueSampleKeys = filteredSamples.stream().map(Sample::getUniqueSampleKey).collect(Collectors.toList());
List<String> unfilteredUniquePatientKeys = unfilteredSamples.stream().map(Sample::getUniquePatientKey).collect(Collectors.toList());
List<String> filteredUniquePatientKeys = filteredSamples.stream().map(Sample::getUniquePatientKey).collect(Collectors.toList());
StudyViewFilter partialFilter = new StudyViewFilter();
partialFilter.setStudyIds(studyViewFilter.getStudyIds());
partialFilter.setSampleIdentifiers(studyViewFilter.getSampleIdentifiers());

// TODO make sure we don't need a distinction between sample vs patient attribute ids here
// ideally we shouldn't because we have patient clinical data separated from sample clinical data in clickhouse

// we need the clinical data for the partial filter in order to generate the bins for initial state
// we use the filtered data to calculate the counts for each bin, we do not regenerate bins for the filtered data
List<ClinicalData> unfilteredClinicalDataForSamples = studyViewColumnarService.getSampleClinicalData(partialFilter, attributeIds);
List<ClinicalData> filteredClinicalDataForSamples = studyViewColumnarService.getSampleClinicalData(studyViewFilter, attributeIds);
List<ClinicalData> unfilteredClinicalDataForPatients = studyViewColumnarService.getPatientClinicalData(partialFilter, attributeIds);
List<ClinicalData> filteredClinicalDataForPatients = studyViewColumnarService.getPatientClinicalData(studyViewFilter, attributeIds);
// we use the filtered data to calculate the counts for each bin, we do not regenerate bins for the filtered data
List<ClinicalDataCountItem> unfilteredClinicalDataCounts = studyViewColumnarService.getClinicalDataCounts(partialFilter, attributeIds);
List<ClinicalDataCountItem> filteredClinicalDataCounts = studyViewColumnarService.getClinicalDataCounts(studyViewFilter, attributeIds);

Map<String, ClinicalDataType> attributeDatatypeMap = NewClinicalDataBinUtil.toAttributeDatatypeMap(
unfilteredClinicalDataForSamples.stream().map(ClinicalData::getAttrId).collect(Collectors.toList()),
unfilteredClinicalDataForPatients.stream().map(ClinicalData::getAttrId).collect(Collectors.toList()),
Collections.emptyList() // TODO ignoring conflictingPatientAttributeIds for now
// TODO ignoring conflictingPatientAttributeIds for now
alisman marked this conversation as resolved.
Show resolved Hide resolved
List<ClinicalData> unfilteredClinicalData = convertCountsToData(
unfilteredClinicalDataCounts.stream().flatMap(c -> c.getCounts().stream()).toList()
);
List<ClinicalData> filteredClinicalData = convertCountsToData(
filteredClinicalDataCounts.stream().flatMap(c -> c.getCounts().stream()).toList()
);

List<Binnable> unfilteredClinicalData = Stream.of(
unfilteredClinicalDataForSamples,
unfilteredClinicalDataForPatients
// unfilteredClinicalDataForConflictingPatientAttributes /// TODO ignoring conflictingPatientAttributeIds for now
).flatMap(Collection::stream).collect(Collectors.toList());

List<Binnable> filteredClinicalData = Stream.of(
filteredClinicalDataForSamples,
filteredClinicalDataForPatients
// filteredClinicalDataForConflictingPatientAttributes // TODO ignoring conflictingPatientAttributeIds for now
).flatMap(Collection::stream).collect(Collectors.toList());
Map<String, ClinicalDataType> attributeDatatypeMap = studyViewColumnarService.getClinicalAttributeDatatypeMap();

Map<String, List<Binnable>> unfilteredClinicalDataByAttributeId =
unfilteredClinicalData.stream().collect(Collectors.groupingBy(Binnable::getAttrId));
Expand All @@ -100,17 +85,13 @@ public List<ClinicalDataBin> fetchClinicalDataBinCounts(
List<ClinicalDataBin> clinicalDataBins = Collections.emptyList();

if (dataBinMethod == DataBinMethod.STATIC) {
if (!unfilteredSamples.isEmpty() && !unfilteredClinicalData.isEmpty()) {
if (!unfilteredClinicalData.isEmpty()) {
clinicalDataBins = NewClinicalDataBinUtil.calculateStaticDataBins(
dataBinner,
attributes,
attributeDatatypeMap,
unfilteredClinicalDataByAttributeId,
filteredClinicalDataByAttributeId,
unfilteredUniqueSampleKeys,
unfilteredUniquePatientKeys,
filteredUniqueSampleKeys,
filteredUniquePatientKeys
filteredClinicalDataByAttributeId
);
}
}
Expand All @@ -123,9 +104,7 @@ public List<ClinicalDataBin> fetchClinicalDataBinCounts(
dataBinner,
attributes,
attributeDatatypeMap,
filteredClinicalDataByAttributeId,
filteredUniqueSampleKeys,
filteredUniquePatientKeys
filteredClinicalDataByAttributeId
);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package org.cbioportal.web.columnar.util;

import org.cbioportal.model.Binnable;
import org.cbioportal.model.ClinicalData;
import org.cbioportal.model.ClinicalDataBin;
import org.cbioportal.model.ClinicalDataCount;
import org.cbioportal.model.DataBin;
import org.cbioportal.web.parameter.ClinicalDataBinCountFilter;
import org.cbioportal.web.parameter.ClinicalDataBinFilter;
Expand All @@ -15,7 +17,6 @@
import java.util.Map;

import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;

public class NewClinicalDataBinUtil {
public static StudyViewFilter removeSelfFromFilter(ClinicalDataBinCountFilter dataBinCountFilter) {
Expand Down Expand Up @@ -70,33 +71,21 @@ public static List<ClinicalDataBin> calculateStaticDataBins(
List<ClinicalDataBinFilter> attributes,
Map<String, ClinicalDataType> attributeDatatypeMap,
Map<String, List<Binnable>> unfilteredClinicalDataByAttributeId,
Map<String, List<Binnable>> filteredClinicalDataByAttributeId,
List<String> unfilteredUniqueSampleKeys,
List<String> unfilteredUniquePatientKeys,
List<String> filteredUniqueSampleKeys,
List<String> filteredUniquePatientKeys
Map<String, List<Binnable>> filteredClinicalDataByAttributeId
) {
List<ClinicalDataBin> clinicalDataBins = new ArrayList<>();

for (ClinicalDataBinFilter attribute : attributes) {
if (attributeDatatypeMap.containsKey(attribute.getAttributeId())) {
ClinicalDataType clinicalDataType = attributeDatatypeMap.get(attribute.getAttributeId());
List<String> filteredIds = clinicalDataType == ClinicalDataType.PATIENT ? filteredUniquePatientKeys
: filteredUniqueSampleKeys;
List<String> unfilteredIds = clinicalDataType == ClinicalDataType.PATIENT
? unfilteredUniquePatientKeys
: unfilteredUniqueSampleKeys;

List<ClinicalDataBin> dataBins = dataBinner
.calculateClinicalDataBins(attribute, clinicalDataType,
filteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(),
emptyList()),
unfilteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(),
emptyList()),
filteredIds, unfilteredIds)
.calculateClinicalDataBins(
attribute,
filteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(), emptyList()),
unfilteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(), emptyList())
)
.stream()
.map(dataBin -> dataBinToClinicalDataBin(attribute, dataBin))
.collect(toList());
.toList();

clinicalDataBins.addAll(dataBins);
}
Expand All @@ -109,33 +98,54 @@ public static List<ClinicalDataBin> calculateDynamicDataBins(
DataBinner dataBinner,
List<ClinicalDataBinFilter> attributes,
Map<String, ClinicalDataType> attributeDatatypeMap,
Map<String, List<Binnable>> filteredClinicalDataByAttributeId,
List<String> filteredUniqueSampleKeys,
List<String> filteredUniquePatientKeys
Map<String, List<Binnable>> filteredClinicalDataByAttributeId
) {
List<ClinicalDataBin> clinicalDataBins = new ArrayList<>();

for (ClinicalDataBinFilter attribute : attributes) {

// if there is clinical data for requested attribute
if (attributeDatatypeMap.containsKey(attribute.getAttributeId())) {
ClinicalDataType clinicalDataType = attributeDatatypeMap.get(attribute.getAttributeId());
List<String> filteredIds = clinicalDataType == ClinicalDataType.PATIENT
? filteredUniquePatientKeys
: filteredUniqueSampleKeys;

List<ClinicalDataBin> dataBins = dataBinner
.calculateDataBins(attribute, clinicalDataType,
filteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(),
emptyList()),
filteredIds)
.calculateDataBins(
attribute,
filteredClinicalDataByAttributeId.getOrDefault(attribute.getAttributeId(), emptyList())
)
.stream()
.map(dataBin -> dataBinToClinicalDataBin(attribute, dataBin))
.collect(toList());
.toList();
clinicalDataBins.addAll(dataBins);
}
}

return clinicalDataBins;
}

/**
alisman marked this conversation as resolved.
Show resolved Hide resolved
* Generate a list of ClinicalData from the given data count instance.
* Size of the generated list is equal to 'dataCount.count',
* and each ClinicalData in the list contains the same value 'dataCount.value'
*
* This method improves the performance of the data binning because it allows us to fetch only
* the clinical data counts data which is a lot more compact and faster to generated than the actual clinical data.
* We only need the attribute id and the value of the clinical data to generate data bins.
* Constructing the clinical data in memory by using clinical data counts significantly improves the performance,
* and it also allows us to use the exact same SQL used by the clinical data counts endpoint.
*
* @param dataCount ClinicalDataCount instance containing the count and the value
* @return a list of ClinicalData with size 'dataCount.count' and value 'dataCount.value'
*/
public static List<ClinicalData> generateClinicalDataFromClinicalDataCount(ClinicalDataCount dataCount)
{
List<ClinicalData> data = new ArrayList<>(dataCount.getCount());

for (int i=0; i < dataCount.getCount(); i++) {
ClinicalData d = new ClinicalData();
d.setAttrId(dataCount.getAttributeId());
d.setAttrValue(dataCount.getValue());
data.add(d);
}

return data;
}
}
Loading
Loading