Skip to content

Commit

Permalink
[GLUTEN-7243][VL] Fix Q97 cross-task spilling hangs (apache#7244)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhztheplayer authored Sep 14, 2024
1 parent a7c9567 commit 3d21141
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 8 deletions.
1 change: 0 additions & 1 deletion .github/workflows/velox_backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,6 @@ jobs:
-d=FLUSH_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0
- name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q97 low memory, IO threads off
continue-on-error: true # OOM
timeout-minutes: 15 # https://github.com/apache/incubator-gluten/issues/7243
run: |
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
Expand Down
19 changes: 13 additions & 6 deletions cpp/velox/compute/WholeStageResultIterator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -231,20 +231,27 @@ int64_t WholeStageResultIterator::spillFixedSize(int64_t size) {
std::string poolName{pool->root()->name() + "/" + pool->name()};
std::string logPrefix{"Spill[" + poolName + "]: "};
int64_t shrunken = memoryManager_->shrink(size);
// todo return the actual spilled size?
if (spillStrategy_ == "auto") {
if (task_->numThreads() != 0) {
// Task should have zero running threads, otherwise there's
// possibility that this spill call hangs. See https://github.com/apache/incubator-gluten/issues/7243.
// As of now, non-zero running threads usually happens when:
// 1. Task A spills task B;
// 2. Task A trys to grow buffers created by task B, during which spill is requested on task A again;
VLOG(2) << logPrefix << "Spill is requested on a task " << task_->taskId()
<< " that has non-zero running threads, which is not currently supported. Skipping.";
return shrunken;
}
int64_t remaining = size - shrunken;
LOG(INFO) << logPrefix << "Trying to request spilling for " << remaining << " bytes...";
LOG(INFO) << logPrefix << "Trying to request spill for " << remaining << " bytes...";
auto* mm = memoryManager_->getMemoryManager();
uint64_t spilledOut = mm->arbitrator()->shrinkCapacity(remaining); // this conducts spilling
uint64_t spilledOut = mm->arbitrator()->shrinkCapacity(remaining); // this conducts spill
LOG(INFO) << logPrefix << "Successfully spilled out " << spilledOut << " bytes.";
uint64_t total = shrunken + spilledOut;
VLOG(2) << logPrefix << "Successfully reclaimed total " << total << " bytes.";
return total;
} else {
LOG(WARNING) << "Spill-to-disk was disabled since " << kSpillStrategy << " was not configured.";
}

LOG(WARNING) << "Spill-to-disk was disabled since " << kSpillStrategy << " was not configured.";
VLOG(2) << logPrefix << "Successfully reclaimed total " << shrunken << " bytes.";
return shrunken;
}
Expand Down
2 changes: 1 addition & 1 deletion ep/build-velox/src/get_velox.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
set -exu

VELOX_REPO=https://github.com/oap-project/velox.git
VELOX_BRANCH=2024_09_14
VELOX_BRANCH=2024_09_14-2
VELOX_HOME=""

OS=`uname -s`
Expand Down

0 comments on commit 3d21141

Please sign in to comment.