Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] Add cluster idle HTTP api #53850

Merged
merged 6 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/alter/AlterHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import org.apache.logging.log4j.Logger;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -204,4 +205,14 @@ public void replayAlterJobV2(AlterJobV2 alterJob) {
existingJob.replay(alterJob);
}
}

/**
 * Counts the alter jobs held by this handler that are not done yet, grouped by warehouse.
 *
 * @return a newly created map from warehouse id to the number of jobs for which
 *         {@code isDone()} is false; warehouses without such a job have no entry
 */
public Map<Long, Long> getRunningAlterJobCount() {
    Map<Long, Long> runningPerWarehouse = new HashMap<>();
    for (AlterJobV2 job : alterJobsV2.values()) {
        if (job.isDone()) {
            continue;
        }
        // First occurrence stores 1, later occurrences add 1 to the existing count.
        runningPerWarehouse.merge(job.getWarehouseId(), 1L, Long::sum);
    }
    return runningPerWarehouse;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -624,4 +624,11 @@ public void load(SRMetaBlockReader reader) throws IOException, SRMetaBlockExcept
}
});
}

/**
 * Combines the per-warehouse counts reported by the materialized view handler and the
 * schema change handler.
 *
 * @return the map obtained from the materialized view handler, with the schema change
 *         handler's counts summed into it key by key
 */
public Map<Long, Long> getRunningAlterJobCount() {
    Map<Long, Long> combined = materializedViewHandler.getRunningAlterJobCount();
    Map<Long, Long> schemaChangeCounts = schemaChangeHandler.getRunningAlterJobCount();
    for (Map.Entry<Long, Long> entry : schemaChangeCounts.entrySet()) {
        combined.merge(entry.getKey(), entry.getValue(), Long::sum);
    }
    return combined;
}
}
24 changes: 21 additions & 3 deletions fe/fe-core/src/main/java/com/starrocks/alter/AlterJobV2.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
import com.starrocks.server.GlobalStateMgr;
import com.starrocks.server.WarehouseManager;
import com.starrocks.sql.ast.UserIdentity;
import com.starrocks.warehouse.WarehouseIdleChecker;
import io.opentelemetry.api.trace.Span;
import org.apache.hadoop.util.Lists;
import org.apache.logging.log4j.LogManager;
Expand Down Expand Up @@ -206,6 +207,10 @@ public void createConnectContextIfNeeded() {
}
}

/**
 * @return the value of this job's {@code warehouseId} field
 */
public long getWarehouseId() {
    return this.warehouseId;
}

/**
* The keyword 'synchronized' only protects 2 methods:
* run() and cancel()
Expand All @@ -218,7 +223,7 @@ public void createConnectContextIfNeeded() {
*/
public synchronized void run() {
if (isTimeout()) {
cancelImpl("Timeout");
cancelHook(cancelImpl("Timeout"));
return;
}

Expand All @@ -240,6 +245,7 @@ public synchronized void run() {
break;
case FINISHED_REWRITING:
runFinishedRewritingJob();
finishHook();
break;
default:
break;
Expand All @@ -249,13 +255,15 @@ public synchronized void run() {
} // else: handle the new state
}
} catch (AlterCancelException e) {
cancelImpl(e.getMessage());
cancelHook(cancelImpl(e.getMessage()));
}
}

public boolean cancel(String errMsg) {
synchronized (this) {
return cancelImpl(errMsg);
boolean cancelled = cancelImpl(errMsg);
cancelHook(cancelled);
return cancelled;
}
}

Expand Down Expand Up @@ -319,6 +327,16 @@ protected boolean checkTableStable(Database db) throws AlterCancelException {

public abstract void replay(AlterJobV2 replayedJob);

/**
 * Hook for a finished job: passes this job's {@code warehouseId} to
 * {@code WarehouseIdleChecker.updateJobLastFinishTime}.
 */
public void finishHook() {
    WarehouseIdleChecker.updateJobLastFinishTime(this.warehouseId);
}

/**
 * Hook for a cancel attempt: when the attempt succeeded, passes this job's
 * {@code warehouseId} to {@code WarehouseIdleChecker.updateJobLastFinishTime}.
 *
 * @param cancelled whether the cancel attempt actually cancelled the job; when
 *                  {@code false} this method does nothing
 */
public void cancelHook(boolean cancelled) {
    if (!cancelled) {
        return;
    }
    WarehouseIdleChecker.updateJobLastFinishTime(warehouseId);
}

public static AlterJobV2 read(DataInput in) throws IOException {
String json = Text.readString(in);
return GsonUtils.GSON.fromJson(json, AlterJobV2.class);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -929,7 +929,9 @@ public final boolean cancel(String errMsg) {
createReplicaLatch.countDownToZero(new Status(TStatusCode.OK, ""));
}
synchronized (this) {
return cancelImpl(errMsg);
boolean cancelled = cancelImpl(errMsg);
cancelHook(cancelled);
return cancelled;
}
} finally {
isCancelling.set(false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ protected void executeSql(String sql) throws Exception {
if (parsedStmt instanceof InsertStmt) {
((InsertStmt) parsedStmt).setIsVersionOverwrite(true);
}
StmtExecutor executor = new StmtExecutor(context, parsedStmt);
StmtExecutor executor = StmtExecutor.newInternalExecutor(context, parsedStmt);

// set default session variables for stats context
SessionVariable sessionVariable = context.getSessionVariable();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -770,7 +770,9 @@ public final boolean cancel(String errMsg) {
createReplicaLatch.countDownToZero(new Status(TStatusCode.OK, ""));
}
synchronized (this) {
return cancelImpl(errMsg);
boolean cancelled = cancelImpl(errMsg);
cancelHook(cancelled);
return cancelled;
}
} finally {
isCancelling.set(false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,9 @@ public final boolean cancel(String errMsg) {
createReplicaLatch.countDownToZero(new Status(TStatusCode.OK, ""));
}
synchronized (this) {
return cancelImpl(errMsg);
boolean cancelled = cancelImpl(errMsg);
cancelHook(cancelled);
return cancelled;
}
} finally {
isCancelling.set(false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
import com.starrocks.persist.metablock.SRMetaBlockReader;
import com.starrocks.persist.metablock.SRMetaBlockWriter;
import com.starrocks.server.GlobalStateMgr;
import com.starrocks.server.WarehouseManager;
import com.starrocks.sql.ast.AbstractBackupStmt;
import com.starrocks.sql.ast.BackupStmt;
import com.starrocks.sql.ast.BackupStmt.BackupType;
Expand All @@ -98,6 +99,7 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -929,4 +931,11 @@ public List<Pair<List<Object>, Long>> getSamples() {
List<Object> jobSamples = new ArrayList<>(dbIdToBackupOrRestoreJob.values());
return Lists.newArrayList(Pair.create(jobSamples, (long) dbIdToBackupOrRestoreJob.size()));
}

/**
 * Counts the backup/restore jobs that are not done yet.
 *
 * @return a newly created single-entry map from {@code WarehouseManager.DEFAULT_WAREHOUSE_ID}
 *         to the number of jobs for which {@code isDone()} is false; the entry is present
 *         even when that number is zero
 */
public Map<Long, Long> getRunningBackupRestoreCount() {
    Map<Long, Long> runningPerWarehouse = new HashMap<>();
    long unfinished = dbIdToBackupOrRestoreJob.values()
            .stream()
            .filter(job -> !job.isDone())
            .count();
    // All jobs are attributed to the default warehouse id.
    runningPerWarehouse.put(WarehouseManager.DEFAULT_WAREHOUSE_ID, unfinished);
    return runningPerWarehouse;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
import com.starrocks.fs.HdfsUtil;
import com.starrocks.metric.MetricRepo;
import com.starrocks.server.GlobalStateMgr;
import com.starrocks.server.WarehouseManager;
import com.starrocks.sql.analyzer.SemanticException;
import com.starrocks.task.AgentBatchTask;
import com.starrocks.task.AgentTask;
Expand All @@ -84,6 +85,7 @@
import com.starrocks.thrift.THdfsProperties;
import com.starrocks.thrift.TStatusCode;
import com.starrocks.thrift.TTaskType;
import com.starrocks.warehouse.WarehouseIdleChecker;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

Expand Down Expand Up @@ -799,6 +801,7 @@ private void uploadMetaAndJobInfoFile() {
LOG.info("job is finished. {}", this);

MetricRepo.COUNTER_UNFINISHED_BACKUP_JOB.increase(-1L);
WarehouseIdleChecker.updateJobLastFinishTime(WarehouseManager.DEFAULT_WAREHOUSE_ID);
}

private boolean uploadFile(String localFilePath, String remoteFilePath) {
Expand Down Expand Up @@ -885,6 +888,7 @@ private void cancelInternal() {

// log
globalStateMgr.getEditLog().logBackupJob(this);
WarehouseIdleChecker.updateJobLastFinishTime(WarehouseManager.DEFAULT_WAREHOUSE_ID);
LOG.info("finished to cancel backup job. current state: {}. {}", curState.name(), this);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
import com.starrocks.metric.MetricRepo;
import com.starrocks.persist.ColocatePersistInfo;
import com.starrocks.server.GlobalStateMgr;
import com.starrocks.server.WarehouseManager;
import com.starrocks.sql.analyzer.SemanticException;
import com.starrocks.task.AgentBatchTask;
import com.starrocks.task.AgentTask;
Expand All @@ -110,6 +111,7 @@
import com.starrocks.thrift.TStorageMedium;
import com.starrocks.thrift.TTabletSchema;
import com.starrocks.thrift.TTaskType;
import com.starrocks.warehouse.WarehouseIdleChecker;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

Expand Down Expand Up @@ -1545,6 +1547,7 @@ protected void waitingAllTabletsCommitted() {
status = st;
}
MetricRepo.COUNTER_UNFINISHED_RESTORE_JOB.increase(-1L);
WarehouseIdleChecker.updateJobLastFinishTime(WarehouseManager.DEFAULT_WAREHOUSE_ID);
return;
}
LOG.info("waiting {} tablets to commit. {}", unfinishedSignatureToId.size(), this);
Expand Down Expand Up @@ -1865,6 +1868,7 @@ public void cancelInternal(boolean isReplay) {
return;
}

WarehouseIdleChecker.updateJobLastFinishTime(WarehouseManager.DEFAULT_WAREHOUSE_ID);
LOG.info("finished to cancel restore job. is replay: {}. {}", isReplay, this);
}

Expand Down
3 changes: 3 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/common/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -2755,6 +2755,9 @@ public class Config extends ConfigBase {
@ConfField(mutable = true)
public static int lake_warehouse_max_compute_replica = 3;

@ConfField(mutable = true, comment = "time interval to check whether warehouse is idle")
public static long warehouse_idle_check_interval_seconds = 60;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider setting this to 0 in the open-source version to disable the check by default?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a new config warehouse_idle_check_enable, because the warehouse_idle_check_interval_seconds is also used to check the last job finish time.


// e.g. "tableId1;tableId2"
@ConfField(mutable = true)
public static String lake_compaction_disable_tables = "";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ public void asyncExecuteSQL(MetadataCollectJob job) {
return;
}

this.executor = new StmtExecutor(context, parsedStmt);
this.executor = StmtExecutor.newInternalExecutor(context, parsedStmt);
context.setExecutor(executor);
context.setQueryId(UUIDUtil.genUUID());
context.getSessionVariable().setEnableMaterializedViewRewrite(false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public static DataCacheSelectMetrics cacheSelect(DataCacheSelectStatement statem
connectContext.setSessionVariable(tmpSessionVariable);

InsertStmt insertStmt = statement.getInsertStmt();
StmtExecutor stmtExecutor = new StmtExecutor(connectContext, insertStmt);
StmtExecutor stmtExecutor = StmtExecutor.newInternalExecutor(connectContext, insertStmt);
// Register new StmtExecutor into current ConnectContext's StmtExecutor, so we can handle ctrl+c command
// If DataCacheSelect is forward to leader, connectContext's Executor is null
if (connectContext.getExecutor() != null) {
Expand Down
2 changes: 2 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/http/HttpServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
import com.starrocks.http.rest.GetSmallFileAction;
import com.starrocks.http.rest.GetStreamLoadState;
import com.starrocks.http.rest.HealthAction;
import com.starrocks.http.rest.IdleAction;
import com.starrocks.http.rest.LoadAction;
import com.starrocks.http.rest.MetaReplayerCheckAction;
import com.starrocks.http.rest.MetricsAction;
Expand Down Expand Up @@ -197,6 +198,7 @@ private void registerActions() throws IllegalArgException {
ShowDataAction.registerAction(controller);
QueryDumpAction.registerAction(controller);
SyncCloudTableMetaAction.registerAction(controller);
IdleAction.registerAction(controller);
// for stop FE
StopFeAction.registerAction(controller);
ExecuteSqlAction.registerAction(controller);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.starrocks.http.rest;

import com.starrocks.http.ActionController;
import com.starrocks.http.BaseRequest;
import com.starrocks.http.BaseResponse;
import com.starrocks.http.IllegalArgException;
import com.starrocks.persist.gson.GsonUtils;
import com.starrocks.server.GlobalStateMgr;
import com.starrocks.warehouse.IdleStatus;
import io.netty.handler.codec.http.HttpMethod;

/**
* API to check whether the cluster is idle
* {
* "isClusterIdle": true,
* "clusterIdleTime": 1734113878006,
* "warehouses": [
* {
* "id": 0,
* "name": "default_warehouse",
* "isIdle": true,
* "idleTime": 1734113878006
* }
* ]
* }
*/
public class IdleAction extends RestBaseAction {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please indicate what "Idle" refers to here. The class name alone does not convey what the content means.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


/**
 * Creates the action, passing {@code controller} to the {@code RestBaseAction} constructor.
 *
 * @param controller the action controller forwarded to the superclass
 */
public IdleAction(ActionController controller) {
super(controller);
}

/**
 * Registers a new {@code IdleAction} as the handler for {@code GET /api/idle_status}.
 *
 * @param controller the controller on which the handler is registered; it is also
 *                   passed to the {@code IdleAction} constructor
 * @throws IllegalArgException declared by this method; presumably propagated from
 *                             {@code registerHandler} (confirm against ActionController)
 */
public static void registerAction(ActionController controller) throws IllegalArgException {
    IdleAction idleAction = new IdleAction(controller);
    controller.registerHandler(HttpMethod.GET, "/api/idle_status", idleAction);
}

@Override
public void execute(BaseRequest request, BaseResponse response) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this api endpoint be protected by authentication?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not needed; there is no secret info.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It could be a security concern, but I will defer to your decision.

IdleStatus idleStatus = GlobalStateMgr.getCurrentState().getWarehouseIdleChecker().getIdleStatus();
String content = GsonUtils.GSON.toJson(idleStatus);
response.getContent().append(content);
sendResult(request, response);
}
}
kevincai marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
import com.starrocks.transaction.TransactionState;
import com.starrocks.transaction.TransactionState.TxnCoordinator;
import com.starrocks.transaction.TransactionState.TxnSourceType;
import com.starrocks.warehouse.WarehouseIdleChecker;
import org.apache.hadoop.util.ThreadUtil;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Expand Down Expand Up @@ -350,6 +351,7 @@ public void afterAborted(TransactionState txnState, boolean txnOperated, String
.add("state", state)
.add("error_msg", "this task will be ignored when job is: " + state)
.build());
WarehouseIdleChecker.updateJobLastFinishTime(warehouseId, System.currentTimeMillis());
return;
}
boolean shouldRetry = retryTime > 0 && txnStatusChangeReason.contains("timeout")
Expand Down Expand Up @@ -377,6 +379,12 @@ public void afterAborted(TransactionState txnState, boolean txnOperated, String
}
}

/**
 * Runs the superclass {@code afterVisible} handling, then passes the {@code warehouseId}
 * field to {@code WarehouseIdleChecker.updateJobLastFinishTime}.
 *
 * @param txnState    forwarded unchanged to the superclass implementation
 * @param txnOperated forwarded unchanged to the superclass implementation
 */
@Override
public void afterVisible(TransactionState txnState, boolean txnOperated) {
// Superclass handling runs first; the finish time is reported only after it returns.
super.afterVisible(txnState, txnOperated);
WarehouseIdleChecker.updateJobLastFinishTime(warehouseId);
}

/**
* This method is used to replay the cancelled state of load job
*
Expand Down
16 changes: 16 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/load/loadv2/LoadMgr.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
import java.util.Collection;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
Expand Down Expand Up @@ -826,4 +827,19 @@ public List<Pair<List<Object>, Long>> getSamples() {
.collect(Collectors.toList());
return Lists.newArrayList(Pair.create(samples, (long) idToLoadJob.size()));
}

/**
 * Counts the load jobs that are not in a final state, grouped by their current warehouse.
 * Jobs whose type is {@code EtlJobType.INSERT} are not counted. The scan over
 * {@code idToLoadJob} runs between {@code readLock()} and {@code readUnlock()}.
 *
 * @return a newly created map from warehouse id to the number of counted jobs;
 *         warehouses without a counted job have no entry
 */
public Map<Long, Long> getRunningLoadCount() {
    Map<Long, Long> runningPerWarehouse = new HashMap<>();
    readLock();
    try {
        for (LoadJob job : idToLoadJob.values()) {
            if (job.isFinal() || job.getJobType() == EtlJobType.INSERT) {
                continue;
            }
            // First occurrence stores 1, later occurrences add 1 to the existing count.
            runningPerWarehouse.merge(job.getCurrentWarehouseId(), 1L, Long::sum);
        }
    } finally {
        readUnlock();
    }
    return runningPerWarehouse;
}
}
Loading
Loading