From 46ce106531755b860a99e332ddefa69e23d18601 Mon Sep 17 00:00:00 2001 From: 6clc Date: Tue, 2 Jan 2024 09:47:10 +0800 Subject: [PATCH 1/8] runtime(cinn): infer output tensor's shape --- paddle/cinn/backends/codegen_cuda_host.cc | 10 ++++ paddle/cinn/backends/codegen_cuda_host.h | 2 +- paddle/cinn/backends/codegen_cuda_util.h | 24 +++++++++- paddle/cinn/backends/llvm/codegen_llvm.cc | 3 +- paddle/cinn/common/type.h | 12 +++++ paddle/cinn/hlir/framework/op_lowering.h | 11 +++-- paddle/cinn/hlir/framework/op_lowering_impl.h | 11 +++-- .../hlir/framework/op_lowering_impl_base.h | 3 +- .../hlir/framework/pir/compilation_task.cc | 20 ++++++-- .../hlir/framework/pir/compilation_task.h | 5 +- .../hlir/framework/pir/op_lowering_impl.cc | 46 +++++++++++++++---- .../hlir/framework/pir/op_lowering_impl.h | 11 +++-- paddle/cinn/hlir/framework/pir/utils.h | 1 + paddle/cinn/ir/ir.h | 1 + paddle/cinn/ir/lowered_func.cc | 10 ++++ paddle/cinn/ir/module.cc | 4 ++ paddle/cinn/ir/module.h | 1 + paddle/cinn/ir/utils/ir_copy.cc | 6 ++- paddle/cinn/runtime/cinn_runtime.cc | 3 ++ paddle/cinn/runtime/cinn_runtime.h | 1 + paddle/cinn/runtime/cuda/cuda_intrinsics.cc | 9 ++++ paddle/cinn/runtime/cuda/cuda_util.cc | 3 ++ paddle/cinn/runtime/cuda/cuda_util.h | 1 + paddle/cinn/runtime/intrinsic.h | 2 + 24 files changed, 166 insertions(+), 34 deletions(-) diff --git a/paddle/cinn/backends/codegen_cuda_host.cc b/paddle/cinn/backends/codegen_cuda_host.cc index b23028355a06ae..b440b424a9c2d9 100644 --- a/paddle/cinn/backends/codegen_cuda_host.cc +++ b/paddle/cinn/backends/codegen_cuda_host.cc @@ -198,6 +198,11 @@ llvm::Value* CodeGenCUDA_Host::LowerHostFunc(const ir::_LoweredFunc_* func) { [](auto& arg) { return std::addressof(arg); }); // @} + // Set local scope table + CHECK_EQ(ll_function_args.size(), func->args.size()); + for (int i = 0; i < ll_function_args.size(); i++) { + SetVar(func->args[i].name(), ll_function_args[i]); + } llvm::BasicBlock* entry = llvm::BasicBlock::Create( /*Context=*/b_->getContext(), /*Name=*/"entry", @@ -205,6 +210,11 @@ llvm::Value* CodeGenCUDA_Host::LowerHostFunc(const ir::_LoweredFunc_* func) { /*InsertBefore=*/nullptr); b_->SetInsertPoint(entry); CodeGenLLVM::Visit(&func->body); + + // Reset local scope table + for (const ir::Argument& func_arg : func->args) { + symbol_table_->Erase(func_arg.name()); + } RetVoid(); return f_; diff --git a/paddle/cinn/backends/codegen_cuda_host.h b/paddle/cinn/backends/codegen_cuda_host.h index aafaeebc248eb0..3a3453f80522b3 100644 --- a/paddle/cinn/backends/codegen_cuda_host.h +++ b/paddle/cinn/backends/codegen_cuda_host.h @@ -53,7 +53,7 @@ class CodeGenCUDA_Host : public CodeGenLLVM { } else if (op->name == runtime::intrinsic::call_cuda_kernel) { return LowerCUDAKernelCall(op); } else { - CINN_NOT_IMPLEMENTED; + return CodeGenLLVM::Visit(op); } } diff --git a/paddle/cinn/backends/codegen_cuda_util.h b/paddle/cinn/backends/codegen_cuda_util.h index 5a7f1f5882bf9b..52296bd2a8807b 100644 --- a/paddle/cinn/backends/codegen_cuda_util.h +++ b/paddle/cinn/backends/codegen_cuda_util.h @@ -31,6 +31,7 @@ namespace backends { #define KERNEL_ARGS "kernel_args" #define KERNEL_ARGS_NUM "kernel_args_num" #define KERNEL_STREAM "kernel_stream" +#define TENSOR_SHAPE_ARGS "tensor_shape_args" /** * Split a CINN Module into two separate modules, one cantains the host @@ -150,7 +151,8 @@ struct CollectBucketStrategyHostFunctionVisitor : CollectHostFunctionVisitor(module_name), kernel_args_(KERNEL_ARGS, type_of()), kernel_args_num_(KERNEL_ARGS_NUM, type_of()), - 
kernel_stream_(KERNEL_STREAM, type_of()) {} + kernel_stream_(KERNEL_STREAM, type_of()), + tensor_shape_args_(TENSOR_SHAPE_ARGS, type_of()) {} std::tuple operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); @@ -181,6 +183,25 @@ struct CollectBucketStrategyHostFunctionVisitor {}); host_module_builder.AddFunctionWithoutOptim( host_func.as_lowered_func_ref()); + + // Parse LoweredFunc to infer output tensor's shape + std::vector infer_shape_func_body_stmts(arg_defs_); + infer_shape_func_body_stmts.insert( + infer_shape_func_body_stmts.end(), + op->infer_shape_func.as_lowered_func()->body); + + std::vector infer_shape_arguments = { + ir::Argument(kernel_args_, ir::Argument::IO::kOutput), + ir::Argument(kernel_args_num_, ir::Argument::IO::kInput), + ir::Argument(tensor_shape_args_, ir::Argument::IO::kOutput)}; + + ir::Expr host_infer_shape_func = + ir::_LoweredFunc_::Make(op->infer_shape_func.as_lowered_func()->name, + infer_shape_arguments, + ir::Block::Make(infer_shape_func_body_stmts), + {}); + host_module_builder.AddFunctionWithoutOptim( + host_infer_shape_func.as_lowered_func_ref()); } void ProcessLoweredFunc(ir::Expr func, ir::Expr predicate); @@ -199,6 +220,7 @@ struct CollectBucketStrategyHostFunctionVisitor ir::Var kernel_args_; ir::Var kernel_args_num_; ir::Var kernel_stream_; + ir::Var tensor_shape_args_; }; } // namespace detail diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc index a79e67fd6c4839..e554eca8795a43 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm.cc +++ b/paddle/cinn/backends/llvm/codegen_llvm.cc @@ -818,7 +818,8 @@ llvm::Value *CodeGenLLVM::Visit(const ir::_Var_ *op) { // TODO(fc500110) hard coding if (LLVM_WillVarLowerAsPointer(op->name)) { result = value; - } else if (value->getType()->isPointerTy()) { + } else if (value->getType()->isPointerTy() && + !value->getType()->getPointerElementType()->isPointerTy()) { result = Load(value, op->name + "_load"); } else { result = value; diff --git a/paddle/cinn/common/type.h b/paddle/cinn/common/type.h index 9ce9402d84f8fe..b11a320bbd5a19 100644 --- a/paddle/cinn/common/type.h +++ b/paddle/cinn/common/type.h @@ -251,6 +251,18 @@ inline Type type_of() { return x; } template <> +inline Type type_of() { + Type x = Int(32); + x.set_cpp_handle(); + return x; +} +template <> +inline Type type_of() { + Type x = Int(32); + x.set_cpp_handle2(); + return x; +} +template <> inline Type type_of() { Type x = type_of(); x.set_cpp_handle(); diff --git a/paddle/cinn/hlir/framework/op_lowering.h b/paddle/cinn/hlir/framework/op_lowering.h index 57a54310c77198..442cecc861e6fe 100644 --- a/paddle/cinn/hlir/framework/op_lowering.h +++ b/paddle/cinn/hlir/framework/op_lowering.h @@ -47,11 +47,12 @@ class OpLowerer { group, apply_op_schedule, apply_group_schedule, apply_pass); } - std::vector> BucketLower( - const T& group, - bool apply_op_schedule = false, - bool apply_group_schedule = true, - bool apply_pass = true) { + std::vector>> + BucketLower(const T& group, + bool apply_op_schedule = false, + bool apply_group_schedule = true, + bool apply_pass = true) { return impl_->BucketLower( group, apply_op_schedule, apply_group_schedule, apply_pass); } diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.h b/paddle/cinn/hlir/framework/op_lowering_impl.h index 038c6f1ec8bf33..8e2a7fe366519a 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl.h @@ -60,11 +60,12 @@ class OpLowererImpl : public OpLowererImplBase { bool 
apply_group_schedule = true, bool apply_pass = true); - std::vector> BucketLower( - const GroupPtr& group, - bool apply_op_schedule = false, - bool apply_group_schedule = true, - bool apply_pass = true) { + std::vector>> + BucketLower(const GroupPtr& group, + bool apply_op_schedule = false, + bool apply_group_schedule = true, + bool apply_pass = true) { CINN_NOT_IMPLEMENTED; } diff --git a/paddle/cinn/hlir/framework/op_lowering_impl_base.h b/paddle/cinn/hlir/framework/op_lowering_impl_base.h index bab0a700891121..09a2373101163b 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl_base.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl_base.h @@ -38,7 +38,8 @@ class OpLowererImplBase { bool apply_group_schedule = true, bool apply_pass = true) = 0; - virtual std::vector> + virtual std::vector>> BucketLower(const T& group, bool apply_op_schedule = false, bool apply_group_schedule = true, diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.cc b/paddle/cinn/hlir/framework/pir/compilation_task.cc index 01c940b228a3dc..27cfcd01cd7fcd 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_task.cc @@ -15,6 +15,7 @@ #pragma once #include "paddle/cinn/hlir/framework/pir/compilation_task.h" +#include "paddle/cinn/common/target.h" #include "paddle/cinn/hlir/framework/op_lowering.h" #include "paddle/cinn/ir/module.h" @@ -23,11 +24,15 @@ namespace hlir { namespace framework { void GroupCompilationContext::SetLoweredFuncs( - std::vector>&& funcs) { - for (std::pair& predicate2func : + std::vector>>&& + funcs) { + for (std::pair>& predicate2func : funcs) { predicates_.push_back(predicate2func.first); - lowered_funcs_.push_back(predicate2func.second); + lowered_funcs_.push_back(predicate2func.second.second); + infer_shape_lowered_funcs_.push_back(predicate2func.second.first); ++func_size_; } } @@ -67,12 +72,13 @@ void CompilationTask::CodegenAndJit() { ir::Module::Builder builder(cinn::common::UniqName("module"), context_->target_); CHECK_EQ(context_->predicates_.size(), context_->lowered_funcs_.size()); - for (const ir::Expr predicate : context_->predicates_) { + for (const ir::Expr& predicate : context_->predicates_) { builder.AddPredicate(predicate); } for (const ir::LoweredFunc& func : context_->lowered_funcs_) { builder.AddFunction(func); } + builder.AddInferShapeFunc(context_->infer_shape_lowered_funcs_[0]); ir::Module ir_module = builder.Build(); context_->backend_compiler_ = backends::Compiler::Create(context_->target_); @@ -90,6 +96,9 @@ std::unique_ptr CompilationTask::BuildInstruction() { VLOG(4) << "Lookup kernel name: " << fn_name; auto* fn_ptr = context_->backend_compiler_->Lookup(fn_name); CHECK(fn_ptr); + auto* infer_shape_fn_ptr = + context_->backend_compiler_->Lookup(fn_name + "_infer_shape" + fn_name); + CHECK(infer_shape_fn_ptr); instr->SetLoweredFunc(reinterpret_cast(fn_ptr), fn_name); instr->Finalize(); return instr; @@ -100,6 +109,9 @@ pir::CINNKernelInfo CompilationTask::BuildPirCINNKernelInfo() { VLOG(4) << "Lookup kernel name: " << fn_name; auto* fn_ptr = context_->backend_compiler_->Lookup(fn_name); CHECK(fn_ptr); + auto* infer_shape_fn_ptr = + context_->backend_compiler_->Lookup(fn_name + "_infer_shape"); + CHECK(infer_shape_fn_ptr); pir::CINNKernelInfo cinn_kernel_info; cinn_kernel_info.fn_ptr = fn_ptr; cinn_kernel_info.int_args_map = context_->group_->int_args_map; diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.h b/paddle/cinn/hlir/framework/pir/compilation_task.h index 
5291cafe4a2f32..223c44845896e2 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.h +++ b/paddle/cinn/hlir/framework/pir/compilation_task.h @@ -32,7 +32,9 @@ class GroupCompilationContext { : target_(target), group_(group), scope_(scope) {} void SetLoweredFuncs( - std::vector>&& funcs); + std::vector>>&& + funcs); std::string PrintPredicate2Funcs() const; void* FuncPtr(); std::shared_ptr BackendCompiler(); @@ -47,6 +49,7 @@ class GroupCompilationContext { size_t func_size_ = 0; std::vector predicates_; std::vector lowered_funcs_; + std::vector infer_shape_lowered_funcs_; std::string host_func_name_; std::string host_code_; std::vector device_code_; diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 643e4ed294b4cd..8920c38cf3d0f7 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -18,6 +18,7 @@ #include "paddle/cinn/adt/map_expr_ctx.h" #include "paddle/cinn/ast_gen_ius/tensor_group.h" +#include "paddle/cinn/backends/codegen_cuda_util.h" #include "paddle/cinn/hlir/framework/compile_error.h" #include "paddle/cinn/hlir/framework/pir/op_lowering_util.h" #include "paddle/cinn/hlir/framework/pir/utils.h" @@ -99,16 +100,17 @@ std::vector OpLowererImpl::Lower(const GroupPtr& group, } } -std::vector> +std::vector>> OpLowererImpl::BucketLower(const GroupPtr& group, bool apply_op_schedule, bool apply_group_schedule, bool apply_pass) { // 1.Do compute, lower and schedule for each op. auto& ops = group->ops; - if (ops.size() == 1 && ops[0]->name() == "custom_call") { - return {{ir::Expr(1), LowerCustomCall(group)[0]}}; - } + // if (ops.size() == 1 && ops[0]->name() == "custom_call") { + // return {{ir::Expr(1), LowerCustomCall(group)[0]}}; + // } std::vector group_func_arg_tensors; std::unordered_map<::pir::Value, ir::Tensor> tensor_map; // for some op, it will output more tmp value and regard as @@ -150,7 +152,8 @@ OpLowererImpl::BucketLower(const GroupPtr& group, // 3.Do post-processing, // including preparing function args and temporary variables, // applying low-level optimization passes, etc. 
- std::vector> cond2funcs; + std::vector>> + cond2funcs; for (std::pair& cond2body : cond2func_bodies) { std::vector group_func_arg_tensors_copy = @@ -161,9 +164,7 @@ OpLowererImpl::BucketLower(const GroupPtr& group, apply_op_schedule, cond2body.second, &group_func_arg_tensors_copy); - for (ir::LoweredFunc& func : funcs) { - cond2funcs.emplace_back(cond2body.first, func); - } + cond2funcs.push_back({cond2body.first, {funcs[0], funcs[1]}}); } return cond2funcs; } @@ -467,10 +468,31 @@ std::vector OpLowererImpl::PostProcess( int num_tensor_args = static_cast(group_func_args.size()); int non_tensor_arg_idx = group_func_args.size(); std::unordered_set int_args_set; + std::vector ir_bodys; for (int tensor_arg_idx = 0; tensor_arg_idx < num_tensor_args; tensor_arg_idx++) { auto tensor_dim = (*group_func_arg_tensors)[tensor_arg_idx]->sym_shape; int tensor_dim_size = tensor_dim.size(); + auto tensor_shape = (*group_func_arg_tensors)[tensor_arg_idx]->shape; + + ir::Var tensor_shape_args(TENSOR_SHAPE_ARGS, type_of()); + + if (group_func_args[tensor_arg_idx].is_output()) { + for (int i = 0; i < tensor_shape.size(); i++) { + ir::Expr call_set_infer_shape_value = + ir::Call::Make(type_of(), + runtime::intrinsic::set_value, + {tensor_shape_args, + ir::Expr(tensor_arg_idx), + ir::Expr(i), + tensor_shape[i]}, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0); + ir_bodys.push_back(call_set_infer_shape_value); + } + } for (int tensor_arg_dim_idx = 0; tensor_arg_dim_idx < tensor_dim_size; tensor_arg_dim_idx++) { if (tensor_dim[tensor_arg_dim_idx]->IsDynamic()) { @@ -489,6 +511,11 @@ std::vector OpLowererImpl::PostProcess( } } } + ir::LoweredFunc infer_shape_func = + ir::_LoweredFunc_::Make(group->FuncName() + "_infer_shape", + group_func_args, + ir::Block::Make(ir_bodys), + {}); #ifdef CINN_WITH_CUDA optim::OptimizeExprGPU(&(func_body)); @@ -506,7 +533,8 @@ std::vector OpLowererImpl::PostProcess( } // 4.Apply low level pass func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); - return {func}; + + return {infer_shape_func, func}; } std::vector OpLowererImpl::LowerOps( diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h index aa29119281b51c..d6698ee37cd50f 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h @@ -70,11 +70,12 @@ class OpLowererImpl : public OpLowererImplBase { * @param apply_group_schedule Whether to schedule at group level. * @return The lowered funcs. 
*/ - std::vector> BucketLower( - const GroupPtr& group, - bool apply_op_schedule = false, - bool apply_group_schedule = true, - bool apply_pass = true); + std::vector>> + BucketLower(const GroupPtr& group, + bool apply_op_schedule = false, + bool apply_group_schedule = true, + bool apply_pass = true); void InsertNameGeneToScope(std::shared_ptr scope); diff --git a/paddle/cinn/hlir/framework/pir/utils.h b/paddle/cinn/hlir/framework/pir/utils.h index 4d97d48291903a..ce9fa8c1cb9f17 100644 --- a/paddle/cinn/hlir/framework/pir/utils.h +++ b/paddle/cinn/hlir/framework/pir/utils.h @@ -31,6 +31,7 @@ namespace pir { struct CINNKernelInfo { void* fn_ptr; + void* infer_shape_fn_ptr; struct ArgDimIdx { int arg_idx; diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index 7859a7181c527b..3e9460e084a36f 100644 --- a/paddle/cinn/ir/ir.h +++ b/paddle/cinn/ir/ir.h @@ -1018,6 +1018,7 @@ struct _Module_ : public ExprNode<_Module_> { std::vector functions; std::vector submodules; std::vector predicates; + Expr infer_shape_func; static ir::Module Make(const std::string& name, Target target); diff --git a/paddle/cinn/ir/lowered_func.cc b/paddle/cinn/ir/lowered_func.cc index 129fc5d6e32782..d252a5e44954f5 100644 --- a/paddle/cinn/ir/lowered_func.cc +++ b/paddle/cinn/ir/lowered_func.cc @@ -398,11 +398,21 @@ void _LoweredFunc_::PrepareArgumentExprs() { } else if (arg.type() == type_of()) { pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = + ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = + ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = + ir::intrinsics::PodValueToX::Make(load_expr, type_of()); } else { LOG(ERROR) << "Not supported type [" << arg.type() << "]"; CINN_NOT_IMPLEMENTED } + VLOG(6) << "args " << i << "convert"; Expr let_expr = Let::Make(_arg, pod_cast_expr); CHECK(let_expr.type().valid()); argument_prepare_exprs.push_back(let_expr); diff --git a/paddle/cinn/ir/module.cc b/paddle/cinn/ir/module.cc index d54286d9fc2ec6..fc58e44956fe76 100644 --- a/paddle/cinn/ir/module.cc +++ b/paddle/cinn/ir/module.cc @@ -53,6 +53,10 @@ void Module::Builder::AddPredicate(ir::Expr predicate) { module_->predicates.push_back(predicate); } +void Module::Builder::AddInferShapeFunc(ir::Expr infer_shape_func) { + module_->infer_shape_func = infer_shape_func; +} + void Module::Builder::Clear() { module_->buffers.clear(); module_->functions.clear(); diff --git a/paddle/cinn/ir/module.h b/paddle/cinn/ir/module.h index fad8377e6b0158..9910caab42b503 100644 --- a/paddle/cinn/ir/module.h +++ b/paddle/cinn/ir/module.h @@ -45,6 +45,7 @@ class Module : public ir::IrNodeRef { void AddFunctionWithoutOptim(const ir::LoweredFunc& func); void AddBuffer(ir::Buffer buffer); void AddPredicate(ir::Expr predicate); + void AddInferShapeFunc(ir::Expr infer_shape_func); void Clear(); Target::Arch GetTargetArch(); diff --git a/paddle/cinn/ir/utils/ir_copy.cc b/paddle/cinn/ir/utils/ir_copy.cc index 08dc2bc1e628cd..b444be218c39a5 100644 --- a/paddle/cinn/ir/utils/ir_copy.cc +++ b/paddle/cinn/ir/utils/ir_copy.cc @@ -242,7 +242,7 @@ struct IRCopyVisitor : public ir::IRVisitorRequireReImpl { std::vector functions; std::vector submodules; std::vector predicates; - + Expr infer_shape_func; for (auto& expr : op->buffers) { buffers.push_back(Visit(&expr)); } @@ -258,12 +258,16 @@ struct IRCopyVisitor : public ir::IRVisitorRequireReImpl { for 
(auto& expr : op->predicates) { predicates.push_back(Visit(&expr)); } + if (op->infer_shape_func.defined()) { + infer_shape_func = Visit(&op->infer_shape_func); + } auto res = ir::_Module_::Make(op->name, op->target); res->buffers = buffers; res->functions = functions; res->submodules = submodules; res->predicates = predicates; + res->infer_shape_func = infer_shape_func; return Expr(res); } diff --git a/paddle/cinn/runtime/cinn_runtime.cc b/paddle/cinn/runtime/cinn_runtime.cc index b8bc96d508877b..c4c25e8f867868 100644 --- a/paddle/cinn/runtime/cinn_runtime.cc +++ b/paddle/cinn/runtime/cinn_runtime.cc @@ -375,6 +375,9 @@ uint8_t cinn_pod_value_to_uint8(cinn_pod_value_t* value) { return *value; } bool cinn_pod_value_to_bool(cinn_pod_value_t* value) { return *value; } void* cinn_pod_value_to_void_p(cinn_pod_value_t* value) { return *value; } +int32_t* cinn_pod_value_to_int32_p(cinn_pod_value_t* value) { + return reinterpret_cast(value->data_addr()); +} cinn_buffer_t* cinn_pod_value_to_buffer_p(cinn_pod_value_t* value) { return *value; } diff --git a/paddle/cinn/runtime/cinn_runtime.h b/paddle/cinn/runtime/cinn_runtime.h index 17b5a400fd122b..4a5ce5d18d179c 100644 --- a/paddle/cinn/runtime/cinn_runtime.h +++ b/paddle/cinn/runtime/cinn_runtime.h @@ -561,6 +561,7 @@ uint8_t cinn_pod_value_to_uint8(cinn_pod_value_t* value); bool cinn_pod_value_to_bool(cinn_pod_value_t* value); void* cinn_pod_value_to_void_p(cinn_pod_value_t* value); +int32_t* cinn_pod_value_to_int32_p(cinn_pod_value_t* value); cinn_buffer_t* cinn_pod_value_to_buffer_p(cinn_pod_value_t* value); // @} diff --git a/paddle/cinn/runtime/cuda/cuda_intrinsics.cc b/paddle/cinn/runtime/cuda/cuda_intrinsics.cc index e090117a423e4b..68579bcb64477e 100644 --- a/paddle/cinn/runtime/cuda/cuda_intrinsics.cc +++ b/paddle/cinn/runtime/cuda/cuda_intrinsics.cc @@ -434,6 +434,15 @@ CINN_REGISTER_HELPER(cinn_cuda_host_api) { .AddInputType() // index .End(); + using cinn::runtime::cuda::set_value; + REGISTER_EXTERN_FUNC_HELPER(set_value, cinn::common::DefaultHostTarget()) + .SetRetType() + .AddInputType() + .AddInputType() + .AddInputType() + .AddInputType() + .End(); + using cinn::runtime::cuda::cinn_call_cuda_kernel; REGISTER_EXTERN_FUNC_HELPER(cinn_call_cuda_kernel, cinn::common::DefaultHostTarget()) diff --git a/paddle/cinn/runtime/cuda/cuda_util.cc b/paddle/cinn/runtime/cuda/cuda_util.cc index 326e5a3aac561d..413ae703bdac1f 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.cc +++ b/paddle/cinn/runtime/cuda/cuda_util.cc @@ -2748,6 +2748,9 @@ void cinn_gpu_cudnn_pool2d(const std::vector &attrs, cudnnDestroyPoolingDescriptor(pooling_desc); } +void set_value(int32_t **v, int row, int col, int32_t value) { + v[row][col] = value; +} void cinn_gpu_cudnn_softmax(const std::vector &attrs, cinn_buffer_t *input, cinn_buffer_t *output, diff --git a/paddle/cinn/runtime/cuda/cuda_util.h b/paddle/cinn/runtime/cuda/cuda_util.h index 7ea9dbe00a2c5b..25244b05c92c0a 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.h +++ b/paddle/cinn/runtime/cuda/cuda_util.h @@ -96,6 +96,7 @@ void cinn_call_cuda_memcpy(void* v_args, void* stream = nullptr); int32_t cinn_get_value_in_cuda_kernel_args(void* v_args, int idx); +void set_value(int32_t** v, int row, int col, int32_t value); /** * Call a CUDA compiled kernel. 
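Note on the generated infer-shape functions: the host function this patch emits per kernel group ("<group>_infer_shape") reduces to a sequence of set_value calls, one per output dim, writing into the tensor_shape_args table (rows index tensors, columns index dims). Below is a minimal standalone C++ sketch of what such a generated body boils down to; the group name fn_reduce_sum and the shape {S0, 256} are made up for illustration, and patch 7/8 later renames the helper to infer_shape_set_value with reordered arguments.

#include <cstdint>

// Host-side helper added by this patch: writes one dim of one tensor
// into the shape table (rows index tensors, cols index dims).
void set_value(int32_t** v, int row, int col, int32_t value) {
  v[row][col] = value;
}

// Hypothetical generated "<group>_infer_shape" body for a group whose
// argument 1 is an output of shape {S0, 256}. The real generated code
// reads the dynamic dim S0 out of the packed kernel args; here it is
// passed directly to keep the sketch self-contained.
void fn_reduce_sum_infer_shape(int32_t S0, int32_t** tensor_shape_args) {
  set_value(tensor_shape_args, /*row=*/1, /*col=*/0, S0);
  set_value(tensor_shape_args, /*row=*/1, /*col=*/1, 256);
}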
diff --git a/paddle/cinn/runtime/intrinsic.h b/paddle/cinn/runtime/intrinsic.h index 6939a8ea1f457f..f3f6e2d5ae9a3b 100644 --- a/paddle/cinn/runtime/intrinsic.h +++ b/paddle/cinn/runtime/intrinsic.h @@ -107,6 +107,8 @@ static const char* call_cuda_kernel = "cinn_call_cuda_kernel"; static const char* get_value_in_cuda_kernel_args = "cinn_get_value_in_cuda_kernel_args"; +static const char* set_value = "set_value"; + static const char* pod_values_to_array_repr = "pod_values_to_array"; static const char* get_address_repr = "get_address"; From e773f14322d959152b19454141cb34699daca90a Mon Sep 17 00:00:00 2001 From: 6clc Date: Tue, 2 Jan 2024 13:54:44 +0800 Subject: [PATCH 2/8] runtime(cinn): infer output tensor's shape in jit instruction --- .../hlir/framework/pir/compilation_task.cc | 1 + .../hlir/framework/pir/op_lowering_impl.cc | 30 +++---- .../instruction/cinn_jit_instruction.cc | 80 ++++++++++++++++--- 3 files changed, 85 insertions(+), 26 deletions(-) diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.cc b/paddle/cinn/hlir/framework/pir/compilation_task.cc index 27cfcd01cd7fcd..1726cdd7aa717f 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_task.cc @@ -114,6 +114,7 @@ pir::CINNKernelInfo CompilationTask::BuildPirCINNKernelInfo() { CHECK(infer_shape_fn_ptr); pir::CINNKernelInfo cinn_kernel_info; cinn_kernel_info.fn_ptr = fn_ptr; + cinn_kernel_info.infer_shape_fn_ptr = infer_shape_fn_ptr; cinn_kernel_info.int_args_map = context_->group_->int_args_map; return cinn_kernel_info; } diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 8920c38cf3d0f7..6c55ee8f0c749b 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -477,22 +477,22 @@ std::vector OpLowererImpl::PostProcess( ir::Var tensor_shape_args(TENSOR_SHAPE_ARGS, type_of()); - if (group_func_args[tensor_arg_idx].is_output()) { - for (int i = 0; i < tensor_shape.size(); i++) { - ir::Expr call_set_infer_shape_value = - ir::Call::Make(type_of(), - runtime::intrinsic::set_value, - {tensor_shape_args, - ir::Expr(tensor_arg_idx), - ir::Expr(i), - tensor_shape[i]}, - {}, - ir::CallType::Extern, - ir::FunctionRef(), - 0); - ir_bodys.push_back(call_set_infer_shape_value); - } + // if (group_func_args[tensor_arg_idx].is_output()) { + for (int i = 0; i < tensor_shape.size(); i++) { + ir::Expr call_set_infer_shape_value = + ir::Call::Make(type_of(), + runtime::intrinsic::set_value, + {tensor_shape_args, + ir::Expr(tensor_arg_idx), + ir::Expr(i), + tensor_shape[i]}, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0); + ir_bodys.push_back(call_set_infer_shape_value); } + // } for (int tensor_arg_dim_idx = 0; tensor_arg_dim_idx < tensor_dim_size; tensor_arg_dim_idx++) { if (tensor_dim[tensor_arg_dim_idx]->IsDynamic()) { diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc index 9ff10d0ae7c91c..680e8c1fe0eade 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc @@ -30,6 +30,7 @@ namespace paddle { namespace framework { typedef void (*lower_func_ptr_g)(void*, int32_t, void*); +typedef void (*infer_shape_func_ptr_g)(void*, int32_t, int32_t**); class CinnJitInstruction::FnPtrImpl { using CINNKernelInfo = 
cinn::hlir::framework::pir::CINNKernelInfo; @@ -55,12 +56,50 @@ class CinnJitInstruction::FnPtrImpl { kernel_args[int_arg_mp.second.arg_idx]->dims().at( int_arg_mp.second.dim_idx))); } + (infer_shape_func_ptr_g) cinn_kernel_info_.infer_shape_fn_ptr; // 3. Launch host kernel ((lower_func_ptr_g)cinn_kernel_info_.fn_ptr)( static_cast(func_args_.data()), func_args_.size(), stream); } + void InferShape(const std::vector& kernel_args) { + func_args_.clear(); + + // 1. Convert the phi::DenseTensor type to cinn_pod_value_t + for (size_t i = 0; i < kernel_args.size(); ++i) { + auto* buffer = new cinn_buffer_t(); + // buffer->memory = reinterpret_cast(kernel_args[i]->data()); + func_args_.emplace_back(buffer); + } + // 2. Convert arg's data about shape of Tensor to cinn_pod_value_t + for (const auto& int_arg_mp : cinn_kernel_info_.int_args_map) { + func_args_.emplace_back(kernel_args[int_arg_mp.second.arg_idx]->dims().at( + int_arg_mp.second.dim_idx)); + func_args_.emplace_back(static_cast( + kernel_args[int_arg_mp.second.arg_idx]->dims().at( + int_arg_mp.second.dim_idx))); + } + + int32_t** output_tensor_shape; + output_tensor_shape = reinterpret_cast( + malloc(kernel_args.size() * sizeof(int32_t**))); + for (int i = 0; i < kernel_args.size(); i++) { + // '4' is output tensor shape dim + output_tensor_shape[i] = reinterpret_cast( + malloc(kernel_args[i]->dims().size() * sizeof(int32_t**))); + } + ((infer_shape_func_ptr_g)cinn_kernel_info_.infer_shape_fn_ptr)( + static_cast(func_args_.data()), + func_args_.size(), + output_tensor_shape); + + for (int i = 0; i < kernel_args.size(); i++) { + DDim dim(output_tensor_shape[i], kernel_args[i]->dims().size()); + kernel_args[i]->Resize(dim); + } + } + private: CINNKernelInfo cinn_kernel_info_; @@ -103,14 +142,18 @@ CinnJitInstruction::CinnJitInstruction( ->GetMutable(); tensor_args_.push_back(tensor); - - if (!FLAGS_cinn_bucket_compile) { - auto alloc_tensor_type = - result.type().dyn_cast(); - tensor->set_type( - paddle::dialect::TransToPhiDataType(alloc_tensor_type.dtype())); - tensor->Resize(alloc_tensor_type.dims()); - } + // auto alloc_tensor_type = + // result.type().dyn_cast(); + + // VLOG(-1) << "xxx " << alloc_tensor_type.dims(); + + // if (!FLAGS_cinn_bucket_compile) { + auto alloc_tensor_type = + result.type().dyn_cast(); + tensor->set_type( + paddle::dialect::TransToPhiDataType(alloc_tensor_type.dtype())); + tensor->Resize(alloc_tensor_type.dims()); + // } } } @@ -120,16 +163,31 @@ void CinnJitInstruction::Run() { auto stream = gpu_ctx->stream(); + // for (size_t i = 0; i < tensor_args_.size(); ++i) { + // // TODO(6clc): template infer shape from tensor_args_[0]. + // // After supporting symbolic calculation, perfect the code to query shape + // // of output tensor + // VLOG(-1) << "xxx " << tensor_args_[i]->dims(); + // if (FLAGS_cinn_bucket_compile) { + // tensor_args_[i]->Resize(tensor_args_[0]->dims()); + // } + // gpu_ctx->Alloc(tensor_args_[i], tensor_args_[i]->dtype()); + // } + // 1. infer shape + + fn_ptr_impl_->InferShape(tensor_args_); for (size_t i = 0; i < tensor_args_.size(); ++i) { // TODO(6clc): template infer shape from tensor_args_[0]. 
// After supporting symbolic calculation, perfect the code to query shape // of output tensor - if (FLAGS_cinn_bucket_compile) { - tensor_args_[i]->Resize(tensor_args_[0]->dims()); - } + // VLOG(-1) << "xxx " << tensor_args_[i]->dims(); + // if (FLAGS_cinn_bucket_compile) { + // tensor_args_[i]->Resize(tensor_args_[0]->dims()); + // } gpu_ctx->Alloc(tensor_args_[i], tensor_args_[i]->dtype()); } + // 2. exexute kernel fn_ptr_impl_->Run(tensor_args_, static_cast(stream)); #else VLOG(phi::FATAL) << "Not Supported: cinn jit instruction currently does not " From aced24424694701a482d78acdea6a5d53b53add4 Mon Sep 17 00:00:00 2001 From: 6clc Date: Tue, 2 Jan 2024 17:32:00 +0800 Subject: [PATCH 3/8] runtime(cinn): update struct name --- paddle/cinn/hlir/framework/op_lowering.h | 4 +- paddle/cinn/hlir/framework/op_lowering_impl.h | 11 +-- .../hlir/framework/op_lowering_impl_base.h | 7 +- .../hlir/framework/pir/compilation_task.cc | 13 ++- .../hlir/framework/pir/compilation_task.h | 3 +- .../hlir/framework/pir/op_lowering_impl.cc | 91 ++++++++++++++----- .../hlir/framework/pir/op_lowering_impl.h | 12 ++- 7 files changed, 95 insertions(+), 46 deletions(-) diff --git a/paddle/cinn/hlir/framework/op_lowering.h b/paddle/cinn/hlir/framework/op_lowering.h index 442cecc861e6fe..d4b4a78e9cd3fa 100644 --- a/paddle/cinn/hlir/framework/op_lowering.h +++ b/paddle/cinn/hlir/framework/op_lowering.h @@ -47,8 +47,8 @@ class OpLowerer { group, apply_op_schedule, apply_group_schedule, apply_pass); } - std::vector>> + std::vector< + std::pair> BucketLower(const T& group, bool apply_op_schedule = false, bool apply_group_schedule = true, diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.h b/paddle/cinn/hlir/framework/op_lowering_impl.h index 8e2a7fe366519a..d48cbbeb7e9b4a 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl.h @@ -60,12 +60,11 @@ class OpLowererImpl : public OpLowererImplBase { bool apply_group_schedule = true, bool apply_pass = true); - std::vector>> - BucketLower(const GroupPtr& group, - bool apply_op_schedule = false, - bool apply_group_schedule = true, - bool apply_pass = true) { + std::vector> BucketLower( + const GroupPtr& group, + bool apply_op_schedule = false, + bool apply_group_schedule = true, + bool apply_pass = true) { CINN_NOT_IMPLEMENTED; } diff --git a/paddle/cinn/hlir/framework/op_lowering_impl_base.h b/paddle/cinn/hlir/framework/op_lowering_impl_base.h index 09a2373101163b..5fdf45894f756f 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl_base.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl_base.h @@ -30,6 +30,10 @@ namespace framework { template class OpLowererImplBase { public: + struct WrapLoweredFunc { + ir::LoweredFunc infer_shape_func; + ir::LoweredFunc kernel_func; + }; OpLowererImplBase() = default; ~OpLowererImplBase() = default; @@ -38,8 +42,7 @@ class OpLowererImplBase { bool apply_group_schedule = true, bool apply_pass = true) = 0; - virtual std::vector>> + virtual std::vector> BucketLower(const T& group, bool apply_op_schedule = false, bool apply_group_schedule = true, diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.cc b/paddle/cinn/hlir/framework/pir/compilation_task.cc index 1726cdd7aa717f..c6d3412102c302 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_task.cc @@ -25,14 +25,13 @@ namespace framework { void GroupCompilationContext::SetLoweredFuncs( std::vector>>&& - funcs) { - for (std::pair>& predicate2func : - funcs) { + 
pir::OpLowererImpl::WrapLoweredFunc>>&& funcs) { + for (std::pair& + predicate2func : funcs) { predicates_.push_back(predicate2func.first); - lowered_funcs_.push_back(predicate2func.second.second); - infer_shape_lowered_funcs_.push_back(predicate2func.second.first); + lowered_funcs_.push_back(predicate2func.second.kernel_func); + infer_shape_lowered_funcs_.push_back( + predicate2func.second.infer_shape_func); ++func_size_; } } diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.h b/paddle/cinn/hlir/framework/pir/compilation_task.h index 223c44845896e2..9e96c64694527e 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.h +++ b/paddle/cinn/hlir/framework/pir/compilation_task.h @@ -33,8 +33,7 @@ class GroupCompilationContext { void SetLoweredFuncs( std::vector>>&& - funcs); + pir::OpLowererImpl::WrapLoweredFunc>>&& funcs); std::string PrintPredicate2Funcs() const; void* FuncPtr(); std::shared_ptr BackendCompiler(); diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 6c55ee8f0c749b..26691257e0a737 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -100,8 +100,7 @@ std::vector OpLowererImpl::Lower(const GroupPtr& group, } } -std::vector>> +std::vector> OpLowererImpl::BucketLower(const GroupPtr& group, bool apply_op_schedule, bool apply_group_schedule, @@ -152,19 +151,22 @@ OpLowererImpl::BucketLower(const GroupPtr& group, // 3.Do post-processing, // including preparing function args and temporary variables, // applying low-level optimization passes, etc. - std::vector>> - cond2funcs; + std::vector> cond2funcs; for (std::pair& cond2body : cond2func_bodies) { std::vector group_func_arg_tensors_copy = group_func_arg_tensors; + std::vector group_func_args; std::vector funcs = PostProcess(group, tensor_map, apply_op_schedule, cond2body.second, - &group_func_arg_tensors_copy); - cond2funcs.push_back({cond2body.first, {funcs[0], funcs[1]}}); + &group_func_arg_tensors_copy, + &group_func_args); + ir::LoweredFunc infer_shape_func = GenerateInferShapeFunc( + group, group_func_arg_tensors_copy, group_func_args); + cond2funcs.push_back({cond2body.first, {infer_shape_func, funcs[0]}}); } return cond2funcs; } @@ -293,11 +295,13 @@ std::vector OpLowererImpl::LowerMapExpr( // 3.Do post-processing, // including preparing function args and temporary variables, // applying low-level optimization passes, etc. + std::vector group_func_args; return PostProcess(group, *tensor_map, apply_op_schedule, ir_sch.GetModule().GetExprs()[0], - group_func_arg_tensors); + group_func_arg_tensors, + &group_func_args); } std::vector OpLowererImpl::LowerGroup( @@ -346,11 +350,13 @@ std::vector OpLowererImpl::LowerGroup( // 3.Do post-processing, // including preparing function args and temporary variables, // applying low-level optimization passes, etc. 
+ std::vector group_func_args; return PostProcess(group, tensor_map, do_op_schedule, ir_sch.GetModule().GetExprs().at(0), - &group_func_arg_tensors); + &group_func_arg_tensors, + &group_func_args); } std::vector OpLowererImpl::LowerCustomCall( @@ -404,16 +410,20 @@ std::vector OpLowererImpl::PostProcess( const std::unordered_map<::pir::Value, ir::Tensor>& tensor_map, bool done_op_schedule, ir::Expr func_body, - std::vector* group_func_arg_tensors) { + std::vector* group_func_arg_tensors, + std::vector* group_func_args) { // 1.Prepare function args group->input_names.clear(); - std::vector group_func_args; + if (group_func_args == nullptr) { + std::vector group_func_args; + } std::unordered_set arg_name_set; for (auto& arg_tensor : *group_func_arg_tensors) { // input data name. group->input_names.push_back(arg_tensor->name); // input args - group_func_args.emplace_back(arg_tensor->buffer, ir::Argument::IO::kInput); + (*group_func_args) + .emplace_back(arg_tensor->buffer, ir::Argument::IO::kInput); arg_name_set.insert(arg_tensor->buffer->name); } @@ -435,14 +445,15 @@ std::vector OpLowererImpl::PostProcess( group_func_arg_tensors->push_back(tensor); // output args group->output_names.push_back(tensor->name); - group_func_args.emplace_back(tensor->buffer, ir::Argument::IO::kOutput); + (*group_func_args) + .emplace_back(tensor->buffer, ir::Argument::IO::kOutput); arg_name_set.insert(tensor->buffer->name); } } if (!done_op_schedule) { std::unordered_set args_set; - for (auto arg : group_func_args) { + for (auto arg : (*group_func_args)) { args_set.insert(arg.name()); } for (auto& op : group->ops) { @@ -458,15 +469,16 @@ std::vector OpLowererImpl::PostProcess( group->output_values.push_back(opresult); group_func_arg_tensors->push_back(tensor); group->output_names.push_back(tensor->name); - group_func_args.emplace_back(tensor->buffer, ir::Argument::IO::kOutput); + group_func_args->emplace_back(tensor->buffer, + ir::Argument::IO::kOutput); } } } std::map mps; // update args for dynamic dim - int num_tensor_args = static_cast(group_func_args.size()); - int non_tensor_arg_idx = group_func_args.size(); + int num_tensor_args = static_cast(group_func_args->size()); + int non_tensor_arg_idx = group_func_args->size(); std::unordered_set int_args_set; std::vector ir_bodys; for (int tensor_arg_idx = 0; tensor_arg_idx < num_tensor_args; @@ -502,7 +514,7 @@ std::vector OpLowererImpl::PostProcess( continue; } int_args_set.insert(symbol_name); - group_func_args.emplace_back( + group_func_args->emplace_back( ir::_Var_::Make(symbol_name, cinn::common::Int(32))); group->int_args_map[non_tensor_arg_idx++] = {tensor_arg_idx, tensor_arg_dim_idx}; @@ -511,11 +523,6 @@ std::vector OpLowererImpl::PostProcess( } } } - ir::LoweredFunc infer_shape_func = - ir::_LoweredFunc_::Make(group->FuncName() + "_infer_shape", - group_func_args, - ir::Block::Make(ir_bodys), - {}); #ifdef CINN_WITH_CUDA optim::OptimizeExprGPU(&(func_body)); @@ -527,14 +534,14 @@ std::vector OpLowererImpl::PostProcess( lang::GetTempBuffers(*group_func_arg_tensors, stages, func_body); // 3.Building LoweredFunc auto func = ir::_LoweredFunc_::Make( - group->FuncName(), group_func_args, func_body, temp_buffers); + group->FuncName(), *group_func_args, func_body, temp_buffers); if (!done_op_schedule) { func->PrepareBufferCastExprs(); } // 4.Apply low level pass func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); - return {infer_shape_func, func}; + return {func}; } std::vector OpLowererImpl::LowerOps( @@ -1051,6 +1058,42 @@ bool 
OpLowererImpl::IsInTensorMap( return false; } +ir::LoweredFunc OpLowererImpl::GenerateInferShapeFunc( + const GroupPtr& group, + const std::vector group_func_arg_tensors, + const std::vector group_func_args) { + // CHECK_EQ(group_func_arg_tensors.size(), group_func_args.size()); + std::vector ir_bodys; + for (int tensor_arg_idx = 0; tensor_arg_idx < group_func_arg_tensors.size(); + tensor_arg_idx++) { + auto tensor_dim = group_func_arg_tensors[tensor_arg_idx]->sym_shape; + int tensor_dim_size = tensor_dim.size(); + auto tensor_shape = group_func_arg_tensors[tensor_arg_idx]->shape; + + ir::Var tensor_shape_args(TENSOR_SHAPE_ARGS, type_of()); + for (int i = 0; i < tensor_shape.size(); i++) { + ir::Expr call_set_infer_shape_value = + ir::Call::Make(type_of(), + runtime::intrinsic::set_value, + {tensor_shape_args, + ir::Expr(tensor_arg_idx), + ir::Expr(i), + tensor_shape[i]}, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0); + ir_bodys.push_back(call_set_infer_shape_value); + } + } + ir::LoweredFunc infer_shape_func = + ir::_LoweredFunc_::Make(group->FuncName() + "_infer_shape", + group_func_args, + ir::Block::Make(ir_bodys), + {}); + return infer_shape_func; +} + } // namespace pir } // namespace framework } // namespace hlir diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h index d6698ee37cd50f..8c45e3c39e3274 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h @@ -70,8 +70,7 @@ class OpLowererImpl : public OpLowererImplBase { * @param apply_group_schedule Whether to schedule at group level. * @return The lowered funcs. */ - std::vector>> + std::vector> BucketLower(const GroupPtr& group, bool apply_op_schedule = false, bool apply_group_schedule = true, @@ -111,6 +110,7 @@ class OpLowererImpl : public OpLowererImplBase { * applied. * @param func_body The scheduled func body of group. * @param group_func_arg_tensors Tensors used as the group function arguments. + * @param group_func_args Arguments used as the group function arguments. * @return The lowered funcs after the post processing. */ std::vector PostProcess( @@ -118,7 +118,8 @@ class OpLowererImpl : public OpLowererImplBase { const std::unordered_map<::pir::Value, ir::Tensor>& tensor_map, bool done_op_schedule, ir::Expr func_body, - std::vector* group_func_arg_tensors); + std::vector* group_func_arg_tensors, + std::vector* group_func_args); /** * @brief Lower an Op set to CINN IR. @@ -215,6 +216,11 @@ class OpLowererImpl : public OpLowererImplBase { const std::unordered_map<::pir::Value, ir::Tensor>& tensor_map, const std::unordered_map& tmp_tensor_info); + ir::LoweredFunc GenerateInferShapeFunc( + const GroupPtr& group, + const std::vector group_func_arg_tensors, + const std::vector group_func_args); + // Functions used to determine which Ops to schedule at op level, define a // policy for each type of group. 
inline bool ReduceScheduleDetermineFunction(::pir::Operation* op); From a4fafba0c91d8713e22bf81d213f9c1be16479e2 Mon Sep 17 00:00:00 2001 From: 6clc Date: Wed, 3 Jan 2024 08:28:27 +0800 Subject: [PATCH 4/8] runtime(cinn): update output tensor name --- .../hlir/framework/pir/op_lowering_impl.cc | 7 +++- .../instruction/cinn_jit_instruction.cc | 39 ++++++++++++------- .../instruction/cinn_jit_instruction.h | 3 +- 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 26691257e0a737..86d53776d06feb 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -1064,8 +1064,12 @@ ir::LoweredFunc OpLowererImpl::GenerateInferShapeFunc( const std::vector group_func_args) { // CHECK_EQ(group_func_arg_tensors.size(), group_func_args.size()); std::vector ir_bodys; + int output_tensor_idx = 0; for (int tensor_arg_idx = 0; tensor_arg_idx < group_func_arg_tensors.size(); tensor_arg_idx++) { + if (group_func_args[tensor_arg_idx].is_input()) { + continue; + } auto tensor_dim = group_func_arg_tensors[tensor_arg_idx]->sym_shape; int tensor_dim_size = tensor_dim.size(); auto tensor_shape = group_func_arg_tensors[tensor_arg_idx]->shape; @@ -1076,7 +1080,7 @@ ir::LoweredFunc OpLowererImpl::GenerateInferShapeFunc( ir::Call::Make(type_of(), runtime::intrinsic::set_value, {tensor_shape_args, - ir::Expr(tensor_arg_idx), + ir::Expr(output_tensor_idx), ir::Expr(i), tensor_shape[i]}, {}, @@ -1085,6 +1089,7 @@ ir::LoweredFunc OpLowererImpl::GenerateInferShapeFunc( 0); ir_bodys.push_back(call_set_infer_shape_value); } + ++output_tensor_idx; } ir::LoweredFunc infer_shape_func = ir::_LoweredFunc_::Make(group->FuncName() + "_infer_shape", diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc index 680e8c1fe0eade..7ffbb147636e9e 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc @@ -63,13 +63,14 @@ class CinnJitInstruction::FnPtrImpl { static_cast(func_args_.data()), func_args_.size(), stream); } - void InferShape(const std::vector& kernel_args) { + void InferShape(const std::vector& kernel_args, + int32_t input_tensor_size, + int32_t output_tensor_size) { func_args_.clear(); // 1. Convert the phi::DenseTensor type to cinn_pod_value_t for (size_t i = 0; i < kernel_args.size(); ++i) { auto* buffer = new cinn_buffer_t(); - // buffer->memory = reinterpret_cast(kernel_args[i]->data()); func_args_.emplace_back(buffer); } // 2. 
Convert arg's data about shape of Tensor to cinn_pod_value_t @@ -81,23 +82,28 @@ class CinnJitInstruction::FnPtrImpl { int_arg_mp.second.dim_idx))); } - int32_t** output_tensor_shape; - output_tensor_shape = reinterpret_cast( - malloc(kernel_args.size() * sizeof(int32_t**))); - for (int i = 0; i < kernel_args.size(); i++) { + int32_t* output_tensor_shapes[output_tensor_size]; + for (int i = 0; i < output_tensor_size; i++) { // '4' is output tensor shape dim - output_tensor_shape[i] = reinterpret_cast( - malloc(kernel_args[i]->dims().size() * sizeof(int32_t**))); + output_tensor_shapes[i] = reinterpret_cast( + malloc(kernel_args[input_tensor_size + i]->dims().size() * + sizeof(int32_t*))); + VLOG(-1) << "dddd " << kernel_args[input_tensor_size + i]->dims(); } + ((infer_shape_func_ptr_g)cinn_kernel_info_.infer_shape_fn_ptr)( static_cast(func_args_.data()), func_args_.size(), - output_tensor_shape); - - for (int i = 0; i < kernel_args.size(); i++) { - DDim dim(output_tensor_shape[i], kernel_args[i]->dims().size()); - kernel_args[i]->Resize(dim); + output_tensor_shapes); + + for (int i = 0; i < output_tensor_size; i++) { + DDim dim(output_tensor_shapes[i], + kernel_args[input_tensor_size + i]->dims().size()); + kernel_args[input_tensor_size + i]->Resize(dim); + VLOG(-1) << "dddd " << kernel_args[input_tensor_size + i]->dims(); + free(output_tensor_shapes[i]); } + VLOG(-1) << "ddd "; } private: @@ -115,6 +121,8 @@ CinnJitInstruction::CinnJitInstruction( auto jit_kernel_op = op->dyn_cast(); fn_ptr_impl_ = std::make_shared(jit_kernel_op.cinn_kernel_info()); op_ = op; + input_tensor_size = op->num_operands(); + output_tensor_size = op->num_results(); place_ = place; @@ -175,12 +183,13 @@ void CinnJitInstruction::Run() { // } // 1. infer shape - fn_ptr_impl_->InferShape(tensor_args_); + fn_ptr_impl_->InferShape(tensor_args_, input_tensor_size, output_tensor_size); + VLOG(-1) << "yyyy "; for (size_t i = 0; i < tensor_args_.size(); ++i) { // TODO(6clc): template infer shape from tensor_args_[0]. 
// After supporting symbolic calculation, perfect the code to query shape // of output tensor - // VLOG(-1) << "xxx " << tensor_args_[i]->dims(); + VLOG(-1) << "xxx " << tensor_args_[i]->dims(); // if (FLAGS_cinn_bucket_compile) { // tensor_args_[i]->Resize(tensor_args_[0]->dims()); // } diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h index b15fae77bdbe77..5f744f4229d911 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h @@ -49,7 +49,8 @@ class CinnJitInstruction : public InstructionBase { phi::DeviceContext* dev_ctx_; - phi::DenseTensor* out_tensor_; + int32_t input_tensor_size; + int32_t output_tensor_size; std::vector tensor_args_; From f32d663aa2c33388aeb2795923501535969cf04d Mon Sep 17 00:00:00 2001 From: 6clc Date: Wed, 3 Jan 2024 09:40:02 +0800 Subject: [PATCH 5/8] runtime(cinn): remove useless code --- .../hlir/framework/op_lowering_impl_base.h | 5 ++- .../hlir/framework/pir/op_lowering_impl.cc | 34 +++------------ .../hlir/framework/pir/op_lowering_impl.h | 7 ++++ .../instruction/cinn_jit_instruction.cc | 41 ++++--------------- 4 files changed, 24 insertions(+), 63 deletions(-) diff --git a/paddle/cinn/hlir/framework/op_lowering_impl_base.h b/paddle/cinn/hlir/framework/op_lowering_impl_base.h index 5fdf45894f756f..32bda3ca50f675 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl_base.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl_base.h @@ -31,8 +31,11 @@ template class OpLowererImplBase { public: struct WrapLoweredFunc { - ir::LoweredFunc infer_shape_func; ir::LoweredFunc kernel_func; + ir::LoweredFunc infer_shape_func; + WrapLoweredFunc(ir::LoweredFunc kernel_func, + ir::LoweredFunc infer_shape_func = ir::LoweredFunc()) + : infer_shape_func(infer_shape_func), kernel_func(kernel_func) {} }; OpLowererImplBase() = default; ~OpLowererImplBase() = default; diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 86d53776d06feb..1a5ec321dc869c 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -107,9 +107,10 @@ OpLowererImpl::BucketLower(const GroupPtr& group, bool apply_pass) { // 1.Do compute, lower and schedule for each op. 
auto& ops = group->ops; - // if (ops.size() == 1 && ops[0]->name() == "custom_call") { - // return {{ir::Expr(1), LowerCustomCall(group)[0]}}; - // } + if (ops.size() == 1 && ops[0]->name() == "custom_call") { + return {{ir::Expr(1), + pir::OpLowererImpl::WrapLoweredFunc(LowerCustomCall(group)[0])}}; + } std::vector group_func_arg_tensors; std::unordered_map<::pir::Value, ir::Tensor> tensor_map; // for some op, it will output more tmp value and regard as @@ -166,7 +167,7 @@ OpLowererImpl::BucketLower(const GroupPtr& group, &group_func_args); ir::LoweredFunc infer_shape_func = GenerateInferShapeFunc( group, group_func_arg_tensors_copy, group_func_args); - cond2funcs.push_back({cond2body.first, {infer_shape_func, funcs[0]}}); + cond2funcs.push_back({cond2body.first, {funcs[0], infer_shape_func}}); } return cond2funcs; } @@ -414,9 +415,6 @@ std::vector OpLowererImpl::PostProcess( std::vector* group_func_args) { // 1.Prepare function args group->input_names.clear(); - if (group_func_args == nullptr) { - std::vector group_func_args; - } std::unordered_set arg_name_set; for (auto& arg_tensor : *group_func_arg_tensors) { // input data name. @@ -480,31 +478,10 @@ std::vector OpLowererImpl::PostProcess( int num_tensor_args = static_cast(group_func_args->size()); int non_tensor_arg_idx = group_func_args->size(); std::unordered_set int_args_set; - std::vector ir_bodys; for (int tensor_arg_idx = 0; tensor_arg_idx < num_tensor_args; tensor_arg_idx++) { auto tensor_dim = (*group_func_arg_tensors)[tensor_arg_idx]->sym_shape; int tensor_dim_size = tensor_dim.size(); - auto tensor_shape = (*group_func_arg_tensors)[tensor_arg_idx]->shape; - - ir::Var tensor_shape_args(TENSOR_SHAPE_ARGS, type_of()); - - // if (group_func_args[tensor_arg_idx].is_output()) { - for (int i = 0; i < tensor_shape.size(); i++) { - ir::Expr call_set_infer_shape_value = - ir::Call::Make(type_of(), - runtime::intrinsic::set_value, - {tensor_shape_args, - ir::Expr(tensor_arg_idx), - ir::Expr(i), - tensor_shape[i]}, - {}, - ir::CallType::Extern, - ir::FunctionRef(), - 0); - ir_bodys.push_back(call_set_infer_shape_value); - } - // } for (int tensor_arg_dim_idx = 0; tensor_arg_dim_idx < tensor_dim_size; tensor_arg_dim_idx++) { if (tensor_dim[tensor_arg_dim_idx]->IsDynamic()) { @@ -540,7 +517,6 @@ std::vector OpLowererImpl::PostProcess( } // 4.Apply low level pass func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); - return {func}; } diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h index 8c45e3c39e3274..0a9f4d4b33820a 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h @@ -216,6 +216,13 @@ class OpLowererImpl : public OpLowererImplBase { const std::unordered_map<::pir::Value, ir::Tensor>& tensor_map, const std::unordered_map& tmp_tensor_info); + /** + * @brief Generates the output tensor infer shape function. + * @param group The group to be lowered. + * @param group_func_arg_tensors Tensors used as the group function arguments. + * @param group_func_args Arguments used as the group function arguments. + * @return The lowered func to infer output tensor's shape. 
+ */ ir::LoweredFunc GenerateInferShapeFunc( const GroupPtr& group, const std::vector group_func_arg_tensors, diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc index 7ffbb147636e9e..ba76c932b82757 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc @@ -56,7 +56,6 @@ class CinnJitInstruction::FnPtrImpl { kernel_args[int_arg_mp.second.arg_idx]->dims().at( int_arg_mp.second.dim_idx))); } - (infer_shape_func_ptr_g) cinn_kernel_info_.infer_shape_fn_ptr; // 3. Launch host kernel ((lower_func_ptr_g)cinn_kernel_info_.fn_ptr)( @@ -73,6 +72,7 @@ class CinnJitInstruction::FnPtrImpl { auto* buffer = new cinn_buffer_t(); func_args_.emplace_back(buffer); } + // 2. Convert arg's data about shape of Tensor to cinn_pod_value_t for (const auto& int_arg_mp : cinn_kernel_info_.int_args_map) { func_args_.emplace_back(kernel_args[int_arg_mp.second.arg_idx]->dims().at( @@ -82,28 +82,27 @@ class CinnJitInstruction::FnPtrImpl { int_arg_mp.second.dim_idx))); } + // 3. Define an array of Pointers to hold the output tensor shape int32_t* output_tensor_shapes[output_tensor_size]; for (int i = 0; i < output_tensor_size; i++) { - // '4' is output tensor shape dim output_tensor_shapes[i] = reinterpret_cast( malloc(kernel_args[input_tensor_size + i]->dims().size() * sizeof(int32_t*))); - VLOG(-1) << "dddd " << kernel_args[input_tensor_size + i]->dims(); } + // 4. Launch infer_shape_fn_ptr to infer shape of output tensor ((infer_shape_func_ptr_g)cinn_kernel_info_.infer_shape_fn_ptr)( static_cast(func_args_.data()), func_args_.size(), output_tensor_shapes); + // 5. Resize shape of output tensor for (int i = 0; i < output_tensor_size; i++) { DDim dim(output_tensor_shapes[i], kernel_args[input_tensor_size + i]->dims().size()); kernel_args[input_tensor_size + i]->Resize(dim); - VLOG(-1) << "dddd " << kernel_args[input_tensor_size + i]->dims(); free(output_tensor_shapes[i]); } - VLOG(-1) << "ddd "; } private: @@ -150,18 +149,11 @@ CinnJitInstruction::CinnJitInstruction( ->GetMutable(); tensor_args_.push_back(tensor); - // auto alloc_tensor_type = - // result.type().dyn_cast(); - - // VLOG(-1) << "xxx " << alloc_tensor_type.dims(); - - // if (!FLAGS_cinn_bucket_compile) { auto alloc_tensor_type = result.type().dyn_cast(); tensor->set_type( paddle::dialect::TransToPhiDataType(alloc_tensor_type.dtype())); tensor->Resize(alloc_tensor_type.dims()); - // } } } @@ -171,28 +163,11 @@ void CinnJitInstruction::Run() { auto stream = gpu_ctx->stream(); - // for (size_t i = 0; i < tensor_args_.size(); ++i) { - // // TODO(6clc): template infer shape from tensor_args_[0]. - // // After supporting symbolic calculation, perfect the code to query shape - // // of output tensor - // VLOG(-1) << "xxx " << tensor_args_[i]->dims(); - // if (FLAGS_cinn_bucket_compile) { - // tensor_args_[i]->Resize(tensor_args_[0]->dims()); - // } - // gpu_ctx->Alloc(tensor_args_[i], tensor_args_[i]->dtype()); - // } - // 1. infer shape - - fn_ptr_impl_->InferShape(tensor_args_, input_tensor_size, output_tensor_size); - VLOG(-1) << "yyyy "; + if (FLAGS_cinn_bucket_compile) { + fn_ptr_impl_->InferShape( + tensor_args_, input_tensor_size, output_tensor_size); + } for (size_t i = 0; i < tensor_args_.size(); ++i) { - // TODO(6clc): template infer shape from tensor_args_[0]. 
- // After supporting symbolic calculation, perfect the code to query shape - // of output tensor - VLOG(-1) << "xxx " << tensor_args_[i]->dims(); - // if (FLAGS_cinn_bucket_compile) { - // tensor_args_[i]->Resize(tensor_args_[0]->dims()); - // } gpu_ctx->Alloc(tensor_args_[i], tensor_args_[i]->dtype()); } From 234fd2bd431e734330db385e78402f8d9bd15116 Mon Sep 17 00:00:00 2001 From: 6clc Date: Wed, 3 Jan 2024 09:40:02 +0800 Subject: [PATCH 6/8] runtime(cinn): remove useless code From d09b152d5c9f34f5d2071b61affd419d40784e52 Mon Sep 17 00:00:00 2001 From: 6clc Date: Wed, 3 Jan 2024 10:02:32 +0800 Subject: [PATCH 7/8] runtime(cinn): rename external call's name --- paddle/cinn/hlir/framework/pir/op_lowering_impl.cc | 8 ++++---- paddle/cinn/runtime/cuda/cuda_intrinsics.cc | 7 ++++--- paddle/cinn/runtime/cuda/cuda_util.cc | 2 +- paddle/cinn/runtime/cuda/cuda_util.h | 2 +- paddle/cinn/runtime/intrinsic.h | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 1a5ec321dc869c..73e9d763d2e022 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -1054,11 +1054,11 @@ ir::LoweredFunc OpLowererImpl::GenerateInferShapeFunc( for (int i = 0; i < tensor_shape.size(); i++) { ir::Expr call_set_infer_shape_value = ir::Call::Make(type_of(), - runtime::intrinsic::set_value, - {tensor_shape_args, - ir::Expr(output_tensor_idx), + runtime::intrinsic::infer_shape_set_value, + {ir::Expr(output_tensor_idx), ir::Expr(i), - tensor_shape[i]}, + tensor_shape[i], + tensor_shape_args}, {}, ir::CallType::Extern, ir::FunctionRef(), diff --git a/paddle/cinn/runtime/cuda/cuda_intrinsics.cc b/paddle/cinn/runtime/cuda/cuda_intrinsics.cc index 68579bcb64477e..c4f335603963be 100644 --- a/paddle/cinn/runtime/cuda/cuda_intrinsics.cc +++ b/paddle/cinn/runtime/cuda/cuda_intrinsics.cc @@ -434,13 +434,14 @@ CINN_REGISTER_HELPER(cinn_cuda_host_api) { .AddInputType() // index .End(); - using cinn::runtime::cuda::set_value; - REGISTER_EXTERN_FUNC_HELPER(set_value, cinn::common::DefaultHostTarget()) + using cinn::runtime::cuda::infer_shape_set_value; + REGISTER_EXTERN_FUNC_HELPER(infer_shape_set_value, + cinn::common::DefaultHostTarget()) .SetRetType() - .AddInputType() .AddInputType() .AddInputType() .AddInputType() + .AddInputType() .End(); using cinn::runtime::cuda::cinn_call_cuda_kernel; diff --git a/paddle/cinn/runtime/cuda/cuda_util.cc b/paddle/cinn/runtime/cuda/cuda_util.cc index 413ae703bdac1f..98ba1c52d7edc3 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.cc +++ b/paddle/cinn/runtime/cuda/cuda_util.cc @@ -2748,7 +2748,7 @@ void cinn_gpu_cudnn_pool2d(const std::vector &attrs, cudnnDestroyPoolingDescriptor(pooling_desc); } -void set_value(int32_t **v, int row, int col, int32_t value) { +void infer_shape_set_value(int row, int col, int32_t value, int32_t **v) { v[row][col] = value; } void cinn_gpu_cudnn_softmax(const std::vector &attrs, diff --git a/paddle/cinn/runtime/cuda/cuda_util.h b/paddle/cinn/runtime/cuda/cuda_util.h index 25244b05c92c0a..c7d9220e00688f 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.h +++ b/paddle/cinn/runtime/cuda/cuda_util.h @@ -96,7 +96,7 @@ void cinn_call_cuda_memcpy(void* v_args, void* stream = nullptr); int32_t cinn_get_value_in_cuda_kernel_args(void* v_args, int idx); -void set_value(int32_t** v, int row, int col, int32_t value); +void infer_shape_set_value(int row, int col, int32_t value, int32_t** v); /** * 
Call a CUDA compiled kernel. diff --git a/paddle/cinn/runtime/intrinsic.h b/paddle/cinn/runtime/intrinsic.h index f3f6e2d5ae9a3b..c2db240de2d12f 100644 --- a/paddle/cinn/runtime/intrinsic.h +++ b/paddle/cinn/runtime/intrinsic.h @@ -107,7 +107,7 @@ static const char* call_cuda_kernel = "cinn_call_cuda_kernel"; static const char* get_value_in_cuda_kernel_args = "cinn_get_value_in_cuda_kernel_args"; -static const char* set_value = "set_value"; +static const char* infer_shape_set_value = "infer_shape_set_value"; static const char* pod_values_to_array_repr = "pod_values_to_array"; From 3ecb41d65a7f2a5950413005495128ce12fb7e59 Mon Sep 17 00:00:00 2001 From: 6clc Date: Wed, 3 Jan 2024 15:38:48 +0800 Subject: [PATCH 8/8] runtime(cinn): refine code --- paddle/cinn/backends/codegen_cuda_host.cc | 2 +- paddle/cinn/hlir/framework/pir/op_lowering_impl.cc | 2 +- .../new_executor/instruction/cinn_jit_instruction.cc | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/cinn/backends/codegen_cuda_host.cc b/paddle/cinn/backends/codegen_cuda_host.cc index b440b424a9c2d9..11e986bb9ace1b 100644 --- a/paddle/cinn/backends/codegen_cuda_host.cc +++ b/paddle/cinn/backends/codegen_cuda_host.cc @@ -200,7 +200,7 @@ llvm::Value* CodeGenCUDA_Host::LowerHostFunc(const ir::_LoweredFunc_* func) { // Set local scope table CHECK_EQ(ll_function_args.size(), func->args.size()); - for (int i = 0; i < ll_function_args.size(); i++) { + for (int i = 0; i < ll_function_args.size(); ++i) { SetVar(func->args[i].name(), ll_function_args[i]); } llvm::BasicBlock* entry = llvm::BasicBlock::Create( diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 73e9d763d2e022..062e5db1cc1f8c 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -1042,7 +1042,7 @@ ir::LoweredFunc OpLowererImpl::GenerateInferShapeFunc( std::vector ir_bodys; int output_tensor_idx = 0; for (int tensor_arg_idx = 0; tensor_arg_idx < group_func_arg_tensors.size(); - tensor_arg_idx++) { + ++tensor_arg_idx) { if (group_func_args[tensor_arg_idx].is_input()) { continue; } diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc index ba76c932b82757..180eb4f478fa6b 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc @@ -84,7 +84,7 @@ class CinnJitInstruction::FnPtrImpl { // 3. Define an array of Pointers to hold the output tensor shape int32_t* output_tensor_shapes[output_tensor_size]; - for (int i = 0; i < output_tensor_size; i++) { + for (int i = 0; i < output_tensor_size; ++i) { output_tensor_shapes[i] = reinterpret_cast( malloc(kernel_args[input_tensor_size + i]->dims().size() * sizeof(int32_t*))); @@ -97,7 +97,7 @@ class CinnJitInstruction::FnPtrImpl { output_tensor_shapes); // 5. Resize shape of output tensor - for (int i = 0; i < output_tensor_size; i++) { + for (int i = 0; i < output_tensor_size; ++i) { DDim dim(output_tensor_shapes[i], kernel_args[input_tensor_size + i]->dims().size()); kernel_args[input_tensor_size + i]->Resize(dim);
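Taken together, the series leaves CinnJitInstruction::Run() with a two-step contract: first call the generated infer-shape function to size the output tensors, then allocate them and launch the compiled kernel. The following self-contained C++ sketch mirrors that flow under stated assumptions: FakeTensor and RunOnce are illustrative stand-ins for phi::DenseTensor and the instruction's Run(), not real Paddle APIs, while the two function-pointer typedefs are copied from cinn_jit_instruction.cc. One deliberate difference: the sketch sizes each shape row with sizeof(int32_t), whereas the patch itself uses sizeof(int32_t*), which over-allocates but is otherwise harmless.

#include <cstdint>
#include <cstdlib>
#include <vector>

// Signatures mirrored from cinn_jit_instruction.cc.
typedef void (*lower_func_ptr_g)(void*, int32_t, void*);
typedef void (*infer_shape_func_ptr_g)(void*, int32_t, int32_t**);

struct FakeTensor {            // stand-in for phi::DenseTensor
  std::vector<int32_t> dims;   // current (possibly stale) shape
  void Resize(const int32_t* d, size_t rank) { dims.assign(d, d + rank); }
};

void RunOnce(infer_shape_func_ptr_g infer_fn,
             lower_func_ptr_g kernel_fn,
             std::vector<FakeTensor*>& tensors,  // inputs first, then outputs
             int input_tensor_size,
             int output_tensor_size,
             void* packed_args,
             int32_t num_args,
             void* stream) {
  // 1. One row of the shape table per output, sized by its current rank
  //    (the rank comes from the static AllocatedDenseTensorType dims).
  std::vector<int32_t*> shapes(output_tensor_size);
  for (int i = 0; i < output_tensor_size; ++i) {
    FakeTensor* out = tensors[input_tensor_size + i];
    shapes[i] =
        static_cast<int32_t*>(malloc(out->dims.size() * sizeof(int32_t)));
  }
  // 2. The host infer-shape function fills the table from the packed args.
  infer_fn(packed_args, num_args, shapes.data());
  // 3. Resize each output from its row, then release the row.
  for (int i = 0; i < output_tensor_size; ++i) {
    FakeTensor* out = tensors[input_tensor_size + i];
    out->Resize(shapes[i], out->dims.size());
    free(shapes[i]);
  }
  // 4. Outputs now carry correct shapes; allocate and launch the kernel.
  kernel_fn(packed_args, num_args, stream);
}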