From 31f267bd5f134a4320cd57f7e1d651447043fa1e Mon Sep 17 00:00:00 2001 From: Peter Goodman Date: Tue, 22 Sep 2020 23:22:56 -0400 Subject: [PATCH] Cmake refactor (#693) * Update to latest remill APIs and way of using CMake. * Minor fixes * Bug fixes for AArch64 * Tweaks * Adds some nifty functionality useful for debugging some aarch64 bugs * Fixes a subtle bug * Bug fixes * Make the test suite use explicit args * Bug fix in xrefs * Disable tests for now --- .gitignore | 7 + CMakeLists.txt | 64 ++------ Dockerfile | 22 +-- README.md | 2 +- cmake/modules/FindProtobuf.cmake | 173 ++++++++++++++++++++++ cmake/settings.cmake | 86 +---------- mcsema/BC/Function.cpp | 25 ++-- mcsema/BC/Instruction.cpp | 49 ++---- mcsema/BC/Instruction.h | 5 +- mcsema/BC/Lift.cpp | 2 + mcsema/BC/Lift.h | 2 + mcsema/BC/Optimize.cpp | 2 +- mcsema/BC/Segment.cpp | 119 ++++++++++++++- mcsema/BC/Segment.h | 1 + mcsema/BC/Util.cpp | 8 +- mcsema/CFG/CFG.cpp | 9 +- mcsema/OS/Linux/X86/CMakeLists.txt | 8 +- mcsema/OS/Linux/generate_abi_wrapper.py | 4 +- tests/test_suite_generator/CMakeLists.txt | 14 +- tests/test_suite_generator/src/start.py | 2 + tests/var_recovery/recover_and_test.sh | 2 + tools/mcsema_disass/__main__.py | 9 +- tools/mcsema_disass/ida7/arm_util.py | 15 +- tools/mcsema_disass/ida7/disass.py | 3 + tools/mcsema_disass/ida7/get_cfg.py | 19 ++- tools/mcsema_disass/ida7/refs.py | 29 ++-- tools/setup.py | 4 +- tools/setup_launcher.sh | 1 + 28 files changed, 440 insertions(+), 246 deletions(-) create mode 100644 cmake/modules/FindProtobuf.cmake diff --git a/.gitignore b/.gitignore index a1ed911a7..ff48162a5 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ tools/regtrace/obj-intel64/* third_party/* generated/* cxxcommon +cxx-common tools/build/* tools/dist/* @@ -58,6 +59,12 @@ tools/mcsema_disass/binja/CFG_pb2.py tests/linux/x86/*.elf tests/linux/amd64/*.elf +.DS_Store +.project +.cproject +.settings +.idea + 
#==============================================================================# # Directories to ignore (do not add trailing '/'s, they skip symlinks). #==============================================================================# diff --git a/CMakeLists.txt b/CMakeLists.txt index 74b5556f0..e539050c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,13 @@ endif() # libraries # +# remill +if("${PLATFORM_NAME}" STREQUAL "windows") + set(REMILL_FINDPACKAGE_HINTS HINTS "${CMAKE_INSTALL_PREFIX}/remill/lib") +endif() + +find_package(remill REQUIRED ${REMILL_FINDPACKAGE_HINTS}) + # protobuf find_package(Protobuf REQUIRED) list(APPEND PROJECT_LIBRARIES ${Protobuf_LIBRARIES}) @@ -69,10 +76,6 @@ add_custom_target(protobuf_python_module_ida DEPENDS ${PROJECT_PROTOBUFPYTHONMODULE} ) -add_custom_target(protobuf_python_module_binja - DEPENDS ${PROJECT_PROTOBUFPYTHONMODULE} -) - # disable -Werror on these file since they have been generated set_source_files_properties(${PROJECT_PROTOBUFSOURCEFILES} PROPERTIES COMPILE_FLAGS "-Wno-sign-conversion -Wno-shorten-64-to-32 -Wno-conversion" @@ -82,14 +85,6 @@ set_source_files_properties(${PROJECT_PROTOBUFHEADERFILES} PROPERTIES COMPILE_FLAGS "-Wno-sign-conversion -Wno-shorten-64-to-32 -Wno-conversion" ) -# llvm -find_package(LLVM REQUIRED CONFIG HINTS ${FINDPACKAGE_LLVM_HINTS}) - -string(REPLACE "." 
";" LLVM_VERSION_LIST ${LLVM_PACKAGE_VERSION}) -list(GET LLVM_VERSION_LIST 0 LLVM_MAJOR_VERSION) -list(GET LLVM_VERSION_LIST 1 LLVM_MINOR_VERSION) -set(REMILL_LLVM_VERSION "${LLVM_MAJOR_VERSION}.${LLVM_MINOR_VERSION}") - # # target settings # @@ -132,8 +127,7 @@ list(APPEND PROJECT_INCLUDEDIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}) add_dependencies(${MCSEMA_LIFT} semantics - protobuf_python_module_ida - protobuf_python_module_binja) + protobuf_python_module_ida) # # libraries @@ -154,51 +148,11 @@ list(APPEND PROJECT_LIBRARIES remill) if(NOT TARGET anvill-${REMILL_LLVM_VERSION}) find_package(anvill REQUIRED) if(NOT anvill_FOUND) - message(FATAL_ERROR "McSema depends upon Anvill being cloned into Remill's tools directory") + message(FATAL_ERROR "McSema depends upon Anvill being installed") endif() endif() list(APPEND PROJECT_LIBRARIES anvill-${REMILL_LLVM_VERSION}) -if (LLVM_Z3_INSTALL_DIR) - set(need_z3 TRUE) -elseif(DEFINED ENV{TRAILOFBITS_LIBRARIES}) - set(LLVM_Z3_INSTALL_DIR "$ENV{TRAILOFBITS_LIBRARIES}/z3") - set(need_z3 TRUE) -else() - set(need_z3 FALSE) -endif() - -if(need_z3) - find_package(Z3 4.7.1) - if (NOT Z3_FOUND) - message(WARNING "Z3 >= 4.7.1 has not been found in LLVM_Z3_INSTALL_DIR: ${LLVM_Z3_INSTALL_DIR}.") - endif() -endif() - - -set(LLVM_LIBRARIES - LLVMCore LLVMSupport LLVMAnalysis LLVMipo LLVMIRReader - LLVMBitReader LLVMBitWriter LLVMTransformUtils LLVMScalarOpts - LLVMLTO -) - -list(APPEND PROJECT_LIBRARIES ${LLVM_LIBRARIES}) -list(APPEND PROJECT_DEFINITIONS ${LLVM_DEFINITIONS}) -list(APPEND PROJECT_INCLUDEDIRECTORIES ${LLVM_INCLUDE_DIRS}) - -# xed -find_package(XED REQUIRED) -list(APPEND PROJECT_LIBRARIES ${XED_LIBRARIES}) -list(APPEND PROJECT_INCLUDEDIRECTORIES ${XED_INCLUDE_DIRS}) - -# google log module -find_package(glog REQUIRED) -list(APPEND PROJECT_LIBRARIES glog::glog) - -# gflags -find_package(gflags REQUIRED) -list(APPEND PROJECT_LIBRARIES gflags) - # # target settings # diff --git a/Dockerfile b/Dockerfile index ea52baa4c..3f3dadca1 
100644 --- a/Dockerfile +++ b/Dockerfile @@ -65,17 +65,17 @@ RUN mkdir -p ./build && cd ./build && \ cmake -G Ninja -DCMAKE_PREFIX_PATH="/opt/trailofbits/remill;/opt/trailofbits/anvill" -DMCSEMA_DISABLED_ABI_LIBRARIES:STRING="" -DCMAKE_VERBOSE_MAKEFILE=True -DCMAKE_INSTALL_PREFIX=/opt/trailofbits/mcsema .. && \ cmake --build . --target install -WORKDIR tests/test_suite_generator -RUN mkdir -p build && \ - cd build && \ - cmake -DMCSEMALIFT_PATH=/opt/trailofbits/mcsema/bin \ - -DMCSEMA_PREBUILT_CFG_PATH="$(pwd)/../generated/prebuilt_cfg/" \ - -DMCSEMADISASS_PATH=/opt/trailofbits/mcsema/bin \ - .. && \ - cmake --build . --target install - -RUN cd test_suite && \ - PATH="/opt/trailofbits/mcsema/bin:${PATH}" python2.7 start.py +# WORKDIR tests/test_suite_generator +# RUN mkdir -p build && \ +# cd build && \ +# cmake -DMCSEMALIFT_PATH=/opt/trailofbits/mcsema/bin \ +# -DMCSEMA_PREBUILT_CFG_PATH="$(pwd)/../generated/prebuilt_cfg/" \ +# -DMCSEMADISASS_PATH=/opt/trailofbits/mcsema/bin \ +# .. && \ +# cmake --build . --target install +# +# RUN cd test_suite && \ +# PATH="/opt/trailofbits/mcsema/bin:${PATH}" python2.7 start.py FROM base as dist ARG LLVM_VERSION diff --git a/README.md b/README.md index 260596611..0cff09a54 100644 --- a/README.md +++ b/README.md @@ -319,7 +319,7 @@ Make sure to always execute the `vcvars64.bat` script from the "x64 Native Tools mkdir remill_build cd remill_build -cmake -G "Visual Studio 16 2019" -T llvm -A x64 -DCMAKE_BUILD_TYPE=Release -DLIBRARY_REPOSITORY_ROOT=C:\Projects\tob_libraries -DCMAKE_INSTALL_PREFIX=C:\ ..\remill +cmake -G "Visual Studio 16 2019" -T llvm -A x64 -DCMAKE_BUILD_TYPE=Release -DCXX_COMMON_REPOSITORY_ROOT=C:\Projects\tob_libraries -DCMAKE_INSTALL_PREFIX=C:\ ..\remill cmake --build . 
--config Release -- /maxcpucount:%NUMBER_OF_PROCESSORS% ``` diff --git a/cmake/modules/FindProtobuf.cmake b/cmake/modules/FindProtobuf.cmake new file mode 100644 index 000000000..21223fa53 --- /dev/null +++ b/cmake/modules/FindProtobuf.cmake @@ -0,0 +1,173 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. + +set(LIBRARY_ROOT "${CXX_COMMON_REPOSITORY_ROOT}/protobuf") + +set(Protobuf_FOUND TRUE) +set(Protobuf_INCLUDE_DIR "${LIBRARY_ROOT}/include") + +if (WIN32) + set(Protobuf_PROTOC_EXECUTABLE "${LIBRARY_ROOT}/bin/protoc.exe") + set(Protobuf_LIBRARIES ${LIBRARY_ROOT}/lib/protobuf.lib) + set(Protobuf_PROTOC_LIBRARIES ${LIBRARY_ROOT}/lib/protoc.lib) +else () + set(Protobuf_PROTOC_EXECUTABLE "${LIBRARY_ROOT}/bin/protoc") + set(Protobuf_LIBRARIES ${LIBRARY_ROOT}/lib/libprotobuf.a) + set(Protobuf_PROTOC_LIBRARIES ${LIBRARY_ROOT}/lib/libprotoc.a) +endif () + +mark_as_advanced(FORCE Protobuf_FOUND) +mark_as_advanced(FORCE Protobuf_INCLUDE_DIR) +mark_as_advanced(FORCE Protobuf_PROTOC_EXECUTABLE) +mark_as_advanced(FORCE Protobuf_LIBRARIES) +mark_as_advanced(FORCE Protobuf_PROTOC_LIBRARIES) + +# Backwards compatibility +# Define camel case versions of input variables +foreach(UPPER + PROTOBUF_SRC_ROOT_FOLDER + PROTOBUF_IMPORT_DIRS + PROTOBUF_DEBUG + PROTOBUF_LIBRARY + PROTOBUF_PROTOC_LIBRARY + PROTOBUF_INCLUDE_DIR + PROTOBUF_PROTOC_EXECUTABLE + PROTOBUF_LIBRARY_DEBUG + PROTOBUF_PROTOC_LIBRARY_DEBUG + PROTOBUF_LITE_LIBRARY + PROTOBUF_LITE_LIBRARY_DEBUG + ) + if (DEFINED ${UPPER}) + string(REPLACE "PROTOBUF_" "Protobuf_" Camel ${UPPER}) + if (NOT DEFINED ${Camel}) + set(${Camel} ${${UPPER}}) + endif() + endif() +endforeach() + +# By default have PROTOBUF_GENERATE_CPP macro pass -I to protoc +# for each directory where a proto file is referenced. 
+if(NOT DEFINED PROTOBUF_GENERATE_CPP_APPEND_PATH) + set(PROTOBUF_GENERATE_CPP_APPEND_PATH TRUE) +endif() + +function(PROTOBUF_GENERATE_CPP SRCS HDRS) + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + set(${HDRS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") + list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") + + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h" + COMMAND ${Protobuf_PROTOC_EXECUTABLE} + ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} + COMMENT "Running C++ protocol 
buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() + +function(PROTOBUF_GENERATE_PYTHON SRCS) + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer 
compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) +endfunction() \ No newline at end of file diff --git a/cmake/settings.cmake b/cmake/settings.cmake index d8b679ac4..155114f57 100644 --- a/cmake/settings.cmake +++ b/cmake/settings.cmake @@ -1,6 +1,6 @@ # This is only executed once; use a macro (and not a function) so that # everything defined here does not end up in a separate namespace -macro(main) +macro(mcsema_settings_main) # default build type if(WIN32) set(CMAKE_BUILD_TYPE Release) @@ -24,96 +24,14 @@ macro(main) # generate a compile commands JSON file. set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - # - # cxx-common - # - - if(DEFINED ENV{TRAILOFBITS_LIBRARIES}) - set(LIBRARY_REPOSITORY_ROOT $ENV{TRAILOFBITS_LIBRARIES} - CACHE PATH "Location of cxx-common libraries." - ) - endif() - - if(DEFINED LIBRARY_REPOSITORY_ROOT) - set(TOB_CMAKE_INCLUDE "${LIBRARY_REPOSITORY_ROOT}/cmake_modules/repository.cmake") - if(NOT EXISTS "${TOB_CMAKE_INCLUDE}") - message(FATAL_ERROR "The library repository could not be found!") - endif() - - include("${TOB_CMAKE_INCLUDE}") - - else() - message(STATUS "Using system libraries") - endif() - # # compiler and linker flags # # Globally set the required C++ standard set(CMAKE_CXX_STANDARD 17) - set(CMAKE_CXX_EXTENSIONS OFF) - if(WIN32) - # warnings and compiler settings - set(GLOBAL_CXXFLAGS - /MD /nologo /W3 /EHsc /wd4141 /wd4146 /wd4180 /wd4244 - /wd4258 /wd4267 /wd4291 /wd4345 /wd4351 /wd4355 /wd4456 - /wd4457 /wd4458 /wd4459 /wd4503 /wd4624 /wd4722 /wd4800 - /wd4100 /wd4127 /wd4512 /wd4505 /wd4610 /wd4510 /wd4702 - /wd4245 /wd4706 /wd4310 /wd4701 /wd4703 /wd4389 /wd4611 - /wd4805 /wd4204 /wd4577 /wd4091 /wd4592 /wd4324 - ) - - set(GLOBAL_DEFINITIONS - _CRT_SECURE_NO_DEPRECATE - _CRT_SECURE_NO_WARNINGS - _CRT_NONSTDC_NO_DEPRECATE - _CRT_NONSTDC_NO_WARNINGS - _SCL_SECURE_NO_DEPRECATE - _SCL_SECURE_NO_WARNINGS - GOOGLE_PROTOBUF_NO_RTTI - ) - - else() - # warnings and compiler settings - 
set(GLOBAL_CXXFLAGS - -Wall -Wextra -Wno-unused-parameter -Wno-c++98-compat - -Wno-unreachable-code-return -Wno-nested-anon-types - -Wno-extended-offsetof - -Wno-variadic-macros -Wno-return-type-c-linkage - -Wno-c99-extensions -Wno-ignored-attributes -Wno-unused-local-typedef - -Wno-unknown-pragmas -Wno-unknown-warning-option -fPIC - -fno-omit-frame-pointer -fvisibility-inlines-hidden -fno-exceptions - -fno-asynchronous-unwind-tables - ) - - if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "AppleClang") - set(GLOBAL_CXXFLAGS - ${GLOBAL_CXXFLAGS} - -Wgnu-alignof-expression -Wno-gnu-anonymous-struct -Wno-gnu-designator - -Wno-gnu-zero-variadic-macro-arguments -Wno-gnu-statement-expression - ) - endif() - - # debug symbols - if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND GLOBAL_CXXFLAGS - -gdwarf-2 -g3 - ) - endif() - - # optimization flags and definitions - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND GLOBAL_CXXFLAGS -O0) - list(APPEND PROJECT_DEFINITIONS "DEBUG") - else() - list(APPEND GLOBAL_CXXFLAGS -O3) - list(APPEND PROJECT_DEFINITIONS "NDEBUG") - endif() - endif() - if(UNIX) if(APPLE) set(PLATFORM_NAME "macos") @@ -132,5 +50,5 @@ macro(main) endmacro() if(NOT DEFINED SETTINGS_CMAKE_) - main() + mcsema_settings_main() endif() diff --git a/mcsema/BC/Function.cpp b/mcsema/BC/Function.cpp index ac19a6962..f3881e457 100644 --- a/mcsema/BC/Function.cpp +++ b/mcsema/BC/Function.cpp @@ -106,12 +106,12 @@ namespace { static llvm::Value *LoadMemoryPointer(const TranslationContext &ctx, llvm::BasicBlock *block) { - return ctx.lifter->LoadRegValue(block, "MEMORY"); + return ctx.lifter->LoadRegValue(block, ctx.state_ptr, "MEMORY"); } static llvm::Value *LoadMemoryPointerRef(const TranslationContext &ctx, llvm::BasicBlock *block) { - return ctx.lifter->LoadRegAddress(block, "MEMORY"); + return ctx.lifter->LoadRegAddress(block, ctx.state_ptr, "MEMORY"); } static llvm::Value 
*LoadStatePointer(const TranslationContext &, @@ -121,22 +121,22 @@ static llvm::Value *LoadStatePointer(const TranslationContext &, static llvm::Value *LoadProgramCounter(const TranslationContext &ctx, llvm::BasicBlock *block) { - return ctx.lifter->LoadRegValue(block, "PC"); + return ctx.lifter->LoadRegValue(block, ctx.state_ptr, "PC"); } static llvm::Value *LoadProgramCounterRef(const TranslationContext &ctx, llvm::BasicBlock *block) { - return ctx.lifter->LoadRegAddress(block, "PC"); + return ctx.lifter->LoadRegAddress(block, ctx.state_ptr, "PC"); } static llvm::Value *LoadNextProgramCounter(const TranslationContext &ctx, llvm::BasicBlock *block) { - return ctx.lifter->LoadRegValue(block, "NEXT_PC"); + return ctx.lifter->LoadRegValue(block, ctx.state_ptr, "NEXT_PC"); } static llvm::Value *LoadNextProgramCounterRef(const TranslationContext &ctx, llvm::BasicBlock *block) { - return ctx.lifter->LoadRegAddress(block, "NEXT_PC"); + return ctx.lifter->LoadRegAddress(block, ctx.state_ptr, "NEXT_PC"); } // Get the register tracer. This is useful when debugging, where the runtime @@ -731,6 +731,7 @@ static void LiftIndirectJump(TranslationContext &ctx, llvm::BasicBlock *block, auto fallback = DevirtualizeIndirectFlow(ctx, exit_point); std::unordered_map block_map; + if (ctx.cfg_block) { for (auto target_ea : ctx.cfg_block->successor_eas) { block_map.emplace(target_ea, GetOrCreateBlock(ctx, target_ea, true)); @@ -1088,7 +1089,8 @@ void SaveAndRestoreFunctionPreservedRegs(TranslationContext &ctx, // However, if after optimization the two parameters don't match, then // we need to preserve the restore, and we'll replace all uses of // `%restore_val` with `%reg`. 
- const auto reg_ptr = ctx.lifter->LoadRegAddress(entry_block, reg_name); + const auto reg_ptr = ctx.lifter->LoadRegAddress( + entry_block, ctx.state_ptr, reg_name); const auto reg = ir.CreateLoad(reg_ptr); const auto reg_latest = restore_ir.CreateLoad(reg_ptr); llvm::Value *restorer_args[] = {reg, reg_latest}; @@ -1121,7 +1123,8 @@ static void LiftSavedRegs(TranslationContext &ctx, llvm::BasicBlock *block) { ctx.cfg_module->ForEachInstructionPreservedRegister( ctx.inst.pc, [=, &ir, &ctx](const std::string ®_name) { if (const auto reg = gArch->RegisterByName(reg_name); reg) { - const auto reg_ptr = ctx.lifter->LoadRegAddress(block, reg_name); + const auto reg_ptr = ctx.lifter->LoadRegAddress( + block, ctx.state_ptr, reg_name); const auto reg_val = ir.CreateLoad(reg_ptr); ctx.preserved_regs.emplace_back(reg_ptr, reg_val); } @@ -1171,7 +1174,8 @@ static void LiftKilledRegs(TranslationContext &ctx, llvm::BasicBlock *block) { llvm::IRBuilder<> ir(block); ctx.cfg_module->ForEachInstructionKilledRegister( ctx.inst.pc, [=, &ir, &ctx](const std::string ®_name) { - const auto reg_ptr = ctx.lifter->LoadRegAddress(block, reg_name); + const auto reg_ptr = ctx.lifter->LoadRegAddress( + block, ctx.state_ptr, reg_name); if (!reg_ptr) { return; } @@ -1462,6 +1466,7 @@ static llvm::Function *LiftFunction(const NativeModule *cfg_module, ctx.cfg_block = nullptr; ctx.cfg_inst = nullptr; ctx.lifted_func = lifted_func; + ctx.state_ptr = remill::NthArgument(lifted_func, remill::kStatePointerArgNum); std::unordered_set referenced_blocks; referenced_blocks.insert(cfg_func->ea); @@ -1626,7 +1631,7 @@ static void InlineCalls(llvm::Function &func) { // those references. 
void DeclareLiftedFunctions(const NativeModule *cfg_module) { - for (const auto [ea, cfg_func] : cfg_module->ea_to_func) { + for (auto [ea, cfg_func] : cfg_module->ea_to_func) { (void) ea; if (cfg_func->is_external) { diff --git a/mcsema/BC/Instruction.cpp b/mcsema/BC/Instruction.cpp index 412ad5164..51948627a 100644 --- a/mcsema/BC/Instruction.cpp +++ b/mcsema/BC/Instruction.cpp @@ -52,6 +52,7 @@ InstructionLifter::InstructionLifter(const remill::IntrinsicTable *intrinsics_, // Lift a single instruction into a basic block. remill::LiftStatus InstructionLifter::LiftIntoBlock(remill::Instruction &inst, llvm::BasicBlock *block_, + llvm::Value *state_ptr, bool is_delayed) { inst_ptr = &inst; @@ -71,8 +72,8 @@ remill::LiftStatus InstructionLifter::LiftIntoBlock(remill::Instruction &inst, disp_ref_used = false; imm_ref_used = false; - auto status = - this->remill::InstructionLifter::LiftIntoBlock(inst, block, is_delayed); + auto status = this->remill::InstructionLifter::LiftIntoBlock( + inst, block, state_ptr, is_delayed); // If we have semantics for the instruction, then make sure that we were // able to match cross-reference information to the instruction's operands. @@ -99,29 +100,6 @@ remill::LiftStatus InstructionLifter::LiftIntoBlock(remill::Instruction &inst, return status; } -//// Returns `true` if a given cross-reference is self-referential. That is, -//// we'll have something like `jmp cs:EnterCriticalSection`, which references -//// `EnterCriticalSection` in the `.idata` section of a PE file. But this -//// location is our only "place" for the external `EnterCriticalSection`, so -//// we point it back at itself. 
-//static bool IsSelfReferential(const NativeXref *cfg_xref) { -// if (!cfg_xref->target_segment) { -// return false; -// } -// -// auto it = cfg_xref->target_segment->entries.find(cfg_xref->target_ea); -// if (it == cfg_xref->target_segment->entries.end()) { -// return false; -// } -// -// const auto &entry = it->second; -// if (!entry.xref) { -// return false; -// } -// -// return entry.xref->target_ea == entry.ea; -//} - llvm::Value * InstructionLifter::GetAddress(const NativeInstructionXref *cfg_xref) { if (!cfg_xref) { @@ -173,17 +151,18 @@ llvm::Value *InstructionLifter::LiftImmediateOperand(remill::Instruction &inst, // Lift an indirect memory operand to a value. llvm::Value *InstructionLifter::LiftAddressOperand(remill::Instruction &inst, llvm::BasicBlock *block, + llvm::Value *state_ptr, llvm::Argument *arg, remill::Operand &op) { auto &mem = op.addr; - // A higher layer will resolve any code refs; this is a static address and - // we want to preserve it in the register state structure. - if (mem.IsControlFlowTarget()) { - return this->remill::InstructionLifter::LiftAddressOperand(inst, block, arg, - op); - } +// // A higher layer will resolve any code refs; this is a static address and +// // we want to preserve it in the register state structure. 
+// if (mem.IsControlFlowTarget()) { +// return this->remill::InstructionLifter::LiftAddressOperand( +// inst, block, state_ptr, arg, op); +// } if ((mem.base_reg.name.empty() && mem.index_reg.name.empty()) || (mem.base_reg.name == "PC" && mem.index_reg.name.empty())) { @@ -218,7 +197,7 @@ llvm::Value *InstructionLifter::LiftAddressOperand(remill::Instruction &inst, disp_ref_used = true; mem.displacement = 0; auto dynamic_addr = this->remill::InstructionLifter::LiftAddressOperand( - inst, block, arg, op); + inst, block, state_ptr, arg, op); llvm::IRBuilder<> ir(block); return ir.CreateAdd(dynamic_addr, disp_ref); @@ -232,14 +211,14 @@ llvm::Value *InstructionLifter::LiftAddressOperand(remill::Instruction &inst, mem_ref_used = true; mem.displacement = 0; auto dynamic_addr = this->remill::InstructionLifter::LiftAddressOperand( - inst, block, arg, op); + inst, block, state_ptr, arg, op); llvm::IRBuilder<> ir(block); return ir.CreateAdd(dynamic_addr, mem_ref); } } - return this->remill::InstructionLifter::LiftAddressOperand(inst, block, arg, - op); + return this->remill::InstructionLifter::LiftAddressOperand( + inst, block, state_ptr, arg, op); } } // namespace mcsema diff --git a/mcsema/BC/Instruction.h b/mcsema/BC/Instruction.h index a1e89d6ce..c6de88096 100644 --- a/mcsema/BC/Instruction.h +++ b/mcsema/BC/Instruction.h @@ -49,6 +49,7 @@ class InstructionLifter : public remill::InstructionLifter { // Lift a single instruction into a basic block. remill::LiftStatus LiftIntoBlock(remill::Instruction &inst, llvm::BasicBlock *block, + llvm::Value *state_ptr, bool is_delayed) override; protected: @@ -59,7 +60,9 @@ class InstructionLifter : public remill::InstructionLifter { // Lift an indirect memory operand to a value. 
llvm::Value *LiftAddressOperand(remill::Instruction &inst, - llvm::BasicBlock *block, llvm::Argument *arg, + llvm::BasicBlock *block, + llvm::Value *state_ptr, + llvm::Argument *arg, remill::Operand &mem) override; private: diff --git a/mcsema/BC/Lift.cpp b/mcsema/BC/Lift.cpp index 0e2495e97..66f7d835b 100644 --- a/mcsema/BC/Lift.cpp +++ b/mcsema/BC/Lift.cpp @@ -244,6 +244,8 @@ bool LiftCodeIntoModule(const NativeModule *cfg_module) { legacy::PropagateInstAnnotations(); } + MergeSegments(cfg_module); + return true; } diff --git a/mcsema/BC/Lift.h b/mcsema/BC/Lift.h index 48381f894..7526e3389 100644 --- a/mcsema/BC/Lift.h +++ b/mcsema/BC/Lift.h @@ -30,6 +30,7 @@ class InstructionLifter; namespace llvm { class AllocaInst; +class Argument; class BasicBlock; class Function; class Value; @@ -65,6 +66,7 @@ struct TranslationContext { llvm::BasicBlock *entry_block{nullptr}; llvm::Value *stack_ptr_var{nullptr}; llvm::Value *frame_ptr_var{nullptr}; + llvm::Argument *state_ptr{nullptr}; }; bool LiftCodeIntoModule(const NativeModule *cfg_module); diff --git a/mcsema/BC/Optimize.cpp b/mcsema/BC/Optimize.cpp index 7fa91e2c8..7bddb787e 100644 --- a/mcsema/BC/Optimize.cpp +++ b/mcsema/BC/Optimize.cpp @@ -69,7 +69,7 @@ DEFINE_bool(check_for_lowmem_xrefs, false, DEFINE_bool(volatile_memops, false, "Mark all lowered loads/stores as volatile"); -DEFINE_bool(local_state_pointer, true, +DEFINE_bool(local_state_pointer, false, "Use the state pointer passed by argument to all lifted functions." 
"Set local_state_pointer to false to disable it."); diff --git a/mcsema/BC/Segment.cpp b/mcsema/BC/Segment.cpp index f36f7756a..8501b178d 100644 --- a/mcsema/BC/Segment.cpp +++ b/mcsema/BC/Segment.cpp @@ -63,8 +63,14 @@ DEFINE_bool(force_embed_data_refs, false, "when using McSema-produced bitcode in KLEE, as it avoids doing " "lazy cross-reference initialization."); +DEFINE_bool(name_lifted_sections, false, + "Put lifted sections into sections in the target in a way that is " + "reflective of their original addresses."); + DECLARE_bool(disable_aliases); +DEFINE_bool(merge_segments, false, "Should all lifted segments be merged?"); + namespace mcsema { namespace { @@ -558,7 +564,7 @@ llvm::Constant *NativeSegment::Pointer(void) const { llvm::Type::getIntNTy(*gContext, static_cast(size * 8u)); break; - // An array of bytes + // An array of bytes default: { auto byte_type = llvm::Type::getInt8Ty(*gContext); var_type = llvm::ArrayType::get(byte_type, static_cast(size)); @@ -581,12 +587,19 @@ llvm::Constant *NativeSegment::Pointer(void) const { } if (ea) { - const auto alignment = 1u << __builtin_ctzl(ea - padding); + if (const auto alignment = 1u << __builtin_ctzl(ea - padding); alignment) { #if LLVM_VERSION_NUMBER >= LLVM_VERSION(10, 0) - lifted_var->setAlignment(llvm::MaybeAlign(alignment)); + lifted_var->setAlignment(llvm::MaybeAlign(alignment)); #else - lifted_var->setAlignment(alignment); + lifted_var->setAlignment(alignment); #endif + } + } + + if (!is_external && FLAGS_name_lifted_sections && !FLAGS_merge_segments) { + std::stringstream ss; + ss << ".section_" << std::hex << ea; + lifted_var->setSection(ss.str()); } return lifted_var; @@ -706,4 +719,102 @@ void CallInitFiniCode(const NativeModule *cfg_module) { } } +// Merge all segments into one contiguous mega segment. 
+void MergeSegments(const NativeModule *cfg_module) { + if (!FLAGS_merge_segments) { + return; + } + + using SegPair = std::pair; + std::vector segs; + + for (auto [ea, seg] : cfg_module->ea_to_seg) { + seg = seg->Get(); + if (auto var = llvm::dyn_cast(seg->Pointer()); + var && var->hasInitializer() && !seg->is_thread_local) { + segs.emplace_back(seg, var); + (void) ea; + } + } + + + std::sort(segs.begin(), segs.end(), [](SegPair a, SegPair b) { + return a.first->ea < b.first->ea; + }); + + if (segs.empty()) { + return; + } + + const auto &dl = gModule->getDataLayout(); + llvm::Type * const u8 = llvm::Type::getInt8Ty(*gContext); + llvm::Type * const u32 = llvm::Type::getInt32Ty(*gContext); + + auto start_ea = segs.front().first->ea & ~4095ull; + const auto min_ea = start_ea; + + std::vector new_types; + std::vector new_vals; + std::unordered_map indices; + + const NativeSegment *prev_cfg_seg = nullptr; + for (auto [cfg_seg, seg_var] : segs) { + if (cfg_seg == prev_cfg_seg) { + continue; + } + + LOG(INFO) + << "Merging segment " << cfg_seg->name << " at " << std::hex + << cfg_seg->ea << " of size " << std::dec << cfg_seg->size; + + const auto ea = cfg_seg->ea; + if (start_ea < ea) { + const auto pad_type = llvm::ArrayType::get(u8, ea - start_ea); + new_types.push_back(pad_type); + new_vals.push_back(llvm::ConstantAggregateZero::get(pad_type)); + start_ea = ea; + + } else if (start_ea > ea) { + LOG(FATAL) + << "Segment " << cfg_seg->name << " starting at " << std::hex << ea + << " overlaps with previous segment " << prev_cfg_seg->name + << " starting at " << prev_cfg_seg->ea << " and ending at " + << start_ea << std::dec; + } + + const auto init = seg_var->getInitializer(); + const auto type = init->getType(); + indices.emplace(seg_var, static_cast(new_types.size())); + new_vals.push_back(init); + new_types.push_back(type); + start_ea += dl.getTypeStoreSize(type); + prev_cfg_seg = cfg_seg; + } + + llvm::StructType * const new_type = llvm::StructType::get( + 
*gContext, new_types, true); + + llvm::Constant * const new_val = llvm::ConstantStruct::get(new_type, new_vals); + + auto new_var = new llvm::GlobalVariable( + *gModule, new_type, false, llvm::GlobalValue::InternalLinkage, new_val, + "__mcsema_all_segments"); + + if (FLAGS_name_lifted_sections) { + std::stringstream ss; + ss << ".section_" << std::hex << min_ea; + new_var->setSection(ss.str()); + } + + const auto const_zero = llvm::Constant::getNullValue(u32); + for (auto [var, index] : indices) { + auto const_index = llvm::ConstantInt::get(u32, index, false); + llvm::Constant *const_indices[] = {const_zero, const_index}; + const auto ptr = llvm::ConstantExpr::getInBoundsGetElementPtr( + new_type, new_var, const_indices); + var->replaceAllUsesWith(ptr); + var->eraseFromParent(); + } +} + } // namespace mcsema diff --git a/mcsema/BC/Segment.h b/mcsema/BC/Segment.h index abd899b24..9604fae3d 100644 --- a/mcsema/BC/Segment.h +++ b/mcsema/BC/Segment.h @@ -29,5 +29,6 @@ llvm::Function *GetOrCreateMcSemaInitializer(void); //void DeclareDataSegments(const NativeModule *cfg_module); void DefineDataSegments(const NativeModule *cfg_module); void CallInitFiniCode(const NativeModule *cfg_module); +void MergeSegments(const NativeModule *cfg_module); } // namespace mcsema diff --git a/mcsema/BC/Util.cpp b/mcsema/BC/Util.cpp index e2e9d950c..b7097aca1 100644 --- a/mcsema/BC/Util.cpp +++ b/mcsema/BC/Util.cpp @@ -88,12 +88,8 @@ llvm::Constant *LiftXrefInData(const NativeSegment *cfg_seg, uint64_t ea, auto [offset, type] = remill::BuildIndexes(dl, seg_type, 0, goal_offset, gep_index_list); - ptr = seg_var; - if (offset) { - (void) type; - ptr = llvm::ConstantExpr::getInBoundsGetElementPtr(seg_type, seg_var, - gep_index_list); - } + ptr = llvm::ConstantExpr::getInBoundsGetElementPtr(seg_type, seg_var, + gep_index_list); if (offset < goal_offset) { auto i8_type = llvm::Type::getInt8Ty(*gContext); diff --git a/mcsema/CFG/CFG.cpp b/mcsema/CFG/CFG.cpp index a7e5ac624..f73bffa83 100644 
--- a/mcsema/CFG/CFG.cpp +++ b/mcsema/CFG/CFG.cpp @@ -47,6 +47,7 @@ #include "mcsema/CFG/CFG.h" DECLARE_bool(explicit_args); +DECLARE_bool(merge_segments); DEFINE_bool( disable_adjacent_segment_merging, false, @@ -1162,15 +1163,15 @@ NativeModule *ReadProtoBuf(const std::string &file_name, const auto inst_ea = static_cast(cfg_inst.ea()); + // Don't add it if we've already got it. + block->last_inst_ea = std::max(block->last_inst_ea, inst_ea); + // If there is no possibility of interesting metadata, then we don't // actually need a `NativeInstruction`. if (!cfg_inst.lp_ea() && !cfg_inst.xrefs_size()) { continue; } - // Don't add it if we've already got it. - block->last_inst_ea = std::max(block->last_inst_ea, inst_ea); - auto &inst = module->ea_to_inst[inst_ea]; if (!inst) { @@ -1386,7 +1387,7 @@ NativeModule *ReadProtoBuf(const std::string &file_name, } for (auto &seg : module->segments) { - if (!seg->is_external) { + if (!seg->is_external && !FLAGS_merge_segments) { seg->padding = seg->ea & 4095u; } diff --git a/mcsema/OS/Linux/X86/CMakeLists.txt b/mcsema/OS/Linux/X86/CMakeLists.txt index a590cb6a6..7d44c72fe 100644 --- a/mcsema/OS/Linux/X86/CMakeLists.txt +++ b/mcsema/OS/Linux/X86/CMakeLists.txt @@ -56,28 +56,28 @@ endfunction() add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libc.c ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libc.h DEPENDS ${source_file} - COMMAND env TRAILOFBITS_LIBRARIES=${LIBRARY_REPOSITORY_ROOT} python3 ${CMAKE_CURRENT_SOURCE_DIR}/../generate_abi_wrapper.py + COMMAND env TRAILOFBITS_LIBRARIES=${CXX_COMMON_REPOSITORY_ROOT} python3 ${CMAKE_CURRENT_SOURCE_DIR}/../generate_abi_wrapper.py --arch "amd64" --type "c" --input ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libc.pph --output ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libc.c ) #add_custom_command( # OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libc_x86.c ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libc.h # DEPENDS ${source_file} -# COMMAND env TRAILOFBITS_LIBRARIES=${LIBRARY_REPOSITORY_ROOT} python3 
${CMAKE_CURRENT_SOURCE_DIR}/../generate_abi_wrapper.py +# COMMAND env TRAILOFBITS_LIBRARIES=${CXX_COMMON_REPOSITORY_ROOT} python3 ${CMAKE_CURRENT_SOURCE_DIR}/../generate_abi_wrapper.py # --arch "x86" --type "c" --input ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libc.pph --output ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libc_x86.c #) #add_custom_command( # OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libcpp_amd64.c # DEPENDS ${source_file} -# COMMAND env TRAILOFBITS_LIBRARIES=${LIBRARY_REPOSITORY_ROOT} python ${CMAKE_CURRENT_SOURCE_DIR}/../generate_abi_wrapper.py +# COMMAND env TRAILOFBITS_LIBRARIES=${CXX_COMMON_REPOSITORY_ROOT} python ${CMAKE_CURRENT_SOURCE_DIR}/../generate_abi_wrapper.py # --arch "amd64" --type "cpp" --input ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libc.pph --output ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libcpp_amd64.c #) #add_custom_command( # OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libcpp_x86.c # DEPENDS ${source_file} -# COMMAND env TRAILOFBITS_LIBRARIES=${LIBRARY_REPOSITORY_ROOT} python ${CMAKE_CURRENT_SOURCE_DIR}/../generate_abi_wrapper.py +# COMMAND env TRAILOFBITS_LIBRARIES=${CXX_COMMON_REPOSITORY_ROOT} python ${CMAKE_CURRENT_SOURCE_DIR}/../generate_abi_wrapper.py # --arch "x86" --type "cpp" --input ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libc.pph --output ${CMAKE_CURRENT_SOURCE_DIR}/ABI_libcpp_x86.c #) diff --git a/mcsema/OS/Linux/generate_abi_wrapper.py b/mcsema/OS/Linux/generate_abi_wrapper.py index 06e8af1e6..a89dfed44 100644 --- a/mcsema/OS/Linux/generate_abi_wrapper.py +++ b/mcsema/OS/Linux/generate_abi_wrapper.py @@ -186,9 +186,9 @@ def write_library_file(hfile, outfile): libc_type = 'c++' if ABI_LIBRARY_TYPE == "cpp" else 'c' pass - if libc_type is 'c': + if libc_type == 'c': write_cc_file(hfile, outfile) - elif libc_type is 'c++': + elif libc_type == 'c++': write_cxx_file(hfile, outfile) def write_header_file(file, headers): diff --git a/tests/test_suite_generator/CMakeLists.txt b/tests/test_suite_generator/CMakeLists.txt index a1a58e991..5f4630525 100644 --- 
a/tests/test_suite_generator/CMakeLists.txt +++ b/tests/test_suite_generator/CMakeLists.txt @@ -358,7 +358,7 @@ find_program(MCSEMALIFT_PATH "mcsema-lift${LLVM_VERSION_SUFFIX}") if ("${MCSEMALIFT_PATH}" STREQUAL "MCSEMALIFT_PATH-NOTFOUND") message(FATAL_ERROR "Failed to locate the mcsema-lift${LLVM_VERSION_SUFFIX} executable!") endif () -set(MCSEMALIFT_EXE "${MCSEMALIFT_PATH}/mcsema-lift${LLVM_VERSION_SUFFIX}") +set(MCSEMALIFT_EXE "${MCSEMALIFT_PATH}") find_program(IDAT64_PATH "idat64") if ("${IDAT64_PATH}" STREQUAL "IDAT64_PATH-NOTFOUND") @@ -372,11 +372,11 @@ if ("${IDAT64_PATH}" STREQUAL "IDAT64_PATH-NOTFOUND") endif() endif () -find_program(MCSEMADISASS_PATH "mcsema-disass") +find_program(MCSEMADISASS_PATH "mcsema-disass") if ("${MCSEMADISASS_PATH}" STREQUAL "MCSEMADISASS_PATH-NOTFOUND") message(FATAL_ERROR "Failed to locate the mcsema-disass executable!") endif () -set(MCSEMADISASS_EXE "${MCSEMADISASS_PATH}/mcsema-disass") +set(MCSEMADISASS_EXE "${MCSEMADISASS_PATH}") message("Toolset") message(" > mcsema-lift: ${MCSEMALIFT_EXE}") @@ -391,8 +391,12 @@ message(" > mcsema-disass: ${MCSEMADISASS_EXE}") set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/test_suite") if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)|(x86_64)") - set(ARCHITECTURE_LIST x86 amd64) - + if(APPLE) + set(ARCHITECTURE_LIST amd64) + else() + set(ARCHITECTURE_LIST x86 amd64) + endif() + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "ARM") set(ARCHITECTURE_LIST aarch64) diff --git a/tests/test_suite_generator/src/start.py b/tests/test_suite_generator/src/start.py index cf7f21a42..20f901c59 100755 --- a/tests/test_suite_generator/src/start.py +++ b/tests/test_suite_generator/src/start.py @@ -377,6 +377,8 @@ def lift_test_cfg(test_directory, toolset, test): "--arch", test.architecture(), "--os", test.platform(), "--cfg", test.cfg_path(), "--output", output_file_path, + "--explicit_args", + "--local_state_pointer", "--libc_constructor", "init", "--libc_destructor", "fini", 
"--abi_libraries", abi_libs, diff --git a/tests/var_recovery/recover_and_test.sh b/tests/var_recovery/recover_and_test.sh index 9bff4233b..fccbfda4d 100755 --- a/tests/var_recovery/recover_and_test.sh +++ b/tests/var_recovery/recover_and_test.sh @@ -138,6 +138,8 @@ function lift_binary --os linux \ --cfg ${OUT_DIR}/${in_file}.cfg \ --output ${OUT_DIR}/${in_file}.bc \ + --explicit_args \ + --local_state_pointer \ --libc_constructor __libc_csu_init \ --libc_destructor __libc_csu_fini \ --abi-libraries=${ABI_DIR}/ABI_exceptions_amd64.bc \ diff --git a/tools/mcsema_disass/__main__.py b/tools/mcsema_disass/__main__.py index a3de7c3ab..7c3ed6e3c 100755 --- a/tools/mcsema_disass/__main__.py +++ b/tools/mcsema_disass/__main__.py @@ -102,6 +102,12 @@ def main(): help="The entrypoint where disassembly should begin", required=False) + arg_parser.add_argument( + '--rebase', + help="Amount by which to rebase a binary", + required=False, + default=0) + args, command_args = arg_parser.parse_known_args() if not os.path.isfile(args.binary): @@ -174,7 +180,8 @@ def main(): "--entrypoint", args.entrypoint, "--os", args.os, "--output", args.output, - "--binary", args.binary + "--binary", args.binary, + "--rebase", str(args.rebase) ] subprocess.run(pass_args) else: diff --git a/tools/mcsema_disass/ida7/arm_util.py b/tools/mcsema_disass/ida7/arm_util.py index fa74a8b8d..21e69a118 100644 --- a/tools/mcsema_disass/ida7/arm_util.py +++ b/tools/mcsema_disass/ida7/arm_util.py @@ -76,8 +76,10 @@ def fixup_function_return_address(inst, next_ea): return next_ea +_BAD_ARM_REF_OFF = (idc.BADADDR, 0, 0) _INVALID_THUNK_ADDR = (False, idc.BADADDR) + def is_ELF_thunk_by_structure(ea): """Try to manually identify an ELF thunk by its structure.""" from util import * @@ -109,13 +111,18 @@ def is_ELF_thunk_by_structure(ea): _ARM_REF_CANDIDATES = set() def _get_arm_ref_candidate(mask, op_val, op_str, all_refs): + from util import * global _BAD_ARM_REF_OFF, _ARM_REF_CANDIDATES try: op_name = 
op_str.split("@")[0][1:] # `#asc_400E5C@PAGE` -> `asc_400E5C`. + op_name = op_name.split("#")[-1] + op_name = op_name.split("+")[0] + op_name = op_name.split("(")[-1] ref_ea = idc.get_name_ea_simple(op_name) - if (ref_ea & mask) == op_val: - return ref_ea, mask, 0 + + #if (ref_ea & mask) == op_val: + return ref_ea, mask, 0 except: pass @@ -149,10 +156,10 @@ def try_get_ref_addr(inst, op, op_val, all_refs, _NOT_A_REF): from util import * - if op.type not in (idc.o_imm, idc.o_displ): + #if op.type not in (idc.o_imm, idc.o_displ): # This is a reference type that the other ref tracking code # can handle, return defaults - return op_val, 0, 0 + # return op_val, 0, 0 op_str = idc.print_operand(inst.ea, op.n) diff --git a/tools/mcsema_disass/ida7/disass.py b/tools/mcsema_disass/ida7/disass.py index c8cc12baa..a3bf6e63b 100644 --- a/tools/mcsema_disass/ida7/disass.py +++ b/tools/mcsema_disass/ida7/disass.py @@ -55,6 +55,9 @@ def execute(args, command_args): script_cmd.append(args.arch) script_cmd.append("--os") script_cmd.append(args.os) + if args.rebase: + script_cmd.append("--rebase") + script_cmd.append(str(args.rebase)) if args.entrypoint is not None and len(args.entrypoint): script_cmd.append("--entrypoint") script_cmd.append(args.entrypoint) diff --git a/tools/mcsema_disass/ida7/get_cfg.py b/tools/mcsema_disass/ida7/get_cfg.py index fe1a8ecb4..e8e313c3f 100644 --- a/tools/mcsema_disass/ida7/get_cfg.py +++ b/tools/mcsema_disass/ida7/get_cfg.py @@ -102,7 +102,7 @@ # e.g. `@@QEAU_..`, `@@AEAV..`, though these are likely for name mangling. EXTERNAL_NAMES = ("@@GLIBC_", "@@GLIBCXX_", "@@CXXABI_", "@@GCC_") -_NOT_ELF_BEGIN_EAS = (0xffffffffL, 0xffffffffffffffffL) +_NOT_ELF_BEGIN_EAS = (0xffffffff, 0xffffffffffffffff) # Returns `True` if this is an ELF binary (as opposed to an ELF object file). 
def is_linked_ELF_program(): @@ -1716,6 +1716,13 @@ def recover_module(entrypoint, gvar_infile = None): default=None, required=False) + parser.add_argument( + "--rebase", + help="Amount by which to rebase a binary", + default=0, + type=int, + required=False) + args = parser.parse_args(args=idc.ARGV[1:]) if args.log_file != os.devnull: @@ -1765,11 +1772,19 @@ def recover_module(entrypoint, gvar_infile = None): # other sane defaults. idc.set_inf_attr(idc.INF_AF, 0xdfff) idc.set_inf_attr(idc.INF_AF2, 0xfffd) - + # Ensure that IDA is done processing DEBUG("Using Batch mode.") idaapi.auto_wait() + # Shift the program image in memory. + if args.rebase: + rebase_flags = idc.MSF_FIXONCE + if idc.MOVE_SEGM_OK != idc.rebase_program(args.rebase, rebase_flags): + DEBUG("ERROR: Failed to rebase program with delta {:08x}".format(args.rebase)) + + idaapi.auto_wait() + ANVILL_PROGRAM = anvill.get_program() DEBUG("Starting analysis") diff --git a/tools/mcsema_disass/ida7/refs.py b/tools/mcsema_disass/ida7/refs.py index 6b7528ab4..df5638724 100644 --- a/tools/mcsema_disass/ida7/refs.py +++ b/tools/mcsema_disass/ida7/refs.py @@ -175,7 +175,6 @@ def _nearest_head(ea, bounds): _HAS_NO_REFS = set() _NO_REFS = tuple() _ENABLE_CACHING = False -_BAD_ARM_REF_OFF = (idc.BADADDR, 0) _NOT_A_REF = set() # Remove a reference from `from_ea` to `to_ea`. @@ -274,7 +273,7 @@ def _get_ref_candidate(inst, op, all_refs, binary_is_pie): if is_invalid_ea(addr_val) \ or idc.get_segm_name(idc.get_segm_start(addr_val)) in ["LOAD"]: - + # The `addr_val` that we get might actually be a value that is relative to # a base address. For example, in IDA we might see: # @@ -349,7 +348,7 @@ def _get_ref_candidate(inst, op, all_refs, binary_is_pie): # WTF(pag): This silently kills IDA. 
# idc.add_dref(inst.ea, addr_val, idc.XREF_USER) - + return ref def memop_is_actually_displacement(inst): @@ -375,9 +374,11 @@ def enable_reference_caching(): _FIXUPS = [] +_IMM_AS_DISPLACEMENT_OPS = ("ADRP", "ADR", "SETHI") + # Get a list of references from an instruction. def get_instruction_references(arg, binary_is_pie=False): - global _ENABLE_CACHING, _NOT_A_REF, _FIXUPS + global _ENABLE_CACHING, _NOT_A_REF, _FIXUPS, _IMM_AS_DISPLACEMENT_OPS inst = arg if isinstance(arg, (int, long)): @@ -433,20 +434,20 @@ def get_instruction_references(arg, binary_is_pie=False): idaapi.del_cref(op_ea, op.value, False) continue - # If this is a PIE-mode, 64-bit binary, then most likely the immediate - # operand is not a data ref. - if seg_begin.use64() and binary_is_pie: - idaapi.del_dref(op_ea, op.value) - idaapi.del_cref(op_ea, op.value, False) - continue - # In the special case of "ADR" and "ADRP" instructions for aarch64 # IDA infers the absolute immediate value to assign as op_type, rather # than characterizing it as a displacement from PC - if idc.print_insn_mnem(inst.ea) in ["ADRP", "ADR"]: - ref.type = Reference.DISPLACEMENT + if idc.print_insn_mnem(inst.ea) in _IMM_AS_DISPLACEMENT_OPS: + ref.type = Reference.DISPLACEMENT else: - ref.type = Reference.IMMEDIATE + ref.type = Reference.IMMEDIATE + + # If this is a PIE-mode, 64-bit binary, then most likely the immediate + # operand is not a data ref. 
+ if seg_begin.use64() and binary_is_pie: + idaapi.del_dref(op_ea, op.value) + idaapi.del_cref(op_ea, op.value, False) + continue ref.symbol = get_symbol_name(op_ea, ref.ea) diff --git a/tools/setup.py b/tools/setup.py index 8192522af..690d3b466 100755 --- a/tools/setup.py +++ b/tools/setup.py @@ -22,11 +22,11 @@ setup(name="mcsema-disass", description="Binary program disassembler for McSema.", - version="2.0", + version="3.0", url="https://github.com/lifting-bits/mcsema", author="Trail of Bits", author_email="mcsema@trailofbits.com", - license='Apache 2.0', + license='AGPLv3', packages=['mcsema_disass', 'mcsema_disass.ida7', 'mcsema_disass.defs'], install_requires=['protobuf==3.2.0', 'python-magic'], package_data={ diff --git a/tools/setup_launcher.sh b/tools/setup_launcher.sh index 48b311480..5345d9d49 100755 --- a/tools/setup_launcher.sh +++ b/tools/setup_launcher.sh @@ -74,6 +74,7 @@ main() { return 1 else printf " i Successfully installed\n" + rm "${temp_file}" fi return 0