diff --git a/crucible-jvm/crucible-jvm.cabal b/crucible-jvm/crucible-jvm.cabal index 933c8c6ec..450473c37 100644 --- a/crucible-jvm/crucible-jvm.cabal +++ b/crucible-jvm/crucible-jvm.cabal @@ -53,10 +53,14 @@ library base >= 4 && < 5, aig, array, + base16-bytestring >= 0.1 && < 1.1, bv-sized >= 1.0.0, + bytestring, containers, crucible, - directory, + cryptohash-sha1 >= 0.11 && < 0.12, + directory >= 1.2.5, + extra >= 1.6.4 && < 1.8, filepath, haskeline >= 0.7, jvm-parser >= 0.3, @@ -64,6 +68,7 @@ library mtl >= 2.1, parameterized-utils >= 1.0 && < 2.2, pretty >= 1.1, + process, split >= 0.2, text, transformers >= 0.3, @@ -85,6 +90,8 @@ library Lang.Crucible.JVM.ClassRefs Lang.Crucible.JVM.Overrides Lang.JVM.Codebase + Lang.JVM.JavaTools + Lang.JVM.ProcessUtils other-modules: ghc-options: -Wall -Wcompat -Werror=incomplete-patterns -Werror=missing-methods -Werror=overlapping-patterns diff --git a/crucible-jvm/src/Lang/JVM/Codebase.hs b/crucible-jvm/src/Lang/JVM/Codebase.hs index 5890f888a..823c41ac6 100644 --- a/crucible-jvm/src/Lang/JVM/Codebase.hs +++ b/crucible-jvm/src/Lang/JVM/Codebase.hs @@ -1,6 +1,6 @@ {- | Module : Lang.JVM.Codebase -Description : Codebase manipulation, copied from jvm-verifier package +Description : Java Codebase manipulation License : BSD3 Stability : stable Point-of-contact : jhendrix @@ -27,17 +27,119 @@ module Lang.JVM.Codebase ) where import Control.Monad +import Control.Monad.Trans.Maybe (MaybeT(..)) +import Crypto.Hash.SHA1 (hash) +import qualified Data.ByteString.Base16 as B16 +import qualified Data.ByteString.Char8 as B +import Data.Foldable (Foldable(..)) import qualified Data.Map as M import Data.IORef import Data.List (isPrefixOf) import Data.Maybe -import System.Directory (doesFileExist) +import System.Directory import System.FilePath (pathSeparator, (<.>), ()) +import Lang.JVM.JavaTools +import Lang.JVM.ProcessUtils + import Language.JVM.Common import Language.JVM.JarReader import Language.JVM.Parser +{- Note [Loading classes from JIMAGE files] + +The Java standard library is packaged differently depending on which version of +the JDK is being used: + +* JDK 8 and earlier ship the standard library in /lib/rt.jar, so + loading the standard library's .class files is simply a matter of parsing + rt.jar. The `jvm-parser` library is used to parse rt.jar like any other JAR + file. +* JDK 9 and later abandon rt.jar and instead distribute the standard library as + a JIMAGE file. (For more on the JIMAGE file format and how it compares to + JAR, see https://stackoverflow.com/a/64202720.) In particular, the standard + library is now located in a JIMAGE file at /lib/modules. + +We need access to the standard library's .class files in order to verify even +the simplest Java programs, so figuring out how to load classes from JIMAGE +files is a requirement for using JDK 9 or later with crucible-jvm. Unlike JAR, +however, the JIMAGE file format is internal to the JDK and subject to change in +the future. As a result, parsing JIMAGE files directly, like `jvm-parser` does +with JAR files, is likely not a good idea. + +If parsing JIMAGE files directly isn't a viable option, what can be done? As +far as I can tell, there are only two reasonable ways of extracting .class +files from JIMAGE files (outside of using Java itself): + +* Use libjimage, a C++ library bundled with the JDK, to load classes. This is + what the JDK itself uses internally, and as a result, it's reasonably space- + and time-efficient to load a single class from a JIMAGE file. Unfortunately, + this is a non-starter for crucible-jvm due to potential linking issues that + would arise when combining GHC with C++ code. + See https://github.com/GaloisInc/crucible/issues/621. + +* Use the jimage utility, shipped alongside java in /bin, to extract + the contents of JIMAGE files. The `jimage extract` command can either be used + to extract everything in a JIMAGE file all at once, or it can be used to + extract one class at a time, like in this example: + + jimage extract --include regex:.*/java/lang/Class.class --dir + +Since libjimage is an option, that only leaves jimage. Unfortunately, jimage is +not without its flaws: + +* jimage is /slow/. It takes about about 1.5 seconds for jimage to extract the + standard `modules` file on a reasonably performant machine. Having an extra + 1.5 seconds of lag every time that crucible-jvm is invoked doesn't sound + appealing. + +* What's worse, jimage is still unreasonably slow even if you only use it to + extract a single class from a JIMAGE file. It takes about 0.25 seconds to + extract java/lang/Class.class from the standard `modules`, which leads me to + suspect that jimage is doing a linear scan through every file in `modules`. + In any case, this is about 14× slower than loading a class file from a JAR, + and if crucible-jvm spends 0.25 seconds every time it loads any class from + `modules`, this would add up very quickly. + +What can we do about jimage's performance issues? One appealing alternative is +to use jlink, yet another utility that the JDK ships. If you give jlink a list +of Java modules, it will produce a minimal runtime environment that includes a +JIMAGE file that only contains the classes needed to support those modules, but +nothing else. In theory, this could cut down on the time it takes to extract +classes from a JIMAGE file, as the standard library's `modules` file is ~136 MB +when compressed (~493 MB when uncompressed), and most Java classes will only +use a small portion of that. + +Unfortunately, jlink is even slower than jimage. It takes jlink about 3 seconds +to create a runtime environment that only contains the `java.base` module, +which cancels out any time saved when extracting the JIMAGE file afterwards. + +Having exhausted all other possibilities, we have concluded that we simply +can't make JIMAGE extraction fast in the general case. We /can/, however, +amortize the cost of using JIMAGE files by caching the extracted contents. +This is exactly what crucible-jvm does. Any time that crucible-jvm tries to +load a class (e.g., java/lang/Class.class), it first checks to see if it is in +a known cache directory +(e.g., ~/.cache/crucible-jvm//java.base/java/lang/Class.class), +and if so, loads it like it would any other .class on the classpath. If not, +it will then consult the JAR path and the classpath. If it is not located by +that point, then crucible-jvm will attempt to extract it from the JIMAGE file, +caching the result for subsequent lookups. Importantly, we perform the JIMAGE +extraction step last, as it is far more time-consuming that trying to load +classes from the JAR path or classpath. + +There's still possibly some room for further optimization here. As mentioned +above, extracting one class at a time from a JIMAGE file takes about 0.25 +seconds, while extracting the entire JIMAGE file takes about 1.5 seconds. +Depending on how many classes a program uses, it may take less time overall +to just extract-and-cache everything from a JIMAGE file at once. The downside, +however, is that the cache would require much more space. A typical JDK +`modules` file takes up ~493 MB of classes when extracted, but most simple Java +programs will only use <1 MB of these classes. As a result, we opt for the +one-class-at-a-time approach, which seems better tuned for the sorts of +Java programs that crucible-jvm verifies in common cases. +-} + -- | Collection of classes loaded by JVM. data CodebaseState = CodebaseState { classMap :: M.Map ClassName Class @@ -46,10 +148,23 @@ data CodebaseState = CodebaseState { -- interfaces to list of classes that directly implement them. } +-- | Contains the path to the @jimage@ tool, the standard @modules@ JIMAGE +-- file, and the path where the extracted contents of @modules@ are cached. +-- +-- This is only used when crucible-jvm uses JDK 9 or later. +-- See Note [Loading classes from JIMAGE files]. +data JimagePaths = JimagePaths + { jimagePath :: FilePath + , standardModulesFilePath :: FilePath + , standardModulesCachePath :: FilePath + } + data Codebase = Codebase { jarReader :: JarReader -- ^ Maps class names to lazily loaded classes in JARs , classPaths :: [FilePath] + , jimagePaths :: Maybe JimagePaths + -- ^ The path to the @jimage@ tool and the paths it works on. , stateRef :: IORef CodebaseState } @@ -58,8 +173,8 @@ instance Show Codebase where -- | Loads Java classes directly beneath given path. Also loads jar indices for -- lazy class loading. -loadCodebase :: [FilePath] -> [FilePath] -> IO Codebase -loadCodebase jarFiles classPaths = do +loadCodebase :: [FilePath] -> [FilePath] -> [FilePath] -> IO Codebase +loadCodebase jarFiles classPaths javaBinDirs = do -- REVISIT: Currently, classes found in the classpath shadow those in the -- jars. Pretty sure the -classpath argument to the vanilla jvm allows -- mixture of jars and directories, and resolves names in the order in which @@ -87,9 +202,16 @@ loadCodebase jarFiles classPaths = do -- lookup in many maps -- one for each classpath component -- we can -- merge the maps as in the current 'JarReader' type, but I doubt -- this would ever matter, performance wise. - jars <- newJarReader jarFiles - let cb = CodebaseState M.empty M.empty - Codebase jars classPaths <$> newIORef cb + jars <- newJarReader jarFiles + mbJimagePaths <- findJimagePaths javaBinDirs + let cb = CodebaseState { classMap = M.empty, subclassMap = M.empty } + cbRef <- newIORef cb + pure $ Codebase + { jarReader = jars + , classPaths = classPaths + , jimagePaths = mbJimagePaths + , stateRef = cbRef + } -- | Register a class with the given codebase addClass :: Class -> CodebaseState -> CodebaseState @@ -107,32 +229,54 @@ addClass cl (CodebaseState cMap scMap) = -- | Returns class with given name in codebase or returns nothing if no class with -- that name can be found. tryLookupClass :: Codebase -> ClassName -> IO (Maybe Class) -tryLookupClass (Codebase{jarReader, classPaths, stateRef}) clNm = do +tryLookupClass (Codebase{jarReader, classPaths, jimagePaths, stateRef}) clNm = do cb <- readIORef stateRef case M.lookup clNm (classMap cb) of Just cl -> return (Just cl) Nothing -> do -- Here we bias our search to JARs before classpath directories, -- as mentioned above in 'loadCodebase'. - let mcls = [loadClassFromJar clNm jarReader] ++ - map (loadClassFromDir clNm) classPaths + let mcls = [ loadClassFromStandardModulesCache clNm jimagePaths + , loadClassFromJar clNm jarReader ] ++ + map (loadClassFromDir clNm) classPaths ++ + [ loadClassFromStandardModulesFile clNm jimagePaths ] mcl <- foldl1 firstSuccess mcls case mcl of Just cl -> do writeIORef stateRef $! addClass cl cb return $ Just cl Nothing -> return Nothing - where - -- | Combine two @IO (Maybe a)@ computations lazily, choosing the - -- first to succeed (i.e. return 'Just'). - firstSuccess :: IO (Maybe a) -> IO (Maybe a) -> IO (Maybe a) - -- This seems like it would be a common pattern, although I can't - -- find it in the standard libraries. - firstSuccess ima1 ima2 = do - ma1 <- ima1 - case ma1 of - Nothing -> ima2 - Just _ -> return ma1 + +-- Search for a .class file in the cache of files extracted from the `modules` +-- JIMAGE file. If it is not found, this does /not/ extract it from a JIMAGE +-- file—see loadClassFromStandardModulesFile. +loadClassFromStandardModulesCache :: ClassName -> Maybe JimagePaths -> IO (Maybe Class) +loadClassFromStandardModulesCache clNm mbJimagePaths = + case mbJimagePaths of + Nothing -> pure Nothing + Just JimagePaths{standardModulesCachePath} -> + loadClassFromParentDir clNm standardModulesCachePath + +-- Attempt to extract a .class file from the `modules` JIMAGE file, cache it +-- for subsequent lookups, and load the resulting .class. +loadClassFromStandardModulesFile :: ClassName -> Maybe JimagePaths -> IO (Maybe Class) +loadClassFromStandardModulesFile clNm mbJimagePaths = + case mbJimagePaths of + Nothing -> pure Nothing + Just JimagePaths{jimagePath, standardModulesFilePath, standardModulesCachePath} -> do + _ <- readProcessExitIfFailure jimagePath + [ "extract" + , "--include", "regex:.*/" ++ unClassName clNm ++ ".class" + , "--dir", standardModulesCachePath + , standardModulesFilePath + ] + -- After extracting a single class from a JIMAGE file, why do we then + -- have to search every directory under the cached path again? It's + -- because `jimage extract` doesn't tell you the path of the thing it + -- just extracted. Sigh. We can't reasonably infer what this path is + -- either, as it will start with a module name, which isn't something + -- that is contained in a ClassName. + loadClassFromParentDir clNm standardModulesCachePath -- | Attempt to load a class by searching under directory @dir@, which -- is assumed to be a classpath component. If class @C1. ... .Cn@ is @@ -156,6 +300,37 @@ loadClassFromDir clNm dir = do classNameToClassFilePath clNm = map (\c -> if c == '/' then pathSeparator else c) (unClassName clNm) <.> "class" +-- Given @parentDir@, search through its subdirectories to find a class, +-- loading it if it is found. +loadClassFromParentDir :: ClassName -> FilePath -> IO (Maybe Class) +loadClassFromParentDir clNm parentDir = do + childDirBaseNames <- listDirectory parentDir + let childDirs = map (parentDir ) childDirBaseNames + foldl' firstSuccess (pure Nothing) $ map (loadClassFromDir clNm) childDirs + +-- | Attempt to find an executable named @jimage@, either in the directories +-- supplied as arguments or on the @PATH@. If such an executable can be found, +-- then @Just@ a @JimagePaths@ is returned. Otherwise, @Nothing@ is returned. +-- +-- This is only used when crucible-jvm uses JDK 9 or later. +-- See Note [Loading classes from JIMAGE files]. +findJimagePaths :: [FilePath] -> IO (Maybe JimagePaths) +findJimagePaths javaBinDirs = runMaybeT $ do + javaPath <- MaybeT $ findJavaIn javaBinDirs + jimagePath <- MaybeT $ findJimageIn javaBinDirs + javaHome <- MaybeT $ findJavaProperty javaPath "java.home" + MaybeT $ do + let standardModulesPath = javaHome "lib" "modules" + xdgCacheDir <- getXdgDirectory XdgCache "crucible-jvm" + let standardModulesHash = B16.encode $ hash $ B.pack standardModulesPath + standardModulesCache = xdgCacheDir B.unpack standardModulesHash + createDirectoryIfMissing True standardModulesCache + pure $ Just $ JimagePaths + { jimagePath = jimagePath + , standardModulesFilePath = standardModulesPath + , standardModulesCachePath = standardModulesCache + } + -- | Returns class with given name in codebase or raises error if no class with -- that name can be found. -- @@ -308,3 +483,14 @@ starClosure fn a = a : concatMap (starClosure fn) (fn a) starClosureM :: Monad m => (a -> m [a]) -> a -> m [a] starClosureM fn a = return ((a:) . concat) `ap` (mapM (starClosureM fn) =<< fn a) + +-- | Combine two @IO (Maybe a)@ computations lazily, choosing the +-- first to succeed (i.e. return 'Just'). +firstSuccess :: IO (Maybe a) -> IO (Maybe a) -> IO (Maybe a) +-- This seems like it would be a common pattern, although I can't +-- find it in the standard libraries. +firstSuccess ima1 ima2 = do + ma1 <- ima1 + case ma1 of + Nothing -> ima2 + Just _ -> return ma1 diff --git a/crucible-jvm/src/Lang/JVM/JavaTools.hs b/crucible-jvm/src/Lang/JVM/JavaTools.hs new file mode 100644 index 000000000..56b51f842 --- /dev/null +++ b/crucible-jvm/src/Lang/JVM/JavaTools.hs @@ -0,0 +1,93 @@ +{- | +Module : Lang.JVM.JavaTools +Description : Functionality for finding a Java executable and its properties. +Copyright : (c) Galois, Inc 2021 +License : BSD3 +Maintainer : rscott@galois.com +Stability : provisional +-} + +module Lang.JVM.JavaTools + ( findJavaIn, findJimageIn, findJavaToolIn + , findJavaProperty + , findJavaMajorVersion + ) where + +import Data.List.Extra (dropPrefix, firstJust, stripInfix) +import Data.Maybe +import System.Directory + +import Lang.JVM.ProcessUtils + +-- | @'findJavaIn' searchPaths@ searches for an executable named @java@ among +-- the directories in @searchPaths@. If @searchPaths@ is empty, then the @PATH@ +-- is searched. +-- +-- If the search finds at least one executable, then @Just@ the first result is +-- returned. If no executables are found, then @Nothing@ is returned. +findJavaIn :: [FilePath] -> IO (Maybe FilePath) +findJavaIn = findJavaToolIn "java" + +-- | @'findJavaIn' searchPaths@ searches for an executable named @java@ among +-- the directories in @searchPaths@. If @searchPaths@ is empty, then the @PATH@ +-- is searched. +-- +-- If the search finds at least one executable, then @Just@ the first result is +-- returned. If no executables are found, then @Nothing@ is returned. +findJimageIn :: [FilePath] -> IO (Maybe FilePath) +findJimageIn = findJavaToolIn "jimage" + +-- | @'findJavaToolIn' toolName searchPaths@ searches for an executable named +-- @toolName@ among the directories in @searchPaths@. If @searchPaths@ is +-- empty, then the @PATH@ is searched. +-- +-- If the search finds at least one executable, then @Just@ the first result is +-- returned. If no executables are found, then @Nothing@ is returned. +findJavaToolIn :: FilePath -> [FilePath] -> IO (Maybe FilePath) +findJavaToolIn toolName searchPaths + | null searchPaths = findExecutable toolName + | otherwise = listToMaybe <$> findExecutablesInDirectories searchPaths toolName + +-- | @'findJavaProperty' javaPath prop@ invokes @javaPath@ to dump its system +-- properties (see ) +-- and attempts to find the value associated with the property named @prop@. +-- If such a value is found, then @Just@ that value is returned. +-- If no property named @prop@ exists, then @Nothing@ is returned. +-- +-- This will throw an exception if @javaPath@'s system properties cannot be +-- determined. +findJavaProperty :: FilePath -> String -> IO (Maybe String) +findJavaProperty javaPath propertyNeedle = do + (_stdOut, stdErr) <- readProcessExitIfFailure javaPath ["-XshowSettings:properties", "-version"] + let propertyHaystacks = lines stdErr + pure $ firstJust getProperty_maybe propertyHaystacks + where + -- Each Java system property, as reported by + -- @java -XshowSettings:properties@, adheres to this template: + -- + -- " = " + -- + -- Note the leading whitespace. As a result, stripInfix is used to detect + -- " = " in the middle of the string and obtain the after it. + getProperty_maybe :: String -> Maybe String + getProperty_maybe propertyHaystack = + snd <$> stripInfix (propertyNeedle ++ " = ") propertyHaystack + +-- | @'findJavaMajorVersion' javaPath@ will consult @javaPath@ to find the +-- major version number corresponding to that Java release. +findJavaMajorVersion :: FilePath -> IO Int +findJavaMajorVersion javaPath = do + mbVersionStr <- findJavaProperty javaPath "java.version" + case mbVersionStr of + Nothing -> fail $ "Could not detect the version of Java at " ++ javaPath + Just versionStr -> pure $ read $ takeMajorVersionStr $ dropOldJavaVersionPrefix versionStr + where + -- e.g., if the version number is "11.0.9.1", then just take the "11" part. + takeMajorVersionStr :: String -> String + takeMajorVersionStr = takeWhile (/= '.') + + -- Prior to Java 9, the leading version number was always 1. For example, + -- Java 8 would use 1.8.<...>. We only care about the 8 part, so drop the + -- leading 1. bit. + dropOldJavaVersionPrefix :: String -> String + dropOldJavaVersionPrefix = dropPrefix "1." diff --git a/crucible-jvm/src/Lang/JVM/ProcessUtils.hs b/crucible-jvm/src/Lang/JVM/ProcessUtils.hs new file mode 100644 index 000000000..1924bdc5f --- /dev/null +++ b/crucible-jvm/src/Lang/JVM/ProcessUtils.hs @@ -0,0 +1,30 @@ +{- | +Module : Lang.JVM.ProcessUtils +Description : Miscellaneous @process@-related utilities. +Copyright : (c) Galois, Inc 2021 +License : BSD3 +Maintainer : rscott@galois.com +Stability : provisional +-} + +module Lang.JVM.ProcessUtils (readProcessExitIfFailure) where + +import Control.Monad (when) +import System.Exit (ExitCode(..)) +import System.Process (readProcessWithExitCode) + +-- | Invokes @readProcessWithExitCode@ with no standard input, and returns the +-- resulting standard output and standard error. If the exit code of the +-- process is not 'ExitSuccess', throw an exception with some information that +-- may be helpful to debug what went wrong. +readProcessExitIfFailure :: FilePath -> [String] -> IO (String, String) +readProcessExitIfFailure cmd args = do + (ec, out, err) <- readProcessWithExitCode cmd args "" + when (ec /= ExitSuccess) $ + fail $ unlines [ cmd ++ " returned non-zero exit code: " ++ show ec + , "Standard output:" + , out + , "Standard error:" + , err + ] + pure (out, err) diff --git a/crucible-jvm/tests/Str.class b/crucible-jvm/tests/Str.class new file mode 100644 index 000000000..16c1d6a17 Binary files /dev/null and b/crucible-jvm/tests/Str.class differ diff --git a/crucible-jvm/tool/Main.hs b/crucible-jvm/tool/Main.hs index c58cb51b7..94b19c565 100644 --- a/crucible-jvm/tool/Main.hs +++ b/crucible-jvm/tool/Main.hs @@ -99,15 +99,18 @@ data JVMOptions = JVMOptions -- this must include rt.jar from the JDK -- (The JDK_JAR environment variable can also be used to -- to specify this JAR). + , javaBinDirs :: [FilePath] + -- ^ where to look to find the @java@ executable , mainMethod :: String } defaultOptions :: JVMOptions defaultOptions = JVMOptions - { classPath = ["."] - , jarList = [] - , mainMethod = "main" + { classPath = ["."] + , jarList = [] + , javaBinDirs = [] + , mainMethod = "main" } cruxJVMConfig :: Crux.Config JVMOptions @@ -129,6 +132,11 @@ cruxJVMConfig = Crux.Config $ Crux.ReqArg "TODO" $ \p opts -> Right $ opts { jarList = jarList opts ++ splitSearchPath p } + , Crux.Option ['b'] ["java-bin-dirs"] + "TODO" + $ Crux.ReqArg "TODO" + $ \p opts -> + Right $ opts { javaBinDirs = javaBinDirs opts ++ splitSearchPath p } , Crux.Option ['m'] ["method"] "Method to simulate" $ Crux.ReqArg "method name" @@ -144,7 +152,7 @@ simulateJVM copts opts = Crux.SimulatorCallback $ \sym _maybeOnline -> do [file] -> return file _ -> fail "crux-jvm requires a single file name as an argument" - cb <- JCB.loadCodebase (jarList opts) (classPath opts) + cb <- JCB.loadCodebase (jarList opts) (classPath opts) (javaBinDirs opts) let cname = takeBaseName file let mname = mainMethod opts