From 296f164df53980fcafa0771c9c58fde95a447c86 Mon Sep 17 00:00:00 2001
From: Andrew Martin
Date: Tue, 10 Oct 2017 09:24:19 -0400
Subject: [PATCH] Support analyzer with custom tokenizer, only ngram for now.

Also, add a function for defining index settings when the index is
created, since the analysis setting cannot be updated dynamically
without closing the index.
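
A sketch of the intended usage, mirroring the new test (the index,
analyzer, and tokenizer names below are placeholders, not part of
the API):

    let analysis = Analysis
          (M.singleton "my_analyzer"
            (AnalyzerDefinition (Just (Tokenizer "my_tokenizer"))))
          (M.singleton "my_tokenizer"
            (TokenizerDefinitionNgram (Ngram 3 4 [TokenLetter, TokenDigit])))
    reply <- createIndexWith
      [AnalysisSetting analysis, NumberOfReplicas (ReplicaCount 0)]
      1 -- shard count
      (IndexName "my-index")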
---
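Note: for the sketch in the commit message, the request body that
createIndexWith sends with PUT /my-index should come out roughly as
follows; "index.number_of_shards" remains a flat key because
deepMerge only merges values stored under equal keys:

    {
      "settings": {
        "index.number_of_shards": 1,
        "index": {
          "analysis": {
            "analyzer": {
              "my_analyzer": { "tokenizer": "my_tokenizer" }
            },
            "tokenizer": {
              "my_tokenizer": {
                "type": "ngram",
                "min_gram": 3,
                "max_gram": 4,
                "token_chars": ["letter", "digit"]
              }
            }
          }
        }
      }
    }
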
 src/Database/V5/Bloodhound/Client.hs | 24 ++++++-
 src/Database/V5/Bloodhound/Types.hs  | 98 +++++++++++++++++++++++++++-
 tests/V5/tests.hs                    | 26 ++++++++
 3 files changed, 146 insertions(+), 2 deletions(-)

diff --git a/src/Database/V5/Bloodhound/Client.hs b/src/Database/V5/Bloodhound/Client.hs
index 8f4d92b9..ff1efe2a 100644
--- a/src/Database/V5/Bloodhound/Client.hs
+++ b/src/Database/V5/Bloodhound/Client.hs
@@ -25,6 +25,7 @@ module Database.V5.Bloodhound.Client
          withBH
        -- ** Indices
        , createIndex
+       , createIndexWith
        , deleteIndex
        , updateIndexSettings
        , getIndexSettings
@@ -512,6 +513,28 @@ createIndex indexSettings (IndexName indexName) =
   where url = joinPath [indexName]
         body = Just $ encode indexSettings
 
+-- | Create an index, providing it with any number of settings. This
+-- is more expressive than 'createIndex' but is more verbose for the
+-- common case of configuring only the shard count and replica
+-- count.
+createIndexWith :: MonadBH m
+  => [UpdatableIndexSetting]
+  -> Int -- ^ shard count
+  -> IndexName
+  -> m Reply
+createIndexWith updates shards (IndexName indexName) =
+  bindM2 put url (return (Just body))
+  where url = joinPath [indexName]
+        body = encode $ object
+          ["settings" .= deepMerge
+            ( HM.singleton "index.number_of_shards" (toJSON shards) :
+              [u | Object u <- toJSON <$> updates]
+            )
+          ]
+
+oPath :: ToJSON a => NonEmpty Text -> a -> Value
+oPath (k :| []) v = object [k .= v]
+oPath (k :| (h:t)) v = object [k .= oPath (h :| t) v]
 
 -- | 'deleteIndex' will delete an index given a 'Server', and an 'IndexName'.
 --
@@ -545,7 +568,6 @@ getIndexSettings (IndexName indexName) = do
   parseEsResponse =<< get =<< url
   where url = joinPath [indexName, "_settings"]
 
-
 -- | 'forceMergeIndex'
 --
 -- The force merge API allows to force merging of one or more indices through
diff --git a/src/Database/V5/Bloodhound/Types.hs b/src/Database/V5/Bloodhound/Types.hs
index 8a1cc05a..54df26b9 100644
--- a/src/Database/V5/Bloodhound/Types.hs
+++ b/src/Database/V5/Bloodhound/Types.hs
@@ -202,6 +202,7 @@ module Database.V5.Bloodhound.Types
     , ZeroTermsQuery(..)
     , CutoffFrequency(..)
     , Analyzer(..)
+    , Tokenizer(..)
     , MaxExpansions(..)
     , Lenient(..)
     , MatchQueryType(..)
@@ -376,6 +377,12 @@ module Database.V5.Bloodhound.Types
 
     , EsUsername(..)
     , EsPassword(..)
+
+    , Analysis(..)
+    , AnalyzerDefinition(..)
+    , TokenizerDefinition(..)
+    , Ngram(..)
+    , TokenChar(..)
     ) where
 
 import Control.Applicative as A
@@ -583,8 +590,89 @@ data UpdatableIndexSetting = NumberOfReplicas ReplicaCount
   | IndexCompoundOnFlush Bool
   | WarmerEnabled Bool
   | MappingTotalFieldsLimit Int
+  | AnalysisSetting Analysis
+  -- ^ Analysis is not a dynamic setting: it can only be set when the index is created, or updated while the index is closed.
   deriving (Eq, Show, Generic, Typeable)
 
+data Analysis = Analysis
+  { analysisAnalyzer :: M.Map Text AnalyzerDefinition
+  , analysisTokenizer :: M.Map Text TokenizerDefinition
+  } deriving (Eq,Show,Generic,Typeable)
+
+instance ToJSON Analysis where
+  toJSON (Analysis analyzer tokenizer) = object
+    [ "analyzer" .= analyzer
+    , "tokenizer" .= tokenizer
+    ]
+
+instance FromJSON Analysis where
+  parseJSON = withObject "Analysis" $ \m -> Analysis
+    <$> m .: "analyzer"
+    <*> m .: "tokenizer"
+
+data AnalyzerDefinition = AnalyzerDefinition
+  { analyzerDefinitionTokenizer :: Maybe Tokenizer
+  } deriving (Eq,Show,Generic,Typeable)
+
+instance ToJSON AnalyzerDefinition where
+  toJSON (AnalyzerDefinition tokenizer) = object $ catMaybes
+    [ fmap ("tokenizer" .=) tokenizer
+    ]
+
+instance FromJSON AnalyzerDefinition where
+  parseJSON = withObject "AnalyzerDefinition" $ \m -> AnalyzerDefinition
+    <$> m .:? "tokenizer"
+
+
+data TokenizerDefinition
+  = TokenizerDefinitionNgram Ngram
+  deriving (Eq,Show,Generic,Typeable)
+
+instance ToJSON TokenizerDefinition where
+  toJSON x = case x of
+    TokenizerDefinitionNgram (Ngram minGram maxGram tokenChars) -> object
+      [ "type" .= ("ngram" :: Text)
+      , "min_gram" .= minGram
+      , "max_gram" .= maxGram
+      , "token_chars" .= tokenChars
+      ]
+
+instance FromJSON TokenizerDefinition where
+  parseJSON = withObject "TokenizerDefinition" $ \m -> do
+    typ <- m .: "type" :: Parser Text
+    case typ of
+      "ngram" -> fmap TokenizerDefinitionNgram $ Ngram
+        <$> fmap unStringlyTypedInt (m .: "min_gram")
+        <*> fmap unStringlyTypedInt (m .: "max_gram")
+        <*> m .: "token_chars"
+      _ -> fail "invalid TokenizerDefinition"
+
+data Ngram = Ngram
+  { ngramMinGram :: Int
+  , ngramMaxGram :: Int
+  , ngramTokenChars :: [TokenChar]
+  } deriving (Eq,Show,Generic,Typeable)
+
+data TokenChar = TokenLetter | TokenDigit | TokenWhitespace | TokenPunctuation | TokenSymbol
+  deriving (Eq,Read,Show,Generic,Typeable)
+
+instance ToJSON TokenChar where
+  toJSON t = String $ case t of
+    TokenLetter -> "letter"
+    TokenDigit -> "digit"
+    TokenWhitespace -> "whitespace"
+    TokenPunctuation -> "punctuation"
+    TokenSymbol -> "symbol"
+
+instance FromJSON TokenChar where
+  parseJSON = withText "TokenChar" $ \t -> case t of
+    "letter" -> return TokenLetter
+    "digit" -> return TokenDigit
+    "whitespace" -> return TokenWhitespace
+    "punctuation" -> return TokenPunctuation
+    "symbol" -> return TokenSymbol
+    _ -> fail "invalid TokenChar"
+
 data AllocationPolicy = AllocAll
                       -- ^ Allows shard allocation for all shards.
                       | AllocPrimaries
@@ -1020,6 +1108,8 @@
 newtype CutoffFrequency =
   CutoffFrequency Double deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
 newtype Analyzer =
   Analyzer Text deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
+newtype Tokenizer =
+  Tokenizer Text deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
 newtype MaxExpansions =
   MaxExpansions Int deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
@@ -2990,6 +3080,7 @@ instance ToJSON UpdatableIndexSetting where
   toJSON (BlocksWrite x) = oPath ("blocks" :| ["write"]) x
   toJSON (BlocksMetaData x) = oPath ("blocks" :| ["metadata"]) x
   toJSON (MappingTotalFieldsLimit x) = oPath ("index" :| ["mapping","total_fields","limit"]) x
+  toJSON (AnalysisSetting x) = oPath ("index" :| ["analysis"]) x
 
 instance FromJSON UpdatableIndexSetting where
   parseJSON = withObject "UpdatableIndexSetting" parse
@@ -3022,6 +3113,7 @@ instance FromJSON UpdatableIndexSetting where
             <|> blocksWrite `taggedAt` ["blocks", "write"]
             <|> blocksMetaData `taggedAt` ["blocks", "metadata"]
             <|> mappingTotalFieldsLimit `taggedAt` ["index", "mapping", "total_fields", "limit"]
+            <|> analysisSetting `taggedAt` ["index", "analysis"]
         where taggedAt f ks = taggedAt' f (Object o) ks
               taggedAt' f v [] = f =<< (parseJSON v <|> (parseJSON (unStringlyTypeJSON v)))
               taggedAt' f v (k:ks) = withObject "Object" (\o -> do v' <- o .: k
@@ -3055,6 +3147,7 @@ instance FromJSON UpdatableIndexSetting where
               blocksWrite = pure . BlocksWrite
               blocksMetaData = pure . BlocksMetaData
               mappingTotalFieldsLimit = pure . MappingTotalFieldsLimit
+              analysisSetting = pure . AnalysisSetting
 
 instance FromJSON IndexSettingsSummary where
   parseJSON = withObject "IndexSettingsSummary" parse
@@ -4643,10 +4736,13 @@ instance FromJSON NodeDataPathStats where
 
 
 newtype StringlyTypedDouble = StringlyTypedDouble { unStringlyTypedDouble :: Double }
-
 instance FromJSON StringlyTypedDouble where
   parseJSON = fmap StringlyTypedDouble . parseJSON . unStringlyTypeJSON
 
+newtype StringlyTypedInt = StringlyTypedInt { unStringlyTypedInt :: Int }
+
+instance FromJSON StringlyTypedInt where
+  parseJSON = fmap StringlyTypedInt . parseJSON . unStringlyTypeJSON
 
 instance FromJSON NodeFSTotalStats where
   parseJSON = withObject "NodeFSTotalStats" parse
diff --git a/tests/V5/tests.hs b/tests/V5/tests.hs
index e68fbb4e..64e6e2e9 100644
--- a/tests/V5/tests.hs
+++ b/tests/V5/tests.hs
@@ -907,6 +907,12 @@ instance Arbitrary RegexpFlag where arbitrary = sopArbitrary; shrink = genericShrink
 instance Arbitrary BoolMatch where arbitrary = sopArbitrary; shrink = genericShrink
 instance Arbitrary Term where arbitrary = sopArbitrary; shrink = genericShrink
 instance Arbitrary IndexSettings where arbitrary = sopArbitrary; shrink = genericShrink
+instance Arbitrary TokenChar where arbitrary = sopArbitrary; shrink = genericShrink
+instance Arbitrary Ngram where arbitrary = sopArbitrary; shrink = genericShrink
+instance Arbitrary TokenizerDefinition where arbitrary = sopArbitrary; shrink = genericShrink
+instance Arbitrary AnalyzerDefinition where arbitrary = sopArbitrary; shrink = genericShrink
+instance Arbitrary Analysis where arbitrary = sopArbitrary; shrink = genericShrink
+instance Arbitrary Tokenizer where arbitrary = sopArbitrary; shrink = genericShrink
 instance Arbitrary UpdatableIndexSetting where arbitrary = sopArbitrary; shrink = genericShrink
 instance Arbitrary Bytes where arbitrary = sopArbitrary; shrink = genericShrink
 instance Arbitrary AllocationPolicy where arbitrary = sopArbitrary; shrink = genericShrink
@@ -1622,6 +1628,26 @@ main = hspec $ do
           (IndexSettings (ShardCount 1) (ReplicaCount 0))
           (NE.toList updates))
 
+    it "accepts custom analyzers" $ when' (atleast es50) $ withTestEnv $ do
+      _ <- deleteExampleIndex
+      let analysis = Analysis
+            (M.singleton "ex_analyzer" (AnalyzerDefinition (Just (Tokenizer "ex_tokenizer"))))
+            (M.singleton "ex_tokenizer"
+              ( TokenizerDefinitionNgram
+                ( Ngram 3 4 [TokenLetter,TokenDigit])
+              )
+            )
+          updates = [AnalysisSetting analysis]
+      createResp <- createIndexWith (updates ++ [NumberOfReplicas (ReplicaCount 0)]) 1 testIndex
+      liftIO $ validateStatus createResp 200
+      getResp <- getIndexSettings testIndex
+      liftIO $
+        getResp `shouldBe` Right (IndexSettingsSummary
+          testIndex
+          (IndexSettings (ShardCount 1) (ReplicaCount 0))
+          updates
+        )
+
   describe "Index Optimization" $ do
     it "returns a successful response upon completion" $ withTestEnv $ do
       _ <- createExampleIndex