diff --git a/posting/index.go b/posting/index.go
index e957cb43b10..7950cc89641 100644
--- a/posting/index.go
+++ b/posting/index.go
@@ -65,12 +65,6 @@ func indexTokens(info *indexMutationInfo) ([]string, error) {
 
 	var tokens []string
 	for _, it := range info.tokenizers {
-		if it.Name() == "exact" && schemaType == types.StringID && len(sv.Value.(string)) > 100 {
-			// Exact index can only be applied for strings so we can safely try to convert Value to
-			// string.
-			glog.Infof("Long term for exact index on predicate: [%s]. "+
-				"Consider switching to hash for better performance.\n", attr)
-		}
 		toks, err := tok.BuildTokens(sv.Value, tok.GetLangTokenizer(it, lang))
 		if err != nil {
 			return tokens, err
diff --git a/posting/lists.go b/posting/lists.go
index 7757520275b..efa13fde8a7 100644
--- a/posting/lists.go
+++ b/posting/lists.go
@@ -159,7 +159,7 @@ func Cleanup() {
 // to lru cache and returns it.
 //
 // plist := Get(key, group)
-// ... // Use plist
+// ... Use plist
 // TODO: This should take a node id and index. And just append all indices to a list.
 // When doing a commit, it should update all the sync index watermarks.
 // worker pkg would push the indices to the watermarks held by lists.
diff --git a/systest/queries_test.go b/systest/queries_test.go
index 05475749a9e..e3b6711a74a 100644
--- a/systest/queries_test.go
+++ b/systest/queries_test.go
@@ -46,6 +46,7 @@ func TestQuery(t *testing.T) {
 	t.Run("schema predicate names", wrap(SchemaQueryTestPredicate1))
 	t.Run("schema specific predicate fields", wrap(SchemaQueryTestPredicate2))
 	t.Run("schema specific predicate field", wrap(SchemaQueryTestPredicate3))
+	t.Run("hash index queries", wrap(QueryHashIndex))
 	t.Run("cleanup", wrap(SchemaQueryCleanup))
 }
 
@@ -318,3 +319,116 @@ func SchemaQueryTestHTTP(t *testing.T, c *dgo.Dgraph) {
 	}`
 	CompareJSON(t, js, string(m["data"]))
 }
+
+func QueryHashIndex(t *testing.T, c *dgo.Dgraph) {
+	ctx := context.Background()
+
+	require.NoError(t, c.Alter(ctx, &api.Operation{
+		Schema: `
+			name: string @index(hash) @lang .
+		`,
+	}))
+
+	txn := c.NewTxn()
+	_, err := txn.Mutate(ctx, &api.Mutation{
+		SetNquads: []byte(`
+			_:p0 <name> "" .
+			_:p1 <name> "0" .
+			_:p2 <name> "srfrog" .
+			_:p3 <name> "Lorem ipsum" .
+			_:p4 <name> "Lorem ipsum dolor sit amet" .
+			_:p5 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit" .
+			_:p6 <name> "Lorem ipsum"@en .
+			_:p7 <name> "Lorem ipsum dolor sit amet"@en .
+			_:p8 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit"@en .
+			_:p9 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed varius tellus ut sem bibendum, eu tristique augue congue. Praesent eget odio tincidunt, pellentesque ante sit amet, tempus sem. Donec et tellus et diam facilisis egestas ut ac risus. Proin feugiat risus tristique erat condimentum placerat. Nulla eget ligula tempus, blandit leo vel, accumsan tortor. Phasellus et felis in diam ultricies porta nec in ipsum. Phasellus id leo sagittis, bibendum enim ut, pretium lectus. Quisque ac ex viverra, suscipit turpis sed, scelerisque metus. Sed non dui facilisis, viverra leo eget, vulputate erat. Etiam nec enim sed nisi imperdiet cursus. Suspendisse sed ligula non nisi pharetra varius." .
+			_:pa <name> ""@fr .
+		`),
+	})
+	require.NoError(t, err)
+	require.NoError(t, txn.Commit(ctx))
+
+	tests := []struct {
+		in, out string
+	}{
+		{
+			in: `schema(pred: [name]) {}`,
+			out: `
+			{
+				"schema": [
+					{
+						"index": true,
+						"lang": true,
+						"predicate": "name",
+						"tokenizer": [
+							"hash"
+						],
+						"type": "string"
+					}
+				]
+			}`,
+		},
+		{
+			in:  `{q(func:eq(name,"")){name}}`,
+			out: `{"q": [{"name":""}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"0")){name}}`,
+			out: `{"q": [{"name":"0"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"srfrog")){name}}`,
+			out: `{"q": [{"name":"srfrog"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"Lorem ipsum")){name}}`,
+			out: `{"q": [{"name":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"Lorem ipsum dolor sit amet")){name}}`,
+			out: `{"q": [{"name":"Lorem ipsum dolor sit amet"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@en,"Lorem ipsum")){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@.,"Lorem ipsum dolor sit amet")){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum dolor sit amet"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["srfrog"])){name}}`,
+			out: `{"q": [{"name":"srfrog"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["srfrog","srf","srfrogg","sr","s"])){name}}`,
+			out: `{"q": [{"name":"srfrog"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["Lorem ipsum","Lorem ipsum dolor sit amet, consectetur adipiscing elit",""])){name}}`,
+			out: `{"q": [{"name":""},{"name":"Lorem ipsum"},{"name":"Lorem ipsum dolor sit amet, consectetur adipiscing elit"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum"])){name}}`,
+			out: `{"q": [{"name":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@en,["Lorem ipsum","Lorem ipsum dolor sit amet, consectetur adipiscing elit",""])){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum"},{"name@en":"Lorem ipsum dolor sit amet, consectetur adipiscing elit"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@en,["Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum"])){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@.,"")){name@fr}}`,
+			out: `{"q": [{"name@fr":""}]}`,
+		},
+	}
+
+	for _, tc := range tests {
+		resp, err := c.NewTxn().Query(ctx, tc.in)
+		require.NoError(t, err)
+		CompareJSON(t, tc.out, string(resp.Json))
+	}
+}
diff --git a/tok/tok.go b/tok/tok.go
index d2e55e5cccc..d0c5b02c2d9 100644
--- a/tok/tok.go
+++ b/tok/tok.go
@@ -22,14 +22,36 @@ import (
 	"plugin"
 	"time"
 
-	farm "github.com/dgryski/go-farm"
 	"github.com/golang/glog"
 	geom "github.com/twpayne/go-geom"
+	"golang.org/x/crypto/blake2b"
 
 	"github.com/dgraph-io/dgraph/types"
 	"github.com/dgraph-io/dgraph/x"
 )
 
+// Tokenizer identifiers are unique and can't be reused.
+// The range 0x00 - 0x7f is system reserved.
+// The range 0x80 - 0xff is for custom tokenizers.
+// TODO: use these everywhere where we must ensure a system tokenizer.
+const (
+	IdentNone     = 0x0
+	IdentTerm     = 0x1
+	IdentExact    = 0x2
+	IdentYear     = 0x4
+	IdentMonth    = 0x41
+	IdentDay      = 0x42
+	IdentHour     = 0x43
+	IdentGeo      = 0x5
+	IdentInt      = 0x6
+	IdentFloat    = 0x7
+	IdentFullText = 0x8
+	IdentBool     = 0x9
+	IdentTrigram  = 0xA
+	IdentHash     = 0xB
+	IdentCustom   = 0x80
+)
+
 // Tokenizer defines what a tokenizer must provide.
 type Tokenizer interface {
@@ -103,7 +125,7 @@ func LoadCustomTokenizer(soFile string) {
 	tokenizer := symb.(func() interface{})().(PluginTokenizer)
 
 	id := tokenizer.Identifier()
-	x.AssertTruef(id >= 0x80,
+	x.AssertTruef(id >= IdentCustom,
 		"custom tokenizer identifier byte must be >= 0x80, but was %#x", id)
 	registerTokenizer(CustomTokenizer{PluginTokenizer: tokenizer})
 }
@@ -142,7 +164,7 @@ func (t GeoTokenizer) Type() string { return "geo" }
 func (t GeoTokenizer) Tokens(v interface{}) ([]string, error) {
 	return types.IndexGeoTokens(v.(geom.T))
 }
-func (t GeoTokenizer) Identifier() byte { return 0x5 }
+func (t GeoTokenizer) Identifier() byte { return IdentGeo }
 func (t GeoTokenizer) IsSortable() bool { return false }
 func (t GeoTokenizer) IsLossy() bool { return true }
 
@@ -153,7 +175,7 @@ func (t IntTokenizer) Type() string { return "int" }
 func (t IntTokenizer) Tokens(v interface{}) ([]string, error) {
 	return []string{encodeInt(v.(int64))}, nil
 }
-func (t IntTokenizer) Identifier() byte { return 0x6 }
+func (t IntTokenizer) Identifier() byte { return IdentInt }
 func (t IntTokenizer) IsSortable() bool { return true }
 func (t IntTokenizer) IsLossy() bool { return false }
 
@@ -164,7 +186,7 @@ func (t FloatTokenizer) Type() string { return "float" }
 func (t FloatTokenizer) Tokens(v interface{}) ([]string, error) {
 	return []string{encodeInt(int64(v.(float64)))}, nil
 }
-func (t FloatTokenizer) Identifier() byte { return 0x7 }
+func (t FloatTokenizer) Identifier() byte { return IdentFloat }
 func (t FloatTokenizer) IsSortable() bool { return true }
 func (t FloatTokenizer) IsLossy() bool { return true }
 
@@ -178,7 +200,7 @@ func (t YearTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.Year()))
 	return []string{string(buf)}, nil
 }
-func (t YearTokenizer) Identifier() byte { return 0x4 }
+func (t YearTokenizer) Identifier() byte { return IdentYear }
 func (t YearTokenizer) IsSortable() bool { return true }
 func (t YearTokenizer) IsLossy() bool { return true }
 
@@ -193,7 +215,7 @@ func (t MonthTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[2:4], uint16(tval.Month()))
 	return []string{string(buf)}, nil
 }
-func (t MonthTokenizer) Identifier() byte { return 0x41 }
+func (t MonthTokenizer) Identifier() byte { return IdentMonth }
 func (t MonthTokenizer) IsSortable() bool { return true }
 func (t MonthTokenizer) IsLossy() bool { return true }
 
@@ -209,7 +231,7 @@ func (t DayTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[4:6], uint16(tval.Day()))
 	return []string{string(buf)}, nil
 }
-func (t DayTokenizer) Identifier() byte { return 0x42 }
+func (t DayTokenizer) Identifier() byte { return IdentDay }
 func (t DayTokenizer) IsSortable() bool { return true }
 func (t DayTokenizer) IsLossy() bool { return true }
 
@@ -226,7 +248,7 @@ func (t HourTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[6:8], uint16(tval.Hour()))
 	return []string{string(buf)}, nil
 }
-func (t HourTokenizer) Identifier() byte { return 0x43 }
+func (t HourTokenizer) Identifier() byte { return IdentHour }
 func (t HourTokenizer) IsSortable() bool { return true }
 func (t HourTokenizer) IsLossy() bool { return true }
 
@@ -242,7 +264,7 @@ func (t TermTokenizer) Tokens(v interface{}) ([]string, error) {
 	tokens := termAnalyzer.Analyze([]byte(str))
 	return uniqueTerms(tokens), nil
 }
-func (t TermTokenizer) Identifier() byte { return 0x1 }
+func (t TermTokenizer) Identifier() byte { return IdentTerm }
 func (t TermTokenizer) IsSortable() bool { return false }
 func (t TermTokenizer) IsLossy() bool { return true }
 
@@ -256,7 +278,7 @@ func (t ExactTokenizer) Tokens(v interface{}) ([]string, error) {
 	}
 	return nil, x.Errorf("Exact indices only supported for string types")
 }
-func (t ExactTokenizer) Identifier() byte { return 0x2 }
+func (t ExactTokenizer) Identifier() byte { return IdentExact }
 func (t ExactTokenizer) IsSortable() bool { return true }
 func (t ExactTokenizer) IsLossy() bool { return false }
 
@@ -279,7 +301,7 @@ func (t FullTextTokenizer) Tokens(v interface{}) ([]string, error) {
 	// finally, return the terms.
 	return uniqueTerms(tokens), nil
 }
-func (t FullTextTokenizer) Identifier() byte { return 0x8 }
+func (t FullTextTokenizer) Identifier() byte { return IdentFullText }
 func (t FullTextTokenizer) IsSortable() bool { return false }
 func (t FullTextTokenizer) IsLossy() bool { return true }
 
@@ -321,7 +343,7 @@ func (t BoolTokenizer) Tokens(v interface{}) ([]string, error) {
 	}
 	return []string{encodeInt(b)}, nil
 }
-func (t BoolTokenizer) Identifier() byte { return 0x9 }
+func (t BoolTokenizer) Identifier() byte { return IdentBool }
 func (t BoolTokenizer) IsSortable() bool { return false }
 func (t BoolTokenizer) IsLossy() bool { return false }
 
@@ -345,7 +367,7 @@ func (t TrigramTokenizer) Tokens(v interface{}) ([]string, error) {
 	}
 	return nil, nil
 }
-func (t TrigramTokenizer) Identifier() byte { return 0xA }
+func (t TrigramTokenizer) Identifier() byte { return IdentTrigram }
 func (t TrigramTokenizer) IsSortable() bool { return false }
 func (t TrigramTokenizer) IsLossy() bool { return true }
 
@@ -358,13 +380,22 @@ func (t HashTokenizer) Tokens(v interface{}) ([]string, error) {
 	if !ok {
 		return nil, x.Errorf("Hash tokenizer only supported for string types")
 	}
-	var hash [8]byte
-	binary.BigEndian.PutUint64(hash[:], farm.Hash64([]byte(term)))
+	// Blake2 is a hash function equivalent to the SHA series, but faster. SHA is the best hash
+	// function for checksums of content because of its low collision ratio. See issue #2776.
+	hash := blake2b.Sum256([]byte(term))
+	if len(hash) == 0 {
+		return nil, x.Errorf("Hash tokenizer failed to create hash")
+	}
 	return []string{string(hash[:])}, nil
 }
-func (t HashTokenizer) Identifier() byte { return 0xB }
+func (t HashTokenizer) Identifier() byte { return IdentHash }
 func (t HashTokenizer) IsSortable() bool { return false }
-func (t HashTokenizer) IsLossy() bool { return true }
+
+// We have switched HashTokenizer to be non-lossy. This allows us to avoid having to retrieve
+// values for the returned results and compare them against the value in the query, which is slow.
+// There is a very low probability of collisions with a 256-bit hash, and we use that fact to
+// speed up equality query operations using the hash index.
+func (t HashTokenizer) IsLossy() bool { return false }
 
 // PluginTokenizer is implemented by external plugins loaded dynamically via
 // *.so files. It follows the implementation semantics of the Tokenizer
diff --git a/worker/task.go b/worker/task.go
index 6380df25ada..a587b188c86 100644
--- a/worker/task.go
+++ b/worker/task.go
@@ -962,12 +962,21 @@ func (qs *queryState) handleRegexFunction(ctx context.Context, arg funcArgs) err
 }
 
 func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) error {
+	span := otrace.FromContext(ctx)
+	stop := x.SpanTimer(span, "handleCompareFunction")
+	defer stop()
+	if span != nil {
+		span.Annotatef(nil, "Number of uids: %d. args.srcFn: %+v", arg.srcFn.n, arg.srcFn)
+	}
+
 	attr := arg.q.Attr
+	span.Annotatef(nil, "Attr: %s. Fname: %s", attr, arg.srcFn.fname)
 	tokenizer, err := pickTokenizer(attr, arg.srcFn.fname)
 	// We should already have checked this in getInequalityTokens.
 	x.Check(err)
 	// Only if the tokenizer that we used IsLossy, then we need to fetch
 	// and compare the actual values.
+	span.Annotatef(nil, "Tokenizer: %s, Lossy: %t", tokenizer.Name(), tokenizer.IsLossy())
 	if tokenizer.IsLossy() {
 		// Need to evaluate inequality for entries in the first bucket.
 		typ, err := schema.State().TypeOf(attr)
diff --git a/worker/tokens.go b/worker/tokens.go
index d8c3be84c21..c787503384e 100644
--- a/worker/tokens.go
+++ b/worker/tokens.go
@@ -17,8 +17,6 @@
 package worker
 
 import (
-	"strings"
-
 	"github.com/dgraph-io/badger"
 
 	"bytes"
@@ -30,35 +28,32 @@ import (
 )
 
 func verifyStringIndex(attr string, funcType FuncType) (string, bool) {
-	var requiredTokenizer string
-	switch funcType {
-	case FullTextSearchFn:
-		requiredTokenizer = tok.FullTextTokenizer{}.Name()
-	default:
-		requiredTokenizer = tok.TermTokenizer{}.Name()
+	var requiredTokenizer tok.Tokenizer
+	if funcType == FullTextSearchFn {
+		requiredTokenizer = tok.FullTextTokenizer{}
+	} else {
+		requiredTokenizer = tok.TermTokenizer{}
 	}
 
 	if !schema.State().IsIndexed(attr) {
-		return requiredTokenizer, false
+		return requiredTokenizer.Name(), false
 	}
 
-	tokenizers := schema.State().Tokenizer(attr)
-	for _, tokenizer := range tokenizers {
-		// check for prefix, in case of explicit usage of language specific full text tokenizer
-		if strings.HasPrefix(tokenizer.Name(), requiredTokenizer) {
-			return requiredTokenizer, true
+	id := requiredTokenizer.Identifier()
+	for _, t := range schema.State().Tokenizer(attr) {
+		if t.Identifier() == id {
+			return requiredTokenizer.Name(), true
 		}
 	}
-
-	return requiredTokenizer, false
+	return requiredTokenizer.Name(), false
 }
 
 func verifyCustomIndex(attr string, tokenizerName string) bool {
 	if !schema.State().IsIndexed(attr) {
 		return false
 	}
-	for _, tn := range schema.State().TokenizerNames(attr) {
-		if tn == tokenizerName {
+	for _, t := range schema.State().Tokenizer(attr) {
+		if t.Identifier() >= tok.IdentCustom && t.Name() == tokenizerName {
 			return true
 		}
 	}
@@ -71,12 +66,10 @@ func getStringTokens(funcArgs []string, lang string, funcType FuncType) ([]strin
 	if lang == "." {
 		lang = "en"
 	}
-	switch funcType {
-	case FullTextSearchFn:
+	if funcType == FullTextSearchFn {
 		return tok.GetFullTextTokens(funcArgs, lang)
-	default:
-		return tok.GetTermTokens(funcArgs)
 	}
+	return tok.GetTermTokens(funcArgs)
 }
 
 func pickTokenizer(attr string, f string) (tok.Tokenizer, error) {
@@ -86,32 +79,25 @@ func pickTokenizer(attr string, f string) (tok.Tokenizer, error) {
 	}
 
 	tokenizers := schema.State().Tokenizer(attr)
-
-	var tokenizer tok.Tokenizer
-	for _, t := range tokenizers {
-		if !t.IsLossy() {
-			tokenizer = t
-			break
-		}
-	}
-
-	// If function is eq and we found a tokenizer thats !Lossy(), lets return
-	// it to avoid the second lookup.
-	if f == "eq" && tokenizer != nil {
-		return tokenizer, nil
-	}
-
-	// Lets try to find a sortable tokenizer.
 	for _, t := range tokenizers {
-		if t.IsSortable() {
-			return t, nil
+		// If the function is eq and we find a non-lossy tokenizer, return it right away.
+		switch f {
+		case "eq":
+			// For equality, find a non-lossy tokenizer.
+			if !t.IsLossy() {
+				return t, nil
+			}
+		default:
+			// The rest of the cases (ge, gt, le, lt) require a sortable tokenizer.
+			if t.IsSortable() {
+				return t, nil
+			}
 		}
 	}
-	// rest of the cases, ge, gt , le , lt require a sortable tokenizer.
+	// Should we return an error if we don't find a non-lossy tokenizer for the eq function?
 	if f != "eq" {
-		return nil, x.Errorf("Attribute:%s does not have proper index for comparison",
-			attr)
+		return nil, x.Errorf("Attribute:%s does not have proper index for comparison", attr)
 	}
 
 	// We didn't find a sortable or !isLossy() tokenizer, lets return the first one.
@@ -139,7 +125,8 @@ func getInequalityTokens(readTs uint64, attr, f string,
 		return nil, "", nil
 
 	// Allow eq with term/fulltext tokenizers, even though they give multiple tokens.
-	case f == "eq" && (tokenizer.Name() == "term" || tokenizer.Name() == "fulltext"):
+	case f == "eq" &&
+		(tokenizer.Identifier() == tok.IdentTerm || tokenizer.Identifier() == tok.IdentFullText):
 		break
 
 	case len(ineqTokens) > 1:
@@ -167,34 +154,36 @@ func getInequalityTokens(readTs uint64, attr, f string,
 	itr := txn.NewIterator(itOpt)
 	defer itr.Close()
 
-	ineqTokenInBytes := []byte(ineqToken) //used for inequality comparison below
+	// used for inequality comparison below
+	ineqTokenInBytes := []byte(ineqToken)
 
 	var out []string
 	for itr.Seek(seekKey); itr.Valid(); itr.Next() {
 		item := itr.Item()
 		key := item.Key()
 		k := x.Parse(key)
-		if k == nil {
-			continue
-		}
+
+		switch {
+		case k == nil:
+
 		// if its lossy then we handle inequality comparison later
-		// on in handleCompareAttr
-		if tokenizer.IsLossy() {
+		// in handleCompareFunction
+		case tokenizer.IsLossy():
 			out = append(out, k.Term)
-		} else {
-			// for non Lossy lets compare for inequality (gt & lt)
-			// to see if key needs to be included
-			if f == "gt" {
-				if bytes.Compare([]byte(k.Term), ineqTokenInBytes) > 0 {
-					out = append(out, k.Term)
-				}
-			} else if f == "lt" {
-				if bytes.Compare([]byte(k.Term), ineqTokenInBytes) < 0 {
-					out = append(out, k.Term)
-				}
-			} else { //for le or ge or any other fn consider the key
+
+		// for non Lossy lets compare for inequality (gt & lt)
+		// to see if key needs to be included
+		case f == "gt":
+			if bytes.Compare([]byte(k.Term), ineqTokenInBytes) > 0 {
+				out = append(out, k.Term)
+			}
+		case f == "lt":
+			if bytes.Compare([]byte(k.Term), ineqTokenInBytes) < 0 {
+				out = append(out, k.Term)
 			}
+		default:
+			// for le or ge or any other fn consider the key
+			out = append(out, k.Term)
 		}
 	}
 	return out, ineqToken, nil
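
Note (editorial, not part of the diff): the core of this change is that HashTokenizer now tokenizes a string value as its BLAKE2b-256 digest and reports itself as non-lossy, so eq lookups can trust an index hit without fetching and re-comparing the stored value. Below is a minimal standalone Go sketch of that scheme, using golang.org/x/crypto/blake2b as the diff does; the helper name and sample values are illustrative only and not Dgraph APIs.

package main

import (
	"fmt"

	"golang.org/x/crypto/blake2b"
)

// hashToken mirrors what the new HashTokenizer does: the token for a string
// value is its 32-byte BLAKE2b-256 digest. (Illustrative helper, not Dgraph API.)
func hashToken(term string) string {
	hash := blake2b.Sum256([]byte(term))
	return string(hash[:])
}

func main() {
	a := hashToken("Lorem ipsum")
	b := hashToken("Lorem ipsum")
	c := hashToken("Lorem ipsum dolor sit amet")

	// Equal values always produce equal tokens, so an eq lookup against the
	// hash index can rely on the token match directly.
	fmt.Println(len(a), a == b, a == c) // 32 true false
}

With the old 64-bit farm hash the tokenizer had to stay lossy; moving to a 256-bit digest is what makes treating a token match as an exact match reasonable, per the comments added in tok/tok.go above.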