From c9049bf01d2cb06315e1b51088c746e23fd46778 Mon Sep 17 00:00:00 2001 From: jmd Date: Wed, 13 Sep 2023 15:01:19 +0200 Subject: [PATCH] Making an InQuery work. (Not in use yet) --- .../LuceneIndexContextTest.cs | 20 +- .../LuceneIndexContext.cs | 42 +- .../Class1.cs | 87 ++++ ...otJEM.Json.Index2.QueryParsers.Test.csproj | 25 + .../IndexQueryParserExtensions.cs | 34 ++ .../IndexSearcherExtensions.cs | 35 -- .../Query/Copy/BooleanQuery.cs | 483 ++++++++++++++++++ .../Query/Copy/BooleanScorer.cs | 300 +++++++++++ .../Query/Copy/BooleanScorer2.cs | 325 ++++++++++++ .../Query/Copy/ConjunctionScorer.cs | 163 ++++++ .../Query/Copy/DisjunctionScorer.cs | 167 ++++++ .../Query/Copy/DisjunctionSumScorer.cs | 90 ++++ .../Query/Copy/MinShouldMatchSumScorer.cs | 480 +++++++++++++++++ .../Query/Copy/ReqExclScorer.cs | 146 ++++++ .../Query/Copy/ReqOptSumScorer.cs | 116 +++++ .../Query/InQuery.cs | 242 +++------ .../SimplifiedLuceneQueryAstVisitor.cs | 40 +- ...FrameworkClassTest.cs => JsonIndexTest.cs} | 0 src/DotJEM.Json.Index2.sln | 6 + 19 files changed, 2576 insertions(+), 225 deletions(-) create mode 100644 src/DotJEM.Json.Index2.QueryParsers.Test/Class1.cs create mode 100644 src/DotJEM.Json.Index2.QueryParsers.Test/DotJEM.Json.Index2.QueryParsers.Test.csproj create mode 100644 src/DotJEM.Json.Index2.QueryParsers/IndexQueryParserExtensions.cs delete mode 100644 src/DotJEM.Json.Index2.QueryParsers/IndexSearcherExtensions.cs create mode 100644 src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanQuery.cs create mode 100644 src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanScorer.cs create mode 100644 src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanScorer2.cs create mode 100644 src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ConjunctionScorer.cs create mode 100644 src/DotJEM.Json.Index2.QueryParsers/Query/Copy/DisjunctionScorer.cs create mode 100644 src/DotJEM.Json.Index2.QueryParsers/Query/Copy/DisjunctionSumScorer.cs create mode 100644 src/DotJEM.Json.Index2.QueryParsers/Query/Copy/MinShouldMatchSumScorer.cs create mode 100644 src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ReqExclScorer.cs create mode 100644 src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ReqOptSumScorer.cs rename src/DotJEM.Json.Index2.Test/{MyFrameworkClassTest.cs => JsonIndexTest.cs} (100%) diff --git a/src/DotJEM.Json.Index2.Contexts.Test/LuceneIndexContextTest.cs b/src/DotJEM.Json.Index2.Contexts.Test/LuceneIndexContextTest.cs index 1995492..70b7f34 100644 --- a/src/DotJEM.Json.Index2.Contexts.Test/LuceneIndexContextTest.cs +++ b/src/DotJEM.Json.Index2.Contexts.Test/LuceneIndexContextTest.cs @@ -17,20 +17,22 @@ public async Task SayHello_ReturnsHello() { IJsonIndexContextBuilder builder = new JsonIndexContextBuilder(); builder - .ByDefault(x => x.UsingMemmoryStorage().Build()); + .ByDefault(x => x + .UsingMemmoryStorage() + .WithAnalyzer(cfg => new StandardAnalyzer(cfg.Version)) + .WithFieldResolver(new FieldResolver("uuid", "type")) + .Build()); builder - .For("IndexName", b => b.UsingStorage(new RamJsonIndexStorage()).Build()); - - + .For("IndexName", x => x + .UsingMemmoryStorage() + .WithAnalyzer(cfg => new StandardAnalyzer(cfg.Version)) + .WithFieldResolver(new FieldResolver("uuid", "type")) + .Build()); IJsonIndexContext context = builder.Build(); context.Open("IndexName"); - IJsonIndex index = new JsonIndexBuilder("myIndex") - .UsingMemmoryStorage() - .WithAnalyzer(cfg => new StandardAnalyzer(cfg.Version)) - .WithFieldResolver(new FieldResolver("uuid", "type")) - .Build(); + IJsonIndex index = 
context.Open("IndexName"); IJsonIndexWriter writer = index.CreateWriter(); writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAR" })); diff --git a/src/DotJEM.Json.Index2.Contexts/LuceneIndexContext.cs b/src/DotJEM.Json.Index2.Contexts/LuceneIndexContext.cs index bc231e0..865db42 100644 --- a/src/DotJEM.Json.Index2.Contexts/LuceneIndexContext.cs +++ b/src/DotJEM.Json.Index2.Contexts/LuceneIndexContext.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Concurrent; +using System.Collections.Generic; using DotJEM.Json.Index2.Configuration; using DotJEM.Json.Index2.Contexts.Searching; using DotJEM.Json.Index2.Contexts.Storage; @@ -14,7 +15,7 @@ public interface IJsonIndexContext : IJsonIndexSearcherProvider public class JsonIndexContext : IJsonIndexContext { - private readonly ILuceneJsonIndexFactory factory; + private readonly IJsonIndexFactory factory; private readonly ConcurrentDictionary indices = new ConcurrentDictionary(); //public IServiceResolver Services { get; } @@ -24,7 +25,7 @@ public class JsonIndexContext : IJsonIndexContext //public JsonIndexContext(string path, IServiceCollection services = null) // : this(new LuceneIndexContextBuilder(path), services) { } - public JsonIndexContext(ILuceneJsonIndexFactory factory) + public JsonIndexContext(IJsonIndexFactory factory) { this.factory = factory ?? throw new ArgumentNullException(nameof(factory)); //this.Services = resolver ?? throw new ArgumentNullException(nameof(resolver)); @@ -40,11 +41,6 @@ public IJsonIndexSearcher CreateSearcher() return new LuceneJsonMultiIndexSearcher(indices.Values); } } - -public interface ILuceneJsonIndexFactory -{ - IJsonIndex Create(string name); -} public interface IJsonIndexContextBuilder { @@ -64,12 +60,42 @@ public IJsonIndexContextBuilder ByDefault(Func de public IJsonIndexContextBuilder For(string name, Func defaultConfig) { + if (name == null) throw new ArgumentNullException(nameof(name)); + if (defaultConfig == null) throw new ArgumentNullException(nameof(defaultConfig)); + if (name is "*" or "") throw new ArgumentException("Invalid name for an index.", nameof(name)); + configurators.AddOrUpdate(name, s => defaultConfig, (s, func) => defaultConfig); return this; } public IJsonIndexContext Build() { - return new JsonIndexContext(null); + return new JsonIndexContext(new JsonIndexFactory(new Dictionary>(configurators))); + } +} + +public interface IJsonIndexFactory +{ + IJsonIndex Create(string name); +} + +public class JsonIndexFactory : IJsonIndexFactory +{ + private readonly IReadOnlyDictionary> configurators; + + public JsonIndexFactory(IReadOnlyDictionary> configurators) + { + this.configurators = configurators; + } + + public IJsonIndex Create(string name) + { + if (configurators.TryGetValue(name, out Func func)) + return func(new JsonIndexBuilder(name)); + + if(configurators.TryGetValue("*", out func)) + return func(new JsonIndexBuilder(name)); + + throw new InvalidOperationException(""); } } \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers.Test/Class1.cs b/src/DotJEM.Json.Index2.QueryParsers.Test/Class1.cs new file mode 100644 index 0000000..315d306 --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers.Test/Class1.cs @@ -0,0 +1,87 @@ +using DotJEM.Json.Index2.Documents.Fields; +using DotJEM.Json.Index2.IO; +using DotJEM.Json.Index2.QueryParsers.Query; +using DotJEM.Json.Index2.Results; +using DotJEM.Json.Index2.Searching; +using Lucene.Net.Analysis.Standard; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Newtonsoft.Json; 
+using Newtonsoft.Json.Linq; +using NUnit.Framework; + +namespace DotJEM.Json.Index2.QueryParsers.Test; + +public class JsonIndexTest +{ + [Test] + public async Task SayHello_ReturnsHello() + { + IJsonIndex index = new JsonIndexBuilder("myIndex") + .UsingMemmoryStorage() + .WithAnalyzer(cfg => new StandardAnalyzer(cfg.Version)) + .WithFieldResolver(new FieldResolver("uuid", "type")) + .UseSimplifiedLuceneQueryParser() + .Build(); + + IJsonIndexWriter writer = index.CreateWriter(); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAR" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAR" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAR" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAR" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAR" })); + writer.Commit(); + + IJsonIndexSearcher? searcher = index.CreateSearcher(); + int count = searcher.Search("type:CAR").Count(); + //int count = searcher.Search(new MatchAllDocsQuery()).Count(); + Assert.AreEqual(5, count); + } + [Test] + public async Task SayHello_ReturnsHell2o() + { + IJsonIndex index = new JsonIndexBuilder("myIndex") + .UsingMemmoryStorage() + .WithAnalyzer(cfg => new StandardAnalyzer(cfg.Version)) + .WithFieldResolver(new FieldResolver("uuid", "type")) + .UseSimplifiedLuceneQueryParser() + .Build(); + + IJsonIndexWriter writer = index.CreateWriter(); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "AXE" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "AXE" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "AXE" })); + + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAR" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAR" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAR" })); + + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "FAT" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "FAT" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "FAT" })); + + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAT" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAT" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "CAT" })); + + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "HAT" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "HAT" })); + writer.Create(JObject.FromObject(new { uuid = Guid.NewGuid(), type = "HAT" })); + writer.Commit(); + + + IJsonIndexSearcher? searcher = index.CreateSearcher(); + //new TermQuery(new Term("type", "AXE")) + + //ISearch? search = searcher.Search(new InQuery("type", "car", "foo", "fat")); + ISearch? 
search = searcher.Search("type IN (car, foo, fat)"); + //int count = searcher.Search(new MatchAllDocsQuery()).Count(); + + foreach (SearchResult result in search.Take(100).Execute()) + { + Console.Write(result.Data.ToString(Formatting.None)); + } + + Assert.That(search.Count(), Is.EqualTo(6)); + } +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers.Test/DotJEM.Json.Index2.QueryParsers.Test.csproj b/src/DotJEM.Json.Index2.QueryParsers.Test/DotJEM.Json.Index2.QueryParsers.Test.csproj new file mode 100644 index 0000000..522770a --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers.Test/DotJEM.Json.Index2.QueryParsers.Test.csproj @@ -0,0 +1,25 @@ + + + + + + net6.0 + enable + enable + False + False + False + + + + + + + + + + + + + + diff --git a/src/DotJEM.Json.Index2.QueryParsers/IndexQueryParserExtensions.cs b/src/DotJEM.Json.Index2.QueryParsers/IndexQueryParserExtensions.cs new file mode 100644 index 0000000..1268f01 --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers/IndexQueryParserExtensions.cs @@ -0,0 +1,34 @@ +using System; +using DotJEM.Json.Index2.Configuration; +using DotJEM.Json.Index2.Documents.Info; +using DotJEM.Json.Index2.Results; +using DotJEM.Json.Index2.Searching; +using Lucene.Net.Analysis; + +namespace DotJEM.Json.Index2.QueryParsers; + +public static class IndexQueryParserExtensions +{ + public static IJsonIndexBuilder UseSimplifiedLuceneQueryParser(this IJsonIndexBuilder self) + => self.TryWithService(config=>new SimplifiedLuceneQueryParser(config.FieldInformationManager, config.Analyzer)); + + public static ISearch Search(this IJsonIndexSearcher self, string query) + { + ILuceneQueryParser parser = self.Index.ResolveParser(); + LuceneQueryInfo queryInfo = parser.Parse(query); + return self.Search(queryInfo.Query).OrderBy(queryInfo.Sort); + } + + public static ISearch Search(this IJsonIndex self, string query) + { + ILuceneQueryParser parser = self.ResolveParser(); + LuceneQueryInfo queryInfo = parser.Parse(query); + return self.CreateSearcher().Search(queryInfo.Query).OrderBy(queryInfo.Sort); + } + + private static ILuceneQueryParser ResolveParser(this IJsonIndex self) + { + return self.Configuration.Get() ?? 
throw new Exception("Query parser not configured."); + } + +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/IndexSearcherExtensions.cs b/src/DotJEM.Json.Index2.QueryParsers/IndexSearcherExtensions.cs deleted file mode 100644 index 7682211..0000000 --- a/src/DotJEM.Json.Index2.QueryParsers/IndexSearcherExtensions.cs +++ /dev/null @@ -1,35 +0,0 @@ -using System; -using DotJEM.Json.Index2.Configuration; -using DotJEM.Json.Index2.Documents.Info; -using DotJEM.Json.Index2.Results; -using DotJEM.Json.Index2.Searching; -using Lucene.Net.Analysis; - -namespace DotJEM.Json.Index2.QueryParsers -{ - public static class IndexSearcherExtensions - { - public static IJsonIndexBuilder UseSimplifiedLuceneQueryParser(this IJsonIndexBuilder self) - => self.TryWithService(config=>new SimplifiedLuceneQueryParser(config.FieldInformationManager, config.Analyzer)); - - public static ISearch Search(this IJsonIndexSearcher self, string query) - { - ILuceneQueryParser parser = self.Index.ResolveParser(); - LuceneQueryInfo queryInfo = parser.Parse(query); - return self.Search(queryInfo.Query).OrderBy(queryInfo.Sort); - } - - public static ISearch Search(this IJsonIndex self, string query) - { - ILuceneQueryParser parser = self.ResolveParser(); - LuceneQueryInfo queryInfo = parser.Parse(query); - return self.CreateSearcher().Search(queryInfo.Query).OrderBy(queryInfo.Sort); - } - - private static ILuceneQueryParser ResolveParser(this IJsonIndex self) - { - return self.Configuration.Get() ?? throw new Exception("Query parser not configured."); - } - - } -} diff --git a/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanQuery.cs b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanQuery.cs new file mode 100644 index 0000000..34afacb --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanQuery.cs @@ -0,0 +1,483 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Text; +using J2N; +using J2N.Collections.Generic.Extensions; +using Lucene.Net.Search; +using Lucene.Net.Util; +using JCG = J2N.Collections.Generic; +using LuceneQuery = Lucene.Net.Search.Query; + +namespace DotJEM.Json.Index2.QueryParsers.Query.Copy; + +using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext; +using IBits = IBits; +using IndexReader = Lucene.Net.Index.IndexReader; +using Occur_e = Occur; +using Similarity = Lucene.Net.Search.Similarities.Similarity; +using Term = Lucene.Net.Index.Term; +using ToStringUtils = ToStringUtils; + +public sealed class CopyOfBooleanQuery : LuceneQuery +{ + private IList clauses = new List(); + private IList terms = new List(); + + public bool CoordDisabled { get; } + + public int MinimumNumberShouldMatch { get; } = 0; + + public CopyOfBooleanQuery(bool disableCoord = false) + { + this.CoordDisabled = disableCoord; + } + + //public void Add(LuceneQuery query, Occur occur) => Add(new BooleanClause(query, occur)); + //public void Add(BooleanClause clause) => clauses.Add(clause); + + public void AddTerm(Term term) => terms.Add(term); + + public sealed class CopyOfBooleanWeight : Weight + { + private readonly CopyOfBooleanQuery outerInstance; + private readonly Similarity similarity; + private readonly IList weights; + private readonly bool disableCoord; + + public int MaxCoord { get; } + + public override LuceneQuery Query => outerInstance; + + public CopyOfBooleanWeight(CopyOfBooleanQuery outerInstance, IndexSearcher searcher, bool disableCoord) + { + this.outerInstance = outerInstance; + similarity = 
searcher.Similarity; + this.disableCoord = disableCoord; + weights = new List(outerInstance.clauses.Count); + foreach (BooleanClause c in outerInstance.clauses) + { + Weight w = c.Query.CreateWeight(searcher); + weights.Add(w); + if (!c.IsProhibited) + { + MaxCoord++; + } + } + } + + public override float GetValueForNormalization() + { + float sum = 0.0f; + for (int i = 0; i < weights.Count; i++) + { + // call sumOfSquaredWeights for all clauses in case of side effects + float s = weights[i].GetValueForNormalization(); // sum sub weights + if (!outerInstance.clauses[i].IsProhibited) + { + // only add to sum for non-prohibited clauses + sum += s; + } + } + sum *= outerInstance.Boost * outerInstance.Boost; // boost each sub-weight + return sum; + } + + public float Coord(int overlap, int maxOverlap) + { + // LUCENE-4300: in most cases of maxOverlap=1, BQ rewrites itself away, + // so coord() is not applied. But when BQ cannot optimize itself away + // for a single clause (minNrShouldMatch, prohibited clauses, etc), its + // important not to apply coord(1,1) for consistency, it might not be 1.0F + return maxOverlap == 1 ? 1F : similarity.Coord(overlap, maxOverlap); + } + + public override void Normalize(float norm, float topLevelBoost) + { + topLevelBoost *= outerInstance.Boost; // incorporate boost + foreach (Weight w in weights) + { + // normalize all clauses, (even if prohibited in case of side affects) + w.Normalize(norm, topLevelBoost); + } + } + + public override Explanation Explain(AtomicReaderContext context, int doc) + { + int minShouldMatch = outerInstance.MinimumNumberShouldMatch; + ComplexExplanation sumExpl = new ComplexExplanation(); + sumExpl.Description = "sum of:"; + int coord = 0; + float sum = 0.0f; + bool fail = false; + int shouldMatchCount = 0; + + using (IEnumerator cIter = outerInstance.clauses.GetEnumerator()) + { + foreach (Weight w in weights) + { + cIter.MoveNext(); + BooleanClause c = cIter.Current; + if (w.GetScorer(context, context.AtomicReader.LiveDocs) is null) + { + if (c.IsRequired) + { + fail = true; + Explanation r = new Explanation(0.0f, "no match on required clause (" + c.Query.ToString() + ")"); + sumExpl.AddDetail(r); + } + continue; + } + Explanation e = w.Explain(context, doc); + if (e.IsMatch) + { + if (!c.IsProhibited) + { + sumExpl.AddDetail(e); + sum += e.Value; + coord++; + } + else + { + Explanation r = new Explanation(0.0f, "match on prohibited clause (" + c.Query.ToString() + ")"); + r.AddDetail(e); + sumExpl.AddDetail(r); + fail = true; + } + if (c.Occur == Occur_e.SHOULD) + { + shouldMatchCount++; + } + } + else if (c.IsRequired) + { + Explanation r = new Explanation(0.0f, "no match on required clause (" + c.Query.ToString() + ")"); + r.AddDetail(e); + sumExpl.AddDetail(r); + fail = true; + } + } + } + if (fail) + { + sumExpl.Match = false; + sumExpl.Value = 0.0f; + sumExpl.Description = "Failure to meet condition(s) of required/prohibited clause(s)"; + return sumExpl; + } + else if (shouldMatchCount < minShouldMatch) + { + sumExpl.Match = false; + sumExpl.Value = 0.0f; + sumExpl.Description = "Failure to match minimum number " + "of optional clauses: " + minShouldMatch; + return sumExpl; + } + + sumExpl.Match = 0 < coord ? true : false; + sumExpl.Value = sum; + + float coordFactor = disableCoord ? 
1.0f : Coord(coord, MaxCoord); + if (coordFactor == 1.0f) + { + return sumExpl; // eliminate wrapper + } + else + { + ComplexExplanation result = new ComplexExplanation(sumExpl.IsMatch, sum * coordFactor, "product of:"); + result.AddDetail(sumExpl); + result.AddDetail(new Explanation(coordFactor, "coord(" + coord + "/" + MaxCoord + ")")); + return result; + } + } + + public override BulkScorer GetBulkScorer(AtomicReaderContext context, bool scoreDocsInOrder, IBits acceptDocs) + { + if (scoreDocsInOrder || outerInstance.MinimumNumberShouldMatch > 1) + { + // TODO: (LUCENE-4872) in some cases BooleanScorer may be faster for minNrShouldMatch + // but the same is even true of pure conjunctions... + return base.GetBulkScorer(context, scoreDocsInOrder, acceptDocs); + } + + IList prohibited = new JCG.List(); + IList optional = new JCG.List(); + using (IEnumerator cIter = outerInstance.clauses.GetEnumerator()) + { + foreach (Weight w in weights) + { + cIter.MoveNext(); + BooleanClause c = cIter.Current; + BulkScorer subScorer = w.GetBulkScorer(context, false, acceptDocs); + if (subScorer is null) + { + if (c.IsRequired) + { + return null; + } + } + else if (c.IsRequired) + { + // TODO: there are some cases where BooleanScorer + // would handle conjunctions faster than + // BooleanScorer2... + return base.GetBulkScorer(context, scoreDocsInOrder, acceptDocs); + } + else if (c.IsProhibited) + { + prohibited.Add(subScorer); + } + else + { + optional.Add(subScorer); + } + } + } + + // Check if we can and should return a BooleanScorer + return new CopyOfBooleanScorer(this, disableCoord, outerInstance.MinimumNumberShouldMatch, optional, prohibited, MaxCoord); + } + + public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) + { + IList required = new JCG.List(); + IList prohibited = new JCG.List(); + IList optional = new JCG.List(); + IEnumerator cIter = outerInstance.clauses.GetEnumerator(); + foreach (Weight w in weights) + { + cIter.MoveNext(); + BooleanClause c = cIter.Current; + Scorer subScorer = w.GetScorer(context, acceptDocs); + if (subScorer is null) + { + if (c.IsRequired) + { + return null; + } + } + else if (c.IsRequired) + { + required.Add(subScorer); + } + else if (c.IsProhibited) + { + prohibited.Add(subScorer); + } + else + { + optional.Add(subScorer); + } + } + + // no required and optional clauses. + if (required.Count == 0 && optional.Count == 0) + return null; + + // either >1 req scorer, or there are 0 req scorers and at least 1 + // optional scorer. Therefore if there are not enough optional scorers + // no documents will be matched by the query + if (optional.Count < outerInstance.MinimumNumberShouldMatch) + return null; + + // simple conjunction + if (optional.Count == 0 && prohibited.Count == 0) + { + float coord = disableCoord + ? 1.0f + : Coord(required.Count, MaxCoord); + return new CopyOfConjunctionScorer(this, required.ToArray(), coord); + } + + // simple disjunction + if (required.Count == 0 && prohibited.Count == 0 && outerInstance.MinimumNumberShouldMatch <= 1 && optional.Count > 1) + { + var coord = new float[optional.Count + 1]; + for (int i = 0; i < coord.Length; i++) + { + coord[i] = disableCoord ? 
1.0f : Coord(i, MaxCoord); + } + return new CopyOfDisjunctionSumScorer(this, optional.ToArray(), coord); + } + + // Return a BooleanScorer2 + return new CopyOfBooleanScorer2(this, disableCoord, outerInstance.MinimumNumberShouldMatch, required, prohibited, optional, MaxCoord); + } + + public override bool ScoresDocsOutOfOrder + { + get + { + if (outerInstance.MinimumNumberShouldMatch > 1) + { + // BS2 (in-order) will be used by scorer() + return false; + } + foreach (BooleanClause c in outerInstance.clauses) + { + if (c.IsRequired) + { + // BS2 (in-order) will be used by scorer() + return false; + } + } + + // scorer() will return an out-of-order scorer if requested. + return true; + } + } + } + + public override Weight CreateWeight(IndexSearcher searcher) + { + return new CopyOfBooleanWeight(this, searcher, CoordDisabled); + } + + public override LuceneQuery Rewrite(IndexReader reader) + { + if (MinimumNumberShouldMatch == 0 && clauses.Count == 1) // optimize 1-clause queries + { + BooleanClause c = clauses[0]; + if (!c.IsProhibited) // just return clause + { + Lucene.Net.Search.Query query = c.Query.Rewrite(reader); // rewrite first + + if (Boost != 1.0f) // incorporate boost + { + if (query == c.Query) // if rewrite was no-op + { + query = (Lucene.Net.Search.Query)query.Clone(); // then clone before boost + } + // Since the BooleanQuery only has 1 clause, the BooleanQuery will be + // written out. Therefore the rewritten Query's boost must incorporate both + // the clause's boost, and the boost of the BooleanQuery itself + query.Boost = Boost * query.Boost; + } + + return query; + } + } + + CopyOfBooleanQuery clone = null; // recursively rewrite + for (int i = 0; i < clauses.Count; i++) + { + BooleanClause c = clauses[i]; + LuceneQuery query = c.Query.Rewrite(reader); + if (query != c.Query) // clause rewrote: must clone + { + if (clone is null) + { + // The BooleanQuery clone is lazily initialized so only initialize + // it if a rewritten clause differs from the original clause (and hasn't been + // initialized already). If nothing differs, the clone isn't needlessly created + clone = (CopyOfBooleanQuery)Clone(); + } + clone.clauses[i] = new BooleanClause(query, c.Occur); + } + } + return clone ?? // some clauses rewrote + this; // no clauses rewrote + } + + public override void ExtractTerms(ISet terms) + { + foreach (BooleanClause clause in clauses) + { + if (clause.Occur != Occur_e.MUST_NOT) + { + clause.Query.ExtractTerms(terms); + } + } + } + + public override object Clone() + { + CopyOfBooleanQuery clone = (CopyOfBooleanQuery)base.Clone(); + clone.clauses = new JCG.List(clauses); + return clone; + } + + /// + /// Prints a user-readable version of this query. 
+ public override string ToString(string field) + { + StringBuilder buffer = new StringBuilder(); + bool needParens = Boost != 1.0 || MinimumNumberShouldMatch > 0; + if (needParens) + { + buffer.Append('('); + } + + for (int i = 0; i < clauses.Count; i++) + { + BooleanClause c = clauses[i]; + if (c.IsProhibited) + { + buffer.Append('-'); + } + else if (c.IsRequired) + { + buffer.Append('+'); + } + + Lucene.Net.Search.Query subQuery = c.Query; + if (subQuery != null) + { + if (subQuery is BooleanQuery) // wrap sub-bools in parens + { + buffer.Append('('); + buffer.Append(subQuery.ToString(field)); + buffer.Append(')'); + } + else + { + buffer.Append(subQuery.ToString(field)); + } + } + else + { + buffer.Append("null"); + } + + if (i != clauses.Count - 1) + { + buffer.Append(' '); + } + } + + if (needParens) + { + buffer.Append(')'); + } + + if (MinimumNumberShouldMatch > 0) + { + buffer.Append('~'); + buffer.Append(MinimumNumberShouldMatch); + } + + if (Boost != 1.0f) + { + buffer.Append(ToStringUtils.Boost(Boost)); + } + + return buffer.ToString(); + } + + public override bool Equals(object o) + { + if (o is not CopyOfBooleanQuery other) + return false; + + // LUCENENET specific - compare bits rather than using equality operators to prevent these comparisons from failing in x86 in .NET Framework with optimizations enabled + return NumericUtils.SingleToSortableInt32(Boost) == NumericUtils.SingleToSortableInt32(other.Boost) + && clauses.Equals(other.clauses) + && MinimumNumberShouldMatch == other.MinimumNumberShouldMatch + && CoordDisabled == other.CoordDisabled; + } + + public override int GetHashCode() + { + return BitConversion.SingleToInt32Bits(Boost) ^ clauses.GetHashCode() + + MinimumNumberShouldMatch + (CoordDisabled ? 17 : 0); + } +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanScorer.cs b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanScorer.cs new file mode 100644 index 0000000..485fe9c --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanScorer.cs @@ -0,0 +1,300 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Lucene.Net.Search; + +namespace DotJEM.Json.Index2.QueryParsers.Query.Copy; + +using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext; +using BooleanWeight = BooleanQuery.BooleanWeight; + + +internal sealed class CopyOfBooleanScorer : BulkScorer +{ + private sealed class BooleanScorerCollector : ICollector + { + private readonly BucketTable bucketTable; // LUCENENET: marked readonly + private readonly int mask; // LUCENENET: marked readonly + private Scorer scorer; + + public BooleanScorerCollector(int mask, BucketTable bucketTable) + { + this.mask = mask; + this.bucketTable = bucketTable; + } + + public void Collect(int doc) + { + BucketTable table = bucketTable; + int i = doc & BucketTable.MASK; + Bucket bucket = table.buckets[i]; + + if (bucket.Doc != doc) // invalid bucket + { + bucket.Doc = doc; // set doc + bucket.Score = scorer.GetScore(); // initialize score + bucket.Bits = mask; // initialize mask + bucket.Coord = 1; // initialize coord + + bucket.Next = table.first; // push onto valid list + table.first = bucket; + } // valid bucket + else + { + bucket.Score += scorer.GetScore(); // increment score + bucket.Bits |= mask; // add bits in mask + bucket.Coord++; // increment coord + } + } + + public void SetNextReader(AtomicReaderContext context) + { + // not needed by this implementation + } + + public void SetScorer(Scorer scorer) + { + this.scorer = 
scorer; + } + + public bool AcceptsDocsOutOfOrder => true; + } + + internal sealed class Bucket + { + internal int Doc { get; set; } // tells if bucket is valid + internal double Score { get; set; } // incremental score + + // TODO: break out bool anyProhibited, int + // numRequiredMatched; then we can remove 32 limit on + // required clauses + internal int Bits { get; set; } // used for bool constraints + + internal int Coord { get; set; } // count of terms in score + internal Bucket Next { get; set; } // next valid bucket + + public Bucket() + { + // Initialize properties + Doc = -1; + } + } + + /// + /// A simple hash table of document scores within a range. + internal sealed class BucketTable + { + public const int SIZE = 1 << 11; + public const int MASK = SIZE - 1; + + internal readonly Bucket[] buckets = new Bucket[SIZE]; + internal Bucket first = null; // head of valid list + + public BucketTable() + { + // Pre-fill to save the lazy init when collecting + // each sub: + for (int idx = 0; idx < SIZE; idx++) + { + buckets[idx] = new Bucket(); + } + } + + public ICollector NewCollector(int mask) + { + return new BooleanScorerCollector(mask, this); + } + + public static int Count => SIZE; // LUCENENET NOTE: This was size() in Lucene. // LUCENENET: CA1822: Mark members as static + } + + internal sealed class SubScorer + { + public BulkScorer Scorer { get; set; } + + // TODO: re-enable this if BQ ever sends us required clauses + //public boolean required = false; + public bool Prohibited { get; set; } + + public ICollector Collector { get; set; } + public SubScorer Next { get; set; } + public bool More { get; set; } + + public SubScorer(BulkScorer scorer, bool required, bool prohibited, ICollector collector, SubScorer next) + { + if (required) + { + throw new ArgumentException("this scorer cannot handle required=true"); + } + Scorer = scorer; + More = true; + // TODO: re-enable this if BQ ever sends us required clauses + //this.required = required; + Prohibited = prohibited; + Collector = collector; + Next = next; + } + } + + private readonly SubScorer scorers = null; // LUCENENET: marked readonly + private readonly BucketTable bucketTable = new BucketTable(); // LUCENENET: marked readonly + private readonly float[] coordFactors; + + // TODO: re-enable this if BQ ever sends us required clauses + //private int requiredMask = 0; + private readonly int minNrShouldMatch; + + private int end; + private Bucket current; + + // Any time a prohibited clause matches we set bit 0: + private const int PROHIBITED_MASK = 1; + + //private readonly Weight weight; // LUCENENET: Never read + + internal CopyOfBooleanScorer(CopyOfBooleanQuery.CopyOfBooleanWeight weight, bool disableCoord, int minNrShouldMatch, IList optionalScorers, IList prohibitedScorers, int maxCoord) + { + this.minNrShouldMatch = minNrShouldMatch; + //this.weight = weight; // LUCENENET: Never read + + foreach (BulkScorer scorer in optionalScorers) + { + scorers = new SubScorer(scorer, false, false, bucketTable.NewCollector(0), scorers); + } + + foreach (BulkScorer scorer in prohibitedScorers) + { + scorers = new SubScorer(scorer, false, true, bucketTable.NewCollector(PROHIBITED_MASK), scorers); + } + + coordFactors = new float[optionalScorers.Count + 1]; + for (int i = 0; i < coordFactors.Length; i++) + { + coordFactors[i] = disableCoord ? 
1.0f : weight.Coord(i, maxCoord); + } + } + + public override bool Score(ICollector collector, int max) + { + bool more; + Bucket tmp; + CopyOfFakeScorer fs = new CopyOfFakeScorer(); + + // The internal loop will set the score and doc before calling collect. + collector.SetScorer(fs); + do + { + bucketTable.first = null; + + while (current != null) // more queued + { + // check prohibited & required + if ((current.Bits & PROHIBITED_MASK) == 0) + { + // TODO: re-enable this if BQ ever sends us required + // clauses + //&& (current.bits & requiredMask) == requiredMask) { + // NOTE: Lucene always passes max = + // Integer.MAX_VALUE today, because we never embed + // a BooleanScorer inside another (even though + // that should work)... but in theory an outside + // app could pass a different max so we must check + // it: + if (current.Doc >= max) + { + tmp = current; + current = current.Next; + tmp.Next = bucketTable.first; + bucketTable.first = tmp; + continue; + } + + if (current.Coord >= minNrShouldMatch) + { + fs.score = (float)(current.Score * coordFactors[current.Coord]); + fs.doc = current.Doc; + fs.freq = current.Coord; + collector.Collect(current.Doc); + } + } + + current = current.Next; // pop the queue + } + + if (bucketTable.first != null) + { + current = bucketTable.first; + bucketTable.first = current.Next; + return true; + } + + // refill the queue + more = false; + end += BucketTable.SIZE; + for (SubScorer sub = scorers; sub != null; sub = sub.Next) + { + if (sub.More) + { + sub.More = sub.Scorer.Score(sub.Collector, end); + more |= sub.More; + } + } + current = bucketTable.first; + } while (current != null || more); + + return false; + } + + public override string ToString() + { + StringBuilder buffer = new StringBuilder(); + buffer.Append("boolean("); + for (SubScorer sub = scorers; sub != null; sub = sub.Next) + { + buffer.Append(sub.Scorer.ToString()); + buffer.Append(' '); + } + buffer.Append(')'); + return buffer.ToString(); + } +} + +internal sealed class CopyOfFakeScorer : Scorer +{ + internal float score; + internal int doc = -1; + internal int freq = 1; + + public CopyOfFakeScorer() + : base(null) + { + } + + public override int Advance(int target) + { + throw new InvalidOperationException("FakeScorer doesn't support advance(int)"); + } + + public override int DocID => doc; + + public override int Freq => freq; + + public override int NextDoc() + { + throw new InvalidOperationException("FakeScorer doesn't support nextDoc()"); + } + + public override float GetScore() + { + return score; + } + + public override long GetCost() + { + return 1; + } + + public override Weight Weight => throw new InvalidOperationException(); + + public override ICollection GetChildren() => throw new InvalidOperationException(); +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanScorer2.cs b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanScorer2.cs new file mode 100644 index 0000000..c1b9524 --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/BooleanScorer2.cs @@ -0,0 +1,325 @@ +using System; +using System.Collections.Generic; +using J2N.Collections.Generic.Extensions; +using Lucene.Net.Search; +using JCG = J2N.Collections.Generic; +using CopyOfBooleanWeight = DotJEM.Json.Index2.QueryParsers.Query.Copy.CopyOfBooleanQuery.CopyOfBooleanWeight; + +namespace DotJEM.Json.Index2.QueryParsers.Query.Copy; + +internal class CopyOfBooleanScorer2 : Scorer +{ + private readonly IList requiredScorers; + private readonly IList 
optionalScorers; + private readonly IList prohibitedScorers; + + private class Coordinator + { + internal readonly float[] coordFactors; + + internal Coordinator(CopyOfBooleanScorer2 outerInstance, int maxCoord, bool disableCoord) + { + coordFactors = new float[outerInstance.optionalScorers.Count + outerInstance.requiredScorers.Count + 1]; + for (int i = 0; i < coordFactors.Length; i++) + { + coordFactors[i] = disableCoord ? 1.0f : ((CopyOfBooleanWeight)outerInstance.m_weight).Coord(i, maxCoord); + } + } + + internal int nrMatchers; // to be increased by score() of match counting scorers. + } + + private readonly Coordinator coordinator; + private readonly Scorer countingSumScorer; + private readonly int minNrShouldMatch; + private int doc = -1; + + public CopyOfBooleanScorer2(CopyOfBooleanWeight weight, bool disableCoord, int minNrShouldMatch, IList required, IList prohibited, IList optional, int maxCoord) + : base(weight) + { + if (minNrShouldMatch < 0) + { + throw new ArgumentOutOfRangeException(nameof(minNrShouldMatch), "Minimum number of optional scorers should not be negative"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) + } + this.minNrShouldMatch = minNrShouldMatch; + + optionalScorers = optional; + requiredScorers = required; + prohibitedScorers = prohibited; + coordinator = new Coordinator(this, maxCoord, disableCoord); + + countingSumScorer = MakeCountingSumScorer(/* disableCoord // LUCENENET: Not referenced */); + } + + /// + /// Count a scorer as a single match. + private class SingleMatchScorer : Scorer + { + private readonly CopyOfBooleanScorer2 outerInstance; + + internal Scorer scorer; + internal int lastScoredDoc = -1; + + // Save the score of lastScoredDoc, so that we don't compute it more than + // once in score(). 
+ internal float lastDocScore = float.NaN; + + internal SingleMatchScorer(CopyOfBooleanScorer2 outerInstance, Scorer scorer) + : base(scorer.Weight) + { + this.outerInstance = outerInstance; + this.scorer = scorer; + } + + public override float GetScore() + { + int doc = DocID; + if (doc >= lastScoredDoc) + { + if (doc > lastScoredDoc) + { + lastDocScore = scorer.GetScore(); + lastScoredDoc = doc; + } + outerInstance.coordinator.nrMatchers++; + } + return lastDocScore; + } + + public override int Freq => 1; + + public override int DocID => scorer.DocID; + + public override int NextDoc() + { + return scorer.NextDoc(); + } + + public override int Advance(int target) + { + return scorer.Advance(target); + } + + public override long GetCost() + { + return scorer.GetCost(); + } + } + + private Scorer CountingDisjunctionSumScorer(IList scorers, int minNrShouldMatch) + { + // each scorer from the list counted as a single matcher + if (minNrShouldMatch > 1) + { + return new MinShouldMatchSumScorerAnonymousClass(this, m_weight, scorers, minNrShouldMatch); + } + else + { + // we pass null for coord[] since we coordinate ourselves and override score() + return new DisjunctionSumScorerAnonymousClass(this, m_weight, scorers.ToArray(), null); + } + } + + private sealed class MinShouldMatchSumScorerAnonymousClass : CopyOfMinShouldMatchSumScorer + { + private readonly CopyOfBooleanScorer2 outerInstance; + + public MinShouldMatchSumScorerAnonymousClass(CopyOfBooleanScorer2 outerInstance, Weight weight, IList scorers, int minNrShouldMatch) + : base(weight, scorers, minNrShouldMatch) + { + this.outerInstance = outerInstance; + } + + public override float GetScore() + { + outerInstance.coordinator.nrMatchers += base.m_nrMatchers; + return base.GetScore(); + } + } + + private sealed class DisjunctionSumScorerAnonymousClass : CopyOfDisjunctionSumScorer + { + private readonly CopyOfBooleanScorer2 outerInstance; + + public DisjunctionSumScorerAnonymousClass(CopyOfBooleanScorer2 outerInstance, Weight weight, Scorer[] subScorers, float[] coord) + : base(weight, subScorers, coord) + { + this.outerInstance = outerInstance; + } + + public override float GetScore() + { + outerInstance.coordinator.nrMatchers += base.m_nrMatchers; + return (float)base.m_score; + } + } + + private Scorer CountingConjunctionSumScorer(/* bool disableCoord, // LUCENENET: Not Referenced */ IList requiredScorers) + { + // each scorer from the list counted as a single matcher + int requiredNrMatchers = requiredScorers.Count; + return new ConjunctionScorerAnonymousClass(this, m_weight, requiredScorers.ToArray(), requiredNrMatchers); + } + + private sealed class ConjunctionScorerAnonymousClass : CopyOfConjunctionScorer + { + private readonly CopyOfBooleanScorer2 outerInstance; + + private readonly int requiredNrMatchers; + + public ConjunctionScorerAnonymousClass(CopyOfBooleanScorer2 outerInstance, Weight weight, Scorer[] scorers, int requiredNrMatchers) + : base(weight, scorers) + { + this.outerInstance = outerInstance; + this.requiredNrMatchers = requiredNrMatchers; + lastScoredDoc = -1; + lastDocScore = float.NaN; + } + + private int lastScoredDoc; + + // Save the score of lastScoredDoc, so that we don't compute it more than + // once in score(). 
+ private float lastDocScore; + + public override float GetScore() + { + int doc = outerInstance.DocID; + if (doc >= lastScoredDoc) + { + if (doc > lastScoredDoc) + { + lastDocScore = base.GetScore(); + lastScoredDoc = doc; + } + outerInstance.coordinator.nrMatchers += requiredNrMatchers; + } + // All scorers match, so defaultSimilarity super.score() always has 1 as + // the coordination factor. + // Therefore the sum of the scores of the requiredScorers + // is used as score. + return lastDocScore; + } + } + + private Scorer DualConjunctionSumScorer(/* bool disableCoord, // LUCENENET: Not Referenced */ Scorer req1, Scorer req2) // non counting. + { + return new CopyOfConjunctionScorer(m_weight, new Scorer[] { req1, req2 }); + // All scorers match, so defaultSimilarity always has 1 as + // the coordination factor. + // Therefore the sum of the scores of two scorers + // is used as score. + } + + /// + /// Returns the scorer to be used for match counting and score summing. + /// Uses requiredScorers, optionalScorers and prohibitedScorers. + /// + private Scorer MakeCountingSumScorer(/* bool disableCoord // LUCENENET: Not Referenced */) // each scorer counted as a single matcher + { + return (requiredScorers.Count == 0) + ? MakeCountingSumScorerNoReq(/* disableCoord // LUCENENET: Not Referenced */) + : MakeCountingSumScorerSomeReq(/* disableCoord // LUCENENET: Not Referenced */); + } + + private Scorer MakeCountingSumScorerNoReq(/* bool disableCoord // LUCENENET: Not Referenced */) // No required scorers + { + // minNrShouldMatch optional scorers are required, but at least 1 + int nrOptRequired = (minNrShouldMatch < 1) ? 1 : minNrShouldMatch; + Scorer requiredCountingSumScorer; + if (optionalScorers.Count > nrOptRequired) + { + requiredCountingSumScorer = CountingDisjunctionSumScorer(optionalScorers, nrOptRequired); + } + else if (optionalScorers.Count == 1) + { + requiredCountingSumScorer = new SingleMatchScorer(this, optionalScorers[0]); + } + else + { + requiredCountingSumScorer = CountingConjunctionSumScorer(/* disableCoord, // LUCENENET: Not Referenced */ optionalScorers); + } + return AddProhibitedScorers(requiredCountingSumScorer); + } + + private Scorer MakeCountingSumScorerSomeReq(/* bool disableCoord // LUCENENET: Not Referenced */) // At least one required scorer. + { + if (optionalScorers.Count == minNrShouldMatch) // all optional scorers also required. + { + JCG.List allReq = new JCG.List(requiredScorers); + allReq.AddRange(optionalScorers); + return AddProhibitedScorers(CountingConjunctionSumScorer(/* disableCoord, // LUCENENET: Not Referenced */ allReq)); + } // optionalScorers.size() > minNrShouldMatch, and at least one required scorer + else + { + Scorer requiredCountingSumScorer = requiredScorers.Count == 1 ? new SingleMatchScorer(this, requiredScorers[0]) : CountingConjunctionSumScorer(/* disableCoord, // LUCENENET: Not Referenced */ requiredScorers); + if (minNrShouldMatch > 0) // use a required disjunction scorer over the optional scorers + { + return AddProhibitedScorers(DualConjunctionSumScorer(/* disableCoord, // LUCENENET: Not Referenced */ requiredCountingSumScorer, CountingDisjunctionSumScorer(optionalScorers, minNrShouldMatch))); // non counting + } // minNrShouldMatch == 0 + else + { + return new CopyOfReqOptSumScorer(AddProhibitedScorers(requiredCountingSumScorer), optionalScorers.Count == 1 ? new SingleMatchScorer(this, optionalScorers[0]) + // require 1 in combined, optional scorer. 
+ : CountingDisjunctionSumScorer(optionalScorers, 1)); + } + } + } + + /// + /// Returns the scorer to be used for match counting and score summing. + /// Uses the given required scorer and the prohibitedScorers. + /// A required scorer already built. + private Scorer AddProhibitedScorers(Scorer requiredCountingSumScorer) + { + return (prohibitedScorers.Count == 0) + ? requiredCountingSumScorer + : new CopyOfReqExclScorer(requiredCountingSumScorer, ((prohibitedScorers.Count == 1) + ? prohibitedScorers[0] + : new CopyOfMinShouldMatchSumScorer(m_weight, prohibitedScorers))); // no prohibited + } + + public override int DocID => doc; + + public override int NextDoc() + { + return doc = countingSumScorer.NextDoc(); + } + + public override float GetScore() + { + coordinator.nrMatchers = 0; + float sum = countingSumScorer.GetScore(); + return sum * coordinator.coordFactors[coordinator.nrMatchers]; + } + + public override int Freq => countingSumScorer.Freq; + + public override int Advance(int target) + { + return doc = countingSumScorer.Advance(target); + } + + public override long GetCost() + { + return countingSumScorer.GetCost(); + } + + public override ICollection GetChildren() + { + IList children = new JCG.List(); + foreach (Scorer s in optionalScorers) + { + children.Add(new ChildScorer(s, "SHOULD")); + } + foreach (Scorer s in prohibitedScorers) + { + children.Add(new ChildScorer(s, "MUST_NOT")); + } + foreach (Scorer s in requiredScorers) + { + children.Add(new ChildScorer(s, "MUST")); + } + return children; + } +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ConjunctionScorer.cs b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ConjunctionScorer.cs new file mode 100644 index 0000000..e1cd55a --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ConjunctionScorer.cs @@ -0,0 +1,163 @@ +using System.Collections.Generic; +using Lucene.Net.Search; +using JCG = J2N.Collections.Generic; + +namespace DotJEM.Json.Index2.QueryParsers.Query.Copy +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using ArrayUtil = Lucene.Net.Util.ArrayUtil; + + /// + /// Scorer for conjunctions, sets of queries, all of which are required. 
+ internal class CopyOfConjunctionScorer : Scorer + { + protected int m_lastDoc = -1; + protected readonly DocsAndFreqs[] m_docsAndFreqs; + private readonly DocsAndFreqs lead; + private readonly float coord; + + internal CopyOfConjunctionScorer(Weight weight, Scorer[] scorers) + : this(weight, scorers, 1f) + { + } + + internal CopyOfConjunctionScorer(Weight weight, Scorer[] scorers, float coord) + : base(weight) + { + this.coord = coord; + this.m_docsAndFreqs = new DocsAndFreqs[scorers.Length]; + for (int i = 0; i < scorers.Length; i++) + { + m_docsAndFreqs[i] = new DocsAndFreqs(scorers[i]); + } + // Sort the array the first time to allow the least frequent DocsEnum to + // lead the matching. + ArrayUtil.TimSort(m_docsAndFreqs, System.Collections.Generic.Comparer.Create((o1, o2) => + { + if (o1.Cost < o2.Cost) + { + return -1; + } + else if (o1.Cost > o2.Cost) + { + return 1; + } + else + { + return 0; + } + })); + + lead = m_docsAndFreqs[0]; // least frequent DocsEnum leads the intersection + } + + private int DoNext(int doc) + { + for (; ; ) + { + // doc may already be NO_MORE_DOCS here, but we don't check explicitly + // since all scorers should advance to NO_MORE_DOCS, match, then + // return that value. + for (; ; ) + { + for (int i = 1; i < m_docsAndFreqs.Length; i++) + { + // invariant: docsAndFreqs[i].doc <= doc at this point. + + // docsAndFreqs[i].doc may already be equal to doc if we "broke advanceHead" + // on the previous iteration and the advance on the lead scorer exactly matched. + if (m_docsAndFreqs[i].Doc < doc) + { + m_docsAndFreqs[i].Doc = m_docsAndFreqs[i].Scorer.Advance(doc); + + if (m_docsAndFreqs[i].Doc > doc) + { + // DocsEnum beyond the current doc - break and advance lead to the new highest doc. + doc = m_docsAndFreqs[i].Doc; + goto advanceHeadBreak; + } + } + } + // success - all DocsEnums are on the same doc + return doc; + //advanceHeadContinue:; + } + advanceHeadBreak: + // advance head for next iteration + doc = lead.Doc = lead.Scorer.Advance(doc); + } + } + + public override int Advance(int target) + { + lead.Doc = lead.Scorer.Advance(target); + return m_lastDoc = DoNext(lead.Doc); + } + + public override int DocID => m_lastDoc; + + public override int NextDoc() + { + lead.Doc = lead.Scorer.NextDoc(); + return m_lastDoc = DoNext(lead.Doc); + } + + public override float GetScore() + { + // TODO: sum into a double and cast to float if we ever send required clauses to BS1 + float sum = 0.0f; + foreach (DocsAndFreqs docs in m_docsAndFreqs) + { + sum += docs.Scorer.GetScore(); + } + return sum * coord; + } + + public override int Freq => m_docsAndFreqs.Length; + + public override long GetCost() + { + return lead.Scorer.GetCost(); + } + + public override ICollection GetChildren() + { + IList children = new JCG.List(m_docsAndFreqs.Length); + foreach (DocsAndFreqs docs in m_docsAndFreqs) + { + children.Add(new ChildScorer(docs.Scorer, "MUST")); + } + return children; + } + + internal sealed class DocsAndFreqs + { + internal long Cost { get; private set; } + internal Scorer Scorer { get; private set; } + internal int Doc { get; set; } + + internal DocsAndFreqs(Scorer scorer) + { + this.Scorer = scorer; + this.Cost = scorer.GetCost(); + this.Doc = -1; + } + } + } +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/DisjunctionScorer.cs b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/DisjunctionScorer.cs new file mode 100644 index 0000000..dea6b05 --- /dev/null +++ 
b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/DisjunctionScorer.cs @@ -0,0 +1,167 @@ +using System.Collections.Generic; +using System.Diagnostics; +using Lucene.Net.Search; +using JCG = J2N.Collections.Generic; + +namespace DotJEM.Json.Index2.QueryParsers.Query.Copy; + +internal abstract class CopyOfDisjunctionScorer : Scorer +{ + protected readonly Scorer[] m_subScorers; + + protected int m_doc = -1; + protected int m_numScorers; + + protected CopyOfDisjunctionScorer(Weight weight, Scorer[] subScorers) + : base(weight) + { + this.m_subScorers = subScorers; + this.m_numScorers = subScorers.Length; + Heapify(); + } + + /// + /// Organize subScorers into a min heap with scorers generating the earliest document on top. + /// + protected void Heapify() + { + for (int i = (m_numScorers >> 1) - 1; i >= 0; i--) + { + HeapAdjust(i); + } + } + + protected void HeapAdjust(int root) + { + Scorer scorer = m_subScorers[root]; + int doc = scorer.DocID; + int i = root; + while (i <= (m_numScorers >> 1) - 1) + { + int lchild = (i << 1) + 1; + Scorer lscorer = m_subScorers[lchild]; + int ldoc = lscorer.DocID; + int rdoc = int.MaxValue, rchild = (i << 1) + 2; + Scorer rscorer = null; + if (rchild < m_numScorers) + { + rscorer = m_subScorers[rchild]; + rdoc = rscorer.DocID; + } + if (ldoc < doc) + { + if (rdoc < ldoc) + { + m_subScorers[i] = rscorer; + m_subScorers[rchild] = scorer; + i = rchild; + } + else + { + m_subScorers[i] = lscorer; + m_subScorers[lchild] = scorer; + i = lchild; + } + } + else if (rdoc < doc) + { + m_subScorers[i] = rscorer; + m_subScorers[rchild] = scorer; + i = rchild; + } + else + { + return; + } + } + } + + protected void HeapRemoveRoot() + { + if (m_numScorers == 1) + { + m_subScorers[0] = null; + m_numScorers = 0; + } + else + { + m_subScorers[0] = m_subScorers[m_numScorers - 1]; + m_subScorers[m_numScorers - 1] = null; + --m_numScorers; + HeapAdjust(0); + } + } + + public sealed override ICollection GetChildren() + { + IList children = new JCG.List(m_numScorers); + for (int i = 0; i < m_numScorers; i++) + { + children.Add(new ChildScorer(m_subScorers[i], "SHOULD")); + } + return children; + } + + public override long GetCost() + { + long sum = 0; + for (int i = 0; i < m_numScorers; i++) + { + sum += m_subScorers[i].GetCost(); + } + return sum; + } + + public override int DocID => m_doc; + + public override int NextDoc() + { + Debug.Assert(m_doc != NO_MORE_DOCS); + while (true) + { + if (m_subScorers[0].NextDoc() != NO_MORE_DOCS) + { + HeapAdjust(0); + } + else + { + HeapRemoveRoot(); + if (m_numScorers == 0) + { + return m_doc = NO_MORE_DOCS; + } + } + if (m_subScorers[0].DocID != m_doc) + { + AfterNext(); + return m_doc; + } + } + } + + public override int Advance(int target) + { + while (true) + { + if (m_subScorers[0].Advance(target) != NO_MORE_DOCS) + { + HeapAdjust(0); + } + else + { + HeapRemoveRoot(); + if (m_numScorers == 0) + { + return m_doc = NO_MORE_DOCS; + } + } + if (m_subScorers[0].DocID >= target) + { + AfterNext(); + return m_doc; + } + } + } + + protected abstract void AfterNext(); +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/DisjunctionSumScorer.cs b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/DisjunctionSumScorer.cs new file mode 100644 index 0000000..08b05c9 --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/DisjunctionSumScorer.cs @@ -0,0 +1,90 @@ +using System; +using Lucene.Net.Search; + +namespace DotJEM.Json.Index2.QueryParsers.Query.Copy +{ + /* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// A for OR like queries, counterpart of . + /// This implements and uses Advance() on the given s. + /// + internal class CopyOfDisjunctionSumScorer : CopyOfDisjunctionScorer + { + /// + /// The number of subscorers that provide the current match. + protected internal int m_nrMatchers = -1; + + protected internal double m_score = float.NaN; + private readonly float[] coord; + + /// + /// Construct a . + /// The weight to be used. + /// Array of at least two subscorers. + /// Table of coordination factors + internal CopyOfDisjunctionSumScorer(Weight weight, Scorer[] subScorers, float[] coord) + : base(weight, subScorers) + { + if (m_numScorers <= 1) + { + throw new ArgumentException("There must be at least 2 subScorers"); + } + this.coord = coord; + } + + protected override void AfterNext() + { + Scorer sub = m_subScorers[0]; + m_doc = sub.DocID; + if (m_doc != NO_MORE_DOCS) + { + m_score = sub.GetScore(); + m_nrMatchers = 1; + CountMatches(1); + CountMatches(2); + } + } + + // TODO: this currently scores, but so did the previous impl + // TODO: remove recursion. + // TODO: if we separate scoring, out of here, + // then change freq() to just always compute it from scratch + private void CountMatches(int root) + { + if (root < m_numScorers && m_subScorers[root].DocID == m_doc) + { + m_nrMatchers++; + m_score += m_subScorers[root].GetScore(); + CountMatches((root << 1) + 1); + CountMatches((root << 1) + 2); + } + } + + /// + /// Returns the score of the current document matching the query. + /// Initially invalid, until is called the first time. + /// + public override float GetScore() + { + return (float)m_score * coord[m_nrMatchers]; + } + + public override int Freq => m_nrMatchers; + } +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/MinShouldMatchSumScorer.cs b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/MinShouldMatchSumScorer.cs new file mode 100644 index 0000000..52b64e2 --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/MinShouldMatchSumScorer.cs @@ -0,0 +1,480 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using J2N; +using J2N.Collections.Generic.Extensions; +using Lucene.Net.Search; +using JCG = J2N.Collections.Generic; + +namespace DotJEM.Json.Index2.QueryParsers.Query.Copy +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using ArrayUtil = Lucene.Net.Util.ArrayUtil; + + /// + /// A for OR like queries, counterpart of . + /// This implements and uses Advance() on the given s. + /// + /// This implementation uses the minimumMatch constraint actively to efficiently + /// prune the number of candidates, it is hence a mixture between a pure + /// and a . + /// + internal class CopyOfMinShouldMatchSumScorer : Scorer + { + /// + /// The overall number of non-finalized scorers + private int numScorers; + + /// + /// The minimum number of scorers that should match + private readonly int mm; + + /// + /// A static array of all subscorers sorted by decreasing cost + private readonly Scorer[] sortedSubScorers; + + /// + /// A monotonically increasing index into the array pointing to the next subscorer that is to be excluded + private int sortedSubScorersIdx = 0; + + private readonly Scorer[] subScorers; // the first numScorers-(mm-1) entries are valid + private int nrInHeap; // 0..(numScorers-(mm-1)-1) + + /// + /// mmStack is supposed to contain the most costly subScorers that still did + /// not run out of docs, sorted by increasing sparsity of docs returned by that subScorer. + /// For now, the cost of subscorers is assumed to be inversely correlated with sparsity. + /// + private readonly Scorer[] mmStack; // of size mm-1: 0..mm-2, always full + + /// + /// The document number of the current match. + private int doc = -1; + + /// + /// The number of subscorers that provide the current match. + protected int m_nrMatchers = -1; + + private double score = float.NaN; + + /// + /// Construct a . + /// + /// The weight to be used. + /// A collection of at least two subscorers. + /// The positive minimum number of subscorers that should + /// match to match this query. + /// When is bigger than + /// the number of , no matches will be produced. + /// When equals the number of , + /// it is more efficient to use . 
+ public CopyOfMinShouldMatchSumScorer(Weight weight, IList subScorers, int minimumNrMatchers) + : base(weight) + { + this.nrInHeap = this.numScorers = subScorers.Count; + + if (minimumNrMatchers <= 0) + { + throw new ArgumentOutOfRangeException(nameof(minimumNrMatchers), "Minimum nr of matchers must be positive"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) + } + if (numScorers <= 1) + { + throw new ArgumentOutOfRangeException(nameof(numScorers), "There must be at least 2 subScorers"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) + } + + this.mm = minimumNrMatchers; + this.sortedSubScorers = subScorers.ToArray(); + // sorting by decreasing subscorer cost should be inversely correlated with + // next docid (assuming costs are due to generating many postings) + ArrayUtil.TimSort(sortedSubScorers, System.Collections.Generic.Comparer.Create((o1, o2) => (o2.GetCost() - o1.GetCost()).Signum())); + // take mm-1 most costly subscorers aside + this.mmStack = new Scorer[mm - 1]; + for (int i = 0; i < mm - 1; i++) + { + mmStack[i] = sortedSubScorers[i]; + } + nrInHeap -= mm - 1; + this.sortedSubScorersIdx = mm - 1; + // take remaining into heap, if any, and heapify + this.subScorers = new Scorer[nrInHeap]; + for (int i = 0; i < nrInHeap; i++) + { + this.subScorers[i] = this.sortedSubScorers[mm - 1 + i]; + } + MinheapHeapify(); + Debug.Assert(MinheapCheck()); + } + + /// + /// Construct a , using one as the minimum number + /// of matching . + /// + public CopyOfMinShouldMatchSumScorer(Weight weight, IList subScorers) + : this(weight, subScorers, 1) + { + } + + public override sealed ICollection GetChildren() + { + IList children = new JCG.List(numScorers); + for (int i = 0; i < numScorers; i++) + { + children.Add(new ChildScorer(subScorers[i], "SHOULD")); + } + return children; + } + + public override int NextDoc() + { + Debug.Assert(doc != NO_MORE_DOCS); + while (true) + { + // to remove current doc, call next() on all subScorers on current doc within heap + while (subScorers[0].DocID == doc) + { + if (subScorers[0].NextDoc() != NO_MORE_DOCS) + { + MinheapSiftDown(0); + } + else + { + MinheapRemoveRoot(); + numScorers--; + if (numScorers < mm) + { + return doc = NO_MORE_DOCS; + } + } + //assert minheapCheck(); + } + + EvaluateSmallestDocInHeap(); + + if (m_nrMatchers >= mm) // doc satisfies mm constraint + { + break; + } + } + return doc; + } + + private void EvaluateSmallestDocInHeap() + { + // within heap, subScorer[0] now contains the next candidate doc + doc = subScorers[0].DocID; + if (doc == NO_MORE_DOCS) + { + m_nrMatchers = int.MaxValue; // stop looping + return; + } + // 1. score and count number of matching subScorers within heap + score = subScorers[0].GetScore(); + m_nrMatchers = 1; + CountMatches(1); + CountMatches(2); + // 2. 
score and count number of matching subScorers within stack, + // short-circuit: stop when mm can't be reached for current doc, then perform on heap next() + // TODO instead advance() might be possible, but complicates things + for (int i = mm - 2; i >= 0; i--) // first advance sparsest subScorer + { + if (mmStack[i].DocID >= doc || mmStack[i].Advance(doc) != NO_MORE_DOCS) + { + if (mmStack[i].DocID == doc) // either it was already on doc, or got there via advance() + { + m_nrMatchers++; + score += mmStack[i].GetScore(); + } // scorer advanced to next after doc, check if enough scorers left for current doc + else + { + if (m_nrMatchers + i < mm) // too few subScorers left, abort advancing + { + return; // continue looping TODO consider advance() here + } + } + } // subScorer exhausted + else + { + numScorers--; + if (numScorers < mm) // too few subScorers left + { + doc = NO_MORE_DOCS; + m_nrMatchers = int.MaxValue; // stop looping + return; + } + if (mm - 2 - i > 0) + { + // shift RHS of array left + Array.Copy(mmStack, i + 1, mmStack, i, mm - 2 - i); + } + // find next most costly subScorer within heap TODO can this be done better? + while (!MinheapRemove(sortedSubScorers[sortedSubScorersIdx++])) + { + //assert minheapCheck(); + } + // add the subScorer removed from heap to stack + mmStack[mm - 2] = sortedSubScorers[sortedSubScorersIdx - 1]; + + if (m_nrMatchers + i < mm) // too few subScorers left, abort advancing + { + return; // continue looping TODO consider advance() here + } + } + } + } + + // TODO: this currently scores, but so did the previous impl + // TODO: remove recursion. + // TODO: consider separating scoring out of here, then modify this + // and afterNext() to terminate when nrMatchers == minimumNrMatchers + // then also change freq() to just always compute it from scratch + private void CountMatches(int root) + { + if (root < nrInHeap && subScorers[root].DocID == doc) + { + m_nrMatchers++; + score += subScorers[root].GetScore(); + CountMatches((root << 1) + 1); + CountMatches((root << 1) + 2); + } + } + + /// + /// Returns the score of the current document matching the query. Initially + /// invalid, until is called the first time. + /// + public override float GetScore() + { + return (float)score; + } + + public override int DocID => doc; + + public override int Freq => m_nrMatchers; + + /// + /// Advances to the first match beyond the current whose document number is + /// greater than or equal to a given target. + /// + /// The implementation uses the Advance() method on the subscorers. + /// + /// The target document number. + /// The document whose number is greater than or equal to the given + /// target, or -1 if none exist. 
+ public override int Advance(int target) + { + if (numScorers < mm) + { + return doc = NO_MORE_DOCS; + } + // advance all Scorers in heap at smaller docs to at least target + while (subScorers[0].DocID < target) + { + if (subScorers[0].Advance(target) != NO_MORE_DOCS) + { + MinheapSiftDown(0); + } + else + { + MinheapRemoveRoot(); + numScorers--; + if (numScorers < mm) + { + return doc = NO_MORE_DOCS; + } + } + //assert minheapCheck(); + } + + EvaluateSmallestDocInHeap(); + + if (m_nrMatchers >= mm) + { + return doc; + } + else + { + return NextDoc(); + } + } + + public override long GetCost() + { + // cost for merging of lists analog to DisjunctionSumScorer + long costCandidateGeneration = 0; + for (int i = 0; i < nrInHeap; i++) + { + costCandidateGeneration += subScorers[i].GetCost(); + } + // TODO is cost for advance() different to cost for iteration + heap merge + // and how do they compare overall to pure disjunctions? + const float c1 = 1.0f, c2 = 1.0f; // maybe a constant, maybe a proportion between costCandidateGeneration and sum(subScorer_to_be_advanced.cost())? + return (long)(c1 * costCandidateGeneration + c2 * costCandidateGeneration * (mm - 1)); // advance() cost - heap-merge cost + } + + /// + /// Organize into a min heap with scorers generating the earliest document on top. + /// + protected void MinheapHeapify() + { + for (int i = (nrInHeap >> 1) - 1; i >= 0; i--) + { + MinheapSiftDown(i); + } + } + + /// + /// The subtree of at root is a min heap except possibly for its root element. + /// Bubble the root down as required to make the subtree a heap. + /// + protected void MinheapSiftDown(int root) + { + // TODO could this implementation also move rather than swapping neighbours? + Scorer scorer = subScorers[root]; + int doc = scorer.DocID; + int i = root; + while (i <= (nrInHeap >> 1) - 1) + { + int lchild = (i << 1) + 1; + Scorer lscorer = subScorers[lchild]; + int ldoc = lscorer.DocID; + int rdoc = int.MaxValue, rchild = (i << 1) + 2; + Scorer rscorer = null; + if (rchild < nrInHeap) + { + rscorer = subScorers[rchild]; + rdoc = rscorer.DocID; + } + if (ldoc < doc) + { + if (rdoc < ldoc) + { + subScorers[i] = rscorer; + subScorers[rchild] = scorer; + i = rchild; + } + else + { + subScorers[i] = lscorer; + subScorers[lchild] = scorer; + i = lchild; + } + } + else if (rdoc < doc) + { + subScorers[i] = rscorer; + subScorers[rchild] = scorer; + i = rchild; + } + else + { + return; + } + } + } + + protected void MinheapSiftUp(int i) + { + Scorer scorer = subScorers[i]; + int doc = scorer.DocID; + // find right place for scorer + while (i > 0) + { + int parent = (i - 1) >> 1; + Scorer pscorer = subScorers[parent]; + int pdoc = pscorer.DocID; + if (pdoc > doc) // move root down, make space + { + subScorers[i] = subScorers[parent]; + i = parent; + } // done, found right place + else + { + break; + } + } + subScorers[i] = scorer; + } + + /// + /// Remove the root from and re-establish it as a heap + /// + protected void MinheapRemoveRoot() + { + if (nrInHeap == 1) + { + //subScorers[0] = null; // not necessary + nrInHeap = 0; + } + else + { + nrInHeap--; + subScorers[0] = subScorers[nrInHeap]; + //subScorers[nrInHeap] = null; // not necessary + MinheapSiftDown(0); + } + } + + /// + /// Removes a given from the heap by placing end of heap at that + /// position and bubbling it either up or down + /// + protected bool MinheapRemove(Scorer scorer) + { + // find scorer: O(nrInHeap) + for (int i = 0; i < nrInHeap; i++) + { + if (subScorers[i] == scorer) // remove scorer + { + 
subScorers[i] = subScorers[--nrInHeap]; + //if (i != nrInHeap) subScorers[nrInHeap] = null; // not necessary + MinheapSiftUp(i); + MinheapSiftDown(i); + return true; + } + } + return false; // scorer already exhausted + } + + // LUCENENET specific - S1699 - marked non-virtual because calling virtual members + // from the constructor is not a safe operation in .NET + private bool MinheapCheck() + { + return MinheapCheck(0); + } + + private bool MinheapCheck(int root) + { + if (root >= nrInHeap) + { + return true; + } + int lchild = (root << 1) + 1; + int rchild = (root << 1) + 2; + if (lchild < nrInHeap && subScorers[root].DocID > subScorers[lchild].DocID) + { + return false; + } + if (rchild < nrInHeap && subScorers[root].DocID > subScorers[rchild].DocID) + { + return false; + } + return MinheapCheck(lchild) && MinheapCheck(rchild); + } + } +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ReqExclScorer.cs b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ReqExclScorer.cs new file mode 100644 index 0000000..b6fe6a3 --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ReqExclScorer.cs @@ -0,0 +1,146 @@ +using System.Collections.Generic; +using Lucene.Net.Search; + +namespace DotJEM.Json.Index2.QueryParsers.Query.Copy +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// A for queries with a required subscorer + /// and an excluding (prohibited) sub . + /// + /// This implements , + /// and it uses the SkipTo() on the given scorers. + /// + internal class CopyOfReqExclScorer : Scorer + { + private Scorer reqScorer; + private DocIdSetIterator exclDisi; + private int doc = -1; + + /// + /// Construct a . + /// The scorer that must match, except where + /// Indicates exclusion. + public CopyOfReqExclScorer(Scorer reqScorer, DocIdSetIterator exclDisi) + : base(reqScorer.Weight) + { + this.reqScorer = reqScorer; + this.exclDisi = exclDisi; + } + + public override int NextDoc() + { + if (reqScorer is null) + { + return doc; + } + doc = reqScorer.NextDoc(); + if (doc == NO_MORE_DOCS) + { + reqScorer = null; // exhausted, nothing left + return doc; + } + if (exclDisi is null) + { + return doc; + } + return doc = ToNonExcluded(); + } + + /// + /// Advance to non excluded doc. + /// On entry: + /// + /// reqScorer != null, + /// exclScorer != null, + /// reqScorer was advanced once via Next() or SkipTo() + /// and reqScorer.Doc may still be excluded. + /// + /// Advances reqScorer a non excluded required doc, if any. + /// true if there is a non excluded required doc. + private int ToNonExcluded() + { + int exclDoc = exclDisi.DocID; + int reqDoc = reqScorer.DocID; // may be excluded + do + { + if (reqDoc < exclDoc) + { + return reqDoc; // reqScorer advanced to before exclScorer, ie. 
not excluded + } + else if (reqDoc > exclDoc) + { + exclDoc = exclDisi.Advance(reqDoc); + if (exclDoc == NO_MORE_DOCS) + { + exclDisi = null; // exhausted, no more exclusions + return reqDoc; + } + if (exclDoc > reqDoc) + { + return reqDoc; // not excluded + } + } + } while ((reqDoc = reqScorer.NextDoc()) != NO_MORE_DOCS); + reqScorer = null; // exhausted, nothing left + return NO_MORE_DOCS; + } + + public override int DocID => doc; + + /// + /// Returns the score of the current document matching the query. + /// Initially invalid, until is called the first time. + /// The score of the required scorer. + public override float GetScore() + { + return reqScorer.GetScore(); // reqScorer may be null when next() or skipTo() already return false + } + + public override int Freq => reqScorer.Freq; + + public override ICollection GetChildren() + { + return new[] { new ChildScorer(reqScorer, "FILTERED") }; + } + + public override int Advance(int target) + { + if (reqScorer is null) + { + return doc = NO_MORE_DOCS; + } + if (exclDisi is null) + { + return doc = reqScorer.Advance(target); + } + if (reqScorer.Advance(target) == NO_MORE_DOCS) + { + reqScorer = null; + return doc = NO_MORE_DOCS; + } + return doc = ToNonExcluded(); + } + + public override long GetCost() + { + return reqScorer.GetCost(); + } + } +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ReqOptSumScorer.cs b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ReqOptSumScorer.cs new file mode 100644 index 0000000..aa92bb6 --- /dev/null +++ b/src/DotJEM.Json.Index2.QueryParsers/Query/Copy/ReqOptSumScorer.cs @@ -0,0 +1,116 @@ +using System.Collections.Generic; +using System.Diagnostics; +using Lucene.Net.Search; +using JCG = J2N.Collections.Generic; + +namespace DotJEM.Json.Index2.QueryParsers.Query.Copy +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// A for queries with a required part and an optional part. + /// Delays SkipTo() on the optional part until a GetScore() is needed. + /// + /// This implements . + /// + internal class CopyOfReqOptSumScorer : Scorer + { + /// + /// The scorers passed from the constructor. + /// These are set to null as soon as their Next() or SkipTo() returns false. + /// + private readonly Scorer reqScorer; // LUCENENET: marked readonly + + private Scorer optScorer; + + /// + /// Construct a . + /// The required scorer. This must match. + /// The optional scorer. This is used for scoring only. 
+ public CopyOfReqOptSumScorer(Scorer reqScorer, Scorer optScorer) + : base(reqScorer.Weight) + { + Debug.Assert(reqScorer != null); + Debug.Assert(optScorer != null); + + this.reqScorer = reqScorer; + this.optScorer = optScorer; + } + + public override int NextDoc() + { + return reqScorer.NextDoc(); + } + + public override int Advance(int target) + { + return reqScorer.Advance(target); + } + + public override int DocID => reqScorer.DocID; + + /// + /// Returns the score of the current document matching the query. + /// Initially invalid, until is called the first time. + /// The score of the required scorer, eventually increased by the score + /// of the optional scorer when it also matches the current document. + public override float GetScore() + { + // TODO: sum into a double and cast to float if we ever send required clauses to BS1 + int curDoc = reqScorer.DocID; + float reqScore = reqScorer.GetScore(); + if (optScorer is null) + { + return reqScore; + } + + int optScorerDoc = optScorer.DocID; + if (optScorerDoc < curDoc && (optScorerDoc = optScorer.Advance(curDoc)) == NO_MORE_DOCS) + { + optScorer = null; + return reqScore; + } + + return optScorerDoc == curDoc ? reqScore + optScorer.GetScore() : reqScore; + } + + public override int Freq + { + get + { + // we might have deferred advance() + GetScore(); + return (optScorer != null && optScorer.DocID == reqScorer.DocID) ? 2 : 1; + } + } + + public override ICollection GetChildren() + { + return new JCG.List(2) + { + new ChildScorer(reqScorer, "MUST"), + new ChildScorer(optScorer, "SHOULD") + }; + } + + public override long GetCost() + { + return reqScorer.GetCost(); + } + } +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/Query/InQuery.cs b/src/DotJEM.Json.Index2.QueryParsers/Query/InQuery.cs index d5a11d3..947ca42 100644 --- a/src/DotJEM.Json.Index2.QueryParsers/Query/InQuery.cs +++ b/src/DotJEM.Json.Index2.QueryParsers/Query/InQuery.cs @@ -1,195 +1,101 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Text; using Lucene.Net.Index; using Lucene.Net.Search; -using Lucene.Net.Search.Similarities; using Lucene.Net.Util; -using LuceneQuery = Lucene.Net.Search.Query; -namespace DotJEM.Json.Index2.QueryParsers.Query -{ - public class InQuery : LuceneQuery - { - // Important resources: - // https://events.static.linuxfound.org/sites/events/files/slides/CustomLuceneQueries.pdf - // https://github.com/sing1ee/lucene-custom-query - // https://github.com/o19s/lucene-query-example/blob/master/src/main/java/com/o19s/BackwardsTermQuery.java - // https://opensourceconnections.com/blog/2014/01/20/build-your-own-custom-lucene-query-and-scorer/ - +namespace DotJEM.Json.Index2.QueryParsers.Query; - private IEnumerable terms; - - public InQuery(IEnumerable terms) - { - this.terms = terms; - } +public class InQuery: MultiTermQuery +{ + private readonly BytesRef[] values; - public override string ToString(string field) - { - throw new NotImplementedException(); - } + public InQuery(string field, params string[] values) + : base(field) + { + this.values = values + .OrderBy(v => v) + .Select(x => new BytesRef( x )).ToArray(); } - - - [Serializable] - public class CustomTermQuery : LuceneQuery + public override string ToString(string field) { - private readonly Term term; - private readonly TermContext perReaderTermState; - - public CustomTermQuery(Term t) + StringBuilder buffer = new StringBuilder(); + if (!Field.Equals(field, StringComparison.Ordinal)) { - term = t; - 
perReaderTermState = null; + buffer.Append(Field); + buffer.Append(':'); } - - - public virtual Term Term => term; - - public override Weight CreateWeight(IndexSearcher searcher) - { - IndexReaderContext topReaderContext = searcher.TopReaderContext; - TermContext termStates = perReaderTermState == null || perReaderTermState.TopReaderContext != topReaderContext - ? TermContext.Build(topReaderContext, term) - : perReaderTermState; - - return new CustomTermWeight(this, searcher, termStates); - } - - public override void ExtractTerms(ISet terms) - { - terms.Add(Term); - } - - public override string ToString(string field) - { - StringBuilder stringBuilder = new StringBuilder(); - if (!term.Field.Equals(field, StringComparison.Ordinal)) - { - stringBuilder.Append(term.Field); - stringBuilder.Append(":"); - } - stringBuilder.Append(term.Text()); - stringBuilder.Append(ToStringUtils.Boost(Boost)); - return stringBuilder.ToString(); - } - - public override bool Equals(object o) - { - if (!(o is CustomTermQuery)) - return false; - CustomTermQuery termQuery = (CustomTermQuery)o; - return Math.Abs(Boost - termQuery.Boost) < 0.00000001 && term.Equals(termQuery.term); - } - - public override int GetHashCode() - { - return Boost.GetHashCode() ^ term.GetHashCode(); - } - - internal sealed class CustomTermWeight : Weight + buffer.Append('('); + for (int i = 0; i < values.Length; i++) { - private readonly CustomTermQuery outerInstance; - internal readonly Similarity similarity; - internal readonly Similarity.SimWeight stats; - internal readonly TermContext termStates; - - public CustomTermWeight(CustomTermQuery outerInstance, IndexSearcher searcher, TermContext termStates) - { - this.outerInstance = outerInstance; - this.termStates = termStates; - similarity = searcher.Similarity; - stats = similarity - .ComputeWeight(outerInstance.Boost, searcher.CollectionStatistics(outerInstance.term.Field), searcher.TermStatistics(outerInstance.term, termStates)); - } - - public override string ToString() => $"weight({outerInstance})"; - - public override LuceneQuery Query => outerInstance; - - public override float GetValueForNormalization() => stats.GetValueForNormalization(); - - public override void Normalize(float queryNorm, float topLevelBoost) => stats.Normalize(queryNorm, topLevelBoost); - - public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) - { - TermsEnum termsEnum = GetTermsEnum(context); - return termsEnum == null ? 
null : new CustomTermScorer(this, termsEnum.Docs(acceptDocs, null), similarity.GetSimScorer(stats, context)); - } - - private TermsEnum GetTermsEnum(AtomicReaderContext context) - { - TermState state = termStates.Get(context.Ord); - if (state == null) - return null; - TermsEnum iterator = context.AtomicReader.GetTerms(outerInstance.term.Field).GetIterator(null); - iterator.SeekExact(outerInstance.term.Bytes, state); - return iterator; - } - - private bool TermNotInReader(AtomicReader reader, Term term) - { - return reader.DocFreq(term) == 0; - } - - public override Explanation Explain(AtomicReaderContext context, int doc) - { - Scorer scorer = GetScorer(context, context.AtomicReader.LiveDocs); - if (scorer == null || scorer.Advance(doc) != doc) - return new ComplexExplanation(false, 0.0f, "no matching term"); - float freq = scorer.Freq; - Similarity.SimScorer simScorer = similarity.GetSimScorer(stats, context); - ComplexExplanation complexExplanation = new ComplexExplanation(); - complexExplanation.Description = "weight(" + Query + " in " + doc + ") [" + similarity.GetType().Name + "], result of:"; - Explanation detail = simScorer.Explain(doc, new Explanation(freq, "termFreq=" + freq)); - complexExplanation.AddDetail(detail); - complexExplanation.Value = detail.Value; - complexExplanation.Match = true; - return complexExplanation; - } + BytesRef bytesRef = values[i]; + if(i != 0) buffer.Append(", "); + buffer.Append(bytesRef.Utf8ToString()); } + buffer.Append(')'); + buffer.Append(ToStringUtils.Boost(Boost)); + return buffer.ToString(); } - internal sealed class CustomTermScorer : Scorer + protected override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts) { - private readonly DocsEnum docsEnum; - private readonly Similarity.SimScorer docScorer; - - internal CustomTermScorer(Weight weight, DocsEnum td, Similarity.SimScorer docScorer) - : base(weight) - { - this.docScorer = docScorer; - docsEnum = td; - } - - public override int DocID => docsEnum.DocID; + TermsEnum termsEnum = terms.GetEnumerator(); + return new InTermsEnum(termsEnum, this.values); + } +} - public override int Freq => docsEnum.Freq; +public class InTermsEnum : TermsEnum +{ + private readonly TermsEnum inner; + private readonly BytesRef[] terms; + private int index = 0; + private BytesRef current; + + public override IComparer Comparer => inner.Comparer; + public override BytesRef Term => inner.Term; + public override long Ord => inner.Ord; + public override int DocFreq => inner.DocFreq; + public override long TotalTermFreq => inner.TotalTermFreq; + + public InTermsEnum(TermsEnum inner, BytesRef[] terms) + { + this.inner = inner; + this.terms = terms; + } - public override int NextDoc() - { - return docsEnum.NextDoc(); - } + public override BytesRef Next() + { + if (MoveNext()) + return current; + return null; + } - public override float GetScore() + public override bool MoveNext() + { + while (true) { - return docScorer.Score(docsEnum.DocID, docsEnum.Freq); - } + if (index >= terms.Length) + return false; - public override int Advance(int target) - { - return docsEnum.Advance(target); - } + BytesRef next = terms[index++]; + if (inner.SeekCeil(next) == SeekStatus.END) + return false; - public override long GetCost() - { - return docsEnum.GetCost(); - } + if(!next.BytesEquals(inner.Term)) + continue; - public override string ToString() - { - return "scorer(" + m_weight + ")"; + current = inner.Term; + return true; } } -} + + public override TermState GetTermState() => inner.GetTermState(); + public override 
SeekStatus SeekCeil(BytesRef text) => throw new NotSupportedException(); + public override void SeekExact(long ord) => throw new NotSupportedException(); + public override bool SeekExact(BytesRef text) => throw new NotSupportedException(); + public override void SeekExact(BytesRef term, TermState state) => throw new NotSupportedException(); + public override DocsEnum Docs(IBits liveDocs, DocsEnum reuse, DocsFlags flags) => inner.Docs(liveDocs, reuse, flags); + public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) => inner.DocsAndPositions(liveDocs, reuse, flags); +} \ No newline at end of file diff --git a/src/DotJEM.Json.Index2.QueryParsers/SimplifiedLuceneQueryAstVisitor.cs b/src/DotJEM.Json.Index2.QueryParsers/SimplifiedLuceneQueryAstVisitor.cs index 298afaa..2184be6 100644 --- a/src/DotJEM.Json.Index2.QueryParsers/SimplifiedLuceneQueryAstVisitor.cs +++ b/src/DotJEM.Json.Index2.QueryParsers/SimplifiedLuceneQueryAstVisitor.cs @@ -29,7 +29,7 @@ public ContentTypeContext(IEnumerable enumerable) public override string ToString() { - return $"ContentTypes(" +string.Join(";", contentTypes)+")"; + return $"ContentTypes(" + string.Join(";", contentTypes) + ")"; } } @@ -107,7 +107,6 @@ public override LuceneQueryInfo Visit(FieldQuery ast, ContentTypeContext context } } - switch (ast.Operator) { case FieldOperator.None: @@ -214,7 +213,7 @@ LuceneQuery CreateGreaterThanQuery(string field, Value val, bool inclusive) case StringValue stringValue: return TermRangeQuery.NewStringRange(field, stringValue.Value, null, inclusive, inclusive); - + } throw new ArgumentOutOfRangeException(); } @@ -240,12 +239,14 @@ LuceneQuery CreateSimpleQuery(string field, Value val) { case MatchAllValue _: return new WildcardQuery(new Term(field, "*")); + case NumberValue numberValue: return NumericRangeQuery.NewDoubleRange(field, numberValue.Value, numberValue.Value, true, true); + case IntegerValue integerValue: return NumericRangeQuery.NewInt64Range(field, integerValue.Value, integerValue.Value, true, true); - case PhraseValue phraseValue: + case PhraseValue phraseValue: TokenStream source = analyzer.GetTokenStream(field, new StringReader(phraseValue.Value)); source.Reset(); CachingTokenFilter buffer = new CachingTokenFilter(source); @@ -276,9 +277,21 @@ LuceneQuery CreateSimpleQuery(string field, Value val) return phrase; case WildcardValue wildcardValue: + //TODO: Wildcards should probably also be analyzed in some way. return new WildcardQuery(new Term(field, wildcardValue.Value)); + case StringValue stringValue: - return new CustomTermQuery(new Term(field, stringValue.Value)); + Term[] terms = ReadTerms(field, stringValue).ToArray(); + if(terms.Length == 1) + return new TermQuery(terms.First()); + + BooleanQuery q = new BooleanQuery(); + foreach (Term term in terms) + { + q.Add(new (new TermQuery(term), Occur.MUST)); + } + return q; + //TODO: Just use the standard term query, for not this is just testing. 
} throw new ArgumentOutOfRangeException(); } @@ -287,6 +300,23 @@ LuceneQuery CreateSimpleQuery(string field, Value val) return query; } + private IEnumerable ReadTerms(string field, StringValue value) + { + using TokenStream source = analyzer.GetTokenStream(field, new StringReader(value.Value)); + source.Reset(); + + using CachingTokenFilter buffer = new CachingTokenFilter(source); + buffer.Reset(); + + if (!buffer.TryGetAttribute(out ITermToBytesRefAttribute attribute)) + yield break; + while (buffer.IncrementToken()) + { + attribute.FillBytesRef(); + yield return new Term(field, BytesRef.DeepCopyOf(attribute.BytesRef)); + } + } + private LuceneQueryInfo VisitComposite(CompositeQuery ast, ContentTypeContext context, Occur occur) { BooleanQuery query = new BooleanQuery(); diff --git a/src/DotJEM.Json.Index2.Test/MyFrameworkClassTest.cs b/src/DotJEM.Json.Index2.Test/JsonIndexTest.cs similarity index 100% rename from src/DotJEM.Json.Index2.Test/MyFrameworkClassTest.cs rename to src/DotJEM.Json.Index2.Test/JsonIndexTest.cs diff --git a/src/DotJEM.Json.Index2.sln b/src/DotJEM.Json.Index2.sln index e01eaaa..4ff693d 100644 --- a/src/DotJEM.Json.Index2.sln +++ b/src/DotJEM.Json.Index2.sln @@ -25,6 +25,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DotJEM.Json.Index2.Snapshot EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DotJEM.Json.Index2.Contexts.Test", "DotJEM.Json.Index2.Contexts.Test\DotJEM.Json.Index2.Contexts.Test.csproj", "{8F30DDCD-334C-4C47-AE47-9DCEB0CB4C73}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DotJEM.Json.Index2.QueryParsers.Test", "DotJEM.Json.Index2.QueryParsers.Test\DotJEM.Json.Index2.QueryParsers.Test.csproj", "{10159BF9-8B8C-4BE1-8DB8-321839C6C267}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -55,6 +57,10 @@ Global {8F30DDCD-334C-4C47-AE47-9DCEB0CB4C73}.Debug|Any CPU.Build.0 = Debug|Any CPU {8F30DDCD-334C-4C47-AE47-9DCEB0CB4C73}.Release|Any CPU.ActiveCfg = Release|Any CPU {8F30DDCD-334C-4C47-AE47-9DCEB0CB4C73}.Release|Any CPU.Build.0 = Release|Any CPU + {10159BF9-8B8C-4BE1-8DB8-321839C6C267}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {10159BF9-8B8C-4BE1-8DB8-321839C6C267}.Debug|Any CPU.Build.0 = Debug|Any CPU + {10159BF9-8B8C-4BE1-8DB8-321839C6C267}.Release|Any CPU.ActiveCfg = Release|Any CPU + {10159BF9-8B8C-4BE1-8DB8-321839C6C267}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE
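
The InQuery added above takes a field name plus one or more string values, sorts the values, and matches documents whose indexed term for that field equals any of them. The following is a minimal usage sketch, not part of the patch: the reader, the "type" field, and the concrete values are illustrative assumptions, and the query relies on MultiTermQuery's default rewrite.

    using DotJEM.Json.Index2.QueryParsers.Query;
    using Lucene.Net.Index;
    using Lucene.Net.Search;

    public static class InQueryUsageSketch
    {
        // Searches an already opened reader for documents whose "type" term
        // equals any of the given values. The field name and values here are
        // assumptions made for illustration; InQuery itself only requires a
        // field plus one or more string values.
        public static TopDocs FindByType(IndexReader reader)
        {
            IndexSearcher searcher = new IndexSearcher(reader);

            // InQuery sorts its values; InTermsEnum then walks the term
            // dictionary once with SeekCeil and yields only exact matches,
            // so this behaves roughly like type:(CAR OR BIKE) over raw terms.
            InQuery query = new InQuery("type", "CAR", "BIKE");

            return searcher.Search(query, 10);
        }
    }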
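
The visitor change in SimplifiedLuceneQueryAstVisitor now runs StringValue input through the analyzer; a single token becomes a plain TermQuery, while multiple tokens are combined as MUST clauses in a BooleanQuery. A sketch of the equivalent query that CreateSimpleQuery would build, with an assumed field name and token set:

    using Lucene.Net.Index;
    using Lucene.Net.Search;

    public static class StringValueQuerySketch
    {
        // Mirrors what CreateSimpleQuery produces when the analyzer splits a
        // StringValue into several terms. The field "name" and the tokens are
        // assumptions for illustration only.
        public static Query Build()
        {
            BooleanQuery query = new BooleanQuery();
            query.Add(new TermQuery(new Term("name", "red")), Occur.MUST);
            query.Add(new TermQuery(new Term("name", "fire")), Occur.MUST);
            query.Add(new TermQuery(new Term("name", "truck")), Occur.MUST);
            return query;
        }
    }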