From 19ce5e91bc7decbccd981bc11626bde3a91b90fc Mon Sep 17 00:00:00 2001 From: Shad Storhaug Date: Mon, 20 Jul 2020 17:51:21 +0700 Subject: [PATCH] Lucene.Net.Tests.BenchmarkDotNet: Added benchmarks for IndexFiles and SearchFiles --- .../IndexFilesBenchmarks.cs | 231 ++++++++++++++++++ .../Lucene.Net.Tests.BenchmarkDotNet.csproj | 7 + .../SearchFilesBenchmarks.cs | 128 ++++++++++ .../Util/ContentGenerator.cs | 110 +++++++++ .../Util/PathUtil.cs | 64 +++++ 5 files changed, 540 insertions(+) create mode 100644 src/Lucene.Net.Tests.BenchmarkDotNet/IndexFilesBenchmarks.cs create mode 100644 src/Lucene.Net.Tests.BenchmarkDotNet/SearchFilesBenchmarks.cs create mode 100644 src/Lucene.Net.Tests.BenchmarkDotNet/Util/ContentGenerator.cs create mode 100644 src/Lucene.Net.Tests.BenchmarkDotNet/Util/PathUtil.cs diff --git a/src/Lucene.Net.Tests.BenchmarkDotNet/IndexFilesBenchmarks.cs b/src/Lucene.Net.Tests.BenchmarkDotNet/IndexFilesBenchmarks.cs new file mode 100644 index 0000000000..3a9119e4f6 --- /dev/null +++ b/src/Lucene.Net.Tests.BenchmarkDotNet/IndexFilesBenchmarks.cs @@ -0,0 +1,231 @@ +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Jobs; +using Lucene.Net.Analysis; +using Lucene.Net.Analysis.Standard; +using Lucene.Net.Documents; +using Lucene.Net.Index; +using Lucene.Net.Randomized.Generators; +using Lucene.Net.Store; +using Lucene.Net.Tests.BenchmarkDotNet.Util; +using Lucene.Net.Util; +using System; +using System.IO; +using System.Text; + +namespace Lucene.Net.Tests.BenchmarkDotNet +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// BenchmarkDotNet benchmark that indexes a generated set of text files
+    /// (one document per file), run as one job per listed
+    /// Lucene.Net.Analysis.Common NuGet package version.
+    /// </summary>
+    [MemoryDiagnoser]
+    [Config(typeof(Config))]
+    public class IndexFilesBenchmarks
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                var baseJob = Job.MediumRun;
+
+                // One job per package version, newest first. The version string
+                // doubles as the job id so the results table is self-describing.
+                string[] versions =
+                {
+                    "4.8.0-beta00010",
+                    "4.8.0-beta00009",
+                    "4.8.0-beta00008",
+                    "4.8.0-beta00007",
+                    "4.8.0-beta00006",
+                    "4.8.0-beta00005",
+                };
+
+                foreach (string version in versions)
+                {
+                    AddJob(baseJob.WithNuGet("Lucene.Net.Analysis.Common", version).WithId(version));
+                }
+            }
+        }
+
+        private static DirectoryInfo sourceDirectory;
+        private static DirectoryInfo indexDirectory;
+
+        /// <summary>
+        /// Generates the 250 source files (fixed seed) that every iteration indexes.
+        /// </summary>
+        [GlobalSetup]
+        public void GlobalSetUp()
+        {
+            sourceDirectory = PathUtil.CreateTempDir("sourceFiles");
+            int seed = 2342;
+            ContentGenerator.GenerateFiles(new Random(seed), sourceDirectory.FullName, 250);
+        }
+
+        /// <summary>
+        /// Removes the generated source files.
+        /// </summary>
+        [GlobalCleanup]
+        public void GlobalTearDown() => TryDeleteDirectory(sourceDirectory);
+
+        /// <summary>
+        /// Creates a fresh, empty index directory for the next iteration.
+        /// </summary>
+        [IterationSetup]
+        public void IterationSetUp()
+        {
+            indexDirectory = PathUtil.CreateTempDir("indexFiles");
+        }
+
+        /// <summary>
+        /// Removes the index written by the iteration that just finished.
+        /// </summary>
+        [IterationCleanup]
+        public void IterationTearDown() => TryDeleteDirectory(indexDirectory);
+
+        /// <summary>
+        /// Best-effort recursive delete of a temporary directory. Does nothing when
+        /// <paramref name="directory"/> is <c>null</c> (setup never ran) or is already gone.
+        /// </summary>
+        private static void TryDeleteDirectory(DirectoryInfo directory)
+        {
+            if (directory == null)
+            {
+                return;
+            }
+
+            try
+            {
+                if (System.IO.Directory.Exists(directory.FullName))
+                {
+                    System.IO.Directory.Delete(directory.FullName, recursive: true);
+                }
+            }
+            catch (IOException)
+            {
+                // Deliberately ignored: a file may still be in use. Leaving a stray
+                // temp directory behind is preferable to failing the benchmark run.
+            }
+            catch (UnauthorizedAccessException)
+            {
+                // Deliberately ignored for the same reason (read-only or locked entries).
+            }
+        }
+
+        /// <summary>Index all text files under a directory.</summary>
+        [Benchmark]
+        public void IndexFiles() => IndexFiles(sourceDirectory, indexDirectory);
+
+        /// <summary>Index all text files under a directory.</summary>
+        /// <param name="sourceDirectory">The directory whose files (including subdirectories) are indexed.</param>
+        /// <param name="indexDirectory">The directory the index is written to.</param>
+        public static void IndexFiles(DirectoryInfo sourceDirectory, DirectoryInfo indexDirectory)
+        {
+            string indexPath = indexDirectory.FullName;
+
+            // The directory and the analyzer are disposable resources owned by this
+            // method; disposing the writer alone does not release them.
+            using (Store.Directory dir = FSDirectory.Open(indexPath))
+            // :Post-Release-Update-Version.LUCENE_XY:
+            using (Analyzer analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48))
+            {
+                IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer);
+
+                // Create a new index in the directory, removing any
+                // previously indexed documents. (OpenMode.CREATE_OR_APPEND
+                // would add new documents to an existing index instead.)
+                iwc.OpenMode = OpenMode.CREATE;
+
+                // Optional: for better indexing performance, if you
+                // are indexing many documents, increase the RAM
+                // buffer.
+                //
+                // iwc.RAMBufferSizeMB = 256.0;
+
+                using (IndexWriter writer = new IndexWriter(dir, iwc))
+                {
+                    IndexDocs(writer, sourceDirectory);
+
+                    // NOTE: if you want to maximize search performance,
+                    // you can optionally call forceMerge here. This can be
+                    // a terribly costly operation, so generally it's only
+                    // worth it when your index is relatively static (ie
+                    // you're done adding documents to it):
+                    //
+                    // writer.ForceMerge(1);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Recurses over files and directories found under the
+        /// given directory and indexes each file.
+        /// <para/>
+        /// NOTE: This method indexes one document per input file.
+        /// This is slow. For good throughput, put multiple documents
+        /// into your input file(s).
+        /// </summary>
+        /// <param name="writer">
+        /// <see cref="IndexWriter"/> to the index where the given
+        /// file/dir info will be stored
+        /// </param>
+        /// <param name="directoryInfo">
+        /// The directory to recurse into to find files to index.
+        /// </param>
+        /// <exception cref="IOException">
+        /// If there is a low-level I/O error.
+        /// </exception>
+        internal static void IndexDocs(IndexWriter writer, DirectoryInfo directoryInfo)
+        {
+            // Subdirectories first, then the files directly inside this directory.
+            foreach (var dirInfo in directoryInfo.GetDirectories())
+            {
+                IndexDocs(writer, dirInfo);
+            }
+            foreach (var fileInfo in directoryInfo.GetFiles())
+            {
+                IndexDocs(writer, fileInfo);
+            }
+        }
+
+        /// <summary>
+        /// Indexes the given file using the given writer.
+        /// </summary>
+        /// <param name="writer">
+        /// <see cref="IndexWriter"/> to the index where the given
+        /// file info will be stored.
+        /// </param>
+        /// <param name="file">
+        /// The file to index.
+        /// </param>
+        /// <exception cref="IOException">
+        /// If there is a low-level I/O error.
+        /// </exception>
+        internal static void IndexDocs(IndexWriter writer, FileInfo file)
+        {
+            using (FileStream fs = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
+            {
+                // make a new, empty document
+                Document doc = new Document();
+
+                // Add the path of the file as a field named "path". Use a
+                // field that is indexed (i.e. searchable), but don't tokenize
+                // the field into separate words and don't index term frequency
+                // or positional information:
+                Field pathField = new StringField("path", file.FullName, Field.Store.YES);
+                doc.Add(pathField);
+
+                // Add the last modified date of the file as a field named "modified".
+                // Use an Int64Field that is indexed (i.e. efficiently filterable with
+                // NumericRangeFilter). This indexes the raw tick count of the UTC
+                // last-write time, which is often too fine. You could instead create a
+                // number based on year/month/day/hour/minutes/seconds, down to the
+                // resolution you require. For example the long value 2011021714 would
+                // mean February 17, 2011, 2-3 PM.
+                doc.Add(new Int64Field("modified", file.LastWriteTimeUtc.Ticks, Field.Store.NO));
+
+                // Add the contents of the file to a field named "contents". Specify a reader,
+                // so that the text of the file is tokenized and indexed, but not stored.
+                // Note that the StreamReader decodes the file as UTF-8.
+                // If that's not the file's encoding, searching for special characters will fail.
+                // The reader is not disposed separately; the enclosing using closes the stream.
+                doc.Add(new TextField("contents", new StreamReader(fs, Encoding.UTF8)));
+
+                if (writer.Config.OpenMode == OpenMode.CREATE)
+                {
+                    // New index, so we just add the document (no old document can be there):
+                    writer.AddDocument(doc);
+                }
+                else
+                {
+                    // Existing index (an old copy of this document may have been indexed) so
+                    // we use UpdateDocument instead to replace the old one matching the exact
+                    // path, if present:
+                    writer.UpdateDocument(new Term("path", file.FullName), doc);
+                }
+            }
+        }
+    }
+}
diff --git a/src/Lucene.Net.Tests.BenchmarkDotNet/Lucene.Net.Tests.BenchmarkDotNet.csproj b/src/Lucene.Net.Tests.BenchmarkDotNet/Lucene.Net.Tests.BenchmarkDotNet.csproj
index 5d4af5aea7..781c6613ec 100644
--- a/src/Lucene.Net.Tests.BenchmarkDotNet/Lucene.Net.Tests.BenchmarkDotNet.csproj
+++ b/src/Lucene.Net.Tests.BenchmarkDotNet/Lucene.Net.Tests.BenchmarkDotNet.csproj
@@ -12,6 +12,13 @@ + + + + + + +
diff --git a/src/Lucene.Net.Tests.BenchmarkDotNet/SearchFilesBenchmarks.cs b/src/Lucene.Net.Tests.BenchmarkDotNet/SearchFilesBenchmarks.cs
new file mode 100644
index 0000000000..3c2c45a338
--- /dev/null
+++ b/src/Lucene.Net.Tests.BenchmarkDotNet/SearchFilesBenchmarks.cs
@@ -0,0 +1,128 @@
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Jobs;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Index;
+using Lucene.Net.QueryParsers.Classic;
+using Lucene.Net.Randomized.Generators;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Tests.BenchmarkDotNet.Util;
+using Lucene.Net.Util;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Tests.BenchmarkDotNet
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license
agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// BenchmarkDotNet benchmark that repeatedly runs one parsed query against a
+    /// pre-built index, run as one job per listed Lucene.NET package version.
+    /// </summary>
+    [MemoryDiagnoser]
+    [Config(typeof(Config))]
+    public class SearchFilesBenchmarks
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                var baseJob = Job.MediumRun;
+
+                // One job per package version, newest first. Both packages are pinned to
+                // the same version, and that version string doubles as the job id.
+                string[] versions =
+                {
+                    "4.8.0-beta00010",
+                    "4.8.0-beta00009",
+                    "4.8.0-beta00008",
+                    "4.8.0-beta00007",
+                    "4.8.0-beta00006",
+                    "4.8.0-beta00005",
+                };
+
+                foreach (string version in versions)
+                {
+                    AddJob(baseJob
+                        .WithNuGet("Lucene.Net.Analysis.Common", version)
+                        .WithNuGet("Lucene.Net.QueryParser", version)
+                        .WithId(version));
+                }
+            }
+        }
+
+        private const string QueryString = "settings";
+        private static DirectoryInfo indexDirectory;
+
+        /// <summary>
+        /// Generates 1000 source files (fixed seed) that contain <see cref="QueryString"/>
+        /// at random positions, indexes them, and then discards the source files.
+        /// </summary>
+        [GlobalSetup]
+        public void GlobalSetUp()
+        {
+            var sourceDirectory = PathUtil.CreateTempDir("sourceFiles");
+            try
+            {
+                // Generate content to index (including our string that we will search for)
+                int seed = 2342;
+                ContentGenerator.GenerateFiles(new Random(seed), sourceDirectory.FullName, 1000, QueryString);
+
+                // Index the content
+                indexDirectory = PathUtil.CreateTempDir("indexFiles");
+                IndexFilesBenchmarks.IndexFiles(sourceDirectory, indexDirectory);
+            }
+            finally
+            {
+                // Cleanup our source files, they are no longer needed. Doing this in a
+                // finally block also removes them when generation or indexing throws.
+                TryDeleteDirectory(sourceDirectory);
+            }
+        }
+
+        /// <summary>
+        /// Removes the index built by <see cref="GlobalSetUp"/>.
+        /// </summary>
+        [GlobalCleanup]
+        public void GlobalTearDown() => TryDeleteDirectory(indexDirectory);
+
+        /// <summary>
+        /// Best-effort recursive delete of a temporary directory. Does nothing when
+        /// <paramref name="directory"/> is <c>null</c> (setup never ran) or is already gone.
+        /// </summary>
+        private static void TryDeleteDirectory(DirectoryInfo directory)
+        {
+            if (directory == null)
+            {
+                return;
+            }
+
+            try
+            {
+                if (System.IO.Directory.Exists(directory.FullName))
+                {
+                    System.IO.Directory.Delete(directory.FullName, recursive: true);
+                }
+            }
+            catch (IOException)
+            {
+                // Deliberately ignored: a file may still be in use. Leaving a stray
+                // temp directory behind is preferable to failing the benchmark run.
+            }
+            catch (UnauthorizedAccessException)
+            {
+                // Deliberately ignored for the same reason (read-only or locked entries).
+            }
+        }
+
+        /// <summary>
+        /// Opens the index, parses <see cref="QueryString"/> against the "contents"
+        /// field and runs the resulting query 1000 times, asking for the top 100 hits.
+        /// </summary>
+        [Benchmark]
+        public void SearchFiles()
+        {
+            const string field = "contents";
+            const int repeat = 1000;
+
+            // The directory is opened by this method, so it is disposed here as well;
+            // it is not handed over to the reader for cleanup.
+            using (FSDirectory directory = FSDirectory.Open(indexDirectory.FullName))
+            using (IndexReader reader = DirectoryReader.Open(directory))
+            // :Post-Release-Update-Version.LUCENE_XY:
+            using (Analyzer analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48))
+            {
+                IndexSearcher searcher = new IndexSearcher(reader);
+
+                // :Post-Release-Update-Version.LUCENE_XY:
+                QueryParser parser = new QueryParser(LuceneVersion.LUCENE_48, field, analyzer);
+                Query query = parser.Parse(QueryString.Trim());
+
+                // Repeat the search; BenchmarkDotNet does the timing.
+                for (int i = 0; i < repeat; i++)
+                {
+                    searcher.Search(query, null, 100);
+                }
+            }
+        }
+    }
+}
diff --git a/src/Lucene.Net.Tests.BenchmarkDotNet/Util/ContentGenerator.cs
b/src/Lucene.Net.Tests.BenchmarkDotNet/Util/ContentGenerator.cs new file mode 100644 index 0000000000..15612e9229 --- /dev/null +++ b/src/Lucene.Net.Tests.BenchmarkDotNet/Util/ContentGenerator.cs @@ -0,0 +1,110 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace Lucene.Net.Randomized.Generators +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+     */
+
+    /// <summary>
+    /// Generates pseudo-random plain-text files for the benchmarks to index.
+    /// </summary>
+    public static class ContentGenerator
+    {
+        /// <summary>
+        /// Writes <paramref name="numberOfFiles"/> randomly named ".txt" files, placing each
+        /// one either directly in <paramref name="directory"/> or in a randomly named
+        /// subdirectory one level below it.
+        /// </summary>
+        /// <param name="random">Source of randomness; a fixed seed yields the same sequence of draws.</param>
+        /// <param name="directory">Root directory to write into. It is created if it does not exist.</param>
+        /// <param name="numberOfFiles">Number of files to write. No files are written when this is zero or negative.</param>
+        /// <param name="stringsToQuery">Optional strings that are occasionally written in place of a random word.</param>
+        /// <exception cref="ArgumentNullException">
+        /// <paramref name="random"/> or <paramref name="directory"/> is <c>null</c>.
+        /// </exception>
+        public static void GenerateFiles(Random random, string directory, int numberOfFiles, params string[] stringsToQuery)
+        {
+            if (random == null)
+                throw new ArgumentNullException(nameof(random));
+            if (directory == null)
+                throw new ArgumentNullException(nameof(directory));
+
+            var subdirectories = new HashSet<string>();
+            for (int i = 0; i < numberOfFiles; i++)
+            {
+                // Random.Next excludes its upper bound, so this draws from 1..99.
+                bool root = random.Next(1, 100) > 50;
+
+                if (root)
+                {
+                    GenerateFile(random, directory, stringsToQuery);
+                }
+                else
+                {
+                    string subdirectory;
+                    if (subdirectories.Count > 0 && random.Next(1, 100) > 30)
+                    {
+                        // Reuse a subdirectory name that was generated earlier.
+                        subdirectory = RandomPicks.RandomFrom(random, subdirectories);
+                    }
+                    else
+                    {
+                        subdirectory = RandomSimpleString(random, 5, 20);
+                        subdirectories.Add(subdirectory);
+                    }
+                    GenerateFile(random, Path.Combine(directory, subdirectory), stringsToQuery);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Writes one UTF-8 text file with a random name and random paragraphs into
+        /// <paramref name="directory"/>, creating the directory first if necessary.
+        /// An existing file with the same name is overwritten.
+        /// </summary>
+        private static void GenerateFile(Random random, string directory, ICollection<string> stringsToQuery)
+        {
+            // CreateDirectory does nothing when the directory already exists,
+            // so no separate existence check is needed.
+            System.IO.Directory.CreateDirectory(directory);
+
+            string fileName = RandomSimpleString(random, 5, 25) + ".txt";
+            int paragraphs = random.Next(5, 25);
+
+            using (var writer = new StreamWriter(Path.Combine(directory, fileName), append: false, encoding: Encoding.UTF8))
+            {
+                for (int i = 0; i < paragraphs; i++)
+                {
+                    WriteParagraph(random, writer, stringsToQuery);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Writes one paragraph: space-separated words terminated by "." and
+        /// followed by an empty line.
+        /// </summary>
+        private static void WriteParagraph(Random random, TextWriter writer, ICollection<string> stringsToQuery)
+        {
+            int words = random.Next(50, 100);
+            bool addStringsToQuery = stringsToQuery != null && stringsToQuery.Count > 0;
+
+            for (int i = 0; i < words; i++)
+            {
+                // Swap in one of the query strings only when the draw hits a single
+                // arbitrary value, which keeps them rare in the generated text.
+                if (addStringsToQuery && random.Next(1, 1500) == 668)
+                    writer.Write(RandomPicks.RandomFrom(random, stringsToQuery));
+                else
+                    writer.Write(RandomSimpleString(random, 1, 8));
+
+                if (i + 1 < words)
+                    writer.Write(" ");
+            }
+            writer.WriteLine(".");
+            writer.WriteLine();
+        }
+
+        /// <summary>
+        /// Returns a random string consisting only of lowercase characters 'a' through 'z'.
+        /// </summary>
+        /// <param name="r">Source of randomness.</param>
+        /// <param name="minLength">Smallest length to generate.</param>
+        /// <param name="maxLength">Largest length to generate (presumably inclusive; confirm against RandomInts.RandomInt32Between).</param>
+        public static string RandomSimpleString(Random r, int minLength, int maxLength)
+        {
+            int end = RandomInts.RandomInt32Between(r, minLength, maxLength);
+            if (end == 0)
+            {
+                // allow 0 length
+                return string.Empty;
+            }
+
+            char[] buffer = new char[end];
+            for (int i = 0; i < end; i++)
+            {
+                buffer[i] = (char)RandomInts.RandomInt32Between(r, 'a', 'z');
+            }
+            return new string(buffer);
+        }
+    }
+}
diff --git a/src/Lucene.Net.Tests.BenchmarkDotNet/Util/PathUtil.cs b/src/Lucene.Net.Tests.BenchmarkDotNet/Util/PathUtil.cs
new file mode 100644
index 0000000000..aaf9c8633b
--- /dev/null
+++ b/src/Lucene.Net.Tests.BenchmarkDotNet/Util/PathUtil.cs
@@ -0,0 +1,64 @@
+using System;
+using System.IO;
+
+namespace Lucene.Net.Tests.BenchmarkDotNet.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Helpers for creating temporary directories used by the benchmarks.
+    /// </summary>
+    public static class PathUtil
+    {
+        // Upper bound on how many random names are tried before giving up.
+        private const int TempNameRetryThreshold = 9999;
+
+        /// <summary>
+        /// Creates a new, uniquely named directory under the "LuceneTemp" folder
+        /// of the system temp path and returns it.
+        /// </summary>
+        /// <param name="prefix">Text placed before the random part of the directory name.</param>
+        /// <returns>The <see cref="DirectoryInfo"/> of the directory that was created.</returns>
+        /// <exception cref="IOException">
+        /// No unused name could be created within the retry limit.
+        /// </exception>
+        public static DirectoryInfo CreateTempDir(string prefix)
+        {
+            string root = Path.Combine(Path.GetTempPath(), "LuceneTemp");
+
+            for (int attempt = 0; attempt < TempNameRetryThreshold; attempt++)
+            {
+                // LUCENENET specific - need to use a random file name instead of a sequential one or two threads may attempt to do
+                // two operations on a file at the same time.
+                string name = prefix + "-" + Path.GetFileNameWithoutExtension(Path.GetRandomFileName());
+                DirectoryInfo candidate = new DirectoryInfo(Path.Combine(root, name));
+
+                try
+                {
+                    // DirectoryInfo.Create succeeds silently on an existing directory,
+                    // so this existence check is what keeps the returned name unused.
+                    if (!System.IO.Directory.Exists(candidate.FullName))
+                    {
+                        candidate.Create();
+                        return candidate;
+                    }
+                }
+                catch (IOException)
+                {
+                    // Creation failed for this name; fall through and try another one.
+                }
+            }
+
+            throw new IOException("Failed to get a temporary name too many times, check your temp directory and consider manually cleaning it: " + Path.GetTempPath());
+        }
+    }
+}