Merge pull request apache#3 from apache/master

pull from upstream finally
desultir · Aug 1, 2016 · ce035ce · ce035ce
2 parents 4ed97f0 + d86c369
commit ce035ce
Show file tree

Hide file tree

Showing 752 changed files with 30,732 additions and 13,531 deletions.
diff --git a/build.xml b/build.xml
@@ -20,7 +20,7 @@
 <project name="lucene-solr" default="-projecthelp" basedir=".">
   <import file="lucene/common-build.xml"/>
 
-  <property name="jgit-version" value="4.2.0.201601211800-r"/>
+  <property name="jgit-version" value="4.4.1.201607150455-r"/>
 
   <property name="tests.heap-dump-dir" location="heapdumps"/>
 
@@ -151,6 +151,7 @@
         (~$/\$$Id\b/$) : 'svn keyword',
         (~$/\$$Header\b/$) : 'svn keyword',
         (~$/\$$Source\b/$) : 'svn keyword',
+        (~$/^\uFEFF/$) : 'UTF-8 byte order mark'
       ];
       
       def baseDir = properties['validate.baseDir'];
@@ -165,10 +166,17 @@
       }
       
       def javadocsPattern = ~$/(?sm)^\Q/**\E(.*?)\Q*/\E/$;
+      def commentPattern  = ~$/(?sm)^\Q/*\E(.*?)\Q*/\E/$;
       def lineSplitter = ~$/[\r\n]+/$;
       def licenseMatcher = Defaults.createDefaultMatcher();
       def validLoggerPattern = ~$/(?s)\b(private\s|static\s|final\s){3}+\s*Logger\s+\p{javaJavaIdentifierStart}+\s+=\s+\QLoggerFactory.getLogger(MethodHandles.lookup().lookupClass());\E/$;
+      def packagePattern = ~$/(?m)^\s*package\s+org\.apache.*;/$;
       
+      def isLicense = { matcher, ratDocument ->
+        licenseMatcher.reset();
+        return lineSplitter.split(matcher.group(1)).any{ licenseMatcher.match(ratDocument, it) };
+      }
+
       ant.fileScanner{
         fileset(dir: baseDir){
           extensions.each{
@@ -196,17 +204,32 @@
           }
         }
         def javadocsMatcher = javadocsPattern.matcher(text);
+        def ratDocument = new FileDocument(f);
         while (javadocsMatcher.find()) {
-          def ratDocument = new FileDocument(f);
-          licenseMatcher.reset();
-          if (lineSplitter.split(javadocsMatcher.group(1)).any{ licenseMatcher.match(ratDocument, it) }) {
+          if (isLicense(javadocsMatcher, ratDocument)) {
             reportViolation(f, String.format(Locale.ENGLISH, 'javadoc-style license header [%s]',
               ratDocument.getMetaData().value(MetaData.RAT_URL_LICENSE_FAMILY_NAME)));
           }
         }
-        if (f.toString().endsWith('.java') && text.contains('org.slf4j.LoggerFactory')) {
-          if (!validLoggerPattern.matcher(text).find()) {
-            reportViolation(f, 'invalid logging pattern [not private static final, uses static class name]');
+        if (f.toString().endsWith('.java')) {
+          if (text.contains('org.slf4j.LoggerFactory')) {
+            if (!validLoggerPattern.matcher(text).find()) {
+              reportViolation(f, 'invalid logging pattern [not private static final, uses static class name]');
+            }
+          }
+          def packageMatcher = packagePattern.matcher(text);
+          if (packageMatcher.find()) {
+            def packageStartPos = packageMatcher.start();
+            def commentMatcher = commentPattern.matcher(text);
+            while (commentMatcher.find()) {
+              if (isLicense(commentMatcher, ratDocument)) {
+                if (commentMatcher.start() < packageStartPos) {
+                  break; // This file is all good, so break loop: license header precedes package definition
+                } else {
+                  reportViolation(f, 'package declaration precedes license header');
+                }
+              }
+            }
           }
         }
       };
@@ -739,13 +762,15 @@ Test args: [${args}]</echo>
   <target name="jenkins-hourly">
     <antcall>
       <param name="is.jenkins.build" value="true"/>
+      <param name="tests.haltonfailure" value="false"/>
       <target name="-jenkins-base"/>
     </antcall>
   </target>
 
   <target name="jenkins-nightly">
     <antcall>
       <param name="is.jenkins.build" value="true"/>
+      <param name="tests.haltonfailure" value="false"/>
       <param name="tests.nightly" value="true"/>
       <target name="-jenkins-base"/>
     </antcall>

diff --git a/dev-tools/idea/.idea/copyright/profiles_settings.xml b/dev-tools/idea/.idea/copyright/profiles_settings.xml
diff --git a/dev-tools/maven/lucene/analysis/common/pom.xml.template b/dev-tools/maven/lucene/analysis/common/pom.xml.template
@@ -69,5 +69,18 @@
         </excludes>
       </testResource>
     </testResources>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <executions>
+          <execution>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
   </build>
 </project>
diff --git a/dev-tools/maven/solr/contrib/analysis-extras/pom.xml.template b/dev-tools/maven/solr/contrib/analysis-extras/pom.xml.template
@@ -42,6 +42,13 @@
     <url>${vc-browse-base-url};f=${module-directory}</url>
   </scm>
   <dependencies>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-analyzers-common</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <!-- lucene-test-framework dependency must be declared before lucene-core -->
       <!-- This dependency cannot be put into solr-parent, because local        -->

diff --git a/dev-tools/maven/solr/test-framework/pom.xml.template b/dev-tools/maven/solr/test-framework/pom.xml.template
@@ -58,10 +58,7 @@
     <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
     <resources>
       <resource>
-        <directory>${module-path}</directory>
-        <excludes>
-          <exclude>**/*.java</exclude>
-        </excludes>
+        <directory>${module-path}/src/resources</directory>
       </resource>
     </resources>
     <plugins>

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -5,6 +5,19 @@ http://s.apache.org/luceneversions
 
 ======================= Lucene 7.0.0 =======================
 
+API Changes
+
+* LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.
+  Use setSplitOnWhitespace(true) to get the old behavior.  (Steve Rowe)
+
+* LUCENE-7369: Similarity.coord and BooleanQuery.disableCoord are removed.
+  (Adrien Grand)
+
+* LUCENE-7368: Removed query normalization. (Adrien Grand)
+
+* LUCENE-7355: AnalyzingQueryParser has been removed as its functionality has
+  been folded into the classic QueryParser. (Adrien Grand)
+
 Bug Fixes
 
 Improvements
@@ -15,10 +28,19 @@ Other
 
 * LUCENE-6968: LSH Filter (Tommaso Teofili, Andy Hind, Cao Manh Dat)
 
+* LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)
+
 ======================= Lucene 6.2.0 =======================
 
+API Changes
+
+* LUCENE-7384: ScoringWrapperSpans was removed since it had no purpose or effect as of Lucene 5.5.
+
 New Features
 
+* LUCENE-7381: Add point based DoubleRangeField and RangeFieldQuery for
+  indexing and querying on Ranges up to 4 dimensions (Nick Knize)
+
 * LUCENE-7302: IndexWriter methods that change the index now return a
   long "sequence number" indicating the effective equivalent
   single-threaded execution order (Mike McCandless)
@@ -31,10 +53,38 @@ New Features
   analyzer for the Ukrainian language (Andriy Rysin via Mike
   McCandless)
 
+* LUCENE-7373: Directory.renameFile, which did both renaming and fsync
+  of the directory metadata, has been deprecated; use the new separate
+  methods Directory.rename and Directory.syncMetaData instead (Robert Muir,
+  Uwe Schindler, Mike McCandless)
+
+* LUCENE-7355: Added Analyzer#normalize(), which only applies normalization to
+  an input string. (Adrien Grand)
+
+* LUCENE-7380: Add Polygon.fromGeoJSON for more easily creating
+  Polygon instances from a standard GeoJSON string (Robert Muir, Mike
+  McCandless)
+
+* SOLR-9279: Queries module: new ComparisonBoolFunction base class
+  (Doug Turnbull via David Smiley)
+
 Bug Fixes
 
 * LUCENE-6662: Fixed potential resource leaks. (Rishabh Patel via Adrien Grand)
 
+* LUCENE-7340: MemoryIndex.toString() could throw NPE; fixed. Renamed to toStringDebug().
+  (Daniel Collins, David Smiley)
+
+* LUCENE-7382: Fix bug introduced by LUCENE-7355 that used the
+  wrong default AttributeFactory for new Tokenizers.
+  (Terry Smith, Uwe Schindler)
+
+* LUCENE-7389: Fix FieldType.setDimensions(...) validation for the dimensionNumBytes
+  parameter. (Martijn van Groningen)
+
+* LUCENE-7391: Fix performance regression in MemoryIndex's fields() introduced
+  in Lucene 6. (Steve Mason via David Smiley)
+
 Improvements
 
 * LUCENE-7323: Compound file writing now verifies the incoming
@@ -66,12 +116,48 @@ Improvements
   and empty boolean queries now rewrite to MatchNoDocsQuery instead of
   vice/versa (Jim Ferenczi via Mike McCandless)
 
+* LUCENE-7359: Add equals() and hashCode() to Explanation (Alan Woodward)
+
+* LUCENE-7353: ScandinavianFoldingFilterFactory and
+  ScandinavianNormalizationFilterFactory now implement MultiTermAwareComponent.
+  (Adrien Grand)
+
+* LUCENE-2605: Add classic QueryParser option setSplitOnWhitespace() to
+  control whether to split on whitespace prior to text analysis.  Default
+  behavior remains unchanged: split-on-whitespace=true. (Steve Rowe)
+
+* LUCENE-7276: MatchNoDocsQuery now includes an optional reason for
+  why it was used (Jim Ferenczi via Mike McCandless)
+
+* LUCENE-7355: AnalyzingQueryParser now only applies the subset of the analysis
+  chain that is about normalization for range/fuzzy/wildcard queries.
+  (Adrien Grand)
+
+* LUCENE-7376: Add support for ToParentBlockJoinQuery to fast vector highlighter's
+  FieldQuery. (Martijn van Groningen)
+
+* LUCENE-7385: Improve/fix assert messages in SpanScorer. (David Smiley)
+
+* LUCENE-7393: Add ICUTokenizer option to parse Myanmar text as syllables instead of words,
+  because the ICU word-breaking algorithm has some issues. This allows for the previous 
+  tokenization used before Lucene 5. (AM, Robert Muir)
+
 Optimizations
 
 * LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)
 
 * LUCENE-7356: SearchGroup tweaks. (Christine Poerschke)
 
+* LUCENE-7351: Doc id compression for points. (Adrien Grand)
+
+* LUCENE-7371: Point values are now better compressed using run-length
+  encoding. (Adrien Grand)
+
+* LUCENE-7311: Cached term queries do not seek the terms dictionary anymore.
+  (Adrien Grand)
+
+* LUCENE-7396: Faster flush of points. (Adrien Grand, Mike McCandless)
+
 Other
 
 * LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien
@@ -81,7 +167,15 @@ Other
 
 * LUCENE-7346: Update forbiddenapis to version 2.2.
   (Uwe Schindler)
+
+* LUCENE-7360: Explanation.toHtml() is deprecated. (Alan Woodward)
 
+* LUCENE-7372: Factor out an org.apache.lucene.search.FilterWeight class.
+  (Christine Poerschke, Adrien Grand, David Smiley)
+
+* LUCENE-7384: Removed ScoringWrapperSpans. And tweaked SpanWeight.buildSimWeight() to
+  reuse the existing Similarity instead of creating a new one. (David Smiley)
+
 ======================= Lucene 6.1.0 =======================
 
 New Features

diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt
@@ -14,3 +14,36 @@ yielding better compression ratios. In case you would still like to compress on
 top of the codec, you can do it on the application side by using the utility
 classes from the java.util.zip package.
 
+## Explanation.toHtml() removed (LUCENE-7360)
+
+Clients wishing to render Explanations as HTML should implement their own
+utilities for this.
+
+## Similarity.coord and BooleanQuery.disableCoord removed (LUCENE-7369)
+
+Coordination factors were a workaround for the fact that the ClassicSimilarity
+does not have strong enough term frequency saturation. This causes disjunctions
+to get better scores on documents that have many occurrences of a few query
+terms than on documents that match most clauses, which is most of the time
+undesirable. The new BM25Similarity does not suffer from this problem since it
+has better saturation for the contribution of the term frequency so the coord
+factors have been removed from scores. Things now work as if coords were always
+disabled when constructing boolean queries.
+
+## Weight.getValueForNormalization() and Weight.normalize() removed (LUCENE-7368)
+
+Query normalization's goal was to make scores comparable across queries, which
+was only implemented by the ClassicSimilarity. Since ClassicSimilarity is not
+the default similarity anymore, this functionality has been removed. Boosts are
+now propagated through Query#createWeight.
+
+## AnalyzingQueryParser removed (LUCENE-7355)
+
+The functionality of AnalyzingQueryParser has been folded into the classic
+QueryParser, which now passes terms through Analyzer#normalize when generating
+queries.
+
+## CommonQueryParserConfiguration.setLowerCaseExpandedTerms removed (LUCENE-7355)
+
+This option has been removed as expanded terms are now normalized through
+Analyzer#normalize.
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
@@ -143,5 +143,13 @@ protected TokenStreamComponents createComponents(String fieldName) {
     }
     return new TokenStreamComponents(source, new ArabicStemFilter(result));
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new LowerCaseFilter(in);
+    result = new DecimalDigitFilter(result);
+    result = new ArabicNormalizationFilter(result);
+    return result;
+  }
 }
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
@@ -126,4 +126,11 @@ public TokenStreamComponents createComponents(String fieldName) {
     result = new BulgarianStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
@@ -127,5 +127,12 @@ protected TokenStreamComponents createComponents(String fieldName) {
       result = new SetKeywordMarkerFilter(result, excltable);
     return new TokenStreamComponents(source, new BrazilianStemFilter(result));
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
@@ -130,4 +130,12 @@ protected TokenStreamComponents createComponents(String fieldName) {
     result = new SnowballFilter(result, new CatalanStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new ElisionFilter(result, DEFAULT_ARTICLES);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }
diff --git a/...e/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java b/...e/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
diff --git a/.../analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex b/.../analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
@@ -14,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.lucene.analysis.charfilter;
 
 import java.io.IOException;