Collations with nondeterministic comparison

This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <[email protected]> Reviewed-by: Peter Geoghegan <[email protected]> Discussion: https://www.postgresql.org/message-id/flat/[email protected]
dwsteele · Mar 22, 2019 · 5e1963f · 5e1963f
1 parent 2ab6d28
commit 5e1963f
Show file tree

Hide file tree

Showing 69 changed files with 2,087 additions and 239 deletions.
diff --git a/contrib/bloom/bloom.h b/contrib/bloom/bloom.h
@@ -137,6 +137,7 @@ typedef struct BloomMetaPageData
 typedef struct BloomState
 {
 	FmgrInfo	hashFn[INDEX_MAX_KEYS];
+	Oid			collations[INDEX_MAX_KEYS];
 	BloomOptions opts;			/* copy of options on index's metapage */
 	int32		nColumns;
 

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
@@ -163,6 +163,7 @@ initBloomState(BloomState *state, Relation index)
 		fmgr_info_copy(&(state->hashFn[i]),
 					   index_getprocinfo(index, i + 1, BLOOM_HASH_PROC),
 					   CurrentMemoryContext);
+		state->collations[i] = index->rd_indcollation[i];
 	}
 
 	/* Initialize amcache if needed with options from metapage */
@@ -267,7 +268,7 @@ signValue(BloomState *state, BloomSignatureWord *sign, Datum value, int attno)
 	 * different columns will be mapped into different bits because of step
 	 * above
 	 */
-	hashVal = DatumGetInt32(FunctionCall1(&state->hashFn[attno], value));
+	hashVal = DatumGetInt32(FunctionCall1Coll(&state->hashFn[attno], state->collations[attno], value));
 	mySrand(hashVal ^ myRand());
 
 	for (j = 0; j < state->opts.bitSize[attno]; j++)

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
@@ -2077,6 +2077,13 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        default, <literal>c</literal> = libc, <literal>i</literal> = icu</entry>
      </row>
 
+     <row>
+      <entry><structfield>collisdeterministic</structfield></entry>
+      <entry><type>bool</type></entry>
+      <entry></entry>
+      <entry>Is the collation deterministic?</entry>
+     </row>
+
      <row>
       <entry><structfield>collencoding</structfield></entry>
       <entry><type>int4</type></entry>

diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
@@ -847,11 +847,13 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE');
 
    <para>
     Note that while this system allows creating collations that <quote>ignore
-    case</quote> or <quote>ignore accents</quote> or similar (using
-    the <literal>ks</literal> key), PostgreSQL does not at the moment allow
-    such collations to act in a truly case- or accent-insensitive manner.  Any
-    strings that compare equal according to the collation but are not
-    byte-wise equal will be sorted according to their byte values.
+    case</quote> or <quote>ignore accents</quote> or similar (using the
+    <literal>ks</literal> key), in order for such collations to act in a
+    truly case- or accent-insensitive manner, they also need to be declared as not
+    <firstterm>deterministic</firstterm> in <command>CREATE COLLATION</command>;
+    see <xref linkend="collation-nondeterministic"/>.
+    Otherwise, any strings that compare equal according to the collation but
+    are not byte-wise equal will be sorted according to their byte values.
    </para>
 
    <note>
@@ -883,6 +885,55 @@ CREATE COLLATION french FROM "fr-x-icu";
    </para>
    </sect4>
    </sect3>
+
+   <sect3 id="collation-nondeterministic">
+    <title>Nondeterministic Collations</title>
+
+    <para>
+     A collation is either <firstterm>deterministic</firstterm> or
+     <firstterm>nondeterministic</firstterm>.  A deterministic collation uses
+     deterministic comparisons, which means that it considers strings to be
+     equal only if they consist of the same byte sequence.  Nondeterministic
+     comparison may determine strings to be equal even if they consist of
+     different bytes.  Typical situations include case-insensitive comparison,
+     accent-insensitive comparison, as well as comparison of strings in
+     different Unicode normal forms.  It is up to the collation provider to
+     actually implement such insensitive comparisons; the deterministic flag
+     only determines whether ties are to be broken using bytewise comparison.
+     See also <ulink url="https://unicode.org/reports/tr10">Unicode Technical
+     Standard 10</ulink> for more information on the terminology.
+    </para>
+
+    <para>
+     To create a nondeterministic collation, specify the property
+     <literal>deterministic = false</literal> to <command>CREATE
+     COLLATION</command>, for example:
+<programlisting>
+CREATE COLLATION ndcoll (provider = icu, locale = 'und', deterministic = false);
+</programlisting>
+     This example would use the standard Unicode collation in a
+     nondeterministic way.  In particular, this would allow strings in
+     different normal forms to be compared correctly.  More interesting
+     examples make use of the ICU customization facilities explained above.
+     For example:
+<programlisting>
+CREATE COLLATION case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
+CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-true', deterministic = false);
+</programlisting>
+    </para>
+
+    <para>
+     All standard and predefined collations are deterministic; all
+     user-defined collations are deterministic by default.  While
+     nondeterministic collations give a more <quote>correct</quote> behavior,
+     especially when considering the full power of Unicode and its many
+     special cases, they also have some drawbacks.  Foremost, their use leads
+     to a performance penalty.  Also, certain operations are not possible with
+     nondeterministic collations, such as pattern matching operations.
+     Therefore, they should be used only in cases where they are specifically
+     wanted.
+    </para>
+   </sect3>
   </sect2>
  </sect1>
 

diff --git a/doc/src/sgml/citext.sgml b/doc/src/sgml/citext.sgml
@@ -14,6 +14,16 @@
   exactly like <type>text</type>.
  </para>
 
+ <tip>
+  <para>
+   Consider using <firstterm>nondeterministic collations</firstterm> (see
+   <xref linkend="collation-nondeterministic"/>) instead of this module.  They
+   can be used for case-insensitive comparisons, accent-insensitive
+   comparisons, and other combinations, and they handle more Unicode special
+   cases correctly.
+  </para>
+ </tip>
+
  <sect2>
   <title>Rationale</title>
 
@@ -246,6 +256,17 @@ SELECT * FROM users WHERE nick = 'Larry';
       will be invoked instead.
     </para>
     </listitem>
+
+    <listitem>
+     <para>
+      The approach of lower-casing strings for comparison does not handle some
+      Unicode special cases correctly, for example when one upper-case letter
+      has two lower-case letter equivalents.  Unicode distinguishes between
+      <firstterm>case mapping</firstterm> and <firstterm>case
+      folding</firstterm> for this reason.  Use nondeterministic collations
+      instead of <type>citext</type> to handle that correctly.
+     </para>
+    </listitem>
    </itemizedlist>
  </sect2>
 

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
@@ -4065,6 +4065,12 @@ cast(-44 as bit(12))           <lineannotation>111111010100</lineannotation>
     </para>
    </caution>
 
+   <para>
+    The pattern matching operators of all three kinds do not support
+    nondeterministic collations.  If required, apply a different collation to
+    the expression to work around this limitation.
+   </para>
+
   <sect2 id="functions-like">
    <title><function>LIKE</function></title>
 

diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml
@@ -23,6 +23,7 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> (
     [ LC_COLLATE = <replaceable>lc_collate</replaceable>, ]
     [ LC_CTYPE = <replaceable>lc_ctype</replaceable>, ]
     [ PROVIDER = <replaceable>provider</replaceable>, ]
+    [ DETERMINISTIC = <replaceable>boolean</replaceable>, ]
     [ VERSION = <replaceable>version</replaceable> ]
 )
 CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replaceable>existing_collation</replaceable>
@@ -124,6 +125,27 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replace
      </listitem>
     </varlistentry>
 
+    <varlistentry>
+     <term><literal>DETERMINISTIC</literal></term>
+
+     <listitem>
+      <para>
+       Specifies whether the collation should use deterministic comparisons.
+       The default is true.  A deterministic comparison considers strings that
+       are not byte-wise equal to be unequal even if they are considered
+       logically equal by the comparison.  PostgreSQL breaks ties using a
+       byte-wise comparison.  Comparison that is not deterministic can make the
+       collation be, say, case- or accent-insensitive.  For that, you need to
+       choose an appropriate <literal>LC_COLLATE</literal> setting
+       <emphasis>and</emphasis> set the collation to not deterministic here.
+      </para>
+
+      <para>
+       Nondeterministic collations are only supported with the ICU provider.
+      </para>
+     </listitem>
+    </varlistentry>
+
     <varlistentry>
      <term><replaceable>version</replaceable></term>
 

diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
@@ -27,8 +27,10 @@
 #include "postgres.h"
 
 #include "access/hash.h"
+#include "catalog/pg_collation.h"
 #include "utils/builtins.h"
 #include "utils/hashutils.h"
+#include "utils/pg_locale.h"
 
 /*
  * Datatype-specific hash functions.
@@ -243,15 +245,51 @@ Datum
 hashtext(PG_FUNCTION_ARGS)
 {
 	text	   *key = PG_GETARG_TEXT_PP(0);
+	Oid			collid = PG_GET_COLLATION();
+	pg_locale_t	mylocale = 0;
 	Datum		result;
 
-	/*
-	 * Note: this is currently identical in behavior to hashvarlena, but keep
-	 * it as a separate function in case we someday want to do something
-	 * different in non-C locales.  (See also hashbpchar, if so.)
-	 */
-	result = hash_any((unsigned char *) VARDATA_ANY(key),
-					  VARSIZE_ANY_EXHDR(key));
+	if (!collid)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for string hashing"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
+
+	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
+		mylocale = pg_newlocale_from_collation(collid);
+
+	if (!mylocale || mylocale->deterministic)
+	{
+		result = hash_any((unsigned char *) VARDATA_ANY(key),
+						  VARSIZE_ANY_EXHDR(key));
+	}
+	else
+	{
+#ifdef USE_ICU
+		if (mylocale->provider == COLLPROVIDER_ICU)
+		{
+			int32_t		ulen = -1;
+			UChar	   *uchar = NULL;
+			Size		bsize;
+			uint8_t	   *buf;
+
+			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
+
+			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
+									uchar, ulen, NULL, 0);
+			buf = palloc(bsize);
+			ucol_getSortKey(mylocale->info.icu.ucol,
+							uchar, ulen, buf, bsize);
+
+			result = hash_any(buf, bsize);
+
+			pfree(buf);
+		}
+		else
+#endif
+			/* shouldn't happen */
+			elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
+	}
 
 	/* Avoid leaking memory for toasted inputs */
 	PG_FREE_IF_COPY(key, 0);
@@ -263,12 +301,52 @@ Datum
 hashtextextended(PG_FUNCTION_ARGS)
 {
 	text	   *key = PG_GETARG_TEXT_PP(0);
+	Oid			collid = PG_GET_COLLATION();
+	pg_locale_t	mylocale = 0;
 	Datum		result;
 
-	/* Same approach as hashtext */
-	result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
-							   VARSIZE_ANY_EXHDR(key),
-							   PG_GETARG_INT64(1));
+	if (!collid)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for string hashing"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
+
+	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
+		mylocale = pg_newlocale_from_collation(collid);
+
+	if (!mylocale || mylocale->deterministic)
+	{
+		result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
+								   VARSIZE_ANY_EXHDR(key),
+								   PG_GETARG_INT64(1));
+	}
+	else
+	{
+#ifdef USE_ICU
+		if (mylocale->provider == COLLPROVIDER_ICU)
+		{
+			int32_t		ulen = -1;
+			UChar	   *uchar = NULL;
+			Size		bsize;
+			uint8_t	   *buf;
+
+			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
+
+			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
+									uchar, ulen, NULL, 0);
+			buf = palloc(bsize);
+			ucol_getSortKey(mylocale->info.icu.ucol,
+							uchar, ulen, buf, bsize);
+
+			result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1));
+
+			pfree(buf);
+		}
+		else
+#endif
+			/* shouldn't happen */
+			elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
+	}
 
 	PG_FREE_IF_COPY(key, 0);
 

diff --git a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c
@@ -630,7 +630,8 @@ spg_text_leaf_consistent(PG_FUNCTION_ARGS)
 			 * query (prefix) string, so we don't need to check it again.
 			 */
 			res = (level >= queryLen) ||
-				DatumGetBool(DirectFunctionCall2(text_starts_with,
+				DatumGetBool(DirectFunctionCall2Coll(text_starts_with,
+													 PG_GET_COLLATION(),
 												 out->leafValue,
 												 PointerGetDatum(query)));
 

diff --git a/src/backend/catalog/pg_collation.c b/src/backend/catalog/pg_collation.c
@@ -46,6 +46,7 @@ Oid
 CollationCreate(const char *collname, Oid collnamespace,
 				Oid collowner,
 				char collprovider,
+				bool collisdeterministic,
 				int32 collencoding,
 				const char *collcollate, const char *collctype,
 				const char *collversion,
@@ -160,6 +161,7 @@ CollationCreate(const char *collname, Oid collnamespace,
 	values[Anum_pg_collation_collnamespace - 1] = ObjectIdGetDatum(collnamespace);
 	values[Anum_pg_collation_collowner - 1] = ObjectIdGetDatum(collowner);
 	values[Anum_pg_collation_collprovider - 1] = CharGetDatum(collprovider);
+	values[Anum_pg_collation_collisdeterministic - 1] = BoolGetDatum(collisdeterministic);
 	values[Anum_pg_collation_collencoding - 1] = Int32GetDatum(collencoding);
 	namestrcpy(&name_collate, collcollate);
 	values[Anum_pg_collation_collcollate - 1] = NameGetDatum(&name_collate);