Skip to content

Commit

Permalink
Bug #25688673: REMOVE SPECIAL-CASING OF NON-STRNXFRM-BASED COLLATIONS
Browse files Browse the repository at this point in the history
Some character sets are designated as MY_CS_STRNXFRM, meaning that sorting
needs to go through my_strnxfrm() (implemented by the charset), and some are
not, meaning that a client can do the strnxfrm itself based on
cs->sort_order. However, most of the logic related to the latter has been
removed already (e.g. filesort always uses my_strnxfrm() since 2003), and now
it's mostly in the way. The three main uses left are:

 1. A microoptimization for constructing sort keys in filesort.
 2. A home-grown implementation of Boyer-Moore for accelerating certain
    LIKE patterns that should probably be handled through FTS.
 3. Some optimizations to MyISAM prefix keys.

Given that our default collation (utf8mb4_0900_ai_ci) now is a strnxfrm-based
collation, the benefits of keeping these around for a narrow range of
single-byte locales (like latin1_swedish_ci, cp850 and a bunch of more
obscure locales) seems dubious. We seemingly can't remove the flag entirely
due to #3 seemingly affecting the on-disk MyISAM structure, but we can remove
the code for #1 and #2.

Change-Id: If974e490d451b7278355e33ab1fca993f446b792
  • Loading branch information
Steinar H. Gunderson committed Mar 15, 2017
1 parent 586af41 commit d683d25
Show file tree
Hide file tree
Showing 9 changed files with 64 additions and 354 deletions.
7 changes: 6 additions & 1 deletion include/m_ctype.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,12 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
#define MY_CS_LOADED 8 /* sets that are currently loaded */
#define MY_CS_BINSORT 16 /* if binary sort order */
#define MY_CS_PRIMARY 32 /* if primary collation */
#define MY_CS_STRNXFRM 64 /* if strnxfrm is used for sort */
#define MY_CS_STRNXFRM 64 /*
if _not_ set, sort_order will
give same result as strnxfrm --
all new collations should have this
flag set, do not check it in new code
*/
#define MY_CS_UNICODE 128 /* is a charset is BMP Unicode */
#define MY_CS_READY 256 /* if a charset is initialized */
#define MY_CS_AVAILABLE 512 /* If either compiled-in or loaded*/
Expand Down
2 changes: 1 addition & 1 deletion mysql-test/r/func_like.result
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ DROP TABLE t1, t2;
# Bug#20035071: Out of range error in subselect lead to assertion failed
CREATE TABLE t1(a INTEGER) engine=innodb;
SELECT 1 FROM t1 HAVING (SELECT 1 FROM t1) LIKE EXP(NOW());
ERROR 22003: DOUBLE value is out of range in 'exp(now())'
1
DROP TABLE t1;
#
# Bug #25140629: WRONG RESULT WHEN USING LIKE FUNCTION WITH UCA
Expand Down
4 changes: 2 additions & 2 deletions mysql-test/t/func_like.test
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ select * from t1 where a like "test%";
select * from t1 where a like "te_t";

#
# The following will test the Boyer-Moore code
# The following will test non-anchored matches
#
select * from t1 where a like "%a%";
select * from t1 where a like "%abcd%";
Expand Down Expand Up @@ -203,7 +203,7 @@ DROP TABLE t1, t2;
--echo # Bug#20035071: Out of range error in subselect lead to assertion failed

CREATE TABLE t1(a INTEGER) engine=innodb;
--error ER_DATA_OUT_OF_RANGE
# Error is OK but not mandatory.
SELECT 1 FROM t1 HAVING (SELECT 1 FROM t1) LIKE EXP(NOW());
DROP TABLE t1;

Expand Down
106 changes: 39 additions & 67 deletions sql/filesort.cc
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,6 @@ bool filesort(THD *thd, Filesort *filesort, bool sort_positions,
IO_CACHE chunk_file; // For saving Merge_chunk structs.
IO_CACHE *outfile; // Contains the final, sorted result.
Sort_param param;
bool multi_byte_charset;
Bounded_queue<uchar *, uchar *, Sort_param, Mem_compare_queue_key>
pq((Malloc_allocator<uchar*>
(key_memory_Filesort_info_record_pointers)));
Expand Down Expand Up @@ -414,8 +413,7 @@ bool filesort(THD *thd, Filesort *filesort, bool sort_positions,

param.init_for_filesort(filesort,
make_array(filesort->sortorder, s_length),
sortlength(thd, filesort->sortorder, s_length,
&multi_byte_charset),
sortlength(thd, filesort->sortorder, s_length),
table,
thd->variables.max_length_for_sort_data,
max_rows, sort_positions);
Expand All @@ -430,8 +428,7 @@ bool filesort(THD *thd, Filesort *filesort, bool sort_positions,
// If number of rows is not known, use as much of sort buffer as possible.
num_rows_estimate= table->file->estimate_rows_upper_bound();

if (multi_byte_charset &&
!(param.tmp_buffer= (char*)
if (!(param.tmp_buffer= (char*)
my_malloc(key_memory_Sort_param_tmp_buffer,
param.max_compare_length(), MYF(MY_WME))))
goto err;
Expand Down Expand Up @@ -1508,7 +1505,6 @@ uint Sort_param::make_sortkey(uchar *to, const uchar *ref_pos)
}

const CHARSET_INFO *cs=item->collation.collation;
char fill_char= ((cs->state & MY_CS_BINSORT) ? (char) 0 : ' ');

/* All item->str() to use some extra byte for end null.. */
String tmp((char*) to,sort_field->length+4,cs);
Expand Down Expand Up @@ -1539,46 +1535,29 @@ uint Sort_param::make_sortkey(uchar *to, const uchar *ref_pos)
break;
}
uint length= static_cast<uint>(res->length());
if (sort_field->need_strnxfrm)
const char *from= res->ptr();
if (pointer_cast<const uchar *>(from) == to)
{
char *from=(char*) res->ptr();
size_t tmp_length MY_ATTRIBUTE((unused));
if ((uchar*) from == to)
{
DBUG_ASSERT(sort_field->length >= length);
set_if_smaller(length,sort_field->length);
memcpy(tmp_buffer, from, length);
from= tmp_buffer;
}
tmp_length=
cs->coll->strnxfrm(cs, to, sort_field->length,
item->max_char_length(),
(uchar*) from, length,
MY_STRXFRM_PAD_TO_MAXLEN);
DBUG_ASSERT(tmp_length == sort_field->length);
DBUG_ASSERT(sort_field->length >= length);
set_if_smaller(length,sort_field->length);
memcpy(tmp_buffer, from, length);
from= tmp_buffer;
}
else
uint sort_field_length= sort_field->length;
if (sort_field->suffix_length)
{
size_t diff;
uint sort_field_length= sort_field->length -
sort_field->suffix_length;
if (sort_field_length < length)
{
diff= 0;
length= sort_field_length;
}
else
diff= sort_field_length - length;
if (sort_field->suffix_length)
{
/* Store length last in result_string */
store_length(to + sort_field_length, length,
sort_field->suffix_length);
}

my_strnxfrm(cs, to,length,(const uchar*)res->ptr(),length);
cs->cset->fill(cs, (char *)to+length,diff,fill_char);
/* Store length last in result_string */
sort_field_length-= sort_field->suffix_length;
store_length(to + sort_field_length, length, sort_field->suffix_length);
}

size_t tmp_length MY_ATTRIBUTE((unused));
tmp_length=
cs->coll->strnxfrm(cs, to, sort_field_length,
item->max_char_length(),
pointer_cast<const uchar*>(from), length,
MY_STRXFRM_PAD_TO_MAXLEN);
DBUG_ASSERT(tmp_length == sort_field_length);
break;
}
case INT_RESULT:
Expand Down Expand Up @@ -2394,32 +2373,25 @@ static uint suffix_length(ulong string_length)
@param thd Thread handler
@param sortorder Order of items to sort
@param s_length Number of items to sort
@param[out] multi_byte_charset Set to 1 if we are using multi-byte charset
(In which case we have to use strnxfrm())
@note
sortorder->length is updated for each sort item.
@n
sortorder->need_strnxfrm is set 1 if we have to use strnxfrm
@return
Total length of sort buffer in bytes
*/

uint
sortlength(THD *thd, st_sort_field *sortorder, uint s_length,
bool *multi_byte_charset)
sortlength(THD *thd, st_sort_field *sortorder, uint s_length)
{
uint total_length= 0;
*multi_byte_charset= false;

// Heed the contract that strnxfrm() needs an even number of bytes.
const uint max_sort_length_even=
(thd->variables.max_sort_length + 1) & ~1;

for (; s_length-- ; sortorder++)
{
DBUG_ASSERT(!sortorder->need_strnxfrm);
DBUG_ASSERT(sortorder->suffix_length == 0);
if (sortorder->field)
{
Expand All @@ -2428,16 +2400,12 @@ sortlength(THD *thd, st_sort_field *sortorder, uint s_length,
sortorder->length= field->sort_length();
sortorder->is_varlen= field->sort_key_is_varlen();

if (use_strnxfrm(cs))
{
// How many bytes do we need (including sort weights) for strnxfrm()?
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
sortorder->need_strnxfrm= true;
*multi_byte_charset= 1;
}
// How many bytes do we need (including sort weights) for strnxfrm()?
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);

/*
NOTE: The corresponding test below also has a check for
cs == &my_charset_bin to sort truncated blobs deterministically;
NO PAD collations to sort truncated blobs deterministically;
however, that part is dealt by in Field_blob/Field_varstring,
so we don't need it here.
*/
Expand Down Expand Up @@ -2468,16 +2436,20 @@ sortlength(THD *thd, st_sort_field *sortorder, uint s_length,
const CHARSET_INFO *cs= item->collation.collation;
sortorder->length= item->max_length;
set_if_smaller(sortorder->length, max_sort_length_even);
if (use_strnxfrm(cs))
{
// How many bytes do we need (including sort weights) for strnxfrm()?
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
sortorder->need_strnxfrm= true;
*multi_byte_charset= 1;
}
else if (cs->pad_attribute == NO_PAD)

// How many bytes do we need (including sort weights) for strnxfrm()?
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);

if (cs->pad_attribute == NO_PAD)
{
/* Store length last to be able to sort blob/varbinary */
/*
Store length last, which makes it into a tie-breaker. This is
so that e.g. 'a' < 'a\0' for the binary collation, even though
the field is fixed-width and pads with '\0'. The utf8mb4_0900_*
collations technically don't need this, since they pad with 0
(which does not match any real weight), but we'd like not to
rely on such implementation details in filesort.
*/
sortorder->suffix_length= suffix_length(sortorder->length);
sortorder->length+= sortorder->suffix_length;
}
Expand Down
3 changes: 1 addition & 2 deletions sql/filesort.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ void filesort_free_buffers(TABLE *table, bool full);
void change_double_for_sort(double nr,uchar *to);

/// Declared here so we can unit test it.
uint sortlength(THD *thd, st_sort_field *sortorder, uint s_length,
bool *multi_byte_charset);
uint sortlength(THD *thd, st_sort_field *sortorder, uint s_length);

#endif /* FILESORT_INCLUDED */
Loading

0 comments on commit d683d25

Please sign in to comment.