-
Notifications
You must be signed in to change notification settings - Fork 323
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix for Data Quality perf and issue other tweaks. (#11711)
- Fix bug where `DB_Table` data quality indicators broke deserialization in the table viz. - Memoization of the untrimmed data quality indicator, which is now an operation and a column function. - If there are more than 10,000 rows, a sample is used for the untrimmed count. - ALIASes for blank functions. - Fix for Snowflake drill down. - Bug fix for Long and Double columns with Nothings at end.
- Loading branch information
1 parent
99a91a1
commit 85c8f76
Showing
11 changed files
with
207 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
66 changes: 66 additions & 0 deletions
66
std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package org.enso.table.data.column.operation; | ||
|
||
import java.util.Random; | ||
import org.enso.base.Text_Utils; | ||
import org.enso.table.data.column.storage.ColumnStorage; | ||
import org.enso.table.data.column.storage.StringStorage; | ||
import org.enso.table.data.table.Column; | ||
import org.graalvm.polyglot.Context; | ||
|
||
public class CountUntrimmed { | ||
// Default seed for random number generation (no specific reason for this value, just stability on | ||
// result). | ||
private static final long RANDOM_SEED = 677280131; | ||
|
||
// Default sample size for counting untrimmed cells. | ||
public static final long DEFAULT_SAMPLE_SIZE = 10000; | ||
|
||
/** Counts the number of cells in the columns with leading or trailing whitespace. */ | ||
public static Long apply(Column column, long sampleSize) throws InterruptedException { | ||
ColumnStorage storage = column.getStorage(); | ||
return applyToStorage(storage, sampleSize); | ||
} | ||
|
||
/** Counts the number of cells in the given storage with leading or trailing whitespace. */ | ||
public static Long applyToStorage(ColumnStorage storage, long sampleSize) | ||
throws InterruptedException { | ||
return (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage) | ||
? stringStorage.cachedUntrimmedCount() | ||
: (Long) compute(storage, sampleSize, Context.getCurrent()); | ||
} | ||
|
||
/** Internal method performing the calculation on a storage. */ | ||
public static long compute(ColumnStorage storage, long sampleSize, Context context) { | ||
long size = storage.getSize(); | ||
|
||
long count = 0; | ||
if (sampleSize < size) { | ||
var rng = new Random(RANDOM_SEED); | ||
for (int i = 0; i < sampleSize; i++) { | ||
long idx = rng.nextInt(Math.toIntExact(size)); | ||
var val = storage.getItemAsObject(idx); | ||
if (val instanceof String str && Text_Utils.has_leading_trailing_whitespace(str)) { | ||
count++; | ||
} | ||
|
||
if (context != null) { | ||
context.safepoint(); | ||
} | ||
} | ||
count = Math.min(size, (long) Math.ceil((double) count / sampleSize * size)); | ||
} else { | ||
for (long i = 0; i < storage.getSize(); i++) { | ||
var val = storage.getItemAsObject(i); | ||
if (val instanceof String str && Text_Utils.has_leading_trailing_whitespace(str)) { | ||
count++; | ||
} | ||
|
||
if (context != null) { | ||
context.safepoint(); | ||
} | ||
} | ||
} | ||
|
||
return count; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.