Skip to content

Commit

Permalink
Part of #1867: String GroupBy optimizations are timing out nightly tests (#1869)
Browse files Browse the repository at this point in the history

This PR (part of #1867):
- Changes the medium str logic (`8 <= max_bytes < 16`) to not use the array return variation of `computeOnSegments`
- Updates `bytesToUintArr` to run in parallel

If the nightly still has issues, I think the next step is to remove the special logic for medium str until the next iteration

Co-authored-by: Pierce Hayes <[email protected]>
  • Loading branch information
stress-tess and Pierce Hayes authored Oct 31, 2022
1 parent 3a1bb3a commit fe3da03
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 60 deletions.
2 changes: 1 addition & 1 deletion benchmarks/small-str-groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import arkouda as ak

SIZES = {"small": 7, "medium": 14, "big": 28}
SIZES = {"small": 6, "medium": 12, "big": 24}


def time_ak_groupby(N_per_locale, trials, seed):
Expand Down
51 changes: 0 additions & 51 deletions src/SegmentedComputation.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ module SegmentedComputation {
StringIsUpper,
StringIsTitle,
StringBytesToUintArr,
StringBytesTo2UintArrs,
}

proc computeOnSegments(segments: [?D] int, values: [?vD] ?t, param function: SegFunction, type retType, const strArg: string = "") throws {
Expand Down Expand Up @@ -130,54 +129,4 @@ module SegmentedComputation {
}
return res;
}

// Variant of computeOnSegments that applies `function` to the bytes of each
// segment and returns TWO result arrays (as a tuple) instead of one.
// The only `function` accepted is SegFunction.StringBytesTo2UintArrs: each
// segment's byte range is split in half, the first half packed into res1[i]
// and the second half into res2[i] via stringBytesToUintArr.
// Note: with an odd segment length, half = len/2 truncates, so the ranges
// start..#half and (start+half)..#half together cover 2*half bytes — the
// final byte is not read. Presumably that last byte is the null terminator —
// TODO confirm against the caller's length convention.
proc twoReturnComputeOnSegments(segments: [?D] int, values: [?vD] ?t, param function: SegFunction, type retType) throws {
  // I hate duplicating the code from computeOnSegments but I need to return 2 arrays
  var res1: [D] retType;
  var res2: [D] retType;
  // Nothing to do for an empty segments domain; return the (empty) pair as-is.
  if D.size == 0 {
    return (res1, res2);
  }

  // Determine, per locale, which segments' bytes that locale owns
  // (first owned segment index, count of owned segments, segment lengths).
  const (startSegInds, numSegs, lengths) = computeSegmentOwnership(segments, vD);

  // Start task parallelism
  coforall loc in Locales {
    on loc {
      const myFirstSegIdx = startSegInds[loc.id];
      // max(0, ...) guards against a locale owning no segments.
      const myNumSegs = max(0, numSegs[loc.id]);
      const mySegInds = {myFirstSegIdx..#myNumSegs};
      // Segment offsets whose bytes are owned by loc
      // Lengths of segments whose bytes are owned by loc
      var mySegs, myLens: [mySegInds] int;
      // Pull the (possibly remote) offsets and lengths to this locale with
      // source-side aggregation.
      forall i in mySegInds with (var agg = new SrcAggregator(int)) {
        agg.copy(mySegs[i], segments[i]);
        agg.copy(myLens[i], lengths[i]);
      }
      try {
        // Apply function to bytes of each owned segment, aggregating return values to res1 and res2
        forall (start, len, i) in zip(mySegs, myLens, mySegInds) with (var agg1 = newDstAggregator(retType),
                                                                      var agg2 = newDstAggregator(retType)) {
          select function {
            when SegFunction.StringBytesTo2UintArrs {
              // Pack the first half of the segment's bytes into res1 and the
              // second half into res2 (see the truncation note in the header).
              var half = (len/2):int;
              agg1.copy(res1[i], stringBytesToUintArr(values, start..#half));
              agg2.copy(res2[i], stringBytesToUintArr(values, (start+half)..#half));
            }
            otherwise {
              // `function` is a param, so an unsupported value is rejected at
              // compile time rather than at run time.
              compilerError("Unrecognized segmented function");
            }
          }
        }
      } catch {
        // Wrap any failure with call-site context; surfaced to callers as an
        // IllegalArgumentError-classed ErrorWithContext.
        throw new owned ErrorWithContext("Error computing %s on string or segment".format(function:string),
                                         getLineNumber(),
                                         getRoutineName(),
                                         getModuleName(),
                                         "IllegalArgumentError");
      }
    }
  }
  return (res1, res2);
}
}
20 changes: 13 additions & 7 deletions src/SegmentedString.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ module SegmentedString {
return computeOnSegments(offsets.a, values.a, SegFunction.StringIsTitle, bool);
}

proc bytesToUintArr(max_bytes:int, st) throws {
proc bytesToUintArr(const max_bytes:int, st) throws {
// bytes contained in strings < 128 bits, so concatenating is better than the hash
if max_bytes < 8 {
// we only need one uint array
Expand All @@ -492,7 +492,16 @@ module SegmentedString {
}
else {
// we need two uint arrays
var (numeric1, numeric2) = twoReturnComputeOnSegments(offsets.a, values.a, SegFunction.StringBytesTo2UintArrs, uint);
ref off = offsets.a;
ref vals = values.a;
// should we do strings.getLengths()-1 to not account for null byte
const lens = getLengths();
var numeric1, numeric2: [offsets.aD] uint;
forall (o, l, n1, n2) in zip(off, lens, numeric1, numeric2) {
const half = (l/2):int;
n1 = stringBytesToUintArr(vals, o..#half);
n2 = stringBytesToUintArr(vals, (o+half)..#half);
}
const concat1Name = st.nextName();
const concat2Name = st.nextName();
st.addEntry(concat1Name, new shared SymEntry(numeric1));
Expand Down Expand Up @@ -1429,11 +1438,8 @@ module SegmentedString {
The SegFunction called by computeOnSegments for bytesToUintArr
*/
inline proc stringBytesToUintArr(values, rng) throws {
var concat: uint;
for v in values[rng] {
concat = concat<<8 + v:uint;
}
return concat;
var localSlice = new lowLevelLocalizingSlice(values, rng);
return | reduce [i in 0..#rng.size] (localSlice.ptr(i):uint)<<(8*(rng.size-1-i));
}

/* Test array of strings for membership in another array (set) of strings. Returns
Expand Down
3 changes: 2 additions & 1 deletion src/UniqueMsg.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,8 @@ module UniqueMsg
// Only one array which is a strings
var (myNames, _) = namesList[0].splitMsgToTuple("+", 2);
var strings = getSegString(myNames, st);
var max_bytes = max reduce strings.getLengths();
// should we do strings.getLengths()-1 to not account for null
const max_bytes = max reduce strings.getLengths();
if max_bytes < 16 {
var str_names = strings.bytesToUintArr(max_bytes, st).split("+");
var (totalDigits, bitWidths, negs) = getNumDigitsNumericArrays(str_names, st);
Expand Down

0 comments on commit fe3da03

Please sign in to comment.