Skip to content

Commit

Permalink
More protection in cbindWithNames from weird names. (#85)
Browse files Browse the repository at this point in the history
- Null names are now ignored, allowing us to handle missing symbols safely.
- Explicitly handle duplicate names by only keeping the first occurrence, which
  is more intuitive than the old behavior of keeping only the last occurrence.
- Preserve the order of names from the first matrix in the final intersection,
  which should reduce the amount of reordering during extraction.
- Remove the C++ code by handling all the intersection in the Javascript side,
  which avoids the need to mock up incomparable indices for nulls/duplicates.
  • Loading branch information
LTLA authored Sep 24, 2023
1 parent c9fa537 commit 12783ed
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 96 deletions.
109 changes: 58 additions & 51 deletions js/cbind.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import * as utils from "./utils.js";
import { ScranMatrix } from "./ScranMatrix.js";
import * as gc from "./gc.js";
import * as subset from "./subset.js";

function harvest_matrices(x) {
let output = utils.createBigUint64WasmArray(x.length);
Expand Down Expand Up @@ -67,87 +68,93 @@ export function rbind(inputs) {
return output;
}


/**
* Combine matrices by column, after subsetting each matrix to the intersection of common features.
*
* @param {Array} inputs - Array of one or more {@linkplain ScranMatrix} objects.
* @param {Array} names - Array of length equal to `inputs`.
* Each entry should be an Array containing the row names of the corresponding entry of `inputs`.
* Names should correspond to the rows of that entry of `inputs`.
* Any `null` names are ignored.
* If names are duplicated within each array, only the first occurrence is considered in the intersection.
*
* @return {object} An object containing:
* - `matrix`, a {@linkplain ScranMatrix} containing the combined matrices.
* - `indices`, an Int32WasmArray of length equal to the number of rows in `matrix`.
* - `indices`, an Int32Array of length equal to the number of rows in `matrix`.
* This contains the index of the row in the first entry of `inputs` corresponding to each row of `matrix`,
* i.e., the gene at the `i`-th row of `matrix` is the same as the gene at the `indices[i]`-th row of `inputs[0]`.
* This is guaranteed to be sorted.
* - `names`, an array of names identifying the rows of `matrix`.
* This is constructed by indexing the first entry of `names` with `indices`.
*/
export function cbindWithNames(x, names) {
let mat_ptrs;
let renamed = [];
let name_ptrs;
let indices;
let output = {};

try {
// Building a common set of rownames.
if (names.length !== x.length) {
throw new Error("length of 'names' should be equal to length of 'x'");
}

let common = {};
let universe = [];
for (var i = 0; i < names.length; i++) {
if (x[i].numberOfRows() !== names[i].length) {
throw new Error("length of each 'names' must equal number of rows of its corresponding 'x'");
}
names[i].forEach(x => {
if (!(x in common)) {
common[x] = universe.length;
universe.push(x);
// Find the intersection of names, following the order of the first entry.
// We do so to try to improve the chance of getting an ordered subset for efficient extraction.
let ordered_intersection = [];
let remapping = new Map;

if (names.length > 0) {
let intersection = new Set;
for (var n = 0; n < names.length; ++n) {
let current = new Set();
for (const name of names[n]) {
if (name !== null) {
if (n == 0 || intersection.has(name)) {
current.add(name);
}
}
});
}
intersection = current;
}

name_ptrs = utils.createBigUint64WasmArray(x.length);
{
let names_arr = name_ptrs.array();
for (var i = 0; i < names.length; i++) {
let current = names[i];
let replacement = utils.createInt32WasmArray(current.length);
let replacement_arr = replacement.array();
current.forEach((x, i) => {
replacement_arr[i] = common[x];
});
renamed.push(replacement);
names_arr[i] = BigInt(replacement.offset);
for (const name of names[0]) {
if (name !== null && intersection.has(name)) {
let candidate = remapping.get(name);
if (candidate == undefined) { // only consider the first occurence.
remapping.set(name, ordered_intersection.length);
ordered_intersection.push(name);
}
}
}
}

mat_ptrs = harvest_matrices(x);
indices = utils.createInt32WasmArray(x[0].numberOfRows());
output.matrix = gc.call(
module => module.cbind_with_rownames(x.length, mat_ptrs.offset, name_ptrs.offset, indices.offset),
ScranMatrix
);
// Actually generating the combined matrix.
let output = {};
let tmp_subset = []
let tmp_sliced = []

try {
for (var n = 0; n < names.length; ++n) {
let survivors = utils.createInt32WasmArray(ordered_intersection.length);
survivors.fill(-1);
let sarray = survivors.array();
names[n].forEach((x, i) => {
let candidate = remapping.get(x);
if (candidate !== undefined) {
if (sarray[candidate] < 0) { // only consider the first occurrence.
sarray[candidate] = i;
}
}
});

output.indices = indices.slice(0, output.matrix.numberOfRows());
let internames = [];
for (const i of output.indices) {
internames.push(names[0][i]);
tmp_subset.push(survivors);
tmp_sliced.push(subset.subsetRows(x[n], survivors));
}
output.names = internames;

output.matrix = cbind(tmp_sliced);
output.indices = tmp_subset[0].slice();
output.names = ordered_intersection;

} catch (e) {
utils.free(output.matrix);
throw e;

} finally {
utils.free(mat_ptrs);
utils.free(name_ptrs);
utils.free(indices);
for (const x of renamed) {
for (const x of tmp_subset) {
utils.free(x);
}
for (const x of tmp_sliced) {
utils.free(x);
}
}
Expand Down
26 changes: 0 additions & 26 deletions src/cbind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,34 +60,8 @@ NumericMatrix rbind(int n, uintptr_t mats) {
return NumericMatrix(tatami::make_DelayedBind<0>(std::move(collected)));
}

NumericMatrix cbind_with_rownames(int n, uintptr_t mats, uintptr_t names, uintptr_t indices) {
if (n == 0) {
throw std::runtime_error("need at least one matrix to cbind");
}

auto mat_ptrs = convert_array_of_offsets<const NumericMatrix*>(n, mats);
const auto& first = *(mat_ptrs[0]);
std::vector<std::remove_reference<decltype(first.ptr)>::type> inputs;
inputs.reserve(n);
for (int i = 0; i < n; ++i) {
inputs.push_back(mat_ptrs[i]->ptr);
}

auto name_ptrs = convert_array_of_offsets<const int32_t*>(n, names);
auto out = tatami::bind_intersection<1>(inputs, name_ptrs);

// Save the direct row indices for the first matrix.
const auto& idx = out.second;
auto idptr = reinterpret_cast<int*>(indices);
std::copy(idx.begin(), idx.end(), idptr);

return NumericMatrix(std::move(out.first));
}

EMSCRIPTEN_BINDINGS(cbind) {
emscripten::function("cbind", &cbind);

emscripten::function("rbind", &rbind);

emscripten::function("cbind_with_rownames", &cbind_with_rownames);
}
115 changes: 96 additions & 19 deletions tests/cbind.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -60,43 +60,98 @@ test("cbindWithNames works correctly (simple)", () => {
expect(combined.matrix.numberOfColumns()).toBe(60);

expect(compare.equalArrays(combined.names, ["E", "F", "G", "H", "I", "J"])).toBe(true);
expect(compare.equalArrays(combined.indices, [4, 5, 6, 7, 8, 9])).toBe(true);

expect(compare.equalArrays(combined.matrix.column(0), mat1.column(0).slice(4))).toBe(true);
expect(compare.equalArrays(combined.matrix.column(10), mat2.column(0).slice(2, 8))).toBe(true);
expect(compare.equalArrays(combined.matrix.column(30), mat3.column(0).slice(0, 6))).toBe(true);

expect(
compare.equalArrays(
combined.matrix.row(1),
[ ...mat1.row(5), ...mat2.row(3), ...mat3.row(1) ]
)
).toBe(true);

// Freeing all the bits and pieces.
combined.matrix.free();
mat1.free();
mat2.free();
mat3.free();
})

function quick_check(mat1, mat2, mat3, combined) {
test("cbindWithNames works correctly (complex)", () => {
var mat1 = simulate.simulateDenseMatrix(10, 10);
var names1 = ["Z", "X", "V", "T", "R", "P", "N", "L", "J", "H"]; // every second letter, from the end.
var mat2 = simulate.simulateDenseMatrix(8, 20);
var names2 = ["I", "J", "K", "L", "M", "N", "O", "P"]; // consecutive letters.
var mat3 = simulate.simulateDenseMatrix(4, 30);
var names3 = ["L", "J", "N", "P"]; // random order.

let combined = scran.cbindWithNames([mat1, mat2, mat3], [names1, names2, names3]);
expect(combined.matrix.numberOfRows()).toBe(4);
expect(combined.matrix.numberOfColumns()).toBe(60);

expect(compare.equalArrays(combined.names, ["P", "N", "L", "J",])).toBe(true);
expect(compare.equalArrays(combined.matrix.column(0), mat1.column(0).slice(5, 9))).toBe(true);
expect(compare.equalArrays(combined.indices, [5, 6, 7, 8])).toBe(true);

let y1 = mat1.column(0);
let expected1 = [5,6,7,8].map(i => y1[i]);
expect(compare.equalArrays(combined.matrix.column(0), expected1)).toBe(true);

let y2 = mat2.column(0);
let expected2 = [y2[7], y2[5], y2[3], y2[1]];
let expected2 = [7, 5, 3, 1].map(i => y2[i]);
expect(compare.equalArrays(combined.matrix.column(10), expected2)).toBe(true);

let y3 = mat3.column(0);
let expected3 = [y3[3], y3[2], y3[0], y3[1]];
let expected3 = [3, 2, 0, 1].map(i => y3[i]);
expect(compare.equalArrays(combined.matrix.column(30), expected3)).toBe(true);
}

test("cbindWithNames works correctly (complex)", () => {
expect(
compare.equalArrays(
combined.matrix.row(2),
[ ...mat1.row(7), ...mat2.row(3), ...mat3.row(0) ]
)
).toBe(true);

// Freeing all the bits and pieces.
combined.matrix.free();
mat1.free();
mat2.free();
mat3.free();
})

test("cbindWithNames works correctly (duplicates)", () => {
var mat1 = simulate.simulateDenseMatrix(10, 10);
var names1 = ["Z", "X", "V", "T", "R", "P", "N", "L", "J", "H"]; // every second letter, from the end.
var names1 = ["Z", "A", "Z", "A", "R", "P", "N", "L", "N", "J"]; // preceding duplicates that might mess with the indexing, plus duplicated 'N'.
var mat2 = simulate.simulateDenseMatrix(8, 20);
var names2 = ["I", "J", "K", "L", "M", "N", "O", "P"]; // consecutive letters.
var mat3 = simulate.simulateDenseMatrix(4, 30);
var names3 = ["L", "J", "N", "P"]; // random order.
var names2 = ["I", "J", "K", "L", "M", "N", "P", "P"]; // duplicate in 'P', the first occurrence is used.
var mat3 = simulate.simulateDenseMatrix(5, 30);
var names3 = ["L", "J", "N", "P", "L"]; // duplicate in 'L', first occurrence is used again.

let combined = scran.cbindWithNames([mat1, mat2, mat3], [names1, names2, names3]);
quick_check(mat1, mat2, mat3, combined);
expect(combined.matrix.numberOfRows()).toBe(4);
expect(combined.matrix.numberOfColumns()).toBe(60);
expect(compare.equalArrays(combined.names, ["P", "N", "L", "J",])).toBe(true);
expect(compare.equalArrays(combined.indices, [5, 6, 7, 9])).toBe(true);

let y1 = mat1.column(0);
let expected1 = [5, 6, 7, 9].map(i => y1[i]);
expect(compare.equalArrays(combined.matrix.column(0), expected1)).toBe(true);

let y2 = mat2.column(0);
let expected2 = [6, 5, 3, 1].map(i => y2[i]);
expect(compare.equalArrays(combined.matrix.column(10), expected2)).toBe(true);

let y3 = mat3.column(0);
let expected3 = [3, 2, 0, 1].map(i => y3[i]);
expect(compare.equalArrays(combined.matrix.column(30), expected3)).toBe(true);

expect(
compare.equalArrays(
combined.matrix.row(3),
[ ...mat1.row(9), ...mat2.row(1), ...mat3.row(1) ]
)
).toBe(true);

// Freeing all the bits and pieces.
combined.matrix.free();
Expand All @@ -105,16 +160,38 @@ test("cbindWithNames works correctly (complex)", () => {
mat3.free();
})

test("cbindWithNames works correctly (duplicates)", () => {
test("cbindWithNames works correctly (nulls)", () => {
var mat1 = simulate.simulateDenseMatrix(10, 10);
var names1 = ["Z", "A", "Z", "A", "R", "P", "N", "L", "J", "H"]; // preceding duplicates that might mess with the indexing.
var mat2 = simulate.simulateDenseMatrix(8, 20);
var names2 = ["I", "J", "K", "L", "M", "N", "P", "P"]; // duplicate in 'P', the last occurrence is used.
var mat3 = simulate.simulateDenseMatrix(4, 30);
var names3 = ["L", "J", "N", "P"];
var names1 = ["A", "B", null, "D", "E", "F", null, "H", "I", "J"];
var mat2 = simulate.simulateDenseMatrix(10, 20);
var names2 = ["C", "D", "E", null, "G", "H", "I", "J", "K", "L"];
var mat3 = simulate.simulateDenseMatrix(10, 30);
var names3 = ["E", "F", null, "H", null, "J", "K", "L", "M", "N"];

let combined = scran.cbindWithNames([mat1, mat2, mat3], [names1, names2, names3]);
quick_check(mat1, mat2, mat3, combined);
expect(combined.matrix.numberOfRows()).toBe(3);
expect(combined.matrix.numberOfColumns()).toBe(60);
expect(compare.equalArrays(combined.names, ["E", "H", "J"])).toBe(true);
expect(compare.equalArrays(combined.indices, [4, 7, 9])).toBe(true);

let y1 = mat1.column(0);
let expected1 = [4, 7, 9].map(i => y1[i]);
expect(compare.equalArrays(combined.matrix.column(0), expected1)).toBe(true);

let y2 = mat2.column(0);
let expected2 = [2, 5, 7].map(i => y2[i]);
expect(compare.equalArrays(combined.matrix.column(10), expected2)).toBe(true);

let y3 = mat3.column(0);
let expected3 = [0, 3, 5].map(i => y3[i]);
expect(compare.equalArrays(combined.matrix.column(30), expected3)).toBe(true);

expect(
compare.equalArrays(
combined.matrix.row(0),
[ ...mat1.row(4), ...mat2.row(2), ...mat3.row(0) ]
)
).toBe(true);

// Freeing all the bits and pieces.
combined.matrix.free();
Expand Down

0 comments on commit 12783ed

Please sign in to comment.