Skip to content

Commit

Permalink
fix: handle nested and overlapping slices
Browse files Browse the repository at this point in the history
  • Loading branch information
tim-evans committed Mar 27, 2024
1 parent 02d6d70 commit d9f3e11
Show file tree
Hide file tree
Showing 2 changed files with 379 additions and 102 deletions.
317 changes: 215 additions & 102 deletions packages/@atjson/util/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,55 @@ export const ROOT = "root";
*/
export const TEXT = "text";

/**
 * Kinds of boundary token generated for a `slice` mark. Each slice
 * contributes one token of each kind.
 */
enum TokenType {
// Marks the position where a slice begins (the token's `index` is
// the slice's start offset).
SLICE_START,
// Marks the position where a slice finishes (the token's `index` is
// the slice's end offset).
SLICE_END,
}

/**
 * A single slice boundary (start or end) used while extracting
 * slices from a document.
 */
type Token = {
// Whether this token is the start or the end boundary of the slice.
type: TokenType;
// Id of the slice mark this token was created from.
id: string;
// Offset of this boundary: the slice's start for a start token,
// the slice's end for an end token.
index: number;
// The slice mark itself, carrying its parsed numeric `start` / `end`.
mark: InternalMark;
// `[start, end]` pairs for this slice. The list starts out as a single
// pair covering the whole slice, and the start and end tokens of the
// same slice are created with the very same array instance.
ranges: [number, number][];
};

/**
 * Comparator for ordering slice boundary tokens (for use with
 * `Array.prototype.sort`).
 *
 * Rules, applied in order:
 *
 * 1. Tokens with a lower `index` come first.
 * 2. At the same `index`, if the tokens share an `id` (the start and
 *    end of a zero-length slice) or are of different types, the
 *    start token comes first.
 * 3. Two start tokens: the smaller `mark.start` comes first, then the
 *    larger `mark.end` (the longer slice opens first).
 * 4. Two end tokens: the larger `mark.start` comes first (the shorter
 *    slice closes first), then the smaller `mark.end`.
 *
 * Returns a negative number when `a` sorts before `b`, a positive
 * number when `a` sorts after `b`, and `0` when neither rule
 * distinguishes them.
 */
function sortSliceTokens(a: Token, b: Token) {
  const positionGap = a.index - b.index;
  if (positionGap !== 0) {
    return positionGap;
  }

  const aOpens = a.type === TokenType.SLICE_START;
  const bOpens = b.type === TokenType.SLICE_START;

  // Same slice (zero-length: its start and end share an index), or
  // one start token and one end token from different slices: in both
  // situations the start token is placed first.
  //
  // NOTE(review): the comment that used to sit here read "Sort end
  // tokens before start tokens", but the comparison has always put
  // the start token first. That behavior is kept as-is; confirm
  // which order is actually intended.
  if (a.id === b.id || aOpens !== bOpens) {
    return aOpens ? -1 : 1;
  }

  // From here on both tokens are of the same type. The remaining
  // comparisons are mirrored for start tokens.
  const direction = aOpens ? -1 : 1;

  const startGap = b.mark.start - a.mark.start;
  if (startGap !== 0) {
    return startGap * direction;
  }

  const endGap = a.mark.end - b.mark.end;
  return endGap !== 0 ? endGap * direction : 0;
}

/**
* Extracts slices from a document for the special `slice`
* type provided by atjson. This function removes slices
Expand All @@ -69,27 +118,56 @@ export function extractSlices(value: {
text: string;
}) {
let marksWithoutSlices: InternalMark[] = [];
let slices: InternalMark[] = [];

// Slices may overlap, which we need to take into account.
// The only use case I can currently imagine is a slice
// overlapping another slice for the case of a footnote with
// a block inside of it that has a caption / credit or some
// other metadata. Overlapping or colinear slices don't make
// a _ton_ of sense, but should be handled neatly enough by
// chunking up slices so the portions of the document get
// collected into appropriate chunks.
let tokens: Token[] = [];
for (let i = 0, len = value.marks.length; i < len; i++) {
let mark = value.marks[i];
let match = mark.range.match(/([[|(])(\d+)\.\.(\d+)([\]|)])/);
if (match == null) {
throw new Error(`Malformed range ${mark.range}`);
}
let start = parseInt(match[2]);
let end = parseInt(match[3]);
if (mark.type === "slice") {
slices.push({
start: parseInt(match[2]),
end: parseInt(match[3]),
let slice = {
start,
end,
...mark,
});
};
let ranges: [number, number][] = [[start, end]];
tokens.push(
{
type: TokenType.SLICE_START,
id: mark.id,
index: start,
mark: slice,
ranges,
},
{
type: TokenType.SLICE_END,
id: slice.id,
index: end,
mark: slice,
ranges,
}
);
} else {
marksWithoutSlices.push({
start: parseInt(match[2]),
end: parseInt(match[3]),
start,
end,
...mark,
});
}
}
tokens.sort(sortSliceTokens);

let sliceMap = new Map<
string,
Expand All @@ -109,118 +187,153 @@ export function extractSlices(value: {
// slices are extracted so we don't have duplicated text.
let rangesToDelete: [number, number][] = [];

for (let i = 0, len = slices.length; i < len; i++) {
let slice = slices[i];
let { start, end } = slice;
let stack: Token[] = [];
for (let i = 0, len = tokens.length; i < len; i++) {
let token = tokens[i];
switch (token.type) {
case TokenType.SLICE_START: {
let currentSlice = stack[stack.length - 1];
// If there is another slice on the stack,
// we will need to split the range
if (currentSlice) {
let currentSliceRanges = currentSlice.ranges;
let [start, end] = currentSliceRanges.pop() as [number, number];
currentSliceRanges.push([start, token.index], [token.index, end]);
}
stack.push(token);
continue;
}
case TokenType.SLICE_END: {
stack.pop();
let currentSlice = stack[stack.length - 1];
if (currentSlice && currentSlice.id !== token.id) {
let currentSliceRanges = currentSlice.ranges;
let [, end] = currentSliceRanges.pop() as [number, number];
currentSliceRanges.push([token.index, end]);
}
continue;
}
}
}

tokens = tokens.filter((token) => token.type === TokenType.SLICE_START);

let text = value.text.slice(start, end);
for (let i = 0, len = tokens.length; i < len; i++) {
let token = tokens[i];

let text = "";
let blocks: Block[] = [];
let parentIndex = 0;
for (let j = 0, jlen = blockBoundaryPositions.length; j < jlen; j++) {
let position = blockBoundaryPositions[j];
if (position < start) {
// Keep searching forward
continue;
} else if (position + 1 <= end) {
// Add the block to the slice blocks list
let block = value.blocks[j];
if (blocks.length === 0) {
parentIndex = block.parents.length;
let marks: InternalMark[] = [];

for (let t = 0, tlen = token.ranges.length; t < tlen; t++) {
let [start, end] = token.ranges[t];
text += value.text.slice(start, end);

let parentIndex = 0;
for (let j = 0, jlen = blockBoundaryPositions.length; j < jlen; j++) {
let position = blockBoundaryPositions[j];
if (position < start) {
// Keep searching forward
continue;
} else if (position + 1 <= end) {
// Add the block to the slice blocks list
let block = value.blocks[j];
if (blocks.length === 0) {
parentIndex = block.parents.length;
}
blocks.push({
...block,
id: `${token.id}-${block.id}`,
parents: block.parents.slice(parentIndex),
});
} else {
// If the block index is after the slice, we can safely break
// from the loop, saving some extra work.
break;
}
}

let offset = 0;
// After collecting blocks, we'll need to do a pass
// over the text and blocks to ensure that it's well
// formed and there's a block parent for all text.
if (blocks.length === 0 && t === 0) {
offset = 1;
text = `${BLOCK_MARKER}${text}`;
blocks.push({
...block,
id: `${slice.id}-${block.id}`,
parents: block.parents.slice(parentIndex),
id: `${TEXT}-${token.id}`,
type: TEXT,
parents: [],
selfClosing: false,
attributes: {},
});
} else {
// If the block index is after the slice, we can safely break
// from the loop, saving some extra work.
break;
}
}

let offset = 0;
// After collecting blocks, we'll need to do a pass
// over the text and blocks to ensure that it's well
// formed and there's a block parent for all text.
if (blocks.length === 0) {
offset = 1;
text = `${BLOCK_MARKER}${text}`;
blocks.push({
id: `${TEXT}-${slice.id}`,
type: TEXT,
parents: [],
selfClosing: false,
attributes: {},
});
}
for (let j = 0, jlen = marksWithoutSlices.length; j < jlen; j++) {
let mark = marksWithoutSlices[j];

// For slicing purposes, we only consider marks whose
// boundaries are within the slice to be represented.
if (
mark.start >= start &&
mark.start <= end &&
mark.end >= start &&
mark.end <= end
) {
let adjustedStart = Math.max(mark.start - start, 0) + offset;
let adjustedEnd = Math.min(mark.end - start, end - start) + offset;
let range = `${mark.range[0]}${adjustedStart}..${adjustedEnd}${
mark.range[mark.range.length - 1]
}`;
marks.push({
...mark,
id: `${token.id}-${mark.id}`,
range,
start: adjustedStart,
end: adjustedEnd,
});
}
}

let marks: InternalMark[] = [];
for (let j = 0, jlen = marksWithoutSlices.length; j < jlen; j++) {
let mark = marksWithoutSlices[j];
// After handling the slice map, we'll be handling the
// remainder of the document that excludes slices.
// For cases where the underlying range is retained,
// the content will be kept.
if (token.mark.attributes.retain) {
// If the slice is retained, we get to skip this
// step since we want marks to be kept intact
continue;
}

// For slicing purposes, we only consider marks whose
// boundaries are within the slice to be represented.
if (
mark.start >= start &&
mark.start <= end &&
mark.end >= start &&
mark.end <= end
) {
let adjustedStart = Math.max(mark.start - start, 0) + offset;
let adjustedEnd = Math.min(mark.end - start, end - start) + offset;
let range = `${mark.range[0]}${adjustedStart}..${adjustedEnd}${
mark.range[mark.range.length - 1]
}`;
marks.push({
...mark,
id: `${slice.id}-${mark.id}`,
range,
start: adjustedStart,
end: adjustedEnd,
});
let isRangeInserted = false;
// Add the slice to the ranges to delete
for (let j = 0, jlen = rangesToDelete.length; j < jlen; j++) {
let rangeToDelete = rangesToDelete[j];

if (start >= rangeToDelete[0] && start <= rangeToDelete[1]) {
// Extend the end of the range if the start is
// within the current range
rangeToDelete[1] = Math.max(rangeToDelete[1], end);
isRangeInserted = true;
} else if (end >= rangeToDelete[0] && end <= rangeToDelete[1]) {
// Extend the start of the range if the end is
// within the current range
rangeToDelete[0] = Math.min(rangeToDelete[0], start);
isRangeInserted = true;
}
}
// If the slice wasn't within a range or extended a range,
// append it to the list of ranges to remove
if (!isRangeInserted) {
rangesToDelete.push([start, end]);
}
}

sliceMap.set(slice.id, {
sliceMap.set(token.id, {
text,
marks,
blocks,
});

// After handling the slice map, we'll be handling the
// remainder of the document that excludes slices.
// For cases where the underlying range is retained,
// the content will be kept.
if (slice.attributes.retain) {
// If the slice is retained, we get to skip this
// step since we want marks to be kept intact
continue;
}

let isRangeInserted = false;
// Add the slice to the ranges to delete
for (let j = 0, jlen = rangesToDelete.length; j < jlen; j++) {
let rangeToDelete = rangesToDelete[j];

if (start >= rangeToDelete[0] && start <= rangeToDelete[1]) {
// Extend the end of the range if the start is
// within the current range
rangeToDelete[1] = Math.max(rangeToDelete[1], end);
isRangeInserted = true;
} else if (end >= rangeToDelete[0] && end <= rangeToDelete[1]) {
// Extend the start of the range if the end is
// within the current range
rangeToDelete[0] = Math.min(rangeToDelete[0], start);
isRangeInserted = true;
}
}
// If the slice wasn't within a range or extended a range,
// append it to the list of ranges to remove
if (!isRangeInserted) {
rangesToDelete.push([start, end]);
}
}

let firstRange = rangesToDelete[0];
Expand Down
Loading

0 comments on commit d9f3e11

Please sign in to comment.