Skip to content

Commit

Permalink
feat: complete work on gdocs paste conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
tim-evans authored and colin-alexa committed Mar 21, 2024
1 parent 62f1e94 commit 745069e
Show file tree
Hide file tree
Showing 2 changed files with 264 additions and 210 deletions.
207 changes: 114 additions & 93 deletions packages/@atjson/source-gdocs-paste/src/converter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import {
InlineAnnotation,
ParseAnnotation,
SliceAnnotation,
TextAnnotation,
compareAnnotations,
is,
} from "@atjson/document";
Expand All @@ -23,8 +24,11 @@ import uuid from "uuid-random";

// Google Docs paste encodes structure with ASCII control characters.
// Vertical tab (\u000B) is used for soft line breaks.
// eslint-disable-next-line no-control-regex
const VERTICAL_TABS = /\u000B/g;
// A whole table: \u0010 … \u0011 wrapping one or more rows.
// eslint-disable-next-line no-control-regex
const TABLE = /\u0010(\u0012(\u001c.*\n)+)+\u0011/g;
// A single row: a \u0012 marker followed by one or more cells.
// eslint-disable-next-line no-control-regex
const TABLE_ROW = /\u0012(\u001c.*\n)+/gmu;
// A single cell: \u001c marker, then the cell text (captured) up to the newline.
// eslint-disable-next-line no-control-regex
const TABLE_CELL = /\u001c(.*)\n/gmu;
// One or more consecutive newlines (blank lines included) separating paragraphs.
const NEWLINE_PARAGRAPH_SEPARATOR = /\n(\s*\n)*/g;

Expand All @@ -35,6 +39,10 @@ const ALIGNMENT = {
3: "justify",
} as const;

/**
 * Lowercases `text` and replaces every run of whitespace with a single
 * underscore, e.g. `"First Name"` → `"first_name"`. Used to derive DataSet
 * column names from header-cell text.
 */
function snakecase(text: string) {
  // The /g flag is required so that ALL whitespace runs are replaced;
  // without it only the first run was converted ("a b c" → "a_b c").
  return text.toLowerCase().replace(/\s+/g, "_");
}

GDocsSource.defineConverterTo(OffsetSource, (doc) => {
// Remove zero-length annotations
// This is a bit of a workaround to deal with complications that arise when trying to
Expand Down Expand Up @@ -157,99 +165,6 @@ GDocsSource.defineConverterTo(OffsetSource, (doc) => {
);
}

// Convert Google Docs table runs (\u0010 … \u0011 control characters) into a
// DataSet annotation plus a Table annotation that references it.
// NOTE(review): the previous comment here ("Convert vertical tabs to line
// breaks") was stale — this loop handles tables, not vertical tabs.
for (let table of doc.match(TABLE)) {
  let id = uuid();
  let rows: Array<Array<{ slice: string; jsonValue: string }>> = [];

  // Gather the dataset info first
  for (let tableRow of doc.match(TABLE_ROW, table.start, table.end)) {
    let row: Array<{ slice: string; jsonValue: string }> = [];
    // Hide the row-start marker (\u0012) from the rendered text.
    doc.addAnnotations(
      new ParseAnnotation({
        start: tableRow.start,
        end: tableRow.start + 1,
      })
    );

    for (let tableCell of doc.match(
      TABLE_CELL,
      tableRow.start + 1,
      tableRow.end
    )) {
      // Each cell's contents become a slice that the DataSet refers to.
      let slice = new SliceAnnotation({
        start: tableCell.start,
        end: tableCell.end,
        attributes: {
          refs: [id],
        },
      });
      row.push({
        slice: slice.id,
        // matches[1] is the cell text captured between \u001c and the newline.
        jsonValue: tableCell.matches[1],
      });
      // Hide the cell-start marker (\u001c) and the trailing newline.
      doc.addAnnotations(
        new ParseAnnotation({
          start: tableCell.start,
          end: tableCell.start + 1,
        }),
        slice,
        new ParseAnnotation({
          start: tableCell.end - 1,
          end: tableCell.end,
        })
      );
    }
    rows.push(row);
  }
  // The first row is treated as the header; remaining rows become records
  // keyed by the header cells' text.
  let [header, ...body] = rows;
  let columnNames = header.map((header) => header.jsonValue);
  let records = body.map((row) => {
    return row.reduce((E, cell, index) => {
      E[columnNames[index]] = cell;
      return E;
    }, {} as Record<string, { slice: string; jsonValue: string }>);
  });
  // Every column is rich text in the derived schema.
  let schema = columnNames.reduce((E, columnName) => {
    E[columnName] = ColumnType.RICH_TEXT;
    return E;
  }, {} as Record<string, ColumnType>);

  // NOTE(review): start/end are offset one past the table's match bounds
  // (table.start + 1 … table.end + 1) — end in particular extends one
  // character beyond the \u0011 terminator; verify this range is intended.
  let dataset = new DataSet({
    id,
    start: table.start + 1,
    end: table.end + 1,
    attributes: {
      schema,
      records,
    },
  });
  doc.addAnnotations(dataset);
  doc.addAnnotations(
    // Hide the table-start marker (\u0010).
    new ParseAnnotation({
      start: table.start,
      end: table.start + 1,
    }),
    // The Table annotation sits on the final character and points at the
    // DataSet by id, mapping each column name to its header slice.
    new Table({
      start: table.end - 1,
      end: table.end,
      attributes: {
        dataSet: dataset.id,
        columns: columnNames.map((columnName, index) => {
          return {
            name: columnName,
            slice: header[index].slice,
          };
        }),
      },
    }),
    // Hide the table-end marker (\u0011).
    new ParseAnnotation({
      start: table.end - 1,
      end: table.end,
    })
  );
}

// Convert newlines to Paragraphs. Paragraphs must not cross the boundary of a BlockAnnotation, so
// divide the document into 'block boundaries' and then look for single/multiple new lines within each
// block boundary
Expand Down Expand Up @@ -441,5 +356,111 @@ GDocsSource.defineConverterTo(OffsetSource, (doc) => {
mark.end = end;
});

// Convert Google Docs table runs (\u0010 … \u0011 control characters) into a
// DataSet annotation plus a Table annotation that references it.
// NOTE(review): the previous comment here ("Convert vertical tabs to line
// breaks") was stale — this loop handles tables, not vertical tabs.
for (let table of doc.match(TABLE)) {
  let id = uuid();
  let rows: Array<Array<{ slice: string; jsonValue: string }>> = [];

  // Remove all paragraphs inside of tables: demote any Paragraph fully
  // contained in the table to a plain TextAnnotation so cell slices don't
  // carry block-level paragraph structure.
  doc
    .where(
      (a) => is(a, Paragraph) && a.start >= table.start && a.end <= table.end
    )
    .update((a) => {
      doc.replaceAnnotation(
        a,
        new TextAnnotation({
          start: a.start,
          end: a.end,
        })
      );
    });

  // Gather the dataset info first
  for (let tableRow of doc.match(TABLE_ROW, table.start, table.end)) {
    let row: Array<{ slice: string; jsonValue: string }> = [];
    // Hide the row-start marker (\u0012) from the rendered text.
    doc.addAnnotations(
      new ParseAnnotation({
        start: tableRow.start,
        end: tableRow.start + 1,
      })
    );

    for (let tableCell of doc.match(
      TABLE_CELL,
      tableRow.start + 1,
      tableRow.end
    )) {
      // Each cell's contents become a slice that the DataSet refers to.
      let slice = new SliceAnnotation({
        start: tableCell.start,
        end: tableCell.end,
        attributes: {
          refs: [id],
        },
      });
      row.push({
        slice: slice.id,
        // matches[1] is the cell text captured between \u001c and the newline.
        jsonValue: tableCell.matches[1],
      });
      // Hide the cell-start marker (\u001c). The cell's trailing newline is
      // left in place here (it is not wrapped in a ParseAnnotation).
      doc.addAnnotations(
        new ParseAnnotation({
          start: tableCell.start,
          end: tableCell.start + 1,
        }),
        slice
      );
    }
    rows.push(row);
  }
  // The first row is treated as the header; its cell text (snakecased)
  // provides the column names for the remaining record rows.
  let [header, ...body] = rows;
  let columnNames = header.map((header) => snakecase(header.jsonValue));
  // NOTE(review): duplicate header names would collide after snakecasing,
  // silently dropping earlier columns from records/schema — verify headers
  // are expected to be unique.
  let records = body.map((row) => {
    return row.reduce((E, cell, index) => {
      E[columnNames[index]] = cell;
      return E;
    }, {} as Record<string, { slice: string; jsonValue: string }>);
  });
  // Every column is rich text in the derived schema.
  let schema = columnNames.reduce((E, columnName) => {
    E[columnName] = ColumnType.RICH_TEXT;
    return E;
  }, {} as Record<string, ColumnType>);

  // The DataSet spans the whole table match.
  let dataset = new DataSet({
    id,
    start: table.start,
    end: table.end,
    attributes: {
      schema,
      records,
    },
  });

  doc.addAnnotations(
    dataset,
    // Hide the table-start marker (\u0010).
    new ParseAnnotation({
      start: table.start,
      end: table.start + 1,
    }),
    // The Table annotation spans the table and points at the DataSet by id,
    // mapping each column name to its header-cell slice.
    new Table({
      start: table.start,
      end: table.end,
      attributes: {
        dataSet: dataset.id,
        showColumnHeaders: true,
        columns: header.map((cell, index) => {
          return {
            name: columnNames[index],
            slice: cell.slice,
          };
        }),
      },
    }),
    // Hide the table-end marker (\u0011).
    new ParseAnnotation({
      start: table.end - 1,
      end: table.end,
    })
  );
}

return doc;
});
Loading

0 comments on commit 745069e

Please sign in to comment.