Skip to content

Commit

Permalink
update: Additional drive action structuring and extraction (#911)
Browse files Browse the repository at this point in the history
  • Loading branch information
isTravis authored Jan 22, 2025
1 parent 404bf17 commit 730195c
Show file tree
Hide file tree
Showing 4 changed files with 354 additions and 13 deletions.
20 changes: 18 additions & 2 deletions core/actions/googleDriveImport/formatDriveData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ import type { PubsId } from "db/public";

import type { DriveData } from "./getGDriveFiles";
import {
formatLists,
getDescription,
processLocalLinks,
removeDescription,
removeEmptyFigCaption,
removeGoogleLinkForwards,
removeVerboseFormatting,
structureAnchors,
Expand All @@ -27,6 +31,7 @@ import {
} from "./gdocPlugins";

export type FormattedDriveData = {
pubDescription: string;
pubHtml: string;
versions: {
[description: `${string}:description`]: string;
Expand All @@ -39,6 +44,7 @@ export type FormattedDriveData = {
const processHtml = async (html: string): Promise<string> => {
const result = await rehype()
.use(structureFormatting)
.use(formatLists)
.use(removeVerboseFormatting)
.use(removeGoogleLinkForwards)
.use(processLocalLinks)
Expand All @@ -55,6 +61,8 @@ const processHtml = async (html: string): Promise<string> => {
.use(structureAnchors)
.use(structureReferences)
.use(structureFootnotes)
.use(removeEmptyFigCaption)
.use(removeDescription)
.use(rehypeFormat)
.process(html);
return String(result);
Expand All @@ -66,8 +74,15 @@ export const formatDriveData = async (
): Promise<FormattedDriveData> => {
const formattedPubHtml = await processHtml(dataFromDrive.pubHtml);

/* Check for a description in the most recent version */
const latestRawVersion = dataFromDrive.versions.reduce((latest, version) => {
return new Date(version.timestamp) > new Date(latest.timestamp) ? version : latest;
}, dataFromDrive.versions[0]);
const latestPubDescription = latestRawVersion && getDescription(latestRawVersion.html);

/* Align versions to releases in legacy data and process HTML */
const releases: any = dataFromDrive.legacyData?.releases || [];
const findDescription = (timestamp: string) => {
const findVersionDescription = (timestamp: string) => {
const matchingRelease = releases.find((release: any) => {
return release.createdAt === timestamp;
});
Expand All @@ -82,7 +97,7 @@ export const formatDriveData = async (
const versions = dataFromDrive.versions.map((version) => {
const { timestamp, html } = version;
const outputVersion: any = {
[`${communitySlug}:description`]: findDescription(timestamp),
[`${communitySlug}:description`]: findVersionDescription(timestamp),
[`${communitySlug}:publication-date`]: timestamp,
[`${communitySlug}:content`]: html,
};
Expand Down Expand Up @@ -161,6 +176,7 @@ export const formatDriveData = async (
const comments = discussions ? flattenComments(discussions) : [];

const output = {
pubDescription: latestPubDescription,
pubHtml: String(formattedPubHtml),
versions,
discussions: comments,
Expand Down
195 changes: 190 additions & 5 deletions core/actions/googleDriveImport/gdocPlugins.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ import { logger } from "logger";

import {
basic,
formatLists,
getDescription,
processLocalLinks,
removeDescription,
removeEmptyFigCaption,
removeGoogleLinkForwards,
removeVerboseFormatting,
structureAnchors,
Expand Down Expand Up @@ -137,13 +141,17 @@ test("Structure Images", async () => {
<td><p><span>Source</span></p></td>
<td><p><span>Caption</span></p></td>
<td><p><span>Alt Text</span></p></td>
<td><p><span>Align</span></p></td>
<td><p><span>Size</span></p></td>
</tr>
<tr>
<td><p><span>Image</span></p></td>
<td><p><span>n8r4ihxcrly</span></p></td>
<td><p><span>https://resize-v3.pubpub.org/123</span></p></td>
<td><p><span>With a caption. </span><b>Bold</b></p></td>
<td><p><b>123</b></p></td>
<td><p>full</p></td>
<td><p>50</p></td>
</tr>
</tbody>
</table>
Expand All @@ -154,7 +162,7 @@ test("Structure Images", async () => {
<html>
<head></head>
<body>
<figure id="n8r4ihxcrly">
<figure id="n8r4ihxcrly" data-align="full" data-size="50">
<img alt="123" src="https://resize-v3.pubpub.org/123">
<figcaption>
<p>
Expand Down Expand Up @@ -191,13 +199,17 @@ test("Structure Images", async () => {
<td><p><span>Source</span></p></td>
<td><p><span>Caption</span></p></td>
<td><p><span>Static Image</span></p></td>
<td><p><span>Align</span></p></td>
<td><p><span>Size</span></p></td>
</tr>
<tr>
<td><p><span>Video</span></p></td>
<td><p><span>n8r4ihxcrly</span></p></td>
<td><p><span>https://resize-v3.pubpub.org/123.mp4</span></p></td>
<td><p><span>With a caption. </span><b>Bold</b></p></td>
<td>https://example.com</td>
<td><p>full</p></td>
<td><p>50</p></td>
</tr>
</tbody>
</table>
Expand All @@ -208,7 +220,7 @@ test("Structure Images", async () => {
<html>
<head></head>
<body>
<figure id="n8r4ihxcrly">
<figure id="n8r4ihxcrly" data-align="full" data-size="50">
<video controls poster="https://example.com">
<source src="https://resize-v3.pubpub.org/123.mp4" type="video/mp4">
<img src="https://example.com" alt="Video fallback image">
Expand Down Expand Up @@ -247,12 +259,16 @@ test("Structure Audio", async () => {
<td><p><span>Id</span></p></td>
<td><p><span>Source</span></p></td>
<td><p><span>Caption</span></p></td>
<td><p><span>Align</span></p></td>
<td><p><span>Size</span></p></td>
</tr>
<tr>
<td><p><span>Audio</span></p></td>
<td><p><span>n8r4ihxcrly</span></p></td>
<td><p><span>https://resize-v3.pubpub.org/123.mp3</span></p></td>
<td><p><span>With a caption. </span><b>Bold</b></p></td>
<td><p>full</p></td>
<td><p>50</p></td>
</tr>
</tbody>
</table>
Expand All @@ -263,7 +279,7 @@ test("Structure Audio", async () => {
<html>
<head></head>
<body>
<figure id="n8r4ihxcrly">
<figure id="n8r4ihxcrly" data-align="full" data-size="50">
<audio controls>
<source src="https://resize-v3.pubpub.org/123.mp3" type="audio/mp3">
</audio>
Expand Down Expand Up @@ -359,13 +375,19 @@ test("Structure Iframes", async () => {
<td><p><span>Source</span></p></td>
<td><p><span>Caption</span></p></td>
<td><p><span>Static Image</span></p></td>
<td><p><span>Align</span></p></td>
<td><p><span>Size</span></p></td>
<td><p><span>Height</span></p></td>
</tr>
<tr>
<td><p><span>Iframe</span></p></td>
<td><p><span>n8r4ihxcrly</span></p></td>
<td><p><span>https://resize-v3.pubpub.org/123</span></p></td>
<td><p><span>With a caption. </span><b>Bold</b></p></td>
<td>https://example.com</td>
<td><p><span>full</span></p></td>
<td><p><span>75</span></p></td>
<td><p><span>450</span></p></td>
</tr>
</tbody>
</table>
Expand All @@ -376,8 +398,8 @@ test("Structure Iframes", async () => {
<html>
<head></head>
<body>
<figure id="n8r4ihxcrly">
<iframe src="https://resize-v3.pubpub.org/123" frameborder="0" data-fallback-image="https://example.com"></iframe>
<figure id="n8r4ihxcrly" data-align="full" data-size="75">
<iframe src="https://resize-v3.pubpub.org/123" frameborder="0" data-fallback-image="https://example.com" height="450"></iframe>
<figcaption>
<p>
<span>With a caption. </span>
Expand Down Expand Up @@ -873,3 +895,166 @@ test("processLocalLinks", async () => {

expect(trimAll(result)).toBe(trimAll(expectedOutputHtml));
});

/**
 * A <figcaption> whose only content is an empty <p><span></span></p> should be
 * removed, while the surrounding <figure> and its <img> are kept.
 */
test("removeEmptyFigCaption", async () => {
	const sourceHtml = `
<html>
<head></head>
<body>
<figure id="n8r4ihxcrly">
<img alt="123" src="https://resize-v3.pubpub.org/123">
<figcaption>
<p><span></span></p>
</figcaption>
</figure>
</body>
</html>
`;
	const wantedHtml = `<html>
<head></head>
<body>
<figure id="n8r4ihxcrly">
<img alt="123" src="https://resize-v3.pubpub.org/123">
</figure>
</body>
</html>`;

	// Start processing outside the try so only asynchronous failures are logged.
	const pending = rehype().use(removeEmptyFigCaption).process(sourceHtml);

	let renderedHtml: string | undefined;
	try {
		renderedHtml = String(await pending);
	} catch (error) {
		// A failed run is logged and leaves renderedHtml undefined for the assertion below.
		logger.error(error);
	}

	expect(trimAll(renderedHtml)).toBe(trimAll(wantedHtml));
});

/**
 * Fixture: runs of sibling single-item <ul> lists whose <li> elements carry
 * increasing margin-left values. The expected output nests each deeper list
 * inside the preceding shallower item, and places an item that returns to the
 * first margin back in the outermost list.
 */
test("formatLists", async () => {
	const sourceHtml = `
<html>
<head></head>
<body>
<p>Hello</p>
<ul>
<li style="margin-left: 10pt;"><span>Bullet 1</span></li>
</ul><ul>
<li style="margin-left: 20pt;"><span>Bullet 1.1</span></li>
</ul>
<p>Hello again</p>
<ul>
<li style="margin-left: 10pt;"><span>Bullet 1</span></li>
</ul><ul>
<li style="margin-left: 20pt;"><span>Bullet 1.1</span></li>
</ul><ul>
<li style="margin-left: 30pt;"><span>Bullet 1.1.1</span></li>
</ul><ul>
<li style="margin-left: 10pt;"><span>Bullet 2</span></li>
</ul>
</body>
</html>
`;
	const wantedHtml = `
<html>
<head></head>
<body>
<p>Hello</p>
<ul>
<li style="margin-left: 10pt;">
<span>Bullet 1</span>
<ul>
<li style="margin-left: 20pt;"><span>Bullet 1.1</span></li>
</ul>
</li>
</ul>
<p>Hello again</p>
<ul>
<li style="margin-left: 10pt;">
<span>Bullet 1</span>
<ul>
<li style="margin-left: 20pt;">
<span>Bullet 1.1</span>
<ul>
<li style="margin-left: 30pt;"><span>Bullet 1.1.1</span></li>
</ul>
</li>
</ul>
</li>
<li style="margin-left: 10pt;"><span>Bullet 2</span></li>
</ul>
</body>
</html>
`;

	// Start processing outside the try so only asynchronous failures are logged.
	const pending = rehype().use(formatLists).process(sourceHtml);

	let renderedHtml: string | undefined;
	try {
		renderedHtml = String(await pending);
	} catch (error) {
		// A failed run is logged and leaves renderedHtml undefined for the assertion below.
		logger.error(error);
	}

	expect(trimAll(renderedHtml)).toBe(trimAll(wantedHtml));
});

/**
 * Fixture: a Type/Value table whose data row is labelled "Description",
 * followed by a paragraph. The expected output contains only the paragraph,
 * i.e. the whole table is removed by the removeDescription plugin.
 */
test("removeDescriptions", async () => {
	const sourceHtml = `
<html>
<head></head>
<body><table>
<tr>
<td>Type</td>
<td>Value</td>
</tr>
<tr>
<td>Description</td>
<td>Seeing how microbes are organized ...</td>
</tr>
</table><p>Hello</p>
</body>
</html>
`;
	const wantedHtml = `<html>
<head></head>
<body>
<p>Hello</p>
</body>
</html>`;

	// Start processing outside the try so only asynchronous failures are logged.
	const pending = rehype().use(removeDescription).process(sourceHtml);

	let renderedHtml: string | undefined;
	try {
		renderedHtml = String(await pending);
	} catch (error) {
		// A failed run is logged and leaves renderedHtml undefined for the assertion below.
		logger.error(error);
	}

	expect(trimAll(renderedHtml)).toBe(trimAll(wantedHtml));
});

/**
 * getDescription is called directly on a raw HTML string (no rehype pipeline)
 * and is expected to return the text of the value cell in the table row
 * labelled "Description".
 */
test("getDescription", async () => {
	const sourceHtml = `
<html>
<head></head>
<body><table>
<tr>
<td>Type</td>
<td>Value</td>
</tr>
<tr>
<td>Description</td>
<td>Seeing how microbes are organized ...</td>
</tr>
</table><p>Hello</p>
</body>
</html>
`;
	const wantedDescription = `Seeing how microbes are organized ...`;

	const extracted = getDescription(sourceHtml);

	// Both sides go through trimAll, so the comparison uses its normalised form.
	expect(trimAll(extracted)).toBe(trimAll(wantedDescription));
});
Loading

0 comments on commit 730195c

Please sign in to comment.