Skip to content

Commit

Permalink
Update google drive action (#869)
Browse files Browse the repository at this point in the history
* convert some stuff

* Update plugins

* allow it to run for any community

* exclude already existing discussions

* exclude already existing versions

* create new version if none exists

* remove log

* Pull rehype into its own function

* fix dev env

* fix formatting

---------

Co-authored-by: Travis Rich <[email protected]>
  • Loading branch information
gabestein and isTravis authored Dec 24, 2024
1 parent dab57f0 commit d82ab08
Show file tree
Hide file tree
Showing 5 changed files with 279 additions and 85 deletions.
3 changes: 2 additions & 1 deletion core/.env.development
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ DATACITE_REPOSITORY_ID=""
DATACITE_PASSWORD=""
DATACITE_API_URL="https://api.test.datacite.org"

GCLOUD_KEY_FILE="xxx"
GCLOUD_KEY_FILE='xxx'

72 changes: 35 additions & 37 deletions core/actions/googleDriveImport/formatDriveData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@ import { writeFile } from "fs/promises";
import { rehype } from "rehype";
import rehypeFormat from "rehype-format";

import type { PubsId } from "db/public";

import type { DriveData } from "./getGDriveFiles";
import {
processLocalLinks,
removeGoogleLinkForwards,
removeVerboseFormatting,
structureAnchors,
structureAudio,
Expand All @@ -25,17 +29,19 @@ import {
export type FormattedDriveData = {
pubHtml: string;
versions: {
"arcadia:description": string;
"arcadia:publication-date": string;
"arcadia:content": string;
[description: `${string}:description`]: string;
[publicationDate: `${string}:publication-date`]: string;
[content: `${string}:content`]: string;
}[];
discussions: { id: string; values: {} }[];
discussions: { id: PubsId; values: {} }[];
};

export const formatDriveData = async (dataFromDrive: DriveData): Promise<FormattedDriveData> => {
const formattedPubHtml = await rehype()
const processHtml = async (html: string): Promise<string> => {
const result = await rehype()
.use(structureFormatting)
.use(removeVerboseFormatting)
.use(removeGoogleLinkForwards)
.use(processLocalLinks)
.use(structureImages)
.use(structureVideos)
.use(structureAudio)
Expand All @@ -50,7 +56,15 @@ export const formatDriveData = async (dataFromDrive: DriveData): Promise<Formatt
.use(structureReferences)
.use(structureFootnotes)
.use(rehypeFormat)
.process(dataFromDrive.pubHtml);
.process(html);
return String(result);
};

export const formatDriveData = async (
dataFromDrive: DriveData,
communitySlug: string
): Promise<FormattedDriveData> => {
const formattedPubHtml = await processHtml(dataFromDrive.pubHtml);

const releases: any = dataFromDrive.legacyData?.releases || [];
const findDescription = (timestamp: string) => {
Expand All @@ -62,32 +76,15 @@ export const formatDriveData = async (dataFromDrive: DriveData): Promise<Formatt
};

for (const version of dataFromDrive.versions) {
const processedHtml = await rehype()
.use(structureFormatting)
.use(removeVerboseFormatting)
.use(structureImages)
.use(structureVideos)
.use(structureAudio)
.use(structureFiles)
.use(structureIframes)
.use(structureBlockMath)
.use(structureInlineMath)
.use(structureBlockquote)
.use(structureCodeBlock)
.use(structureInlineCode)
.use(structureAnchors)
.use(structureReferences)
.use(structureFootnotes)
.use(rehypeFormat)
.process(version.html);
const processedHtml = await processHtml(version.html);
version.html = String(processedHtml);
}
const versions = dataFromDrive.versions.map((version) => {
const { timestamp, html } = version;
const outputVersion: any = {
"arcadia:description": findDescription(timestamp),
"arcadia:publication-date": timestamp,
"arcadia:content": html,
[`${communitySlug}:description`]: findDescription(timestamp),
[`${communitySlug}:publication-date`]: timestamp,
[`${communitySlug}:content`]: html,
};
Object.keys(outputVersion).forEach((key) => {
if (outputVersion[key] === undefined || outputVersion[key] === null) {
Expand All @@ -114,17 +111,18 @@ export const formatDriveData = async (dataFromDrive: DriveData): Promise<Formatt
const commentObject: any = {
id: comment.id,
values: {
"arcadia:anchor":
[`${communitySlug}:anchor`]:
index === 0 && discussion.anchors.length
? discussion.anchors[0]
? JSON.stringify(discussion.anchors[0])
: undefined,
"arcadia:content": comment.text,
"arcadia:publication-date": comment.createdAt,
"arcadia:full-name": comment.author.fullName,
"arcadia:orcid": comment.author.orcid,
"arcadia:avatar": comment.author.avatar,
"arcadia:is-closed": discussion.isClosed,
"arcadia:parent-id": index !== 0 ? firstCommentId : undefined,
[`${communitySlug}:content`]: comment.text,
[`${communitySlug}:publication-date`]: comment.createdAt,
[`${communitySlug}:full-name`]: comment.author.fullName,
[`${communitySlug}:orcid`]: `https://orcid.org/${comment.author.orcid}`,
[`${communitySlug}:avatar`]: comment.author.avatar,
[`${communitySlug}:is-closed`]: discussion.isClosed,
[`${communitySlug}:parent-id`]:
index !== 0 ? firstCommentId : undefined,
},
};

Expand Down
104 changes: 104 additions & 0 deletions core/actions/googleDriveImport/gdocPlugins.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import { logger } from "logger";

import {
basic,
processLocalLinks,
removeGoogleLinkForwards,
removeVerboseFormatting,
structureAnchors,
structureAudio,
Expand Down Expand Up @@ -769,3 +771,105 @@ test("Structure Footnotes", async () => {

expect(trimAll(result)).toBe(trimAll(expectedOutputHtml));
});

// Checks that the removeGoogleLinkForwards plugin rewrites anchors pointing at
// https://www.google.com/url?q=... so that their href is the wrapped target.
// The fixture has two such anchors: one whose query separators are written as
// `&#x26;` and one written as `&amp;` with a percent-encoded `#` (%23) in the target.
test("removeGoogleLinkForwards", async () => {
const inputHtml = `
<html>
<head></head>
<body>
<div><p>Here is some text {ref2}, {ref1}, {ref38}</p></div>
<p>
<sup>
<u>
<a
href="https://www.google.com/url?q=https://research.arcadiascience.com/icebox&#x26;sa=D&#x26;source=editors&#x26;ust=1735008185029653&#x26;usg=AOvVaw3BrkYrphpexOt8IFnAV6MN"
>
Learn more
</a>
</u>
</sup>
</p>
<p>Another <a href="https://www.google.com/url?q=https://local.pubpub/%23n84lvlagdc2&amp;sa=D&amp;source=editors&amp;ust=1735008267184897&amp;usg=AOvVaw3-emucfkfJE8CHmFXlcgUo">Figure 1</a>.</p>
<sup> about the Icebox and the different reasons we ice projects.</sup>
</body>
</html>
`;
const expectedOutputHtml = `<html>
<head></head>
<body>
<div><p>Here is some text {ref2}, {ref1}, {ref38}</p></div>
<p>
<sup>
<u>
<a href="https://research.arcadiascience.com/icebox">
Learn more
</a>
</u>
</sup>
</p>
<p>Another <a href="https://local.pubpub/#n84lvlagdc2">Figure 1</a>.</p>
<sup> about the Icebox and the different reasons we ice projects.</sup>
</body>
</html>`;

// Run only the plugin under test. A processing error is logged rather than
// rethrown, in which case `result` is undefined when it reaches the assertion.
const result = await rehype()
.use(removeGoogleLinkForwards)
.process(inputHtml)
.then((file) => String(file))
.catch((error) => {
logger.error(error);
});

// Both sides go through trimAll before the comparison.
expect(trimAll(result)).toBe(trimAll(expectedOutputHtml));
});

// Checks that the processLocalLinks plugin turns an anchor pointing at
// https://local.pubpub/#<id> into an in-page link (`#<id>`), while the external
// link in the same fixture keeps its href unchanged.
test("processLocalLinks", async () => {
const inputHtml = `
<html>
<head></head>
<body>
<div><p>Here is some text {ref2}, {ref1}, {ref38}</p></div>
<p>
<sup>
<u>
<a href="https://research.arcadiascience.com/icebox">
Learn more
</a>
</u>
</sup>
</p>
<p>Another <a href="https://local.pubpub/#n84lvlagdc2">Figure 1</a>.</p>
<sup> about the Icebox and the different reasons we ice projects.</sup>
</body>
</html>
`;
const expectedOutputHtml = `<html>
<head></head>
<body>
<div><p>Here is some text {ref2}, {ref1}, {ref38}</p></div>
<p>
<sup>
<u>
<a href="https://research.arcadiascience.com/icebox">
Learn more
</a>
</u>
</sup>
</p>
<p>Another <a href="#n84lvlagdc2">Figure 1</a>.</p>
<sup> about the Icebox and the different reasons we ice projects.</sup>
</body>
</html>`;

// Run only the plugin under test. A processing error is logged rather than
// rethrown, in which case `result` is undefined when it reaches the assertion.
const result = await rehype()
.use(processLocalLinks)
.process(inputHtml)
.then((file) => String(file))
.catch((error) => {
logger.error(error);
});

// Both sides go through trimAll before the comparison.
expect(trimAll(result)).toBe(trimAll(expectedOutputHtml));
});
47 changes: 37 additions & 10 deletions core/actions/googleDriveImport/gdocPlugins.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import path from "path";

import type { Element, Root } from "hast";
import type { Element, Node, Root } from "hast";

import katex from "katex";
import { rehype } from "rehype";
Expand All @@ -10,6 +10,12 @@ import { unified } from "unified";
import { filter } from "unist-util-filter";
import { visit } from "unist-util-visit";

/**
 * Rehype plugin factory whose transformer wipes the attributes of every
 * element in the tree.
 *
 * Each node of type "element" has its `properties` replaced by a fresh empty
 * object; nodes of other types are left alone. The tree is mutated in place
 * and the transformer returns nothing.
 */
const removeProperties = () => {
	const clearAttributes = (element: any): void => {
		element.properties = {};
	};

	return (tree: Root) => {
		visit(tree, "element", clearAttributes);
	};
};

export const mathStringToRehypeElement = (htmlString: string): Element[] => {
const parser = String(rehype().processSync(htmlString));
const file = unified().use(rehypeParse, { fragment: true }).parse(parser);
Expand All @@ -36,7 +42,7 @@ export const latexToRehypeNode = (latexString: string, isBlock: boolean): Elemen
};
};

const getTextContent = (node: any): string => {
export const getTextContent = (node: any): string => {
if (!node) {
return "";
}
Expand Down Expand Up @@ -149,27 +155,26 @@ export const structureFormatting = () => (tree: Root) => {
visit(tree, "element", (node: any, index: any, parent: any) => {
if (node.properties && node.properties.style) {
const style = node.properties.style as string;

if (node.tagName === "span") {
const styles = style.split(";").map((s) => s.trim());
const styles = style.split(";").map((s) => s.trim().replace(/\s+/g, ""));
const tags = [];

if (styles.includes("font-weight: 700")) {
if (styles.includes("font-weight:700")) {
tags.push("b");
}
if (styles.includes("font-style: italic")) {
if (styles.includes("font-style:italic")) {
tags.push("i");
}
if (styles.includes("text-decoration: line-through")) {
if (styles.includes("text-decoration:line-through")) {
tags.push("s");
}
if (styles.includes("text-decoration: underline")) {
if (styles.includes("text-decoration:underline")) {
tags.push("u");
}
if (styles.includes("vertical-align: sub")) {
if (styles.includes("vertical-align:sub")) {
tags.push("sub");
}
if (styles.includes("vertical-align: super")) {
if (styles.includes("vertical-align:super")) {
tags.push("sup");
}

Expand Down Expand Up @@ -651,3 +656,25 @@ export const structureFootnotes = () => (tree: Root) => {
insertVariables(tree, footnoteData.id, newNode);
});
};

/**
 * Rehype plugin that unwraps Google redirect links.
 *
 * Every `<a>` element whose `href` starts with `https://www.google.com/url`
 * is rewritten in place so that `href` becomes the value of the redirect's
 * `q` query parameter (`URLSearchParams.get` returns it percent-decoded, so
 * `%23` comes back as `#`).
 *
 * Anchors are left untouched when they have no `properties`, when `href` is
 * not a string, or when the redirect carries no non-empty `q` parameter.
 * In the last case the original href is kept rather than being overwritten
 * with `null`/`""`, which would silently destroy the link.
 *
 * @returns a transformer that mutates the given tree and returns nothing.
 */
export const removeGoogleLinkForwards = () => (tree: Root) => {
	const redirectPrefix = "https://www.google.com/url";

	visit(tree, "element", (node: any) => {
		if (node.tagName !== "a") {
			return;
		}

		const href: unknown = node.properties?.href;
		if (typeof href !== "string" || !href.startsWith(redirectPrefix)) {
			return;
		}

		const target = new URL(href).searchParams.get("q");
		// Only replace when there is an actual destination to point at.
		if (target) {
			node.properties.href = target;
		}
	});
};

/**
 * Rehype plugin that converts `https://local.pubpub/...` anchors into
 * document-relative links.
 *
 * For every `<a>` element whose `href` starts with `https://local.pubpub/`,
 * the href is percent-decoded, the prefix is stripped, and anything from the
 * first `&` onward is dropped, so `https://local.pubpub/%23abc&sa=D` becomes
 * `#abc`.
 *
 * Robustness notes:
 * - If the href contains a malformed percent-escape, decoding is skipped and
 *   the raw text is used instead of letting `URIError` abort processing.
 * - The prefix is removed by length, so a later occurrence of
 *   `local.pubpub/` inside the target does not truncate it.
 * - Anchors without `properties` or with a non-string `href` are ignored.
 *
 * @returns a transformer that mutates the given tree and returns nothing.
 */
export const processLocalLinks = () => (tree: Root) => {
	const localPrefix = "https://local.pubpub/";

	visit(tree, "element", (node: any) => {
		if (node.tagName !== "a") {
			return;
		}

		const rawHref: unknown = node.properties?.href;
		if (typeof rawHref !== "string" || !rawHref.startsWith(localPrefix)) {
			return;
		}

		let href = rawHref;
		try {
			href = decodeURIComponent(rawHref);
		} catch {
			// Malformed percent-escape: keep the undecoded text.
		}

		// The prefix has no escapable characters, so it survives decoding intact.
		node.properties.href = href.slice(localPrefix.length).split("&")[0];
	});
};
Loading

0 comments on commit d82ab08

Please sign in to comment.