database/delete-project: refactor/fixes
haraldschilly committed Jul 11, 2024
1 parent 0857338 commit ff5f370
Showing 5 changed files with 140 additions and 93 deletions.
4 changes: 2 additions & 2 deletions src/packages/backend/metrics.ts
@@ -1,6 +1,6 @@
import { Counter, Gauge, Histogram } from "prom-client";

type Aspect = "db" | "database" | "server" | "llm";
type Aspect = "db" | "database" | "server" | "llm" | "database";

function withPrefix(aspect: Aspect, name: string): string {
return `cocalc_${aspect}_${name}`;
@@ -13,7 +13,7 @@ export function newCounter(
name: string,
help: string,
labelNames: string[] = [],
) {
): Counter<string> {
name = withPrefix(aspect, name);
const key = `counter-${name}`;
if (cache[key] != null) {
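
With the explicit Counter<string> return type, call sites get typed labels(...)/inc() without casts. As a concrete sketch, the counter this commit adds in delete-projects.ts (shown later in this diff) is created and used like this:

import { newCounter } from "@cocalc/backend/metrics";

// created once at module load; the "op" label distinguishes what was deleted
const delete_projects_prom = newCounter(
  "database",
  "delete_projects_total",
  "Deleting projects and associated data operations counter.",
  ["op"],
);

// increment with a concrete label value, e.g. after deleting one syncstring
delete_projects_prom.labels("syncstring").inc();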
4 changes: 2 additions & 2 deletions src/packages/database/postgres/bulk-delete.test.ts
@@ -5,7 +5,7 @@

import getPool, { initEphemeralDatabase } from "@cocalc/database/pool";
import { uuid } from "@cocalc/util/misc";
import { bulk_delete } from "./bulk-delete";
import { bulkDelete } from "./bulk-delete";

beforeAll(async () => {
await initEphemeralDatabase({});
@@ -41,7 +41,7 @@ describe("bulk delete", () => {
);
expect(num1.rows[0].num).toEqual(N);

const res = await bulk_delete({
const res = await bulkDelete({
table: "project_log",
field: "project_id",
value: project_id,
2 changes: 1 addition & 1 deletion src/packages/database/postgres/bulk-delete.ts
@@ -37,7 +37,7 @@ WHERE ${ID} IN (
)`;
}

export async function bulk_delete(opts: Opts): Ret {
export async function bulkDelete(opts: Opts): Ret {
const { table, field, value, id = "id", maxUtilPct = 10 } = opts;
let { limit = 1024 } = opts;
// assert table name is a key in SCHEMA
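
Given the defaults in the destructuring above (id = "id", maxUtilPct = 10), a typical call passes only table, field, and value, and reads rowsDeleted from the result, matching the call sites later in this diff. A minimal sketch, assuming it runs inside an async function with project_id in scope:

import { bulkDelete } from "./bulk-delete";

// delete all project_log rows of one project in throttled batches;
// id and maxUtilPct fall back to their defaults ("id" and 10)
const { rowsDeleted } = await bulkDelete({
  table: "project_log",
  field: "project_id",
  value: project_id,
});
console.log(`deleted ${rowsDeleted} project_log entries`);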
213 changes: 127 additions & 86 deletions src/packages/database/postgres/delete-projects.ts
@@ -8,23 +8,33 @@ Code related to permanently deleting projects.
*/

import getLogger from "@cocalc/backend/logger";
import { newCounter } from "@cocalc/backend/metrics";
import getPool from "@cocalc/database/pool";
import { getServerSettings } from "@cocalc/database/settings";
import { callback2 } from "@cocalc/util/async-utils";
import { KUCALC_ON_PREMISES } from "@cocalc/util/db-schema/site-defaults";
import { minutes_ago } from "@cocalc/util/misc";
import { bulk_delete } from "./bulk-delete";
import { bulkDelete } from "./bulk-delete";
import { PostgreSQL } from "./types";

const log = getLogger("db:delete-projects");

const delete_projects_prom = newCounter(
"database",
"delete_projects_total",
"Deleting projects and associated data operations counter.",
["op"],
);

/*
Permanently delete from the database all project records, where the
project is explicitly deleted already (so the deleted field is true).
Call this function to set up projects for permanent deletion. This blanks
the users field so the user can no longer access the project, and we don't
know that the user had anything to do with the project. A separate phase
later then purges these projects from disk as well as from the database.
TODO: it's referenced from postgres-server-queries.coffee, but is it actually used anywhere?
*/
export async function permanently_unlink_all_deleted_projects_of_user(
db: PostgreSQL,
@@ -80,15 +90,24 @@ export async function unlink_old_deleted_projects(
}

const Q_CLEANUP_SYNCSTRINGS = `
SELECT p.project_id, s.string_id
FROM projects as p
INNER JOIN syncstrings as s
SELECT s.string_id, p.project_id
FROM projects as p INNER JOIN syncstrings as s
ON p.project_id = s.project_id
WHERE p.deleted = true
AND users IS NULL
AND p.state ->> 'state' != 'deleted'
AND p.users IS NULL
ORDER BY
p.project_id, s.string_id
LIMIT 10000
`;

const Q_CLEANUP_PROJECTS = `
SELECT project_id
FROM projects
WHERE deleted = true
AND users IS NULL
AND state ->> 'state' != 'deleted'
ORDER BY created ASC
LIMIT 1000
`;
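
// Note on the two queries above: state ->> 'state' extracts the text value of
// the JSONB "state" column, so != 'deleted' skips projects whose data was
// already purged (each processed project is marked state.state = 'deleted'
// further below). The LIMITs bound each pass; the while-loop in
// cleanup_old_projects_data picks up the remainder on its next iteration.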

/*
@@ -114,99 +133,49 @@ export async function cleanup_old_projects_data(
const start_ts = new Date();

const pool = getPool();
const { rows } = await pool.query(Q_CLEANUP_SYNCSTRINGS);

let num = 0;
let pid = "";
let num1 = 0;
let num2 = 0;

for (const row of rows) {
const { project_id, string_id } = row;
while (true) {
if (start_ts < minutes_ago(max_run_m)) {
L(`too much time elapsed, breaking after ${num} syncstrings`);
break;
L(`too much time elapsed, breaking after ${num1} syncstrings`);
return;
}

L(`deleting syncstring ${project_id}/${string_id}`);
num += 1;
await callback2(db.delete_syncstring, { string_id });

// wait a bit after deleting syncstrings, e.g. to let the standby db catch up
await new Promise((done) => setTimeout(done, 100));
const { rows: syncstrings } = await pool.query(Q_CLEANUP_SYNCSTRINGS);
L(`deleting ${syncstrings.length} syncstrings`);
for (const { project_id, string_id } of syncstrings) {
L(`deleting syncstring ${project_id}/${string_id}`);
num1 += 1;
const t0 = Date.now();
await callback2(db.delete_syncstring, { string_id });
const elapsed_ms = Date.now() - t0;
delete_projects_prom.labels("syncstring").inc();
// wait a bit after deleting syncstrings, e.g. to let the standby db catch up
// this caps deletions at roughly 10% of database time, waiting at most 1 second
await new Promise((done) =>
setTimeout(done, Math.min(1000, elapsed_ms * 9)),
);
}

// Q_CLEANUP_SYNCSTRINGS orders by project_id, hence we trigger project specific actions when the id changes
if (pid != project_id) {
pid = project_id;
const { rows: projects } = await pool.query(Q_CLEANUP_PROJECTS);
L(`deleting the data of ${projects.length} projects`);
for (const { project_id } of projects) {
const L2 = L0.extend(project_id).debug;
delete_projects_prom.labels("project").inc();
num2 += 1;
let delRows = 0;

if (on_prem) {
L2(`cleanup_old_projects_data for project_id=${project_id}`);
L2(`delete all project files`);
// TODO: this only works on-prem, and requires the project files to be mounted

L2(`deleting all shared files in project ${project_id}`);
L2(`deleting all shared files`);
// TODO: do it directly like above, and also get rid of all those shares in the database

const delPublicPaths = await bulk_delete({
table: "public_paths",
field: "project_id",
value: project_id,
});
L2(`deleted public_paths ${delPublicPaths.rowsDeleted} entries`);

const delProjectLog = await bulk_delete({
table: "project_log",
field: "project_id",
value: project_id,
});
L2(`deleted project_log ${delProjectLog.rowsDeleted} entries`);

const delFileUse = await bulk_delete({
table: "file_use",
field: "project_id",
value: project_id,
});
L2(`deleted file_use ${delFileUse.rowsDeleted} entries`);

const delAccessLog = await bulk_delete({
table: "file_access_log",
field: "project_id",
value: project_id,
});
L2(`deleted file_access_log ${delAccessLog.rowsDeleted} entries`);

const delJupyterApiLog = await bulk_delete({
table: "jupyter_api_log",
field: "project_id",
value: project_id,
});
L2(`deleted jupyter_api_log ${delJupyterApiLog.rowsDeleted} entries`);

for (const field of [
"target_project_id",
"source_project_id",
] as const) {
const delCopyPaths = await bulk_delete({
table: "copy_paths",
field,
value: project_id,
});
L2(`deleted copy_paths/${field} ${delCopyPaths.rowsDeleted} entries`);
}

const delListings = await bulk_delete({
table: "listings",
field: "project_id",
id: "project_id", // TODO listings has a more complex ID, is this a problem?
value: project_id,
});
L2(`deleted ${delListings.rowsDeleted} listings`);

const delInviteTokens = await bulk_delete({
table: "project_invite_tokens",
field: "project_id",
value: project_id,
id: "token",
});
L2(`deleted ${delInviteTokens.rowsDeleted} entries`);
// for now, on-prem only as well. This gets rid of all sorts of data in tables specific to the given project.
delRows += await delete_associated_project_data(L2, project_id);
}

// now that we're done with this project, mark it via state.state = 'deleted'
@@ -215,6 +184,78 @@ export async function cleanup_old_projects_data(
project_id,
state: "deleted",
});
L2(
`finished deleting project data | deleted ${delRows} entries | setting state.state="deleted"`,
);
}

if (projects.length === 0 && syncstrings.length === 0) {
L(`all data of deleted projects and associated syncstrings are deleted.`);
L(`In total ${num1} syncstrings and ${num2} projects were processed.`);
return;
}
}
}

async function delete_associated_project_data(
L2,
project_id: string,
): Promise<number> {
let total = 0;
// tables where the primary key is the default (i.e. "id") and
// the field to check is always called "project_id"
const tables = [
"public_paths",
"project_log",
"file_use",
"file_access_log",
"jupyter_api_log",
"openai_chatgpt_log",
] as const;

for (const table of tables) {
const { rowsDeleted } = await bulkDelete({
table,
field: "project_id",
value: project_id,
});
total += rowsDeleted;
L2(`deleted ${table} ${rowsDeleted} entries`);
}

// the remaining tables differ: they use another primary key, or the field holding the project_id value is named differently

for (const field of ["target_project_id", "source_project_id"] as const) {
const { rowsDeleted } = await bulkDelete({
table: "copy_paths",
field,
value: project_id,
});
total += rowsDeleted;
L2(`deleted copy_paths/${field} ${rowsDeleted} entries`);
}

{
const { rowsDeleted } = await bulkDelete({
table: "listings",
field: "project_id",
id: "project_id", // TODO listings has a more complex ID, is this a problem?
value: project_id,
});
total += rowsDeleted;
L2(`deleted ${rowsDeleted} listings`);
}

{
const { rowsDeleted } = await bulkDelete({
table: "project_invite_tokens",
field: "project_id",
value: project_id,
id: "token",
});
total += rowsDeleted;
L2(`deleted ${rowsDeleted} entries`);
}

return total;
}
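
The pacing used in the syncstring loop above generalizes to a small helper. The sketch below is illustrative only (the name throttledRun is hypothetical, not part of this commit): if one operation takes t ms, sleeping min(1000, 9·t) ms keeps that work at roughly 10% of database time, with the pause capped at one second.

// hypothetical helper illustrating the pacing used in cleanup_old_projects_data
async function throttledRun(op: () => Promise<void>): Promise<void> {
  const t0 = Date.now();
  await op();
  const elapsed_ms = Date.now() - t0;
  // e.g. a 50ms delete sleeps 450ms: 50 / (50 + 450) = 10% busy;
  // longer operations sleep at most 1 second
  await new Promise((done) => setTimeout(done, Math.min(1000, elapsed_ms * 9)));
}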
10 changes: 8 additions & 2 deletions src/packages/hub/run/delete-projects.js
@@ -1,9 +1,14 @@
#!/usr/bin/env node

/*
Periodically delete projects.
TODO: For now, this just calls the unlink function. Later on it
should do more (actually delete data, etc.).
STATUS:
For now, this just calls the unlink function and deletes all associated syncstrings and data.
In "onprem" mode, this also deletes entries in various tables that contain data specific to the deleted projects.
TESTING: to run this in development and see logging, call it like this:
./src/packages/hub$ env DEBUG_CONSOLE=yes DEBUG=cocalc:debug:db:* pnpm cocalc-hub-delete-projects
*/

const postgres = require("@cocalc/database");
@@ -16,6 +21,7 @@ async function update() {
console.log("unlinking old deleted projects...");
try {
await db.unlink_old_deleted_projects();
// limit the max runtime to half the interval time
const max_run_m = (INTERVAL_MS / 2) / (1000 * 60)
await db.cleanup_old_projects_data(max_run_m);
} catch (err) {
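
For the runtime cap added above: with an assumed INTERVAL_MS of two hours, max_run_m = (7200000 / 2) / (1000 * 60) = 60 minutes, so a single run may use at most half the interval before cleanup_old_projects_data returns on its own. A minimal sketch of the surrounding scheduler, under the assumption that only update() and INTERVAL_MS from this diff are real:

// assumed wiring: run update() once at startup, then on a fixed cadence
const INTERVAL_MS = 2 * 60 * 60 * 1000; // assumption: a 2-hour cadence

async function update(): Promise<void> {
  // limit the max runtime to half the interval time, in minutes
  const max_run_m = INTERVAL_MS / 2 / (1000 * 60); // = 60 for a 2-hour interval
  // ... unlink_old_deleted_projects() and cleanup_old_projects_data(max_run_m), as in the diff
}

update();
setInterval(update, INTERVAL_MS);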
