Skip to content

Commit

Permalink
database/bulk-delete: delete many rows without overwhelming the DB
Browse files Browse the repository at this point in the history
  • Loading branch information
haraldschilly committed Jul 10, 2024
1 parent af28b7f commit 4126e2c
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 30 deletions.
28 changes: 25 additions & 3 deletions src/packages/database/postgres/bulk-delete.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ describe("bulk delete", () => {
const p = getPool();
const project_id = uuid();
const N = 2000;

// extra entry, which has to remain
const other = uuid();
await p.query(
"INSERT INTO project_log (id, project_id, time) VALUES($1::UUID, $2::UUID, $3::TIMESTAMP)",
[other, uuid(), new Date()],
);

for (let i = 0; i < N; i++) {
await p.query(
"INSERT INTO project_log (id, project_id, time) VALUES($1::UUID, $2::UUID, $3::TIMESTAMP)",
Expand All @@ -33,17 +41,31 @@ describe("bulk delete", () => {
);
expect(num1.rows[0].num).toEqual(N);

await bulk_delete({
const res = await bulk_delete({
table: "project_log",
field: "project_id",
value: project_id,
limit: 100,
limit: 128,
});

// if this ever fails, the "ret.rowCount" value is inaccurate.
// This must be replaced by "RETURNING 1" in the the query and a "SELECT COUNT(*) ..." and so.
// (and not only here, but everywhere in the code base)
expect(res.rowsDeleted).toEqual(N);
expect(res.durationS).toBeGreaterThan(0.01);
expect(res.totalPgTimeS).toBeGreaterThan(0.001);
expect(res.totalWaitS).toBeGreaterThan(0.001);
expect((res.totalPgTimeS * 10) / res.totalWaitS).toBeGreaterThan(0.5);

const num2 = await p.query(
"SELECT COUNT(*)::INT as num FROM project_log WHERE project_id = $1",
[project_id],
);
expect(num2.rows[0].num).toEqual(0);
});

const otherRes = await p.query("SELECT * FROM project_log WHERE id = $1", [
other,
]);
expect(otherRes.rows[0].id).toEqual(other);
}, 10000);
});
62 changes: 35 additions & 27 deletions src/packages/database/postgres/bulk-delete.ts
Original file line number Diff line number Diff line change
@@ -1,73 +1,81 @@
// see packages/database/pool/pool.ts for where this name is also hard coded:
process.env.PGDATABASE = "smc_ephemeral_testing_database";

import { escapeIdentifier } from "pg";

import getPool from "@cocalc/database/pool";
import { SCHEMA } from "@cocalc/util/schema";

interface Opts {
table: string;
table: string; // e.g. project_log, etc.
field: "project_id" | "account_id"; // for now, we only support a few
id?: string; // default "id", the ID field in the table, which identifies each row uniquely
value: string; // a UUID
limit?: number;
limit?: number; // default 1024
maxUtilPct?: number; // 0-100, percent
}

type Ret = Promise<{
rowsDeleted: number;
durationS: number;
totalWaitS: number;
totalPgTimeS: number;
}>;

function deleteQuery(table: string, field: string) {
function deleteQuery(table: string, field: string, id: string) {
const T = escapeIdentifier(table);
const F = escapeIdentifier(field);
const ID = escapeIdentifier(id);

return `
DELETE FROM ${T}
WHERE ${F} IN (
SELECT ${F} FROM ${T} WHERE ${F} = $1 LIMIT $2
)
RETURNING 1
`;
WHERE ${ID} IN (
SELECT ${ID} FROM ${T} WHERE ${F} = $1 LIMIT $2
)`;
}

export async function bulk_delete(opts: Opts): Ret {
const { table, field, value } = opts;
let { limit = 1000 } = opts;
const { table, field, value, id = "id", maxUtilPct = 10 } = opts;
let { limit = 1024 } = opts;
// assert table name is a key in SCHEMA
if (!(table in SCHEMA)) {
throw new Error(`table ${table} does not exist`);
}

const q = deleteQuery(table, field);
console.log(q);
console.log(opts);
if (maxUtilPct < 1 || maxUtilPct > 99) {
throw new Error(`maxUtilPct must be between 1 and 99`);
}

const q = deleteQuery(table, field, id);
const pool = getPool();

const start_ts = Date.now();
let rowsDeleted = 0;

let rowsDeleted = 0;
let totalWaitS = 0;
let totalPgTimeS = 0;
while (true) {
const t0 = Date.now();
const ret = await pool.query(q, [value, limit]);
const td = Date.now() - t0;
const dt = (Date.now() - t0) / 1000;
rowsDeleted += ret.rowCount ?? 0;
totalPgTimeS += dt;

// adjust the limit
const next = Math.round(
td > 0.1 ? limit / 2 : td < 0.05 ? limit * 2 : limit,
);
limit = Math.max(1, Math.min(10000, next));
// adjust the limit: we aim to keep the operation between 0.1 and 0.2 secs
const next = dt > 0.2 ? limit / 2 : dt < 0.1 ? limit * 2 : limit;
limit = Math.max(1, Math.min(32768, Math.round(next)));

// wait for a bit, but not more than 1 second ~ this aims for a max utilization of 10%
const wait_ms = Math.min(1000, td * 10);
await new Promise((done) => setTimeout(done, wait_ms));
const waitS = Math.min(1, dt * ((100 - maxUtilPct) / maxUtilPct));
await new Promise((done) => setTimeout(done, 1000 * waitS));
totalWaitS += waitS;

console.log(
`loop: deleted ${ret.rowCount} | wait=${wait_ms} | limit=${limit}`,
);
// console.log(
// `deleted ${ret.rowCount} | dt=${dt} | wait=${waitS} | limit=${limit}`,
// );

if (ret.rowCount === 0) break;
}

const durationS = (Date.now() - start_ts) / 1000;
return { durationS, rowsDeleted };
return { durationS, rowsDeleted, totalWaitS, totalPgTimeS };
}

0 comments on commit 4126e2c

Please sign in to comment.