diff --git a/docs/files.md b/docs/files.md index 40dcae2b7..900733ea6 100644 --- a/docs/files.md +++ b/docs/files.md @@ -60,15 +60,16 @@ For missing files, `file.lastModified` is undefined. The `file.mimeType` is dete | method | return type | - | - +| [`file.arquero`][arquero] | Arquero [`Table`][arquero-table] | [`file.arrayBuffer`][binary] | [`ArrayBuffer`][array-buffer] -| [`file.arrow`][arrow] | [`Table`][arrow-table] +| [`file.arrow`][arrow] | Arrow [`Table`][arrow-table] | [`file.blob`][binary] | [`Blob`][blob] | [`file.csv`][csv] | [`Array`][array] | [`file.dsv`][csv] | [`Array`][array] | [`file.html`][markup] | [`Document`][document] | [`file.image`][media] | [`HTMLImageElement`][image] | [`file.json`][json] | [`Array`][array], [`Object`][object], _etc._ -| [`file.parquet`][arrow] | [`Table`][arrow-table] +| [`file.parquet`][arrow] | Arrow [`Table`][arrow-table] | [`file.sqlite`][sqlite] | [`SQLiteDatabaseClient`][sqlite] | [`file.stream`][binary] | [`ReadableStream`][stream] | [`file.text`][text] | [`string`][string] @@ -77,6 +78,8 @@ For missing files, `file.lastModified` is undefined. The `file.mimeType` is dete | [`file.xml`][markup] | [`Document`][document] | [`file.zip`][zip] | [`ZipArchive`][zip] +[arquero]: ./lib/arquero +[arquero-table]: https://idl.uw.edu/arquero/api/#table [array-buffer]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/ArrayBuffer [arrow-table]: https://arrow.apache.org/docs/js/classes/Arrow_dom.Table.html [blob]: https://developer.mozilla.org/en-US/docs/Web/API/Blob @@ -98,7 +101,7 @@ For missing files, `file.lastModified` is undefined. The `file.mimeType` is dete [xlsx]: ./lib/xlsx [zip]: ./lib/zip -The contents of a file often dictate the appropriate method — for example, an Apache Arrow file is almost always read with `file.arrow`. When multiple methods are valid, choose based on your needs. For example, you can load a CSV file using `file.text` to implement parsing yourself. 
+The contents of a file often dictate the appropriate method — for example, an Excel XLSX file is almost always read with `file.xlsx`. When multiple methods are valid, choose based on your needs. For example, you can load a CSV file using `file.arquero` to load it into [Arquero](./lib/arquero), or even using `file.text` to implement parsing yourself. In addition to the above, you can get the resolved absolute URL of the file using `file.href`: diff --git a/docs/lib/arquero.md b/docs/lib/arquero.md index 05c003f93..bc29044d8 100644 --- a/docs/lib/arquero.md +++ b/docs/lib/arquero.md @@ -1,8 +1,17 @@ + + # Arquero [Arquero](https://uwdata.github.io/arquero/) is a JavaScript library for “query processing and transformation of array-backed data tables.” Arquero is available by default as `aq` in Markdown, but you can import it explicitly like so: -```js echo +```js run=false import * as aq from "npm:arquero"; ``` @@ -19,14 +28,13 @@ const dt = aq.table({ Arquero is column-oriented: each column is an array of values of a given type. Here, numbers representing hours of sunshine per month. But an Arquero table is also iterable and as such, its contents can be displayed with [`Inputs.table`](/lib/inputs#table). ```js echo -Inputs.table(dt, {maxWidth: 640}) +Inputs.table(dt) ``` An Arquero table can also be used to make charts with [Observable Plot](./plot): ```js echo Plot.plot({ - width: Math.min(width, 640), x: {tickFormat: Plot.formatMonth()}, y: {grid: true, label: "Hours of sunshine ☀️ per month"}, marks: [ @@ -41,25 +49,25 @@ Plot.plot({ Arquero supports a range of data transformation tasks, including filter, sample, aggregation, window, join, and reshaping operations. For example, the following operation derives differences between Seattle and Chicago and sorts the months accordingly. 
```js echo -const diffs = dt.derive({ - month: (d) => aq.op.row_number(), - diff: (d) => d.Seattle - d.Chicago - }) - .select("month", "diff") - .orderby(aq.desc("diff")); - -display(Inputs.table(diffs, {maxWidth: 640})); +Inputs.table( + dt.derive({ + month: (d) => aq.op.row_number(), + diff: (d) => d.Seattle - d.Chicago + }) + .select("month", "diff") + .orderby(aq.desc("diff")) +) ``` Is Seattle more correlated with San Francisco or Chicago? ```js echo -const correlations = dt.rollup({ - corr_sf: aq.op.corr("Seattle", "San Francisco"), - corr_chi: aq.op.corr("Seattle", "Chicago") -}); - -display(Inputs.table(correlations, {maxWidth: 640})); +Inputs.table( + dt.rollup({ + corr_sf: aq.op.corr("Seattle", "San Francisco"), + corr_chi: aq.op.corr("Seattle", "Chicago") + }) +) ``` We can aggregate statistics per city. The following code reshapes (or “folds”) the data into two columns _city_ & _sun_ and shows the output as objects: @@ -68,14 +76,25 @@ We can aggregate statistics per city. The following code reshapes (or “folds dt.fold(aq.all(), {as: ["city", "sun"]}) .groupby("city") .rollup({ - min: (d) => aq.op.min(d.sun), // functional form of op.min('sun') - max: (d) => aq.op.max(d.sun), - avg: (d) => aq.op.average(d.sun), - med: (d) => aq.op.median(d.sun), - // functional forms permit flexible table expressions - skew: ({sun: s}) => (aq.op.mean(s) - aq.op.median(s)) / aq.op.stdev(s) || 0 + min: aq.op.min("sun"), + max: aq.op.max("sun"), + avg: (d) => aq.op.average(d.sun), // equivalent to aq.op.average("sun") + med: (d) => aq.op.median(d.sun), // equivalent to aq.op.median("sun") + skew: ({sun}) => (aq.op.mean(sun) - aq.op.median(sun)) / aq.op.stdev(sun) }) .objects() ``` +To load an Arquero table from an Apache Arrow, Apache Parquet, CSV, TSV, or JSON file, use [`file.arquero`](../files#arquero): + +```js run=false +const flights = FileAttachment("flights-200k.arrow").arquero(); +``` + +This is equivalent to: + +```js run=false +const flights = 
aq.loadArrow(FileAttachment("flights-200k.arrow").href); +``` + For more, see [Arquero’s official documentation](https://uwdata.github.io/arquero/). diff --git a/docs/lib/csv.md b/docs/lib/csv.md index aa9d5e526..efbb2d639 100644 --- a/docs/lib/csv.md +++ b/docs/lib/csv.md @@ -18,7 +18,7 @@ The column names are listed in the `columns` property: gistemp.columns ``` -You can also load a tab-separated values (TSV) file using `FileAttachment.tsv`: +You can also load a tab-separated values (TSV) file using `file.tsv`: ```js echo const capitals = FileAttachment("us-state-capitals.tsv").tsv({typed: true}); @@ -28,7 +28,7 @@ const capitals = FileAttachment("us-state-capitals.tsv").tsv({typed: true}); Inputs.table(capitals) ``` -For a different delimiter, use `FileAttachment.dsv`. For example, for semicolon separated values: +For a different delimiter, use `file.dsv`. For example, for semicolon-separated values: ```js run=false const capitals = FileAttachment("us-state-capitals.csv").dsv({delimiter: ";", typed: true}); diff --git a/docs/loaders.md b/docs/loaders.md index 86a232c87..aeb7f33e8 100644 --- a/docs/loaders.md +++ b/docs/loaders.md @@ -128,7 +128,7 @@ const metadata = FileAttachment("quakes/metadata.json").json(); const features = FileAttachment("quakes/features.csv").csv({typed: true}); ``` -The ZIP file itself can be also referenced as a whole — for example if the names of the files are not known in advance — with [`FileAttachment.zip`](./lib/zip): +The ZIP file itself can also be referenced as a whole — for example if the names of the files are not known in advance — with [`file.zip`](./lib/zip): ```js echo const zip = FileAttachment("quakes.zip").zip(); diff --git a/docs/reactivity.md b/docs/reactivity.md index b9f6a8aa6..388cdea5c 100644 --- a/docs/reactivity.md +++ b/docs/reactivity.md @@ -64,7 +64,7 @@ In Framework, when one code block refers to a promise defined in another code bl
Implicit await only applies across code blocks, not within a code block. Within a code block, a promise is just a promise.
-For example, below `FileAttachment.json` returns a promise, and so the value of `volcano` inside the code block is a promise. +For example, below `file.json` returns a promise, and so the value of `volcano` inside the code block is a promise. ```js echo const volcano = FileAttachment("volcano.json").json(); diff --git a/src/client/stdlib/fileAttachment.js b/src/client/stdlib/fileAttachment.js index 4190c3f99..6466d1cc1 100644 --- a/src/client/stdlib/fileAttachment.js +++ b/src/client/stdlib/fileAttachment.js @@ -76,6 +76,36 @@ export class AbstractFile { const [Arrow, response] = await Promise.all([import("npm:apache-arrow"), remote_fetch(this)]); return Arrow.tableFromIPC(response); } + async arquero(options) { + let request; + let from; + switch (this.mimeType) { + case "application/json": + request = this.text(); + from = "fromJSON"; + break; + case "text/tab-separated-values": + if (options?.delimiter === undefined) options = {...options, delimiter: "\t"}; + // fall through + case "text/csv": + request = this.text(); + from = "fromCSV"; + break; + default: + if (/\.arrow$/i.test(this.name)) { + request = this.arrow(); + from = "fromArrow"; + } else if (/\.parquet$/i.test(this.name)) { + request = this.parquet(); + from = "fromArrow"; + } else { + throw new Error(`unable to determine Arquero loader: ${this.name}`); + } + break; + } + const [aq, body] = await Promise.all([import("npm:arquero"), request]); + return aq[from](body, options); + } async parquet() { const [Arrow, Parquet, buffer] = await Promise.all([import("npm:apache-arrow"), import("npm:parquet-wasm").then(async (Parquet) => (await Parquet.default(import.meta.resolve("npm:parquet-wasm/esm/parquet_wasm_bg.wasm")), Parquet)), this.arrayBuffer()]); // prettier-ignore return Arrow.tableFromIPC(Parquet.readParquet(new Uint8Array(buffer)).intoIPCStream()); diff --git a/src/javascript/files.ts b/src/javascript/files.ts index 89d04f06f..e3ba16f29 100644 --- a/src/javascript/files.ts +++ 
b/src/javascript/files.ts @@ -105,11 +105,14 @@ export function findFiles( const filePath = resolveLocalPath(path, fileName); if (!filePath) throw syntaxError(`non-local file path: ${fileName}`, node, input); const parent = stack[stack.length - 2]; - const fileMethod = + const name = relativePath(path, filePath); + const method = parent && isMemberExpression(parent) && parent.property.type === "Identifier" - ? parent.property.name // FileAttachment("foo.csv").csv - : KNOWN_FILE_EXTENSIONS[extname(fileName)]; // bare FileAttachment("foo.csv") - files.push({node, name: relativePath(path, filePath), method: fileMethod}); + ? parent.property.name === "arquero" && /\.parquet$/i.test(fileName) + ? "arquero-parquet" // FileAttachment("foo.parquet").arquero + : parent.property.name // FileAttachment("foo.csv").csv + : KNOWN_FILE_EXTENSIONS[extname(fileName).toLowerCase()]; // bare FileAttachment("foo.csv") + files.push({node, name, method}); } }); diff --git a/src/libraries.ts b/src/libraries.ts index bf296c847..628fe60d7 100644 --- a/src/libraries.ts +++ b/src/libraries.ts @@ -2,6 +2,8 @@ export function getImplicitFileImports(methods: Iterable): Set { const set = setof(methods); const implicits = new Set(); if (set.has("arrow")) implicits.add("npm:apache-arrow"); + if (set.has("arquero")) implicits.add("npm:apache-arrow").add("npm:arquero"); + if (set.has("arquero-parquet")) implicits.add("npm:apache-arrow").add("npm:arquero").add("npm:parquet-wasm"); if (set.has("csv") || set.has("tsv")) implicits.add("npm:d3-dsv"); if (set.has("parquet")) implicits.add("npm:apache-arrow").add("npm:parquet-wasm"); if (set.has("sqlite")) implicits.add("npm:@observablehq/sqlite"); diff --git a/test/javascript/files-test.ts b/test/javascript/files-test.ts index 9f5f02a44..3c623c858 100644 --- a/test/javascript/files-test.ts +++ b/test/javascript/files-test.ts @@ -62,7 +62,8 @@ describe("findFiles(node, input)", () => { it("sets the file method based on the member expression", () => { 
assert.deepStrictEqual(files('FileAttachment("foo").arrayBuffer'), [{name: "./foo", method: "arrayBuffer"}]); assert.deepStrictEqual(files('FileAttachment("foo").arrow'), [{name: "./foo", method: "arrow"}]); - assert.deepStrictEqual(files('FileAttachment("foo").arrow'), [{name: "./foo", method: "arrow"}]); + assert.deepStrictEqual(files('FileAttachment("foo").arquero'), [{name: "./foo", method: "arquero"}]); + assert.deepStrictEqual(files('FileAttachment("foo.parquet").arquero'), [{name: "./foo.parquet", method: "arquero-parquet"}]); assert.deepStrictEqual(files('FileAttachment("foo").blob'), [{name: "./foo", method: "blob"}]); assert.deepStrictEqual(files('FileAttachment("foo").csv'), [{name: "./foo", method: "csv"}]); assert.deepStrictEqual(files('FileAttachment("foo").html'), [{name: "./foo", method: "html"}]); diff --git a/test/libraries-test.ts b/test/libraries-test.ts index 97fc13485..c2a27fecb 100644 --- a/test/libraries-test.ts +++ b/test/libraries-test.ts @@ -7,6 +7,8 @@ describe("getImplicitFileImports(files)", () => { assert.deepStrictEqual(getImplicitFileImports(["csv"]), new Set(["npm:d3-dsv"])); assert.deepStrictEqual(getImplicitFileImports(["tsv"]), new Set(["npm:d3-dsv"])); assert.deepStrictEqual(getImplicitFileImports(["arrow"]), new Set(["npm:apache-arrow"])); + assert.deepStrictEqual(getImplicitFileImports(["arquero"]), new Set(["npm:apache-arrow", "npm:arquero"])); + assert.deepStrictEqual(getImplicitFileImports(["arquero-parquet"]), new Set(["npm:apache-arrow", "npm:arquero", "npm:parquet-wasm"])); // prettier-ignore assert.deepStrictEqual(getImplicitFileImports(["parquet"]), new Set(["npm:apache-arrow", "npm:parquet-wasm"])); assert.deepStrictEqual(getImplicitFileImports(["sqlite"]), new Set(["npm:@observablehq/sqlite"])); assert.deepStrictEqual(getImplicitFileImports(["xlsx"]), new Set(["npm:@observablehq/xlsx"]));