From 8f3d0cb68bc55ff81e9a5f47a65b266bd4f37570 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 4 Jun 2021 19:09:02 +0800 Subject: [PATCH] use prettier to auto format md files --- .github/workflows/dev.yml | 14 ++++++- CODE_OF_CONDUCT.md | 4 +- CONTRIBUTING.md | 26 ++++++------- README.md | 34 ++++++++--------- arrow/README.md | 36 +++++++++--------- .../fixtures/crossbow-success-message.md | 12 +++--- dev/release/README.md | 35 +++++++----------- integration-testing/README.md | 10 ++--- parquet/README.md | 37 ++++++++++--------- 9 files changed, 107 insertions(+), 101 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 9d8146a2f1d6..545cb97cd2da 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -27,7 +27,6 @@ env: ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: - lint: name: Lint C++, Python, R, Rust, Docker, RAT runs-on: ubuntu-latest @@ -41,3 +40,16 @@ jobs: run: pip install -e dev/archery[docker] - name: Lint run: archery lint --rat + prettier: + name: Use prettier to check formatting of documents + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-node@v2 + with: + node-version: "14" + - name: Prettier check + run: | + # if you encounter error, try rerun the command below with --write instead of --check + # and commit the changes + npx prettier@2.3.0 --check {arrow,arrow-flight,dev,integration-testing,parquet}/**/*.md README.md CODE_OF_CONDUCT.md CONTRIBUTING.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 2efe740b77c5..9a24b9b8a110 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -19,6 +19,6 @@ # Code of Conduct -* [Code of Conduct for The Apache Software Foundation][1] +- [Code of Conduct for The Apache Software Foundation][1] -[1]: https://www.apache.org/foundation/policies/conduct.html \ No newline at end of file +[1]: https://www.apache.org/foundation/policies/conduct.html diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md 
index 3e636d9cd2fe..18d6a7be5abb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,15 +21,15 @@ ## Did you find a bug? -The Arrow project uses JIRA as a bug tracker. To report a bug, you'll have +The Arrow project uses JIRA as a bug tracker. To report a bug, you'll have to first create an account on the -[Apache Foundation JIRA](https://issues.apache.org/jira/). The JIRA server -hosts bugs and issues for multiple Apache projects. The JIRA project name +[Apache Foundation JIRA](https://issues.apache.org/jira/). The JIRA server +hosts bugs and issues for multiple Apache projects. The JIRA project name for Arrow is "ARROW". To be assigned to an issue, ask an Arrow JIRA admin to go to [Arrow Roles](https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/roles), -click "Add users to a role," and add you to the "Contributor" role. Most +click "Add users to a role," and add you to the "Contributor" role. Most committers are authorized to do this; if you're a committer and aren't able to load that project admin page, have someone else add you to the necessary role. @@ -39,15 +39,15 @@ Before you create a new bug entry, we recommend you first among existing Arrow issues. When you create a new JIRA entry, please don't forget to fill the "Component" -field. Arrow has many subcomponents and this helps triaging and filtering -tremendously. Also, we conventionally prefix the issue title with the component +field. Arrow has many subcomponents and this helps triaging and filtering +tremendously. Also, we conventionally prefix the issue title with the component name in brackets, such as "[C++] Crash in Array::Frobnicate()", so as to make lists more easy to navigate, and we'd be grateful if you did the same. ## Did you write a patch that fixes a bug or brings an improvement? -First create a JIRA entry as described above. Then, submit your changes -as a GitHub Pull Request. 
We'll ask you to prefix the pull request title +First create a JIRA entry as described above. Then, submit your changes +as a GitHub Pull Request. We'll ask you to prefix the pull request title with the JIRA issue number and the component name in brackets. (for example: "ARROW-2345: [C++] Fix crash in Array::Frobnicate()"). Respecting this convention makes it easier for us to process the backlog @@ -55,13 +55,13 @@ of submitted Pull Requests. ### Minor Fixes -Any functionality change should have a JIRA opened. For minor changes that -affect documentation, you do not need to open up a JIRA. Instead you can +Any functionality change should have a JIRA opened. For minor changes that +affect documentation, you do not need to open up a JIRA. Instead you can prefix the title of your PR with "MINOR: " if meets the following guidelines: -* Grammar, usage and spelling fixes that affect no more than 2 files -* Documentation updates affecting no more than 2 files and not more - than 500 words. +- Grammar, usage and spelling fixes that affect no more than 2 files +- Documentation updates affecting no more than 2 files and not more + than 500 words. ## Do you want to propose a significant new feature or an important refactoring? 
diff --git a/README.md b/README.md index 21a625aa310d..d10bf6738be2 100644 --- a/README.md +++ b/README.md @@ -25,13 +25,13 @@ Welcome to the implementation of Arrow, the popular in-memory columnar format, i This part of the Arrow project is divided in 4 main components: -| Crate | Description | Documentation | -|-----------|-------------|---------------| -|Arrow | Core functionality (memory layout, arrays, low level computations) | [(README)](arrow/README.md) | -|Parquet | Parquet support | [(README)](parquet/README.md) | -|Arrow-flight | Arrow data between processes | [(README)](arrow-flight/README.md) | -|DataFusion | In-memory query engine with SQL support | [(README)](https://github.com/apache/arrow-datafusion/blob/master/README.md) | -|Ballista | Distributed query execution | [(README)](https://github.com/apache/arrow-datafusion/blob/master/ballista/README.md) | +| Crate | Description | Documentation | +| ------------ | ------------------------------------------------------------------ | ------------------------------------------------------------------------------------- | +| Arrow | Core functionality (memory layout, arrays, low level computations) | [(README)](arrow/README.md) | +| Parquet | Parquet support | [(README)](parquet/README.md) | +| Arrow-flight | Arrow data between processes | [(README)](arrow-flight/README.md) | +| DataFusion | In-memory query engine with SQL support | [(README)](https://github.com/apache/arrow-datafusion/blob/master/README.md) | +| Ballista | Distributed query execution | [(README)](https://github.com/apache/arrow-datafusion/blob/master/ballista/README.md) | Independently, they support a vast array of functionality for in-memory computations. 
@@ -39,15 +39,15 @@ Together, they allow users to write an SQL query or a `DataFrame` (using the `da Generally speaking, the `arrow` crate offers functionality to develop code that uses Arrow arrays, and `datafusion` offers most operations typically found in SQL, with the notable exceptions of: -* `join` -* `window` functions +- `join` +- `window` functions There are too many features to enumerate here, but some notable mentions: -* `Arrow` implements all formats in the specification except certain dictionaries -* `Arrow` supports SIMD operations to some of its vertical operations -* `DataFusion` supports `async` execution -* `DataFusion` supports user-defined functions, aggregates, and whole execution nodes +- `Arrow` implements all formats in the specification except certain dictionaries +- `Arrow` supports SIMD operations to some of its vertical operations +- `DataFusion` supports `async` execution +- `DataFusion` supports user-defined functions, aggregates, and whole execution nodes You can find more details about each crate in their respective READMEs. @@ -118,7 +118,6 @@ export ARROW_TEST_DATA=$(cd ../testing/data; pwd) From here on, this is a pure Rust project and `cargo` can be used to run tests, benchmarks, docs and examples as usual. - ### Running the tests Run tests using the Rust standard `cargo test` command: @@ -156,9 +155,10 @@ If you use Visual Studio Code with the `rust-analyzer` plugin, you can enable `c One of the concerns with `clippy` is that it often produces a lot of false positives, or that some recommendations may hurt readability. We do not have a policy of which lints are ignored, but if you disagree with a `clippy` lint, you may disable the lint and briefly justify it. Search for `allow(clippy::` in the codebase to identify lints that are ignored/allowed. We currently prefer ignoring lints on the lowest unit possible. -* If you are introducing a line that returns a lint warning or error, you may disable the lint on that line. 
-* If you have several lints on a function or module, you may disable the lint on the function or module. -* If a lint is pervasive across multiple modules, you may disable it at the crate level. + +- If you are introducing a line that returns a lint warning or error, you may disable the lint on that line. +- If you have several lints on a function or module, you may disable the lint on the function or module. +- If a lint is pervasive across multiple modules, you may disable it at the crate level. ## Git Pre-Commit Hook diff --git a/arrow/README.md b/arrow/README.md index f67d582c6f44..dfd5926281a0 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -79,12 +79,12 @@ The above script will run the `flatc` compiler and perform some adjustments to t Arrow uses the following features: -* `simd` - Arrow uses the [packed_simd](https://crates.io/crates/packed_simd) crate to optimize many of the - implementations in the [compute](https://github.com/apache/arrow/tree/master/rust/arrow/src/compute) - module using SIMD intrinsics. These optimizations are turned *off* by default. - If the `simd` feature is enabled, an unstable version of Rust is required (we test with `nightly-2021-03-24`) -* `flight` which contains useful functions to convert between the Flight wire format and Arrow data -* `prettyprint` which is a utility for printing record batches +- `simd` - Arrow uses the [packed_simd](https://crates.io/crates/packed_simd) crate to optimize many of the + implementations in the [compute](https://github.com/apache/arrow/tree/master/rust/arrow/src/compute) + module using SIMD intrinsics. These optimizations are turned _off_ by default. 
+ If the `simd` feature is enabled, an unstable version of Rust is required (we test with `nightly-2021-03-24`) +- `flight` which contains useful functions to convert between the Flight wire format and Arrow data +- `prettyprint` which is a utility for printing record batches Other than `simd` all the other features are enabled by default. Disabling `prettyprint` might be necessary in order to compile Arrow to the `wasm32-unknown-unknown` WASM target. @@ -99,12 +99,12 @@ This crate only accepts the usage of `unsafe` code upon careful consideration, a Generally, `unsafe` should only be used when a `safe` counterpart is not available and there is no `safe` way to achieve additional performance in that area. The following is a summary of the current components of the crate that require `unsafe`: -* alloc, dealloc and realloc of buffers along cache lines -* Interpreting bytes as certain rust types, for access, representation and compute -* Foreign interfaces (C data interface) -* Inter-process communication (IPC) -* SIMD -* Performance (e.g. omit bounds checks, use of pointers to avoid bound checks) +- alloc, dealloc and realloc of buffers along cache lines +- Interpreting bytes as certain rust types, for access, representation and compute +- Foreign interfaces (C data interface) +- Inter-process communication (IPC) +- SIMD +- Performance (e.g. omit bounds checks, use of pointers to avoid bound checks) #### cache-line aligned memory management @@ -147,13 +147,13 @@ Usage of `unsafe` for performance reasons is justified only when all other alter ### Considerations when introducing `unsafe` -Usage of `unsafe` in this crate *must*: +Usage of `unsafe` in this crate _must_: -* not expose a public API as `safe` when there are necessary invariants for that API to be defined behavior. 
-* have code documentation for why `safe` is not used / possible -* have code documentation about which invariant the user needs to enforce to ensure [soundness](https://rust-lang.github.io/unsafe-code-guidelines/glossary.html#soundness-of-code--of-a-library), or which -* invariant is being preserved. -* if applicable, use `debug_assert`s to relevant invariants (e.g. bound checks) +- not expose a public API as `safe` when there are necessary invariants for that API to be defined behavior. +- have code documentation for why `safe` is not used / possible +- have code documentation about which invariant the user needs to enforce to ensure [soundness](https://rust-lang.github.io/unsafe-code-guidelines/glossary.html#soundness-of-code--of-a-library), or which +- invariant is being preserved. +- if applicable, use `debug_assert`s to relevant invariants (e.g. bound checks) Example of code documentation: diff --git a/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md b/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md index 15825218c13a..f914287dcc09 100644 --- a/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md +++ b/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md @@ -2,9 +2,9 @@ Revision: {revision} Submitted crossbow builds: [{repo} @ {branch}](https://github.com/{repo}/branches/all?query={branch}) -|Task|Status| -|----|------| -|docker-cpp-cmake32|[![CircleCI](https://img.shields.io/circleci/build/github/{repo}/{branch}-circle-docker-cpp-cmake32.svg)](https://circleci.com/gh/{repo}/tree/{branch}-circle-docker-cpp-cmake32)| -|wheel-osx-cp36m|[![TravisCI](https://img.shields.io/travis/{repo}/{branch}-travis-wheel-osx-cp36m.svg)](https://travis-ci.com/{repo}/branches)| -|wheel-osx-cp37m|[![TravisCI](https://img.shields.io/travis/{repo}/{branch}-travis-wheel-osx-cp37m.svg)](https://travis-ci.com/{repo}/branches)| 
-|wheel-win-cp36m|[![Appveyor](https://img.shields.io/appveyor/ci/{repo}/{branch}-appveyor-wheel-win-cp36m.svg)](https://ci.appveyor.com/project/{repo}/history)| +| Task | Status | +| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| docker-cpp-cmake32 | [![CircleCI](https://img.shields.io/circleci/build/github/{repo}/{branch}-circle-docker-cpp-cmake32.svg)](https://circleci.com/gh/{repo}/tree/{branch}-circle-docker-cpp-cmake32) | +| wheel-osx-cp36m | [![TravisCI](https://img.shields.io/travis/{repo}/{branch}-travis-wheel-osx-cp36m.svg)](https://travis-ci.com/{repo}/branches) | +| wheel-osx-cp37m | [![TravisCI](https://img.shields.io/travis/{repo}/{branch}-travis-wheel-osx-cp37m.svg)](https://travis-ci.com/{repo}/branches) | +| wheel-win-cp36m | [![Appveyor](https://img.shields.io/appveyor/ci/{repo}/{branch}-appveyor-wheel-win-cp36m.svg)](https://ci.appveyor.com/project/{repo}/history) | diff --git a/dev/release/README.md b/dev/release/README.md index 62f00638d031..ccddb87d4162 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -22,15 +22,17 @@ ## Branching We would maintain two branches: `active_release` and `master`. -* All new PRs are created and merged against `master` -* All versions are created from the `active_release` branch -* Once merged to master, changes are "cherry-picked" (via a hopefully soon to be automated process), to the `active_release` branch based on the judgement of the original PR author and maintainers. 
-* We do not merge breaking api changes, as defined in [Rust RFC 1105](https://github.com/rust-lang/rfcs/blob/master/text/1105-api-evolution.md) to the `active_release` +- All new PRs are created and merged against `master` +- All versions are created from the `active_release` branch +- Once merged to master, changes are "cherry-picked" (via a hopefully soon to be automated process), to the `active_release` branch based on the judgement of the original PR author and maintainers. + +- We do not merge breaking api changes, as defined in [Rust RFC 1105](https://github.com/rust-lang/rfcs/blob/master/text/1105-api-evolution.md) to the `active_release` Please see the [original proposal](https://docs.google.com/document/d/1tMQ67iu8XyGGZuj--h9WQYB9inCk6c2sL_4xMTwENGc/edit?ts=60961758) document the rational of this change. ## Release Branching + We aim to release every other week from the `active_release` branch. Every other Monday, a maintainer proposes a minor (e.g. `4.1.0` to `4.2.0`) or patch (e.g `4.1.0` to `4.1.1`) release, depending on changes to the `active_release` in the previous 2 weeks, following the process beloe. @@ -44,6 +46,7 @@ Apache Arrow in general does synchronized major releases every three months. The This directory contains the scripts used to manage an Apache Arrow Release. # Process Overview + As part of the Apache governance model, official releases consist of signed source tarballs approved by the PMC. @@ -52,7 +55,6 @@ crates.io, the Rust ecosystem's package manager. ## Branching - # Release Preparation # Change Log @@ -65,16 +67,13 @@ The CHANGELOG is created automatically using This script creates a changelog using github issues and the labels associated with them. - - - # Mechanics of creating a release ## Prepare the release branch and tags First, ensure that `active_release` contains the content of the desired release. For minor and patch releases, no additional steps are needed. 
-To prepare for *a major release*, change `active release` to point at the latest `master` with commands such as: +To prepare for _a major release_, change `active release` to point at the latest `master` with commands such as: ``` git checkout active_release @@ -111,7 +110,6 @@ Note that when reviewing the change log, rather than editing the `CHANGELOG.md`, it is preferred to update the issues and their labels (e.g. add `invalid` label to exclude them from release notes) - ## Prepare release candidate tarball (Note you need to be a committer to run these scripts as they upload to the apache svn distribution servers) @@ -135,7 +133,7 @@ Pick numbers in sequential order, with `0` for `rc1`, `1` for `rc1`, etc. ### Create, sign, and upload tarball -Run the `create-tarball.sh` with the `` tag and `` and you found in previous steps: +Run the `create-tarball.sh` with the `` tag and `` and you found in previous steps: ```shell ./dev/release/create-tarball.sh 4.1.0 2 @@ -144,12 +142,11 @@ Run the `create-tarball.sh` with the `` tag and `` and you found This script 1. creates and uploads a release candidate tarball to the [arrow -dev](https://dist.apache.org/repos/dist/dev/arrow) location on the -apache distribution svn server + dev](https://dist.apache.org/repos/dist/dev/arrow) location on the + apache distribution svn server 2. provide you an email template to -send to dev@arrow.apache.org for release voting. - + send to dev@arrow.apache.org for release voting. ### Vote on Release Candidate tarball @@ -185,7 +182,6 @@ The vote will be open for at least 72 hours. For the release to become "official" it needs at least three PMC members to vote +1 on it. - #### Verifying Release Candidates There is a script in this repository which can be used to help `dev/release/verify-release-candidate.sh` assist the verification process. 
Run it like: @@ -194,12 +190,10 @@ There is a script in this repository which can be used to help `dev/release/veri ./dev/release/verify-release-candidate.sh 4.1.0 2 ``` - #### If the release is not approved If the release is not approved, fix whatever the problem is and try again with the next RC number - ### If the release is approved, Move tarball to the release location in SVN, e.g. https://dist.apache.org/repos/dist/release/arrow/arrow-4.1.0/, using the `release-tarball.sh` script: @@ -225,7 +219,7 @@ of the [arrow crate](https://crates.io/crates/arrow). Download and unpack the official release tarball Verify that the Cargo.toml in the tarball contains the correct version -(e.g. `version = "0.11.0"`) and then publish the crate with the +(e.g. `version = "0.11.0"`) and then publish the crate with the following commands ```shell @@ -247,8 +241,6 @@ Step 3a: If CI passes, merge cherry-pick PR Step 3b: If CI doesn't pass or some other changes are needed, the PR should be reviewed / approved as normal prior to merge - - For example, to backport `b2de5446cc1e45a0559fb39039d0545df1ac0d26` to active_release use the folliwing ```shell @@ -258,6 +250,7 @@ ARROW_GITHUB_API_TOKEN=$ARROW_GITHUB_API_TOKEN CHECKOUT_ROOT=/tmp/arrow-rs CHERR ``` ## Rationale for creating PRs: + 1. PRs are a natural place to run the CI tests to make sure there are no logical conflicts 2. PRs offer a place for the original author / committers to comment and say it should/should not be backported. 3. 
PRs offer a way to make cleanups / fixups and approve (if needed) for non cherry pick PRs diff --git a/integration-testing/README.md b/integration-testing/README.md index 66248deb3468..ff365426b520 100644 --- a/integration-testing/README.md +++ b/integration-testing/README.md @@ -23,8 +23,8 @@ See [Integration.rst](../../docs/source/format/Integration.rst) for an overview This crate contains the following binaries, which are invoked by Archery during integration testing with other Arrow implementations. -| Binary | Purpose | -|--------|---------| -| arrow-file-to-stream | Converts an Arrow file to an Arrow stream | -| arrow-stream-to-file | Converts an Arrow stream to an Arrow file | -| arrow-json-integration-test | Converts between Arrow and JSON formats | +| Binary | Purpose | +| --------------------------- | ----------------------------------------- | +| arrow-file-to-stream | Converts an Arrow file to an Arrow stream | +| arrow-stream-to-file | Converts an Arrow stream to an Arrow file | +| arrow-json-integration-test | Converts between Arrow and JSON formats | diff --git a/parquet/README.md b/parquet/README.md index 326c966b8b27..3b7771235242 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -76,23 +76,23 @@ version is available. 
Then simply update version of `parquet-format` crate in Ca ## Features -- [X] All encodings supported -- [X] All compression codecs supported -- [X] Read support - - [X] Primitive column value readers - - [X] Row record reader - - [X] Arrow record reader +- [x] All encodings supported +- [x] All compression codecs supported +- [x] Read support + - [x] Primitive column value readers + - [x] Row record reader + - [x] Arrow record reader - [ ] Statistics support -- [X] Write support - - [X] Primitive column value writers +- [x] Write support + - [x] Primitive column value writers - [ ] Row record writer - - [X] Arrow record writer + - [x] Arrow record writer - [ ] Predicate pushdown -- [X] Parquet format 2.6.0 support +- [x] Parquet format 2.6.0 support ## Requirements -Parquet requires LLVM. Our windows CI image includes LLVM but to build the libraries locally windows +Parquet requires LLVM. Our windows CI image includes LLVM but to build the libraries locally windows users will have to install LLVM. Follow [this](https://github.com/appveyor/ci/issues/2651) link for info. ## Build @@ -109,18 +109,19 @@ Run `cargo test` for unit tests. To also run tests related to the binaries, use ## Binaries The following binaries are provided (use `cargo install --features cli` to install them): + - **parquet-schema** for printing Parquet file schema and metadata. -`Usage: parquet-schema `, where `file-path` is the path to a Parquet file. Use `-v/--verbose` flag -to print full metadata or schema only (when not specified only schema will be printed). + `Usage: parquet-schema `, where `file-path` is the path to a Parquet file. Use `-v/--verbose` flag + to print full metadata or schema only (when not specified only schema will be printed). - **parquet-read** for reading records from a Parquet file. 
-`Usage: parquet-read [num-records]`, where `file-path` is the path to a Parquet file, -and `num-records` is the number of records to read from a file (when not specified all records will -be printed). Use `-j/--json` to print records in JSON lines format. + `Usage: parquet-read [num-records]`, where `file-path` is the path to a Parquet file, + and `num-records` is the number of records to read from a file (when not specified all records will + be printed). Use `-j/--json` to print records in JSON lines format. - **parquet-rowcount** for reporting the number of records in one or more Parquet files. -`Usage: parquet-rowcount ...`, where `...` is a space separated list of one or more -files to read. + `Usage: parquet-rowcount ...`, where `...` is a space separated list of one or more + files to read. If you see `Library not loaded` error, please make sure `LD_LIBRARY_PATH` is set properly: