Vectorized aggregation with grouping by one fixed-size column (#7341) #1877
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Libfuzzer | |
"on": | |
schedule: | |
# run daily 1:00 on main branch | |
- cron: '0 1 * * *' | |
push: | |
branches: | |
- main | |
- prerelease_test | |
- trigger/libfuzzer | |
pull_request: | |
paths: | |
- .github/workflows/libfuzzer.yaml | |
- 'tsl/test/fuzzing/compression/**' | |
jobs: | |
build: | |
runs-on: ubuntu-22.04 | |
name: Build PostgreSQL and TimescaleDB | |
env: | |
PG_SRC_DIR: pgbuild | |
PG_INSTALL_DIR: postgresql | |
steps: | |
- name: Install Linux Dependencies | |
run: | | |
# Don't add ddebs here because the ddebs mirror is always 503 Service Unavailable. | |
# If needed, install them before opening the core dump. | |
sudo apt-get update | |
sudo apt-get install 7zip clang lld llvm flex bison libipc-run-perl \ | |
libtest-most-perl tree jq | |
- name: Checkout TimescaleDB | |
uses: actions/checkout@v4 | |
- name: Read configuration | |
id: config | |
run: python -B .github/gh_config_reader.py | |
# We are going to rebuild Postgres daily, so that it doesn't suddenly break | |
# ages after the original problem. | |
- name: Get date for build caching | |
id: get-date | |
run: | | |
echo "date=$(date +"%d")" >> $GITHUB_OUTPUT | |
# we cache the build directory instead of the install directory here | |
# because extension installation will write files to install directory | |
# leading to a tainted cache | |
- name: Cache PostgreSQL | |
id: cache-postgresql | |
uses: actions/cache@v4 | |
with: | |
path: ~/${{ env.PG_SRC_DIR }} | |
key: "postgresql-libfuzzer-${{ steps.get-date.outputs.date }}-${{ hashFiles('.github/**') }}" | |
- name: Build PostgreSQL | |
if: steps.cache-postgresql.outputs.cache-hit != 'true' | |
run: | | |
wget -q -O postgresql.tar.bz2 \ | |
https://ftp.postgresql.org/pub/source/v${{ steps.config.outputs.PG15_LATEST }}/postgresql-${{ steps.config.outputs.PG15_LATEST }}.tar.bz2 | |
mkdir -p ~/$PG_SRC_DIR | |
tar --extract --file postgresql.tar.bz2 --directory ~/$PG_SRC_DIR --strip-components 1 | |
cd ~/$PG_SRC_DIR | |
CC=clang ./configure --prefix=$HOME/$PG_INSTALL_DIR --with-openssl \ | |
--without-readline --without-zlib --without-libxml --enable-cassert \ | |
--enable-debug CC=clang \ | |
CFLAGS="-fuse-ld=lld -ggdb3 -O2 -fno-omit-frame-pointer" | |
make -j$(nproc) | |
- name: Install PostgreSQL | |
run: | | |
make -C ~/$PG_SRC_DIR install | |
make -C ~/$PG_SRC_DIR/contrib/postgres_fdw install | |
- name: Upload config.log | |
if: always() | |
uses: actions/upload-artifact@v4 | |
with: | |
name: config.log for PostgreSQL | |
path: ~/${{ env.PG_SRC_DIR }}/config.log | |
- name: Build TimescaleDB | |
run: | | |
set -e | |
export LIBFUZZER_PATH=$(dirname "$(find $(llvm-config --libdir) -name libclang_rt.fuzzer_no_main-x86_64.a | head -1)") | |
# Some pointers for the next time we have linking/undefined symbol problems: | |
# http://web.archive.org/web/20200926071757/https://github.com/google/sanitizers/issues/111 | |
# http://web.archive.org/web/20231101091231/https://github.com/cms-sw/cmssw/issues/40680 | |
cmake -B build -S . -DASSERTIONS=ON -DLINTER=OFF -DCMAKE_VERBOSE_MAKEFILE=1 \ | |
-DWARNINGS_AS_ERRORS=1 -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=clang \ | |
-DCMAKE_C_FLAGS="-fsanitize=fuzzer-no-link -lstdc++ -L$LIBFUZZER_PATH -l:libclang_rt.fuzzer_no_main-x86_64.a -static-libsan" \ | |
-DCOMPRESSION_FUZZING=1 -DPG_PATH=$HOME/$PG_INSTALL_DIR | |
make -C build -j$(nproc) install | |
# Incredibly, the upload-artifact action can't preserve executable permissions: | |
# https://github.com/actions/upload-artifact/issues/38 | |
# It's also extremely slow. | |
- name: Compress the installation directory | |
run: 7z a install.7z $HOME/$PG_INSTALL_DIR | |
- name: Save the installation directory | |
uses: actions/upload-artifact@v4 | |
with: | |
name: fuzzing-install-dir | |
path: install.7z | |
if-no-files-found: error | |
retention-days: 1 | |
fuzz: | |
needs: build | |
strategy: | |
fail-fast: false | |
matrix: | |
case: [ | |
{ algo: gorilla , pgtype: float8, bulk: false, runs: 500000000 }, | |
{ algo: deltadelta, pgtype: int8 , bulk: false, runs: 500000000 }, | |
{ algo: gorilla , pgtype: float8, bulk: true , runs: 1000000000 }, | |
{ algo: deltadelta, pgtype: int8 , bulk: true , runs: 1000000000 }, | |
# array has a peculiar recv function that recompresses all input, so | |
# fuzzing it is much slower. The dictionary recv also uses it. | |
{ algo: array , pgtype: text , bulk: false, runs: 10000000 }, | |
{ algo: array , pgtype: text , bulk: true , runs: 10000000 }, | |
{ algo: dictionary, pgtype: text , bulk: false, runs: 100000000 }, | |
{ algo: dictionary, pgtype: text , bulk: true , runs: 100000000 }, | |
] | |
name: Fuzz decompression ${{ matrix.case.algo }} ${{ matrix.case.pgtype }} ${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }} | |
runs-on: ubuntu-22.04 | |
env: | |
PG_SRC_DIR: pgbuild | |
PG_INSTALL_DIR: postgresql | |
JOB_NAME: Fuzz decompression ${{ matrix.case.algo }} ${{ matrix.case.pgtype }} ${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }} | |
steps: | |
- name: Install Linux dependencies | |
run: | | |
sudo apt update | |
sudo apt install 7zip systemd-coredump gdb | |
- name: Checkout TimescaleDB | |
uses: actions/checkout@v4 | |
- name: Download the installation directory | |
uses: actions/download-artifact@v4 | |
with: | |
name: fuzzing-install-dir | |
- name: Unpack the installation directory | |
run: 7z x -o$HOME install.7z | |
- name: initdb | |
run: | | |
# Have to do this before initializing the corpus, or initdb will complain. | |
set -xeu | |
export PGDATA=db | |
export PGPORT=5432 | |
export PGDATABASE=postgres | |
export PATH=$HOME/$PG_INSTALL_DIR/bin:$PATH | |
initdb | |
echo "shared_preload_libraries = 'timescaledb'" >> $PGDATA/postgresql.conf | |
- name: Set configuration | |
id: config | |
run: | | |
set -x | |
echo "cache_prefix=${{ format('libfuzzer-corpus-2-{0}-{1}', matrix.case.algo, matrix.case.pgtype) }}" >> $GITHUB_OUTPUT | |
echo "name=${{ matrix.case.algo }} ${{ matrix.case.pgtype }} ${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}" >> $GITHUB_OUTPUT | |
- name: Restore the cached fuzzing corpus (bulk) | |
id: restore-corpus-cache-bulk | |
uses: actions/cache/restore@v4 | |
with: | |
path: db/corpus-bulk | |
key: "${{ steps.config.outputs.cache_prefix }}-bulk" | |
# We save the row-by-row corpus separately from the bulk corpus, so that | |
# they don't overwrite each other. Now we are going to combine them. | |
- name: Restore the cached fuzzing corpus (rowbyrow) | |
id: restore-corpus-cache-rowbyrow | |
uses: actions/cache/restore@v4 | |
with: | |
path: db/corpus-rowbyrow | |
key: "${{ steps.config.outputs.cache_prefix }}-rowbyrow" | |
- name: Initialize the fuzzing corpus | |
run: | | |
# Combine the cached corpus from rowbyrow and bulk fuzzing, and from repository. | |
mkdir -p db/corpus{,-rowbyrow,-bulk} | |
find "tsl/test/fuzzing/compression/${{ matrix.case.algo }}-${{ matrix.case.pgtype }}" -type f -exec cp -n -t db/corpus {} + | |
find "db/corpus-rowbyrow" -type f -exec cp -n -t db/corpus {} + | |
find "db/corpus-bulk" -type f -exec cp -n -t db/corpus {} + | |
ls db/corpus | wc -l | |
- name: Run libfuzzer for compression | |
run: | | |
set -xeu | |
export PGDATA=db | |
export PGPORT=5432 | |
export PGDATABASE=postgres | |
export PATH=$HOME/$PG_INSTALL_DIR/bin:$PATH | |
pg_ctl -o "-clogging_collector=true" -o "-clog_destination=jsonlog,stderr" \ | |
-o "-clog_directory=$(readlink -f .)" -o "-clog_filename=postmaster.log" \ | |
-o "-clog_error_verbosity=verbose" start | |
psql -c "create extension timescaledb;" | |
# Create the fuzzing functions | |
export MODULE_NAME=$(basename $(find $HOME/$PG_INSTALL_DIR -name "timescaledb-tsl-*.so")) | |
psql -a -c "create or replace function fuzz(algo cstring, pgtype regtype, | |
bulk bool, runs int) | |
returns int as '"$MODULE_NAME"', 'ts_fuzz_compression' language c; | |
create or replace function ts_read_compressed_data_directory(algo cstring, | |
pgtype regtype, path cstring, bulk bool) | |
returns table(path text, bytes int, rows int, sqlstate text, location text) | |
as '"$MODULE_NAME"', 'ts_read_compressed_data_directory' language c; | |
" | |
# Start more fuzzing processes in the background. We won't even monitor | |
# their progress, because the server will panic if they find an error. | |
for x in {2..$(nproc)} | |
do | |
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}', | |
'${{ matrix.case.pgtype }}', '${{ matrix.case.bulk }}', ${{ matrix.case.runs }});" & | |
done | |
# Start the one fuzzing process that we will monitor, in foreground. | |
# The LLVM fuzzing driver calls exit(), so we expect to lose the connection. | |
ret=0 | |
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}', | |
'${{ matrix.case.pgtype }}', '${{ matrix.case.bulk }}', ${{ matrix.case.runs }});" || ret=$? | |
if ! [ $ret -eq 2 ] | |
then | |
>&2 echo "Unexpected psql exit code $ret" | |
exit 1 | |
fi | |
ls db/corpus | wc -l | |
fn="ts_read_compressed_data_directory('${{ matrix.case.algo }}', | |
'${{ matrix.case.pgtype }}', 'corpus', '${{ matrix.case.bulk }}')" | |
# Show the statistics about fuzzing corpus | |
psql -c "select count(*), location, min(sqlstate), min(path) | |
from $fn | |
group by location order by count(*) desc | |
" | |
# Save interesting cases because the caches are not available for download from UI | |
mkdir -p interesting | |
psql -qtAX -c "select distinct on (location) 'db/' || path from $fn | |
order by location, bytes, path | |
" | xargs cp -t interesting | |
# Check that we don't have any internal errors | |
errors=$(psql -qtAX --set=ON_ERROR_STOP=1 -c "select count(*) | |
from $fn | |
where sqlstate = 'XX000'") | |
echo "Internal program errors: $errors" | |
[ $errors -eq 0 ] || exit 1 | |
# Shouldn't have any WARNINGS in the log. | |
! grep -F "] WARNING: " postmaster.log | |
# Check that the server is still alive. | |
psql -c "select 1" | |
- name: Collect the logs | |
if: always() | |
id: collectlogs | |
run: | | |
# wait in case there are in-progress coredumps | |
sleep 10 | |
if coredumpctl -q list >/dev/null; then echo "coredumps=true" >>$GITHUB_OUTPUT; fi | |
# print OOM killer information | |
sudo journalctl --system -q --facility=kern --grep "Killed process" || true | |
- name: Save PostgreSQL log | |
if: always() | |
uses: actions/upload-artifact@v4 | |
with: | |
name: PostgreSQL log for ${{ steps.config.outputs.name }} | |
path: postgres.* | |
- name: Save fuzzer-generated crash cases | |
if: always() | |
uses: actions/upload-artifact@v4 | |
with: | |
name: Crash cases for ${{ steps.config.outputs.name }} | |
path: db/crash-* | |
if-no-files-found: ignore | |
- name: Save interesting cases | |
if: always() | |
uses: actions/upload-artifact@v4 | |
with: | |
name: Interesting cases for ${{ steps.config.outputs.name }} | |
path: interesting/ | |
# We use separate restore/save actions, because the default action won't | |
# save the updated folder after the cache hit. We also want to save the | |
# cache after fuzzing errors, and the default action doesn't save after | |
# errors. | |
# We can't overwrite the existing cache, so we add a unique suffix. The | |
# cache is matched by key prefix, not exact key, and picks the newest | |
# matching item, so this works. | |
# The caches for rowbyrow and bulk fuzzing are saved separately, otherwise | |
# the slower job would always overwrite the cache from the faster one. We | |
# want to combine corpuses from bulk and rowbyrow fuzzing for better | |
# coverage. | |
# Note that the cache action cannot be restored on a path different from the | |
# one it was saved from. To make our lives more interesting, it is not | |
# directly documented anywhere, but we can deduce it from path influencing | |
# the version. | |
- name: Change corpus path to please the 'actions/cache' GitHub Action | |
if: always() | |
run: | | |
rm -rf db/corpus-{bulk,rowbyrow} ||: | |
mv -fT db/corpus{,-${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}} | |
- name: Save fuzzer corpus | |
if: always() | |
uses: actions/cache/save@v4 | |
with: | |
path: db/corpus-${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }} | |
key: "${{ format('{0}-{1}-{2}-{3}', | |
steps.config.outputs.cache_prefix, | |
matrix.case.bulk && 'bulk' || 'rowbyrow', | |
github.run_id, github.run_attempt) }}" | |
- name: Stack trace | |
if: always() && steps.collectlogs.outputs.coredumps == 'true' | |
run: | | |
sudo coredumpctl gdb <<<" | |
set verbose on | |
set trace-commands on | |
show debug-file-directory | |
printf "'"'"query = '%s'\n\n"'"'", debug_query_string | |
frame function ExceptionalCondition | |
printf "'"'"condition = '%s'\n"'"'", conditionName | |
up 1 | |
l | |
info args | |
info locals | |
bt full | |
" 2>&1 | tee stacktrace.log | |
./scripts/bundle_coredumps.sh | |
exit 1 # Fail the job if we have core dumps. | |
- name: Upload core dumps | |
if: always() && steps.collectlogs.outputs.coredumps == 'true' | |
uses: actions/upload-artifact@v4 | |
with: | |
name: Coredumps for ${{ steps.config.outputs.name }} | |
path: coredumps | |
- name: Upload test results to the database | |
if: always() | |
env: | |
CI_STATS_DB: ${{ secrets.CI_STATS_DB }} | |
GITHUB_EVENT_NAME: ${{ github.event_name }} | |
GITHUB_REF_NAME: ${{ github.ref_name }} | |
GITHUB_REPOSITORY: ${{ github.repository }} | |
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} | |
GITHUB_RUN_ID: ${{ github.run_id }} | |
GITHUB_RUN_NUMBER: ${{ github.run_number }} | |
JOB_STATUS: ${{ job.status }} | |
run: | | |
if [[ "${{ github.event_name }}" == "pull_request" ]] ; | |
then | |
GITHUB_PR_NUMBER="${{ github.event.number }}" | |
else | |
GITHUB_PR_NUMBER=0 | |
fi | |
export GITHUB_PR_NUMBER | |
scripts/upload_ci_stats.sh |