Skip to content

Vectorized aggregation with grouping by one fixed-size column (#7341) #1877

Vectorized aggregation with grouping by one fixed-size column (#7341)

Vectorized aggregation with grouping by one fixed-size column (#7341) #1877

Workflow file for this run

name: Libfuzzer
"on":
schedule:
# run daily 1:00 on main branch
- cron: '0 1 * * *'
push:
branches:
- main
- prerelease_test
- trigger/libfuzzer
pull_request:
paths:
- .github/workflows/libfuzzer.yaml
- 'tsl/test/fuzzing/compression/**'
jobs:
build:
runs-on: ubuntu-22.04
name: Build PostgreSQL and TimescaleDB
env:
PG_SRC_DIR: pgbuild
PG_INSTALL_DIR: postgresql
steps:
- name: Install Linux Dependencies
run: |
# Don't add ddebs here because the ddebs mirror is always 503 Service Unavailable.
# If needed, install them before opening the core dump.
sudo apt-get update
sudo apt-get install 7zip clang lld llvm flex bison libipc-run-perl \
libtest-most-perl tree jq
- name: Checkout TimescaleDB
uses: actions/checkout@v4
- name: Read configuration
id: config
run: python -B .github/gh_config_reader.py
# We are going to rebuild Postgres daily, so that it doesn't suddenly break
# ages after the original problem.
- name: Get date for build caching
id: get-date
run: |
echo "date=$(date +"%d")" >> $GITHUB_OUTPUT
# we cache the build directory instead of the install directory here
# because extension installation will write files to install directory
# leading to a tainted cache
- name: Cache PostgreSQL
id: cache-postgresql
uses: actions/cache@v4
with:
path: ~/${{ env.PG_SRC_DIR }}
key: "postgresql-libfuzzer-${{ steps.get-date.outputs.date }}-${{ hashFiles('.github/**') }}"
- name: Build PostgreSQL
if: steps.cache-postgresql.outputs.cache-hit != 'true'
run: |
wget -q -O postgresql.tar.bz2 \
https://ftp.postgresql.org/pub/source/v${{ steps.config.outputs.PG15_LATEST }}/postgresql-${{ steps.config.outputs.PG15_LATEST }}.tar.bz2
mkdir -p ~/$PG_SRC_DIR
tar --extract --file postgresql.tar.bz2 --directory ~/$PG_SRC_DIR --strip-components 1
cd ~/$PG_SRC_DIR
CC=clang ./configure --prefix=$HOME/$PG_INSTALL_DIR --with-openssl \
--without-readline --without-zlib --without-libxml --enable-cassert \
--enable-debug CC=clang \
CFLAGS="-fuse-ld=lld -ggdb3 -O2 -fno-omit-frame-pointer"
make -j$(nproc)
- name: Install PostgreSQL
run: |
make -C ~/$PG_SRC_DIR install
make -C ~/$PG_SRC_DIR/contrib/postgres_fdw install
- name: Upload config.log
if: always()
uses: actions/upload-artifact@v4
with:
name: config.log for PostgreSQL
path: ~/${{ env.PG_SRC_DIR }}/config.log
- name: Build TimescaleDB
run: |
set -e
export LIBFUZZER_PATH=$(dirname "$(find $(llvm-config --libdir) -name libclang_rt.fuzzer_no_main-x86_64.a | head -1)")
# Some pointers for the next time we have linking/undefined symbol problems:
# http://web.archive.org/web/20200926071757/https://github.com/google/sanitizers/issues/111
# http://web.archive.org/web/20231101091231/https://github.com/cms-sw/cmssw/issues/40680
cmake -B build -S . -DASSERTIONS=ON -DLINTER=OFF -DCMAKE_VERBOSE_MAKEFILE=1 \
-DWARNINGS_AS_ERRORS=1 -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=clang \
-DCMAKE_C_FLAGS="-fsanitize=fuzzer-no-link -lstdc++ -L$LIBFUZZER_PATH -l:libclang_rt.fuzzer_no_main-x86_64.a -static-libsan" \
-DCOMPRESSION_FUZZING=1 -DPG_PATH=$HOME/$PG_INSTALL_DIR
make -C build -j$(nproc) install
# Incredibly, the upload-artifact action can't preserve executable permissions:
# https://github.com/actions/upload-artifact/issues/38
# It's also extremely slow.
- name: Compress the installation directory
run: 7z a install.7z $HOME/$PG_INSTALL_DIR
- name: Save the installation directory
uses: actions/upload-artifact@v4
with:
name: fuzzing-install-dir
path: install.7z
if-no-files-found: error
retention-days: 1
fuzz:
needs: build
strategy:
fail-fast: false
matrix:
case: [
{ algo: gorilla , pgtype: float8, bulk: false, runs: 500000000 },
{ algo: deltadelta, pgtype: int8 , bulk: false, runs: 500000000 },
{ algo: gorilla , pgtype: float8, bulk: true , runs: 1000000000 },
{ algo: deltadelta, pgtype: int8 , bulk: true , runs: 1000000000 },
# array has a peculiar recv function that recompresses all input, so
# fuzzing it is much slower. The dictionary recv also uses it.
{ algo: array , pgtype: text , bulk: false, runs: 10000000 },
{ algo: array , pgtype: text , bulk: true , runs: 10000000 },
{ algo: dictionary, pgtype: text , bulk: false, runs: 100000000 },
{ algo: dictionary, pgtype: text , bulk: true , runs: 100000000 },
]
name: Fuzz decompression ${{ matrix.case.algo }} ${{ matrix.case.pgtype }} ${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}
runs-on: ubuntu-22.04
env:
PG_SRC_DIR: pgbuild
PG_INSTALL_DIR: postgresql
JOB_NAME: Fuzz decompression ${{ matrix.case.algo }} ${{ matrix.case.pgtype }} ${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}
steps:
- name: Install Linux dependencies
run: |
sudo apt update
sudo apt install 7zip systemd-coredump gdb
- name: Checkout TimescaleDB
uses: actions/checkout@v4
- name: Download the installation directory
uses: actions/download-artifact@v4
with:
name: fuzzing-install-dir
- name: Unpack the installation directory
run: 7z x -o$HOME install.7z
- name: initdb
run: |
# Have to do this before initializing the corpus, or initdb will complain.
set -xeu
export PGDATA=db
export PGPORT=5432
export PGDATABASE=postgres
export PATH=$HOME/$PG_INSTALL_DIR/bin:$PATH
initdb
echo "shared_preload_libraries = 'timescaledb'" >> $PGDATA/postgresql.conf
- name: Set configuration
id: config
run: |
set -x
echo "cache_prefix=${{ format('libfuzzer-corpus-2-{0}-{1}', matrix.case.algo, matrix.case.pgtype) }}" >> $GITHUB_OUTPUT
echo "name=${{ matrix.case.algo }} ${{ matrix.case.pgtype }} ${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}" >> $GITHUB_OUTPUT
- name: Restore the cached fuzzing corpus (bulk)
id: restore-corpus-cache-bulk
uses: actions/cache/restore@v4
with:
path: db/corpus-bulk
key: "${{ steps.config.outputs.cache_prefix }}-bulk"
# We save the row-by-row corpus separately from the bulk corpus, so that
# they don't overwrite each other. Now we are going to combine them.
- name: Restore the cached fuzzing corpus (rowbyrow)
id: restore-corpus-cache-rowbyrow
uses: actions/cache/restore@v4
with:
path: db/corpus-rowbyrow
key: "${{ steps.config.outputs.cache_prefix }}-rowbyrow"
- name: Initialize the fuzzing corpus
run: |
# Combine the cached corpus from rowbyrow and bulk fuzzing, and from repository.
mkdir -p db/corpus{,-rowbyrow,-bulk}
find "tsl/test/fuzzing/compression/${{ matrix.case.algo }}-${{ matrix.case.pgtype }}" -type f -exec cp -n -t db/corpus {} +
find "db/corpus-rowbyrow" -type f -exec cp -n -t db/corpus {} +
find "db/corpus-bulk" -type f -exec cp -n -t db/corpus {} +
ls db/corpus | wc -l
- name: Run libfuzzer for compression
run: |
set -xeu
export PGDATA=db
export PGPORT=5432
export PGDATABASE=postgres
export PATH=$HOME/$PG_INSTALL_DIR/bin:$PATH
pg_ctl -o "-clogging_collector=true" -o "-clog_destination=jsonlog,stderr" \
-o "-clog_directory=$(readlink -f .)" -o "-clog_filename=postmaster.log" \
-o "-clog_error_verbosity=verbose" start
psql -c "create extension timescaledb;"
# Create the fuzzing functions
export MODULE_NAME=$(basename $(find $HOME/$PG_INSTALL_DIR -name "timescaledb-tsl-*.so"))
psql -a -c "create or replace function fuzz(algo cstring, pgtype regtype,
bulk bool, runs int)
returns int as '"$MODULE_NAME"', 'ts_fuzz_compression' language c;
create or replace function ts_read_compressed_data_directory(algo cstring,
pgtype regtype, path cstring, bulk bool)
returns table(path text, bytes int, rows int, sqlstate text, location text)
as '"$MODULE_NAME"', 'ts_read_compressed_data_directory' language c;
"
# Start more fuzzing processes in the background. We won't even monitor
# their progress, because the server will panic if they find an error.
for x in {2..$(nproc)}
do
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}',
'${{ matrix.case.pgtype }}', '${{ matrix.case.bulk }}', ${{ matrix.case.runs }});" &
done
# Start the one fuzzing process that we will monitor, in foreground.
# The LLVM fuzzing driver calls exit(), so we expect to lose the connection.
ret=0
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}',
'${{ matrix.case.pgtype }}', '${{ matrix.case.bulk }}', ${{ matrix.case.runs }});" || ret=$?
if ! [ $ret -eq 2 ]
then
>&2 echo "Unexpected psql exit code $ret"
exit 1
fi
ls db/corpus | wc -l
fn="ts_read_compressed_data_directory('${{ matrix.case.algo }}',
'${{ matrix.case.pgtype }}', 'corpus', '${{ matrix.case.bulk }}')"
# Show the statistics about fuzzing corpus
psql -c "select count(*), location, min(sqlstate), min(path)
from $fn
group by location order by count(*) desc
"
# Save interesting cases because the caches are not available for download from UI
mkdir -p interesting
psql -qtAX -c "select distinct on (location) 'db/' || path from $fn
order by location, bytes, path
" | xargs cp -t interesting
# Check that we don't have any internal errors
errors=$(psql -qtAX --set=ON_ERROR_STOP=1 -c "select count(*)
from $fn
where sqlstate = 'XX000'")
echo "Internal program errors: $errors"
[ $errors -eq 0 ] || exit 1
# Shouldn't have any WARNINGS in the log.
! grep -F "] WARNING: " postmaster.log
# Check that the server is still alive.
psql -c "select 1"
- name: Collect the logs
if: always()
id: collectlogs
run: |
# wait in case there are in-progress coredumps
sleep 10
if coredumpctl -q list >/dev/null; then echo "coredumps=true" >>$GITHUB_OUTPUT; fi
# print OOM killer information
sudo journalctl --system -q --facility=kern --grep "Killed process" || true
- name: Save PostgreSQL log
if: always()
uses: actions/upload-artifact@v4
with:
name: PostgreSQL log for ${{ steps.config.outputs.name }}
path: postgres.*
- name: Save fuzzer-generated crash cases
if: always()
uses: actions/upload-artifact@v4
with:
name: Crash cases for ${{ steps.config.outputs.name }}
path: db/crash-*
if-no-files-found: ignore
- name: Save interesting cases
if: always()
uses: actions/upload-artifact@v4
with:
name: Interesting cases for ${{ steps.config.outputs.name }}
path: interesting/
# We use separate restore/save actions, because the default action won't
# save the updated folder after the cache hit. We also want to save the
# cache after fuzzing errors, and the default action doesn't save after
# errors.
# We can't overwrite the existing cache, so we add a unique suffix. The
# cache is matched by key prefix, not exact key, and picks the newest
# matching item, so this works.
# The caches for rowbyrow and bulk fuzzing are saved separately, otherwise
# the slower job would always overwrite the cache from the faster one. We
# want to combine corpuses from bulk and rowbyrow fuzzing for better
# coverage.
# Note that the cache action cannot be restored on a path different from the
# one it was saved from. To make our lives more interesting, it is not
# directly documented anywhere, but we can deduce it from path influencing
# the version.
- name: Change corpus path to please the 'actions/cache' GitHub Action
if: always()
run: |
rm -rf db/corpus-{bulk,rowbyrow} ||:
mv -fT db/corpus{,-${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}}
- name: Save fuzzer corpus
if: always()
uses: actions/cache/save@v4
with:
path: db/corpus-${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}
key: "${{ format('{0}-{1}-{2}-{3}',
steps.config.outputs.cache_prefix,
matrix.case.bulk && 'bulk' || 'rowbyrow',
github.run_id, github.run_attempt) }}"
- name: Stack trace
if: always() && steps.collectlogs.outputs.coredumps == 'true'
run: |
sudo coredumpctl gdb <<<"
set verbose on
set trace-commands on
show debug-file-directory
printf "'"'"query = '%s'\n\n"'"'", debug_query_string
frame function ExceptionalCondition
printf "'"'"condition = '%s'\n"'"'", conditionName
up 1
l
info args
info locals
bt full
" 2>&1 | tee stacktrace.log
./scripts/bundle_coredumps.sh
exit 1 # Fail the job if we have core dumps.
- name: Upload core dumps
if: always() && steps.collectlogs.outputs.coredumps == 'true'
uses: actions/upload-artifact@v4
with:
name: Coredumps for ${{ steps.config.outputs.name }}
path: coredumps
- name: Upload test results to the database
if: always()
env:
CI_STATS_DB: ${{ secrets.CI_STATS_DB }}
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_REF_NAME: ${{ github.ref_name }}
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
JOB_STATUS: ${{ job.status }}
run: |
if [[ "${{ github.event_name }}" == "pull_request" ]] ;
then
GITHUB_PR_NUMBER="${{ github.event.number }}"
else
GITHUB_PR_NUMBER=0
fi
export GITHUB_PR_NUMBER
scripts/upload_ci_stats.sh