From 7d498f68dc548ed824e6a4da676eef9a48b721b2 Mon Sep 17 00:00:00 2001 From: Gilang Ramadhan Ilhami Date: Tue, 17 May 2022 22:38:18 +0700 Subject: [PATCH] SKLearn Pipeline use LaplaceDistribution Class from Google DP (#408) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added machine learning and mechanisms directory * Added files for utils Files include: - accountant.py (for BudgetAccountant purposes) - utils.py - validation.py * Directory change for machine learning & mechanism Anything related to machine learning and mechanism is located in the "ml" directory * Added function to check epsilon delta in validation.py * Added function to check min max bounds in validation.py * Added function to clip 2-d array to given maximum norm in validation.py * Added function to clip 2-d array to given bounds in validation.py * Added function to set global seed in utils.py * Added function to copy docstring in ml/util/utils.py * Added subclass Budget for privacy budgets epsilon delta * Add class BudgetError for exceeded privacy budget error * Added class for privacy leakage warning * Added class for incompatible arguments PyDP * Added function for warning incompatible argument * Added class for budget accountant The main goal of the added class is to allocate privacy budget * Added file as base for mechanism and add base class * Added methods for base class * Added class for truncated and folded mechanism * Added files for laplace & geometric mechanism * Added class as main laplace mechanism * Added class for truncated laplacian * Added class for folded laplacian * Added class for main geometric mechanism * Added class for truncated geometric * Added file for naive bayes model with model class * FIX: Move ml directory to src/pydp directory * FEAT: Added LaplaceBoundedDomain class in laplace mechanism file * Added folder that contains machine learning capabilities * Apply Python and C++ styling * Added type:ignored on unresolved imports * 
Python formatting with Black after adding type:ignore * Fix mypy test errors in ml directory * Fix apply Python format with Black * Add source link to ml directory files * Removed extra special characters * Added example notebook for Naive Bayes implementation * Added Jupyter Notebook for PyDP NB comparison * Moved matplotlib as dev dependency * change setup msbuild to version 1.0.2 * Test remove python 3.5 in Windows workflow * Add back python 3.5 for Windows build * Remove python-dp in docs/requirements.txt * Change version to check build doc action * conf.py in /docs append pydp build path * Change ubuntu version for build docs action * Change Python to 3.6 in build docs action * Revert back ubuntu and Python versions * Add pre-build-command in sphinx-action@master * Fix pre-build-command syntax * Change pre-build-command for sphinx-action * pre-build-command install software-properties-common * Fix pre-build-command * Fix pre-build-command * Fix pre-build-command * Change ubuntu version for build doc action * Change Python Version to 3.8 * Removed test limited to dev branch. This test would run on all Pull requests now rather than just PR to dev. Other than that, tests are to be triggered if there's a change in ipynb too. 
* Wrapper for numerical mechanism class [Continuation from #372] (#380) * initial addition of partition selection * corrected instantiation to 'builder.Build' instead * build works, TODO: deal with passing Laplace/Gaussian mechanisms builders * post-review changes, moved partition selection python related code to own submodule * fixed some formatting * Added documentation * post formatting * added tests for partition selection * moved imports for partition_selection * clarified TODO dependency * attempt to resolve linting issues * * Added exports in algorithms.partition_selection * Replaced the Create*PartitionStrategy functions with a template function and instantiations * revert a to latest stable commit * restore the correct commit for google-dp submodule * added python bindings for numerical mechanisms * added some python files * fixed prereqs_linux.sh script * saving changes * build + tests work * added docs for numerical mechanisms * Fix Bazel build Bazel failed to build since the name of the workspace was inconsistent with the DP Library. This commit also fixes the order in which we install dependencies of the DP Lib. Additionally, on Linux we used the wrong compiler flags, which is now fixed in the .bazelrc. 
* Revert "Fix Bazel build" * Update README.md (#385) * Update README.md Changed "Currently supports Linux and macOS (Windows support coming soon)" to "Compatible with all three types of Operating Systems - Linux, macOS, and Windows" * Update README.md Changed "Currently supports Linux and macOS (Windows support coming soon)" to "Compatible with all three types of Operating Systems - Linux, macOS, and Windows" * sha upgrade and added temporary fix for bazel reference * upgraded SHA * removal of functions and dependency in accordance with SHA upgrade * fixed build file reference issue for test cases * build file reference fix * Made the system release ready * Fixed the ubuntu version for Google Colab * fixed the BUILD file naming * c++ styling * fixed the RC version * Attempt to trigger the find replace in BUILD file * added find replace in version scripts * updated the release number to 1.1.1 * Fixed the bazel build absolute reference * removed the temporary fix in the build actions * fix * fix * Removing unused code (#400) * clean-ups * tiny fix * format * format c++ * Add support for categorical features for Laplace mechanism * Clean-ups (#401) Removing unused code and using c++ style guidelines * Improve Laplace Demo Notebook and add minimal README (#404) * Improve Laplace Demo NB and add minimal README * Improve the Conclusion Section * Add back the old conclusion * Removed partial privacy budget + Update Google C++ DP library to the latest commit and fix compilation errors (#405) * update * update * remove unintended changes * tests * FIX: Change Google DP Commit Hash * FEAT: SKLearn Laplace Mechanism use Google DP Modify the `LaplaceMechanism` class in `/src/pydp/ml/mechanisms/sklearn_pipeline.py` so that it uses the `LaplaceDistribution` class from Google DP * Fix typo in build-docs.yml * FIX: Styling python and cpp * FIX: Styling python and cpp * FIX: Typo in build-docs.yml * FIX: Changed clang-format version * FIX: Try clang-format 12 * Update to the latest 
version of Google building block library (#415) * remove privacy budget * fixes * fixes * tests * FIX: Change clang format action version * Reset HEAD Co-authored-by: Chinmay Shah Co-authored-by: Lev Zlotnik <46742999+levzlotnik@users.noreply.github.com> Co-authored-by: Christoph Dibak Co-authored-by: Abin (אבין ברגיס) <36173893+Spartan-119@users.noreply.github.com> Co-authored-by: FIRhinmay Co-authored-by: Vadym Doroshenko Co-authored-by: dvadym <53558779+dvadym@users.noreply.github.com> Co-authored-by: Saurav Maheshkar --- .github/workflows/build-docs.yml | 7 +- .github/workflows/publish.yml | 330 ++++++++-------- .github/workflows/tests.yml | 373 +++++++++--------- .github/workflows/versions.yml | 2 +- Dockerfile | 160 ++++---- README.md | 3 +- WORKSPACE | 18 +- docs/conf.py | 2 + docs/pydp.rst | 13 +- docs/requirements.txt | 1 - .../SKLearn_Pipeline_Laplace_Mechanism.ipynb | 111 +++++- examples/laplace_demo/README.md | 3 + examples/laplace_demo/laplace.ipynb | 122 +++--- prereqs_linux.sh | 12 +- prereqs_mac.sh | 125 +++--- setup.cfg | 2 +- setup.py | 2 +- src/bindings/BUILD | 62 +-- .../PyDP/algorithms/bounded_functions.cpp | 3 - src/bindings/PyDP/algorithms/count.cpp | 2 - .../PyDP/algorithms/distributions.cpp | 1 - .../PyDP/algorithms/order_statistics.cpp | 2 - src/bindings/PyDP/algorithms/rand.cpp | 1 - src/bindings/PyDP/algorithms/util.cpp | 79 ++-- src/bindings/PyDP/base/logging.cpp | 3 +- src/bindings/PyDP/base/status.cpp | 225 ++++++----- src/bindings/PyDP/bindings.cpp | 6 + src/bindings/PyDP/mechanisms/mechanism.cpp | 154 +++++++- src/bindings/PyDP/proto/proto.cpp | 12 +- .../PyDP/pydp_lib/algorithm_builder.hpp | 17 +- src/bindings/PyDP/pydp_lib/casting.hpp | 13 - src/bindings/PyDP/pydp_lib/helper_class.hpp | 46 --- src/pydp/__init__.py | 2 +- src/pydp/algorithms/__init__.py | 3 +- src/pydp/algorithms/_algorithm.py | 22 +- src/pydp/algorithms/numerical_mechanisms.py | 6 + src/pydp/ml/mechanisms/sklearn_pipeline.py | 193 ++++++--- 
tests/algorithms/conftest.py | 60 --- .../test_bounded_mean_int64_data.bin | 2 - tests/algorithms/test_count.py | 3 - tests/algorithms/test_numerical_mechanisms.py | 84 ++++ tests/algorithms/test_order_statistics.py | 13 +- tests/algorithms/test_partition_selection.py | 1 - third_party/differential-privacy | 2 +- 44 files changed, 1279 insertions(+), 1024 deletions(-) create mode 100644 examples/laplace_demo/README.md delete mode 100644 src/bindings/PyDP/pydp_lib/casting.hpp delete mode 100644 src/bindings/PyDP/pydp_lib/helper_class.hpp create mode 100644 src/pydp/algorithms/numerical_mechanisms.py delete mode 100644 tests/algorithms/conftest.py delete mode 100644 tests/algorithms/test_bounded_mean/test_bounded_mean_int64_data.bin create mode 100644 tests/algorithms/test_numerical_mechanisms.py diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index 2d197a7e..69765484 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -11,8 +11,8 @@ jobs: fail-fast: false max-parallel: 1 matrix: - os: [ubuntu-latest] - python-version: [3.9] + os: [ubuntu-18.04] + python-version: [3.8] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -91,6 +91,7 @@ jobs: - uses: ammaraskar/sphinx-action@master with: docs-folder: "docs/" + # pre-build-command: "apt-get -y update && apt-get -y install gcc" - name: Commit documentation changes run: | @@ -109,4 +110,4 @@ jobs: branch: gh-pages directory: gh-pages force: true - github_token: ${{ secrets.GITHUB_TOKEN }} + github_token: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 4f35c915..0dcc8708 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,165 +1,165 @@ -name: Publish PyDP - -on: - release: - types: [published] - -jobs: - deploy: - strategy: - fail-fast: false - max-parallel: 12 - matrix: - os: [ubuntu-latest, macos-latest, windows-latest] - 
python-version: [3.6, 3.7, 3.8, 3.9] - runs-on: ${{ matrix.os }} - - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup msbuild Windows - if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v1.0.2 - - - name: Fix Paths Windows - # Make sure that tar.exe from Git is used not from Windows - if: runner.os == 'Windows' - run: | - @("C:\Program Files\Git\usr\bin") + (Get-Content $env:GITHUB_PATH) | Set-Content $env:GITHUB_PATH -Encoding utf8 - - - name: Cache Bazel Unix - # Not working on Windows: https://github.com/actions/cache/issues/576 - if: runner.os != 'Windows' - uses: actions/cache@v2.1.5 - with: - path: | - ./bazel-PyDP - ./bazel-bin - ./bazel-cache - ./bazel-out - key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} - - - name: Update environment variables Windows - if: runner.os == 'Windows' - # See: - # - On Bazel cache: https://docs.bazel.build/versions/master/output_directories.html - run: | - echo "BAZEL_CACHE_DIR=$env:USERPROFILE\_bazel_$env:USERNAME" >> $env:GITHUB_ENV - - - name: Cache Bazel Windows - if: runner.os == 'Windows' - # https://stackoverflow.com/questions/66870002/github-actions-cache-maven-m2-repository-on-windows-environment-c-users-run - uses: actions/cache@v2.1.5 - with: - path: | - ./bazel-cache - key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} - - - name: Build Google DP Unix - if: runner.os != 'Windows' - timeout-minutes: 20 - run: | - PYTHONHOME=$(which python) - PYTHONPATH=$(python -c "import sys; print([x for x in sys.path if 'site-packages' in x][0]);") - BAZEL_CONFIG_OS=$(python -c "print('${{ matrix.os }}'.split('-')[0].lower().replace('ubuntu', 'linux'))") - echo "Running: ${{ matrix.os }}" - echo "Using BAZEL_CONFIG_OS: $BAZEL_CONFIG_OS" - bazel --output_base ./bazel-cache build src/python:pydp \ - --config $BAZEL_CONFIG_OS \ - 
--verbose_failures --action_env=PYTHON_BIN_PATH=$PYTHONHOME \ - --action_env=PYTHON_LIB_PATH=$PYTHONPATH - cp -f ./bazel-bin/src/bindings/_pydp.so ./src/pydp - - - name: Build Google DP Windows - if: runner.os == 'Windows' - timeout-minutes: 20 - run: | - $PYTHONHOME=$(python -c 'import sys; print(sys.executable);').replace('\', '/') - $PYTHONPATH=$(python -c "import sys; print([x for x in sys.path if 'site-packages' in x][0]);").replace('\', '/') - echo "PYTHONHOME=$PYTHONHOME" - echo "PYTHONPATH=$PYTHONPATH" - echo "Running: ${{ matrix.os }}" - bazel.exe --output_base ./bazel-cache build src/python:pydp --config windows --verbose_failures --action_env=PYTHON_BIN_PATH=$PYTHONHOME --action_env=PYTHON_LIB_PATH=$PYTHONPATH - copy ./bazel-bin/src/bindings/_pydp.so ./src/pydp/_pydp.pyd - - - name: Upgrade pip - run: | - pip install --upgrade --user pip - - - name: Install Poetry - run: | - pip install poetry - - - name: Get poetry cache dir - id: poetry-cache - run: | - echo "::set-output name=dir::$(poetry config cache-dir)" - - - name: poetry cache - uses: actions/cache@v2 - with: - path: ${{ steps.poetry-cache.outputs.dir }} - key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }} - restore-keys: | - ${{ runner.os }}-pip-py${{ matrix.python-version }}- - - - name: Install dependencies - run: | - poetry install - - - name: Build PyDP macOS - if: runner.os == 'macOS' - run: | - poetry run python setup.py build bdist_wheel --plat-name macosx_10_14_x86_64 - - - name: Build PyDP Linux / Windows - if: runner.os != 'macOS' - run: | - poetry run python setup.py build bdist_wheel - - - name: Install Wheel Unix - if: runner.os != 'Windows' - run: | - pip install `find -L ./ -name "*.whl"` - - - name: Install Wheel Windows - if: runner.os == 'Windows' - run: | - Get-ChildItem -Path ./ -Filter "*.whl" -Recurse -File | foreach {pip install $_.FullName} - - - name: Import Package - run: | - python -c "import pydp; print(pydp.__version__)" - 
- - name: Run Pytest - run: | - poetry run pytest tests -n auto - - - name: Check Wheel Unix - if: runner.os != 'Windows' - run: | - poetry run twine check `find -L ./ -name "*.whl"` - - - name: Check Wheel Windows - if: runner.os == 'Windows' - run: | - Get-ChildItem -Path ./ -Filter "*.whl" -Recurse -File | foreach {poetry run twine check $_.FullName} - - - name: Renaming wheel - if: runner.os == 'Linux' - run: | - find . -name '*linux*.whl' -type f -exec bash -c 'mv "$1" "${1/linux/manylinux1}"' -- {} \; - - - name: Publishing the wheel - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.TOKEN }} - run: | - poetry run twine upload --skip-existing dist/*.whl +name: Publish PyDP + +on: + release: + types: [published] + +jobs: + deploy: + strategy: + fail-fast: false + max-parallel: 12 + matrix: + os: [ubuntu-18.04, macos-latest, windows-latest] + python-version: [3.6, 3.7, 3.8, 3.9] + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v2 + with: + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup msbuild Windows + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v1.0.2 + + - name: Fix Paths Windows + # Make sure that tar.exe from Git is used not from Windows + if: runner.os == 'Windows' + run: | + @("C:\Program Files\Git\usr\bin") + (Get-Content $env:GITHUB_PATH) | Set-Content $env:GITHUB_PATH -Encoding utf8 + + - name: Cache Bazel Unix + # Not working on Windows: https://github.com/actions/cache/issues/576 + if: runner.os != 'Windows' + uses: actions/cache@v2.1.5 + with: + path: | + ./bazel-PyDP + ./bazel-bin + ./bazel-cache + ./bazel-out + key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} + + - name: Update environment variables Windows + if: runner.os == 'Windows' + # See: + # - On Bazel cache: https://docs.bazel.build/versions/master/output_directories.html + run: | + echo 
"BAZEL_CACHE_DIR=$env:USERPROFILE\_bazel_$env:USERNAME" >> $env:GITHUB_ENV + + - name: Cache Bazel Windows + if: runner.os == 'Windows' + # https://stackoverflow.com/questions/66870002/github-actions-cache-maven-m2-repository-on-windows-environment-c-users-run + uses: actions/cache@v2.1.5 + with: + path: | + ./bazel-cache + key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} + + - name: Build Google DP Unix + if: runner.os != 'Windows' + timeout-minutes: 20 + run: | + PYTHONHOME=$(which python) + PYTHONPATH=$(python -c "import sys; print([x for x in sys.path if 'site-packages' in x][0]);") + BAZEL_CONFIG_OS=$(python -c "print('${{ matrix.os }}'.split('-')[0].lower().replace('ubuntu', 'linux'))") + echo "Running: ${{ matrix.os }}" + echo "Using BAZEL_CONFIG_OS: $BAZEL_CONFIG_OS" + bazel --output_base ./bazel-cache build src/python:pydp \ + --config $BAZEL_CONFIG_OS \ + --verbose_failures --action_env=PYTHON_BIN_PATH=$PYTHONHOME \ + --action_env=PYTHON_LIB_PATH=$PYTHONPATH + cp -f ./bazel-bin/src/bindings/_pydp.so ./src/pydp + + - name: Build Google DP Windows + if: runner.os == 'Windows' + timeout-minutes: 20 + run: | + $PYTHONHOME=$(python -c 'import sys; print(sys.executable);').replace('\', '/') + $PYTHONPATH=$(python -c "import sys; print([x for x in sys.path if 'site-packages' in x][0]);").replace('\', '/') + echo "PYTHONHOME=$PYTHONHOME" + echo "PYTHONPATH=$PYTHONPATH" + echo "Running: ${{ matrix.os }}" + bazel.exe --output_base ./bazel-cache build src/python:pydp --config windows --verbose_failures --action_env=PYTHON_BIN_PATH=$PYTHONHOME --action_env=PYTHON_LIB_PATH=$PYTHONPATH + copy ./bazel-bin/src/bindings/_pydp.so ./src/pydp/_pydp.pyd + + - name: Upgrade pip + run: | + pip install --upgrade --user pip + + - name: Install Poetry + run: | + pip install poetry + + - name: Get poetry cache dir + id: poetry-cache + run: | + echo "::set-output name=dir::$(poetry config cache-dir)" + + - name: poetry cache + uses: actions/cache@v2 + with: + path: ${{ 
steps.poetry-cache.outputs.dir }} + key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-pip-py${{ matrix.python-version }}- + + - name: Install dependencies + run: | + poetry install + + - name: Build PyDP macOS + if: runner.os == 'macOS' + run: | + poetry run python setup.py build bdist_wheel --plat-name macosx_10_14_x86_64 + + - name: Build PyDP Linux / Windows + if: runner.os != 'macOS' + run: | + poetry run python setup.py build bdist_wheel + + - name: Install Wheel Unix + if: runner.os != 'Windows' + run: | + pip install `find -L ./ -name "*.whl"` + + - name: Install Wheel Windows + if: runner.os == 'Windows' + run: | + Get-ChildItem -Path ./ -Filter "*.whl" -Recurse -File | foreach {pip install $_.FullName} + + - name: Import Package + run: | + python -c "import pydp; print(pydp.__version__)" + + - name: Run Pytest + run: | + poetry run pytest tests -n auto + + - name: Check Wheel Unix + if: runner.os != 'Windows' + run: | + poetry run twine check `find -L ./ -name "*.whl"` + + - name: Check Wheel Windows + if: runner.os == 'Windows' + run: | + Get-ChildItem -Path ./ -Filter "*.whl" -Recurse -File | foreach {poetry run twine check $_.FullName} + + - name: Renaming wheel + if: runner.os == 'Linux' + run: | + find . 
-name '*linux*.whl' -type f -exec bash -c 'mv "$1" "${1/linux/manylinux1}"' -- {} \; + + - name: Publishing the wheel + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TOKEN }} + run: | + poetry run twine upload --skip-existing dist/*.whl diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index df3f46cb..bde7b05b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,187 +1,186 @@ -name: Tests - -on: - pull_request: - branches: - - dev - paths: - - "*.bazel" - - "*.cpp" - - "*.c" - - "*.cc" - - "*.hpp" - - "*.h" - - "*.py" - - "*.go" - - "*.mod" - - "*.toml" - - "*.txt" - - "setup.cfg" - - ".github/workflows/*.yml" - -jobs: - linting: - runs-on: ubuntu-latest - strategy: - max-parallel: 1 - matrix: - python-version: [3.9] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install python dependencies - run: | - pip install black mypy - - name: Black - run: | - black . - - name: MyPY - run: | - mypy src tests - - name: Run clang-format style check for C/C++ programs. 
- uses: jidicula/clang-format-action@v3.3.0 - with: - clang-format-version: "11" - check-path: "/src/bindings/" - fallback-style: "Google" # optional - - build: - runs-on: ${{ matrix.os }} - needs: [linting] - strategy: - fail-fast: false - max-parallel: 3 - matrix: - os: [ubuntu-latest, macos-latest, windows-latest] - python-version: [3.9] - - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup msbuild Windows - if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v1.0.2 - - - name: Fix Paths Windows - # Make sure that tar.exe from Git is used not from Windows - if: runner.os == 'Windows' - run: | - @("C:\Program Files\Git\usr\bin") + (Get-Content $env:GITHUB_PATH) | Set-Content $env:GITHUB_PATH -Encoding utf8 - - - name: Cache Bazel Unix - # Not working on Windows: https://github.com/actions/cache/issues/576 - if: runner.os != 'Windows' - uses: actions/cache@v2.1.5 - with: - path: | - ./bazel-PyDP - ./bazel-bin - ./bazel-cache - ./bazel-out - key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} - - - name: Update environment variables Windows - if: runner.os == 'Windows' - # See: - # - On Bazel cache: https://docs.bazel.build/versions/master/output_directories.html - run: | - echo "BAZEL_CACHE_DIR=$env:USERPROFILE\_bazel_$env:USERNAME" >> $env:GITHUB_ENV - - - name: Cache Bazel Windows - if: runner.os == 'Windows' - # https://stackoverflow.com/questions/66870002/github-actions-cache-maven-m2-repository-on-windows-environment-c-users-run - uses: actions/cache@v2.1.5 - with: - path: | - ./bazel-cache - key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} - - - name: Build Google DP Unix - if: runner.os != 'Windows' - timeout-minutes: 20 - run: | - PYTHONHOME=$(which python) - PYTHONPATH=$(python -c "import sys; print([x for x in sys.path if 'site-packages' in x][0]);") - 
BAZEL_CONFIG_OS=$(python -c "print('${{ matrix.os }}'.split('-')[0].lower().replace('ubuntu', 'linux'))") - echo "Running: ${{ matrix.os }}" - echo "Using BAZEL_CONFIG_OS: $BAZEL_CONFIG_OS" - bazel --output_base ./bazel-cache build src/python:pydp \ - --config $BAZEL_CONFIG_OS \ - --verbose_failures --action_env=PYTHON_BIN_PATH=$PYTHONHOME \ - --action_env=PYTHON_LIB_PATH=$PYTHONPATH - cp -f ./bazel-bin/src/bindings/_pydp.so ./src/pydp - - - name: Build Google DP Windows - if: runner.os == 'Windows' - timeout-minutes: 20 - run: | - $PYTHONHOME=$(python -c 'import sys; print(sys.executable);').replace('\', '/') - $PYTHONPATH=$(python -c "import sys; print([x for x in sys.path if 'site-packages' in x][0]);").replace('\', '/') - echo "PYTHONHOME=$PYTHONHOME" - echo "PYTHONPATH=$PYTHONPATH" - echo "Running: ${{ matrix.os }}" - bazel.exe --output_base ./bazel-cache build src/python:pydp --config windows --verbose_failures --action_env=PYTHON_BIN_PATH=$PYTHONHOME --action_env=PYTHON_LIB_PATH=$PYTHONPATH - copy ./bazel-bin/src/bindings/_pydp.so ./src/pydp/_pydp.pyd - - - name: Upgrade pip - run: | - pip install --upgrade --user pip - - - name: Install Poetry - run: | - pip install poetry - - - name: Get poetry cache dir - id: poetry-cache - run: | - echo "::set-output name=dir::$(poetry config cache-dir)" - - - name: poetry cache - uses: actions/cache@v2 - with: - path: ${{ steps.poetry-cache.outputs.dir }} - key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }} - restore-keys: | - ${{ runner.os }}-pip-py${{ matrix.python-version }}- - - - name: Install dependencies - run: | - poetry install - - - name: Build PyDP macOS - if: runner.os == 'macOS' - run: | - poetry run python setup.py build bdist_wheel --plat-name macosx_10_14_x86_64 - - - name: Build PyDP Linux / Windows - if: runner.os != 'macOS' - run: | - poetry run python setup.py build bdist_wheel - - - name: Install Wheel Unix - if: runner.os != 'Windows' - run: | - pip 
install `find -L ./ -name "*.whl"` - - - name: Install Wheel Windows - if: runner.os == 'Windows' - run: | - Get-ChildItem -Path ./ -Filter "*.whl" -Recurse -File | foreach {pip install $_.FullName} - - - name: Import Package - run: | - python -c "import pydp; print(pydp.__version__)" - - - name: Run Pytest - run: | - poetry run pytest tests -n auto +name: Tests + +on: + pull_request: + paths: + - "*.bazel" + - "*.cpp" + - "*.c" + - "*.cc" + - "*.hpp" + - "*.h" + - "*.py" + - "*.ipynb" + - "*.go" + - "*.mod" + - "*.toml" + - "*.txt" + - "setup.cfg" + - ".github/workflows/*.yml" + +jobs: + linting: + runs-on: ubuntu-latest + strategy: + max-parallel: 1 + matrix: + python-version: [3.9] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install python dependencies + run: | + pip install black mypy + - name: Black + run: | + black . + - name: MyPY + run: | + mypy src tests + - name: Run clang-format style check for C/C++ programs. 
+ uses: jidicula/clang-format-action@v4.5.0 + with: + clang-format-version: "11" + check-path: "/src/bindings/" + fallback-style: "Google" # optional + + build: + runs-on: ${{ matrix.os }} + needs: [linting] + strategy: + fail-fast: false + max-parallel: 3 + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: [3.9] + + steps: + - uses: actions/checkout@v2 + with: + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup msbuild Windows + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v1.0.2 + + - name: Fix Paths Windows + # Make sure that tar.exe from Git is used not from Windows + if: runner.os == 'Windows' + run: | + @("C:\Program Files\Git\usr\bin") + (Get-Content $env:GITHUB_PATH) | Set-Content $env:GITHUB_PATH -Encoding utf8 + + - name: Cache Bazel Unix + # Not working on Windows: https://github.com/actions/cache/issues/576 + if: runner.os != 'Windows' + uses: actions/cache@v2.1.5 + with: + path: | + ./bazel-PyDP + ./bazel-bin + ./bazel-cache + ./bazel-out + key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} + + - name: Update environment variables Windows + if: runner.os == 'Windows' + # See: + # - On Bazel cache: https://docs.bazel.build/versions/master/output_directories.html + run: | + echo "BAZEL_CACHE_DIR=$env:USERPROFILE\_bazel_$env:USERNAME" >> $env:GITHUB_ENV + + - name: Cache Bazel Windows + if: runner.os == 'Windows' + # https://stackoverflow.com/questions/66870002/github-actions-cache-maven-m2-repository-on-windows-environment-c-users-run + uses: actions/cache@v2.1.5 + with: + path: | + ./bazel-cache + key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} + + - name: Build Google DP Unix + if: runner.os != 'Windows' + timeout-minutes: 20 + run: | + PYTHONHOME=$(which python) + PYTHONPATH=$(python -c "import sys; print([x for x in sys.path if 'site-packages' in x][0]);") + 
BAZEL_CONFIG_OS=$(python -c "print('${{ matrix.os }}'.split('-')[0].lower().replace('ubuntu', 'linux'))") + echo "Running: ${{ matrix.os }}" + echo "Using BAZEL_CONFIG_OS: $BAZEL_CONFIG_OS" + bazel --output_base ./bazel-cache build src/python:pydp \ + --config $BAZEL_CONFIG_OS \ + --verbose_failures --action_env=PYTHON_BIN_PATH=$PYTHONHOME \ + --action_env=PYTHON_LIB_PATH=$PYTHONPATH + cp -f ./bazel-bin/src/bindings/_pydp.so ./src/pydp + + - name: Build Google DP Windows + if: runner.os == 'Windows' + timeout-minutes: 20 + run: | + $PYTHONHOME=$(python -c 'import sys; print(sys.executable);').replace('\', '/') + $PYTHONPATH=$(python -c "import sys; print([x for x in sys.path if 'site-packages' in x][0]);").replace('\', '/') + echo "PYTHONHOME=$PYTHONHOME" + echo "PYTHONPATH=$PYTHONPATH" + echo "Running: ${{ matrix.os }}" + bazel.exe --output_base ./bazel-cache build src/python:pydp --config windows --verbose_failures --action_env=PYTHON_BIN_PATH=$PYTHONHOME --action_env=PYTHON_LIB_PATH=$PYTHONPATH + copy ./bazel-bin/src/bindings/_pydp.so ./src/pydp/_pydp.pyd + + - name: Upgrade pip + run: | + pip install --upgrade --user pip + + - name: Install Poetry + run: | + pip install poetry + + - name: Get poetry cache dir + id: poetry-cache + run: | + echo "::set-output name=dir::$(poetry config cache-dir)" + + - name: poetry cache + uses: actions/cache@v2 + with: + path: ${{ steps.poetry-cache.outputs.dir }} + key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-pip-py${{ matrix.python-version }}- + + - name: Install dependencies + run: | + poetry install + + - name: Build PyDP macOS + if: runner.os == 'macOS' + run: | + poetry run python setup.py build bdist_wheel --plat-name macosx_10_14_x86_64 + + - name: Build PyDP Linux / Windows + if: runner.os != 'macOS' + run: | + poetry run python setup.py build bdist_wheel + + - name: Install Wheel Unix + if: runner.os != 'Windows' + run: | + pip 
install `find -L ./ -name "*.whl"` + + - name: Install Wheel Windows + if: runner.os == 'Windows' + run: | + Get-ChildItem -Path ./ -Filter "*.whl" -Recurse -File | foreach {pip install $_.FullName} + + - name: Import Package + run: | + python -c "import pydp; print(pydp.__version__)" + + - name: Run Pytest + run: | + poetry run pytest tests -n auto diff --git a/.github/workflows/versions.yml b/.github/workflows/versions.yml index 7b85e1bf..6d236aa4 100644 --- a/.github/workflows/versions.yml +++ b/.github/workflows/versions.yml @@ -41,7 +41,7 @@ jobs: - name: Run clang-format style check for C/C++ programs. uses: jidicula/clang-format-action@v3.3.0 with: - clang-format-version: "11" + clang-format-version: "12" check-path: "/src/bindings/" fallback-style: "Google" # optional diff --git a/Dockerfile b/Dockerfile index 6282ed88..6b31cc13 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,79 +1,81 @@ -# Pull base image -ARG PYTHON_VERSION=3.9 -FROM python:${PYTHON_VERSION}-slim-buster - -# must be redefined after FROM -ARG PYTHON_VERSION=$PYTHON_VERSION -ARG BAZELISK_VERSION=v1.8.1 -ARG BAZELISK_BINARY=bazelisk-linux-amd64 -ARG BAZELISK_DOWNLOAD_URL=https://github.com/bazelbuild/bazelisk/releases/download/ - -# Set environment variables -ENV HOME=/root -ENV PROJECT_DIR="${HOME}/PyDP" -ENV PATH="/root/bin:${PATH}" -ENV DP_SHA="78d3fb8f63ea904ea6449a8276b9070254c650ec" - -# Define working directory -WORKDIR ${HOME} - -# Install apt-get packages -RUN apt-get update && \ - apt-get -y install \ - sudo \ - wget \ - zip \ - git \ - software-properties-common \ - gcc \ - g++ \ - clang-format \ - build-essential \ - python3-distutils \ - pkg-config \ - zlib1g-dev - -# Download and Install Bazelisk -RUN wget "${BAZELISK_DOWNLOAD_URL}/${BAZELISK_VERSION}/${BAZELISK_BINARY}" && \ - chmod +x ${BAZELISK_BINARY} - -RUN ./${BAZELISK_BINARY} --version - -# Update pip and setuptools and install poetry -RUN pip install --upgrade pip setuptools wheel && \ - pip install poetry - -# Change 
working dir -WORKDIR ${PROJECT_DIR} - -# Copy local source over -COPY . ${PROJECT_DIR} - -# Get google dp dependency -RUN mkdir -p third_party && \ - cd third_party && \ - git clone https://github.com/google/differential-privacy.git && \ - cd differential-privacy && \ - git checkout ${DP_SHA} - -# Remove unused java code -RUN rm -rf third_party/differential-privacy/java && \ - rm -rf third_party/differential-privacy/examples/java - -# This makes poetry's virtual environment in the project dir -RUN poetry config virtualenvs.in-project true - -# Build the bindings using Bazel and create a python wheel -RUN poetry env use ${PYTHON_VERSION} && \ - ${HOME}/${BAZELISK_BINARY} build --config linux src/python:pydp --verbose_failures - -RUN cp -f ./bazel-bin/src/bindings/_pydp.so ./pydp && \ - rm -rf dist/ && \ - poetry run python setup.py bdist_wheel && \ - poetry add dist/*.whl - -# This `activates` the virtual env -ENV VIRTUAL_ENV=$PROJECT_DIR/.venv -ENV PATH="$VIRTUAL_ENV/bin:$PATH" -# Default entrypoint -CMD ["/bin/bash"] +# Pull base image +ARG PYTHON_VERSION=3.9 +FROM python:${PYTHON_VERSION}-slim-buster + +# must be redefined after FROM +ARG PYTHON_VERSION=$PYTHON_VERSION +ARG BAZELISK_VERSION=v1.8.1 +ARG BAZELISK_BINARY=bazelisk-linux-amd64 +ARG BAZELISK_DOWNLOAD_URL=https://github.com/bazelbuild/bazelisk/releases/download/ + +# Set environment variables +ENV HOME=/root +ENV PROJECT_DIR="${HOME}/PyDP" +ENV PATH="/root/bin:${PATH}" +ENV DP_SHA="e224a8635728026fb3aa9409ab3a98b9a3f5566a" + +# Define working directory +WORKDIR ${HOME} + +# Install apt-get packages +RUN apt-get update && \ + apt-get -y install \ + sudo \ + wget \ + zip \ + git \ + software-properties-common \ + gcc \ + g++ \ + clang-format \ + build-essential \ + python3-distutils \ + pkg-config \ + zlib1g-dev + +# Download and Install Bazelisk +RUN wget "${BAZELISK_DOWNLOAD_URL}/${BAZELISK_VERSION}/${BAZELISK_BINARY}" && \ + chmod +x ${BAZELISK_BINARY} + +RUN ./${BAZELISK_BINARY} --version + +# Update 
pip and setuptools and install poetry +RUN pip install --upgrade pip setuptools wheel && \ + pip install poetry + +# Change working dir +WORKDIR ${PROJECT_DIR} + +# Copy local source over +COPY . ${PROJECT_DIR} + +# Get google dp dependency +RUN mkdir -p third_party && \ + cd third_party && \ + git clone https://github.com/google/differential-privacy.git && \ + cd differential-privacy && \ + git checkout ${DP_SHA} + +# Remove unused java code +RUN rm -rf third_party/differential-privacy/java && \ + rm -rf third_party/differential-privacy/examples/java + +RUN sed -i -e 's/@com_google_cc_differential_privacy//g' third_party/differential-privacy/cc/algorithms/BUILD + +# This makes poetry's virtual environment in the project dir +RUN poetry config virtualenvs.in-project true + +# Build the bindings using Bazel and create a python wheel +RUN poetry env use ${PYTHON_VERSION} && \ + ${HOME}/${BAZELISK_BINARY} build --config linux src/python:pydp --verbose_failures + +RUN cp -f ./bazel-bin/src/bindings/_pydp.so ./pydp && \ + rm -rf dist/ && \ + poetry run python setup.py bdist_wheel && \ + poetry add dist/*.whl + +# This `activates` the virtual env +ENV VIRTUAL_ENV=$PROJECT_DIR/.venv +ENV PATH="$VIRTUAL_ENV/bin:$PATH" +# Default entrypoint +CMD ["/bin/bash"] diff --git a/README.md b/README.md index f4c22ff7..b5f8a0a7 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,7 @@ guarantee and accuracy of your model written in Python. BoundedSum, Max, Count Above, Percentile, Min, Median, etc. - All the computation methods mentioned above use Laplace noise only (other noise mechanisms will be added soon! :smiley:) -- :fire: Currently supports Linux and macOS (Windows support coming soon -:smiley:) +- :fire: Compatible with all three types of Operating Systems - Linux, macOS, and Windows :smiley: - :star: Use Python 3.x. 
## Installation diff --git a/WORKSPACE b/WORKSPACE index d142db04..2d875841 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -49,20 +49,22 @@ http_archive( # Google DP library and it's dependencies local_repository( - name = "google_dp", - path = "third_party/differential-privacy/cc", + name = "com_google_differential_privacy", + path = "third_party/differential-privacy", ) -load("@google_dp//:cc_differential_privacy_deps.bzl", "cc_differential_privacy_deps") +load("@com_google_differential_privacy//:differential_privacy_deps.bzl", "differential_privacy_deps") + +differential_privacy_deps() -cc_differential_privacy_deps() # Google DP library and it's dependencies local_repository( - name = "com_google_differential_privacy", - path = "third_party/differential-privacy", + name = "com_google_cc_differential_privacy", + path = "third_party/differential-privacy/cc", ) -load("@com_google_differential_privacy//:differential_privacy_deps.bzl", "differential_privacy_deps") +load("@com_google_cc_differential_privacy//:cc_differential_privacy_deps.bzl", "cc_differential_privacy_deps") + +cc_differential_privacy_deps() -differential_privacy_deps() diff --git a/docs/conf.py b/docs/conf.py index 3832ef9f..17a254cb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,6 +16,8 @@ import os import sys +sys.path.insert(0, "../src") + # pydp absolute import pydp diff --git a/docs/pydp.rst b/docs/pydp.rst index 205b3131..ab9de0fc 100644 --- a/docs/pydp.rst +++ b/docs/pydp.rst @@ -26,6 +26,17 @@ Algorithms :inherited-members: +Numerical Mechanisms +#################### +.. currentmodule:: pydp.algorithms.numerical_mechanisms +.. autoclass:: NumericalMechanism + :members: +.. autoclass:: LaplaceMechanism + :members: + :show-inheritance: +.. autoclass:: GaussianMechanism + :members: + :show-inheritance: Distributions ############# @@ -52,4 +63,4 @@ Partition Selection .. currentmodule:: pydp.algorithms.partition_selection .. autoclass:: PartitionSelectionStrategy :members: -.. 
autofunction:: create_partition_strategy \ No newline at end of file +.. autofunction:: create_partition_strategy diff --git a/docs/requirements.txt b/docs/requirements.txt index dfea7cb4..c2c6f6a6 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,2 @@ -python-dp sphinx>=3.2.1 sphinx-rtd-theme diff --git a/examples/SKLearn_Pipeline/SKLearn_Pipeline_Laplace_Mechanism.ipynb b/examples/SKLearn_Pipeline/SKLearn_Pipeline_Laplace_Mechanism.ipynb index 78cc4097..b870b6e7 100644 --- a/examples/SKLearn_Pipeline/SKLearn_Pipeline_Laplace_Mechanism.ipynb +++ b/examples/SKLearn_Pipeline/SKLearn_Pipeline_Laplace_Mechanism.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "3a391b28", "metadata": {}, "outputs": [], @@ -44,17 +44,17 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "71ca131d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.48" + "0.84" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -84,17 +84,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "id": "46d88b53", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.52" + "0.48" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -127,17 +127,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "93af586e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.64" + "0.52" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -173,17 +173,17 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "id": "eda5ef9c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.36" + "0.6" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -212,10 +212,93 @@ "pipe.score(X_test, 
y_test)" ] }, + { + "cell_type": "markdown", + "id": "ed11410c", + "metadata": {}, + "source": [ + "## Categorical Feature Support" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e5159900", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.48" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import random\n", + "\n", + "# Helper function to inject nominal value to dataset\n", + "def create_cat_data(idxs, X, cat_data = [0,1, 2, 3, 4, 5]):\n", + " \n", + " X = X.copy()\n", + " \n", + " for idx in idxs:\n", + " for i in range(len(X[:,idx])):\n", + " num = random.choice(cat_data)\n", + " X[i,idx] = num\n", + " \n", + " return X\n", + "\n", + "# DUMMY DATASET\n", + "\n", + "# Create random dataset\n", + "X, y = make_classification(random_state=0)\n", + "\n", + "# Indices for categorical data\n", + "cat_feat_idxs = [0, 19]\n", + "\n", + "# Inject nominal data\n", + "X = create_cat_data(cat_feat_idxs, X)\n", + "\n", + "# Split training test set\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", + " random_state=0)\n", + "\n", + "# Sensitivity function for numeric features\n", + "calculate_sensitivity = lambda x: (max(x) - min(x))/ (len(x) + 1)\n", + "\n", + "# Sensitivity function for categorical features\n", + "calculate_sensitivity_cat = lambda x: abs(sum(x)/len(x))\n", + "\n", + "# Set laplace mechanism with categorical support\n", + "laplace = LaplaceMechanism(\n", + " epsilon=0.1, \n", + " sensitivity=calculate_sensitivity,\n", + " cat_sensitivity=calculate_sensitivity_cat,\n", + " cat_feat_idxs=cat_feat_idxs\n", + ")\n", + "\n", + "# Initialize scaler and naive bayes estimator\n", + "scaler = StandardScaler()\n", + "nb = GaussianNB()\n", + "\n", + "# Create the pipeline\n", + "pipe = Pipeline([('scaler', scaler), ('laplace', laplace), ('nb', nb)])\n", + "\n", + "# Train Naive Bayes model with Local DP\n", + "pipe.fit(X_train, y_train)\n", +
"\n", + "# Get model score\n", + "pipe.score(X_test, y_test)" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "fdb818d4", + "id": "dce26029", "metadata": {}, "outputs": [], "source": [] diff --git a/examples/laplace_demo/README.md b/examples/laplace_demo/README.md new file mode 100644 index 00000000..f9cd6814 --- /dev/null +++ b/examples/laplace_demo/README.md @@ -0,0 +1,3 @@ +# Introduction to Laplace Distribution + +This example walks through why it is necessary to add noise to make data private. It motivates the use of the Laplace distribution, which makes it easy to satisfy ε-differential privacy by setting the b parameter to 1/ε. \ No newline at end of file diff --git a/examples/laplace_demo/laplace.ipynb b/examples/laplace_demo/laplace.ipynb index 5b7752e4..84acdd48 100644 --- a/examples/laplace_demo/laplace.ipynb +++ b/examples/laplace_demo/laplace.ipynb @@ -1,21 +1,21 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "y4gpTCeMz3-T" - }, - "outputs": [], + "cell_type": "markdown", + "metadata": {}, "source": [ - "%matplotlib inline" + "## Table of Contents 🧾\n", + "\n", + "1. [Privacy 🔐 and Noise 📢](#Privacy-%F0%9F%94%90-and-Noise-📢)\n", + "2. [Exponential Distribution 📈](#Exponential-Distribution-📈)\n", + "3. [Laplace Distribution ⭐️](#Laplace-Distribution-⭐%EF%B8%8F)\n", + "4. [Why Laplace? 🧐](#Why-Laplace?-🧐)\n", + "5. [Conclusion 🔚](#Conclusion-🔚)" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", @@ -34,6 +34,8 @@ }, "outputs": [], "source": [ + "# Import Necessary Packages\n", + "%matplotlib inline\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] @@ -45,7 +47,8 @@ "id": "84VsFE-Hz3-a" }, "source": [ - "## Privacy and Noise" + "## Privacy 🔐 and Noise 📢\n", + "---" ] }, { @@ -61,7 +64,7 @@ "\n", "But how exactly do you choose that random number?
More technically, from what distribution will it be drawn?\n", "\n", - "The most common choice is the [Laplace distribution](https://en.wikipedia.org/wiki/Laplace_distribution) because it works well with the privacy parameter ε. This notebook is dedicated to exploring the Laplace distribution: what it is and why it's used for differential privacy." + "The most common choice is the [**Laplace distribution**](https://en.wikipedia.org/wiki/Laplace_distribution) because it works well with the privacy parameter ε. This notebook is dedicated to exploring the Laplace distribution: what it is and why it's used for differential privacy." ] }, { @@ -71,7 +74,8 @@ "id": "gU9zQELHz3-b" }, "source": [ - "## Exponential Distribution" + "## Exponential Distribution 📈\n", + "---" ] }, { @@ -81,7 +85,7 @@ "id": "gSRaoBfsz3-d" }, "source": [ - "The Laplace distribution can be thought of as two exponential distributions back-to-back so let's start by looking at the exponential distribution. It can be defined as" + "The **Laplace distribution** can be thought of as two exponential distributions back-to-back so let's start by looking at the exponential distribution. It can be defined as" ] }, { @@ -91,7 +95,7 @@ "id": "2q1etBbNz3-e" }, "source": [ - "$ f(x) = e^{-x} $" + "$$\\large f(x) = e^{-x} $$" ] }, { @@ -101,12 +105,16 @@ "id": "nPnfuntnz3-f" }, "source": [ - "When x is zero, f(x) is one, and the bigger x gets the closer to zero the output will be. Typically this is only defined for x >= 0. It looks like this:" + "Its behaviour can be summarized as follows:\n", + "* When $x$ is zero, $f(x)$ is one.\n", + "* The bigger $x$ gets, the closer to zero the output will be.\n", + "\n", + "Typically this is only defined for $x >= 0$.
Let's use matplotlib to see how it looks:" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -130,7 +138,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] @@ -153,7 +161,8 @@ "id": "08lL-b5Az3-j" }, "source": [ - "## Laplace" + "## Laplace Distribution ⭐️\n", + "---" ] }, { @@ -168,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "colab": {}, "colab_type": "code", @@ -193,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -217,7 +226,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] @@ -244,7 +253,7 @@ "\n", "What if you wanted to adjust the distribution to make it more or less likely to report something close to the real result? You can do that by introducing a scaling parameter.\n", "\n", - "The Laplace is typically defined with two additional parameters, μ and b:" + "The Laplace is typically defined with two additional parameters, $\\mu$ and $b$:" ] }, { @@ -254,7 +263,7 @@ "id": "60blurVhz3-r" }, "source": [ - "$f(x\\mid \\mu ,b)={\\frac {1}{2b}}\\exp \\left(-{\\frac {|x-\\mu |}{b}}\\right)\\,\\!$" + "$$\\large f(x\\mid \\mu ,b)={\\frac {1}{2b}}\\exp \\left(-{\\frac {|x-\\mu |}{b}}\\right)\\,\\!$$" ] }, { @@ -264,16 +273,16 @@ "id": "qTIoqLgEz3-s" }, "source": [ - "μ is the mean, in other words the center, or the pointy bit. In the context of differential privacy, this would be the real result from the database, i.e., what you would report if you added zero noise.\n", + "$\\mu$ is the **mean**, in other words the center, or the \"pointy\" bit. In the context of differential privacy, this would be the real result from the database, i.e., what you would report if you added zero noise.\n", "\n", - "b is the scaling parameter. Increasing it \"flattens out\" the graph so that the tails account for relatively more of the total. Increasing it gives you more privacy, because by flattening out the graph you make it more likely to choose a high value for the noise, which means an attacker can be less certain that the reported result is close to the true result.\n", + "$b$ is the **scaling parameter**. Increasing it \"flattens out\" the graph so that the tails account for relatively more of the total. Increasing it gives you more privacy, because by flattening out the graph you make it more likely to choose a high value for the noise, which means an attacker can be less certain that the reported result is close to the true result.\n", "\n", "You can try different values for μ and b and see how they affect the graph below." 
] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "colab": {}, "colab_type": "code", @@ -302,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -326,7 +335,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] @@ -351,7 +360,8 @@ "id": "KdsSS8asz3-x" }, "source": [ - "## Why Laplace?" + "## Why Laplace? 🧐\n", + "---" ] }, { @@ -372,7 +382,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "colab": {}, "colab_type": "code", @@ -407,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -431,7 +441,17 @@ "outputs": [ { "data": { - "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", "text/plain": [ "
" ] @@ -466,7 +486,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -494,7 +514,7 @@ "(0.25, 0.15163266492815836)" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -517,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -545,7 +565,7 @@ "(0.25, 0.25)" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -571,7 +591,8 @@ "id": "fzgGonHLz3-_" }, "source": [ - "## Conclusion" + "## Conclusion 🔚\n", + "---" ] }, { @@ -581,25 +602,12 @@ "id": "7fmk6wcLz3-_" }, "source": [ - "To recap:\n", - "\n", - "* You make data private by adding noise.\n", + "To recap, in this example you learned about:\n", "\n", - "* The Laplace distribution is constructed by stitching together two exponential distributions.\n", - "\n", - "* The Laplace distribution makes it easy for you to satisfy the requirement of ε differential privacy by setting the b parameter to 1/ε." + "* How adding random noise to your data can help preserve its privacy, i.e. you make data private by adding noise.\n", + "* Exponential and Laplace Distributions. Remember, the Laplace distribution is constructed by stitching together two exponential distributions. Moreover, you learned how changing certain parameters of the distribution can help give you more privacy.\n", + "* Why the Laplace distribution is an ideal choice for adding noise to the data: it makes it easy for you to satisfy the requirement of ε differential privacy by setting the b parameter to 1/ε."
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "XM_K2Xkyz3_A" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -609,7 +617,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -623,7 +631,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/prereqs_linux.sh b/prereqs_linux.sh index adbf9dc7..c35da3cd 100755 --- a/prereqs_linux.sh +++ b/prereqs_linux.sh @@ -20,7 +20,7 @@ if command -v python3 &>/dev/null; then elif command python --version | grep -q 'Python 3'; then echo "Python 3 already installed" else - echo "Installing Python 3 is not installed" + echo "Installing Python 3 is not installed" sudo add-apt-repository ppa:deadsnakes/ppa sudo apt-get update sudo apt-get install python3.6 @@ -60,10 +60,16 @@ git submodule update --init --recursive # checkout out to particular commit -cd third_party/differential-privacy && \ +cd third_party/differential-privacy && git checkout bf0abf446b2f9d625a824bf14a7e3a6b6ac2a3e4 && \ cd - # renaming workspace.bazel to workspace mv third_party/differential-privacy/cc/WORKSPACE.bazel third_party/differential-privacy/cc/WORKSPACE # Removing the java part -rm -rf third_party/differential-privacy/java third_party/differential-privacy/examples/java \ No newline at end of file +rm -rf third_party/differential-privacy/java third_party/differential-privacy/examples/java + +# Removing the Go part +rm -rf third_party/differential-privacy/go third_party/differential-privacy/examples/go + +# Removing the Privacy on Beam +rm -rf third_party/differential-privacy/privacy-on-beam diff --git a/prereqs_mac.sh b/prereqs_mac.sh index ad42704f..e7d17f88 100755 --- a/prereqs_mac.sh +++ b/prereqs_mac.sh @@ -1,61 +1,64 @@ -#!/bin/bash - -# homebrew -which -s 
brew -if [[ $? != 0 ]] ; then - # Install Homebrew - echo "Downloading and installing homebrew" - /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" -else - brew update -fi - -# python 3 -echo "Checking for python3 installation" -if command -v python3 &>/dev/null; then - echo "Python 3 already installed" -else - echo "Downloading and installing Python3 using homebrew" - brew install python3 -fi - -# bazel -if command -v bazel &>/dev/null; then - echo "Bazel already installed" -else - echo "Downloading and installing Bazel using homebrew" - brew tap bazelbuild/tap - brew install bazelbuild/tap/bazel -fi - -# clang-format -if command -v clang-format &>/dev/null; then - echo "clang-format already installed" -else - echo "installing clang-format" - brew install clang-format -fi - -# poetry -echo "Checking for poetry" -if python3 -c "import poetry" &> /dev/null; then - echo "poetry is already installed" -else - echo "installing poetry" - pip3 install poetry -fi - -# Downloading the Google DP library -git submodule update --init --recursive - -# checkout out to particular commit -cd third_party/differential-privacy && git checkout 2b320f8c03ba97215e3de7f7782eb5b8fd0b2354 && \ -cd - -# renaming workspace.bazel to workspace -mv third_party/differential-privacy/cc/WORKSPACE.bazel third_party/differential-privacy/cc/WORKSPACE - -# Removing the java part -rm -rf third_party/differential-privacy/java third_party/differential-privacy/examples/java - -# Removing the Go part -rm -rf third_party/differential-privacy/go third_party/differential-privacy/examples/go \ No newline at end of file +#!/bin/bash + +# homebrew +which -s brew +if [[ $? 
!= 0 ]] ; then + # Install Homebrew + echo "Downloading and installing homebrew" + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" +else + brew update +fi + +# python 3 +echo "Checking for python3 installation" +if command -v python3 &>/dev/null; then + echo "Python 3 already installed" +else + echo "Downloading and installing Python3 using homebrew" + brew install python3 +fi + +# bazel +if command -v bazel &>/dev/null; then + echo "Bazel already installed" +else + echo "Downloading and installing Bazel using homebrew" + brew tap bazelbuild/tap + brew install bazelbuild/tap/bazel +fi + +# clang-format +if command -v clang-format &>/dev/null; then + echo "clang-format already installed" +else + echo "installing clang-format" + brew install clang-format +fi + +# poetry +echo "Checking for poetry" +if python3 -c "import poetry" &> /dev/null; then + echo "poetry is already installed" +else + echo "installing poetry" + pip3 install poetry +fi + +# Downloading the Google DP library +git submodule update --init --recursive + +# checkout out to particular commit +cd third_party/differential-privacy && git checkout e224a8635728026fb3aa9409ab3a98b9a3f5566a && \ +cd - +# renaming workspace.bazel to workspace +mv third_party/differential-privacy/cc/WORKSPACE.bazel third_party/differential-privacy/cc/WORKSPACE + +# Removing the java part +rm -rf third_party/differential-privacy/java third_party/differential-privacy/examples/java + +# Removing the Go part +rm -rf third_party/differential-privacy/go third_party/differential-privacy/examples/go + +# Removing the Privacy on Beam part +rm -rf third_party/differential-privacy/privacy-on-beam \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 38a54239..1148c211 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.1.0 +current_version = 1.1.1 commit = True tag = True diff --git a/setup.py b/setup.py index 304bcdac..6eccca89 100755 
--- a/setup.py +++ b/setup.py @@ -60,6 +60,6 @@ def read(fname): python_requires=">=3.6", test_suite="tests", url="https://github.com/OpenMined/PyDP", - version="1.1.0", + version="1.1.1", zip_safe=False, ) diff --git a/src/bindings/BUILD b/src/bindings/BUILD index ff549e23..4afd54dc 100644 --- a/src/bindings/BUILD +++ b/src/bindings/BUILD @@ -1,30 +1,32 @@ -load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") - -pybind_extension( - name = "_pydp", - srcs = glob([ - "PyDP/*.cpp", - "PyDP/base/*.cpp", - "PyDP/algorithms/*.cpp", - "PyDP/pydp_lib/*.hpp", - "PyDP/proto/*.cpp" - ]), - - visibility = ["//src/python:__pkg__"], - deps = [ - "@google_dp//base:percentile", - "@google_dp//base:logging", - "@google_dp//base:statusor_internals", - "@google_dp//base:status", - "@google_dp//base:canonical_errors", - "@google_dp//base:statusor", - "@google_dp//algorithms:algorithm", - "@google_dp//algorithms:bounded-mean", - "@google_dp//algorithms:bounded-sum", - "@google_dp//algorithms:bounded-standard-deviation", - "@google_dp//algorithms:partition-selection", - "@google_dp//algorithms:count", - "@google_dp//algorithms:order-statistics", - "@google_dp//proto:util-lib" - ], -) +load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") + +pybind_extension( + name = "_pydp", + srcs = glob([ + "PyDP/*.cpp", + "PyDP/base/*.cpp", + "PyDP/algorithms/*.cpp", + "PyDP/pydp_lib/*.hpp", + "PyDP/proto/*.cpp", + "PyDP/mechanisms/*.cpp" + ]), + + visibility = ["//src/python:__pkg__"], + deps = [ + "@com_google_cc_differential_privacy//base:percentile", + "@com_google_cc_differential_privacy//base:logging", + "@com_google_cc_differential_privacy//base:statusor_internals", + "@com_google_cc_differential_privacy//base:status", + # "@com_google_cc_differential_privacy//base:canonical_errors", + "@com_google_cc_differential_privacy//base:statusor", + "@com_google_cc_differential_privacy//algorithms:algorithm", + "@com_google_cc_differential_privacy//algorithms:bounded-mean", + 
"@com_google_cc_differential_privacy//algorithms:bounded-sum", + "@com_google_cc_differential_privacy//algorithms:bounded-standard-deviation", + "@com_google_cc_differential_privacy//algorithms:partition-selection", + "@com_google_cc_differential_privacy//algorithms:numerical-mechanisms", + "@com_google_cc_differential_privacy//algorithms:count", + "@com_google_cc_differential_privacy//algorithms:order-statistics", + "@com_google_cc_differential_privacy//proto:util-lib" + ], +) diff --git a/src/bindings/PyDP/algorithms/bounded_functions.cpp b/src/bindings/PyDP/algorithms/bounded_functions.cpp index 60e4aa42..f52dc0a0 100644 --- a/src/bindings/PyDP/algorithms/bounded_functions.cpp +++ b/src/bindings/PyDP/algorithms/bounded_functions.cpp @@ -12,9 +12,6 @@ #include "algorithms/bounded-variance.h" #include "../pydp_lib/algorithm_builder.hpp" -#include "../pydp_lib/casting.hpp" // our caster helper library - -using namespace std; namespace py = pybind11; namespace dp = differential_privacy; diff --git a/src/bindings/PyDP/algorithms/count.cpp b/src/bindings/PyDP/algorithms/count.cpp index c85eba17..7ebca108 100644 --- a/src/bindings/PyDP/algorithms/count.cpp +++ b/src/bindings/PyDP/algorithms/count.cpp @@ -7,8 +7,6 @@ #include "../pydp_lib/algorithm_builder.hpp" -using namespace std; - namespace py = pybind11; namespace dp = differential_privacy; diff --git a/src/bindings/PyDP/algorithms/distributions.cpp b/src/bindings/PyDP/algorithms/distributions.cpp index e717a165..d126f547 100644 --- a/src/bindings/PyDP/algorithms/distributions.cpp +++ b/src/bindings/PyDP/algorithms/distributions.cpp @@ -3,7 +3,6 @@ #include "algorithms/distributions.h" -using namespace std; namespace py = pybind11; namespace dpi = differential_privacy::internal; diff --git a/src/bindings/PyDP/algorithms/order_statistics.cpp b/src/bindings/PyDP/algorithms/order_statistics.cpp index 3861a8fd..6c29636e 100644 --- a/src/bindings/PyDP/algorithms/order_statistics.cpp +++ 
b/src/bindings/PyDP/algorithms/order_statistics.cpp @@ -7,8 +7,6 @@ #include "../pydp_lib/algorithm_builder.hpp" -using namespace std; - namespace py = pybind11; namespace dp = differential_privacy; diff --git a/src/bindings/PyDP/algorithms/rand.cpp b/src/bindings/PyDP/algorithms/rand.cpp index bb2bf8ec..2a27060c 100644 --- a/src/bindings/PyDP/algorithms/rand.cpp +++ b/src/bindings/PyDP/algorithms/rand.cpp @@ -1,6 +1,5 @@ // Provides bindings for rand #include "algorithms/rand.h" -#include "../pydp_lib/casting.hpp" #include "pybind11/pybind11.h" namespace py = pybind11; diff --git a/src/bindings/PyDP/algorithms/util.cpp b/src/bindings/PyDP/algorithms/util.cpp index 45394d42..bf27832e 100644 --- a/src/bindings/PyDP/algorithms/util.cpp +++ b/src/bindings/PyDP/algorithms/util.cpp @@ -1,42 +1,39 @@ -// Provides bindings for Util - -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" - -#include "algorithms/util.h" - -namespace py = pybind11; -namespace dp = differential_privacy; - -void init_algorithms_util(py::module& m) { - m.attr("__module__") = "pydp"; - m.def( - "xor_strings", &dp::XorStrings, - R"pbdoc(Character-wise XOR of two strings. 
In case of differing string lengths, operation will be performed by the repeated concatenation of the smaler string till it is of the same length as the longer before the performance of the XOR operation.)pbdoc"); - m.def("default_epsilon", &dp::DefaultEpsilon); // deprecated, default epsilon value - m.def( - "get_next_power_of_two", &dp::GetNextPowerOfTwo, - R"pbdoc(Outputs value of a power of two that is greater than and closest to the given numerical input.)pbdoc"); - m.def( - "qnorm", &dp::Qnorm, - R"pbdoc(Quantile function of normal distribution, inverse of the cumulative distribution function.)pbdoc"); - m.def( - "mean", &dp::Mean, - R"pbdoc(Calculation of the mean of given set of numbers for a double int data type.)pbdoc"); - m.def( - "mean", &dp::Mean, - R"pbdoc(Calculation of the mean of given set of numbers for an int data type.)pbdoc"); - m.def("variance", &dp::Variance, - R"pbdoc(Calculate variance for a set of values.)pbdoc"); - m.def("standard_deviation", &dp::StandardDev, - R"pbdoc(Standard Deviation, the square root of variance.)pbdoc"); - m.def("order_statistics", &dp::OrderStatistic, - R"pbdoc(Sample values placed in ascending order.)pbdoc"); - m.def("correlation", &dp::Correlation, - R"pbdoc(Returns linear correlation coefficient.)pbdoc"); - m.def( - "vector_filter", &dp::VectorFilter, - R"pbdoc(Filtering a vector using a logical operatio with only values selected using true output in their positions.)pbdoc"); - m.def("vector_to_string", &dp::VectorToString, - R"pbdoc(Conversion of a vector to a string data type.)pbdoc"); +// Provides bindings for Util + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +#include "algorithms/util.h" + +namespace py = pybind11; +namespace dp = differential_privacy; + +void init_algorithms_util(py::module& m) { + m.attr("__module__") = "pydp"; + m.def("default_epsilon", &dp::DefaultEpsilon); // deprecated, default epsilon value + m.def( + "get_next_power_of_two", &dp::GetNextPowerOfTwo, + R"pbdoc(Outputs 
value of a power of two that is greater than and closest to the given numerical input.)pbdoc"); + m.def( + "qnorm", &dp::Qnorm, + R"pbdoc(Quantile function of normal distribution, inverse of the cumulative distribution function.)pbdoc"); + m.def( + "mean", &dp::Mean, + R"pbdoc(Calculation of the mean of given set of numbers for a double int data type.)pbdoc"); + m.def( + "mean", &dp::Mean, + R"pbdoc(Calculation of the mean of given set of numbers for an int data type.)pbdoc"); + m.def("variance", &dp::Variance, + R"pbdoc(Calculate variance for a set of values.)pbdoc"); + m.def("standard_deviation", &dp::StandardDev, + R"pbdoc(Standard Deviation, the square root of variance.)pbdoc"); + m.def("order_statistics", &dp::OrderStatistic, + R"pbdoc(Sample values placed in ascending order.)pbdoc"); + m.def("correlation", &dp::Correlation, + R"pbdoc(Returns linear correlation coefficient.)pbdoc"); + m.def( + "vector_filter", &dp::VectorFilter, + R"pbdoc(Filtering a vector using a logical operatio with only values selected using true output in their positions.)pbdoc"); + m.def("vector_to_string", &dp::VectorToString, + R"pbdoc(Conversion of a vector to a string data type.)pbdoc"); } \ No newline at end of file diff --git a/src/bindings/PyDP/base/logging.cpp b/src/bindings/PyDP/base/logging.cpp index 7d0b4bcc..8ede4b2a 100644 --- a/src/bindings/PyDP/base/logging.cpp +++ b/src/bindings/PyDP/base/logging.cpp @@ -5,8 +5,7 @@ #include "pybind11/pybind11.h" -#include "../pydp_lib/casting.hpp" // our caster helper library -#include "base/logging.h" // the header file associated with logging.cc +#include "base/logging.h" // the header file associated with logging.cc namespace py = pybind11; namespace dpb = differential_privacy::base; diff --git a/src/bindings/PyDP/base/status.cpp b/src/bindings/PyDP/base/status.cpp index b5196a1b..b1724e5e 100644 --- a/src/bindings/PyDP/base/status.cpp +++ b/src/bindings/PyDP/base/status.cpp @@ -1,115 +1,110 @@ -// Provides bindings for base/status 
and related - -#include - -#include "../pydp_lib/casting.hpp" // our caster helper library -#include "pybind11/operators.h" // for overloading the operators -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" - -#include "base/canonical_errors.h" // the header file associated with status.cc -#include "base/status.h" // the header file associated with status.cc -#include "base/statusor.h" //header file associated with statusor.cc -// #include "differential_privacy/proto/data.pb.h" // for Output type - -using namespace std; - -namespace py = pybind11; -using namespace py::literals; -namespace dp = differential_privacy; -namespace dpb = differential_privacy::base; - -template -void declareStatusOr(py::module &m, string const &suffix) { - py::class_> cls(m, ("StatusOr" + suffix).c_str()); - cls.def(py::init<>()); - cls.def(py::init(), "value"_a); - cls.def(py::init(), "status"_a); - cls.def("ok", &dpb::StatusOr::ok); - // cls.def("value_or_die", &dpb::StatusOr::ValueOrDie); - // cls.def(py::self == dpbase::Status()); -} - -template -void declareStatusOr2(py::module &m, string const &suffix) { - py::class_> cls(m, ("StatusOr" + suffix).c_str()); - cls.def(py::init<>()); - // cls.def(py::init(), "value"_a); - cls.def(py::init(), "status"_a); - cls.def("ok", &dpb::StatusOr::ok); - // cls.def("value_or_die", &dpb::StatusOr::ValueOrDie); - // cls.def(py::self == dpbase::Status()); -} - -void init_base_status(py::module &m) { - // Creating the Status class - py::class_ status(m, "Status"); - status.attr("__module__") = "pydp"; - // Status class (we can now build functions and enuums from this class) - status.def(py::init()) - .def("__repr__", &dpb::Status::ToString, "String representation of status") - .def("set_payload", &dpb::Status::SetPayload, "Sets the status payload") - .def("get_payload", &dpb::Status::GetPayload, "Returns the status payload") - .def("erase_payload", &dpb::Status::ErasePayload, "Erases at target url"); - - // scoped enumerator for Status Code - 
py::enum_(status, "StatusCode", py::arithmetic()) - .value("kOk", dpb::StatusCode::kOk) - .value("kCancelled", dpb::StatusCode::kCancelled) - .value("kUnknown", dpb::StatusCode::kUnknown) - .value("kInvalidArgument", dpb::StatusCode::kInvalidArgument) - .value("kDeadlineExceeded", dpb::StatusCode::kDeadlineExceeded) - .value("kNotFound", dpb::StatusCode::kNotFound) - .value("kAlreadyExists", dpb::StatusCode::kAlreadyExists) - .value("kPermissionDenied", dpb::StatusCode::kPermissionDenied) - .value("kResourceExhausted", dpb::StatusCode::kResourceExhausted) - .value("kFailedPrecondition", dpb::StatusCode::kFailedPrecondition) - .value("kAborted", dpb::StatusCode::kAborted) - .value("kOutOfRange", dpb::StatusCode::kOutOfRange) - .value("kUnimplemented", dpb::StatusCode::kUnimplemented) - .value("kInternal", dpb::StatusCode::kInternal) - .value("kUnavailable", dpb::StatusCode::kUnavailable) - .value("kDataLoss", dpb::StatusCode::kDataLoss) - .value("kUnauthenticated", dpb::StatusCode::kUnauthenticated) - .value("kDoNotUseReservedForFutureExpansionUseDefaultInSwitchInstead_", - dpb::StatusCode:: - kDoNotUseReservedForFutureExpansionUseDefaultInSwitchInstead_); - - // converts a status code to a nice string - // status.def("status_code_to_string", &dpb::StatusCodeToString, - // "converts status code to string"); - - // canonical errors - status.def("aborted_error", &dpb::AbortedError); - status.def("aborted_error", &dpb::AbortedError); - status.def("already_exists_error", &dpb::AlreadyExistsError); - status.def("data_loss_error", &dpb::DataLossError); - status.def("deadline_exceeded_error", &dpb::DeadlineExceededError); - status.def("failed_precondition_error", &dpb::FailedPreconditionError); - status.def("internal_error", &dpb::InternalError); - status.def("invalid_argument_error", &dpb::InvalidArgumentError); - status.def("not_found_error", &dpb::NotFoundError); - status.def("out_of_range_error", &dpb::OutOfRangeError); - status.def("permission_denied_error", 
&dpb::PermissionDeniedError); - status.def("resource_exhausted_error", &dpb::ResourceExhaustedError); - status.def("unauthenticated_error", &dpb::UnauthenticatedError); - status.def("unavailable_error", &dpb::UnavailableError); - status.def("unimplemented_error", &dpb::UnimplementedError); - status.def("unknown_error", &dpb::UnknownError); - - // from statusor - m.def("handle_invalid_status_ctor_arg", - &dpb::statusor_internal::Helper::HandleInvalidStatusCtorArg); - m.def("crash", &dpb::statusor_internal::Helper::Crash, "Crash helper function"); - - declareStatusOr(m, "D"); - // declareStatusOr(m, "O"); - - // declareStatusOr2 is only a little different from declareStatusOr - // (see above in this file). - // Using declareStatusOr (without "2" at the end) below results in this error: - // external/google_dp/differential_privacy/base/statusor_internals.h:104:60: - // error: use of deleted function - // 'differential_privacy::BoundedMean::BoundedMean(differential_privacy::BoundedMean&&)' - // declareStatusOr2>(m, "BoundedMeantInt"); -} +// Provides bindings for base/status and related + +#include + +#include "pybind11/operators.h" // for overloading the operators +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +#include "base/status.h" // the header file associated with status.cc +#include "base/statusor.h" //header file associated with statusor.cc + +namespace py = pybind11; +using namespace py::literals; +namespace dp = differential_privacy; +namespace dpb = differential_privacy::base; + +template +void declareStatusOr(py::module &m, std::string const &suffix) { + py::class_> cls(m, ("StatusOr" + suffix).c_str()); + cls.def(py::init<>()); + cls.def(py::init(), "value"_a); + cls.def(py::init(), "status"_a); + cls.def("ok", &dpb::StatusOr::ok); + // cls.def("value_or_die", &dpb::StatusOr::ValueOrDie); + // cls.def(py::self == dpbase::Status()); +} + +template +void declareStatusOr2(py::module &m, std::string const &suffix) { + py::class_> cls(m, 
("StatusOr" + suffix).c_str()); + cls.def(py::init<>()); + // cls.def(py::init(), "value"_a); + cls.def(py::init(), "status"_a); + cls.def("ok", &dpb::StatusOr::ok); + // cls.def("value_or_die", &dpb::StatusOr::ValueOrDie); + // cls.def(py::self == dpbase::Status()); +} + +void init_base_status(py::module &m) { + // Creating the Status class + py::class_ status(m, "Status"); + status.attr("__module__") = "pydp"; + // Status class (we can now build functions and enuums from this class) + status.def(py::init()) + .def("__repr__", &dpb::Status::ToString, "String representation of status") + .def("set_payload", &dpb::Status::SetPayload, "Sets the status payload") + .def("get_payload", &dpb::Status::GetPayload, "Returns the status payload") + .def("erase_payload", &dpb::Status::ErasePayload, "Erases at target url"); + + // scoped enumerator for Status Code + py::enum_(status, "StatusCode", py::arithmetic()) + .value("kOk", dpb::StatusCode::kOk) + .value("kCancelled", dpb::StatusCode::kCancelled) + .value("kUnknown", dpb::StatusCode::kUnknown) + .value("kInvalidArgument", dpb::StatusCode::kInvalidArgument) + .value("kDeadlineExceeded", dpb::StatusCode::kDeadlineExceeded) + .value("kNotFound", dpb::StatusCode::kNotFound) + .value("kAlreadyExists", dpb::StatusCode::kAlreadyExists) + .value("kPermissionDenied", dpb::StatusCode::kPermissionDenied) + .value("kResourceExhausted", dpb::StatusCode::kResourceExhausted) + .value("kFailedPrecondition", dpb::StatusCode::kFailedPrecondition) + .value("kAborted", dpb::StatusCode::kAborted) + .value("kOutOfRange", dpb::StatusCode::kOutOfRange) + .value("kUnimplemented", dpb::StatusCode::kUnimplemented) + .value("kInternal", dpb::StatusCode::kInternal) + .value("kUnavailable", dpb::StatusCode::kUnavailable) + .value("kDataLoss", dpb::StatusCode::kDataLoss) + .value("kUnauthenticated", dpb::StatusCode::kUnauthenticated) + .value("kDoNotUseReservedForFutureExpansionUseDefaultInSwitchInstead_", + dpb::StatusCode:: + 
kDoNotUseReservedForFutureExpansionUseDefaultInSwitchInstead_); + + // converts a status code to a nice string + // status.def("status_code_to_string", &dpb::StatusCodeToString, + // "converts status code to string"); + + // canonical errors + status.def("aborted_error", &dpb::AbortedError); + status.def("aborted_error", &dpb::AbortedError); + status.def("already_exists_error", &dpb::AlreadyExistsError); + status.def("data_loss_error", &dpb::DataLossError); + status.def("deadline_exceeded_error", &dpb::DeadlineExceededError); + status.def("failed_precondition_error", &dpb::FailedPreconditionError); + status.def("internal_error", &dpb::InternalError); + status.def("invalid_argument_error", &dpb::InvalidArgumentError); + status.def("not_found_error", &dpb::NotFoundError); + status.def("out_of_range_error", &dpb::OutOfRangeError); + status.def("permission_denied_error", &dpb::PermissionDeniedError); + status.def("resource_exhausted_error", &dpb::ResourceExhaustedError); + status.def("unauthenticated_error", &dpb::UnauthenticatedError); + status.def("unavailable_error", &dpb::UnavailableError); + status.def("unimplemented_error", &dpb::UnimplementedError); + status.def("unknown_error", &dpb::UnknownError); + + // from statusor + m.def("handle_invalid_status_ctor_arg", + &dpb::statusor_internal::Helper::HandleInvalidStatusCtorArg); + m.def("crash", &dpb::statusor_internal::Helper::Crash, "Crash helper function"); + + declareStatusOr(m, "D"); + // declareStatusOr(m, "O"); + + // declareStatusOr2 is only a little different from declareStatusOr + // (see above in this file). 
+ // Using declareStatusOr (without "2" at the end) below results in this error: + // external/google_dp/differential_privacy/base/statusor_internals.h:104:60: + // error: use of deleted function + // 'differential_privacy::BoundedMean::BoundedMean(differential_privacy::BoundedMean&&)' + // declareStatusOr2>(m, "BoundedMeantInt"); +} diff --git a/src/bindings/PyDP/bindings.cpp b/src/bindings/PyDP/bindings.cpp index a2337f04..05369386 100644 --- a/src/bindings/PyDP/bindings.cpp +++ b/src/bindings/PyDP/bindings.cpp @@ -29,6 +29,9 @@ void init_algorithms_rand(py::module &); // proto void init_proto(py::module &); +// numerical mechanisms +void init_mechanisms_mechanism(py::module &); + PYBIND11_MODULE(_pydp, m) { m.doc() = "Google Differential Privacy python extension"; @@ -52,6 +55,9 @@ PYBIND11_MODULE(_pydp, m) { init_algorithms_rand(mutil); init_algorithms_util(mutil); + auto mnumericalmechanisms = m.def_submodule("_mechanisms", "Numerical Mechanisms."); + init_mechanisms_mechanism(mnumericalmechanisms); + // Proto // TODO: Delete if it is not necessary (we no longer return StatusOr to the user) init_proto(m); diff --git a/src/bindings/PyDP/mechanisms/mechanism.cpp b/src/bindings/PyDP/mechanisms/mechanism.cpp index d1f7c442..4e773a99 100644 --- a/src/bindings/PyDP/mechanisms/mechanism.cpp +++ b/src/bindings/PyDP/mechanisms/mechanism.cpp @@ -1 +1,153 @@ -#include "mechanism.h" \ No newline at end of file +#include +#include "pybind11/complex.h" +#include "pybind11/functional.h" +#include "pybind11/stl.h" + +#include "algorithms/numerical-mechanisms.h" + +#include "../pydp_lib/algorithm_builder.hpp" + +namespace py = pybind11; +namespace dp = differential_privacy; + +class ConfidenceIntervalBinder { + public: + static void DeclareIn(py::module& m) { + py::class_ confidence_interval(m, "ConfidenceInterval"); + confidence_interval.attr("__module__") = "pydp"; + confidence_interval + .def_property("lower_bound", &dp::ConfidenceInterval::lower_bound, + 
&dp::ConfidenceInterval::set_lower_bound) + .def_property("upper_bound", &dp::ConfidenceInterval::upper_bound, + &dp::ConfidenceInterval::set_upper_bound) + .def_property("confidence_level", &dp::ConfidenceInterval::confidence_level, + &dp::ConfidenceInterval::set_confidence_level); + } +}; + +template +py::class_& DefPyAddNoise( + py::class_& pyclass) { + using FunctorType = T (dp::NumericalMechanism::*)(T); + return pyclass.def("add_noise", + static_cast(&dp::NumericalMechanism::AddNoise), + py::arg("result")); +} + +template +std::unique_ptr downcast_unique_ptr(std::unique_ptr u_ptr) { + static_assert(std::is_base_of::value, "Illegal downcast."); + T* ptr = dynamic_cast(u_ptr.release()); + return std::unique_ptr(ptr); +} + +class NumericalMechanismBinder { + public: + static void DeclareIn(py::module& m) { + py::class_ numerical_mech(m, "NumericalMechanism", + R"pbdoc( + Base class for all (Ɛ, 𝛿)-differenially private additive noise numerical mechanisms. + )pbdoc"); + numerical_mech.attr("__module__") = "pydp"; + DefPyAddNoise(numerical_mech); + DefPyAddNoise(numerical_mech); + DefPyAddNoise(numerical_mech); + numerical_mech + .def("noised_value_above_threshold", + &dp::NumericalMechanism::NoisedValueAboveThreshold, + R"pbdoc( + Quickly determines if `result` with added noise is above certain `threshold`. + )pbdoc") + .def("memory_used", &dp::NumericalMechanism::MemoryUsed) + .def( + "noise_confidence_interval", + [](dp::NumericalMechanism& self, double cl, + double nr) -> dp::ConfidenceInterval { + auto result = self.NoiseConfidenceInterval(cl, nr); + return result.ValueOrDie(); + }, + py::arg("confidence_level"), py::arg("noised_result"), + R"pbdoc( + Returns the confidence interval of the specified confidence level of the + noise that AddNoise() would add with the specified privacy budget. 
+ If the returned value is , then the noise added has a confidence_level + chance of being in the domain [x,y] + )pbdoc") + .def_property_readonly("epsilon", &dp::NumericalMechanism::GetEpsilon, + "The Ɛ of the numerical mechanism"); + } +}; + +class LaplaceMechanismBinder { + public: + static std::unique_ptr build(double epsilon, + double l1_sensitivity) { + dp::LaplaceMechanism::Builder builder; + builder.SetEpsilon(epsilon); + builder.SetSensitivity(l1_sensitivity); + builder.SetL1Sensitivity(l1_sensitivity); + return downcast_unique_ptr( + builder.Build().value()); + } + + static void DeclareIn(py::module& m) { + py::class_ lap_mech( + m, "LaplaceMechanism"); + lap_mech.attr("__module__") = "pydp"; + lap_mech + .def(py::init([](double epsilon, double sensitivity) { + return build(epsilon, sensitivity); + }), + py::arg("epsilon"), py::arg("sensitivity") = 1.0) + .def_property_readonly("sensitivity", &dp::LaplaceMechanism::GetSensitivity, + "The L1 sensitivity of the query.") + .def_property_readonly("diversity", &dp::LaplaceMechanism::GetDiversity, + "The diversity of the Laplace mechanism."); + } +}; + +class GaussianMechanismBinder { + public: + static std::unique_ptr build(double epsilon, double delta, + double l2_sensitivity) { + dp::GaussianMechanism::Builder builder; + builder.SetEpsilon(epsilon); + builder.SetDelta(delta); + builder.SetL2Sensitivity(l2_sensitivity); + return downcast_unique_ptr( + builder.Build().value()); + }; + + static void DeclareIn(py::module& m) { + py::class_ gaus_mech( + m, "GaussianMechanism"); + gaus_mech.attr("__module__") = "pydp"; + gaus_mech + .def(py::init([](double epsilon, double delta, double l2_sensitivity) { + return build(epsilon, delta, l2_sensitivity); + }), + py::arg("epsilon"), py::arg("delta"), py::arg("sensitivity") = 1.0) + .def_property_readonly("delta", &dp::GaussianMechanism::GetDelta, + "The 𝛿 of the Gaussian mechanism.") + .def_property_readonly("std", + [](const dp::GaussianMechanism& self) { + return 
dp::GaussianMechanism::CalculateStddev( + self.GetEpsilon(), self.GetDelta(), + self.GetL2Sensitivity()); + }, + R"pbdoc( + The standard deviation parameter of the + Gaussian mechanism underlying distribution. + )pbdoc") + .def_property_readonly("l2_sensitivity", + &dp::GaussianMechanism::GetL2Sensitivity, + "The L2 sensitivity of the query."); + } +}; + +void init_mechanisms_mechanism(py::module& m) { + ConfidenceIntervalBinder::DeclareIn(m); + NumericalMechanismBinder::DeclareIn(m); + LaplaceMechanismBinder::DeclareIn(m); + GaussianMechanismBinder::DeclareIn(m); +} diff --git a/src/bindings/PyDP/proto/proto.cpp b/src/bindings/PyDP/proto/proto.cpp index f0cbecb3..0ca7faee 100644 --- a/src/bindings/PyDP/proto/proto.cpp +++ b/src/bindings/PyDP/proto/proto.cpp @@ -1,17 +1,12 @@ #include #include -#include "../pydp_lib/casting.hpp" // our caster helper library #include "pybind11/pybind11.h" +#include "proto/data.pb.h" #include "proto/summary.pb.h" -#include "proto/util.h" // the header file associated with status.cc - -using namespace std; namespace py = pybind11; -using namespace py::literals; - namespace dp = differential_privacy; void init_proto(py::module &m) { @@ -23,11 +18,12 @@ void init_proto(py::module &m) { .def(py::init()) .def("save", [](dp::Summary &pythis, std::string &filename) { - fstream output(filename, ios::out | ios::trunc | ios::binary); + std::fstream output(filename, + std::ios::out | std::ios::trunc | std::ios::binary); pythis.SerializeToOstream(&output); }) .def("load", [](dp::Summary &pythis, std::string &filename) { - fstream input(filename, ios::in | ios::binary); + std::fstream input(filename, std::ios::in | std::ios::binary); pythis.ParseFromIstream(&input); }); } diff --git a/src/bindings/PyDP/pydp_lib/algorithm_builder.hpp b/src/bindings/PyDP/pydp_lib/algorithm_builder.hpp index bc3db83f..7e61c6cc 100644 --- a/src/bindings/PyDP/pydp_lib/algorithm_builder.hpp +++ b/src/bindings/PyDP/pydp_lib/algorithm_builder.hpp @@ -146,7 +146,7 @@ 
class AlgorithmBuilder { py::arg("linf_sensitivity") = 1); } - // // No bounds constructor + // No bounds constructor pyself.def(py::init([this](double epsilon, double delta, int l0_sensitivity, int linf_sensitivity) { return this->build(epsilon, delta, std::nullopt /*percentile*/, @@ -161,8 +161,6 @@ class AlgorithmBuilder { pyself.def_property_readonly("epsilon", &Algorithm::GetEpsilon); pyself.def_property_readonly("delta", &Algorithm::GetDelta); - pyself.def("privacy_budget_left", &Algorithm::RemainingPrivacyBudget); - pyself.def("memory_used", &Algorithm::MemoryUsed); // Input data @@ -203,10 +201,6 @@ class AlgorithmBuilder { }); pyself.def("partial_result", [](Algorithm& pythis, double privacy_budget) { - if (privacy_budget > pythis.RemainingPrivacyBudget()) { - throw std::runtime_error("Privacy budget requeted exceeds set privacy budget"); - } - auto result = pythis.PartialResult(privacy_budget); if (!result.ok()) { @@ -221,13 +215,8 @@ class AlgorithmBuilder { return dp::GetValue(result.ValueOrDie()); }); - pyself.def("partial_result", [](Algorithm& pythis, double privacy_budget, - double noise_interval_level) { - if (privacy_budget > pythis.RemainingPrivacyBudget()) { - throw std::runtime_error("Privacy budget requeted exceeds set privacy budget"); - } - - auto result = pythis.PartialResult(privacy_budget, noise_interval_level); + pyself.def("partial_result", [](Algorithm& pythis, double noise_interval_level) { + auto result = pythis.PartialResult(noise_interval_level); if (!result.ok()) { throw std::runtime_error(result.status().ToString()); diff --git a/src/bindings/PyDP/pydp_lib/casting.hpp b/src/bindings/PyDP/pydp_lib/casting.hpp deleted file mode 100644 index fde072ec..00000000 --- a/src/bindings/PyDP/pydp_lib/casting.hpp +++ /dev/null @@ -1,13 +0,0 @@ -// Convenient place to store type casters that are used through out project -// This teaches pybind11 to cast types provided in the absl library - -#include "absl/strings/string_view.h" -#include 
"absl/types/optional.h" -#include "pybind11/stl.h" - -namespace pybind11 { -namespace detail { -// incase we use this - -} // namespace detail -} // namespace pybind11 diff --git a/src/bindings/PyDP/pydp_lib/helper_class.hpp b/src/bindings/PyDP/pydp_lib/helper_class.hpp deleted file mode 100644 index ac90c75c..00000000 --- a/src/bindings/PyDP/pydp_lib/helper_class.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "../../c/c_api.h" -#include "pybind11/complex.h" -#include "pybind11/functional.h" -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" - -PYBIND11_MAKE_OPAQUE(BoundedFunctionHelperObject); - -using namespace std; - -namespace py = pybind11; - -class Dummy { - public: - Dummy(double epsilon, int lower_bound, int upper_bound) { - obj = NewBoundedFunctionObject(epsilon, lower_bound, upper_bound); - } - - Dummy(double epsilon) { - obj = NewBoundedFunctionObject1(epsilon); - } - - void set_l0_sensitivity(int _l0_sensitivity) { - set_l0_sensitivity_(_l0_sensitivity); - } - int get_l0_sensitivity() { - return get_l0_sensitivity_(); - } - - void set_linf_sensitivity(int _linf_sensitivity) { - set_linf_sensitivity_(_linf_sensitivity); - } - int get_linf_sensitivity() { - return get_linf_sensitivity_(); - } - - virtual double Result(py::list) {} - - virtual double Result(py::list, double) {} - - ~Dummy() { - DeleteBoundedFunctionObject(obj); - } - - BoundedFunctionHelperObject* obj; -}; \ No newline at end of file diff --git a/src/pydp/__init__.py b/src/pydp/__init__.py index a31556f7..04ed654f 100644 --- a/src/pydp/__init__.py +++ b/src/pydp/__init__.py @@ -7,4 +7,4 @@ from pydp import util from pydp import ml -__version__ = "1.1.0" +__version__ = "1.1.1" diff --git a/src/pydp/algorithms/__init__.py b/src/pydp/algorithms/__init__.py index 4ba19e15..69b31576 100644 --- a/src/pydp/algorithms/__init__.py +++ b/src/pydp/algorithms/__init__.py @@ -1,5 +1,6 @@ # pydp relative from . import laplacian from . import partition_selection +from . 
import numerical_mechanisms -__all__ = ["laplacian", "partition_selection"] +__all__ = ["laplacian", "partition_selection", "numerical_mechanisms"] diff --git a/src/pydp/algorithms/_algorithm.py b/src/pydp/algorithms/_algorithm.py index 0687fb7a..6a982363 100644 --- a/src/pydp/algorithms/_algorithm.py +++ b/src/pydp/algorithms/_algorithm.py @@ -72,12 +72,6 @@ def linf_sensitivity(self) -> float: """ return self._linf_sensitivity - def privacy_budget_left(self) -> float: - """ - Returns the remaining privacy budget. - """ - return self.__algorithm.privacy_budget_left() - def memory_used(self) -> float: """ Returns the memory currently used by the algorithm in bytes. @@ -112,29 +106,17 @@ def quick_result(self, data: List[Union[int, float]]) -> Union[int, float]: def result( self, - privacy_budget: Union[float, None] = None, noise_interval_level: Union[float, None] = None, ) -> Union[int, float]: """ Gets the algorithm result. - The default call consumes the remaining privacy budget. - - When `privacy_budget` (defined on [0,1]) is set, it consumes only the `privacy_budget` amount of budget. - `noise_interval_level` provides the confidence level of the noise confidence interval, which may be included in the algorithm output. 
""" - if self.privacy_budget_left() == 0: - raise RuntimeError( - "Privacy Budget left is already 0, you can't do any more operations" - ) - - if privacy_budget is None: + if noise_interval_level is None: return self.__algorithm.partial_result() - elif noise_interval_level is None: - return self.__algorithm.partial_result(privacy_budget) else: - return self.__algorithm.partial_result(privacy_budget, noise_interval_level) + return self.__algorithm.partial_result(noise_interval_level) def reset(self) -> None: """ diff --git a/src/pydp/algorithms/numerical_mechanisms.py b/src/pydp/algorithms/numerical_mechanisms.py new file mode 100644 index 00000000..487f4d36 --- /dev/null +++ b/src/pydp/algorithms/numerical_mechanisms.py @@ -0,0 +1,6 @@ +from .._pydp._mechanisms import ( + NumericalMechanism, # type: ignore + GaussianMechanism, # type: ignore + LaplaceMechanism, # type: ignore + ConfidenceInterval, # type: ignore +) diff --git a/src/pydp/ml/mechanisms/sklearn_pipeline.py b/src/pydp/ml/mechanisms/sklearn_pipeline.py index cfd45aa6..2ecc58ab 100644 --- a/src/pydp/ml/mechanisms/sklearn_pipeline.py +++ b/src/pydp/ml/mechanisms/sklearn_pipeline.py @@ -1,135 +1,202 @@ import numbers -from ..util.accountant import BudgetAccountant -from .laplace import Laplace +from pydp.distributions import LaplaceDistribution # type: ignore from sklearn.base import BaseEstimator, TransformerMixin import numpy as np + class LaplaceMechanism(BaseEstimator, TransformerMixin): """ - An SKLearn Pipeline operator for applying differentially private noise - addition using the laplace mechanism. - Paper link: https://link.springer.com/content/pdf/10.1007/11681878_14.pdf + An SKLearn Pipeline operator for applying differentially private noise + addition using the laplace mechanism. 
+ Paper link: https://link.springer.com/content/pdf/10.1007/11681878_14.pdf """ - - def __init__(self, epsilon=1.0, sensitivity=1, accountant=None): - + + def __init__( + self, + epsilon=1.0, + sensitivity=1, + scale=None, + cat_feat_idxs=None, + cat_sensitivity=None, + ): + """ Checks that all parameters of the mechanism have been initialised correctly, and that the mechanism is ready to be used. - + Parameters ---------- epsilon : float or int The value of epsilon for achieving :math:`\epsilon`-differential privacy with the mechanism. Must have `epsilon > 0`. - + sensitivity : float or int The sensitivity of the mechanism. Must satisfy `sensitivity` > 0. - - accountant : BudgetAccountant, optional - Accountant to keep track of privacy budget. - - + + scale : float or int, optional + + cat_feat_idxs : list or None, optional + List of integers identifying indicies of categorical features. + + cat_feat_idxs : list or None, optional + List of integers identifying indicies of categorical features. + + cat_feat_idxs : list or None, optional + List of integers identifying indicies of categorical features. + + cat_feat_idxs : list or None, optional + List of integers identifying indicies of categorical features. + + Attributes ------- epsilon Privacy budget to calculate noise. - + sensitivty Sensitivity of the mechanism to calculate noise. - - accountant - Accountant to keep track of privacy budget. - + + scale + + + cat_feat_idxs + List of indicies that identifies categorical features. + + cat_sensitivty + Sensitivity of the mechanism to calculate noise for categorical data. + Raises ------ TypeError If epsilon is not a number, or sensitivity is not a number or a callable. - + ValueError If epsilon less than 0, or sensitivty is a number but less than 0. """ - + if not isinstance(epsilon, numbers.Number): raise TypeError(f"Epsilon must be a number. 
Got type {type(epsilon)}.") if epsilon <= 0: raise ValueError("Epsilon must be at least larger than 0.") - + self.epsilon = epsilon - + if not isinstance(sensitivity, numbers.Number): if not callable(sensitivity): - raise TypeError(f"Sensitivity must be a number or callable. Got type {type(sensitivity)}.") + raise TypeError( + f"Sensitivity must be a number or callable. Got type {type(sensitivity)}." + ) - if isinstance(sensitivity, numbers.Number) and sensitivity <= 0: raise ValueError("Sensitivity must be at least larger than 0.") - + self.sensitivity = sensitivity - self.accountant = BudgetAccountant.load_default(accountant) - - self.laplace = None # If sensitivity is callable, set lapalace to None - - if not callable(sensitivity): - self.laplace = Laplace() - + + if scale is not None: + if not isinstance(scale, numbers.Number): + raise TypeError( + f"Sensitivity must be a int or float. Got type {type(sensitivity)}." + ) + + self.scale = scale + + if (cat_feat_idxs is not None and cat_sensitivity is None) or ( + cat_feat_idxs is None and cat_sensitivity is not None + ): + raise ValueError( + "cat_feat_idxs cannot be None if cat_sensitivity, and vice versa." + ) + + self.categorical_exists = ( + cat_feat_idxs is not None and cat_sensitivity is not None + ) + + if self.categorical_exists: + + if not isinstance(sensitivity, numbers.Number): + if not callable(sensitivity): + raise TypeError( + f"Sensitivity must be a number or callable. Got type {type(sensitivity)}." + ) + + if isinstance(sensitivity, numbers.Number) and sensitivity <= 0: + raise ValueError("Sensitivity must be at least larger than 0.") + + if len(cat_feat_idxs) == 0: + raise ValueError( + "At least 1 categorical feature index must be provided." 
+ ) + + self.cat_feat_idxs = cat_feat_idxs + self.cat_sensitivity = cat_sensitivity + def sensitivity_calculation(self, X): """ - Perform local differential privacy by adding noise using Laplace mechanismto the dataset if the sensitivity + Perform local differential privacy by adding noise using Laplace mechanismto the dataset if the sensitivity provided if a callable. - - + + Parameters ---------- X : numpy.array Datset in the form of a 2-dimensional numpy array. - + Returns ------ X : numpy.array Original parameter X with differentially private noise added. """ - + n_feature = X.shape[-1] n_data = X.shape[0] - for data_idx in range(n_data): - self.accountant.check(self.epsilon, 0) for feature_idx in range(n_feature): - + # Array with data point data_idx removed for feature_idx - feature = np.concatenate((X[:data_idx,feature_idx],X[data_idx + 1:,feature_idx])) - + feature = np.concatenate( + (X[:data_idx, feature_idx], X[data_idx + 1 :, feature_idx]) + ) + # Calculate sensitivity - sensitivity_ = self.sensitivity(feature) - + if self.categorical_exists and feature_idx in cat_feat_idxs: + if isinstance(self.cat_sensitivity, numbers.Number): + sensitivity_ = self.cat_sensitivity + print(sensitivity_) + else: + sensitivity_ = self.cat_sensitivity(feature) + print(sensitivity_) + + else: + if isinstance(self.sensitivity, numbers.Number): + sensitivity_ = self.sensitivity + else: + sensitivity_ = self.sensitivity(feature) + # Initialized Laplace mechanism instance - laplace = Laplace().set_epsilon(self.epsilon).set_sensitivity(sensitivity_) - + laplace = LaplaceDistribution( + epsilon=float(self.epsilon), sensitivity=float(sensitivity_) + ) + # Add noise to the data point that was removed - noised_value = laplace.randomise(X[data_idx,feature_idx]) - + if self.scale is not None: + noised_value = X[data_idx, feature_idx] - laplace.sample( + scale=float(self.scale) + ) + else: + noised_value = X[data_idx, feature_idx] - laplace.sample() + # Replaced data point in the 
dataset with noised version - X[data_idx,feature_idx] = noised_value - - self.accountant.spend(self.epsilon, 0) + X[data_idx, feature_idx] = noised_value return X - - + def fit(self, X, y=None): return self def transform(self, X, y=None): - if self.laplace is not None: - self.laplace.set_epsilon(self.epsilon).set_sensitivity(self.sensitivity) - vector_randomise = np.vectorize(self.laplace.randomise) - noised_array = vector_randomise(X) - return noised_array - else: - X = self.sensitivity_calculation( X) - return X \ No newline at end of file + X = self.sensitivity_calculation(X) + return X diff --git a/tests/algorithms/conftest.py b/tests/algorithms/conftest.py deleted file mode 100644 index aaad16c9..00000000 --- a/tests/algorithms/conftest.py +++ /dev/null @@ -1,60 +0,0 @@ -# stdlib -from itertools import accumulate -import math -from typing import List - - -def skew(samples: List[float], mu: float, sigma: float): - """Unfortunately this is implemented in third_party/differential-privacy/cc/algorithms/distributions_test.cc - and we don't want to pull the test files in. I'm assuming it'll be moved to - third_party/differential-privacy/cc/algorithms/util.h If they (upstream) move it we can use it. - Until then this should suffice. #FIXME: when possible we can fix this. - """ - skew = list( - accumulate(samples, lambda lhs, rhs: lhs + (rhs - mu) * (rhs - mu) * (rhs - mu)) - )[-1] - return skew / (len(samples) * sigma * sigma * sigma) - - -def kurtosis(samples: List[float], mu: float, var: float): - """Unfortunately this is implemented in third_party/differential-privacy/cc/algorithms/distributions_test.cc - and we don't want to pull the test files in. I'm assuming it'll be moved to - third_party/differential-privacy/cc/algorithms/util.h If they (upstream) move it we can use it. - Until then this should suffice. #FIXME: when possible we can fix this. 
- """ - kurt = list( - accumulate(samples, lambda lhs, rhs: lhs + ((rhs - mu) * (rhs - mu)) ** 2) - )[-1] - n = len(samples) - kurt = (n + 1) * kurt / (n * var * var) - kurt -= 3 * (n - 1) - kurt *= (n - 1) / (n - 2) / (n - 3) - return kurt - - -def percentile(N, percent, key=lambda x: x): - """ - Find the percentile of a list of values. - @parameter N - is a list of values. Note N MUST BE already sorted. - @parameter percent - a float value from 0.0 to 1.0. - @parameter key - optional key function to compute value from each element of N. - @return - the percentile of the values - """ - if not N: - return None - k = (len(N) - 1) * percent - f = math.floor(k) - c = math.ceil(k) - if f == c: - return key(N[int(k)]) - d0 = key(N[int(f)]) * (c - k) - d1 = key(N[int(c)]) * (k - f) - return d0 + d1 - - -# From what I understand @openmined/dp-research are going to look at validating correctness -# Until then we can use this to assert on floating point numbers. -# FIXME: When possible we should add 'correctness' tests. 
-expect_near = lambda expected, actual, tol: ( - expected + tol >= actual and expected - tol <= actual -) diff --git a/tests/algorithms/test_bounded_mean/test_bounded_mean_int64_data.bin b/tests/algorithms/test_bounded_mean/test_bounded_mean_int64_data.bin deleted file mode 100644 index 74ec56e4..00000000 --- a/tests/algorithms/test_bounded_mean/test_bounded_mean_int64_data.bin +++ /dev/null @@ -1,2 +0,0 @@ -M -;type.googleapis.com/differential_privacy.BoundedMeanSummary�ʵ��� \ No newline at end of file diff --git a/tests/algorithms/test_count.py b/tests/algorithms/test_count.py index ccc0f58a..d425991a 100644 --- a/tests/algorithms/test_count.py +++ b/tests/algorithms/test_count.py @@ -62,9 +62,6 @@ def test_count_datatypes(self): assert isinstance(mem, int) par = count.result() assert isinstance(par, int) - # TODO - # par2 = count.partial_result(1.0) - # assert isinstance(par2, int) res = count.quick_result([2]) assert isinstance(res, int) diff --git a/tests/algorithms/test_numerical_mechanisms.py b/tests/algorithms/test_numerical_mechanisms.py new file mode 100644 index 00000000..5cba64d6 --- /dev/null +++ b/tests/algorithms/test_numerical_mechanisms.py @@ -0,0 +1,84 @@ +import numpy as np +import pytest +import pydp.algorithms.numerical_mechanisms as num_mech +from scipy.special import erfinv + + +REL_ERR_TOL = 1e-5 + + +def assert_almost_eq(val_true, val_pred): + return np.abs((val_true - val_pred) / val_true) < REL_ERR_TOL + + +def test_basic(): + num_mech_methods = { + "add_noise", + "noised_value_above_threshold", + "memory_used", + "noise_confidence_interval", + "epsilon", + } + assert num_mech_methods.issubset(set(dir(num_mech.NumericalMechanism))) + epsilon, delta, sensitivity = 1, 1e-7, 5.0 + with pytest.raises(TypeError): + # This is a abstract class, it cannot be instantiated! 
+ obj = num_mech.NumericalMechanism(epsilon, delta) + obj = num_mech.LaplaceMechanism(epsilon, sensitivity) + assert num_mech_methods.issubset(set(dir(obj))) + assert { + "memory_used", + "sensitivity", + "diversity", + }.issubset(set(dir(obj))) + obj = num_mech.GaussianMechanism(epsilon, delta, sensitivity) + assert num_mech_methods.issubset(set(dir(obj))) + assert { + "memory_used", + "l2_sensitivity", + "std", + "delta", + }.issubset(set(dir(obj))) + + +def test_laplace_mechanism(): + epsilon, sensitivity = 1, 3.0 + laplace = num_mech.LaplaceMechanism(epsilon, sensitivity) + value = 0 + value = laplace.add_noise(value) + assert type(value) is int + value = 0.0 + value = laplace.add_noise(value) + assert type(value) is float + conf_level = 0.5 + priv_budg = 0.1 + interval = laplace.noise_confidence_interval(0.5, value) + assert type(interval) is num_mech.ConfidenceInterval + bound = laplace.diversity * np.log(1 - conf_level) / priv_budg + lower_bound, upper_bound = value - bound, value + bound + assert_almost_eq(lower_bound, interval.lower_bound) + assert_almost_eq(upper_bound, interval.upper_bound) + assert conf_level == interval.confidence_level + + +def test_gaussian_mechanism(): + epsilon, delta, l2_sensitivity = 1, 1e-5, 3.0 + gaussian = num_mech.GaussianMechanism(epsilon, delta, l2_sensitivity) + value = 0 + value = gaussian.add_noise(value) + assert type(value) is int + value = 0.0 + value = gaussian.add_noise(value) + assert type(value) is float + conf_level = 0.5 + priv_budg = 0.1 + interval = gaussian.noise_confidence_interval(0.5, value) + local_gaussian = num_mech.GaussianMechanism( + priv_budg * epsilon, priv_budg * delta, l2_sensitivity + ) + assert type(interval) is num_mech.ConfidenceInterval + bound = erfinv(-conf_level) * local_gaussian.std * (2 ** 0.5) + lower_bound, upper_bound = value - bound, value + bound + assert_almost_eq(lower_bound, interval.lower_bound) + assert_almost_eq(upper_bound, interval.upper_bound) + assert conf_level == 
interval.confidence_level diff --git a/tests/algorithms/test_order_statistics.py b/tests/algorithms/test_order_statistics.py index fbb78307..4e5f8509 100644 --- a/tests/algorithms/test_order_statistics.py +++ b/tests/algorithms/test_order_statistics.py @@ -1,12 +1,7 @@ -# stdlib -import math - # verify with actual value import statistics # third party -from conftest import expect_near -from conftest import percentile import pytest # pydp absolute @@ -32,8 +27,8 @@ def test_min(data, dtype): minn = dp.algorithms.laplacian.Min( dtype=dtype, epsilon=1.0, lower_bound=0, upper_bound=200 ) - assert expect_near(min(data), minn.quick_result(data), 10) - assert expect_near(0, minn.quick_result(data), 10) + assert minn.quick_result(data) == pytest.approx(min(data), abs=10) + assert minn.quick_result(data) == pytest.approx(0, abs=10) @pytest.mark.parametrize("dtype, data", [("int", data_ints), ("float", data_floats)]) @@ -43,7 +38,7 @@ def test_max(data, dtype): ) assert 190 < maxx.quick_result(data) < 210 - assert expect_near(max(data), maxx.quick_result(data), 10) + assert maxx.quick_result(data) == pytest.approx(max(data), abs=10) @pytest.mark.parametrize("dtype, data", [("int", data_ints), ("float", data_floats)]) @@ -53,7 +48,7 @@ def test_median(data, dtype): dtype=dtype, epsilon=1.0, lower_bound=0, upper_bound=200 ) - assert expect_near(statistics.median(data), median.quick_result(data), 20) + assert median.quick_result(data) == pytest.approx(statistics.median(data), abs=20) @pytest.mark.parametrize("dtype", ["int", "float"]) diff --git a/tests/algorithms/test_partition_selection.py b/tests/algorithms/test_partition_selection.py index 1efc3053..2d3ee208 100644 --- a/tests/algorithms/test_partition_selection.py +++ b/tests/algorithms/test_partition_selection.py @@ -1,6 +1,5 @@ import numpy as np import pytest -import pydp as dp from pydp.algorithms.partition_selection import create_partition_strategy # TODO - wait for NumericalMechanism implementation to use those for 
testing Laplace/Gaussian Partition Selection. diff --git a/third_party/differential-privacy b/third_party/differential-privacy index 78d3fb8f..fc4f2abd 160000 --- a/third_party/differential-privacy +++ b/third_party/differential-privacy @@ -1 +1 @@ -Subproject commit 78d3fb8f63ea904ea6449a8276b9070254c650ec +Subproject commit fc4f2abda5052f654539fc1282ed64a827465a70