Skip to content

Commit

Permalink
Merge pull request #11 from ONSdigital/dev_invalidate_impala_metadata
Browse files Browse the repository at this point in the history
Add CDSW Module: impala.py
  • Loading branch information
dombean authored Sep 19, 2023
2 parents b0f3405 + 9b4a99a commit fb95f1f
Show file tree
Hide file tree
Showing 3 changed files with 169 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ and this project adheres to [semantic versioning](https://semver.org/spec/v2.0.0
- Add the helpers_python.py and test_helpers_python.py modules from cprices-utils.
- Add `init_logger_advanced` in `helpers/logging.py` module.
- Add in the general validation functions from cprices-utils.
- Add `invalidate_impala_metadata` function to the `cdsw/impala.py` module.

### Changed

Expand Down
77 changes: 77 additions & 0 deletions rdsa_utils/cdsw/impala.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Utilities for working with Impala."""
import logging
import subprocess
from typing import Optional

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


def invalidate_impala_metadata(
    table: str,
    impalad_address_port: str,
    impalad_ca_cert: str,
    keep_stderr: Optional[bool] = False,
) -> None:
    """Automate the invalidation of a table's metadata using impala-shell.

    This function runs the impala-shell command with the given
    impalad_address_port and impalad_ca_cert, to invalidate a specified
    table's metadata.

    It proves useful during a data pipeline's execution after writing to an
    intermediate Hive table. Using Impala Query Editor in Hue, end-users often
    need to run "INVALIDATE METADATA" command to refresh a table's metadata.
    However, this manual step can be missed, leading to potential use of
    outdated metadata.

    The function automates the "INVALIDATE METADATA" command for a given table,
    ensuring up-to-date metadata for future queries. This reduces manual
    intervention, making outdated metadata issues less likely to occur.

    Parameters
    ----------
    table
        Name of the table for metadata invalidation. The value is inserted
        verbatim into the statement passed to impala-shell, so it should
        come from trusted configuration rather than end-user input.
    impalad_address_port
        'address:port' of the impalad instance.
    impalad_ca_cert
        Path to impalad's CA certificate file.
    keep_stderr
        If True, the impala-shell command's stderr output is logged at INFO
        level.

    Returns
    -------
    None

    Notes
    -----
    A non-zero exit status from impala-shell does not raise an exception.
    Instead a WARNING is logged so that a failed invalidation is visible
    even when ``keep_stderr`` is False. Exceptions raised by
    ``subprocess.run`` itself (for example when the impala-shell executable
    cannot be found) are propagated unchanged.

    Examples
    --------
    >>> invalidate_impala_metadata(
    ...     'my_table',
    ...     'localhost:21050',
    ...     '/path/to/ca_cert.pem'
    ... )
    >>> invalidate_impala_metadata(
    ...     'my_table',
    ...     'localhost:21050',
    ...     '/path/to/ca_cert.pem',
    ...     keep_stderr=True
    ... )
    """
    # The command is passed as an argument list (no shell), so the values
    # are handed to impala-shell as-is without shell interpretation.
    result = subprocess.run(
        [
            'impala-shell',
            '-k',
            '--ssl',
            '-i',
            impalad_address_port,
            '--ca_cert',
            impalad_ca_cert,
            '-q',
            f'invalidate metadata {table};',
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # Surface failures: without this, a failed invalidation would pass
    # silently and later queries could keep using stale metadata.
    if result.returncode != 0:
        logger.warning(
            'impala-shell exited with return code %s while invalidating '
            'metadata for table %s; the metadata may be stale.',
            result.returncode,
            table,
        )

    if keep_stderr:
        # errors='replace' keeps logging from failing on undecodable bytes.
        logger.info(result.stderr.decode(errors='replace'))
91 changes: 91 additions & 0 deletions tests/cdsw/test_impala.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""Tests for impala.py module."""
import subprocess

from rdsa_utils.cdsw.impala import invalidate_impala_metadata


class TestInvalidateImpalaMetadata:
    """Tests for invalidate_impala_metadata function."""

    def test_invalidate_impala_metadata(self, mocker):
        """Check the impala-shell invocation and the optional stderr logging.

        Parameters
        ----------
        mocker : pytest_mock.MockFixture
            Fixture used to replace subprocess.run() and Logger.info().

        Notes
        -----
        The test covers two calls to the function:

        1. Without keep_stderr: subprocess.run() must receive the expected
           impala-shell argument list with stdout and stderr piped.
        2. With keep_stderr=True: the same command is expected, and the
           decoded stderr of the completed process must be the only
           message passed to logger.info() over the whole test.
        """
        # Inputs shared by both calls
        table = 'test_table'
        impalad_address_port = 'localhost:21050'
        impalad_ca_cert = '/path/to/ca_cert.pem'

        # The command line and keyword arguments expected on every call
        expected_command = [
            'impala-shell',
            '-k',
            '--ssl',
            '-i',
            impalad_address_port,
            '--ca_cert',
            impalad_ca_cert,
            '-q',
            f'invalidate metadata {table};',
        ]
        expected_kwargs = {
            'stdout': subprocess.PIPE,
            'stderr': subprocess.PIPE,
        }

        # Replace the external process call and the logging call
        run_mock = mocker.patch('subprocess.run')
        info_mock = mocker.patch('logging.Logger.info')

        # First call: default keep_stderr
        invalidate_impala_metadata(table, impalad_address_port, impalad_ca_cert)
        run_mock.assert_called_with(expected_command, **expected_kwargs)

        # Second call: keep_stderr=True with a known stderr payload
        run_mock.reset_mock()
        run_mock.return_value = subprocess.CompletedProcess(
            args=['dummy'], returncode=0, stdout=b'', stderr=b'Test Error',
        )

        invalidate_impala_metadata(
            table,
            impalad_address_port,
            impalad_ca_cert,
            keep_stderr=True,
        )

        run_mock.assert_called_with(expected_command, **expected_kwargs)
        info_mock.assert_called_once_with('Test Error')

0 comments on commit fb95f1f

Please sign in to comment.