sdv-dev · R-Palazzo · Nov 13, 2023 · Nov 8, 2023 · Nov 8, 2023 · Nov 8, 2023
@@ -88,7 +88,7 @@ def get_column_plot(real_data, synthetic_data, metadata, table_name, column_name
 
 
 def get_column_pair_plot(real_data, synthetic_data, metadata,
-                         table_name, column_names, plot_type=None):
+                         table_name, column_names, plot_type=None, sample_size=None):
     """Get a plot of the real and synthetic data for a given column pair.
 
     Args:
@@ -107,6 +107,9 @@ def get_column_pair_plot(real_data, synthetic_data, metadata,
             If ``None`` select between ``box``, ``heatmap`` or ``scatter`` depending on the data
             that the column contains, ``scatter`` used for datetime and numerical values,
             ``heatmap`` for categorical and ``box`` for a mix of both. Defaults to ``None``.
+        sample_size (int or None):
+            The number of samples to plot. If ``None``, all samples are plotted.
+            Defaults to ``None``.
 
     Returns:
         plotly.graph_objects._figure.Figure:
@@ -120,6 +123,7 @@ def get_column_pair_plot(real_data, synthetic_data, metadata,
         synthetic_data,
         metadata,
         column_names,
+        sample_size,
         plot_type
     )
 

@@ -107,7 +107,8 @@ def get_column_plot(real_data, synthetic_data, metadata, column_name, plot_type=
     )
 
 
-def get_column_pair_plot(real_data, synthetic_data, metadata, column_names, plot_type=None):
+def get_column_pair_plot(
+        real_data, synthetic_data, metadata, column_names, plot_type=None, sample_size=None):
     """Get a plot of the real and synthetic data for a given column pair.
 
     Args:
@@ -124,6 +125,9 @@ def get_column_pair_plot(real_data, synthetic_data, metadata, column_names, plot
             If ``None`` select between ``box``, ``heatmap`` or ``scatter`` depending on the data
             that the column contains, ``scatter`` used for datetime and numerical values,
             ``heatmap`` for categorical and ``box`` for a mix of both. Defaults to ``None``.
+        sample_size (int or None):
+            The number of samples to use for the plot. If ``None`` use the whole dataset.
+            Defaults to ``None``.
 
     Returns:
         plotly.graph_objects._figure.Figure:
@@ -164,6 +168,11 @@ def get_column_pair_plot(real_data, synthetic_data, metadata, column_names, plot
                 format=datetime_format
             )
 
+    require_subsample = sample_size and sample_size < min(len(real_data), len(synthetic_data))
+    if require_subsample:
+        real_data = real_data.sample(n=sample_size)
+        synthetic_data = synthetic_data.sample(n=sample_size)
+
     return visualization.get_column_pair_plot(
         real_data,
         synthetic_data,

@@ -81,11 +81,11 @@ def test_get_column_pair_plot(mock_plot):
     mock_plot.return_value = 'plot'
 
     # Run
-    plot = get_column_pair_plot(data1, data2, metadata, 'table', ['col1', 'col2'])
+    plot = get_column_pair_plot(data1, data2, metadata, 'table', ['col1', 'col2'], 2)
 
     # Assert
     call_metadata = metadata.tables['table']
-    mock_plot.assert_called_once_with(table1, table2, call_metadata, ['col1', 'col2'], None)
+    mock_plot.assert_called_once_with(table1, table2, call_metadata, ['col1', 'col2'], None, 2)
     assert plot == 'plot'
 
 

@@ -402,3 +402,60 @@ def test_get_column_pair_plot_with_invalid_sdtype_and_plot_type(mock_get_plot):
     assert mock_get_plot.call_args[0][2] == columns
     assert mock_get_plot.call_args[0][3] == 'heatmap'
     assert plot == mock_get_plot.return_value
+
+
+@patch('sdmetrics.visualization.get_column_pair_plot')
+def test_get_column_pair_plot_with_sample_size(mock_get_plot):
+    """Test ``get_column_pair_plot`` subsamples both tables to ``sample_size`` rows."""
+    # Setup: two three-row numerical tables, so ``sample_size=2`` requires subsampling
+    columns = ['amount', 'price']
+    real_data = pd.DataFrame({
+        'amount': [1, 2, 3],
+        'price': [10, 20, 30],
+    })
+    synthetic_data = pd.DataFrame({
+        'amount': [1., 2., 3.],
+        'price': [11., 22., 33.],
+    })
+    metadata = SingleTableMetadata()
+    metadata.add_column('amount', sdtype='numerical')
+    metadata.add_column('price', sdtype='numerical')
+
+    # Run
+    get_column_pair_plot(real_data, synthetic_data, metadata, columns, sample_size=2)
+
+    # Assert: the mock receives 2-row frames; ``isin`` matches them to the inputs by label
+    real_subsample = mock_get_plot.call_args[0][0]
+    synthetic_subsample = mock_get_plot.call_args[0][1]
+    assert len(real_subsample) == 2
+    assert len(synthetic_subsample) == 2
+    assert real_subsample.isin(real_data).all().all()
+    assert synthetic_subsample.isin(synthetic_data).all().all()
+
+
+@patch('sdmetrics.visualization.get_column_pair_plot')
+def test_get_column_pair_plot_with_sample_size_too_big(mock_get_plot):
+    """Test ``get_column_pair_plot`` skips sampling when ``sample_size`` exceeds the data."""
+    # Setup: two three-row numerical tables, fewer rows than the ``sample_size`` of 10
+    columns = ['amount', 'price']
+    real_data = pd.DataFrame({
+        'amount': [1, 2, 3],
+        'price': [10, 20, 30],
+    })
+    synthetic_data = pd.DataFrame({
+        'amount': [1., 2., 3.],
+        'price': [11., 22., 33.],
+    })
+    metadata = SingleTableMetadata()
+    metadata.add_column('amount', sdtype='numerical')
+    metadata.add_column('price', sdtype='numerical')
+
+    # Run
+    plot = get_column_pair_plot(real_data, synthetic_data, metadata, columns, sample_size=10)
+
+    # Assert: the mock receives both tables unchanged and ``scatter`` as the plot type
+    pd.testing.assert_frame_equal(mock_get_plot.call_args[0][0], real_data)
+    pd.testing.assert_frame_equal(mock_get_plot.call_args[0][1], synthetic_data)
+    assert mock_get_plot.call_args[0][2] == columns
+    assert mock_get_plot.call_args[0][3] == 'scatter'
+    assert plot == mock_get_plot.return_value