-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'auto-doc-generation' of https://github.com/OpenCOMPES/sed…
… into auto-doc-generation
- Loading branch information
Showing
5 changed files
with
880 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,303 @@ | ||
Binning demonstration on locally generated fake data | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
||
In this example, we generate a table with random data simulating a | ||
single event dataset. We showcase the binning method, first on a simple | ||
single table using the bin_partition method and then with the distributed | ||
method bin_dataframe, using dask dataframes. The first method is never | ||
really called directly, as it is simply the function called by the | ||
bin_dataframe on each partition of the dask dataframe. | ||
|
||
.. code:: ipython3 | ||
import sys | ||
import dask | ||
import numpy as np | ||
import pandas as pd | ||
import dask.dataframe | ||
import matplotlib.pyplot as plt | ||
sys.path.append("../") | ||
from sed.binning import bin_partition, bin_dataframe | ||
Generate Fake Data | ||
------------------ | ||
|
||
.. code:: ipython3 | ||
n_pts = 100000 | ||
cols = ["posx", "posy", "energy"] | ||
df = pd.DataFrame(np.random.randn(n_pts, len(cols)), columns=cols) | ||
df | ||
.. raw:: html | ||
|
||
<div> | ||
<style scoped> | ||
.dataframe tbody tr th:only-of-type { | ||
vertical-align: middle; | ||
} | ||
.dataframe tbody tr th { | ||
vertical-align: top; | ||
} | ||
.dataframe thead th { | ||
text-align: right; | ||
} | ||
</style> | ||
<table border="1" class="dataframe"> | ||
<thead> | ||
<tr style="text-align: right;"> | ||
<th></th> | ||
<th>posx</th> | ||
<th>posy</th> | ||
<th>energy</th> | ||
</tr> | ||
</thead> | ||
<tbody> | ||
<tr> | ||
<th>0</th> | ||
<td>1.805042</td> | ||
<td>-0.363059</td> | ||
<td>-0.220710</td> | ||
</tr> | ||
<tr> | ||
<th>1</th> | ||
<td>-1.418791</td> | ||
<td>2.031562</td> | ||
<td>-0.383197</td> | ||
</tr> | ||
<tr> | ||
<th>2</th> | ||
<td>-0.451853</td> | ||
<td>1.449618</td> | ||
<td>-1.603912</td> | ||
</tr> | ||
<tr> | ||
<th>3</th> | ||
<td>0.249470</td> | ||
<td>0.426805</td> | ||
<td>-1.106285</td> | ||
</tr> | ||
<tr> | ||
<th>4</th> | ||
<td>-0.676840</td> | ||
<td>1.790051</td> | ||
<td>1.289248</td> | ||
</tr> | ||
<tr> | ||
<th>...</th> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
</tr> | ||
<tr> | ||
<th>99995</th> | ||
<td>0.555341</td> | ||
<td>1.035233</td> | ||
<td>1.645474</td> | ||
</tr> | ||
<tr> | ||
<th>99996</th> | ||
<td>0.966108</td> | ||
<td>-0.246699</td> | ||
<td>0.682800</td> | ||
</tr> | ||
<tr> | ||
<th>99997</th> | ||
<td>-0.696395</td> | ||
<td>-0.210527</td> | ||
<td>0.983446</td> | ||
</tr> | ||
<tr> | ||
<th>99998</th> | ||
<td>2.308371</td> | ||
<td>0.440244</td> | ||
<td>-0.380476</td> | ||
</tr> | ||
<tr> | ||
<th>99999</th> | ||
<td>0.885163</td> | ||
<td>-1.144483</td> | ||
<td>-1.426119</td> | ||
</tr> | ||
</tbody> | ||
</table> | ||
<p>100000 rows × 3 columns</p> | ||
</div> | ||
|
||
|
||
|
||
Define the binning range | ||
------------------------ | ||
|
||
.. code:: ipython3 | ||
binAxes = ["posx", "posy", "energy"] | ||
nBins = [120, 120, 120] | ||
binRanges = [(-2, 2), (-2, 2), (-2, 2)] | ||
coords = {ax: np.linspace(r[0], r[1], n) for ax, r, n in zip(binAxes, binRanges, nBins)} | ||
Compute the binning along the pandas dataframe | ||
---------------------------------------------- | ||
|
||
.. code:: ipython3 | ||
%%time | ||
res = bin_partition( | ||
part=df, | ||
bins=nBins, | ||
axes=binAxes, | ||
ranges=binRanges, | ||
hist_mode="numba", | ||
) | ||
.. parsed-literal:: | ||
CPU times: user 1.25 s, sys: 7.64 ms, total: 1.25 s | ||
Wall time: 1.25 s | ||
.. code:: ipython3 | ||
fig, axs = plt.subplots(1, 3, figsize=(8, 2.5), constrained_layout=True) | ||
for i in range(3): | ||
axs[i].imshow(res.sum(i)) | ||
.. image:: 1%20-%20Binning%20fake%20data_files/1%20-%20Binning%20fake%20data_8_0.png | ||
|
||
|
||
Transform to dask dataframe | ||
--------------------------- | ||
|
||
.. code:: ipython3 | ||
ddf = dask.dataframe.from_pandas(df, npartitions=50) | ||
ddf | ||
.. raw:: html | ||
|
||
<div><strong>Dask DataFrame Structure:</strong></div> | ||
<div> | ||
<style scoped> | ||
.dataframe tbody tr th:only-of-type { | ||
vertical-align: middle; | ||
} | ||
.dataframe tbody tr th { | ||
vertical-align: top; | ||
} | ||
.dataframe thead th { | ||
text-align: right; | ||
} | ||
</style> | ||
<table border="1" class="dataframe"> | ||
<thead> | ||
<tr style="text-align: right;"> | ||
<th></th> | ||
<th>posx</th> | ||
<th>posy</th> | ||
<th>energy</th> | ||
</tr> | ||
<tr> | ||
<th>npartitions=50</th> | ||
<th></th> | ||
<th></th> | ||
<th></th> | ||
</tr> | ||
</thead> | ||
<tbody> | ||
<tr> | ||
<th>0</th> | ||
<td>float64</td> | ||
<td>float64</td> | ||
<td>float64</td> | ||
</tr> | ||
<tr> | ||
<th>2000</th> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
</tr> | ||
<tr> | ||
<th>...</th> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
</tr> | ||
<tr> | ||
<th>98000</th> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
</tr> | ||
<tr> | ||
<th>99999</th> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
</tr> | ||
</tbody> | ||
</table> | ||
</div> | ||
<div>Dask Name: from_pandas, 1 graph layer</div> | ||
|
||
|
||
|
||
Compute distributed binning on the partitioned dask dataframe | ||
------------------------------------------------------------- | ||
|
||
In this example, the small dataset does not give significant improvement | ||
over the pandas implementation, at least using this number of | ||
partitions. A single partition would be faster (you can try…) but we use | ||
multiple for demonstration purposes. | ||
|
||
.. code:: ipython3 | ||
%%time | ||
res = bin_dataframe( | ||
df=ddf, | ||
bins=nBins, | ||
axes=binAxes, | ||
ranges=binRanges, | ||
hist_mode="numba", | ||
) | ||
.. parsed-literal:: | ||
0%| | 0/50 [00:00<?, ?it/s] | ||
.. parsed-literal:: | ||
CPU times: user 639 ms, sys: 250 ms, total: 889 ms | ||
Wall time: 796 ms | ||
.. code:: ipython3 | ||
fig, axs = plt.subplots(1, 3, figsize=(8, 2.5), constrained_layout=True) | ||
for dim, ax in zip(binAxes, axs): | ||
res.sum(dim).plot(ax=ax) | ||
.. image:: 1%20-%20Binning%20fake%20data_files/1%20-%20Binning%20fake%20data_13_0.png | ||
|
||
|
Binary file added
BIN
+117 KB
docs/tutorial/1 - Binning fake data_files/1 - Binning fake data_13_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+256 KB
docs/tutorial/1 - Binning fake data_files/1 - Binning fake data_8_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.