diff --git a/narwhal/.assets/diagram-primary.svg b/narwhal/.assets/diagram-primary.svg new file mode 100644 index 0000000000000..30541c1fd5937 --- /dev/null +++ b/narwhal/.assets/diagram-primary.svg @@ -0,0 +1,3 @@ + + +
IN
IN
Receiver
(maintains one TCP connection per client)
Receiver...
IN
IN
IN
IN
IN
IN
Receiver
(maintains one TCP connection per client)
Receiver...
IN
IN
IN
IN
Handle messages from our workers
Handle messages from our w...
Handle messages from other primaries
Handle messages from other...
Signature Service
(signs headers)
Signature Service...
Core
(handles headers, votes, certificates)
Core...
Garbage Collector
(updates the GC round)
Garbage Collector...
Payload Receiver
(stores the batch digests)
Payload Receiver...
Header Waiter
(requests missing batches and certificates)
Header Waiter...
Certificate Waiter
(waits for all the history of certificates)
Certificate Waiter...
Proposer
(make new headers)
Proposer...
Helper
(replies to batch requests)
Helper...
Consensus
(orders certificates)
Consensus...
Reliable Sender
(to other primaries)
Reliable Sender...
OUT
OUT
OUT
OUT
OUT
OUT
Simple Sender
(to other primaries)
Simple Sender...
OUT
OUT
OUT
OUT
OUT
OUT
Simple Sender
(to other primaries or our workers)
Simple Sender...
OUT
OUT
OUT
OUT
OUT
OUT

Legend

Legend...
tokio task
tokio task
tokio channel
tokio channel
Network
Network
Storage
Storage
CPU
CPU
W
W
R
R
R
R
W
W
R
R
atomic reader
atomic reader
atomic writer
atomic writer
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/narwhal/.assets/diagram-worker.svg b/narwhal/.assets/diagram-worker.svg new file mode 100644 index 0000000000000..7ac32db606700 --- /dev/null +++ b/narwhal/.assets/diagram-worker.svg @@ -0,0 +1,3 @@ + + +

Legend

Legend...
Reliable Sender
(to other workers)
Reliable Sender...
Simple Sender
(to other workers)
Simple Sender...
Simple Sender
(to other workers)
Simple Sender...
Simple Sender
(to our primary)
Simple Sender...
Batch Maker
(assembles txs into batches)
Batch Maker...
QuorumWaiter
(waits for a quorum to acks our batch)
QuorumWaiter...
Processor
(hashes and stores batches)
Processor...
IN
IN
Helper
(replies to batch requests)
Helper...
Processor
(hashes and stores batches)
Processor...
Synchronizer
(requests missing batches)
Synchronizer...
Receiver
(maintains one TCP connection per client)
Receiver...
IN
IN
IN
IN
IN
IN
Receiver
(maintains one TCP connection per client)
Receiver...
IN
IN
IN
IN
Receiver
(maintains one TCP connection per client)
Receiver...
IN
IN
PrimaryConnector
(sends batch digests to our primary)
PrimaryConnector...
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
OUT
Handle messages from our primary
Handle messages from our p...
Handle clients transactions
Handle clients transactions
Handle messages from other workers
Handle messages from other...
tokio task
tokio task
tokio channel
tokio channel
Network
Network
Storage
Storage
CPU
CPU
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/narwhal/.assets/diagram.drawio b/narwhal/.assets/diagram.drawio index 0f0318e5dfa0d..28424420ec74d 100644 --- a/narwhal/.assets/diagram.drawio +++ b/narwhal/.assets/diagram.drawio @@ -1 +1 @@ -7V3bcts2EP0azbQPzoh36jF24qTTpnXjtGkeYRESWVOEQlKx3K8vwIskApAEyySxtOWZRCIEgdQ5u9hd7BIcWVeL9YcULcNPJMDxyBwH65H1bmSahm269IW1PJYt3sQuG+ZpFFSdtg230X+4ahxXrasowFmjY05InEfLZuOUJAme5o02lKbkodltRuLmWZdojoWG2ymKxdavUZCHZavvjLftH3E0D+szG+PqkwWqO1cNWYgC8rDTZL0fWVcpIXn5brG+wjEDr8al/N71nk83F5biJFf5wn3yD/k1up7/nUbv//745/XFL7/eXBhjqxznB4pX1U8emW5MR7xcsovOHysk3O8rdqWXcZTgi7D6yW9pF8Oi53W2Hei7efUal6/XS74tNOqWryS9xykd5l2E5ila1D3uUsk426/tXOOhs5mNn2DmeF2cPV/EtMGgb7M8Jff4isQkpS0JSWjPy1kUx1xTtkTTKJnTBmd79IUsacMFxd26fAijHN/SdnaqB6oLtI38wOksLjgPoyDACW1LySoJMKOEfQvF0Zy2vovxbHvBu5RWLNOB6LXvNFUUf8BkgfP0kXapPnUraavVrRbUh63wmn7VFu4Irl21oUpf5puRtyJF31RS9QQJ801BwD7jOEJ3lBRzfIspGqlI+k85oZ+SPCxk46EQkuxngdAmmjIOdriW0Mpwjaiyv614uCN5Thb0gwBlYTGu0Q4tltnkxXElvNgSXjbzSfvE2AIxt9Fi+aposeGx4pxZmQBUFvdkWlaMlGUaLVD6OGBSfI4UfyySYphOn6TYvgAnDqj3Vh1WCDUR/ne1WFZepstMdJqHZE4SFP9GCmvO0PoX5/lj1QmtKIsNEiiE6eM/bLg3Tn34rRq9OHi3bhw9Vkelq1E7kSb7fB3l5UCmUx1+q09C32/HYQf1MHupzKikTfEhX6/sl6N0jvMD/axK1hmWByUjxTHKox9NT7l9lidnlrtg2QPFcu2nnllul2UbFsvGS2HZg8SyCYrk2rUQVxTuatfoEuXTkPb5hO5lntNOML/5yk8oy/CCRo0ZQ2TN/o+SwsO6Y4Phgw6vcdy3asFFcsZNF8mW+K2GKfFb3a48JMcctr6drjqm6gQJSnXEtRJBdf5ckXS1+IqoMKcyPZHEIQ+0M9OXGWGhCKL/vheDMHyYAqHpfbaJVAptAqBLPhcD+tp1ydKhS23qhKVqTmApxf4V6o3Q36RkirOMqNqSkAXHTOZREhSUkrQ4hGNKOPH3pNF2r+KvJdiWuGAnGpfT1cZWVBsDlNaIq7u//K5drG0uPWFJPCTZwl5n2QlPj4O0CSPUowgITpWnakBgrS14Wix3F1FnH0GnMsmwlhY8+0xyByTDWgv2jruCH3G8VF5TSPEyjsoVhc1KQvFjvq9wlgPwAyc+tCUF/xV643zuSz8LtmjSzlNbCc1EcWqzJqCmNvtsvzog2QYWlDlnkjsgeQyK5MlxC3n7mEzDlCSUMXVXpfRJ6MiLKMtYMSIgC2mAS324A04olyMFb1n98vZaact1xEAovqFW33M8xThWVDLDBKVk7oAzyRDphRXMu3qW5V4suwYsN6i+7kMm8jOeYnohquZxgaIkp/+KRGbCyjK/XN2MNveFRCRhGBcFslMa8ie5fpvJlwtsVrn15Xi0lNpByPHUCjKwJE992bCyPK4JLMvjeK9Wrs1hyrUJUa4daHJtiDeLiIIuuCIN52MHoab87yrIVvjfb1t5DXlCovGolO0A6kgArduUhbE6ww
2hTsJob5GF5XNElepVfWvLlTCQOz4yUKl+wkAF6Zuf/Qx3WXeC09OWkjZcxQnOBzXBuVpWe9tZ0Nvh3ezLjKmyDKvwwBDvGgNgxnxoRTjugO/w6TPaV03aG7AyW955JbZdemGZMu+8EtsuvbBKbgyFmpuXv1Y3AbdW58Jb0+jNG/QH6Q26UCron+fzPy3W64FlWObQEAvkAPj8xhja2pWrpRRmuNqgWgwDSxtc90Ww3JtlU2UZlmWrLxvWnMdvI6R9yquv57xcf9JyPV8Lf/JyvTHWvF5va5kV2wuZGyJ5LH4+fTasOx6/S8UENR1upqpXHTLzJaH6Q2ZbS8gM4cbNWkGOahKsleP6smH5FYYNzK/wBiXXTzIerSqB6v4XFjBzorQBRrG/5FVpDpTvvMso2ps77ehrEM2rewyetnelntvw9G+KYYq7LPzx1xftQE34fW0lm+f0G/mIiXAIOBm8j6IfKDHbAgIoPmLSD5TsPuRzLC2ivC8E5opAXH6XXvVg2j8yUsfBdG2tgakMX2WzMV66NMaCaassaLbKgmmr+LtF9OME01TZ4HA6W6rnWCp+yzBnMnkz2fnzTzNbNr+/5OFhu7ZhYgIFgi7x0OtWpXpgYDCZJjScxAVoCDgZ4HASXcfz1PyEIMJrzpmO08VMbUyec5aOJ24Pyh2zvW7MolqZ1f5Gwnsyw1wM4brtSB5fNOWZOmXN31+JOyPFdYtP62MfXGSFHBXP6nOX6/1P6tvuoYeSoHi00QJnGZoXm7HNUsI2DG8sCR/af5y2lVfV+rP46ofmTancYtouPANpEQVBMcPKTFpTF1t/SlJTYAzZruVunybO379fX1dCUyagiyRCipIMFenp7CwrQqQKT1j2b13V4wzTfI7dWW6EOQac3ExkcsO7RXEcLbN9iO1QhLJl+SDfWbRmsIlc7PI1Mq1x8dcOtl4TWt+SQCtB1ugK2U2C6hC0T3qGYFMDduHbB6uAfwc4jx3dOMtqtnicQ7K4W2VDxdixNWMs3fZp+LLscMtk0rKEfoFWEebhAe2rbDLZL9AKC0gDBJr3TAEALauBGqSHIcwWkrxwz9iKieGBYitMEDLPuF9sVXZoGgS2ExfcnCAm6geKLbjpVlKvJ9q1YXnDtkLVds8gy9Zzhw0yn+HTD3IdwL8gkH1wkmy9vOCZdyV87UGdpM5x6CDzu6o4+iVZwRceGMgWPJAVnOKBgcxXtujHWMW5eEr9z36cj3nK18VfSw6Gf3xa9iVAW90BDWRXn73gdl18IVDi+G+c5iCqBRd8waFfb0jfU4WFISkwzMl9xO6Fy1F2LzD9/ARjjGf589KLbcxeSukYiVqZnamVpISxZmIaoiTB8ashw55oJ0N0PX/HOUvQv1QSJvA4EFfAb3OSIjq5vxIOTP2TkhgdXN389Vrwl0bA/eKvEji0k27jmJBw1QLC3EKOJ9khxZLdz26ccEM7PUwJK8raOkwUjvATCTDr8T8=7V1Zc5s6G/41mTnfRTrsxpeJs/acNM7S9DQ33yhGNrQYEcCx3V9/JAwGhIyxwyIvmenUCCHL7/PueiVO5N54du0B17xDBrRPJMGYncgXJ5IkKpKG/yMt80VLp6ssGkaeZUSdkoYn6w+MGoWodWIZ0M90DBCyA8vNNg6Q48BBkGkDnoem2W5DZGe/1QUjmGt4GgA73/rDMgJz0aqrQtJ+A62RGX+zKER3xiDuHDX4JjDQNNUkX57IPQ+hYPFpPOtBmxAvpsvN2ZmrqOaw+/zxAF+1s9Phfed0MdjVJo8sf4IHnWDroV3Yh2ePSO15F6+3j9Ir7Hw8nYrR2B/AnkQEi35sMI8p6KGJY0AyinAin09NK4BPLhiQu1PMM7jNDMY2vhLxRz/w0G/YQzbywqdlIfzDd4BtjRzcZsMh/hHnQ8u2424OcvBo5yV/Zjxl6AVwlgI5+tnXEI1h4M1xl+iuEgEYc3AM6DThB0WP2sw0L2hRI4h4cLQcOqEz/hCRmk
329+/ajXL+973a+ehcB/Bq2r29PZXkHJWhgdk2uozokSX8r8nYjcRLw5fIC0w0Qg6w/0HIjaj/CwbBPOoEJgGisQFecEZkKvmOsO3KItMPvwU6RtxjYAPftwaLxqhLAnEsTFJ4HxPlXzLCFzW+/LkcEF9czDJX8/hqZgWpx/DVz+g7yOfkIXIRP7MZk/ho4g0iCp/e3Nhn7575s69+04fCdf96akUyI2AqjGA03gd4+dWVhp3XP8/3s6+m+P/z7u1pBBhBqZDlPGiDwPrIKp/P8E/RrFNSe/utQG7F9XJbl+yJUlb4JJ0hfAzZUyoQPSbpxBydGpC8bQUkK2piVj6juSbCSZ4wgG8uMa9YVpS8rDD7aVzJSt6gHQGvGHCBK8Bbsa2HBbjKFeAKw4fVbOJkvuEPI/LhEQ4gnoe3vOHFd+IW/M3L3n+NgeUE+J+PhyUYSMJzr58EKRZyCNXJcMLAtjDt/8ep9dWy1nd5nbK+SwudNr/1eb7q0fPlw/NVd9LzVXfG8116ui14vmzRU46ix4foaTspetrOiJ7YougxSddheCi0KObcvoyjl6JaVkLTIpyI52XSSstwXpI2g2EtN6aIrDKIHLeVZtroG/oI+2Qp9Upl9WSBAm8hhdFT6bwoPZC2ZqCFmOYGChlh+bO35w19PW94Jhq/Tfz1ArU6+5pOt7KysgOMPfZna5RJYb1MiqwkbG3mUNtXc9jZNXvYLWkPy2qgyu0hk4HkVlKJ9THQFnyQYjppI6ZriYE6XDlU3Z1xqPQSyrvZLL6QI9QxyVdSWEQ5Ly3sjnyl+cRW9O2BYd7lC/Lj4k0DYq7whbnMCIiO6fzQDCvcpfP1ow+8beBVtbwzFu92IYpqxZPjlYNajaJKcxBfYZSYX//lNY7qarytCXWO0seL/i67IMuX/pa6Rw7iRX+X5aDK9Td7ZaUrUysrnWZXVsTdKRbQWzQMbNrlV3vzcn5cstxMHqSKliy769Y+6xasEuvZ+7Bm2SlRR1DXmuXkwela0+fXzuU7uBlN78yZ32fU/cf5jiEKGSQBQHufoPjGqR/K0hnuIGruLLlJZ02WKZMb4Bg2SZSMoe+DESSZk6GHxiSBMiF5kinyfkPPZ6VbUm2LWcXNFIdgEIIiVoh0CmOjUI4BCKTWANhn0Y2xZRihImHxXlYZfSIFt8HiN524ERiMpNWk35mcxNr31TgnBWaYc3M9aww8Cx65qaSzILXITewgoOU8fcbpTpL2K/P0qeAhCQ3S8QOVvN88fogjhtSMaqimYHr3Zbe0tRZHFs26KOv/hMURBBOPaJMn6H1YmAzlFgB8/KQfCgMwsNXiNL2vUavsMsvdaDS9v6e52dKRfcWSGe9/T0kmlPr3b3Nsn4POK5z0vw5vvH68ZY2TDM+elTnFRmNpQTYqkuOddSpX6uwIVFOzqkqhVVB1ESjb4WjV38iyTirpV8bhOEiN1tpOBib3tLIR5pOZ5Q0rUerzcOvnIGa/yjeVh4/inw/mqQ4uUVj+ar1Hh16qJlDcuBixUmXXSqH6dnaygXUZqYky9U+xqs6VsmulQmdFaM5BYB6v8vLMQK0V5LH9/5ZrrbnSPyLvvNNazXbRrIuyOj3klU3jmGEuOZXIOZF6ZHQUwOjjgKRuh9YA4BZOszwder9hk1keJkjSepCugfdGDtqThB6ybTgIkFcSs4lrgBAeITDJ89cEphAITvGhs3Cd1vEpURDdB3MbAQP32rA02sdIptB5A8HAxP8b1gj6Aa8iRB9r0n6iVDzWQVds7xi1qGzKt2bw2NM5lsO1xQh8ec3Snq6d7AAjtJZsLJp2kfW+Cb1Z3OkHsILSptuD7xNipPFzYwvD6IxiAx4adOwr74ZDTJfFaU3uaiqKoQqjloSum+I2xZ3DKhjiQwsgnM7C+zIt4o4R0qDhbmAnCjpv4LEKV2lv2UMu8jfYPviboOPA6QnvJQR0cNkoHC/yJURm8Hh58TgPruX5nSA8t5W83sLElb
KdbaeBOnkjyCS71JYNZM6mFaf4yAStOULM2bSVRz50LuCrzo+1dSDnDtvuBl6wa1uLDBZKJbBi55hTO53bCtJ6Bmu/NoJWLGp6XtSKMumciBrrZMHc4ovjQ8ef+CWlDXmLZZddiE3oRH6jvjCzCipf4/wIbQu82YuKZsdgab2/Qr2W2ytRRHRhPdEZ+yDobQ9vKAjQ+KShTQ0KFUiqjE0NzC2QIr35rTK48ouX99+f+eR1leJ1lfWOo5p2hBSVMO8C7TSNM9rl1/u4pZ3KG+1Y2VbarzluW15B+RW13nTNo9z90k396dkBy25i1pSNhq2uoJzJN6yULy1we7CjWaHllbV1sKYdzczgNO8SPVlj9+gQMVLrCiNGrM8jKkolVGmZ8ocm1kZQ+ghyFvPXZayKkrS7YOhF+pj81olXg5fUJC/yRk7ODGCDUAjaej3b6JsJxLxe2NYokjbSkDm0Y98sJX14Eqtisz5DWViDsguqXaf8DL1BVVRYybULxBMFhTfq5TU5t9SjD7Jqn3gHcrIVfZI0s8i9JpvHfBH96uM/3Az14/OGbMuBp/HMwuOIyGGD6uoDifCsXLrNFJOKIGIwCYEuLDDywLhwPSR5LjXJoq+jOGj7s4p8zGyk0E++UJOr53C1jVScslkSYb4Y2mHCy7QMAzq51FbMdzYcBjVyHW2nZYad1msSdybXsbYnUVjhUSzXh+ulHfguHJAJD60ZIWxemFdJfV3k7lDKVWxZxvNubYFVKuGIcqxfO7R6VdslPWsL0R6aNZrjZaVdspdYjdgiQVGKxBQWV+FfnY6wvt6jYOl2uS7at/Si6ZqXfD65lkOjpMv6F1VI/qTsgNu+Q1OXu9mBqlu9YWKdr/kO0G+LZEQC4P+u0P+iHKVPnBRZlxyWMT0sOZTqksN8RBVjMzCB40D7oOHR4/Pz24InXyT2DQYkUXhQsHTXo7JMFjYCSzcHy1OAvHAL/gHDIrDc6CZhiT3FFC69/vdDxoSdOaoJk+fx6dnFu/oAhvD18m4kv989zMqcHPuDfbL0ylDfN4FLGgc2mhhVRUSrjpuuC6jcQRZqHim5Jo+cCRQr/qdf7HeIQOVOhGkbqDJvYOQZqCYXj8Usdhojp9kodiW2vx6kNpRL5CcaBarERiiuhawpoHQp7/Q1ClQ+QgIBGluDMBuzOLjhgNw/RVofKtUVwDLhyUdKS3im3uJYhgOGp8n0jweuL+fo6vkfoFmXQ/exO37ymN45rcCOBfkrKL8iiSvRFfliJRX5okSX5BePW11Sl8k5rHDhyDmf4xyBign1ijhHoK1CU5zz3hmpD4bx8keZweevd7fQ/vZSSufs7rI+/TIx5t7xuupVWeTe7yoKUVA5o3eZPW67S2+9yxm5S1TG7zC5u2V2yTdJ7jIvnt1dcud2GzRJb2l6+/Lv4OP729Ww92K/3N8A9e8yxnJfqrJUvT3VwqR9Ccu5L7Sn1brS4M5TJu1LhBf7Qnua72vUOfjSQ6QAO/HnMaXMO2RA0uM/ \ No newline at end of file 
+7V1td5s2FP41Oaf7kB7ewR+btGl3tm7Z0nXrR2Jkw4KRC3Lj7NdP4sU2kmwrCaBL4pzT2ghZxs9zr+6LLuLMvlysP+bhMv6MI5SeWUa0PrPfn1mW6VgefWEtD1WLb/tVwzxPorrTtuEm+Q/VjUbdukoiVLQ6EoxTkizbjVOcZWhKWm1hnuP7drcZTtvfugznSGi4mYap2Pp3EpG4ag1cY9v+CSXzuPlm06jPLMKmc91QxGGE73ea7A9n9mWOManeLdaXKGXgNbhUn7vac3ZzYTnKiMoH7rJ/8C/J1fxrnnz4+umPq/Off7k+Nx23GudHmK7qn1xfLnloMMjxKosQG8Y4sy/u44Sgm2U4ZWfvKeu0LSaLlB6Z9G1BcnyHLnGK8/LTtlH+0TNhmswz2paiGb3gi1mSpk23DGd0tAvxJ9W/8gfKCVrvNNU/8SPCC0TyB9qlPuvUaDfi1hzfb8mzG0biXeK8ujGsBWa+GXqLKX1Tw/oYiA1bArGXMgyWLaC97ysmDBdpkqHz5uLe0S6mTS/O3Xag7+b1a1q9Xi35tthsWn5Fc5RFTfttLvn0tvPOlR36Dk5CKDnkkBjU/EooL6ggJdmcNrjboy+YCtX7c2ufuGEqD7O0VKY4iSJEpeqiLaScrHUgWB4nWK4oWFYgESynL7kKLEGs/kRpEt5SUizjhlKOcpH0NwTTs5jE7KRxj/M7lBc/PU/lJbQyXBM6i76rebjFhOAFPRGFRVyOa3ZDy0aZa15cT8KLI1N4ozdiHIGYm2SxfFW0OPBYEQ3dq2NlAlBZvCfTsmKkLPNkEeYPIyYl4EgJDJEU03KHJMUJBDhRRN3i+rBGqI3wv6vFsnbfPWaicxLjOc7C9FdcWnOG1r+IkIe6U7iiLLZIoBDmD/+w4d66zeG3evTy4P26dfRQH1WuRuOdW+z8OiHVQJZbH35rvoS+347DDpph9lJZUEmbokMeXtWPhPkckQP97FrWGZYHJSNHaUiSH+0QpHuWJyeW+2DZB8Vy46eeWO6WZQcWy+ZLYdmHxLIFiuTGtRDzCLeNa3QRkmlM+3wO72Se004wv/nIm7Ao0IJGjQVDZM3+T7LSw7plg6GDDq953LfqwEVyjbaL5Ej8VlOW1ektqeOKWbJR6dvTVcdSnSBBqY6YKxFU548VzleLv0MqzLlMTyRxyD3tzPRlhlkoEtJ/38tBGD5MgcLpXbGJVEptAqBLARcDBtp1ydahS13qhK1qTmApxf689Ebor3M8RUWBVW1JzIJjJvMhyzhTSnFeHsIxJZz4+9Joe1Dx1xJsS1ywJxqXp6uNo6g2JiitEbO7P/+mXawdbnnClnhIssReb6sTvh4HaRNGqEcREJwqX9WAwMot+Fosdx9R5xBBpzLJsFILvnMiuQeSYeWC/eOu4CeULpVzCjlapkmVUdhkEsof832FCgLAD5wE0FIKwSv0xvm1L/0sOKJJO01tFTQTxanNnoCa2pyT/eqBZAdYUOaeSO6BZAMUyZPjFvLmIZvGOc4oY+quSuWT0JEXSVGwYkRAFtIEt/ThjXhBuRopescKw7fXSluuEgZC+Qm1+p7jS4yGopKZFigl80a8kgyRXljBvKcnLfdi2TVhuUHNdR8ykX+iKaIXomoeF2GSEfqvXMjMWFnml8vrs80NNwnOGMZlgeyUhvwZ0W8z+XIBS3ITyMBrPFpK7SCs8TQKMrJFnuayYa3yeBawVR7Xf7VybY1Tri2Icu1Ck2tT4a5I0RVpOR87CLXlf1dBtsL/YdvKa8gjFhqPStkOoK4E0KZNWRjrb7jG1Ek421tkYQccUZV61Z/aciUM5BlHBqrUTxioJH3zs5/hLute4PS1LUmbnuIEF4Ca4Dwt2d5uEno7vFtDmTFVlmEVHpjiXWMAzFgArQjHG/EdPkNG+6qL9iaslS3/lIntll5Ypsw/ZWK7pRdWyY2pUHPz8nN1E3C5Og9eTmMwbzAYpTfoQamgf57P/7hYbwCWYZlDUyy
QA+Dzmwa03JWnpRRmvNqgWgwDSxs870WwPJhlU2UZlmVrLhvWnMdvI6R9ymuu55Suf1K6nq+Ff3K63jQ05+sdLbNidyFzSySPxc9Pnw2bjsfvUrFATYebqepVh8x8Saj+kNnREjJDuHGzUZCjmgQrc9xcNiy/wnSA+RX+qOT6UcajUyVQ3f/CBmZOlDbAKPeXvKzMgfKddwVFe3OnHX2Nknl9j8Hj9q7Ucxue/k0xLHGXhd//+qIdqAm/r61k85xhIx9xIRwCTibvo+gHSlxtAQEUHzHpB0p2H/IplhZR3hcCc0UgHr9Lr3owHRwZqedgurHWwFSGr7LZGC9dGmPDtFU2NFtlw7RV/N0i+nGCaaoccDidLNVzLBW/ZZg7mbyd7PwFTzNbDr+/5OFh+7Zh4gIKBF3iodetSs3AwGCyLGg4iQloCDiZ4HASXcfT1PyIIMJvz5mu28dMbU6e8y09T9w+lDtmB92YRbUyq/uNhPesDHMxhOd1I3l80ZRv6ZS1YH8l7gyX1y0+o4+dOC9KOSqf0Oct1/ufz7fdQy/MovLRRgtUFOG83IxtlmO2YXgrJXxo/3HaVl1V58/iax6aN6Vyi2i78AykRRJF5QwrM2ltXez8KUltgTFlu5Z7Q5q4YP9+fX0JTbUAXS4i5GFWhOXydHGSFSFShScs+7euGnCGaT/H7iQ3whwDTm4mMrnh3aI0TZbFPsR2KAqLZfWE5FmyZrCJXOzytfvk3g6w9dvQBrYEWgmyZl/IbhaoBnjw8T5YBfx7wNlwdeMsq9nicY7x4nZVjBVj19GMsXTbp/HLssulyaRlCcMCrSLM4wM6UNlkcligFRJIIwSa90wBAC2rgRqlhyHMFpJ14YGxFReGR4qtMEHIPONhsVXZoWkU2E48cHOCuFA/UmzBTbeSej3Rro3LG3YUqrYHBlmWzx03yPwKn36QmwD+BYEcgJNk++UFz7wrEWgP6iR1jmMHmd9VxdUvyQq+8MhAtuGBrOAUjwxkvrJFP8YqzsVj6n/243zMU74q/zpyMILj03IgAdruD2ggu/rsBbfv4guBEjd467YHUS244AsOg2ZD+oEqLExJgSHBdwm7F46ExZ3A9PMXGFM0I89bXuxi9lJajpGoldWbWklKGBsmpnGYZSh9NWQ4E+1kiK7nb4iwBfqXSsIEHgdiBvyG4Dykk/sr4cDSPymJ0cHl9V+vBX9pBNwN/vQwx6xiaGvN6Y+KP+MIsR7/Aw==7V1bc5s4FP41mdl9SIc7+DF1rt2mcS5Nt3nZUYxsaDEigGO7v34lDDYIGWOHi3zJTKdGCFk+37nrSJzI3dH0ygeedYtM6JxIgjk9kc9PJElUJA3/R1pm8xZd1ucNQ982407Lhkf7D4wbhbh1bJswyHQMEXJC28s29pHrwn6YaQO+jybZbgPkZL/VA0OYa3jsAyff+sM2Q2veaqjCsv0a2kMr+WZRiO+MQNI5bggsYKJJqkm+OJG7PkLh/NNo2oUOIV5Cl+uzM09RrUHn6f0evmhnp4M7/XQ+2OUmjyx+gg/dcOuhPdiDZw9I7frnLzcP0gvU3x9PxXjsd+CMY4LFPzacJRT00dg1IRlFOJE/Tyw7hI8e6JO7E8wzuM0KRw6+EvHHIPTRb9hFDvKjp2Uh+sN3gGMPXdzmwAH+EZ8HtuMk3Vzk4tE+l/yZyZShH8JpCuT4Z19BNIKhP8Nd4rtKDGDCwQmgkyU/KEbcZqV5QYsbQcyDw8XQSzrjDzGp2WR/+65dK5//uVP1d/0qhJeTzs3NqSTnqAxNzLbxZUyPLOF/jUdeLF4avkR+aKEhcoHzFSEvpv4vGIazuBMYh4jGBvjhGZGp5XdEbZc2mX70LdA1kx59BwSB3Z83xl2WECfCJEX3MVH+JSN8UpPLn4sB8cX5NHM1S66mdph6DF/9jL+DfF4+RC6SZzZjkgCN/X5M4dPra+fszbd+9tRvxkC46l1N7FhmBEyFIYzHewfPvzrSQH/583Q
3/WKJ/33u3JzGgBGUClnOhw4I7fes8vkI/xTNOiW1N98K5FZcL7d1yZ4oZYVPMhjCx5A9pQLRY5JOzNGpAcnbVkCyoiZm5TOe61I4yRMmCKwF5hXLipKXFWY/jStZyRu0I+AVAy5wBXgrtvWwAFe5Alxh+LCaQ5zMV/xhSD48wD7E8/AXN/zkTtKCv3nR+68RsN0Q/wvwsAQDSXjq9pZBio1cQnUynNB3bEz7vzm1vlrW+i6uU9Z3YaHT5rc+z1c9er58eL7qTnq+6s54vgtPtwXPly16ylH0+BA9bSdFT9sZ0RNbFD0m6XSGh0KLYs7tyzh6KaplJTQtwkvxvFi20jKcl6TNYFjLjSkiqwwiJ22lmTb+hh7CPllKvVJZPVmgwJtLYfxUOi9KD6StGWguprmBIkZY/OztecNYzxu+hUav42C9QK3OvqbTraysbB9jj/3ZGmVSWC+TIisJW5s51PbVHOq7Zg87Je1hWQ1UuT1kMpDcSiqxPgbagg9STCdtxHQtMZDOlUPV2RmHyiihvJvN4gs5Qh2TfCWFRZTz0sLuyFeaT2xF3x4Y5h2+ID8u3jQg5gpfmMuMgOiYzo/MsMJdOt84+sDbBl5Vyztj8W4XoqhWPDleOajVKKo0B/EVRon59V9e46iOxtuakH6UPl70d9kFWb70t9Q5chAv+rssB1Wuv9krKx2ZWlnRm11ZEXenWMBo0TCwaZdf7c3L+XHJcjN5kCpasuysW/usW7BKrGfvw5qlXqKOoK41y/G927EnTy/6xRu4Hk5urWnQY9T9J/mOAYoYZAmA9jZGyY3TIJKlM9xB1Lzp8iadNVmkTK6BazokUTKCQQCGkGROBj4akQTKmORJJsj/Df2AlW5Jtc1nlTRTHIJBCItYIdYpjI1COQYgkNp94JzFN0a2aUaKhMV7WWX0gRTcBovfdOJGYDCSVpN+Z3ISa99X45wUWlHOzfPtEfBteOSmks6C1CI3sYOAlvP0Gad7mbRfmadPBQ/L0CAdP1DJ+83jhyRiSM2ohmoKpndfdktba3Fk0ayLsv6PWBxBOPaJNnmE/ruNyVBuASDATwaRMAATWy1O0/satcous9yNRtP7e5qbLR3ZVyyZyf73lGRCqXf3OsP2OdRf4Lj3ZXDt95Ita5xkePaszCkxGgsLslGRHO+sU7lSZ0egmppVVQqtgqqLQNkOR6v+RpZ1Ukm/Mg7HQWq01nYyMLmnlY0wH8wsb1iJUp+HWz8HMftVvqk8ehT/fDBLdfCIwgpW6z069FI1geLG+YiVKrtWCtW3s5MNrMtITZSpf4hVDa6UXSsVOitCcw4C82SVl2cGaq0gj+3/t1xrzZX+EXnnndZqtotmXZTV6SK/bBrHinLJqUTOidQlo6MQxh/7JHU7sPsAt3Ca5dHp/YZNZnmYIEnrQboC/is5aE8SushxYD9EfknMxp4JIniE0CLPXxGYIiA4xYfOwumt41OiILoHZg4CJu61YWl0gJFMofMKwr6F/zftIQxCXkWIPtak/USpeKyDrtjeMWpR2ZRvzeCxp3Msh2uLEfjymqU9XTvZAUZoLdlYNO0i630debO40w9gh6VNtw/fxsRI4+dGNobRHSYGPDLo2FfeDYeYLovTmtzVVBRDFUYtS7puitsEd46qYIgPLYBoOnPvy7KJO0ZIgwa7gZ0oGLyBxypcpb1lH3ko2GD74G+CjgsnJ7yXENDBZaNwPMsXEFnhw8X5wyy8kme3gvDUVvJ6CxNXyna2nQbS80aQSXapLRvInE0rTvGRCVpzhJizaSuPfOhcwFedH2vrQM4ddrwNvGDPsecZLJRKYCXOMad2OrcVpPUM1n5tBK1Y1Iy8qBVl0jkRNdbJgrnFFzeAbjAOSkob8ufLLrsQm9CJ/EZ9YWYVVL7G+QE6Nnh15hXNrsnSen9Fei23V6KI6MJ6ojP2QdDbHl5RGKLRSUObGhQqkFQZmxqYWyBFevNbZXDlFy/vvj/xyesqxesq6x1
HNe0IKSph3gXaaRpntMuv93FLO5U32rGyrbRfc9y2vILyK2q96ZpHufOpk/ozsgOW3cSsKRsNW11BOZNvWClfWuD2YEezQssra+tgTTuamcFp3iV6tEfe0SFipNYVRoxYn0dUlEqo0jLlD02sjaD0EeQs5q/LWBUlaXfB0Iv0MfmtE68GL6lJXuSNnJwZwAahELT1erbRNxOIeb2wrVEkbaQhc2jHvllK+vAkVsVmfYaysAZlF1S7QfkZRoOqqLCSaxeIJwoKb9TLa3JuqUcfZNU+8Q7kZCv6JGlmkXtNNo/5IvrVx394Geon5w05tgtPk5lFxxGRwwbV1QcS4Vl5dJslJi1f4RCSermCRZBl59TMir6DYpvtDygKMIeR6j75XF1ePUVLbKTMlM2HCDPDwImyXJZtmtDN5bMSZnPgIKyR1WjjLDOMs1GTjDNZjbUnicIKj2J7AVwv4iDwYJ9MeGBPCWHzErxK1Osit05pVLFlwc77sgWmqIT3ybFS1WmdqrZLeta+oT20ZTTHy0q7ZC+xBLFFVqIUiSksLqO/Or1fY70bwdLtcl20b+nt0jWv83xwAYdGyZCNT6qw/JOyA2774kxD7mQHqm7Jhol1vtA7RL9tkgYJQfC7Qv+LcpQ+cDxkXXJYxvSw5FCqSw7zYVSCTd8Crgudg4bHSA7NbwuefGXYNxiS7OBBwdJZj8oiQ9gILJ0cLI8h8qN99wcMi8Byo5uEJfEUU7h0e98PGRN2uqgmTJ5Gp2fnb+o9GMCXi9uh/HZ7Py1zXOwP9nHSK0P9wAIeaew7aGxWFRGtOmO6LqByp1eoeaTkmjxyJlCs+J9+m98hApU7BqZtoMq8dpFnoJpcMRaz2GmMnGaj2JXY83qQ2lAukZ9oFKgSu5+4FrKmgDKkvNPXKFD5CAmEaGT3o2zM/LSGA3L/FGl9qFRXAMuEJx8pLeCZ+POzGA4YnibTPz64upihy6evQLMvBt5DZ/ToM71zWoEdq/BXUH5FEleiy/DFSsrwRYmuwy8et7qkLpNzWOHCkXM+xjkCFRMaFXGOQFuFpjjnTR+q96b5/EeZwqcvtzfQ+fZcSufs7rI+/QYx5obxuopUWeTe7yoKUVA5o3eZjW27S2+jwxm5S5TD7zC5O2W2xjdJ7jJvm91dcue2GDRJb2ly8/xv//376+Wg++w8310D9Z8yxnJfqrJUoz3VwqR9Ccu5L7Sn1brS4HZTJu1LhBf7Qnua72vUOfjSR6TqeunPY0pZt8iEpMf/ \ No newline at end of file diff --git a/narwhal/README.md b/narwhal/README.md index 769a1beb55dd1..b26cd81052631 100644 --- a/narwhal/README.md +++ b/narwhal/README.md @@ -1,7 +1,7 @@ # Narwhal and Tusk [![build status](https://img.shields.io/github/workflow/status/facebookresearch/narwhal/Rust/master?style=flat-square&logo=github)](https://github.com/facebookresearch/narwhal/actions) -[![rustc](https://img.shields.io/badge/rustc-1.52+-blue?style=flat-square&logo=rust)](https://www.rust-lang.org) +[![rustc](https://img.shields.io/badge/rustc-1.51+-blue?style=flat-square&logo=rust)](https://www.rust-lang.org) [![license](https://img.shields.io/badge/license-Apache-blue.svg?style=flat-square)](LICENSE) This repo 
provides an implementation of [Narwhal and Tusk](https://arxiv.org/pdf/2105.11827.pdf). The codebase has been designed to be small, efficient, and easy to benchmark and modify. It has not been designed to run in production but uses real cryptography ([dalek](https://doc.dalek.rs/ed25519_dalek)), networking ([tokio](https://docs.rs/tokio)), and storage ([rocksdb](https://docs.rs/rocksdb)). @@ -24,10 +24,12 @@ This command may take a long time the first time you run it (compiling rust code SUMMARY: ----------------------------------------- + CONFIG: + Faults: 0 node(s) Committee size: 4 node(s) + Worker(s) per node: 1 worker(s) + Collocate primary and workers: True Input rate: 50,000 tx/s Transaction size: 512 B - Faults: 0 node(s) Execution time: 19 s Header size: 1,000 B @@ -49,5 +51,10 @@ This command may take a long time the first time you run it (compiling rust code ----------------------------------------- ``` +## Next Steps +The next step is to read the paper [Narwhal and Tusk: A DAG-based Mempool and Efficient BFT Consensus](https://arxiv.org/pdf/2105.11827.pdf). It is then recommended to have a look at the README files of the [worker](https://github.com/facebookresearch/narwhal/tree/master/worker) and [primary](https://github.com/facebookresearch/narwhal/tree/master/primary) crates. An additional resource to better understand the Tusk consensus protocol is the paper [All You Need is DAG](https://arxiv.org/abs/2102.08325) as it describes a similar protocol. + +The README file of the [benchmark folder](https://github.com/facebookresearch/narwhal/tree/master/benchmark) explains how to benchmark the codebase and read benchmarks' results. It also provides a step-by-step tutorial to run benchmarks on [Amazon Web Services (AWS)](https://aws.amazon.com) accross multiple data centers (WAN). + ## License This software is licensed as [Apache 2.0](LICENSE). 
diff --git a/narwhal/benchmark/.gitignore b/narwhal/benchmark/.gitignore index d7980e9b7486f..04dda9987481b 100644 --- a/narwhal/benchmark/.gitignore +++ b/narwhal/benchmark/.gitignore @@ -1,6 +1,7 @@ node benchmark_client results +plots # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/narwhal/benchmark/README.md b/narwhal/benchmark/README.md new file mode 100644 index 0000000000000..05768180474c5 --- /dev/null +++ b/narwhal/benchmark/README.md @@ -0,0 +1,238 @@ +# Running Benchmarks +This document explains how to benchmark the codebase and read benchmarks' results. It also provides a step-by-step tutorial to run benchmarks on [Amazon Web Services (AWS)](https://aws.amazon.com) across multiple data centers (WAN). + +## Local Benchmarks +When running benchmarks, the codebase is automatically compiled with the feature flag `benchmark`. This enables the node to print some special log entries that are then read by the python scripts and used to compute performance. These special log entries are clearly indicated with comments in the code: make sure not to alter them (otherwise the benchmark scripts will fail to interpret the logs). + +### Parametrize the benchmark +After cloning the repo and [installing all dependencies](https://github.com/facebookresearch/narwhal#quick-start), you can use [Fabric](http://www.fabfile.org/) to run benchmarks on your local machine. Locate the task called `local` in the file [fabfile.py](https://github.com/facebookresearch/narwhal/blob/master/benchmark/fabfile.py): +```python +@task +def local(ctx): + ... +``` +The task specifies two types of parameters, the *benchmark parameters* and the *nodes parameters*. 
The benchmark parameters look as follows: +```python +bench_params = { + 'nodes': 4, + 'workers': 1, + 'rate': 50_000, + 'tx_size': 512, + 'faults': 0, + 'duration': 20, +} +``` +They specify the number of primaries (`nodes`) and workers per primary (`workers`) to deploy, the input rate (tx/s) at which the clients submit transactions to the system (`rate`), the size of each transaction in bytes (`tx_size`), the number of faulty nodes (`faults`), and the duration of the benchmark in seconds (`duration`). The minimum transaction size is 9 bytes; this ensures that the transactions of a client are all different. The benchmarking script will deploy as many clients as workers and divide the input rate equally amongst each client. For instance, if you configure the testbed with 4 nodes, 1 worker per node, and an input rate of 1,000 tx/s, the scripts will deploy 4 clients each submitting transactions to one node at a rate of 250 tx/s. When the parameter `faults` is set to `f > 0`, the last `f` nodes and clients are not booted; the system will thus run with `n-f` nodes (and `n-f` clients). + +The nodes parameters determine the configuration for the primaries and workers: +```python +node_params = { + 'header_size': 1_000, + 'max_header_delay': 100, + 'gc_depth': 50, + 'sync_retry_delay': 10_000, + 'sync_retry_nodes': 3, + 'batch_size': 500_000, + 'max_batch_delay': 100 +} +``` +They are defined as follows: +* `header_size`: The preferred header size. The primary creates a new header when it has enough parents and enough batches' digests to reach `header_size`. Denominated in bytes. +* `max_header_delay`: The maximum delay that the primary waits between generating two headers, even if the header did not reach `header_size`. Denominated in ms. +* `gc_depth`: The depth of the garbage collection (Denominated in number of rounds). +* `sync_retry_delay`: The delay after which the synchronizer retries to send sync requests. Denominated in ms. 
+* `sync_retry_nodes`: Determines how many nodes to sync with when retrying to send sync requests. These nodes are picked at random from the committee. +* `batch_size`: The preferred batch size. The workers seal a batch of transactions when it reaches this size. Denominated in bytes. +* `max_batch_delay`: The delay after which the workers seal a batch of transactions, even if `batch_size` is not reached. Denominated in ms. + +### Run the benchmark +Once you have specified both `bench_params` and `node_params` as desired, run: +``` +$ fab local +``` +This command first recompiles your code in `release` mode (and with the `benchmark` feature flag activated), thus ensuring you always benchmark the latest version of your code. This may take a long time the first time you run it. It then generates the configuration files and keys for each node, and runs the benchmarks with the specified parameters. It finally parses the logs and displays a summary of the execution similarly to the one below. All the configuration and key files are hidden JSON files; i.e., their name starts with a dot (`.`), such as `.committee.json`. 
+``` +----------------------------------------- + SUMMARY: +----------------------------------------- + + CONFIG: + Faults: 0 node(s) + Committee size: 4 node(s) + Worker(s) per node: 1 worker(s) + Collocate primary and workers: True + Input rate: 50,000 tx/s + Transaction size: 512 B + Execution time: 19 s + + Header size: 1,000 B + Max header delay: 100 ms + GC depth: 50 round(s) + Sync retry delay: 10,000 ms + Sync retry nodes: 3 node(s) + batch size: 500,000 B + Max batch delay: 100 ms + + + RESULTS: + Consensus TPS: 46,478 tx/s + Consensus BPS: 23,796,531 B/s + Consensus latency: 464 ms + + End-to-end TPS: 46,149 tx/s + End-to-end BPS: 23,628,541 B/s + End-to-end latency: 557 ms +----------------------------------------- +``` +The 'Consensus TPS' and 'Consensus latency' respectively report the average throughput and latency without considering the client. The consensus latency thus refers to the time elapsed between the block's creation and its commit. In contrast, 'End-to-end TPS' and 'End-to-end latency' report the performance of the whole system, starting from when the client submits the transaction. The end-to-end latency is often called 'client-perceived latency'. To accurately measure this value without degrading performance, the client periodically submits 'sample' transactions that are tracked across all the modules until they get committed into a block; the benchmark scripts use sample transactions to estimate the end-to-end latency. + +## AWS Benchmarks +This repo integrates various python scripts to deploy and benchmark the codebase on [Amazon Web Services (AWS)](https://aws.amazon.com). They are particularly useful to run benchmarks in the WAN, across multiple data centers. This section provides a step-by-step tutorial explaining how to use them. + +### Step 1. Set up your AWS credentials +Set up your AWS credentials to enable programmatic access to your account from your local machine. 
These credentials will authorize your machine to create, delete, and edit instances on your AWS account programmatically. First of all, [find your 'access key id' and 'secret access key'](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html#cli-configure-quickstart-creds). Then, create a file `~/.aws/credentials` with the following content: +``` +[default] +aws_access_key_id = YOUR_ACCESS_KEY_ID +aws_secret_access_key = YOUR_SECRET_ACCESS_KEY +``` +Do not specify any AWS region in that file as the python scripts will allow you to handle multiple regions programmatically. + +### Step 2. Add your SSH public key to your AWS account +You must now [add your SSH public key to your AWS account](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html). This operation is manual (AWS exposes little APIs to manipulate keys) and needs to be repeated for each AWS region that you plan to use. Upon importing your key, AWS requires you to choose a 'name' for your key; ensure you set the same name on all AWS regions. This SSH key will be used by the python scripts to execute commands and upload/download files to your AWS instances. +If you don't have an SSH key, you can create one using [ssh-keygen](https://www.ssh.com/ssh/keygen/): +``` +$ ssh-keygen -f ~/.ssh/aws +``` + +### Step 3. Configure the testbed +The file [settings.json](https://github.com/facebookresearch/narwhal/blob/master/benchmark/settings.json) (located in [narwhal/benchmarks](https://github.com/facebookresearch/narwhal/blob/master/benchmark)) contains all the configuration parameters of the testbed to deploy. 
Its content looks as follows: +```json +{ + "key": { + "name": "aws", + "path": "/absolute/key/path" + }, + "port": 5000, + "repo": { + "name": "narwhal", + "url": "https://github.com/facebookresearch/narwhal.git", + "branch": "master" + }, + "instances": { + "type": "m5d.8xlarge", + "regions": ["us-east-1", "eu-north-1", "ap-southeast-2", "us-west-1", "ap-northeast-1"] + } +} +``` +The first block (`key`) contains information regarding your SSH key: +```json +"key": { + "name": "aws", + "path": "/absolute/key/path" +}, +``` +Enter the name of your SSH key; this is the name you specified in the AWS web console in step 2. Also, enter the absolute path of your SSH private key (using a relative path won't work). + + +The second block (`port`) specifies the base TCP port to use: +```json +"port": 5000, +``` +Narwhal requires a number of TCP ports, depending on the number of workers per node. Each primary requires 2 ports (one to receive messages from other primaries and one to receive messages from its workers), and each worker requires 3 ports (one to receive client transactions, one to receive messages from its primary, and one to receive messages from other workers). Note that the script will open a large port range (5000-7000) to the WAN on all your AWS instances. +The third block (`repo`) contains the information regarding the repository's name, the URL of the repo, and the branch containing the code to deploy: +```json +"repo": { + "name": "narwhal", + "url": "https://github.com/facebookresearch/narwhal.git", + "branch": "master" +}, +``` +Remember to update the `url` field to the name of your repo. Modifying the branch name is particularly useful when testing new functionalities without having to checkout the code locally. 
+ +The last block (`instances`) specifies the [AWS instance type](https://aws.amazon.com/ec2/instance-types) and the [AWS regions](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html#concepts-available-regions) to use: +```json +"instances": { + "type": "m5d.8xlarge", + "regions": ["us-east-1", "eu-north-1", "ap-southeast-2", "us-west-1", "ap-northeast-1"] +} +``` +The instance type selects the hardware on which to deploy the testbed. For example, `m5d.8xlarge` instances come with 32 vCPUs (16 physical cores), 128 GB of RAM, and guarantee 10 Gbps of bandwidth. The python scripts will configure each instance with 300 GB of SSD hard drive. The `regions` field specifies the data centers to use. If you require more nodes than data centers, the python scripts will distribute the nodes as equally as possible amongst the data centers. All machines run a fresh install of Ubuntu Server 20.04. + +### Step 4. Create a testbed +The AWS instances are orchestrated with [Fabric](http://www.fabfile.org) from the file [fabfile.py](https://github.com/facebookresearch/narwhal/blob/master/benchmark/fabfile.py) (located in [narwhal/benchmarks](https://github.com/facebookresearch/narwhal/blob/master/benchmark)); you can list all possible commands as follows: +``` +$ cd narwhal/benchmark +$ fab --list +``` +The command `fab create` creates new AWS instances; open [fabfile.py](https://github.com/facebookresearch/narwhal/blob/master/benchmark/fabfile.py) and locate the `create` task: +```python +@task +def create(ctx, nodes=2): + ... +``` +The parameter `nodes` determines how many instances to create in *each* AWS region. That is, if you specified 5 AWS regions as in the example of step 3, setting `nodes=2` will create a total of 10 machines: +``` +$ fab create + +Creating 10 instances |██████████████████████████████| 100.0% +Waiting for all instances to boot... 
+Successfully created 10 new instances +``` +You can then clone the repo and install rust on the remote instances with `fab install`: +``` +$ fab install + +Installing rust and cloning the repo... +Initialized testbed of 10 nodes +``` +This may take a long time as the command will first update all instances. +The commands `fab stop` and `fab start` respectively stop and start the testbed without destroying it (it is good practice to stop the testbed when not in use as AWS can be quite expensive); and `fab destroy` terminates all instances and destroys the testbed. Note that, depending on the instance types, AWS instances may take up to several minutes to fully start or stop. The command `fab info` displays a nice summary of all available machines and information to manually connect to them (for debug). + +### Step 5. Run a benchmark +After setting up the testbed, running a benchmark on AWS is similar to running it locally (see [Run Local Benchmarks](https://github.com/facebookresearch/narwhal/tree/master/benchmark#local-benchmarks)). Locate the task `remote` in [fabfile.py](https://github.com/facebookresearch/narwhal/blob/master/benchmark/fabfile.py): +```python +@task +def remote(ctx): + ... +``` +The benchmark parameters are similar to [local benchmarks](https://github.com/facebookresearch/narwhal/tree/master/benchmark#local-benchmarks) but allow you to specify the number of nodes and the input rate as arrays to automate multiple benchmarks with a single command. The parameter `runs` specifies the number of times to repeat each benchmark (to later compute the average and stdev of the results), and the parameter `collocate` specifies whether to collocate all the node's workers and the primary on the same machine. If `collocate` is set to `False`, the script will run one node per data center (AWS region), with its primary and each of its workers running on a dedicated instance. 
+```python +bench_params = { + 'nodes': [10, 20, 30], + 'workers': 2, + 'collocate': True, + 'rate': [20_000, 30_000, 40_000], + 'tx_size': 512, + 'faults': 0, + 'duration': 300, + 'runs': 2, +} +``` +Similarly to local benchmarks, the scripts will deploy as many clients as workers and divide the input rate equally amongst each client. Each client is colocated with a worker, and only submits transactions to the worker with whom they share the machine. + +Once you specified both `bench_params` and `node_params` as desired, run: +``` +$ fab remote +``` +This command first updates all machines with the latest commit of the GitHub repo and branch specified in your file [settings.json](https://github.com/facebookresearch/narwhal/blob/master/benchmark/settings.json) (step 3); this ensures that benchmarks are always run with the latest version of the code. It then generates and uploads the configuration files to each machine, runs the benchmarks with the specified parameters, and downloads the logs. It finally parses the logs and prints the results into a folder called `results` (which is automatically created if it doesn't already exist). You can run `fab remote` multiple times without fearing to override previous results; the command either appends new results to a file containing existing ones or prints them in separate files. If anything goes wrong during a benchmark, you can always stop it by running `fab kill`. + +### Step 6. Plot the results +Once you have enough results, you can aggregate and plot them: +``` +$ fab plot +``` +This command creates a latency graph, a throughput graph, and a robustness graph in a folder called `plots` (which is automatically created if it doesn't already exist). 
You can adjust the plot parameters to filter which curves to add to the plot: +```python +plot_params = { + 'nodes': [10, 20], + 'tx_size': 512, + 'faults': [0], + 'max_latency': [2_000, 5_000] +} +``` + +The first graph ('latency') plots the latency versus the throughput. It shows that the latency is low until a fairly neat threshold after which it drastically increases. Determining this threshold is crucial to understand the limits of the system. + +Another challenge is comparing apples-to-apples between different deployments of the system. The challenge here is again that latency and throughput are interdependent; as a result, a throughput/number of nodes chart could be tricky to produce fairly. The way to do it is to define a maximum latency and measure the throughput at this point instead of simply pushing every system to its peak throughput (where latency is meaningless). The second graph ('tps') plots the maximum achievable throughput under a maximum latency for different numbers of nodes. + +The last graph ('robustness') plots the throughput versus the input rate (and provides no information on latency). This graph is a bit redundant given the other two but clearly shows the threshold where the system saturates (and the throughput may decrease). This threshold is crucial to determine how to configure a rate-limiter to block excess transactions from entering the system. diff --git a/narwhal/benchmark/aws/remote.py b/narwhal/benchmark/aws/remote.py index fd76424fc7eb5..c8159f26447fe 100644 --- a/narwhal/benchmark/aws/remote.py +++ b/narwhal/benchmark/aws/remote.py @@ -144,7 +144,7 @@ def _update(self, hosts, collocate): ips = list(set([x for y in hosts for x in y])) Print.info( - f'Updating {len(ips)} nodes (branch "{self.settings.branch}")...' + f'Updating {len(ips)} machines (branch "{self.settings.branch}")...' 
) cmd = [ f'(cd {self.settings.repo_name} && git fetch -f)', @@ -357,11 +357,12 @@ def run(self, bench_parameters_dict, node_parameters_dict, debug=False): faults = bench_parameters.faults logger = self._logs(committee_copy, faults) logger.print(PathMaker.result_file( + faults, n, bench_parameters.workers, + bench_parameters.collocate, r, bench_parameters.tx_size, - faults )) except (subprocess.SubprocessError, GroupException, ParseError) as e: self.kill(hosts=selected_hosts) diff --git a/narwhal/benchmark/benchmark/aggregate.py b/narwhal/benchmark/benchmark/aggregate.py index 26670f658236b..5fde808050e0f 100644 --- a/narwhal/benchmark/benchmark/aggregate.py +++ b/narwhal/benchmark/benchmark/aggregate.py @@ -11,8 +11,10 @@ class Setup: - def __init__(self, nodes, rate, tx_size, faults): + def __init__(self, faults, nodes, workers, collocate, rate, tx_size): self.nodes = nodes + self.workers = workers + self.collocate = collocate self.rate = rate self.tx_size = tx_size self.faults = faults @@ -20,10 +22,12 @@ def __init__(self, nodes, rate, tx_size, faults): def __str__(self): return ( - f' Committee size: {self.nodes} nodes\n' + f' Faults: {self.faults}\n' + f' Committee size: {self.nodes}\n' + f' Workers per node: {self.workers}\n' + f' Collocate primary and workers: {self.collocate}\n' f' Input rate: {self.rate} tx/s\n' f' Transaction size: {self.tx_size} B\n' - f' Faults: {self.faults} nodes\n' f' Max latency: {self.max_latency} ms\n' ) @@ -35,11 +39,15 @@ def __hash__(self): @classmethod def from_str(cls, raw): - nodes = int(search(r'.* Committee size: (\d+)', raw).group(1)) - rate = int(search(r'.* Input rate: (\d+)', raw).group(1)) - tx_size = int(search(r'.* Transaction size: (\d+)', raw).group(1)) - faults = int(search(r'.* Faults: (\d+)', raw).group(1)) - return cls(nodes, rate, tx_size, faults) + faults = int(search(r'Faults: (\d+)', raw).group(1)) + nodes = int(search(r'Committee size: (\d+)', raw).group(1)) + workers = int(search(r'Worker\(s\) per node: 
(\d+)', raw).group(1)) + collocate = 'True' == search( + r'Collocate primary and workers: (True|False)', raw + ).group(1) + rate = int(search(r'Input rate: (\d+)', raw).group(1)) + tx_size = int(search(r'Transaction size: (\d+)', raw).group(1)) + return cls(faults, nodes, workers, collocate, rate, tx_size) class Result: @@ -57,8 +65,8 @@ def __str__(self): @classmethod def from_str(cls, raw): - tps = int(search(r'.* End-to-end TPS: (\d+)', raw).group(1)) - latency = int(search(r'.* End-to-end latency: (\d+)', raw).group(1)) + tps = int(search(r'End-to-end TPS: (\d+)', raw).group(1)) + latency = int(search(r'End-to-end latency: (\d+)', raw).group(1)) return cls(tps, latency) @classmethod @@ -97,7 +105,10 @@ def print(self): os.makedirs(PathMaker.plots_path()) results = [ - self._print_latency(), self._print_tps(), self._print_robustness() + self._print_latency(), + self._print_tps(scalability=False), + self._print_tps(scalability=True), + self._print_robustness() ] for name, records in results: for setup, values in records.items(): @@ -114,13 +125,17 @@ def print(self): f'{data}' '-----------------------------------------\n' ) + + max_lat = setup.max_latency filename = PathMaker.agg_file( name, - setup.nodes, - setup.rate, - setup.tx_size, setup.faults, - max_latency=setup.max_latency + setup.nodes, + setup.workers, + setup.collocate, + setup.rate, + setup.tx_size, + max_latency=None if max_lat == 'any' else max_lat, ) with open(filename, 'w') as f: f.write(string) @@ -139,26 +154,30 @@ def _print_latency(self): return 'latency', organized - def _print_tps(self): + def _print_tps(self, scalability): records = deepcopy(self.records) organized = defaultdict(list) for max_latency in self.max_latencies: for setup, result in records.items(): setup = deepcopy(setup) if result.mean_latency <= max_latency: - nodes = setup.nodes - setup.nodes = 'x' setup.rate = 'any' setup.max_latency = max_latency - - new_point = all(nodes != x[0] for x in organized[setup]) + if 
scalability: + variable = setup.workers + setup.workers = 'x' + else: + variable = setup.nodes + setup.nodes = 'x' + + new_point = all(variable != x[0] for x in organized[setup]) highest_tps = False - for w, r in organized[setup]: - if result.mean_tps > r.mean_tps and nodes == w: - organized[setup].remove((w, r)) + for v, r in organized[setup]: + if result.mean_tps > r.mean_tps and variable == v: + organized[setup].remove((v, r)) highest_tps = True if new_point or highest_tps: - organized[setup] += [(nodes, result)] + organized[setup] += [(variable, result)] [v.sort(key=lambda x: x[0]) for v in organized.values()] return 'tps', organized diff --git a/narwhal/benchmark/benchmark/config.py b/narwhal/benchmark/benchmark/config.py index 28a462602abf8..50bcc26af21e1 100644 --- a/narwhal/benchmark/benchmark/config.py +++ b/narwhal/benchmark/benchmark/config.py @@ -189,26 +189,32 @@ def print(self, filename): class BenchParameters: def __init__(self, json): try: + self.faults = int(json['faults']) + nodes = json['nodes'] nodes = nodes if isinstance(nodes, list) else [nodes] if not nodes or any(x <= 1 for x in nodes): raise ConfigError('Missing or invalid number of nodes') + self.nodes = [int(x) for x in nodes] rate = json['rate'] rate = rate if isinstance(rate, list) else [rate] if not rate: raise ConfigError('Missing input rate') + self.rate = [int(x) for x in rate] - self.nodes = [int(x) for x in nodes] + self.workers = int(json['workers']) + if 'collocate' in json: self.collocate = bool(json['collocate']) else: self.collocate = True - self.rate = [int(x) for x in rate] + self.tx_size = int(json['tx_size']) - self.faults = int(json['faults']) + self.duration = int(json['duration']) + self.runs = int(json['runs']) if 'runs' in json else 1 except KeyError as e: raise ConfigError(f'Malformed bench parameters: missing key {e}') @@ -223,17 +229,28 @@ def __init__(self, json): class PlotParameters: def __init__(self, json): try: + faults = json['faults'] + faults = faults if 
isinstance(faults, list) else [faults] + self.faults = [int(x) for x in faults] if faults else [0] + nodes = json['nodes'] nodes = nodes if isinstance(nodes, list) else [nodes] if not nodes: raise ConfigError('Missing number of nodes') self.nodes = [int(x) for x in nodes] - self.tx_size = int(json['tx_size']) + workers = json['workers'] + workers = workers if isinstance(workers, list) else [workers] + if not workers: + raise ConfigError('Missing number of workers') + self.workers = [int(x) for x in workers] - faults = json['faults'] - faults = faults if isinstance(faults, list) else [faults] - self.faults = [int(x) for x in faults] if faults else [0] + if 'collocate' in json: + self.collocate = bool(json['collocate']) + else: + self.collocate = True + + self.tx_size = int(json['tx_size']) max_lat = json['max_latency'] max_lat = max_lat if isinstance(max_lat, list) else [max_lat] @@ -246,3 +263,11 @@ def __init__(self, json): except ValueError: raise ConfigError('Invalid parameters type') + + if len(self.nodes) > 1 and len(self.workers) > 1: + raise ConfigError( + 'Either the "nodes" or the "workers can be a list (not both)' + ) + + def scalability(self): + return len(self.workers) > 1 diff --git a/narwhal/benchmark/benchmark/logs.py b/narwhal/benchmark/benchmark/logs.py index 55f8091a954ee..c29e77d3dcaac 100644 --- a/narwhal/benchmark/benchmark/logs.py +++ b/narwhal/benchmark/benchmark/logs.py @@ -21,7 +21,12 @@ def __init__(self, clients, primaries, workers, faults=0): assert all(x for x in inputs) self.faults = faults - self.committee_size = len(primaries) + faults + if isinstance(faults, int): + self.committee_size = len(primaries) + int(faults) + self.workers = len(workers) // len(primaries) + else: + self.committee_size = '?' + self.workers = '?' # Parse the clients logs. 
try: @@ -39,7 +44,7 @@ def __init__(self, clients, primaries, workers, faults=0): results = p.map(self._parse_primaries, primaries) except (ValueError, IndexError, AttributeError) as e: raise ParseError(f'Failed to parse nodes\' logs: {e}') - proposals, commits, self.configs = zip(*results) + proposals, commits, self.configs, primary_ips = zip(*results) self.proposals = self._merge_results([x.items() for x in proposals]) self.commits = self._merge_results([x.items() for x in commits]) @@ -49,11 +54,14 @@ def __init__(self, clients, primaries, workers, faults=0): results = p.map(self._parse_workers, workers) except (ValueError, IndexError, AttributeError) as e: raise ParseError(f'Failed to parse workers\' logs: {e}') - sizes, self.received_samples = zip(*results) + sizes, self.received_samples, workers_ips = zip(*results) self.sizes = { k: v for x in sizes for k, v in x.items() if k in self.commits } + # Determine whether the primary and the workers are collocated. + self.collocate = set(primary_ips) == set(workers_ips) + # Check whether clients missed their target rate. 
if self.misses != 0: Print.warn( @@ -121,8 +129,10 @@ def _parse_primaries(self, log): search(r'Max batch delay .* (\d+)', log).group(1) ), } + + ip = search(r'booted on (\d+.\d+.\d+.\d+)', log).group(1) - return proposals, commits, configs + return proposals, commits, configs, ip def _parse_workers(self, log): if search(r'(?:panic|Error)', log) is not None: @@ -134,7 +144,9 @@ def _parse_workers(self, log): tmp = findall(r'Batch ([^ ]+) contains sample tx (\d+)', log) samples = {int(s): d for d, s in tmp} - return sizes, samples + ip = search(r'booted on (\d+.\d+.\d+.\d+)', log).group(1) + + return sizes, samples, ip def _to_posix(self, string): x = datetime.fromisoformat(string.replace('Z', '+00:00')) @@ -195,10 +207,12 @@ def result(self): ' SUMMARY:\n' '-----------------------------------------\n' ' + CONFIG:\n' - f' Committee size: {self.committee_size:,} node(s)\n' + f' Faults: {self.faults} node(s)\n' + f' Committee size: {self.committee_size} node(s)\n' + f' Worker(s) per node: {self.workers} worker(s)\n' + f' Collocate primary and workers: {self.collocate}\n' f' Input rate: {sum(self.rate):,} tx/s\n' f' Transaction size: {self.size[0]:,} B\n' - f' Faults: {self.faults:,} node(s)\n' f' Execution time: {round(duration):,} s\n' '\n' f' Header size: {header_size:,} B\n' diff --git a/narwhal/benchmark/benchmark/plot.py b/narwhal/benchmark/benchmark/plot.py index 27e0c9a449cc3..e5b2ce43a2f09 100644 --- a/narwhal/benchmark/benchmark/plot.py +++ b/narwhal/benchmark/benchmark/plot.py @@ -1,7 +1,8 @@ # Copyright(C) Facebook, Inc. and its affiliates. 
+from collections import defaultdict from re import findall, search, split import matplotlib.pyplot as plt -from matplotlib.ticker import StrMethodFormatter +import matplotlib.ticker as tick from glob import glob from itertools import cycle @@ -10,6 +11,30 @@ from benchmark.aggregate import LogAggregator +@tick.FuncFormatter +def default_major_formatter(x, pos): + if pos is None: + return + if x >= 1_000: + return f'{x/1000:.0f}k' + else: + return f'{x:.0f}' + + +@tick.FuncFormatter +def sec_major_formatter(x, pos): + if pos is None: + return + return f'{float(x)/1000:.1f}' + + +@tick.FuncFormatter +def mb_major_formatter(x, pos): + if pos is None: + return + return f'{x:,.0f}' + + class PlotError(Exception): pass @@ -69,21 +94,25 @@ def _plot(self, x_label, y_label, y_axis, z_axis, type): linestyle='dotted', marker=next(markers), capsize=3 ) - plt.legend(loc='lower center', bbox_to_anchor=(0.5, 1), ncol=2) + plt.legend(loc='lower center', bbox_to_anchor=(0.5, 1), ncol=3) plt.xlim(xmin=0) plt.ylim(bottom=0) - plt.xlabel(x_label) - plt.ylabel(y_label[0]) + plt.xlabel(x_label, fontweight='bold') + plt.ylabel(y_label[0], fontweight='bold') + plt.xticks(weight='bold') + plt.yticks(weight='bold') plt.grid() ax = plt.gca() - ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}')) - ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}')) + ax.xaxis.set_major_formatter(default_major_formatter) + ax.yaxis.set_major_formatter(default_major_formatter) + if 'latency' in type: + ax.yaxis.set_major_formatter(sec_major_formatter) if len(y_label) > 1: secaxy = ax.secondary_yaxis( 'right', functions=(self._tps2bps, self._bps2tps) ) secaxy.set_ylabel(y_label[1]) - secaxy.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}')) + secaxy.yaxis.set_major_formatter(mb_major_formatter) for x in ['pdf', 'png']: plt.savefig(PathMaker.plot_file(type, x), bbox_inches='tight') @@ -95,6 +124,13 @@ def nodes(data): faults = f'({f} faulty)' if f != '0' else '' return f'{x} nodes 
{faults}' + @staticmethod + def workers(data): + x = search(r'Workers per node: (\d+)', data).group(1) + f = search(r'Faults: (\d+)', data).group(1) + faults = f'({f} faulty)' if f != '0' else '' + return f'{x} workers {faults}' + @staticmethod def max_latency(data): x = search(r'Max latency: (\d+)', data).group(1) @@ -103,31 +139,31 @@ def max_latency(data): return f'Max latency: {float(x) / 1000:,.1f} s {faults}' @classmethod - def plot_robustness(cls, files): + def plot_latency(cls, files, scalability): assert isinstance(files, list) assert all(isinstance(x, str) for x in files) - z_axis = cls.nodes - x_label = 'Input rate (tx/s)' - y_label = ['Throughput (tx/s)', 'Throughput (MB/s)'] + z_axis = cls.workers if scalability else cls.nodes + x_label = 'Throughput (tx/s)' + y_label = ['Latency (s)'] ploter = cls(files) - ploter._plot(x_label, y_label, ploter._tps, z_axis, 'robustness') + ploter._plot(x_label, y_label, ploter._latency, z_axis, 'latency') @classmethod - def plot_latency(cls, files): + def plot_robustness(cls, files, scalability): assert isinstance(files, list) assert all(isinstance(x, str) for x in files) - z_axis = cls.nodes - x_label = 'Throughput (tx/s)' - y_label = ['Latency (ms)'] + z_axis = cls.workers if scalability else cls.nodes + x_label = 'Input rate (tx/s)' + y_label = ['Throughput (tx/s)', 'Throughput (MB/s)'] ploter = cls(files) - ploter._plot(x_label, y_label, ploter._latency, z_axis, 'latency') + ploter._plot(x_label, y_label, ploter._tps, z_axis, 'robustness') @classmethod - def plot_tps(cls, files): + def plot_tps(cls, files, scalability): assert isinstance(files, list) assert all(isinstance(x, str) for x in files) z_axis = cls.max_latency - x_label = 'Committee size' + x_label = 'Workers per node' if scalability else 'Committee size' y_label = ['Throughput (tx/s)', 'Throughput (MB/s)'] ploter = cls(files) ploter._plot(x_label, y_label, ploter._tps, z_axis, 'tps') @@ -142,24 +178,48 @@ def plot(cls, params_dict): # Aggregate the 
logs. LogAggregator(params.max_latency).print() - # Load the aggregated log files. - robustness_files, latency_files, tps_files = [], [], [] - tx_size = params.tx_size - + # Make the latency, tps, and robustness graphs. + iterator = params.workers if params.scalability() else params.nodes + latency_files, robustness_files, tps_files = [], [], [] for f in params.faults: - for n in params.nodes: - robustness_files += glob( - PathMaker.agg_file('robustness', n, 'x', tx_size, f, 'any') - ) + for x in iterator: latency_files += glob( - PathMaker.agg_file('latency', n, 'any', tx_size, f, 'any') + PathMaker.agg_file( + 'latency', + f, + x if not params.scalability() else params.nodes[0], + x if params.scalability() else params.workers[0], + params.collocate, + 'any', + params.tx_size, + ) ) + robustness_files += glob( + PathMaker.agg_file( + 'robustness', + f, + x if not params.scalability() else params.nodes[0], + x if params.scalability() else params.workers[0], + params.collocate, + 'x', + params.tx_size, + ) + ) + for l in params.max_latency: tps_files += glob( - PathMaker.agg_file('tps', 'x', 'any', tx_size, f, l) + PathMaker.agg_file( + 'tps', + f, + 'x' if not params.scalability() else params.nodes[0], + 'x' if params.scalability() else params.workers[0], + params.collocate, + 'any', + params.tx_size, + max_latency=l + ) ) - # Make the plots. 
- cls.plot_robustness(robustness_files) - cls.plot_latency(latency_files) - cls.plot_tps(tps_files) + cls.plot_latency(latency_files, params.scalability()) + cls.plot_tps(tps_files, params.scalability()) + cls.plot_robustness(latency_files, params.scalability()) diff --git a/narwhal/benchmark/benchmark/utils.py b/narwhal/benchmark/benchmark/utils.py index 70177e69208b4..671d478de0ec0 100644 --- a/narwhal/benchmark/benchmark/utils.py +++ b/narwhal/benchmark/benchmark/utils.py @@ -65,10 +65,10 @@ def results_path(): return 'results' @staticmethod - def result_file(nodes, workers, rate, tx_size, faults): + def result_file(faults, nodes, workers, collocate, rate, tx_size): return join( PathMaker.results_path(), - f'bench-{nodes}-{workers}-{rate}-{tx_size}-{faults}.txt' + f'bench-{faults}-{nodes}-{workers}-{collocate}-{rate}-{tx_size}.txt' ) @staticmethod @@ -76,11 +76,12 @@ def plots_path(): return 'plots' @staticmethod - def agg_file(type, nodes, rate, tx_size, faults, max_latency): - return join( - PathMaker.plots_path(), - f'{type}-{nodes}-{rate}-{tx_size}-{faults}-{max_latency}.txt' - ) + def agg_file(type, faults, nodes, workers, collocate, rate, tx_size, max_latency=None): + if max_latency is None: + name = f'{type}-bench-{faults}-{nodes}-{workers}-{collocate}-{rate}-{tx_size}.txt' + else: + name = f'{type}-{max_latency}-bench-{faults}-{nodes}-{workers}-{collocate}-{rate}-{tx_size}.txt' + return join(PathMaker.plots_path(), name) @staticmethod def plot_file(name, ext): diff --git a/narwhal/benchmark/fabfile.py b/narwhal/benchmark/fabfile.py index 9766045940699..f2feaf85a2924 100644 --- a/narwhal/benchmark/fabfile.py +++ b/narwhal/benchmark/fabfile.py @@ -13,21 +13,21 @@ def local(ctx, debug=True): ''' Run benchmarks on localhost ''' bench_params = { + 'faults': 0, 'nodes': 4, 'workers': 1, 'rate': 50_000, 'tx_size': 512, - 'faults': 0, 'duration': 20, } node_params = { - 'header_size': 1_000, - 'max_header_delay': 100, - 'gc_depth': 50, - 'sync_retry_delay': 
10_000, - 'sync_retry_nodes': 3, - 'batch_size': 500_000, - 'max_batch_delay': 100 + 'header_size': 1_000, # bytes + 'max_header_delay': 200, # ms + 'gc_depth': 50, # rounds + 'sync_retry_delay': 10_000, # ms + 'sync_retry_nodes': 3, # number of nodes + 'batch_size': 500_000, # bytes + 'max_batch_delay': 200 # ms } try: ret = LocalBench(bench_params, node_params).run(debug) @@ -94,23 +94,23 @@ def install(ctx): def remote(ctx, debug=False): ''' Run benchmarks on AWS ''' bench_params = { - 'nodes': [20, 50], + 'faults': 3, + 'nodes': [10], 'workers': 1, 'collocate': True, - 'rate': [50_000, 150_000], + 'rate': [10_000, 110_000], 'tx_size': 512, - 'faults': 0, 'duration': 300, 'runs': 2, } node_params = { - 'header_size': 1_000, - 'max_header_delay': 200, - 'gc_depth': 50, - 'sync_retry_delay': 10_000, - 'sync_retry_nodes': 3, - 'batch_size': 500_000, - 'max_batch_delay': 200 + 'header_size': 1_000, # bytes + 'max_header_delay': 200, # ms + 'gc_depth': 50, # rounds + 'sync_retry_delay': 10_000, # ms + 'sync_retry_nodes': 3, # number of nodes + 'batch_size': 500_000, # bytes + 'max_batch_delay': 200 # ms } try: Bench(ctx).run(bench_params, node_params, debug) @@ -122,10 +122,12 @@ def remote(ctx, debug=False): def plot(ctx): ''' Plot performance using the logs generated by "fab remote" ''' plot_params = { - 'nodes': [10, 20], + 'faults': [0, 1, 3], + 'nodes': [10], + 'workers': [1], + 'collocate': True, 'tx_size': 512, - 'faults': [0], - 'max_latency': [2_000, 5_000] + 'max_latency': [3_500, 4_500] } try: Ploter.plot(plot_params) @@ -146,6 +148,6 @@ def kill(ctx): def logs(ctx): ''' Print a summary of the logs ''' try: - print(LogParser.process('./logs').result()) + print(LogParser.process('./logs', faults='?').result()) except ParseError as e: Print.error(BenchError('Failed to parse logs', e)) diff --git a/narwhal/benchmark/settings.json b/narwhal/benchmark/settings.json new file mode 100644 index 0000000000000..d13629f22b0b5 --- /dev/null +++ 
b/narwhal/benchmark/settings.json @@ -0,0 +1,16 @@ +{ + "key": { + "name": "aws-fb", + "path": "/Users/asonnino/.ssh/aws-fb.pem" + }, + "port": 5000, + "repo": { + "name": "narwhal", + "url": "https://github.com/facebookresearch/narwhal", + "branch": "master" + }, + "instances": { + "type": "m5d.8xlarge", + "regions": ["us-east-1", "eu-north-1", "ap-southeast-2", "us-west-1", "ap-northeast-1"] + } +} diff --git a/narwhal/config/src/lib.rs b/narwhal/config/src/lib.rs index 122b54ca79eb0..b7fed7bb8a7d2 100644 --- a/narwhal/config/src/lib.rs +++ b/narwhal/config/src/lib.rs @@ -68,8 +68,7 @@ pub struct Parameters { pub max_header_delay: u64, /// The depth of the garbage collection (Denominated in number of rounds). pub gc_depth: u64, - /// The delay after which the synchronizer retries to send sync requests. - /// Denominated in ms. + /// The delay after which the synchronizer retries to send sync requests. Denominated in ms. pub sync_retry_delay: u64, /// Determine with how many nodes to sync when re-trying to send sync-request. These nodes /// are picked at random from the committee. diff --git a/narwhal/consensus/src/lib.rs b/narwhal/consensus/src/lib.rs index fdfec0231df0f..50a903a6fbfa7 100644 --- a/narwhal/consensus/src/lib.rs +++ b/narwhal/consensus/src/lib.rs @@ -181,7 +181,7 @@ impl Consensus { info!("Committed {}", certificate.header); #[cfg(feature = "benchmark")] - for (digest, _) in &certificate.header.payload { + for digest in certificate.header.payload.keys() { // NOTE: This log entry is used to compute performance. 
info!("Committed {} -> {:?}", certificate.header, digest); } diff --git a/narwhal/consensus/src/tests/consensus_tests.rs b/narwhal/consensus/src/tests/consensus_tests.rs index 3de8fe2535374..de492d5d87b56 100644 --- a/narwhal/consensus/src/tests/consensus_tests.rs +++ b/narwhal/consensus/src/tests/consensus_tests.rs @@ -61,7 +61,7 @@ fn make_certificates( start: Round, stop: Round, initial_parents: &BTreeSet, - keys: &Vec, + keys: &[PublicKey], ) -> (VecDeque, BTreeSet) { let mut certificates = VecDeque::new(); let mut parents = initial_parents.iter().cloned().collect::>(); @@ -185,7 +185,7 @@ async fn not_enough_support() { let mut certificates = VecDeque::new(); // Round 1: Fully connected graph. - let nodes = keys.iter().cloned().take(3).collect(); + let nodes: Vec<_> = keys.iter().cloned().take(3).collect(); let (out, parents) = make_certificates(1, 1, &genesis, &nodes); certificates.extend(out); @@ -194,7 +194,7 @@ async fn not_enough_support() { let (leader_2_digest, certificate) = mock_certificate(keys[0], 2, parents.clone()); certificates.push_back(certificate); - let nodes = keys.iter().cloned().skip(1).collect(); + let nodes: Vec<_> = keys.iter().cloned().skip(1).collect(); let (out, mut parents) = make_certificates(2, 2, &parents, &nodes); certificates.extend(out); @@ -220,7 +220,7 @@ async fn not_enough_support() { parents = next_parents.clone(); // Rounds 4, 5, and 6: Fully connected graph. - let nodes = keys.iter().cloned().take(3).collect(); + let nodes: Vec<_> = keys.iter().cloned().take(3).collect(); let (out, parents) = make_certificates(4, 6, &parents, &nodes); certificates.extend(out); @@ -279,7 +279,7 @@ async fn missing_leader() { let mut certificates = VecDeque::new(); // Remove the leader for rounds 1 and 2. 
- let nodes = keys.iter().cloned().skip(1).collect(); + let nodes: Vec<_> = keys.iter().cloned().skip(1).collect(); let (out, parents) = make_certificates(1, 2, &genesis, &nodes); certificates.extend(out); diff --git a/narwhal/network/src/tests/common.rs b/narwhal/network/src/tests/common.rs index dfc8557da0d29..38e27b0b2eb97 100644 --- a/narwhal/network/src/tests/common.rs +++ b/narwhal/network/src/tests/common.rs @@ -18,7 +18,7 @@ pub fn listener(address: SocketAddr, expected: String) -> JoinHandle<()> { assert_eq!(received, expected); writer.send(Bytes::from("Ack")).await.unwrap() } - _ => assert!(false, "Failed to receive network message"), + _ => panic!("Failed to receive network message"), } }) } diff --git a/narwhal/network/src/tests/receiver_tests.rs b/narwhal/network/src/tests/receiver_tests.rs index 3098c5410256a..9f32178f05aa7 100644 --- a/narwhal/network/src/tests/receiver_tests.rs +++ b/narwhal/network/src/tests/receiver_tests.rs @@ -30,7 +30,7 @@ async fn receive() { // Make the network receiver. let address = "127.0.0.1:4000".parse::().unwrap(); let (tx, mut rx) = channel(1); - Receiver::spawn(address.clone(), TestHandler { deliver: tx }); + Receiver::spawn(address, TestHandler { deliver: tx }); sleep(Duration::from_millis(50)).await; // Send a message. diff --git a/narwhal/primary/README.md b/narwhal/primary/README.md new file mode 100644 index 0000000000000..6358759b87726 --- /dev/null +++ b/narwhal/primary/README.md @@ -0,0 +1,6 @@ +# Primary Diagram +The diagram below illustrates the primary's architecture and could be useful to keep in mind while going through the code. + +

+ +

diff --git a/narwhal/primary/src/garbage_collector.rs b/narwhal/primary/src/garbage_collector.rs index 612ef7deeed14..0c0a0352fd757 100644 --- a/narwhal/primary/src/garbage_collector.rs +++ b/narwhal/primary/src/garbage_collector.rs @@ -56,7 +56,7 @@ impl GarbageCollector { let round = certificate.round(); if round > last_committed_round { last_committed_round = round; - + // Trigger cleanup on the primary. self.consensus_round.store(round, Ordering::Relaxed); diff --git a/narwhal/primary/src/primary.rs b/narwhal/primary/src/primary.rs index 9e85bf90fac46..8ba317fe2741e 100644 --- a/narwhal/primary/src/primary.rs +++ b/narwhal/primary/src/primary.rs @@ -200,6 +200,7 @@ impl Primary { // The `Helper` is dedicated to reply to certificates requests from other primaries. Helper::spawn(committee.clone(), store, rx_cert_requests); + // NOTE: This log entry is used to compute performance. info!( "Primary {} successfully booted on {}", name, diff --git a/narwhal/primary/src/proposer.rs b/narwhal/primary/src/proposer.rs index eb5a6d6054d0e..5afaa96568e79 100644 --- a/narwhal/primary/src/proposer.rs +++ b/narwhal/primary/src/proposer.rs @@ -91,7 +91,7 @@ impl Proposer { debug!("Created {:?}", header); #[cfg(feature = "benchmark")] - for (digest, _) in &header.payload { + for digest in header.payload.keys() { // NOTE: This log entry is used to compute performance. info!("Created {} -> {:?}", header, digest); } diff --git a/narwhal/primary/src/tests/core_tests.rs b/narwhal/primary/src/tests/core_tests.rs index 600731154c087..746ad925ea38a 100644 --- a/narwhal/primary/src/tests/core_tests.rs +++ b/narwhal/primary/src/tests/core_tests.rs @@ -76,7 +76,7 @@ async fn process_header() { let received = handle.await.unwrap(); match bincode::deserialize(&received).unwrap() { PrimaryMessage::Vote(x) => assert_eq!(x, expected), - x => assert!(false, "Unexpected message: {:?}", x), + x => panic!("Unexpected message: {:?}", x), } // Ensure the header is correctly stored. 
@@ -277,7 +277,7 @@ async fn process_votes() { for received in try_join_all(handles).await.unwrap() { match bincode::deserialize(&received).unwrap() { PrimaryMessage::Certificate(x) => assert_eq!(x, expected), - x => assert!(false, "Unexpected message: {:?}", x), + x => panic!("Unexpected message: {:?}", x), } } } diff --git a/narwhal/worker/README.md b/narwhal/worker/README.md new file mode 100644 index 0000000000000..970ac919d8460 --- /dev/null +++ b/narwhal/worker/README.md @@ -0,0 +1,6 @@ +# Worker Diagram +The diagram below illustrates the worker's architecture and could be useful to keep in mind while going through the code. + +

+ +

diff --git a/narwhal/worker/src/tests/batch_maker_tests.rs b/narwhal/worker/src/tests/batch_maker_tests.rs index d35f5db948e64..2aff2423dc5ce 100644 --- a/narwhal/worker/src/tests/batch_maker_tests.rs +++ b/narwhal/worker/src/tests/batch_maker_tests.rs @@ -27,7 +27,7 @@ async fn make_batch() { let QuorumWaiterMessage { batch, handlers: _ } = rx_message.recv().await.unwrap(); match bincode::deserialize(&batch).unwrap() { WorkerMessage::Batch(batch) => assert_eq!(batch, expected_batch), - _ => assert!(false, "Unexpected message"), + _ => panic!("Unexpected message"), } } @@ -54,6 +54,6 @@ async fn batch_timeout() { let QuorumWaiterMessage { batch, handlers: _ } = rx_message.recv().await.unwrap(); match bincode::deserialize(&batch).unwrap() { WorkerMessage::Batch(batch) => assert_eq!(batch, expected_batch), - _ => assert!(false, "Unexpected message"), + _ => panic!("Unexpected message"), } } diff --git a/narwhal/worker/src/tests/common.rs b/narwhal/worker/src/tests/common.rs index 52f8d87a5d940..03c1d209fff3c 100644 --- a/narwhal/worker/src/tests/common.rs +++ b/narwhal/worker/src/tests/common.rs @@ -122,7 +122,7 @@ pub fn listener(address: SocketAddr, expected: Option) -> JoinHandle<()> assert_eq!(received.freeze(), expected); } } - _ => assert!(false, "Failed to receive network message"), + _ => panic!("Failed to receive network message"), } }) } diff --git a/narwhal/worker/src/tests/synchronizer_tests.rs b/narwhal/worker/src/tests/synchronizer_tests.rs index cf119e1f938d6..43cbcbc4b501b 100644 --- a/narwhal/worker/src/tests/synchronizer_tests.rs +++ b/narwhal/worker/src/tests/synchronizer_tests.rs @@ -20,7 +20,7 @@ async fn synchronize() { // Spawn a `Synchronizer` instance. 
Synchronizer::spawn( - name.clone(), + name, id, committee.clone(), store.clone(), diff --git a/narwhal/worker/src/worker.rs b/narwhal/worker/src/worker.rs index 23e8c01a25241..9aa7be873bf70 100644 --- a/narwhal/worker/src/worker.rs +++ b/narwhal/worker/src/worker.rs @@ -85,6 +85,7 @@ impl Worker { rx_primary, ); + // NOTE: This log entry is used to compute performance. info!( "Worker {} successfully booted on {}", id,