syntax = "proto3";
package buildbarn.configuration.blobstore;
import "google/rpc/status.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "pkg/proto/configuration/blockdevice/blockdevice.proto";
import "pkg/proto/configuration/cloud/aws/aws.proto";
import "pkg/proto/configuration/cloud/gcp/gcp.proto";
import "pkg/proto/configuration/digest/digest.proto";
import "pkg/proto/configuration/grpc/grpc.proto";
import "pkg/proto/configuration/http/http.proto";
option go_package = "github.com/buildbarn/bb-storage/pkg/proto/configuration/blobstore";
// Storage configuration for Bazel Buildbarn.
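//
// Below is a minimal sketch of this message, written in the
// Jsonnet/protojson style used by Buildbarn configuration files, as it
// typically appears under the 'blobstore' key of a bb_storage
// configuration file. The address is hypothetical, and the
// completeness checking decorator is applied to the Action Cache as
// recommended further down in this file:
//
//   blobstore: {
//     contentAddressableStorage: { grpc: { address: 'storage:8981' } },
//     actionCache: {
//       completenessChecking: {
//         backend: { grpc: { address: 'storage:8981' } },
//         maximumTotalTreeSizeBytes: 64 * 1024 * 1024,
//       },
//     },
//   }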
message BlobstoreConfiguration {
// Storage configuration for the Content Addressable Storage (CAS).
BlobAccessConfiguration content_addressable_storage = 1;
// Storage configuration for the Action Cache (AC).
BlobAccessConfiguration action_cache = 2;
}
message BlobAccessConfiguration {
oneof backend {
// Cache reads from a slow remote storage backend into a fast
// local storage backend.
ReadCachingBlobAccessConfiguration read_caching = 4;
// Read objects from/write objects to a GRPC service that
// implements the remote execution protocol.
buildbarn.configuration.grpc.ClientConfiguration grpc = 7;
// Always fail with a fixed error response.
google.rpc.Status error = 8;
// Fan out requests across multiple storage backends to spread
// out load.
ShardingBlobAccessConfiguration sharding = 9;
// Store blobs in two backends. Blobs present in exactly one backend
// are automatically replicated to the other backend.
//
// This backend does not guarantee high availability, as it does not
// function in case one backend is unavailable. Crashed backends
// need to be replaced with functional empty instances. These will
// be refilled automatically.
MirroredBlobAccessConfiguration mirrored = 14;
// Store blobs on the local system.
LocalBlobAccessConfiguration local = 15;
// Cache knowledge of which blobs exist locally.
//
// Bazel doesn't have a client-side cache with knowledge of which
// objects are present inside a remote cache. This means that it
// will often call ContentAddressableStorage.FindMissingBlobs() with
// sets that have a strong overlap with what was requested
// previously.
//
// This decorator can be used to introduce such a cache server side.
// It is especially useful for multi-level storage setups. It can
// cause a reduction in load on storage nodes when this cache is
// enabled on frontend nodes.
//
// It only makes sense to use this decorator for the Content
// Addressable Storage, as FindMissingBlobs() is never called
// against the Action Cache. The storage backend must also be robust
// enough to guarantee that objects don't disappear shortly after
// calling ContentAddressableStorage.FindMissingBlobs(), as that
// would cause this decorator to cache invalid data.
ExistenceCachingBlobAccessConfiguration existence_caching = 16;
// Only return ActionResult messages for which all output files are
// present in the Content Addressable Storage (CAS). Certain
// clients, such as Bazel, require the use of this decorator. To
// reduce latency, it is advised that this decorator is used at the
// lowest level that has a full view of the entire CAS.
//
// This decorator must be placed on the Action Cache.
CompletenessCheckingBlobAccessConfiguration completeness_checking = 17;
// Fall back to reading data from a secondary backend when not found
// in the primary backend. Data is written to the primary backend only.
//
// This backend can be used to integrate external data sets into the
// system, e.g. by combining it with reference_expanding.
ReadFallbackBlobAccessConfiguration read_fallback = 18;
// Load Reference messages from an Indirect Content Addressable
// Storage (ICAS). Expand them by fetching the object from the
// location stored in the Reference message. This backend is only
// supported for the CAS.
//
// This backend can be used to integrate external data sets into the
// system by combining it with read_fallback.
ReferenceExpandingBlobAccessConfiguration reference_expanding = 19;
// Demultiplex requests across multiple storage backends, based on
// the instance name prefix.
//
// The logic for matching incoming requests and mutating the
// instance name in outgoing requests is identical to bb_storage's
// 'schedulers' configuration option.
DemultiplexingBlobAccessConfiguration demultiplexing = 20;
// Read objects using instance names in a hierarchical fashion. This
// means that if an object is written using instance name "foo/bar",
// it will be possible to read it using instance names "foo/bar",
// "foo/bar/baz", "foo/bar/baz/qux", but not instance names "",
// "foo", "foo/xyzzy". In other words, non-empty instance names will
// have contents inherited from their parent instance names. In case
// multiple instance names contain an object of a given digest, the
// one with the longest instance name is preferred.
//
// For the Action Cache (AC), it is recommended that this decorator
// is placed above CompletenessCheckingBlobAccess. This ensures that
// resolution continues, even if one or more instance names store an
// incomplete ActionResult.
//
// For every read operation, this decorator may generate a linear
// number of operations against the backend, based on the number of
// components in the instance name. This is acceptable for
// low-throughput data stores such as the Action Cache (AC) and
// Initial Size Class Cache (ISCC). For the Content Addressable
// Storage (CAS), this approach tends to be too inefficient. For the
// CAS, it would also be better to prefer the object with the
// shortest instance name, so that sharing of data between instance
// names is maximised. This is why this implementation does not
// allow enabling this option for the CAS. It is recommended that
// the LocalBlobAccessConfiguration.hierarchical_instance_names
// option is used instead.
BlobAccessConfiguration hierarchical_instance_names = 21;
// Hide ActionResult messages in the Action Cache (AC) where the
// 'worker_completed_timestamp' field in the ExecutedActionMetadata
// is too far in the past. This decorator can be used to ensure that
// all targets are rebuilt periodically.
ActionResultExpiringBlobAccessConfiguration action_result_expiring = 22;
// Send read traffic to a read-only replica, while sending write
// traffic to a source of truth. Read traffic may be sent to the
// source of truth if the replica is unavailable.
//
// By default, all requests are sent to the source. For read
// requests, this backend periodically sends a single canary request
// to the replica. Upon success, all subsequent read requests are
// sent to the replica as well. Upon failure, all requests will
// continue to go to the source.
//
// Only infrastructure errors (RPCs failing with INTERNAL,
// UNAVAILABLE and UNKNOWN) are considered failures.
ReadCanaryingBlobAccessConfiguration read_canarying = 23;
// Read objects from a ZIP file. Example use cases of this backend
// include the following:
//
// - When used in combination with ReadFallbackBlobAccess, it may be
// used to augment a data store with a set of objects that are
// guaranteed to remain present.
// - It may be used to access historical build actions that have
// been archived, so that they can be inspected or rerun.
//
// If this backend is used as a Content Addressable Storage (CAS),
// it will search for files named:
//
// ${digestFunction}-${hash}-${sizeBytes}
//
// For other storage types it will search for files named:
//
// ${digestFunction}-${hash}-${sizeBytes}-${instanceName}
ZIPBlobAccessConfiguration zip_reading = 24;
// Write objects to an uncompressed ZIP file. The resulting ZIP
// files can be read back using the 'zip_reading' option.
//
// This backend does not support reopening existing ZIP files. ZIP
// files will always be truncated upon startup. The trailing central
// directory is only written upon graceful termination, meaning that
// interrupting execution will create a malformed ZIP file.
ZIPBlobAccessConfiguration zip_writing = 25;
// Prevent repetition in the BlobAccess configuration by introducing
// one or more BlobAccess objects that can later be referred to
// using string labels.
//
// This option does not introduce a new kind of backend; it's
// merely present to allow creating BlobAccess setups that are DAG
// (Directed Acyclic Graph) shaped, as opposed to just trees.
WithLabelsBlobAccessConfiguration with_labels = 26;
// Refer to a BlobAccess object declared through 'with_labels'.
string label = 27;
}
// Was 'redis'. Instead of using Redis, one may run a separate
// instance of bb_storage that uses the 'local' backend.
reserved 2;
// Was 'http'. Using HTTP as a transport for REv2 is suboptimal, as
// it does not provide any facilities for performing batch existence
// checks. Please use 'grpc' instead.
reserved 3;
// Was 'size_distinguishing'. This was mainly of use with the
// initial versions of Buildbarn, where the recommended approach for
// storing CAS objects was to store small objects in Redis and large
// objects in S3. Unlike the 'redis' backend, the 'local' backend is
// capable of storing objects of any size.
reserved 5;
// Was 'circular' (CircularBlobAccess). This backend has been replaced
// by 'local' (LocalBlobAccess).
reserved 6;
// Was 'cloud' (CloudBlobAccess for systems such as S3 and GCS). This
// backend has been removed for several reasons:
//
// - Compared to other storage backends, its time to first byte (TTFB)
// was relatively high, making it unattractive for storing
// everything but large Content Addressable Storage (CAS) objects.
// - The lack of efficient bulk operations meant that
// FindMissingBlobs() performance was very poor.
// - The consistency guarantees provided by many bucket
// implementations, most notably Amazon S3, are too weak for build
// clients to function properly.
//
// Users are instructed to migrate to LocalBlobAccess in combination
// with ShardingBlobAccess and MirroredBlobAccess. More details can be
// found in the following Architecture Decision Record (ADR):
//
// https://github.com/buildbarn/bb-adrs/blob/master/0002-storage.md
//
// If S3 was mainly used to integrate existing large corpora into the
// CAS, it may be sufficient to use ReferenceExpandingBlobAccess
// instead. More details about that can be found in this ADR:
//
// https://github.com/buildbarn/bb-adrs/blob/master/0004-icas.md
reserved 10;
}
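// A minimal sketch of a read caching setup in the Jsonnet/protojson
// style: a remote gRPC backend acts as the source of truth, while a
// small in-memory LocalBlobAccess serves as the cache. The address and
// sizes are hypothetical:
//
//   readCaching: {
//     slow: { grpc: { address: 'storage.example.com:8981' } },
//     fast: {
//       'local': {
//         keyLocationMapInMemory: { entries: 1000000 },
//         keyLocationMapMaximumGetAttempts: 16,
//         keyLocationMapMaximumPutAttempts: 64,
//         oldBlocks: 8,
//         currentBlocks: 24,
//         newBlocks: 3,
//         blocksInMemory: { blockSizeBytes: 256 * 1024 * 1024 },
//       },
//     },
//     replicator: { 'local': {} },
//   }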
message ReadCachingBlobAccessConfiguration {
// A remote storage backend that can only be accessed slowly. This
// storage backend is treated as the source of truth. Write
// operations are forwarded to this backend.
BlobAccessConfiguration slow = 1;
// A local storage backend that can be accessed quickly. This
// storage backend is treated as a cache. Objects will only be
// written into it when requested for reading.
BlobAccessConfiguration fast = 2;
// The replication strategy that should be used to copy objects from
// the slow backend to the fast backend.
BlobReplicatorConfiguration replicator = 3;
}
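// A minimal sketch in the Jsonnet/protojson style: two equally sized
// shards, followed by a drained shard that reserves keyspace for
// future growth. The addresses are hypothetical, and the hash
// initialization should be replaced by a random 64-bit value of your
// own:
//
//   sharding: {
//     hashInitialization: 11315743989859790536,
//     shards: [
//       { backend: { grpc: { address: 'storage-0:8981' } }, weight: 1 },
//       { backend: { grpc: { address: 'storage-1:8981' } }, weight: 1 },
//       { weight: 2 },  // Drained; no backend configured.
//     ],
//   }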
message ShardingBlobAccessConfiguration {
message Shard {
// Storage backend that is used by this shard. Omitting this
// causes the implementation to assume this shard is drained.
// Requests to this shard will be spread out across the other
// shards.
BlobAccessConfiguration backend = 1;
// Non-zero ratio of how many keys are allocated to this shard.
// When all shards have equal specifications (i.e., capacity and
// bandwidth), every shard may have a weight of one.
//
// For the backend selection algorithm to run quickly, it is not
// advised to let the total weight of drained backends
// strongly exceed the total weight of undrained ones.
uint32 weight = 2;
}
// Initialization for the hashing algorithm used to partition the
// key space. This should be a random 64-bit value that is unique to
// this deployment. Failure to do so may result in poor distribution
// in case sharding is nested.
//
// Changing this value will in effect cause a full repartitioning of
// the data.
uint64 hash_initialization = 1;
// Shards to which requests are routed. To reduce the need for full
// repartitioning of the data when growing a cluster, it's possible
// to terminate this list with a drained backend that increases the
// total weight up to a given number. Newly added backends may
// allocate their weight from this backend, thereby causing most of
// the keyspace to still be routed to its original backend.
repeated Shard shards = 2;
}
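// A minimal sketch in the Jsonnet/protojson style, using client-side
// ('local') replication in both directions. The addresses are
// hypothetical:
//
//   mirrored: {
//     backendA: { grpc: { address: 'storage-a:8981' } },
//     backendB: { grpc: { address: 'storage-b:8981' } },
//     replicatorAToB: { 'local': {} },
//     replicatorBToA: { 'local': {} },
//   }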
message MirroredBlobAccessConfiguration {
// Primary backend.
BlobAccessConfiguration backend_a = 1;
// Secondary backend.
BlobAccessConfiguration backend_b = 2;
// The replication strategy that should be used to copy objects from
// the primary backend to the secondary backend in case of
// inconsistencies.
BlobReplicatorConfiguration replicator_a_to_b = 3;
// The replication strategy that should be used to copy objects from
// the secondary backend to the primary backend in case of
// inconsistencies.
BlobReplicatorConfiguration replicator_b_to_a = 4;
}
// LocalBlobAccess stores all data onto disk in blocks. A blob
// cannot span multiple blocks, meaning that blocks generally need to
// be large in size (gigabytes). The number of blocks may be relatively
// low. For example, for a 512 GiB cache, it is acceptable to create 32
// blocks of 16 GiB in size.
//
// Blocks are partitioned into three groups based on their creation
// time, named "old", "current" and "new". Blobs provided to Put() will
// always be stored in a block in the "new" group. When the oldest block
// in the "new" group becomes full, it is moved to the "current" group.
// This causes the oldest block in the "current" group to be displaced
// to the "old" group. The oldest block in the "old" group is discarded.
//
// The difference between the "current" group and the "old" group is
// that data is refreshed when accessed. Data in the "old" group is at
// risk of being removed in the near future, which is why it needs to
// be copied into the "new" group when requested to be retained. Data
// in the "current" group is assumed to remain present for the time
// being, which is why it is left in place.
//
// Below is an illustration of how the blocks of data may be laid out at
// a given point in time. Every column of █ characters corresponds to a
// single block. The number of characters indicates the amount of data
// stored within.
//
// ← Over time, blocks move from "new" to "current" to "old" ←
//
// Old Current New
// █ █ █ █ │ █ █ █ █ █ █ █ █ │
// █ █ █ █ │ █ █ █ █ █ █ █ █ │
// █ █ █ █ │ █ █ █ █ █ █ █ █ │
// █ █ █ █ │ █ █ █ █ █ █ █ █ │
// █ █ █ █ │ █ █ █ █ █ █ █ █ │ █
// █ █ █ █ │ █ █ █ █ █ █ █ █ │ █
// █ █ █ █ │ █ █ █ █ █ █ █ █ │ █ █
// █ █ █ █ │ █ █ █ █ █ █ █ █ │ █ █ █
// ↓ ↓ ↓ ↓ ↑ ↑ ↑ ↑
// └─┴─┴─┴─────────────────────┴─┴─┴─┘
// Data gets copied from "old" to "new" when requested.
//
// Blobs get stored in blocks in the "new" group with an inverse
// exponential probability. This is done to reduce the probability of
// multiple block rotations close after each other, as this might put
// excessive pressure on the garbage collector. Because the placement
// distribution decreases rapidly, having more than three or four "new"
// blocks would be wasteful. Having fewer is also not recommended, as
// that increases the chance of placing objects that are used together
// inside the same block. This may cause 'tidal waves' of I/O whenever
// such data ends up in the "old" group at once.
//
// After initialization, there will be fewer blocks in the "current"
// group than configured, due to there simply being no data. This is
// compensated by adding more blocks to the "new" group. Unlike the
// regular blocks in this group, these will have a uniform placement
// distribution that is twice as high as normal. This is done to ensure
// the "current" blocks are randomly seeded to reduce 'tidal waves'
// later on.
//
// The number of blocks in the "old" group should not be too low, as
// this would cause this storage backend to become a FIFO instead of
// being LRU-like. Setting it too high is also not recommended, as this
// would increase redundancy in the data stored. The "current" group
// should likely be two or three times as large as the "old" group.
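//
// Below is a minimal configuration sketch in the Jsonnet/protojson
// style used by Buildbarn configuration files, storing both the
// key-location map and the blocks on files used as block devices, with
// persistency enabled. The paths and sizes are hypothetical:
//
//   'local': {
//     keyLocationMapOnBlockDevice: {
//       file: { path: '/storage/key_location_map', sizeBytes: 256 * 1024 * 1024 },
//     },
//     keyLocationMapMaximumGetAttempts: 16,
//     keyLocationMapMaximumPutAttempts: 64,
//     oldBlocks: 8,
//     currentBlocks: 24,
//     newBlocks: 3,
//     blocksOnBlockDevice: {
//       source: { file: { path: '/storage/blocks', sizeBytes: 512 * 1024 * 1024 * 1024 } },
//       spareBlocks: 3,
//     },
//     persistent: {
//       stateDirectoryPath: '/storage/persistent_state',
//       minimumEpochInterval: '300s',
//     },
//   }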
message LocalBlobAccessConfiguration {
// Was 'digest_location_map_size'. This option has been moved to
// 'key_location_map_in_memory.entries'.
reserved 1;
message KeyLocationMapInMemory {
// The key-location map is a hash table that is used by this storage
// backend to resolve digests to locations where data is stored.
// This option determines the size of this hash table. Because
// entries are small (about 64 bytes in size), it is recommended to
// make this map relatively large to reduce collisions.
//
// Recommended value: between 2 and 10 times the expected number of
// objects stored.
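//
// For example, under these recommendations a deployment expected to
// hold 10 million objects could use 50 million entries (factor 5),
// consuming roughly 50M * 64 B = 3.2 GB of memory.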
int64 entries = 1;
}
// Data store for the metadata of objects. The following Prometheus
// queries may be used to determine whether insertion into the
// key-location map caused other entries to be displaced prematurely:
//
// buildbarn_blobstore_hashing_key_location_map_put_iterations_count{outcome="TooManyAttempts"}
// buildbarn_blobstore_hashing_key_location_map_put_too_many_iterations_total
//
// If these queries yield values greater than zero, you may need to
// increase this data store's size (or reduce the size of the blocks
// backend).
//
// Note that restarting bb_storage causes these metrics to be reset,
// meaning that you may need to run bb_storage for a longer amount of
// time to get better insight into whether objects are discarded
// prematurely.
oneof key_location_map_backend {
// Store the key-location map in memory.
KeyLocationMapInMemory key_location_map_in_memory = 11;
// Store the key-location map on a block device. The size of the
// block device determines the number of entries stored.
buildbarn.configuration.blockdevice.Configuration
key_location_map_on_block_device = 12;
}
// The number of indices a Get() call on the key-location map may
// attempt to access. The lower the utilization rate of the
// key-location map, the lower this value may be set. For example, if
// the size of the key-location map is set in such a way that it is
// only utilized by 10% (factor 0.1), setting this field to 16 means
// there is only a 0.1^16 chance that inserting an entry prematurely
// displaces another object from storage.
//
// Recommended value: 16
uint32 key_location_map_maximum_get_attempts = 2;
// The number of mutations that a Put() on the key-location map may
// perform. Because the key-location map uses a scheme similar to
// Robin Hood hashing, insertions may cause other entries to be
// displaced. Those entries may then cause even more entries to be
// displaced. Because of that, it is recommended to set this field to
// a small multiple of the maximum Get() attempts.
//
// Recommended value: 64
int64 key_location_map_maximum_put_attempts = 3;
// The number of blocks, where attempting to access any data stored
// within will cause it to be refreshed (i.e., copied into new
// blocks).
//
// Setting the number of old blocks too low may cause builds to fail,
// due to data disappearing prematurely. Setting the number of old
// blocks too high may cause an excessive amount of duplication in the
// data set. For example, if old_blocks == current_blocks + new_blocks,
// there may be a redundancy in the data set up to a factor of two.
//
// Recommended value: 8
int32 old_blocks = 5;
// The number of blocks, where attempting to access data stored within
// will not cause data to be refreshed immediately. The containing
// block will first need to become old for data to be eligible for
// refreshes.
//
// Recommended value: 24
int32 current_blocks = 6;
// The number of blocks where new data needs to be written. It is
// valid to set this to just 1. Setting it to a slightly higher value
// has the advantage that frequently used objects will over time get
// smeared out across the data set. This spreads out the cost of
// refreshing data from old to new blocks.
//
// Because the probability of storing objects in new blocks has an
// inverse exponential distribution, it is not recommended to set this
// to any value higher than 4. Whereas the first new block will at
// times be somewhere between 50% and 100% full, the fourth new block
// will only be between 6.25% and 12.5% full, which is wasteful.
//
// Setting this to any value other than 1 is only supported for the
// Content Addressable Storage (CAS). Other storage types such as the
// Action Cache (AC) need to support updates to existing objects,
// which can only be done reliably if new objects are written into a
// single block.
//
// Recommended value: 3 for the CAS, 1 for other storage types.
int32 new_blocks = 7;
// Was 'instances'. This field no longer needs to be provided, as this
// storage backend is now capable of storing entries for arbitrary
// instance names transparently.
reserved 8;
message BlocksInMemory {
// Data is stored in a list of blocks. The total number of blocks is
// constant over time, with small fluctuations to deal with lingering
// requests when removing a block. This option sets the size of an
// individual block.
//
// Recommended value: (total space available) /
// (old_blocks + current_blocks + new_blocks)
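//
// For example, with the recommended block counts of 8 + 24 + 3 = 35
// blocks, 512 GiB of available space yields blocks of roughly
// 14.6 GiB each.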
int64 block_size_bytes = 1;
}
message BlocksOnBlockDevice {
// The block device where data needs to be stored.
buildbarn.configuration.blockdevice.Configuration source = 1;
// To deal with lingering read requests, a small number of old
// blocks may need to be retained for a short period of time before
// being recycled to store new data. This option determines how many
// of such lingering blocks are allocated.
//
// Unlike in-memory storage, where the block size is configured
// explicitly, block device backed storage automatically infers an
// optimal block size. The block size is equal to:
//
// block_size = (size of block device) /
// (spare_blocks + old_blocks + current_blocks + new_blocks)
//
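// For example, a 1 TiB block device combined with spare_blocks = 3,
// old_blocks = 8, current_blocks = 24 and new_blocks = 3 is divided
// into 38 blocks of roughly 27 GiB each.
//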
// Recommended value: 3
int32 spare_blocks = 2;
// When set, temporarily cache the integrity of data after it's been
// read from the block device. This is a requirement for being able
// to randomly access objects quickly.
//
// The disadvantage of enabling this option is that data corruption
// on the block device may not be detected. It is therefore
// recommended to set the cache duration to a limited value (e.g.,
// "4h").
buildbarn.configuration.digest.ExistenceCacheConfiguration
data_integrity_validation_cache = 3;
}
// Data store for the contents of objects. The following Prometheus
// query may be used to determine the worst-case retention of this
// data store in seconds:
//
// time() -
// buildbarn_blobstore_old_current_new_location_blob_map_last_removed_old_block_insertion_time_seconds
//
// If this query yields a value that is lower than desired, you may
// need to increase this data store's size.
//
// Note that restarting bb_storage causes this metric to be reset,
// meaning that you may need to run bb_storage for a longer amount of
// time to get better insight into the worst-case retention.
oneof blocks_backend {
// Store all data in memory. For larger setups, this may place a lot
// of pressure on Go's garbage collector. It may be necessary to
// reduce the value of GOGC to use this option reliably.
BlocksInMemory blocks_in_memory = 9;
// Store the blocks containing data on a block device.
BlocksOnBlockDevice blocks_on_block_device = 10;
}
message Persistent {
// Path to a directory on disk where metadata is stored, allowing data
// to persist across restarts. This metadata needs to be reloaded on
// startup to be able to access previous data.
//
// This directory will hold a single file named "state", containing
// a Protobuf message of type
// buildbarn.blobstore.local.PersistentState. It is not recommended
// to use this directory for any purpose other than storing the
// persistent state file, as fsync() is called on it regularly.
string state_directory_path = 1;
// The amount of time between fsync() calls against the block device
// used to store blocks of data. Setting this option to a lower
// value reduces the amount of data that may get lost across
// restarts.
//
// This option acts as a lower bound on the amount of time between
// fsync() calls. No calls to fsync() are made if the system is
// idle, nor are multiple calls performed in parallel in case they
// take longer to complete than the configured interval.
//
// Care should be taken that this value is not set too low. Every
// epoch that still references valid data consumes 16 bytes of
// memory and increases the size of the state file by a similar
// amount. This means that if this option is set to '300s', epoch
// bookkeeping consumes up to 12*24*365*16 B = ~1.68 MB of space if
// the system were to operate for a full year without blocks being
// released. Setting this to '1s' blows this up by a factor 300.
//
// Recommended value: '300s'
google.protobuf.Duration minimum_epoch_interval = 2;
}
// When set, persist data across restarts. This feature is only
// available when both the key-location map and blocks are stored on a
// block device.
//
// When not set, data is not persisted. The data store will be empty
// every time the application is restarted. Existing entries in the
// key-location map and data in blocks will be ignored, even if their
// contents are valid.
Persistent persistent = 13;
// For all data stores except for the Content Addressable Storage
// (CAS), this storage backend always fully respects the REv2 instance
// name. This means that every instance name may store a separate copy
// of an object. Reads and writes are fully isolated.
//
// For the Content Addressable Storage, this option determines to what
// extent the instance name should be respected. When set to false,
// the instance name is completely ignored, meaning that all instance
// names share all objects. This is great from a performance point of
// view, as it means that users of multi-tenant setups need to upload
// objects less frequently, and that storage space usage is minimised.
// Unfortunately, it does mean that all tenants can access each
// other's objects once they get their hands on their digests.
//
// When this option is set to true, the instance name is respected in
// a hierarchical fashion. This means that if an object is written
// using instance name "foo/bar", it will be possible to read it using
// instance names "foo/bar", "foo/bar/baz", "foo/bar/baz/qux", but not
// instance names "", "foo", "foo/xyzzy". In other words, non-empty
// instance names will have Content Addressable Storage contents
// inherited from their parent instance names.
//
// This feature is implemented in such a way that object contents are
// still shared across all instance names. Enabling this option does
// not cause more data to be written into blocks, as uploads for
// objects that already exist under another instance name are treated
// as no-ops. It does cause at least a twofold increase in
// key-location map usage to track which instance name prefixes may
// access an object, proportional to the number of instance names
// used.
//
// This option is only supported for the Content Addressable Storage,
// as it is the only data store for which such behaviour can safely be
// provided at the individual storage node level. For the Action
// Cache, you may
// only want to do hierarchical instance name matching at a higher
// level, e.g., on top of CompletenessCheckingBlobAccess. This can be
// achieved by using HierarchicalInstanceNamesBlobAccess.
bool hierarchical_instance_names = 14;
}
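// A minimal sketch in the Jsonnet/protojson style of a frontend that
// caches FindMissingBlobs() results for a gRPC storage backend. The
// address and cache parameters are hypothetical:
//
//   existenceCaching: {
//     backend: { grpc: { address: 'storage:8981' } },
//     existenceCache: {
//       cacheSize: 100000,
//       cacheDuration: '60s',
//       cacheReplacementPolicy: 'LEAST_RECENTLY_USED',
//     },
//   }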
message ExistenceCachingBlobAccessConfiguration {
// The backend for which results of
// ContentAddressableStorage.FindMissingBlobs() need to be cached.
BlobAccessConfiguration backend = 1;
// Parameters for the cache data structure that is used by this
// decorator.
buildbarn.configuration.digest.ExistenceCacheConfiguration existence_cache =
2;
}
message CompletenessCheckingBlobAccessConfiguration {
// The Action Cache (AC) backend from which ActionResult messages are
// loaded.
BlobAccessConfiguration backend = 1;
// The maximum combined size of Tree objects that may be referenced by
// the ActionResult message. ActionResults having output directories
// whose combined size exceeds this limit are suppressed
// (i.e., a NOT_FOUND error will be returned).
//
// This option places a limit on the amount of data that is read from
// the Content Addressable Storage (CAS) while processing a call to
// GetActionResult().
int64 maximum_total_tree_size_bytes = 2;
}
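// A minimal sketch in the Jsonnet/protojson style, combining
// read_fallback with reference_expanding to integrate an external data
// set, as suggested above. Objects found in the secondary backend are
// copied into the primary backend by the client. The addresses are
// hypothetical:
//
//   readFallback: {
//     primary: { grpc: { address: 'storage:8981' } },
//     secondary: {
//       referenceExpanding: {
//         indirectContentAddressableStorage: { grpc: { address: 'icas:8981' } },
//       },
//     },
//     replicator: { 'local': {} },
//   }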
message ReadFallbackBlobAccessConfiguration {
// Backend from which data is attempted to be read first, and to which
// data is written.
BlobAccessConfiguration primary = 1;
// Backend from which data is attempted to be read last.
BlobAccessConfiguration secondary = 2;
// The replication strategy that should be used to copy objects from
// the secondary backend to the primary backend. If unset, objects
// will not be copied.
BlobReplicatorConfiguration replicator = 3;
}
message ReferenceExpandingBlobAccessConfiguration {
// The Indirect Content Addressable Storage (ICAS) backend from which
// Reference objects are loaded.
BlobAccessConfiguration indirect_content_addressable_storage = 1;
// Optional: AWS access options and credentials for objects loaded
// from S3.
buildbarn.configuration.cloud.aws.SessionConfiguration aws_session = 2;
// Optional: Options to be used by the HTTP client.
buildbarn.configuration.http.ClientConfiguration http_client = 3;
// Optional: Google Cloud Platform (GCP) client options for objects
// loaded from GCS. Support for GCS is disabled if left unspecified.
buildbarn.configuration.cloud.gcp.ClientOptionsConfiguration
gcp_client_options = 4;
// Optional: Storage backend to use when Reference objects refer to
// objects stored in another Content Addressable Storage.
BlobAccessConfiguration content_addressable_storage = 5;
}
message BlobReplicatorConfiguration {
oneof mode {
// When blobs are only present in one backend, but not the other,
// they are copied by the client immediately.
//
// Because there is no orchestration between clients, this may for
// certain workloads cause multiple clients to all replicate the
// same objects. Especially for setups with many clients, this could
// put a lot of pressure on storage nodes.
//
// This strategy may still be acceptable for the Action Cache, even
// for larger setups. The Action Cache receives less load than the
// Content Addressable Storage. There is also a lower propbability
// of clients requesting the same object at around the same time.
// Action Cache objects also tend to be relatively small, meaning
// that little space and bandwidth is wasted when replicating
// objects unnecessarily.
google.protobuf.Empty local = 1;
// Instruct an external gRPC service (bb_replicator) to perform
// replications. This is advised for setups with a larger number of
// clients, as a centralized replicator process may deduplicate
// replication actions. This reduces the load on storage nodes.
//
// This strategy is only supported for the Content Addressable
// Storage.
buildbarn.configuration.grpc.ClientConfiguration remote = 2;
// Queue and deduplicate all replication operations prior to
// executing them.
//
// In setups with a high volume of requests, it may normally be
// unsafe to restart a non-persistent storage node. Once the storage
// node would come back online, it would succumb to traffic
// generated by clients to replicate missing data.
//
// By executing all replication operations sequentially, the amount
// of pressure placed on storage nodes is bounded. By letting a
// dedicated bb_replicator instance use this strategy, replication
// throughput is bounded globally.
//
// TODO: This backend shares some overlap with 'deduplicating' and
// 'concurrency_limiting'. Should it be removed in favor of those?
// Right now this backend is more efficient for remote sinks,
// because it doesn't decompose requests for multiple objects.
QueuedBlobReplicatorConfiguration queued = 3;
// No replication will be performed. This can be useful when one
// or more of the backends have their contents managed externally.
google.protobuf.Empty noop = 4;
// Ensure that blobs are not replicated redundantly. Replication
// requests for the same blob are merged. To deal with potential
// race conditions, double check whether the sink already contains a
// blob before copying.
//
// In order to guarantee responsiveness for all callers, this
// replicator decomposes requests for multiple blobs into one
// request per blob. To prevent callers from stalling the
// replication process, it also doesn't stream data back to the
// caller as it is being replicated. This means that blobs are fully
// replicated from the source to the sink, prior to letting the
// caller read the data from the sink at its own pace.
//
// This replicator has been designed to reduce the amount of traffic
// against the source to an absolute minimum, at the cost of
// generating more traffic against the sink. It is recommended to
// use this replicator when the sink is an instance of
// LocalBlobAccess that is embedded into the same process, and blobs
// are expected to be consumed locally.
//
// This strategy is only supported for the Content Addressable
// Storage (CAS) and Indirect Content Addressable Storage (ICAS).
BlobReplicatorConfiguration deduplicating = 5;
// Ensure that the total number of concurrent replication requests
// remains bounded by a constant. By limiting the number of
// concurrent requests issued against a source, network starvation
// may be prevented.
//
// If this replicator is used in combination with 'deduplicating',
// it is recommended that 'deduplicating' is placed on the outside.
// More concretely:
//
// { deduplicating: { concurrencyLimiting: { ... } } }
//
// Otherwise, the concurrency limit will be applied against requests
// that haven't been deduplicated yet, leading to lower concurrency.
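//
// For example, in the Jsonnet/protojson style (the concurrency limit
// is hypothetical):
//
//   { deduplicating: { concurrencyLimiting: {
//     base: { 'local': {} },
//     maximumConcurrency: 16,
//   } } }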
ConcurrencyLimitingBlobReplicatorConfiguration concurrency_limiting = 6;
}
}
message QueuedBlobReplicatorConfiguration {
// Base replication strategy to which calls should be forwarded.
BlobReplicatorConfiguration base = 1;
// Parameters for the cache data structure that is used to
// deduplicate incoming replication operations.
buildbarn.configuration.digest.ExistenceCacheConfiguration existence_cache =
2;
}
message ConcurrencyLimitingBlobReplicatorConfiguration {
// Base replication strategy to which calls should be forwarded.
BlobReplicatorConfiguration base = 1;
// The maximum number of concurrent replication requests that are
// forwarded to the base replication strategy.
int64 maximum_concurrency = 2;
}
message DemultiplexingBlobAccessConfiguration {
// Map of storage backends, where the key corresponds to the instance
// name prefix to match. In case of multiple matches, the storage
// backend with the longest matching prefix is used. The matching
// prefix is removed from the resulting instance name.
//
// For example, if storage backends for instance name prefixes
// "acmecorp" and "acmecorp/rockets" are declared, requests for
// instance name "acmecorp/rockets/mars" will be forwarded to the
// latter. This storage backend will receive requests with instance
// name "mars".
//
// The empty string can be used to match all instance names, thereby
// causing all requests to be forwarded to a single storage backend.
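//
// A sketch of the example above in the Jsonnet/protojson style (the
// addresses are hypothetical):
//
//   demultiplexing: {
//     instanceNamePrefixes: {
//       'acmecorp': {
//         backend: { grpc: { address: 'storage-acmecorp:8981' } },
//       },
//       'acmecorp/rockets': {
//         backend: { grpc: { address: 'storage-rockets:8981' } },
//       },
//     },
//   }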
map<string, DemultiplexedBlobAccessConfiguration> instance_name_prefixes = 1;
}
message DemultiplexedBlobAccessConfiguration {
// The backend to which requests are forwarded.
BlobAccessConfiguration backend = 1;
// Add a prefix to the instance name of all requests forwarded to this
// backend.
string add_instance_name_prefix = 2;
}
message ActionResultExpiringBlobAccessConfiguration {
// The backend to which requests are forwarded.
BlobAccessConfiguration backend = 1;
// The minimum amount of time to pass before an ActionResult expires.
google.protobuf.Duration minimum_validity = 2;
// Maximum amount of jitter to be added to the expiration time. This
// ensures that actions that were built at around the same time don't
// also expire at around the same time, thereby amortizing the rate
// at which actions are rebuilt.
//
// The process for computing the jitter is deterministic, meaning that
// subsequent requests for the same ActionResult still yield the same
// expiration time.
google.protobuf.Duration maximum_validity_jitter = 3;
// The minimum value 'worker_completed_timestamp' should have for it
// to be considered valid. This can be used to fully invalidate the
// contents of the Action Cache (AC) in case its contents have become
// poisoned.
google.protobuf.Timestamp minimum_timestamp = 4;
}
message ReadCanaryingBlobAccessConfiguration {
// The backend that is the source of truth.
BlobAccessConfiguration source = 1;
// The backend that contains a read-only replica of the source.
BlobAccessConfiguration replica = 2;
// Size of the cache that is used to track the availability of the
// replica on a per REv2 instance name basis. This ensures that if the
// replica uses features such as authorization and demultiplexing
// based on instance names, availability is tracked accurately.
//
// Recommended value: 256
int32 maximum_cache_size = 3;
// The validity duration of cache entries. This controls how much time
// may pass without any read traffic before the backend falls back to
// the default state.
//
// Recommended value: 300s
google.protobuf.Duration maximum_cache_duration = 4;
}
message ZIPBlobAccessConfiguration {
// Path of the ZIP file.
string path = 1;
// When set, temporarily cache the integrity of data after it's been
// read from the ZIP file. Once cached, uncompressed files in the ZIP
// file (i.e., ones stored with compression method STORE) may be
// randomly accessed quickly.
//
// The disadvantage of enabling this option is that data corruption in
// the ZIP file may not be detected. It is therefore recommended to
// set the cache duration to a limited value (e.g., "4h").
buildbarn.configuration.digest.ExistenceCacheConfiguration
data_integrity_validation_cache = 2;
}
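// A minimal sketch in the Jsonnet/protojson style: the same labeled
// backend is referenced from multiple places, turning the
// configuration into a DAG rather than a tree. The address is
// hypothetical, and the setup merely illustrates the mechanics of
// label declaration and reference:
//
//   withLabels: {
//     backend: {
//       readFallback: {
//         primary: { label: 'storage' },
//         secondary: {
//           referenceExpanding: {
//             indirectContentAddressableStorage: { label: 'storage' },
//           },
//         },
//       },
//     },
//     labels: {
//       storage: { grpc: { address: 'storage:8981' } },
//     },
//   }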
message WithLabelsBlobAccessConfiguration {
// The backend that should be created, having access to the declared
// labels.
BlobAccessConfiguration backend = 1;
// A map of string labels to backends that can be referenced.
map<string, BlobAccessConfiguration> labels = 2;
}