// src/readers/blockreader.rs
//
// Blocks and BlockReader implementations
// …
pub use crate::common::{
Count,
FPath,
FileOffset,
FileType,
FileSz,
};
use crate::common::{
File,
FileMetadata,
FileOpenOptions,
ResultS3,
};
#[cfg(test)]
use crate::common::{
Bytes,
};
use crate::data::datetime::{
SystemTime,
};
#[allow(unused_imports)]
use crate::printer_debug::printers::{
dpo,
dpn,
dpx,
dpnx,
dpof,
dpnf,
dpxf,
dpnxf,
dp_err,
dp_wrn,
p_err,
p_wrn,
};
use std::borrow::Cow;
use std::collections::{
BTreeMap,
BTreeSet,
HashSet,
};
use std::fmt;
use std::fs::Metadata;
use std::io::{
BufReader,
Error,
ErrorKind,
Result,
Seek,
SeekFrom,
Take,
};
use std::io::prelude::Read;
use std::path::Path;
use std::sync::Arc;
use std::time::Duration;
extern crate lru;
use lru::LruCache;
extern crate mime_guess;
use mime_guess::{
MimeGuess,
};
extern crate more_asserts;
use more_asserts::{
assert_le,
assert_ge,
debug_assert_le,
};
extern crate flate2;
use flate2::read::GzDecoder;
use flate2::GzHeader;
// crate `lzma-rs` is the only pure-Rust crate for LZMA/XZ decompression.
// Other crates interface to liblzma, which is not ideal.
extern crate lzma_rs;
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
/// Block Size in bytes
pub type BlockSz = u64;
/// Byte offset (index) _into_ a `Block`, from the beginning of the `Block`. Zero-based.
pub type BlockIndex = usize;
/// Offset into a file in `Block`s; depends on the runtime `BlockSz` value. Zero-based.
pub type BlockOffset = u64;
/// Block of bytes data read from some file storage
pub type Block = Vec<u8>;
/// thread-safe Atomic Reference Counting Pointer to a `Block`
pub type BlockP = Arc<Block>;
pub type Slices<'a> = Vec<&'a [u8]>;
/// tracker of `BlockP`
pub type Blocks = BTreeMap<BlockOffset, BlockP>;
/// tracker of `BlockOffset`s that have been read
pub type BlocksTracked = BTreeSet<BlockOffset>;
pub type BlocksLRUCache = LruCache<BlockOffset, BlockP>;
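// A hedged sketch (not part of this module's API) of how these aliases
// relate: a byte `FileOffset` maps to a `BlockOffset` and a `BlockIndex`
// by integer arithmetic against the runtime `BlockSz`, assuming
// `FileOffset` is a `u64` byte offset:
//
//     fn block_offset_at(fileoffset: FileOffset, blocksz: BlockSz) -> BlockOffset {
//         fileoffset / blocksz
//     }
//     fn block_index_at(fileoffset: FileOffset, blocksz: BlockSz) -> BlockIndex {
//         (fileoffset % blocksz) as BlockIndex
//     }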
// For cases where reading blocks, lines, or syslines reaches the end of file,
// the value `WriteZero` will be used here to mean "_end of file reached, nothing new_".
// XXX: this is a hack
//#[allow(non_upper_case_globals)]
//pub const EndOfFile: ErrorKind = ErrorKind::WriteZero;
#[allow(non_upper_case_globals)]
pub type ResultS3ReadBlock = ResultS3<BlockP, Error>;
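// A hedged sketch of how a caller might consume a `ResultS3ReadBlock`,
// assuming `ResultS3` exposes `Found(T)`, `Done`, and `Err(E)` variants
// (the "S3" naming and the end-of-file note above suggest three states):
//
//     match blockreader.read_block(blockoffset) {
//         ResultS3ReadBlock::Found(blockp) => { /* process the `&*blockp` bytes */ }
//         ResultS3ReadBlock::Done => { /* end of file; nothing new to read */ }
//         ResultS3ReadBlock::Err(err) => { /* handle the `std::io::Error` */ }
//     }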
/// minimum Block Size (inclusive)
pub const BLOCKSZ_MIN: BlockSz = 1;
/// maximum Block Size (inclusive)
pub const BLOCKSZ_MAX: BlockSz = 0xFFFFFF;
/// default Block Size
pub const BLOCKSZ_DEF: usize = 0xFFFF;
/// data and readers for a gzip `.gz` file
#[derive(Debug)]
pub struct GzData {
/// size of file uncompressed, taken from trailing gzip file data
pub filesz: FileSz,
/// calls to `read` use this
pub decoder: GzDecoder<File>,
/// filename taken from gzip header
pub filename: String,
/// file modified time taken from gzip header
///
/// From https://datatracker.ietf.org/doc/html/rfc1952#page-7
///
/// > MTIME (Modification TIME)
/// > This gives the most recent modification time of the original
/// > file being compressed. The time is in Unix format, i.e.,
/// > seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this
/// > may cause problems for MS-DOS and other systems that use
/// > local rather than Universal time.) If the compressed data
/// > did not come from a file, MTIME is set to the time at which
/// > compression started. MTIME = 0 means no time stamp is
/// > available.
///
pub mtime: u32,
/// CRC32 taken from trailing gzip file data
pub crc32: u32,
}
type BufReaderXz = BufReader<File>;
/// data and readers for a LZMA `.xz` file
#[derive(Debug)]
pub struct XzData {
/// size of file uncompressed
pub filesz: FileSz,
pub bufreader: BufReaderXz,
}
// TODO: 2022/07 it is possible for paths to contain ':'; use '\0' as the
// separator instead, which should never appear in a path. But use ':' when printing paths.
/// separator substring for a filesystem path and subpath within an archive
/// e.g. `path/logs.tar:logs/syslog`
pub const SUBPATH_SEP: char = ':';
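// A minimal sketch of splitting such a combined path (this mirrors the
// `rsplit_once(SUBPATH_SEP)` handling in `BlockReader::new` below):
//
//     let full: &str = "path/logs.tar:logs/syslog";
//     let (path, subpath) = full.rsplit_once(SUBPATH_SEP).unwrap();
//     assert_eq!(path, "path/logs.tar");
//     assert_eq!(subpath, "logs/syslog");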
type TarHandle = tar::Archive::<File>;
/// taken from `tar::Archive::<File>::headers()`
type TarChecksum = u32;
/// taken from `tar::Archive::<File>::headers()`
type TarMTime = u64;
/// data and readers for a file within a `.tar` file
pub struct TarData {
/// size of file unarchived
pub filesz: FileSz,
//pub handle: TarHandle,
/// iteration count of `tar::Archive::entries_with_seek`
pub entry_index: usize,
/// checksum retrieved from tar header
pub checksum: TarChecksum,
/// modified time retrieved from tar header
///
/// from https://www.gnu.org/software/tar/manual/html_node/Standard.html
/// > The mtime field represents the data modification time of the file at
/// > the time it was archived. It represents the integer number of seconds
/// > since January 1, 1970, 00:00 Coordinated Universal Time.
pub mtime: TarMTime,
}
/// A `BlockReader` reads a file in `BlockSz` byte-sized `Block`s. It interfaces
/// with the filesystem (or any other data retrieval method). It handles the
/// lookup and storage of `Block`s of data.
///
/// A `BlockReader` uses its `FileType` to determine how to handle files.
/// This includes reading bytes from files (e.g. `.log`),
/// compressed files (e.g. `.gz`), and archive files (e.g. `.tar`).
///
/// One `BlockReader` corresponds to one file. For archive files, one `BlockReader`
/// handles only one file *within* the archive file.
///
/// A `BlockReader` does not know about `char`s (a `LineReader` does).
///
/// XXX: not a rust "Reader"; does not implement trait `Read`
///
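/// A hedged usage sketch (the `FPath` and `FileType` values shown are
/// illustrative, not taken from this crate's tests):
///
/// ```ignore
/// let mut blockreader = BlockReader::new(
///     FPath::from("/var/log/syslog"), FileType::File, BLOCKSZ_DEF as BlockSz,
/// )?;
/// ```
///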
pub struct BlockReader {
/// Path to file
pub path: FPath,
/// subpath to file, only for `filetype.is_archived()` files
pub subpath: Option<FPath>,
/// File handle
file: File,
/// File.metadata()
///
/// For compressed or archived files, this is the metadata of the
/// compressed or archive file at `path`.
file_metadata: FileMetadata,
/// copy of `self.file_metadata.modified()`, copied during `new()`
///
/// to simplify later retrievals
pub(crate) file_metadata_modified: SystemTime,
/// The `MimeGuess::from_path` result
mimeguess_: MimeGuess,
/// enum that guides file-handling behavior in `read`, `new`
filetype: FileType,
/// For gzipped files (FileType::FileGz), otherwise `None`
gz: Option<GzData>,
/// For LZMA xz files (FileType::FileXz), otherwise `None`
xz: Option<XzData>,
/// for files within a `.tar` file (FileType::FileTar), otherwise `None`
tar: Option<TarData>,
/// The filesz of uncompressed data, set during `new`.
/// Should always be `== gz.unwrap().filesz`.
///
/// Users should always call `filesz()`.
pub(crate) filesz_actual: FileSz,
/// File size in bytes of file at `path`, actual size.
/// For compressed files, this is the size of the file compressed.
/// For the uncompressed size of a compressed file, see `filesz_actual`.
/// Set in `open`.
///
/// For regular files (not compressed or archived),
/// `filesz` and `filesz_actual` will be the same.
///
/// Users should always call `filesz()`.
pub(crate) filesz: FileSz,
/// File size in blocks, set in `open`.
pub(crate) blockn: u64,
/// standard `Block` size in bytes; all `Block`s are this size except the
/// last `Block`, which may be this size or smaller (but never zero).
pub(crate) blocksz: BlockSz,
/// Count of bytes stored by the `BlockReader`.
/// May not match the sum of lengths of all stored `Block`s, as
/// `self.blocks` may have some elements `drop`ped during streaming.
count_bytes_: Count,
/// Storage of blocks `read` from storage. Lookups O(log(n)).
///
/// During file processing, some elements that are not needed may be `drop`ped.
blocks: Blocks,
/// track blocks read in `read_block`. Never drops data.
///
/// useful for when streaming kicks in and some key+value pairs of `self.blocks`
/// have been dropped.
blocks_read: BlocksTracked,
/// internal LRU cache for `fn read_block()`. Lookups O(1).
read_block_lru_cache: BlocksLRUCache,
/// enable/disable use of `read_block_lru_cache`
read_block_lru_cache_enabled: bool,
/// internal LRU cache count of lookup hits
pub(crate) read_block_cache_lru_hit: Count,
/// internal LRU cache count of lookup misses
pub(crate) read_block_cache_lru_miss: Count,
/// internal LRU cache count of lookup `.put`
pub(crate) read_block_cache_lru_put: Count,
/// internal storage count of lookup hit
pub(crate) read_blocks_hit: Count,
/// internal storage count of lookup miss
pub(crate) read_blocks_miss: Count,
/// internal storage count of `self.blocks.insert`
pub(crate) read_blocks_put: Count,
}
impl fmt::Debug for BlockReader {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("BlockReader")
.field("path", &self.path)
.field("file", &self.file)
//.field("file_metadata", &self._file_metadata)
.field("mimeguess", &self.mimeguess_)
.field("filesz", &self.filesz())
.field("blockn", &self.blockn)
.field("blocksz", &self.blocksz)
.field("blocks currently stored", &self.blocks.len())
.field("blocks read", &self.blocks_read.len())
.field("bytes read", &self.count_bytes_)
.field("cache LRU hit", &self.read_block_cache_lru_hit)
.field("miss", &self.read_block_cache_lru_miss)
.field("put", &self.read_block_cache_lru_put)
.field("cache hit", &self.read_blocks_hit)
.field("miss", &self.read_blocks_miss)
.field("insert", &self.read_blocks_put)
.finish()
}
}
/// helper to unpack DWORD unsigned integers found in gzip file data
///
/// XXX: `u32::from_*_bytes` appeared to fail for a test file compressed
///      with GNU gzip 1.10
///
/// TODO: validate the XXX note above; was that diagnosis correct?
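/// A hedged doc sketch: byte-for-byte, this reversal is a little-endian
/// decode, so it should match `u32::from_le_bytes` (which makes the XXX
/// note above worth re-validating):
///
/// ```ignore
/// let buf: [u8; 4] = [0x78, 0x56, 0x34, 0x12];
/// assert_eq!(dword_to_u32(&buf), 0x12345678);
/// assert_eq!(dword_to_u32(&buf), u32::from_le_bytes(buf));
/// ```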
const fn dword_to_u32(buf: &[u8; 4]) -> u32 {
let mut buf_: [u8; 4] = [0; 4];
buf_[0] = buf[3];
buf_[1] = buf[2];
buf_[2] = buf[1];
buf_[3] = buf[0];
u32::from_be_bytes(buf_)
}
/// implement the BlockReader things
impl BlockReader {
/// maximum size of a gzip compressed file that will be processed.
///
/// XXX: The gzip standard stores the uncompressed "media stream" byte size
/// within 32 bits (4 bytes). A larger uncompressed size is stored modulo
/// 2^32; e.g. a 5 GiB stream records 1 GiB in that field. So there is no
/// certain way to determine the size of the "media stream" from the file
/// alone. This terrible hack just aborts processing `.gz` files that might
/// be over that size.
const GZ_MAX_SZ: FileSz = 0x20000000;
/// cache slots for `read_block` LRU cache
const READ_BLOCK_LRU_CACHE_SZ: usize = 4;
/// Create a new `BlockReader`.
///
/// Opens the `path` file, configures settings based on determined `filetype`.
pub fn new(path: FPath, filetype: FileType, blocksz_: BlockSz) -> Result<BlockReader> {
// TODO: how to make some fields `blockn` `blocksz` `filesz` immutable?
// https://stackoverflow.com/questions/23743566/how-can-i-force-a-structs-field-to-always-be-immutable-in-rust
dpn!("BlockReader::new({:?}, {:?}, {:?})", path, filetype, blocksz_);
assert_ne!(0, blocksz_, "Block Size cannot be 0");
assert_ge!(blocksz_, BLOCKSZ_MIN, "Block Size {} is too small", blocksz_);
assert_le!(blocksz_, BLOCKSZ_MAX, "Block Size {} is too big", blocksz_);
// shadow passed immutable with local mutable
let mut path: FPath = path;
let mut subpath_opt: Option<FPath> = None;
if filetype.is_archived() {
dpof!("filetype.is_archived()");
let (path_, subpath_) = match path.rsplit_once(SUBPATH_SEP) {
Some(val) => val,
None => {
dpxf!("filetype {:?} but failed to find delimiter {:?} in {:?}", filetype, SUBPATH_SEP, path);
return Result::Err(
Error::new(
// TODO: use `ErrorKind::InvalidFilename` when it is stable
ErrorKind::NotFound,
format!("Given Filetype {:?} but failed to find delimiter {:?} in {:?}", filetype, SUBPATH_SEP, path)
)
);
}
};
subpath_opt = Some(subpath_.to_string());
path = FPath::from(path_);
}
let path_std: &Path = Path::new(&path);
// TODO: pass in `mimeguess`; avoid repeats of the tedious operation
let mimeguess_: MimeGuess = MimeGuess::from_path(path_std);
let mut open_options = FileOpenOptions::new();
dpof!("open_options.read(true).open({:?})", path);
let file: File = match open_options.read(true).open(path_std) {
Ok(val) => val,
Err(err) => {
dpxf!("return {:?}", err);
return Err(err);
}
};
let mut blocks = Blocks::new();
let mut blocks_read = BlocksTracked::new();
let mut count_bytes_: Count = 0;
let filesz: FileSz;
let mut filesz_actual: FileSz;
let blocksz: BlockSz;
let file_metadata: FileMetadata;
let file_metadata_modified: SystemTime;
let mut gz_opt: Option<GzData> = None;
let mut xz_opt: Option<XzData> = None;
let mut tar_opt: Option<TarData> = None;
let mut read_blocks_put: Count = 0;
match file.metadata() {
Ok(val) => {
filesz = val.len() as FileSz;
file_metadata = val;
file_metadata_modified = match file_metadata.modified() {
Ok(systemtime_) => {
systemtime_
}
Err(err) => {
dpxf!("file_metadata.modified() failed Err {:?}", err);
return Result::Err(err);
}
}
}
Err(err) => {
dpxf!("return {:?}", err);
eprintln!("ERROR: File::metadata() error {}", err);
return Err(err);
}
};
if file_metadata.is_dir() {
dpxf!("return Err(Unsupported)");
return std::result::Result::Err(
Error::new(
//ErrorKind::IsADirectory, // XXX: error[E0658]: use of unstable library feature 'io_error_more'
ErrorKind::Unsupported,
format!("Path is a directory {:?}", path)
)
);
}
match filetype {
FileType::File => {
filesz_actual = filesz;
blocksz = blocksz_;
},
FileType::FileGz => {
blocksz = blocksz_;
dpof!("FileGz: blocksz set to {0} (0x{0:08X}) (passed {1} (0x{1:08X})", blocksz, blocksz_);
// GZIP last 8 bytes:
// 4 bytes (DWORD) is CRC32
// 4 bytes (DWORD) is gzip file uncompressed size
// GZIP binary format https://datatracker.ietf.org/doc/html/rfc1952#page-5
//
// +---+---+---+---+---+---+---+---+
// | CRC32 | SIZE |
// +---+---+---+---+---+---+---+---+
//
// sanity check file size
if filesz < 8 {
dpxf!("FileGz: return Err(InvalidData)");
return Result::Err(
Error::new(
ErrorKind::InvalidData,
format!("gzip file size {:?} is too small for {:?}", filesz, path)
)
);
}
// TODO: [2022/06] it's known that for a file larger than 4GB uncompressed,
// gzip cannot store its filesz accurately, since filesz is stored within 32 bits.
// gzip will only store the rollover (uncompressed filesz % 4GB).
// How to handle large gzipped files correctly?
// First, how to detect that the stored filesz is a rollover value?
// Second, the file could be streamed and the filesz calculated from that
// activity. However, streaming, for example, a 3GB log.gz that uncompresses to
// 10GB is very inefficient.
// Third, similar to "Second" but for very large files, i.e. a 32GB log.gz file, what then?
if filesz > BlockReader::GZ_MAX_SZ {
dpxf!("FileGz: return Err(InvalidData)");
return Result::Err(
Error::new(
// TODO: [2022/06] use `ErrorKind::FileTooLarge` when it is stable
// `ErrorKind::FileTooLarge` causes error:
// use of unstable library feature 'io_error_more'
// see issue #86442 <https://github.com/rust-lang/rust/issues/86442> for more information (rustc E0658)
ErrorKind::InvalidData,
format!("Cannot handle gzip files larger than semi-arbitrary {0} (0x{0:08X}) uncompressed bytes, file is {1} (0x{1:08X}) uncompressed bytes according to gzip header {2:?}", BlockReader::GZ_MAX_SZ, filesz, path),
)
);
}
// create "take handler" that will read 8 bytes as-is (no decompression)
match (&file).seek(SeekFrom::End(-8)) {
Ok(_) => {}
Err(err) => {
dpxf!("FileGz: return Err({})", err);
eprintln!("ERROR: file.SeekFrom(-8) Error {}", err);
return Err(err);
}
};
let mut reader = (&file).take(8);
// extract DWORD for CRC32
let mut buffer_crc32: [u8; 4] = [0; 4];
dpof!("FileGz: reader.read_exact(@{:p}) (buffer len {})", &buffer_crc32, buffer_crc32.len());
match reader.read_exact(&mut buffer_crc32) {
Ok(_) => {}
//Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => {},
Err(err) => {
dpx!("FileGz: return {:?}", err);
eprintln!("reader.read_to_end(&buffer_crc32) Error {:?}", err);
return Err(err);
}
}
dpof!("FileGz: buffer_crc32 {:?}", buffer_crc32);
let crc32 = dword_to_u32(&buffer_crc32);
dpof!("FileGz: crc32 {0} (0x{0:08X})", crc32);
// extract DWORD for SIZE
let mut buffer_size: [u8; 4] = [0; 4];
dpof!("FileGz: reader.read_exact(@{:p}) (buffer len {})", &buffer_size, buffer_size.len());
match reader.read_exact(&mut buffer_size) {
Ok(_) => {}
Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => {},
Err(err) => {
dpx!("FileGz: return {:?}", err);
eprintln!("reader.read_to_end(&buffer_size) Error {:?}", err);
return Err(err);
}
}
dpof!("FileGz: buffer_size {:?}", buffer_size);
let size: u32 = dword_to_u32(&buffer_size);
dpof!("FileGz: file size uncompressed {0:?} (0x{0:08X})", size);
let filesz_uncompressed: FileSz = size as FileSz;
/*
if filesz_uncompressed == 0 {
dpxf!("FileGz: return Err(InvalidData)");
return Result::Err(
Error::new(
ErrorKind::InvalidData,
format!("extracted uncompressed file size value 0, nothing to read {:?}", path),
)
);
}
*/
filesz_actual = filesz_uncompressed;
// reset Seek pointer
// XXX: not sure if this is necessary
match (&file).seek(SeekFrom::Start(0)) {
Ok(_) => {},
Err(err) => {
dpxf!("FileGz: return Err({:?})", err);
return Result::Err(err);
}
}
//let mut open_options = FileOpenOptions::new();
dpof!("FileGz: open_options.read(true).open({:?})", path_std);
let file_gz: File = match open_options.read(true).open(path_std) {
Ok(val) => val,
Err(err) => {
dpxf!("FileGz: open_options.read({:?}) Error, return {:?}", path, err);
return Err(err);
}
};
let decoder: GzDecoder<File> = GzDecoder::new(file_gz);
dpof!("FileGz: {:?}", decoder);
let header_opt: Option<&GzHeader> = decoder.header();
let mut filename: String = String::with_capacity(0);
//
// GZIP binary format https://datatracker.ietf.org/doc/html/rfc1952#page-5
//
// Each member has the following structure:
//
// +---+---+---+---+---+---+---+---+---+---+
// |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
// +---+---+---+---+---+---+---+---+---+---+
//
// MTIME (Modification TIME)
// This gives the most recent modification time of the original
// file being compressed. The time is in Unix format, i.e.,
// seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this
// may cause problems for MS-DOS and other systems that use
// local rather than Universal time.) If the compressed data
// did not come from a file, MTIME is set to the time at which
// compression started. MTIME = 0 means no time stamp is
// available.
//
let mut mtime: u32 = 0;
match header_opt {
Some(header) => {
let filename_: &[u8] = header.filename().unwrap_or(&[]);
filename = match String::from_utf8(filename_.to_vec()) {
Ok(val) => val,
Err(_err) => String::with_capacity(0),
};
mtime = header.mtime();
},
None => {
dpof!("FileGz: GzDecoder::header() is None for {:?}", path);
},
};
gz_opt = Some(
GzData {
filesz: filesz_uncompressed,
decoder,
filename,
mtime,
crc32,
}
);
dpof!("FileGz: created {:?}", gz_opt);
},
FileType::FileXz => {
blocksz = blocksz_;
dpof!("FileXz: blocksz set to {0} (0x{0:08X}) (passed {1} (0x{1:08X})", blocksz, blocksz_);
dpof!("FileXz: open_options.read(true).open({:?})", path_std);
let mut file_xz: File = match open_options.read(true).open(path_std) {
Ok(val) => val,
Err(err) => {
dpxf!("FileXz: open_options.read({:?}) Error, return {:?}", path, err);
return Err(err);
}
};
//
// Get the .xz file size from XZ header
//
// "bare-bones" implentation of reading xz compressed file
// other availale crates for reading `.xz` files did not meet
// the needs of this program.
//
/*
https://tukaani.org/xz/xz-file-format.txt
1. Byte and Its Representation
In this document, byte is always 8 bits.
A "null byte" has all bits unset. That is, the value of a null
byte is 0x00.
To represent byte blocks, this document uses notation that
is similar to the notation used in [RFC-1952]:
+-------+
| Foo | One byte.
+-------+
+---+---+
| Foo | Two bytes; that is, some of the vertical bars
+---+---+ can be missing.
+=======+
| Foo | Zero or more bytes.
+=======+
2. Overall Structure of .xz File
A standalone .xz files consist of one or more Streams which may
have Stream Padding between or after them:
+========+================+========+================+
| Stream | Stream Padding | Stream | Stream Padding | ...
+========+================+========+================+
2.1. Stream
+-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+
| Stream Header | Block | Block | ... | Block |
+-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+
2.1.1. Stream Header
+---+---+---+---+---+---+-------+------+--+--+--+--+
| Header Magic Bytes | Stream Flags | CRC32 |
+---+---+---+---+---+---+-------+------+--+--+--+--+
3. Block
+==============+=================+===============+=======+
| Block Header | Compressed Data | Block Padding | Check |
+==============+=================+===============+=======+
3.1. Block Header
+-------------------+-------------+=================+
| Block Header Size | Block Flags | Compressed Size |
+-------------------+-------------+=================+
+===================+======================+
---> | Uncompressed Size | List of Filter Flags |
+===================+======================+
+================+--+--+--+--+
---> | Header Padding | CRC32 |
+================+--+--+--+--+
3.1.1. Block Header Size
This field overlaps with the Index Indicator field (see
Section 4.1).
This field contains the size of the Block Header field,
including the Block Header Size field itself. Valid values are
in the range [0x01, 0xFF], which indicate the size of the Block
Header as multiples of four bytes, minimum size being eight
bytes:
real_header_size = (encoded_header_size + 1) * 4;
If a Block Header bigger than 1024 bytes is needed in the
future, a new field can be added between the Block Header and
Compressed Data fields. The presence of this new field would
be indicated in the Block Header field.
3.1.2. Block Flags
The Block Flags field is a bit field:
Bit(s) Mask Description
0-1 0x03 Number of filters (1-4)
2-5 0x3C Reserved for future use; MUST be zero for now.
6 0x40 The Compressed Size field is present.
7 0x80 The Uncompressed Size field is present.
If any reserved bit is set, the decoder MUST indicate an error.
It is possible that there is a new field present which the
decoder is not aware of, and can thus parse the Block Header
incorrectly.
3.1.3. Compressed Size
This field is present only if the appropriate bit is set in
the Block Flags field (see Section 3.1.2).
The Compressed Size field contains the size of the Compressed
Data field, which MUST be non-zero. Compressed Size is stored
using the encoding described in Section 1.2. If the Compressed
Size doesn't match the size of the Compressed Data field, the
decoder MUST indicate an error.
3.1.4. Uncompressed Size
This field is present only if the appropriate bit is set in
the Block Flags field (see Section 3.1.2).
The Uncompressed Size field contains the size of the Block
after uncompressing. Uncompressed Size is stored using the
encoding described in Section 1.2. If the Uncompressed Size
does not match the real uncompressed size, the decoder MUST
indicate an error.
Storing the Compressed Size and Uncompressed Size fields serves
several purposes:
- The decoder knows how much memory it needs to allocate
for a temporary buffer in multithreaded mode.
- Simple error detection: wrong size indicates a broken file.
- Seeking forwards to a specific location in streamed mode.
It should be noted that the only reliable way to determine
the real uncompressed size is to uncompress the Block,
because the Block Header and Index fields may contain
(intentionally or unintentionally) invalid information.
*/
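// The parse below reads, in order: 6 stream header magic bytes,
// 2 stream header flag bytes, 4 stream header CRC32 bytes,
// 1 block header size byte, and 1 block header flags byte;
// the `take(6 + 2 + 4 + 1 + 1)` handle below covers exactly these 14 bytes.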
// create "take handler" that will read bytes as-is (no decompression)
match (&file_xz).seek(SeekFrom::Start(0)) {
Ok(_) => {},
Err(err) => {
dpxf!("FileXz: return Err({})", err);
eprintln!("ERROR: file.SeekFrom(0) Error {}", err);
return Err(err);
},
};
let mut reader = (&file_xz).take(6 + 2 + 4 + 1 + 1);
// stream header magic bytes
let mut buffer_: [u8; 6] = [0; 6];
match reader.read_exact(&mut buffer_) {
Ok(_) => {},
Err(err) => {
dpxf!("FileXz: return {:?}", err);
eprintln!("reader.read_exact() (stream header magic bytes) Error {:?}", err);
return Err(err);
},
}
// magic bytes expected "ý7zXZ\0"
const XZ_MAGIC_BYTES: [u8; 6] = [0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00];
dpof!("FileXz: stream header magic bytes {:?}", buffer_);
if cfg!(debug_assertions) {
for (i, b_) in buffer_.iter().enumerate() {
let b_ex = XZ_MAGIC_BYTES[i];
let c_ex: char = b_ex as char;
let c_: char = (*b_) as char;
dpo!("actual {0:3} (0x{0:02X}) {1:?}", b_, c_);
dpo!("expect {0:3} (0x{0:02X}) {1:?}\n", b_ex, c_ex);
}
}
if buffer_ != XZ_MAGIC_BYTES {
return Result::Err(
Error::new(
ErrorKind::InvalidData,
format!("Failed to find XZ stream header magic bytes for {:?}", path_std)
)
);
}
// stream header flags
let mut buffer_: [u8; 2] = [0; 2];
match reader.read_exact(&mut buffer_) {
Ok(_) => {},
Err(err) => {
dpxf!("FileXz: return {:?}", err);
eprintln!("reader.read_exact() (stream header flags) Error {:?}", err);
return Err(err);
},
}
dpof!("FileXz: buffer {:?}", buffer_);
let _flags: u16 = u16::from_le_bytes(buffer_);
dpof!("FileXz: stream header flags 0b{0:016b}", _flags);
// stream header CRC32
let mut buffer_: [u8; 4] = [0; 4];
match reader.read_exact(&mut buffer_) {
Ok(_) => {},
Err(err) => {
dpxf!("FileXz: return {:?}", err);
eprintln!("reader.read_exact() (stream header CRC32) Error {:?}", err);
return Err(err);
},
}
dpof!("FileXz: buffer {:?}", buffer_);
let _crc32: u32 = u32::from_le_bytes(buffer_);
dpof!("FileXz: stream header CRC32 {0:} (0x{0:08X}) (0b{0:032b})", _crc32);
// block #0 block header size
let mut buffer_: [u8; 1] = [0; 1];
match reader.read_exact(&mut buffer_) {
Ok(_) => {},
Err(err) => {
dpxf!("FileXz: return {:?}", err);
eprintln!("reader.read_exact() (block #0 block header size) Error {:?}", err);
return Err(err);
},
}
dpof!("FileXz: buffer {:?}", buffer_);
let _bhsz: u8 = buffer_[0];
dpof!("FileXz: block #0 block header size {0:} (0x{0:02X})", _bhsz);
// block #0 block header flags
let mut buffer_: [u8; 1] = [0; 1];
match reader.read_exact(&mut buffer_) {
Ok(_) => {},
Err(err) => {
dpxf!("FileXz: return {:?}", err);
eprintln!("reader.read_exact() (block #0 block header flags) Error {:?}", err);
return Err(err);
},
}
dpof!("FileXz: buffer {:?}", buffer_);
let _bhflags: u8 = buffer_[0];
dpof!("FileXz: block #0 block header flags {0:} (0x{0:02X}) (0b{0:08b})", _bhflags);
// reset Seek pointer
match file_xz.seek(SeekFrom::Start(0)) {
Ok(_) => {},
Err(err) => {
dpxf!("FileXz: return {:?}", err);
eprintln!("file_xz.seek() (block #0 block header flags) Error {:?}", err);
return Err(err);
}
}
let mut bufreader: BufReaderXz = BufReaderXz::new(file_xz);
// XXX: THIS IS A TERRIBLE HACK!
// read the entire file into blocks in one go!
// this remains until reading the header/blocks
// of the underlying .xz file is implemented
#[allow(clippy::never_loop)]
loop {
let mut buffer = Block::new();
dpof!("FileXz: xz_decompress({:?}, buffer (len {}, capacity {}))", bufreader, buffer.len(), buffer.capacity());
// XXX: xz_decompress may resize the passed `buffer`
match lzma_rs::xz_decompress(&mut bufreader, &mut buffer) {
Ok(_) => {
dpof!("FileXz: xz_decompress returned buffer len {}, capacity {}", buffer.len(), buffer.capacity());
},
Err(err) => {
match &err {
lzma_rs::error::Error::IoError(ref ioerr) => {
dpof!("FileXz: ioerr.kind() {:?}", ioerr.kind());
if ioerr.kind() == ErrorKind::UnexpectedEof {
dpof!("FileXz: xz_decompress Error UnexpectedEof, break!");
break;
}
}
err_ => {
dpof!("FileXz: err {:?}", err_);
},
}
dpxf!("FileXz: xz_decompress Error, return Err({:?})", err);
return Err(
Error::new(
ErrorKind::Other,
format!("{:?}", err),
)
);
}
}
if buffer.is_empty() {
dpof!("buffer.is_empty()");
break;
}
let blocksz_u: usize = blocksz as usize;
let mut blockoffset: BlockOffset = 0;
// slice the decompressed `buffer` into `Block`s
// (compare byte offsets so a `buffer` whose length is an exact multiple
// of `blocksz` does not yield an empty trailing block)
while ((blockoffset as usize) * blocksz_u) < buffer.len() {
let mut block: Block = Block::with_capacity(blocksz_u);
let a: usize = (blockoffset * blocksz) as usize;
let b: usize = a + (std::cmp::min(blocksz_u, buffer.len() - a));
dpof!("FileXz: block.extend_from_slice(&buffer[{}‥{}])", a, b);
block.extend_from_slice(&buffer[a..b]);
let blockp: BlockP = BlockP::new(block);
if let Some(bp_) = blocks.insert(blockoffset, blockp.clone()) {
eprintln!("WARNING: blockreader.blocks.insert({}, BlockP@{:p}) already had a entry BlockP@{:p}", blockoffset, blockp, bp_);
}
read_blocks_put += 1;
count_bytes_ += (*blockp).len() as Count;
blocks_read.insert(blockoffset);
blockoffset += 1;
}
break;
}
let filesz_uncompressed: FileSz = count_bytes_ as FileSz;
filesz_actual = filesz_uncompressed;
xz_opt = Some(
XzData {
filesz: filesz_uncompressed,
bufreader,
}
);
dpof!("FileXz: created {:?}", xz_opt.as_ref().unwrap());
},
FileType::FileTar => {
blocksz = blocksz_;
dpof!("FileTar: blocksz set to {0} (0x{0:08X}) (passed {1} (0x{1:08X})", blocksz, blocksz_);
filesz_actual = 0;
let mut checksum: TarChecksum = 0;
let mut mtime: TarMTime = 0;
let subpath: &String = subpath_opt.as_ref().unwrap();
let mut archive: TarHandle = BlockReader::open_tar(&path_std)?;
let entry_iter: tar::Entries<File> = match archive.entries_with_seek() {
Ok(val) => {
val
},
Err(err) => {
dpxf!("FileTar: Err {:?}", err);
return Result::Err(err);
}
};
let mut entry_index: usize = 0;
for (index, entry_res) in entry_iter.enumerate() {
entry_index = index;
let entry: tar::Entry<File> = match entry_res {
Ok(val) => val,
Err(err) => {
dpof!("FileTar: entry Err {:?}", err);
continue;
}
};
let subpath_cow: Cow<Path> = match entry.path() {
Ok(val) => val,
Err(err) => {
dpof!("FileTar: entry.path() Err {:?}", err);
continue;
}
};
let subfpath: FPath = subpath_cow.to_string_lossy().to_string();
if subpath != &subfpath {
dpof!("FileTar: skip {:?}", subfpath);
continue;
}
// found the matching subpath
dpof!("FileTar: found {:?}", subpath);
filesz_actual = match entry.header().size() {
Ok(val) => val,
Err(err) => {
dpxf!("FileTar: entry.header().size() Err {:?}", err);
return Result::Err(err);
}
};
checksum = match entry.header().cksum() {
Ok(val) => val,
Err(err) => {
dpo!("FileTar: entry.header().cksum() Err {:?}", err);
0
}
};
mtime = match entry.header().mtime() {
Ok(val) => val,
Err(err) => {
dpo!("FileTar: entry.header().mtime() Err {:?}", err);