diff --git a/azure-pipeline-templates/e2e-tests-block-cache-data-integrity.yml b/azure-pipeline-templates/e2e-tests-block-cache-data-integrity.yml
new file mode 100644
index 000000000..e985245d0
--- /dev/null
+++ b/azure-pipeline-templates/e2e-tests-block-cache-data-integrity.yml
@@ -0,0 +1,230 @@
+# Block-cache data-integrity test. Random files are written through a read-write
+# file-cache mount and hashed; the same blobs are then re-read through read-only
+# block-cache mounts (default, then direct-io with --block-cache-path) and the job
+# fails if any md5sum differs from the reference taken on the read-write mount.
+# Every step uses the pipeline variable WORK_DIR for the blobfuse2 binary, testdata
+# and scratch md5sum files.
+# NOTE(review): conf_template, adls, distro_name, quick_test and clone are declared
+# (and passed by the nightly pipeline) but are not referenced by any step here.
+parameters:
+  - name: conf_template
+    type: string
+  - name: config_file
+    type: string
+  - name: container
+    type: string
+  - name: temp_dir
+    type: string
+  - name: mount_dir
+    type: string
+  - name: idstring
+    type: string
+  - name: adls
+    type: boolean
+  - name: account_name
+    type: string
+  - name: account_key
+    type: string
+  - name: account_type
+    type: string
+  - name: account_endpoint
+    type: string
+  - name: distro_name
+    type: string
+  - name: quick_test
+    type: boolean
+    default: true
+  - name: verbose_log
+    type: boolean
+    default: false
+  - name: clone
+    type: boolean
+    default: false
+  - name: block_size_mb
+    type: string
+    default: "8"
+
+steps:
+  # Phase 1: create the reference data set through a read-write file-cache mount.
+  - script: |
+      $(WORK_DIR)/blobfuse2 gen-test-config --config-file=$(WORK_DIR)/testdata/config/azure_key.yaml --container-name=${{ parameters.container }} --temp-path=${{ parameters.temp_dir }} --output-file=${{ parameters.config_file }}
+    displayName: 'Create Config File for RW mount'
+    env:
+      NIGHTLY_STO_ACC_NAME: ${{ parameters.account_name }}
+      NIGHTLY_STO_ACC_KEY: ${{ parameters.account_key }}
+      ACCOUNT_TYPE: ${{ parameters.account_type }}
+      ACCOUNT_ENDPOINT: ${{ parameters.account_endpoint }}
+      VERBOSE_LOG: ${{ parameters.verbose_log }}
+    continueOnError: false
+
+  - script: |
+      cat ${{ parameters.config_file }}
+    displayName: 'Print config file'
+
+  - template: 'mount.yml'
+    parameters:
+      working_dir: $(WORK_DIR)
+      mount_dir: ${{ parameters.mount_dir }}
+      temp_dir: ${{ parameters.temp_dir }}
+      prefix: ${{ parameters.idstring }}
+      mountStep:
+        script: |
+          $(WORK_DIR)/blobfuse2 mount ${{ parameters.mount_dir }} --config-file=${{ parameters.config_file }} --default-working-dir=$(WORK_DIR) --file-cache-timeout=3200
+
+  # 10 random-sized files (0 B to 4 GiB), fixed-size .txt/.png files and parquet files.
+  # shuf -n 10 draws distinct sizes, so the size-derived file names cannot collide.
+  - script: |
+      shuf -i 0-4294967296 -n 10 | parallel --will-cite -j 5 'head -c {} < /dev/urandom > ${{ parameters.mount_dir }}/datafiles_{}'
+      for i in {1,2,3,4,5,6,7,8,9,10,20,30,50,100,200}; do echo $i; done | parallel --will-cite -j 5 'head -c {}M < /dev/urandom > ${{ parameters.mount_dir }}/mixedfiles_{}.txt'
+      for i in {1,2,3,4,5,6,7,8,9,10,20,30,50,100,200}; do echo $i; done | parallel --will-cite -j 5 'head -c {}M < /dev/urandom > ${{ parameters.mount_dir }}/mixedfiles_{}.png'
+      cd ${{ parameters.mount_dir }}
+      python3 $(WORK_DIR)/testdata/scripts/generate-parquet-files.py
+      ls -l ${{ parameters.mount_dir }}/mixedfiles_*
+      ls -l ${{ parameters.mount_dir }}/datafiles_*
+    displayName: 'Generate data with File-Cache'
+
+  - script: |
+      md5sum ${{ parameters.mount_dir }}/datafiles_* > $(WORK_DIR)/md5sum_original_files.txt
+      md5sum ${{ parameters.mount_dir }}/mixedfiles_* >> $(WORK_DIR)/md5sum_original_files.txt
+    displayName: 'Generate md5Sum with File-Cache'
+
+  - script: |
+      echo "----------------------------------------------"
+      ls -l ${{ parameters.mount_dir }}
+    displayName: 'Print contents of File-Cache'
+
+  - script: |
+      $(WORK_DIR)/blobfuse2 unmount all
+    displayName: 'Unmount RW mount'
+
+  # Phase 2: read the same blobs back through a read-only block-cache mount.
+  - script: |
+      cd $(WORK_DIR)
+      $(WORK_DIR)/blobfuse2 gen-test-config --config-file=$(WORK_DIR)/testdata/config/azure_key_bc.yaml --container-name=${{ parameters.container }} --temp-path=${{ parameters.temp_dir }} --output-file=${{ parameters.config_file }}
+    displayName: 'Create Config File for RO mount'
+    env:
+      NIGHTLY_STO_ACC_NAME: ${{ parameters.account_name }}
+      NIGHTLY_STO_ACC_KEY: ${{ parameters.account_key }}
+      ACCOUNT_TYPE: ${{ parameters.account_type }}
+      ACCOUNT_ENDPOINT: ${{ parameters.account_endpoint }}
+      VERBOSE_LOG: ${{ parameters.verbose_log }}
+    continueOnError: false
+
+  - template: 'mount.yml'
+    parameters:
+      working_dir: $(WORK_DIR)
+      mount_dir: ${{ parameters.mount_dir }}
+      temp_dir: ${{ parameters.temp_dir }}
+      prefix: ${{ parameters.idstring }}
+      ro_mount: true
+      mountStep:
+        script: |
+          $(WORK_DIR)/blobfuse2 mount ${{ parameters.mount_dir }} --config-file=${{ parameters.config_file }} --default-working-dir=$(WORK_DIR) -o ro --block-cache-block-size ${{ parameters.block_size_mb }}
+
+  - script: |
+      echo "----------------------------------------------"
+      ls -l ${{ parameters.mount_dir }}/datafiles*
+      ls -l ${{ parameters.mount_dir }}/mixedfiles*
+    displayName: 'Print contents of Block-Cache'
+
+  - script: |
+      md5sum ${{ parameters.mount_dir }}/datafiles_* > $(WORK_DIR)/md5sum_block_cache.txt
+      md5sum ${{ parameters.mount_dir }}/mixedfiles_* >> $(WORK_DIR)/md5sum_block_cache.txt
+    displayName: 'Generate md5Sum with Block-Cache'
+
+  - script: |
+      $(WORK_DIR)/blobfuse2 unmount all
+    displayName: 'Unmount RO mount'
+
+  # Both md5sum files are reduced in place to their hash column. The later compare
+  # steps rely on md5sum_original_files.txt already being stripped by this step.
+  - script: |
+      echo "----------------------------------------------"
+      cat $(WORK_DIR)/md5sum_original_files.txt
+      cut -d " " -f1 $(WORK_DIR)/md5sum_original_files.txt > $(WORK_DIR)/temp.txt && mv $(WORK_DIR)/temp.txt $(WORK_DIR)/md5sum_original_files.txt
+      echo "----------------------------------------------"
+      cat $(WORK_DIR)/md5sum_block_cache.txt
+      cut -d " " -f1 $(WORK_DIR)/md5sum_block_cache.txt > $(WORK_DIR)/temp.txt && mv $(WORK_DIR)/temp.txt $(WORK_DIR)/md5sum_block_cache.txt
+      echo "----------------------------------------------"
+      diff $(WORK_DIR)/md5sum_original_files.txt $(WORK_DIR)/md5sum_block_cache.txt
+      if [ $? -ne 0 ]; then
+        exit 1
+      fi
+    displayName: 'Compare md5Sum'
+
+  # Phase 3: repeat the read-back with direct-io and an on-disk block-cache path.
+  - script: |
+      cd $(WORK_DIR)
+      $(WORK_DIR)/blobfuse2 gen-test-config --config-file=$(WORK_DIR)/testdata/config/azure_key_bc.yaml --container-name=${{ parameters.container }} --temp-path=${{ parameters.temp_dir }} --output-file=${{ parameters.config_file }}
+    displayName: 'Create Config File for RO mount with direct-io and disk-cache enabled'
+    env:
+      NIGHTLY_STO_ACC_NAME: ${{ parameters.account_name }}
+      NIGHTLY_STO_ACC_KEY: ${{ parameters.account_key }}
+      ACCOUNT_TYPE: ${{ parameters.account_type }}
+      ACCOUNT_ENDPOINT: ${{ parameters.account_endpoint }}
+      VERBOSE_LOG: ${{ parameters.verbose_log }}
+    continueOnError: false
+
+  - template: 'mount.yml'
+    parameters:
+      working_dir: $(WORK_DIR)
+      mount_dir: ${{ parameters.mount_dir }}
+      temp_dir: ${{ parameters.temp_dir }}
+      prefix: ${{ parameters.idstring }}
+      ro_mount: true
+      mountStep:
+        script: |
+          $(WORK_DIR)/blobfuse2 mount ${{ parameters.mount_dir }} --config-file=${{ parameters.config_file }} --default-working-dir=$(WORK_DIR) -o ro -o direct_io --block-cache-path block_cache --block-cache-block-size ${{ parameters.block_size_mb }}
+
+  - script: |
+      echo "----------------------------------------------"
+      ls -l ${{ parameters.mount_dir }}
+    displayName: 'Print contents of Block-Cache'
+
+  # The files are hashed twice on this mount; the second pass is presumably served
+  # from the on-disk cache populated by the first - TODO confirm.
+  - script: |
+      md5sum ${{ parameters.mount_dir }}/datafiles_* > $(WORK_DIR)/md5sum_block_cache_direct_io.txt
+      md5sum ${{ parameters.mount_dir }}/mixedfiles_* >> $(WORK_DIR)/md5sum_block_cache_direct_io.txt
+    displayName: 'Generate md5Sum with Block-Cache Direct-IO'
+
+  - script: |
+      md5sum ${{ parameters.mount_dir }}/datafiles_* > $(WORK_DIR)/md5sum_block_cache_disk_cache.txt
+      md5sum ${{ parameters.mount_dir }}/mixedfiles_* >> $(WORK_DIR)/md5sum_block_cache_disk_cache.txt
+    displayName: 'Generate md5Sum with Block-Cache Disk-Cache'
+
+  - script: |
+      $(WORK_DIR)/blobfuse2 unmount all
+    displayName: 'Unmount RO mount'
+
+  - script: |
+      echo "----------------------------------------------"
+      cat $(WORK_DIR)/md5sum_original_files.txt
+      echo "----------------------------------------------"
+      cut -d " " -f1 $(WORK_DIR)/md5sum_block_cache_direct_io.txt > $(WORK_DIR)/temp.txt && mv $(WORK_DIR)/temp.txt $(WORK_DIR)/md5sum_block_cache_direct_io.txt
+      cat $(WORK_DIR)/md5sum_block_cache_direct_io.txt
+      echo "----------------------------------------------"
+      diff $(WORK_DIR)/md5sum_original_files.txt $(WORK_DIR)/md5sum_block_cache_direct_io.txt
+      if [ $? -ne 0 ]; then
+        exit 1
+      fi
+    displayName: 'Compare md5Sum with Block-Cache Direct-IO'
+
+  - script: |
+      echo "----------------------------------------------"
+      cat $(WORK_DIR)/md5sum_original_files.txt
+      echo "----------------------------------------------"
+      cut -d " " -f1 $(WORK_DIR)/md5sum_block_cache_disk_cache.txt > $(WORK_DIR)/temp.txt && mv $(WORK_DIR)/temp.txt $(WORK_DIR)/md5sum_block_cache_disk_cache.txt
+      cat $(WORK_DIR)/md5sum_block_cache_disk_cache.txt
+      echo "----------------------------------------------"
+      diff $(WORK_DIR)/md5sum_original_files.txt $(WORK_DIR)/md5sum_block_cache_disk_cache.txt
+      if [ $? -ne 0 ]; then
+        exit 1
+      fi
+    displayName: 'Compare md5Sum with Block-Cache Disk-Cache'
+
+  - template: 'cleanup.yml'
+    parameters:
+      working_dir: $(WORK_DIR)
+      mount_dir: ${{ parameters.mount_dir }}
+      temp_dir: ${{ parameters.temp_dir }}
diff --git a/blobfuse2-nightly.yaml b/blobfuse2-nightly.yaml
index 7f37e1837..5345a0915 100755
--- a/blobfuse2-nightly.yaml
+++ b/blobfuse2-nightly.yaml
@@ -1407,6 +1407,110 @@ stages:
             temp_dir: $(TEMP_DIR)
             mount_dir: $(MOUNT_DIR)
 
+  - stage: BlockCacheDataIntegrityValidation
+    jobs:
+    # Ubuntu Tests
+    - job: Set_1
+      timeoutInMinutes: 300
+      strategy:
+        matrix:
+          Ubuntu-20:
+            AgentName: 'blobfuse-ubuntu20'
+            containerName: 'test-cnt-ubn-20'
+            adlsSas: $(AZTEST_ADLS_CONT_SAS_UBN_20)
+            fuselib: 'libfuse-dev'
+            tags: 'fuse2'
+          Ubuntu-22:
+            AgentName: 'blobfuse-ubuntu22'
+            containerName: 'test-cnt-ubn-22'
+            adlsSas: $(AZTEST_ADLS_CONT_SAS_UBN_22)
+            fuselib: 'libfuse3-dev'
+            tags: 'fuse3'
+
+      pool:
+        name: "blobfuse-ubuntu-pool"
+        demands:
+          - ImageOverride -equals $(AgentName)
+
+      variables:
+      - group: NightlyBlobFuse
+      - name: ROOT_DIR
+        value: "/usr/pipeline/workv2"
+      - name: WORK_DIR
+        value: "/usr/pipeline/workv2/go/src/azure-storage-fuse"
+      - name: skipComponentGovernanceDetection
+        value: true
+      - name: MOUNT_DIR
+        value: "/usr/pipeline/workv2/blob_mnt"
+      - name: TEMP_DIR
+        value: "/usr/pipeline/workv2/temp"
+      - name: BLOBFUSE2_CFG
+        value: "/usr/pipeline/workv2/blobfuse2.yaml"
+      - name: GOPATH
+        value: "/usr/pipeline/workv2/go"
+
+      steps:
+        - template: 'azure-pipeline-templates/setup.yml'
+          parameters:
+            tags: $(tags)
+            installStep:
+              script: |
+                sudo apt-get update --fix-missing
+                sudo apt update
+                sudo apt-get install cmake gcc $(fuselib) git parallel -y
+                if [ "$(tags)" = "fuse2" ]; then
+                  sudo apt-get install fuse -y
+                else
+                  sudo apt-get install fuse3 -y
+                fi
+              displayName: 'Install fuse'
+
+        - script: |
+            sudo apt-get install python3-setuptools -y
+            sudo apt install python3-pip -y
+            sudo pip3 install pandas numpy pyarrow fastparquet
+          displayName: 'Install Python Packages'
+
+        - template: 'azure-pipeline-templates/e2e-tests-block-cache-data-integrity.yml'
+          parameters:
+            conf_template: azure_key.yaml
+            config_file: $(BLOBFUSE2_CFG)
+            container: $(containerName)
+            idstring: Block_Blob
+            adls: false
+            account_name: $(NIGHTLY_STO_BLOB_ACC_NAME)
+            account_key: $(NIGHTLY_STO_BLOB_ACC_KEY)
+            account_type: block
+            account_endpoint: https://$(NIGHTLY_STO_BLOB_ACC_NAME).blob.core.windows.net
+            distro_name: $(AgentName)
+            quick_test: false
+            verbose_log: ${{ parameters.verbose_log }}
+            clone: true
+            # TODO: These can be removed one day and replace all instances of ${{ parameters.temp_dir }} with $(TEMP_DIR) since it is a global variable
+            temp_dir: $(TEMP_DIR)
+            mount_dir: $(MOUNT_DIR)
+            block_size_mb: "1"
+
+        - template: 'azure-pipeline-templates/e2e-tests-block-cache-data-integrity.yml'
+          parameters:
+            conf_template: azure_key.yaml
+            config_file: $(BLOBFUSE2_CFG)
+            container: $(containerName)
+            idstring: Block_Blob
+            adls: false
+            account_name: $(NIGHTLY_STO_BLOB_ACC_NAME)
+            account_key: $(NIGHTLY_STO_BLOB_ACC_KEY)
+            account_type: block
+            account_endpoint: https://$(NIGHTLY_STO_BLOB_ACC_NAME).blob.core.windows.net
+            distro_name: $(AgentName)
+            quick_test: false
+            verbose_log: ${{ parameters.verbose_log }}
+            clone: true
+            # TODO: These can be removed one day and replace all instances of ${{ parameters.temp_dir }} with $(TEMP_DIR) since it is a global variable
+            temp_dir: $(TEMP_DIR)
+            mount_dir: $(MOUNT_DIR)
+            block_size_mb: "8"
+
   - stage: FNSDataValidation
     jobs:
     # Ubuntu Tests
diff --git a/testdata/config/azure_key_bc.yaml b/testdata/config/azure_key_bc.yaml
index 34a92d066..f83a11b2d 100644
--- a/testdata/config/azure_key_bc.yaml
+++ b/testdata/config/azure_key_bc.yaml
@@ -17,14 +17,6 @@ libfuse:
 
 block_cache:
   block-size-mb: 8
-  mem-size-mb: 4192
-
-  path: block_cache
-  disk-size-mb: 4192
-  disk-timeout-sec: 30
-
-  prefetch: 120
-  parallelism: 128
 
 attr_cache:
   timeout-sec: 3600
diff --git a/testdata/scripts/generate-parquet-files.py b/testdata/scripts/generate-parquet-files.py
new file mode 100644
index 000000000..39d74312f
--- /dev/null
+++ b/testdata/scripts/generate-parquet-files.py
@@ -0,0 +1,17 @@
+"""Write 10 Parquet files (mixedfiles_<i>.parquet) of random row count into the current directory."""
+
+import pandas as pd
+import numpy as np
+import random
+
+# Function to generate a random number of rows for the DataFrame
+def random_row_count(min_rows=10, max_rows=1000):
+    return random.randint(min_rows, max_rows)
+
+# Generate 10 Parquet files with varying sizes
+for i in range(10):
+    row_count = random_row_count()
+    df = pd.DataFrame(np.random.randn(row_count, 4), columns=list('ABCD'))
+    file_name = f'mixedfiles_{i}.parquet'
+    df.to_parquet(file_name, index=False)
+    print(f'Created {file_name} with {row_count} rows')