Merge pull request #1 from 2lambda123/ft-future
Ft future
2lambda123 authored Sep 6, 2024
2 parents 3473140 + 238333c commit c4caa02
Showing 39 changed files with 486 additions and 551 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -43,6 +43,8 @@ notes.txt

 # python
 **/__pycache__/*
+.venv
+venv

 #testing and plotting
 scripts/test_eval/*.png
3 changes: 2 additions & 1 deletion .gitmodules
@@ -1,7 +1,8 @@
 [submodule "pash"]
     path = pash
     url = https://github.com/binpash/pash.git
-    branch = dspash-future
+    branch = ft-future
 [submodule "docker-hadoop"]
     path = docker-hadoop
     url = https://github.com/binpash/docker-hadoop
+    branch = ft-future
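
After this change, an existing checkout needs its submodules re-pointed at the new ft-future branches. A minimal sketch using standard git commands (not part of the commit itself):

    git submodule sync --recursive
    git submodule update --init --recursive --remote pash docker-hadoop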
25 changes: 25 additions & 0 deletions evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh
@@ -0,0 +1,25 @@
#!/bin/bash

# Specify the folder where the .out files are located
folder="$DISH_TOP/evaluation/distr_benchmarks/oneliners/outputs"

# Loop through the files in the folder
for script_faults_out in "$folder"/*faults.out; do
    # Extract the script name without the extension
    script_name=$(basename "$script_faults_out" .faults.out)

    # Check if there is a corresponding .distr.out file
    script_distr_out="$folder/$script_name.distr.out"

    if [ -f "$script_distr_out" ]; then
        # Perform a diff between the two files
        echo "Comparing faults_out and distr_out for script $script_name.sh"
        if diff -q "$script_faults_out" "$script_distr_out"; then
            echo "Outputs are identical"
        else
            echo "Files are different. Differences are as follows:"
            diff -y "$script_faults_out" "$script_distr_out"
        fi
        echo "-------------------------------------------"
    fi
done
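
For context, a typical workflow with this checker, assuming run.distr.faults.sh (added below) has already produced both a fault-free .distr.out and a fault-injected .faults.out for each benchmark:

    cd "$DISH_TOP/evaluation/distr_benchmarks/oneliners"
    ./run.distr.faults.sh       # writes outputs/<script>.distr.out and outputs/<script>.faults.out
    ./check_ft_correctness.sh   # diffs each pair; identical outputs mean faults did not change results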
10 changes: 5 additions & 5 deletions evaluation/distr_benchmarks/oneliners/input/setup.sh
@@ -18,7 +18,7 @@ fi
 hdfs dfs -mkdir -p /oneliners

 if [ ! -f ./1M.txt ]; then
-    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
     if [ $? -ne 0 ]; then
         curl -f 'https://zenodo.org/record/7650885/files/1M.txt' > 1M.txt
         [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
@@ -41,7 +41,7 @@ if [ ! -f ./100M.txt ]; then
 fi

 if [ ! -f ./1G.txt ]; then
-    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
     if [ $? -ne 0 ]; then
         touch 1G.txt
         for (( i = 0; i < 10; i++ )); do
@@ -68,14 +68,14 @@ fi

 # download wamerican-insane dictionary and sort according to machine
 if [ ! -f ./dict.txt ]; then
-    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
     if [ $? -ne 0 ]; then
         sort words > sorted_words
     fi
 fi

 if [ ! -f ./all_cmds.txt ]; then
-    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
     if [ $? -ne 0 ]; then
         # This should be OK for tests, no need for abort
         ls /usr/bin/* > all_cmds.txt
@@ -102,4 +102,4 @@ input_files+=("3G.txt")
 for file in "${input_files[@]}"; do
     hdfs dfs -put $file /oneliners/$file
     rm -f $file
-done
+done
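
Note that the replacement atlas-group.cs.brown.edu URLs are scheme-less; curl defaults to http:// in that case, so the download commands behave as before. A quick reachability check before running the full setup might look like this (illustrative, not part of the commit):

    curl -sfI --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' && echo "mirror reachable"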
137 changes: 137 additions & 0 deletions evaluation/distr_benchmarks/oneliners/run.distr.faults.sh
@@ -0,0 +1,137 @@
PASH_FLAGS='--width 8 --r_split'
export TIMEFORMAT=%R
export dict="$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt"
curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > $dict


scripts_inputs=(
    "nfa-regex;1G.txt"
    "sort;3G.txt"
    "top-n;3G.txt"
    "wf;3G.txt"
    "spell;3G.txt"
    "diff;3G.txt"
    "bi-grams;3G.txt"
    "set-diff;3G.txt"
    "sort-sort;3G.txt"
    "shortest-scripts;all_cmdsx100.txt"
)

oneliners_bash() {
    outputs_dir="outputs"
    seq_times_file="seq.res"
    seq_outputs_suffix="seq.out"

    mkdir -p "$outputs_dir"

    touch "$seq_times_file"
    cat $seq_times_file >> $seq_times_file.d
    echo executing one-liners $(date) | tee "$seq_times_file"
    echo '' >> "$seq_times_file"

    for script_input in ${scripts_inputs[@]}
    do
        IFS=";" read -r -a script_input_parsed <<< "${script_input}"
        script="${script_input_parsed[0]}"
        input="${script_input_parsed[1]}"

        export IN="/oneliners/$input"
        export dict=

        printf -v pad %30s
        padded_script="${script}.sh:${pad}"
        padded_script=${padded_script:0:30}

        seq_outputs_file="${outputs_dir}/${script}.${seq_outputs_suffix}"

        echo "${padded_script}" $({ time ./${script}.sh > "$seq_outputs_file"; } 2>&1) | tee -a "$seq_times_file"
    done
}

oneliners_pash(){
    flags=${1:-$PASH_FLAGS}
    prefix=${2:-par}
    prefix=$prefix

    times_file="$prefix.res"
    outputs_suffix="$prefix.out"
    time_suffix="$prefix.time"
    outputs_dir="outputs"
    pash_logs_dir="pash_logs_$prefix"

    mkdir -p "$outputs_dir"
    mkdir -p "$pash_logs_dir"

    touch "$times_file"
    cat $times_file >> $times_file.d
    echo executing one-liners with $prefix pash with data $(date) | tee "$times_file"
    echo '' >> "$times_file"

    for script_input in ${scripts_inputs[@]}
    do
        IFS=";" read -r -a script_input_parsed <<< "${script_input}"
        script="${script_input_parsed[0]}"
        input="${script_input_parsed[1]}"

        export IN="/oneliners/$input"
        export dict=

        printf -v pad %30s
        padded_script="${script}.sh:${pad}"
        padded_script=${padded_script:0:30}

        outputs_file="${outputs_dir}/${script}.${outputs_suffix}"
        pash_log="${pash_logs_dir}/${script}.pash.log"
        single_time_file="${outputs_dir}/${script}.${time_suffix}"

        echo -n "${padded_script}" | tee -a "$times_file"
        { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}"
        cat "${single_time_file}" | tee -a "$times_file"
    done
}

oneliners_hadoopstreaming(){
    jarpath="/opt/hadoop-3.2.2/share/hadoop/tools/lib/hadoop-streaming-3.2.2.jar" # Adjust as required
    basepath="" # Adjust as required
    times_file="hadoopstreaming.res"
    outputs_suffix="hadoopstreaming.out"
    outputs_dir="/outputs/hadoop-streaming/oneliners"
    . bi-gram.aux.sh

    cd "hadoop-streaming/"

    hdfs dfs -rm -r "$outputs_dir"
    hdfs dfs -mkdir -p "$outputs_dir"

    touch "$times_file"
    cat "$times_file" >> "$times_file".d
    echo executing oneliners $(date) | tee "$times_file"
    echo '' >> "$times_file"

    while IFS= read -r line; do
        printf -v pad %20s
        name=$(cut -d "#" -f2- <<< "$line")
        name=$(sed "s/ //g" <<< $name)
        padded_script="${name}.sh:${pad}"
        padded_script=${padded_script:0:20}
        echo "${padded_script}" $({ time { eval $line &> /dev/null; } ; } 2>&1) | tee -a "$times_file"
    done <"run_all.sh"
    cd ".."
    mv "hadoop-streaming/$times_file" .
}

outputs_dir="outputs"
rm -rf "$outputs"

# oneliners_bash

# oneliners_pash "$PASH_FLAGS" "par"

oneliners_pash "$PASH_FLAGS --distributed_exec" "distr"

# Keep the worker timeout long enough that the "crashed" worker does not come back alive
# while its replacement does its work; that scenario is not fully supported yet.
oneliners_pash "$PASH_FLAGS --distributed_exec --worker_timeout 100" "faults"


# oneliners_hadoopstreaming
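
Each run appends one "name time" line per benchmark to its .res file (seq.res, distr.res, faults.res), and TIMEFORMAT=%R records wall-clock seconds only. A small sketch for comparing the fault-injected run against the fault-free one, assuming that two-column layout:

    # slowdown of each faults run relative to the distr run
    awk 'FNR==NR && /\.sh:/ {t[$1]=$2; next}
         /\.sh:/ && ($1 in t) {printf "%-30s %.2fx\n", $1, $2/t[$1]}' distr.res faults.res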
3 changes: 2 additions & 1 deletion evaluation/distr_benchmarks/oneliners/shortest-scripts.sh
@@ -8,4 +8,5 @@

 IN=${IN:-/oneliners/all_cmdsx100.txt}

-hdfs dfs -cat -ignoreCrc $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15
+# hdfs dfs -cat -ignoreCrc $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15
+hdfs dfs -cat -ignoreCrc $IN
2 changes: 1 addition & 1 deletion pash
Submodule pash updated 534 files
11 changes: 1 addition & 10 deletions runtime/.gitignore
@@ -1,10 +1 @@
-eager
-split
-r_split
-r_merge
-r_unwrap
-r_wrap
-set-diff
-dspash/socket_pipe
-tests/perf*
-tests/*out
+bin
File renamed without changes.
@@ -10,15 +10,15 @@ import (
     "io"
     "log"
     "os"
+    "strings"

     "google.golang.org/grpc"
     "google.golang.org/grpc/credentials/insecure"

-    pb "dspash/filereader"
+    pb "runtime/dfs/proto"
 )

 var (
-    config = flag.String("config", "", "File to read")
     splitNum = flag.Int("split", 0, "The logical split number")
     serverPort = flag.Int("port", 50051, "The server port, all machines should use same port")
 )
@@ -44,10 +44,11 @@ func readFirstLine(block DFSBlock, writer *bufio.Writer) (ok bool, e error) {
     ok = false
     e = errors.New("Failed to read newline from all replicas")
     for _, host := range block.Hosts {
-        addr := fmt.Sprintf("%s:%d", host, *serverPort)
+        addr := fmt.Sprintf("%s:%d", strings.Split(host, ":")[0], *serverPort)
         conn, err := grpc.Dial(addr, opts...)

         if err != nil {
+            log.Println("Failed to connect to ", addr, err)
             continue // try next addr
         }
         defer conn.Close()
@@ -56,6 +57,7 @@ func readFirstLine(block DFSBlock, writer *bufio.Writer) (ok bool, e error) {

         stream, err := client.ReadFile(ctx, &pb.FileRequest{Path: block.Path})
         if err != nil {
+            log.Println("Failed to read file from ", addr, err)
             continue
         }
@@ -145,9 +147,9 @@ func readDFSLogicalSplit(conf DFSConfig, split int) error {

 }

-func serialize_conf(p string) DFSConfig {
+func serialize_conf() DFSConfig {
     conf := DFSConfig{}
-    byt, err := os.ReadFile(p)
+    byt, err := io.ReadAll(os.Stdin)
     if err != nil {
         log.Fatalln(err)
     }
@@ -159,14 +161,9 @@ func serialize_conf(p string) DFSConfig {

 func main() {
     flag.Parse()
-    if flag.NArg() < 1 && *config == "" {
-        flag.Usage()
-        os.Exit(0)
-    } else if *config == "" {
-        *config = flag.Arg(0)
-    }
     log.Println("Starting dfs_split_reader client", *splitNum, *serverPort)

-    conf := serialize_conf(*config)
+    conf := serialize_conf()
     err := readDFSLogicalSplit(conf, *splitNum)
     if err != nil {
         log.Fatalln(err)
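
Since serialize_conf now reads the DFSConfig from stdin rather than from a file argument, callers pipe the serialized config in instead of passing a path. A hypothetical invocation (the binary name and config file name are illustrative; only the -split and -port flags appear in this diff):

    # before: dfs_split_reader -split 3 -port 50051 dfs_config.json
    dfs_split_reader -split 3 -port 50051 < dfs_config.json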
