Merge pull request #1 from 2lambda123/ft-future
Ft future
2lambda123 authored Sep 6, 2024
2 parents 3473140 + 238333c commit c4caa02
Showing 39 changed files with 486 additions and 551 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -43,6 +43,8 @@ notes.txt

 # python
 **/__pycache__/*
+.venv
+venv

 #testing and plotting
 scripts/test_eval/*.png
3 changes: 2 additions & 1 deletion .gitmodules
@@ -1,7 +1,8 @@
 [submodule "pash"]
     path = pash
     url = https://github.com/binpash/pash.git
-    branch = dspash-future
+    branch = ft-future
 [submodule "docker-hadoop"]
     path = docker-hadoop
     url = https://github.com/binpash/docker-hadoop
+    branch = ft-future
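
After this change, an existing checkout needs its submodules re-pointed at the new ft-future branches. A minimal sketch using standard git commands (not part of the commit itself):

    git submodule sync --recursive
    git submodule update --init --recursive --remote pash docker-hadoop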
25 changes: 25 additions & 0 deletions evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh
@@ -0,0 +1,25 @@
#!/bin/bash

# Specify the folder where the .out files are located
folder="$DISH_TOP/evaluation/distr_benchmarks/oneliners/outputs"

# Loop through the files in the folder
for script_faults_out in "$folder"/*faults.out; do
    # Extract the script name without the extension
    script_name=$(basename "$script_faults_out" .faults.out)

    # Check if there is a corresponding .distr.out file
    script_distr_out="$folder/$script_name.distr.out"

    if [ -f "$script_distr_out" ]; then
        # Perform a diff between the two files
        echo "Comparing faults_out and distr_out for script $script_name.sh"
        if diff -q "$script_faults_out" "$script_distr_out"; then
            echo "Outputs are identical"
        else
            echo "Files are different. Differences are as follows:"
            diff -y "$script_faults_out" "$script_distr_out"
        fi
        echo "-------------------------------------------"
    fi
done
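
For context, a typical workflow with this checker, assuming run.distr.faults.sh (added below) has already produced both a fault-free .distr.out and a fault-injected .faults.out for each benchmark:

    cd "$DISH_TOP/evaluation/distr_benchmarks/oneliners"
    ./run.distr.faults.sh       # writes outputs/<script>.distr.out and outputs/<script>.faults.out
    ./check_ft_correctness.sh   # diffs each pair; identical outputs mean faults did not change results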
10 changes: 5 additions & 5 deletions evaluation/distr_benchmarks/oneliners/input/setup.sh
@@ -18,7 +18,7 @@ fi
 hdfs dfs -mkdir -p /oneliners

 if [ ! -f ./1M.txt ]; then
-    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
     if [ $? -ne 0 ]; then
         curl -f 'https://zenodo.org/record/7650885/files/1M.txt' > 1M.txt
         [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
@@ -41,7 +41,7 @@ if [ ! -f ./100M.txt ]; then
 fi

 if [ ! -f ./1G.txt ]; then
-    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
     if [ $? -ne 0 ]; then
         touch 1G.txt
         for (( i = 0; i < 10; i++ )); do
@@ -68,14 +68,14 @@ fi

 # download wamerican-insane dictionary and sort according to machine
 if [ ! -f ./dict.txt ]; then
-    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
     if [ $? -ne 0 ]; then
         sort words > sorted_words
     fi
 fi

 if [ ! -f ./all_cmds.txt ]; then
-    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
     if [ $? -ne 0 ]; then
         # This should be OK for tests, no need for abort
         ls /usr/bin/* > all_cmds.txt
@@ -102,4 +102,4 @@ input_files+=("3G.txt")
 for file in "${input_files[@]}"; do
     hdfs dfs -put $file /oneliners/$file
     rm -f $file
-done
+done
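
Note that the replacement atlas-group.cs.brown.edu URLs are scheme-less; curl defaults to http:// in that case, so the download commands behave as before. A quick reachability check before running the full setup might look like this (illustrative, not part of the commit):

    curl -sfI --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' && echo "mirror reachable"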
137 changes: 137 additions & 0 deletions evaluation/distr_benchmarks/oneliners/run.distr.faults.sh
@@ -0,0 +1,137 @@
PASH_FLAGS='--width 8 --r_split'
export TIMEFORMAT=%R
export dict="$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt"
curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > $dict


scripts_inputs=(
    "nfa-regex;1G.txt"
    "sort;3G.txt"
    "top-n;3G.txt"
    "wf;3G.txt"
    "spell;3G.txt"
    "diff;3G.txt"
    "bi-grams;3G.txt"
    "set-diff;3G.txt"
    "sort-sort;3G.txt"
    "shortest-scripts;all_cmdsx100.txt"
)

oneliners_bash() {
    outputs_dir="outputs"
    seq_times_file="seq.res"
    seq_outputs_suffix="seq.out"

    mkdir -p "$outputs_dir"

    touch "$seq_times_file"
    cat $seq_times_file >> $seq_times_file.d
    echo executing one-liners $(date) | tee "$seq_times_file"
    echo '' >> "$seq_times_file"

    for script_input in ${scripts_inputs[@]}
    do
        IFS=";" read -r -a script_input_parsed <<< "${script_input}"
        script="${script_input_parsed[0]}"
        input="${script_input_parsed[1]}"

        export IN="/oneliners/$input"
        export dict=

        printf -v pad %30s
        padded_script="${script}.sh:${pad}"
        padded_script=${padded_script:0:30}

        seq_outputs_file="${outputs_dir}/${script}.${seq_outputs_suffix}"

        echo "${padded_script}" $({ time ./${script}.sh > "$seq_outputs_file"; } 2>&1) | tee -a "$seq_times_file"
    done
}

oneliners_pash(){
    flags=${1:-$PASH_FLAGS}
    prefix=${2:-par}
    prefix=$prefix

    times_file="$prefix.res"
    outputs_suffix="$prefix.out"
    time_suffix="$prefix.time"
    outputs_dir="outputs"
    pash_logs_dir="pash_logs_$prefix"

    mkdir -p "$outputs_dir"
    mkdir -p "$pash_logs_dir"

    touch "$times_file"
    cat $times_file >> $times_file.d
    echo executing one-liners with $prefix pash with data $(date) | tee "$times_file"
    echo '' >> "$times_file"

    for script_input in ${scripts_inputs[@]}
    do
        IFS=";" read -r -a script_input_parsed <<< "${script_input}"
        script="${script_input_parsed[0]}"
        input="${script_input_parsed[1]}"

        export IN="/oneliners/$input"
        export dict=

        printf -v pad %30s
        padded_script="${script}.sh:${pad}"
        padded_script=${padded_script:0:30}

        outputs_file="${outputs_dir}/${script}.${outputs_suffix}"
        pash_log="${pash_logs_dir}/${script}.pash.log"
        single_time_file="${outputs_dir}/${script}.${time_suffix}"

        echo -n "${padded_script}" | tee -a "$times_file"
        { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}"
        cat "${single_time_file}" | tee -a "$times_file"
    done
}

oneliners_hadoopstreaming(){
    jarpath="/opt/hadoop-3.2.2/share/hadoop/tools/lib/hadoop-streaming-3.2.2.jar" # Adjust as required
    basepath="" # Adjust as required
    times_file="hadoopstreaming.res"
    outputs_suffix="hadoopstreaming.out"
    outputs_dir="/outputs/hadoop-streaming/oneliners"
    . bi-gram.aux.sh

    cd "hadoop-streaming/"

    hdfs dfs -rm -r "$outputs_dir"
    hdfs dfs -mkdir -p "$outputs_dir"

    touch "$times_file"
    cat "$times_file" >> "$times_file".d
    echo executing oneliners $(date) | tee "$times_file"
    echo '' >> "$times_file"

    while IFS= read -r line; do
        printf -v pad %20s
        name=$(cut -d "#" -f2- <<< "$line")
        name=$(sed "s/ //g" <<< $name)
        padded_script="${name}.sh:${pad}"
        padded_script=${padded_script:0:20}
        echo "${padded_script}" $({ time { eval $line &> /dev/null; } ; } 2>&1) | tee -a "$times_file"
    done <"run_all.sh"
    cd ".."
    mv "hadoop-streaming/$times_file" .
}

outputs_dir="outputs"
rm -rf "$outputs"

# oneliners_bash

# oneliners_pash "$PASH_FLAGS" "par"

oneliners_pash "$PASH_FLAGS --distributed_exec" "distr"

# Keep the worker timeout long enough that the "crashed" worker does not come back alive
# while its replacement does its work; that scenario is not fully supported yet.
oneliners_pash "$PASH_FLAGS --distributed_exec --worker_timeout 100" "faults"


# oneliners_hadoopstreaming
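
Each run appends one "name time" line per benchmark to its .res file (seq.res, distr.res, faults.res), and TIMEFORMAT=%R records wall-clock seconds only. A small sketch for comparing the fault-injected run against the fault-free one, assuming that two-column layout:

    # slowdown of each faults run relative to the distr run
    awk 'FNR==NR && /\.sh:/ {t[$1]=$2; next}
         /\.sh:/ && ($1 in t) {printf "%-30s %.2fx\n", $1, $2/t[$1]}' distr.res faults.res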
3 changes: 2 additions & 1 deletion evaluation/distr_benchmarks/oneliners/shortest-scripts.sh
@@ -8,4 +8,5 @@

 IN=${IN:-/oneliners/all_cmdsx100.txt}

-hdfs dfs -cat -ignoreCrc $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15
+# hdfs dfs -cat -ignoreCrc $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15
+hdfs dfs -cat -ignoreCrc $IN
2 changes: 1 addition & 1 deletion pash
Submodule pash updated 534 files
11 changes: 1 addition & 10 deletions runtime/.gitignore
@@ -1,10 +1 @@
-eager
-split
-r_split
-r_merge
-r_unwrap
-r_wrap
-set-diff
-dspash/socket_pipe
-tests/perf*
-tests/*out
+bin
File renamed without changes.
@@ -10,15 +10,15 @@ import (
     "io"
     "log"
     "os"
+    "strings"

     "google.golang.org/grpc"
     "google.golang.org/grpc/credentials/insecure"

-    pb "dspash/filereader"
+    pb "runtime/dfs/proto"
 )

 var (
-    config = flag.String("config", "", "File to read")
     splitNum = flag.Int("split", 0, "The logical split number")
     serverPort = flag.Int("port", 50051, "The server port, all machines should use same port")
 )
@@ -44,10 +44,11 @@ func readFirstLine(block DFSBlock, writer *bufio.Writer) (ok bool, e error) {
     ok = false
     e = errors.New("Failed to read newline from all replicas")
     for _, host := range block.Hosts {
-        addr := fmt.Sprintf("%s:%d", host, *serverPort)
+        addr := fmt.Sprintf("%s:%d", strings.Split(host, ":")[0], *serverPort)
         conn, err := grpc.Dial(addr, opts...)

         if err != nil {
+            log.Println("Failed to connect to ", addr, err)
             continue // try next addr
         }
         defer conn.Close()
@@ -56,6 +57,7 @@ func readFirstLine(block DFSBlock, writer *bufio.Writer) (ok bool, e error) {

         stream, err := client.ReadFile(ctx, &pb.FileRequest{Path: block.Path})
         if err != nil {
+            log.Println("Failed to read file from ", addr, err)
             continue
         }
@@ -145,9 +147,9 @@ func readDFSLogicalSplit(conf DFSConfig, split int) error {

 }

-func serialize_conf(p string) DFSConfig {
+func serialize_conf() DFSConfig {
     conf := DFSConfig{}
-    byt, err := os.ReadFile(p)
+    byt, err := io.ReadAll(os.Stdin)
     if err != nil {
         log.Fatalln(err)
     }
@@ -159,14 +161,9 @@ func serialize_conf(p string) DFSConfig {

 func main() {
     flag.Parse()
-    if flag.NArg() < 1 && *config == "" {
-        flag.Usage()
-        os.Exit(0)
-    } else if *config == "" {
-        *config = flag.Arg(0)
-    }
     log.Println("Starting dfs_split_reader client", *splitNum, *serverPort)

-    conf := serialize_conf(*config)
+    conf := serialize_conf()
     err := readDFSLogicalSplit(conf, *splitNum)
     if err != nil {
         log.Fatalln(err)
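
Since serialize_conf now reads the DFSConfig from stdin rather than from a file argument, callers pipe the serialized config in instead of passing a path. A hypothetical invocation (the binary name and config file name are illustrative; only the -split and -port flags appear in this diff):

    # before: dfs_split_reader -split 3 -port 50051 dfs_config.json
    dfs_split_reader -split 3 -port 50051 < dfs_config.json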
