This repository has been archived by the owner on Nov 23, 2017. It is now read-only.

Branch 2.0 #115

Open · wants to merge 22 commits into base: branch-1.6

Commits (22)
0020b7a  Add support for 2.0.0-preview (shivaram, Jun 14, 2016)
38b0095  Check if hadoop version is YARN for Spark 2.0 (shivaram, Jun 15, 2016)
11a2975  Address code review comments (shivaram, Jun 15, 2016)
79736f2  Remove debug print statement (shivaram, Jun 15, 2016)
d89a22e  Merge pull request #35 from shivaram/2.0-preview (shivaram, Jun 15, 2016)
8aff6d1  Now that it's been released, enable launching with spark 2.0.0 (tomerk, Aug 29, 2016)
59045a1  Updated default spark version and hadoop version to 2.0.0 and yarn (tomerk, Aug 29, 2016)
472d067  Merge pull request #46 from tomerk/branch-2.0 (shivaram, Aug 29, 2016)
783a075  Apply --additional-tags to EBS volumes (ajohnson-inst, Sep 7, 2016)
06f5d2b  Merge pull request #48 from aaronj1331/tag-volumes (shivaram, Sep 7, 2016)
bd25efa  Add Spark 2.0.1 to valid spark versions. (lagerspetz, Oct 9, 2016)
78280cb  Added also Spark 1.6.2 (lagerspetz, Oct 10, 2016)
81a5aeb  Merge pull request #62 from lagerspetz/branch-2.0 (shivaram, Oct 10, 2016)
bafa07c  Add missing 1.6.1 and new 1.6.3 and 2.0.2 (lagerspetz, Nov 15, 2016)
a3e1d7b  Merge pull request #73 from lagerspetz/202 (shivaram, Nov 15, 2016)
5188c78  Fix missing close quote (shivaram, Nov 16, 2016)
fcbe85f  Get rid of useless mount flag (Jan 10, 2017)
9314296  Merge pull request #79 from dud225/branch-2.0 (shivaram, Jan 11, 2017)
045507a  add missing spark_version 2.1.0 (shiyuangu, Feb 20, 2017)
7af4f6d  Merge pull request #87 from shiyuangu/branch-2.0 (shivaram, Feb 22, 2017)
697e802  Added Spark 2.1.1. (lagerspetz, Jun 21, 2017)
e6c4e09  Merge pull request #104 from lagerspetz/add-210 (shivaram, Jun 21, 2017)
create_image.sh (4 changes: 2 additions & 2 deletions)
@@ -38,12 +38,12 @@ sudo sed -i 's/.*ephemeral.*//g' /etc/cloud/cloud.cfg
 sudo sed -i 's/.*swap.*//g' /etc/cloud/cloud.cfg
 
 echo "mounts:" >> /etc/cloud/cloud.cfg
-echo " - [ ephemeral0, /mnt, auto, \"defaults,noatime,nodiratime\", "\
+echo " - [ ephemeral0, /mnt, auto, \"defaults,noatime\", "\
   "\"0\", \"0\" ]" >> /etc/cloud.cloud.cfg
 
 for x in {1..23}; do
   echo " - [ ephemeral$x, /mnt$((x + 1)), auto, "\
-    "\"defaults,noatime,nodiratime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg
+    "\"defaults,noatime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg
 done
 
 # Install Maven (for Hadoop)
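
Background on this change (the setup-slave.sh edits below are the same cleanup): on Linux, noatime has implied nodiratime since kernel 2.6.30, so passing both was redundant. For reference, the echo lines above append cloud-init mount entries of this shape to /etc/cloud/cloud.cfg (a hand-written sketch of the generated output, not part of the diff; note that the first entry's redirect target, /etc/cloud.cloud.cfg, appears to be a pre-existing typo for /etc/cloud/cloud.cfg that this diff leaves untouched):

mounts:
 - [ ephemeral0, /mnt, auto, "defaults,noatime", "0", "0" ]
 - [ ephemeral1, /mnt2, auto, "defaults,noatime", "0", "0" ]
 ...
 - [ ephemeral23, /mnt24, auto, "defaults,noatime", "0", "0" ]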
setup-slave.sh (10 changes: 5 additions & 5 deletions)
@@ -30,24 +30,24 @@ echo "Setting up slave on `hostname`... of type $instance_type"
 if [[ $instance_type == r3* || $instance_type == i2* || $instance_type == hi1* ]]; then
   # Format & mount using ext4, which has the best performance among ext3, ext4, and xfs based
   # on our shuffle heavy benchmark
-  EXT4_MOUNT_OPTS="defaults,noatime,nodiratime"
+  EXT4_MOUNT_OPTS="defaults,noatime"
   rm -rf /mnt*
   mkdir /mnt
   # To turn TRIM support on, uncomment the following line.
-  #echo '/dev/sdb /mnt ext4 defaults,noatime,nodiratime,discard 0 0' >> /etc/fstab
+  #echo '/dev/sdb /mnt ext4 defaults,noatime,discard 0 0' >> /etc/fstab
   mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/sdb
   mount -o $EXT4_MOUNT_OPTS /dev/sdb /mnt
 
   if [[ $instance_type == "r3.8xlarge" || $instance_type == "hi1.4xlarge" ]]; then
     mkdir /mnt2
     # To turn TRIM support on, uncomment the following line.
-    #echo '/dev/sdc /mnt2 ext4 defaults,noatime,nodiratime,discard 0 0' >> /etc/fstab
+    #echo '/dev/sdc /mnt2 ext4 defaults,noatime,discard 0 0' >> /etc/fstab
     if [[ $instance_type == "r3.8xlarge" ]]; then
       mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/sdc
       mount -o $EXT4_MOUNT_OPTS /dev/sdc /mnt2
     fi
     # To turn TRIM support on, uncomment the following line.
-    #echo '/dev/sdf /mnt2 ext4 defaults,noatime,nodiratime,discard 0 0' >> /etc/fstab
+    #echo '/dev/sdf /mnt2 ext4 defaults,noatime,discard 0 0' >> /etc/fstab
     if [[ $instance_type == "hi1.4xlarge" ]]; then
       mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/sdf
       mount -o $EXT4_MOUNT_OPTS /dev/sdf /mnt2
@@ -57,7 +57,7 @@ fi
 
 # Mount options to use for ext3 and xfs disks (the ephemeral disks
 # are ext3, but we use xfs for EBS volumes to format them faster)
-XFS_MOUNT_OPTS="defaults,noatime,nodiratime,allocsize=8m"
+XFS_MOUNT_OPTS="defaults,noatime,allocsize=8m"
 
 function setup_ebs_volume {
   device=$1
spark_ec2.py (57 changes: 50 additions & 7 deletions)
@@ -51,7 +51,7 @@
     raw_input = input
     xrange = range
 
-SPARK_EC2_VERSION = "1.6.0"
+SPARK_EC2_VERSION = "2.0.0"
 SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))
 
 VALID_SPARK_VERSIONS = set([
@@ -76,6 +76,15 @@
     "1.5.1",
     "1.5.2",
     "1.6.0",
+    "1.6.1",
+    "1.6.2",
+    "1.6.3",
+    "2.0.0-preview",
+    "2.0.0",
+    "2.0.1",
+    "2.0.2",
+    "2.1.0",
+    "2.1.1"
 ])
 
 SPARK_TACHYON_MAP = {
@@ -94,14 +103,15 @@
     "1.5.1": "0.7.1",
     "1.5.2": "0.7.1",
     "1.6.0": "0.8.2",
+    "2.0.0-preview": "",
 }
 
 DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION
 DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark"
 
 # Default location to get the spark-ec2 scripts (and ami-list) from
 DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2"
-DEFAULT_SPARK_EC2_BRANCH = "branch-1.5"
+DEFAULT_SPARK_EC2_BRANCH = "branch-2.0"
 
 
 def setup_external_libs(libs):
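
Since DEFAULT_SPARK_VERSION aliases SPARK_EC2_VERSION, the one-line version bump above also moves the launch default. A summary of the constants after this change (values copied from the diff; annotations are editorial):

SPARK_EC2_VERSION = "2.0.0"                # was "1.6.0"
DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION  # clusters launch Spark 2.0.0 unless --spark-version overrides it
DEFAULT_SPARK_EC2_BRANCH = "branch-2.0"    # setup scripts now fetched from branch-2.0 (was branch-1.5)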
@@ -234,7 +244,7 @@ def parse_args():
         "the directory is not created and its contents are copied directly into /. " +
         "(default: %default).")
     parser.add_option(
-        "--hadoop-major-version", default="1",
+        "--hadoop-major-version", default="yarn",
         help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.2.0), yarn " +
              "(Hadoop 2.4.0) (default: %default)")
     parser.add_option(
@@ -307,6 +317,10 @@ def parse_args():
         "--additional-tags", type="string", default="",
         help="Additional tags to set on the machines; tags are comma-separated, while name and " +
              "value are colon separated; ex: \"Task:MySparkProject,Env:production\"")
+    parser.add_option(
+        "--tag-volumes", action="store_true", default=False,
+        help="Apply the tags given in --additional-tags to any EBS volumes " +
+             "attached to master and slave instances.")
     parser.add_option(
         "--copy-aws-credentials", action="store_true", default=False,
         help="Add AWS credentials to hadoop configuration to allow Spark to access S3")
@@ -362,10 +376,23 @@ def get_or_make_group(conn, name, vpc_id):
         print("Creating security group " + name)
         return conn.create_security_group(name, "Spark EC2 group", vpc_id)
 
+def validate_spark_hadoop_version(spark_version, hadoop_version):
+    if "." in spark_version:
+        parts = spark_version.split(".")
+        if parts[0].isdigit():
+            spark_major_version = float(parts[0])
+            if spark_major_version > 1.0 and hadoop_version != "yarn":
+                print("Spark version: {v}, does not support Hadoop version: {hv}".
+                      format(v=spark_version, hv=hadoop_version), file=stderr)
+                sys.exit(1)
+        else:
+            print("Invalid Spark version: {v}".format(v=spark_version), file=stderr)
+            sys.exit(1)
+
 def get_validate_spark_version(version, repo):
     if "." in version:
-        version = version.replace("v", "")
+        # Remove leading v to handle inputs like v1.5.0
+        version = version.lstrip("v")
         if version not in VALID_SPARK_VERSIONS:
             print("Don't know about Spark version: {v}".format(v=version), file=stderr)
             sys.exit(1)
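
A quick behavior sketch of the new check (hand-written calls, not part of the diff): only dotted release versions are constrained, and Spark 2.x must be paired with the yarn Hadoop build:

validate_spark_hadoop_version("2.0.2", "yarn")   # OK: Spark 2.x on YARN
validate_spark_hadoop_version("1.6.3", "1")      # OK: pre-2.0 may use Hadoop 1
validate_spark_hadoop_version("2.0.2", "2")      # prints an error and calls sys.exit(1)
validate_spark_hadoop_version("x.y.z", "yarn")   # "Invalid Spark version" and exit
validate_spark_hadoop_version("4f8a9c2", "1")    # hypothetical git hash: no "." -> no check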
@@ -735,16 +762,28 @@ def launch_cluster(conn, opts, cluster_name):
             map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
         )
 
+    print('Applying tags to master nodes')
     for master in master_nodes:
         master.add_tags(
             dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
         )
 
+    print('Applying tags to slave nodes')
     for slave in slave_nodes:
         slave.add_tags(
             dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
         )
 
+    if opts.tag_volumes:
+        if len(additional_tags) > 0:
+            print('Applying tags to volumes')
+            all_instance_ids = [x.id for x in master_nodes + slave_nodes]
+            volumes = conn.get_all_volumes(filters={'attachment.instance-id': all_instance_ids})
+            for v in volumes:
+                v.add_tags(additional_tags)
+        else:
+            print('--tag-volumes has no effect without --additional-tags')
+
     # Return all the instances
     return (master_nodes, slave_nodes)
 
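For reference, a standalone sketch (with a hypothetical tag string mirroring the help text for --additional-tags) of how the additional_tags dict consumed by both the instance tagging and the new volume tagging above is built:

# Hypothetical input; the parsing expression is copied from the hunk above
raw_tags = "Task:MySparkProject,Env:production"
additional_tags = dict(
    map(str.strip, tag.split(':', 1)) for tag in raw_tags.split(',')
)
print(additional_tags)  # {'Task': 'MySparkProject', 'Env': 'production'}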
@@ -1052,13 +1091,16 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
     if "." in opts.spark_version:
         # Pre-built Spark deploy
         spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
+        validate_spark_hadoop_version(spark_v, opts.hadoop_major_version)
         tachyon_v = get_tachyon_version(spark_v)
     else:
         # Spark-only custom deploy
         spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version)
         tachyon_v = ""
-        print("Deploying Spark via git hash; Tachyon won't be set up")
-        modules = filter(lambda x: x != "tachyon", modules)
+
+    if tachyon_v == "":
+        print("No valid Tachyon version found; Tachyon won't be set up")
+        modules.remove("tachyon")
 
     master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes]
     slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes]
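
The Tachyon logic above keys off an empty-string sentinel. A minimal sketch of the effect, assuming get_tachyon_version falls back to "" for versions absent from SPARK_TACHYON_MAP (the fallback itself is not shown in this diff):

# Assumption: versions missing from the map (e.g. the 2.x releases added
# in this PR) yield "" from get_tachyon_version
SPARK_TACHYON_MAP = {"1.6.0": "0.8.2", "2.0.0-preview": ""}

def get_tachyon_version(spark_version):
    return SPARK_TACHYON_MAP.get(spark_version, "")

modules = ["spark", "ephemeral-hdfs", "tachyon"]
if get_tachyon_version("2.0.2") == "":
    modules.remove("tachyon")  # no Tachyon release matches Spark 2.x
print(modules)  # ['spark', 'ephemeral-hdfs']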
@@ -1259,7 +1301,8 @@ def real_main():
     (opts, action, cluster_name) = parse_args()
 
     # Input parameter validation
-    get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
+    spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
+    validate_spark_hadoop_version(spark_v, opts.hadoop_major_version)
 
     if opts.wait is not None:
         # NOTE: DeprecationWarnings are silent in 2.7+ by default.