This repository has been archived by the owner on Nov 23, 2017. It is now read-only.

Branch 2.0 #115

Open · wants to merge 22 commits into base: branch-1.6

Commits (22)
0020b7a  Add support for 2.0.0-preview (shivaram, Jun 14, 2016)
38b0095  Check if hadoop version is YARN for Spark 2.0 (shivaram, Jun 15, 2016)
11a2975  Address code review comments (shivaram, Jun 15, 2016)
79736f2  Remove debug print statement (shivaram, Jun 15, 2016)
d89a22e  Merge pull request #35 from shivaram/2.0-preview (shivaram, Jun 15, 2016)
8aff6d1  Now that it's been released, enable launching with spark 2.0.0 (tomerk, Aug 29, 2016)
59045a1  Updated default spark version and hadoop version to 2.0.0 and yarn (tomerk, Aug 29, 2016)
472d067  Merge pull request #46 from tomerk/branch-2.0 (shivaram, Aug 29, 2016)
783a075  Apply --additional-tags to EBS volumes (ajohnson-inst, Sep 7, 2016)
06f5d2b  Merge pull request #48 from aaronj1331/tag-volumes (shivaram, Sep 7, 2016)
bd25efa  Add Spark 2.0.1 to valid spark versions. (lagerspetz, Oct 9, 2016)
78280cb  Added also Spark 1.6.2 (lagerspetz, Oct 10, 2016)
81a5aeb  Merge pull request #62 from lagerspetz/branch-2.0 (shivaram, Oct 10, 2016)
bafa07c  Add missing 1.6.1 and new 1.6.3 and 2.0.2 (lagerspetz, Nov 15, 2016)
a3e1d7b  Merge pull request #73 from lagerspetz/202 (shivaram, Nov 15, 2016)
5188c78  Fix missing close quote (shivaram, Nov 16, 2016)
fcbe85f  Get rid of useless mount flag (Jan 10, 2017)
9314296  Merge pull request #79 from dud225/branch-2.0 (shivaram, Jan 11, 2017)
045507a  add missing spark_version 2.1.0 (shiyuangu, Feb 20, 2017)
7af4f6d  Merge pull request #87 from shiyuangu/branch-2.0 (shivaram, Feb 22, 2017)
697e802  Added Spark 2.1.1. (lagerspetz, Jun 21, 2017)
e6c4e09  Merge pull request #104 from lagerspetz/add-210 (shivaram, Jun 21, 2017)
create_image.sh (4 changes: 2 additions & 2 deletions)
@@ -38,12 +38,12 @@ sudo sed -i 's/.*ephemeral.*//g' /etc/cloud/cloud.cfg
 sudo sed -i 's/.*swap.*//g' /etc/cloud/cloud.cfg
 
 echo "mounts:" >> /etc/cloud/cloud.cfg
-echo " - [ ephemeral0, /mnt, auto, \"defaults,noatime,nodiratime\", "\
+echo " - [ ephemeral0, /mnt, auto, \"defaults,noatime\", "\
   "\"0\", \"0\" ]" >> /etc/cloud.cloud.cfg
 
 for x in {1..23}; do
   echo " - [ ephemeral$x, /mnt$((x + 1)), auto, "\
-    "\"defaults,noatime,nodiratime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg
+    "\"defaults,noatime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg
 done
 
 # Install Maven (for Hadoop)
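
Background on this change (the setup-slave.sh edits below are the same cleanup): on Linux, noatime has implied nodiratime since kernel 2.6.30, so passing both was redundant. For reference, the echo lines above append cloud-init mount entries of this shape to /etc/cloud/cloud.cfg (a hand-written sketch of the generated output, not part of the diff; note that the first entry's redirect target, /etc/cloud.cloud.cfg, appears to be a pre-existing typo for /etc/cloud/cloud.cfg that this diff leaves untouched):

mounts:
 - [ ephemeral0, /mnt, auto, "defaults,noatime", "0", "0" ]
 - [ ephemeral1, /mnt2, auto, "defaults,noatime", "0", "0" ]
 ...
 - [ ephemeral23, /mnt24, auto, "defaults,noatime", "0", "0" ]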
setup-slave.sh (10 changes: 5 additions & 5 deletions)
@@ -30,24 +30,24 @@ echo "Setting up slave on `hostname`... of type $instance_type"
 if [[ $instance_type == r3* || $instance_type == i2* || $instance_type == hi1* ]]; then
   # Format & mount using ext4, which has the best performance among ext3, ext4, and xfs based
   # on our shuffle heavy benchmark
-  EXT4_MOUNT_OPTS="defaults,noatime,nodiratime"
+  EXT4_MOUNT_OPTS="defaults,noatime"
   rm -rf /mnt*
   mkdir /mnt
   # To turn TRIM support on, uncomment the following line.
-  #echo '/dev/sdb /mnt ext4 defaults,noatime,nodiratime,discard 0 0' >> /etc/fstab
+  #echo '/dev/sdb /mnt ext4 defaults,noatime,discard 0 0' >> /etc/fstab
   mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/sdb
   mount -o $EXT4_MOUNT_OPTS /dev/sdb /mnt
 
   if [[ $instance_type == "r3.8xlarge" || $instance_type == "hi1.4xlarge" ]]; then
     mkdir /mnt2
     # To turn TRIM support on, uncomment the following line.
-    #echo '/dev/sdc /mnt2 ext4 defaults,noatime,nodiratime,discard 0 0' >> /etc/fstab
+    #echo '/dev/sdc /mnt2 ext4 defaults,noatime,discard 0 0' >> /etc/fstab
     if [[ $instance_type == "r3.8xlarge" ]]; then
       mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/sdc
       mount -o $EXT4_MOUNT_OPTS /dev/sdc /mnt2
     fi
     # To turn TRIM support on, uncomment the following line.
-    #echo '/dev/sdf /mnt2 ext4 defaults,noatime,nodiratime,discard 0 0' >> /etc/fstab
+    #echo '/dev/sdf /mnt2 ext4 defaults,noatime,discard 0 0' >> /etc/fstab
     if [[ $instance_type == "hi1.4xlarge" ]]; then
       mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/sdf
       mount -o $EXT4_MOUNT_OPTS /dev/sdf /mnt2
@@ -57,7 +57,7 @@ fi
 
 # Mount options to use for ext3 and xfs disks (the ephemeral disks
 # are ext3, but we use xfs for EBS volumes to format them faster)
-XFS_MOUNT_OPTS="defaults,noatime,nodiratime,allocsize=8m"
+XFS_MOUNT_OPTS="defaults,noatime,allocsize=8m"
 
 function setup_ebs_volume {
   device=$1
spark_ec2.py (57 changes: 50 additions & 7 deletions)
@@ -51,7 +51,7 @@
     raw_input = input
     xrange = range
 
-SPARK_EC2_VERSION = "1.6.0"
+SPARK_EC2_VERSION = "2.0.0"
 SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))
 
 VALID_SPARK_VERSIONS = set([
@@ -76,6 +76,15 @@
     "1.5.1",
     "1.5.2",
     "1.6.0",
+    "1.6.1",
+    "1.6.2",
+    "1.6.3",
+    "2.0.0-preview",
+    "2.0.0",
+    "2.0.1",
+    "2.0.2",
+    "2.1.0",
+    "2.1.1"
 ])
 
 SPARK_TACHYON_MAP = {
@@ -94,14 +103,15 @@
     "1.5.1": "0.7.1",
     "1.5.2": "0.7.1",
     "1.6.0": "0.8.2",
+    "2.0.0-preview": "",
 }
 
 DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION
 DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark"
 
 # Default location to get the spark-ec2 scripts (and ami-list) from
 DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2"
-DEFAULT_SPARK_EC2_BRANCH = "branch-1.5"
+DEFAULT_SPARK_EC2_BRANCH = "branch-2.0"
 
 
 def setup_external_libs(libs):
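
Since DEFAULT_SPARK_VERSION aliases SPARK_EC2_VERSION, the one-line version bump above also moves the launch default. A summary of the constants after this change (values copied from the diff; annotations are editorial):

SPARK_EC2_VERSION = "2.0.0"                # was "1.6.0"
DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION  # clusters launch Spark 2.0.0 unless --spark-version overrides it
DEFAULT_SPARK_EC2_BRANCH = "branch-2.0"    # setup scripts now fetched from branch-2.0 (was branch-1.5)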
@@ -234,7 +244,7 @@ def parse_args():
         "the directory is not created and its contents are copied directly into /. " +
         "(default: %default).")
     parser.add_option(
-        "--hadoop-major-version", default="1",
+        "--hadoop-major-version", default="yarn",
         help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.2.0), yarn " +
              "(Hadoop 2.4.0) (default: %default)")
     parser.add_option(
@@ -307,6 +317,10 @@ def parse_args():
         "--additional-tags", type="string", default="",
         help="Additional tags to set on the machines; tags are comma-separated, while name and " +
              "value are colon separated; ex: \"Task:MySparkProject,Env:production\"")
+    parser.add_option(
+        "--tag-volumes", action="store_true", default=False,
+        help="Apply the tags given in --additional-tags to any EBS volumes " +
+             "attached to master and slave instances.")
     parser.add_option(
         "--copy-aws-credentials", action="store_true", default=False,
         help="Add AWS credentials to hadoop configuration to allow Spark to access S3")
@@ -362,10 +376,23 @@ def get_or_make_group(conn, name, vpc_id):
         print("Creating security group " + name)
         return conn.create_security_group(name, "Spark EC2 group", vpc_id)
 
+def validate_spark_hadoop_version(spark_version, hadoop_version):
+    if "." in spark_version:
+        parts = spark_version.split(".")
+        if parts[0].isdigit():
+            spark_major_version = float(parts[0])
+            if spark_major_version > 1.0 and hadoop_version != "yarn":
+                print("Spark version: {v}, does not support Hadoop version: {hv}".
+                      format(v=spark_version, hv=hadoop_version), file=stderr)
+                sys.exit(1)
+        else:
+            print("Invalid Spark version: {v}".format(v=spark_version), file=stderr)
+            sys.exit(1)
+
 def get_validate_spark_version(version, repo):
     if "." in version:
-        version = version.replace("v", "")
+        # Remove leading v to handle inputs like v1.5.0
+        version = version.lstrip("v")
         if version not in VALID_SPARK_VERSIONS:
             print("Don't know about Spark version: {v}".format(v=version), file=stderr)
             sys.exit(1)
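
A quick behavior sketch of the new check (hand-written calls, not part of the diff): only dotted release versions are constrained, and Spark 2.x must be paired with the yarn Hadoop build:

validate_spark_hadoop_version("2.0.2", "yarn")   # OK: Spark 2.x on YARN
validate_spark_hadoop_version("1.6.3", "1")      # OK: pre-2.0 may use Hadoop 1
validate_spark_hadoop_version("2.0.2", "2")      # prints an error and calls sys.exit(1)
validate_spark_hadoop_version("x.y.z", "yarn")   # "Invalid Spark version" and exit
validate_spark_hadoop_version("4f8a9c2", "1")    # hypothetical git hash: no "." -> no check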
@@ -735,16 +762,28 @@ def launch_cluster(conn, opts, cluster_name):
             map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
         )
 
+    print('Applying tags to master nodes')
     for master in master_nodes:
         master.add_tags(
             dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
         )
 
+    print('Applying tags to slave nodes')
     for slave in slave_nodes:
         slave.add_tags(
             dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
         )
 
+    if opts.tag_volumes:
+        if len(additional_tags) > 0:
+            print('Applying tags to volumes')
+            all_instance_ids = [x.id for x in master_nodes + slave_nodes]
+            volumes = conn.get_all_volumes(filters={'attachment.instance-id': all_instance_ids})
+            for v in volumes:
+                v.add_tags(additional_tags)
+        else:
+            print('--tag-volumes has no effect without --additional-tags')
+
     # Return all the instances
     return (master_nodes, slave_nodes)
 
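For reference, a standalone sketch (with a hypothetical tag string mirroring the help text for --additional-tags) of how the additional_tags dict consumed by both the instance tagging and the new volume tagging above is built:

# Hypothetical input; the parsing expression is copied from the hunk above
raw_tags = "Task:MySparkProject,Env:production"
additional_tags = dict(
    map(str.strip, tag.split(':', 1)) for tag in raw_tags.split(',')
)
print(additional_tags)  # {'Task': 'MySparkProject', 'Env': 'production'}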
@@ -1052,13 +1091,16 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
     if "." in opts.spark_version:
         # Pre-built Spark deploy
         spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
+        validate_spark_hadoop_version(spark_v, opts.hadoop_major_version)
         tachyon_v = get_tachyon_version(spark_v)
     else:
         # Spark-only custom deploy
         spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version)
         tachyon_v = ""
-        print("Deploying Spark via git hash; Tachyon won't be set up")
-        modules = filter(lambda x: x != "tachyon", modules)
+
+    if tachyon_v == "":
+        print("No valid Tachyon version found; Tachyon won't be set up")
+        modules.remove("tachyon")
 
     master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes]
     slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes]
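
The Tachyon logic above keys off an empty-string sentinel. A minimal sketch of the effect, assuming get_tachyon_version falls back to "" for versions absent from SPARK_TACHYON_MAP (the fallback itself is not shown in this diff):

# Assumption: versions missing from the map (e.g. the 2.x releases added
# in this PR) yield "" from get_tachyon_version
SPARK_TACHYON_MAP = {"1.6.0": "0.8.2", "2.0.0-preview": ""}

def get_tachyon_version(spark_version):
    return SPARK_TACHYON_MAP.get(spark_version, "")

modules = ["spark", "ephemeral-hdfs", "tachyon"]
if get_tachyon_version("2.0.2") == "":
    modules.remove("tachyon")  # no Tachyon release matches Spark 2.x
print(modules)  # ['spark', 'ephemeral-hdfs']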
@@ -1259,7 +1301,8 @@ def real_main():
     (opts, action, cluster_name) = parse_args()
 
     # Input parameter validation
-    get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
+    spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
+    validate_spark_hadoop_version(spark_v, opts.hadoop_major_version)
 
     if opts.wait is not None:
         # NOTE: DeprecationWarnings are silent in 2.7+ by default.