Skip to content

Commit

Permalink
FEAT-#2444: add docker file for nyc on omnisci (#2445)
Browse files Browse the repository at this point in the history
Signed-off-by: Anatoly Myachev <[email protected]>
  • Loading branch information
anmyachev authored Nov 20, 2020
1 parent 54604f2 commit 74acc1b
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 0 deletions.
19 changes: 19 additions & 0 deletions examples/docker/taxi-on-omnisci/build-docker-image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash -e

# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

# Work from the directory that contains this script so that the dockerfile and
# the build context (".") resolve regardless of where the script is called from.
cd "$(dirname "$0")"

# A --build-arg given without a value takes its value from the variable of the
# same name in the calling environment.
docker build \
    --file nyc-taxi-omnisci.dockerfile \
    --tag nyc-taxi-omnisci \
    --build-arg https_proxy \
    --build-arg http_proxy \
    .
printf "\n\nTo run the benchmark execute:\n\tdocker run --rm nyc-taxi-omnisci\n"
53 changes: 53 additions & 0 deletions examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

FROM ubuntu:18.04

# Proxy settings are taken from the build arguments of the same name.
ENV http_proxy ${http_proxy}
ENV https_proxy ${https_proxy}

# Modin configuration for the benchmark: OmniSci backend, experimental mode on.
ENV MODIN_BACKEND "omnisci"
ENV MODIN_EXPERIMENTAL "true"

# wget is only needed to download the Miniconda installer below; the apt lists
# are removed in the same layer to keep the image small.
RUN apt-get update --yes \
    && apt-get install wget --yes && \
    rm -rf /var/lib/apt/lists/*

ENV USER modin
ENV UID 1000
ENV HOME /home/$USER

# NOTE(review): the user is created here, but there is no USER instruction in
# this file, so the remaining build steps and the CMD still run as the base
# image's default user with HOME pointing at /home/modin - confirm this is intended.
RUN adduser --disabled-password \
    --gecos "Non-root user" \
    --uid $UID \
    --home $HOME \
    $USER

ENV CONDA_DIR ${HOME}/miniconda

# Run the following RUN steps through a login shell so that ${HOME}/.profile
# (extended below to source conda.sh) is read and `conda activate` is usable.
SHELL ["/bin/bash", "--login", "-c"]

# Install Miniconda in batch mode into ${CONDA_DIR} and hook it into the shell
# start-up files.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \
    bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \
    "${CONDA_DIR}/bin/conda" init bash && \
    rm -f /tmp/miniconda3.sh && \
    echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"

# Create the `modin` environment and install Modin and Ray into it.
# Every conda command is given an explicit "yes" so the unattended build never
# depends on how conda treats a confirmation prompt without an interactive stdin.
RUN conda update -n base -c defaults conda -y && \
    conda create -n modin --yes --no-default-packages && \
    conda activate modin && \
    conda install --yes -c intel/label/modin -c conda-forge modin "ray>=1.0.0" && \
    conda clean --all --yes

# Both files are taken from the build context; trips_xaa.csv is the data set
# the benchmark script reads and has to be placed there before building.
COPY trips_xaa.csv "${HOME}/trips_xaa.csv"
COPY nyc-taxi-omnisci.py "${HOME}/nyc-taxi-omnisci.py"

# Exec-form CMD does not go through SHELL, so the login shell is spelled out
# again to make `conda activate` available when the container starts.
CMD ["/bin/bash", "--login", "-c", "conda activate modin && python ${HOME}/nyc-taxi-omnisci.py"]
108 changes: 108 additions & 0 deletions examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import os
import time
import modin.pandas as pd
from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer

def read():
    """Load ``~/trips_xaa.csv`` into a Modin frame and import it into OmniSci.

    Column names are passed to ``read_csv`` explicitly. Columns typed as
    ``"timestamp"`` are handed to ``parse_dates``; all remaining columns get
    their dtype through the ``dtype`` mapping.

    Returns
    -------
    modin.pandas.DataFrame
        The loaded trips frame, after its first partition has been pushed to
        the OmniSci server.
    """
    column_names = [
        "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag",
        "rate_code_id", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude",
        "passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax", "tip_amount",
        "tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount", "payment_type",
        "trip_type", "pickup", "dropoff", "cab_type", "precipitation", "snow_depth", "snowfall",
        "max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid",
        "pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010",
        "pickup_boroct2010", "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname", "pickup_puma",
        "dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode", "dropoff_boroname",
        "dropoff_ct2010", "dropoff_boroct2010", "dropoff_cdeligibil", "dropoff_ntacode",
        "dropoff_ntaname", "dropoff_puma",
    ]
    # Text columns are declared as "string" rather than "category".
    column_types = [
        "int64", "string", "timestamp", "timestamp", "string", "int64", "float64", "float64",
        "float64", "float64", "int64", "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "string", "float64", "string", "string", "string", "float64",
        "int64", "float64", "int64", "int64", "float64", "float64", "float64", "float64", "string", "float64",
        "float64", "string", "string", "string", "float64", "float64", "float64", "float64", "string",
        "float64", "float64", "string", "string", "string", "float64",
    ]

    # Pair every column with its declared type, then split the mapping into
    # what goes to `dtype` and what goes to `parse_dates`.
    type_by_column = dict(zip(column_names, column_types))
    non_date_dtypes = {
        name: kind for name, kind in type_by_column.items() if kind != "timestamp"
    }
    date_columns = [
        name for name, kind in type_by_column.items() if kind == "timestamp"
    ]

    csv_path = os.path.expanduser('~/trips_xaa.csv')
    df = pd.read_csv(
        csv_path,
        names=column_names,
        dtype=non_date_dtypes,
        parse_dates=date_columns,
    )

    df.shape  # to trigger real execution

    # Push the data of the first partition into the OmniSci server and remember
    # the id it was stored under; this also triggers real execution.
    first_partition = df._query_compiler._modin_frame._partitions[0][0]
    first_partition.frame_id = OmnisciServer().put_arrow_to_omnisci(
        first_partition.get()
    )
    return df


def q1_omnisci(df):
    """Benchmark query 1: count the rows of every ``cab_type`` group.

    Parameters
    ----------
    df : DataFrame
        Trips frame containing a ``cab_type`` column.

    Returns
    -------
    Series
        Group sizes indexed by ``cab_type``.
    """
    by_cab_type = df.groupby("cab_type")
    group_sizes = by_cab_type.size()
    # Reading `shape` is done only to trigger real execution of the query.
    group_sizes.shape
    return group_sizes

def q2_omnisci(df):
    """Benchmark query 2: mean ``total_amount`` for every ``passenger_count``.

    Parameters
    ----------
    df : DataFrame
        Trips frame containing ``passenger_count`` and ``total_amount`` columns.

    Returns
    -------
    DataFrame
        One ``total_amount`` column holding the mean, indexed by
        ``passenger_count``.
    """
    by_passenger_count = df.groupby("passenger_count")
    mean_amounts = by_passenger_count.agg({"total_amount": "mean"})
    # Reading `shape` is done only to trigger real execution of the query.
    mean_amounts.shape
    return mean_amounts

def q3_omnisci(df):
    """Benchmark query 3: count trips per passenger count and pickup year.

    Note that ``df`` is modified in place: its ``pickup_datetime`` column is
    overwritten with the year extracted from it.

    Parameters
    ----------
    df : DataFrame
        Trips frame with ``passenger_count`` and a datetime-typed
        ``pickup_datetime`` column.

    Returns
    -------
    Series
        Group sizes indexed by (``passenger_count``, ``pickup_datetime``).
    """
    pickup_year = df["pickup_datetime"].dt.year
    df["pickup_datetime"] = pickup_year

    group_keys = ["passenger_count", "pickup_datetime"]
    group_sizes = df.groupby(group_keys).size()
    # Reading `shape` is done only to trigger real execution of the query.
    group_sizes.shape
    return group_sizes

def q4_omnisci(df):
    """Benchmark query 4: trips per passenger count, pickup year and distance.

    Note that ``df`` is modified in place: ``pickup_datetime`` is overwritten
    with its year and ``trip_distance`` is cast to ``int64``.

    Parameters
    ----------
    df : DataFrame
        Trips frame with ``passenger_count``, a datetime-typed
        ``pickup_datetime`` and a numeric ``trip_distance`` column.

    Returns
    -------
    DataFrame
        One row per group with the group size in column ``0``, ordered by
        ascending ``pickup_datetime`` and then descending group size.
    """
    pickup_year = df["pickup_datetime"].dt.year
    df["pickup_datetime"] = pickup_year
    whole_distance = df["trip_distance"].astype("int64")
    df["trip_distance"] = whole_distance

    group_keys = ["passenger_count", "pickup_datetime", "trip_distance"]
    grouped = df.groupby(group_keys, sort=False)
    # After reset_index() the group sizes live in a column labelled 0.
    counts = grouped.size().reset_index()
    ordered = counts.sort_values(
        by=["pickup_datetime", 0],
        ignore_index=True,
        ascending=[True, False],
    )
    # Reading `shape` is done only to trigger real execution of the query.
    ordered.shape
    return ordered

def measure(name, func, *args, **kw):
    """Call ``func`` once, print how long the call took, and return its result.

    The duration is taken with ``time.perf_counter()``, a monotonic
    high-resolution clock, so the reported value cannot be distorted by
    system clock adjustments made while ``func`` is running.

    Parameters
    ----------
    name : str
        Label printed in front of the measured duration.
    func : callable
        Function to time.
    *args, **kw
        Positional and keyword arguments forwarded to ``func`` unchanged.

    Returns
    -------
    object
        Whatever ``func(*args, **kw)`` returned. An exception raised by
        ``func`` propagates and nothing is printed in that case.
    """
    started = time.perf_counter()
    res = func(*args, **kw)
    elapsed = time.perf_counter() - started
    print(f'{name}: {elapsed} sec')
    return res

def main():
    """Run the whole benchmark: time the data load, then each of the four queries."""
    trips = measure('Reading', read)

    # Q3 and Q4 overwrite columns of the frame they receive, so each of them is
    # given its own copy; the copy is made before the timed call starts.
    benchmark_plan = (
        ('Q1', q1_omnisci, False),
        ('Q2', q2_omnisci, False),
        ('Q3', q3_omnisci, True),
        ('Q4', q4_omnisci, True),
    )
    for label, query, needs_private_copy in benchmark_plan:
        frame = trips.copy() if needs_private_copy else trips
        measure(label, query, frame)


# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    main()

0 comments on commit 74acc1b

Please sign in to comment.