Header-only refactoring of derive_trajectories #628

Merged
Changes from 13 commits

97 changes: 97 additions & 0 deletions cpp/include/cuspatial/experimental/derive_trajectories.cuh
@@ -0,0 +1,97 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cstdint>
#include <iterator>
#include <memory>

namespace cuspatial {

/**
* @addtogroup trajectory_api
* @{
*/

/**
* @brief Derive trajectories from object ids, points, and timestamps.
*
* Groups the input object ids to determine unique trajectories, and writes all input data to the
Review comment (Contributor): This reads like the input data will be reordered in-place.
* output ranges, grouped by object ID and ordered by timestamp within each group. Returns a vector
* containing the offset index of the first object of each trajectory in the output.
*
* @tparam IdInputIt Iterator over object IDs. Must meet the requirements of
* [LegacyRandomAccessIterator][LinkLRAI] and be device-readable.
* @tparam PointInputIt Iterator over points. Must meet the requirements of
* [LegacyRandomAccessIterator][LinkLRAI] and be device-readable.
* @tparam TimestampInputIt Iterator over timestamps. Must meet the requirements of
* [LegacyRandomAccessIterator][LinkLRAI] and be device-readable.
* @tparam IdOutputIt Iterator over output object IDs. Must meet the requirements of
* [LegacyRandomAccessIterator][LinkLRAI] and be device-writeable.
* @tparam PointOutputIt Iterator over output points. Must meet the requirements of
* [LegacyRandomAccessIterator][LinkLRAI] and be device-writeable.
* @tparam TimestampOutputIt Iterator over output timestamps. Must meet the requirements of
* [LegacyRandomAccessIterator][LinkLRAI] and be device-writeable.
*
* @param ids_first beginning of the range of input object ids
* @param ids_last end of the range of input object ids
* @param points_first beginning of the range of input point (x,y) coordinates
* @param timestamps_first beginning of the range of input timestamps
* @param ids_out_first beginning of the range of output object ids
* @param points_out_first beginning of the range of output point (x,y) coordinates
* @param timestamps_out_first beginning of the range of output timestamps
* @param stream the CUDA stream on which to perform computations and allocate memory.
* @param mr optional resource to use for output device memory allocations
*
* @return a unique_ptr to a device_vector containing the offset index of the first object of each
* trajectory in the sorted output. These offsets can be used to access the sorted output data.
*
* @pre There must be no overlap between any of the input and output ranges.
* @pre The type of the object IDs and timestamps must support strict weak ordering via comparison
* operators.
*
* [LinkLRAI]: https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator
* "LegacyRandomAccessIterator"
*/
template <typename IdInputIt,
typename PointInputIt,
typename TimestampInputIt,
typename IdOutputIt,
typename PointOutputIt,
typename TimestampOutputIt,
typename OffsetType = std::int32_t>
std::unique_ptr<rmm::device_vector<OffsetType>> derive_trajectories(
IdInputIt ids_first,
IdInputIt ids_last,
PointInputIt points_first,
TimestampInputIt timestamps_first,
IdOutputIt ids_out_first,
PointOutputIt points_out_first,
TimestampOutputIt timestamps_out_first,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @} // end of doxygen group
*/

} // namespace cuspatial

#include <cuspatial/experimental/detail/derive_trajectories.cuh>
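
Editor's note (illustrative, not part of the PR): the sketch below shows one way the header-only API declared above might be called. It follows the same zip-iterator pattern that the cudf wrapper in cpp/src/trajectory/derive_trajectories.cu uses for the (x, y) coordinates; all container names, sizes, and values are hypothetical.

// example_derive_trajectories.cu -- hypothetical usage sketch, not part of this PR
#include <cuspatial/experimental/derive_trajectories.cuh>

#include <rmm/device_vector.hpp>

#include <thrust/iterator/zip_iterator.h>

#include <cstdint>
#include <vector>

void example()
{
  // Three points from two objects (ids 1, 0, 0), deliberately out of order.
  auto h_ids = std::vector<std::int32_t>{1, 0, 0};
  auto h_xs  = std::vector<double>{3.0, 1.0, 2.0};
  auto h_ys  = std::vector<double>{3.0, 1.0, 2.0};
  auto h_ts  = std::vector<std::int64_t>{30, 10, 20};

  auto ids = rmm::device_vector<std::int32_t>(h_ids.begin(), h_ids.end());
  auto xs  = rmm::device_vector<double>(h_xs.begin(), h_xs.end());
  auto ys  = rmm::device_vector<double>(h_ys.begin(), h_ys.end());
  auto ts  = rmm::device_vector<std::int64_t>(h_ts.begin(), h_ts.end());

  // Output ranges must be pre-allocated and must not overlap the inputs.
  auto ids_out = rmm::device_vector<std::int32_t>(ids.size());
  auto xs_out  = rmm::device_vector<double>(ids.size());
  auto ys_out  = rmm::device_vector<double>(ids.size());
  auto ts_out  = rmm::device_vector<std::int64_t>(ids.size());

  // The returned vector holds, for each trajectory, the index of its first point in the
  // sorted output, so offsets->size() is the number of trajectories.
  auto offsets =
    cuspatial::derive_trajectories(ids.begin(),
                                   ids.end(),
                                   thrust::make_zip_iterator(xs.begin(), ys.begin()),
                                   ts.begin(),
                                   ids_out.begin(),
                                   thrust::make_zip_iterator(xs_out.begin(), ys_out.begin()),
                                   ts_out.begin());
}
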
155 changes: 155 additions & 0 deletions cpp/include/cuspatial/experimental/detail/derive_trajectories.cuh
@@ -0,0 +1,155 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <thrust/gather.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/sort.h>

#include <cub/device/device_merge_sort.cuh>

#include <cstdint>
#include <iterator>
#include <memory>

namespace cuspatial {

namespace detail {

template <typename Tuple>
struct trajectory_comparator {
__device__ bool operator()(Tuple const& lhs, Tuple const& rhs)
{
auto lhs_id = thrust::get<0>(lhs);
auto rhs_id = thrust::get<0>(rhs);
auto lhs_ts = thrust::get<1>(lhs);
auto rhs_ts = thrust::get<1>(rhs);
return (lhs_id < rhs_id) || ((lhs_id == rhs_id) && (lhs_ts < rhs_ts));
};
};

template <typename IdInputIt,
typename PointInputIt,
typename TimestampInputIt,
typename IdOutputIt,
typename PointOutputIt,
typename TimestampOutputIt>
void order_trajectories(IdInputIt ids_first,
IdInputIt ids_last,
PointInputIt points_first,
TimestampInputIt timestamps_first,
IdOutputIt ids_out_first,
PointOutputIt points_out_first,
TimestampOutputIt timestamps_out_first,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
using id_type = typename std::iterator_traits<IdInputIt>::value_type;
using timestamp_type = typename std::iterator_traits<TimestampInputIt>::value_type;
using tuple_type = thrust::tuple<id_type, timestamp_type>;

auto keys_first = thrust::make_zip_iterator(ids_first, timestamps_first);
auto keys_out_first = thrust::make_zip_iterator(ids_out_first, timestamps_out_first);

std::size_t temp_storage_bytes = 0;
cub::DeviceMergeSort::SortPairsCopy(nullptr,
temp_storage_bytes,
keys_first,
points_first,
keys_out_first,
points_out_first,
std::distance(ids_first, ids_last),
trajectory_comparator<tuple_type>{},
stream);

auto temp_storage = rmm::device_buffer(temp_storage_bytes, stream, mr);

cub::DeviceMergeSort::SortPairsCopy(temp_storage.data(),
temp_storage_bytes,
keys_first,
points_first,
keys_out_first,
points_out_first,
std::distance(ids_first, ids_last),
trajectory_comparator<tuple_type>{},
stream);

stream.synchronize();
}

} // namespace detail

template <typename IdInputIt,
typename PointInputIt,
typename TimestampInputIt,
typename IdOutputIt,
typename PointOutputIt,
typename TimestampOutputIt,
typename OffsetType>
std::unique_ptr<rmm::device_vector<OffsetType>> derive_trajectories(
IdInputIt ids_first,
IdInputIt ids_last,
PointInputIt points_first,
TimestampInputIt timestamps_first,
IdOutputIt ids_out_first,
PointOutputIt points_out_first,
TimestampOutputIt timestamps_out_first,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
detail::order_trajectories(ids_first,
ids_last,
points_first,
timestamps_first,
ids_out_first,
points_out_first,
timestamps_out_first,
stream,
mr);

auto const num_points = std::distance(ids_first, ids_last);
Review thread:

Contributor: I noticed here that you're computing the number of points based on the number of ids. I suggest that you error check here that the number of ids, points, and trajectories are all equal.

Member Author: That's not possible -- we only get a last iterator for ids. Assuming (not asserting) the iterator ranges are equal is standard practice in iterator-based APIs like this (and Thrust, and STL).

Member Author (@harrism, Sep 21, 2022): Also, the beauty of an iterator-based API is flexibility / fusability. The caller is free to pass anything that looks like an iterator -- it might be, for example, a counting iterator, or some other dynamic iterator type that generates an open-ended sequence. There is no end. :) (An illustrative sketch of this point appears after this file's diff.)

auto lengths = rmm::device_uvector<OffsetType>(num_points, stream);
auto grouped = thrust::reduce_by_key(rmm::exec_policy(stream),
ids_out_first,
ids_out_first + num_points,
thrust::make_constant_iterator(1),
thrust::make_discard_iterator(),
lengths.begin());

auto const num_trajectories = std::distance(lengths.begin(), grouped.second);
auto offsets = std::make_unique<rmm::device_vector<OffsetType>>(
num_trajectories, rmm::mr::thrust_allocator<OffsetType>(stream, mr));

thrust::exclusive_scan(rmm::exec_policy(stream),
lengths.begin(),
lengths.begin() + num_trajectories,
offsets->begin());

return offsets;
}

/**
* @} // end of doxygen group
*/

} // namespace cuspatial
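
Editor's note (illustrative, not part of the PR): the review exchange above about iterator ranges is easy to demonstrate. Because the API is iterator-based, only the id range [ids_first, ids_last) has an end; the other inputs may be generated on the fly. The sketch below passes a thrust::counting_iterator as the timestamp range, so no timestamp buffer is ever materialized. All names and values are hypothetical.

// counting_iterator_timestamps.cu -- hypothetical sketch of a generated (non-materialized) input range
#include <cuspatial/experimental/derive_trajectories.cuh>

#include <rmm/device_vector.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>

#include <cstdint>
#include <vector>

void counting_iterator_example()
{
  auto h_ids = std::vector<std::int32_t>{0, 0, 1, 1};
  auto h_xs  = std::vector<double>{0.0, 1.0, 2.0, 3.0};
  auto h_ys  = std::vector<double>{0.5, 1.5, 2.5, 3.5};

  auto ids = rmm::device_vector<std::int32_t>(h_ids.begin(), h_ids.end());
  auto xs  = rmm::device_vector<double>(h_xs.begin(), h_xs.end());
  auto ys  = rmm::device_vector<double>(h_ys.begin(), h_ys.end());

  auto ids_out = rmm::device_vector<std::int32_t>(ids.size());
  auto xs_out  = rmm::device_vector<double>(ids.size());
  auto ys_out  = rmm::device_vector<double>(ids.size());
  auto ts_out  = rmm::device_vector<std::int32_t>(ids.size());

  // Synthetic timestamps 0, 1, 2, ... generated on demand; the sequence has no end, and
  // exactly ids.size() elements of it are read.
  auto timestamps = thrust::make_counting_iterator<std::int32_t>(0);

  auto offsets =
    cuspatial::derive_trajectories(ids.begin(),
                                   ids.end(),
                                   thrust::make_zip_iterator(xs.begin(), ys.begin()),
                                   timestamps,
                                   ids_out.begin(),
                                   thrust::make_zip_iterator(xs_out.begin(), ys_out.begin()),
                                   ts_out.begin());
}
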
3 changes: 3 additions & 0 deletions cpp/include/cuspatial/trajectory.hpp
@@ -16,6 +16,9 @@

#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/table/table.hpp>
#include <cudf/types.hpp>

#include <rmm/mr/device/per_device_resource.hpp>
102 changes: 64 additions & 38 deletions cpp/src/trajectory/derive_trajectories.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,30 +14,75 @@
* limitations under the License.
*/

#include <cudf/utilities/type_dispatcher.hpp>
#include <cuspatial/error.hpp>
#include <cuspatial/trajectory.hpp>
#include <cuspatial/experimental/derive_trajectories.cuh>
#include <cuspatial/experimental/type_utils.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/distance.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>

#include <memory>
#include <vector>

namespace cuspatial {
namespace detail {

struct derive_trajectories_dispatch {
template <
typename T,
typename Timestamp,
std::enable_if_t<std::is_floating_point_v<T> and cudf::is_timestamp<Timestamp>()>* = nullptr>
std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> operator()(
cudf::column_view const& object_id,
cudf::column_view const& x,
cudf::column_view const& y,
cudf::column_view const& timestamp,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto cols = std::vector<std::unique_ptr<cudf::column>>{};
cols.reserve(4);
cols.push_back(cudf::allocate_like(object_id, cudf::mask_allocation_policy::NEVER, mr));
cols.push_back(cudf::allocate_like(x, cudf::mask_allocation_policy::NEVER, mr));
cols.push_back(cudf::allocate_like(y, cudf::mask_allocation_policy::NEVER, mr));
cols.push_back(cudf::allocate_like(timestamp, cudf::mask_allocation_policy::NEVER, mr));

auto points_begin = thrust::make_zip_iterator(x.begin<T>(), y.begin<T>());
auto points_out_begin = thrust::make_zip_iterator(cols[1]->mutable_view().begin<T>(),
cols[2]->mutable_view().begin<T>());

auto offsets = derive_trajectories(object_id.begin<std::int32_t>(),
object_id.end<std::int32_t>(),
points_begin,
timestamp.begin<Timestamp>(),
cols[0]->mutable_view().begin<std::int32_t>(),
points_out_begin,
cols[3]->mutable_view().begin<Timestamp>(),
stream,
mr);

auto result_table = std::make_unique<cudf::table>(std::move(cols));
auto offsets_column = std::make_unique<cudf::column>(cudf::column_view(
cudf::data_type(cudf::type_id::INT32), offsets->size(), offsets->data().get()));

return {std::move(result_table), std::move(offsets_column)};
}

template <typename T,
typename Timestamp,
std::enable_if_t<not(std::is_floating_point_v<T> and
cudf::is_timestamp<Timestamp>())>* = nullptr>
std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> operator()(...)
{
CUSPATIAL_FAIL("Unsupported data type");
}
};

std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> derive_trajectories(
cudf::column_view const& object_id,
cudf::column_view const& x,
@@ -46,34 +91,15 @@ std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> derive_tr
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto sorted = cudf::detail::sort_by_key(cudf::table_view{{object_id, x, y, timestamp}},
cudf::table_view{{object_id, timestamp}},
{},
{},
stream,
mr);

auto sorted_id = sorted->get_column(0).view();
rmm::device_vector<int32_t> lengths(object_id.size());
auto grouped = thrust::reduce_by_key(rmm::exec_policy(stream),
sorted_id.begin<int32_t>(),
sorted_id.end<int32_t>(),
thrust::make_constant_iterator(1),
thrust::make_discard_iterator(),
lengths.begin());

auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32},
thrust::distance(lengths.begin(), grouped.second),
cudf::mask_state::UNALLOCATED,
stream,
mr);

thrust::exclusive_scan(rmm::exec_policy(stream),
lengths.begin(),
lengths.end(),
offsets->mutable_view().begin<int32_t>());

return std::make_pair(std::move(sorted), std::move(offsets));
return cudf::double_type_dispatcher(x.type(),
timestamp.type(),
derive_trajectories_dispatch{},
object_id,
x,
y,
timestamp,
stream,
mr);
}
} // namespace detail
