Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add regexp_count function #12970

Merged
merged 24 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
f72c11f
Implement regexp_ccount
xinlifoobar Aug 20, 2024
682a50a
Merge branch 'main' of github.com:apache/datafusion into dev/xinli/re…
xinlifoobar Aug 20, 2024
ee23b97
Update document
xinlifoobar Aug 20, 2024
d5b63f4
fix check
xinlifoobar Aug 20, 2024
2acd148
add more tests
xinlifoobar Aug 20, 2024
a3563ee
Merge branch 'main' of github.com:apache/datafusion into dev/xinli/re…
xinlifoobar Aug 21, 2024
27a6fc6
Update the world to 1.80
xinlifoobar Aug 21, 2024
d17e45d
Fix doc format
xinlifoobar Aug 21, 2024
ee14adf
Add null tests
xinlifoobar Aug 22, 2024
08343dd
Add uft8 support and bench
xinlifoobar Aug 22, 2024
218ff7b
Refactoring regexp_count
xinlifoobar Aug 28, 2024
0333ec4
Merge branch 'main' of github.com:apache/datafusion into dev/xinli/re…
xinlifoobar Aug 29, 2024
07312be
Refactoring regexp_count
xinlifoobar Aug 29, 2024
4eb7e6b
Revert ci change
xinlifoobar Aug 29, 2024
cb13556
Fix ci
xinlifoobar Aug 29, 2024
574047a
Merge remote-tracking branch 'upstream/main' into fork/xinlifoobar/de…
Omega359 Oct 12, 2024
5a41fbf
Updates for documentation, minor improvements.
Omega359 Oct 13, 2024
2e4cd78
Updates for documentation, minor improvements.
Omega359 Oct 13, 2024
59432f3
Merge remote-tracking branch 'origin/feature/regexp_count' into featu…
Omega359 Oct 13, 2024
01509e8
Merge remote-tracking branch 'upstream/main' into feature/regexp_count
Omega359 Oct 16, 2024
97e61ae
updates to fix scalar tests, doc updates.
Omega359 Oct 16, 2024
74b545a
Merge remote-tracking branch 'origin/main' into feature/regexp_count
Omega359 Oct 17, 2024
696545f
Merge remote-tracking branch 'origin/main' into feature/regexp_count
Omega359 Oct 18, 2024
7371923
updated regex and string features to remove deps on other features.
Omega359 Oct 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ math_expressions = []
# enable regular expressions
regex_expressions = ["regex"]
# enable string functions
string_expressions = ["regex_expressions", "uuid"]
string_expressions = ["uuid"]
# enable unicode functions
unicode_expressions = ["hashbrown", "unicode-segmentation"]

Expand Down
54 changes: 53 additions & 1 deletion datafusion/functions/benches/regx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
extern crate criterion;

use arrow::array::builder::StringBuilder;
use arrow::array::{ArrayRef, AsArray, StringArray};
use arrow::array::{ArrayRef, AsArray, Int64Array, StringArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_functions::regex::regexpcount::regexp_count_func;
use datafusion_functions::regex::regexplike::regexp_like;
use datafusion_functions::regex::regexpmatch::regexp_match;
use datafusion_functions::regex::regexpreplace::regexp_replace;
Expand Down Expand Up @@ -59,6 +62,15 @@ fn regex(rng: &mut ThreadRng) -> StringArray {
StringArray::from(data)
}

/// Builds the `start` argument column for the `regexp_count` benchmarks.
///
/// Produces 1000 values, each drawn from `rng` in the half-open range `1..5`,
/// and returns them as an `Int64Array`.
fn start(rng: &mut ThreadRng) -> Int64Array {
    // Collecting from a range-driven iterator lets the Vec be sized once up
    // front instead of growing through repeated pushes.
    let data: Vec<i64> = (0..1000).map(|_| rng.gen_range(1..5)).collect();

    Int64Array::from(data)
}

fn flags(rng: &mut ThreadRng) -> StringArray {
let samples = [Some("i".to_string()), Some("im".to_string()), None];
let mut sb = StringBuilder::new();
Expand All @@ -75,6 +87,46 @@ fn flags(rng: &mut ThreadRng) -> StringArray {
}

fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("regexp_count_1000 string", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
let start = Arc::new(start(&mut rng)) as ArrayRef;
let flags = Arc::new(flags(&mut rng)) as ArrayRef;

b.iter(|| {
black_box(
regexp_count_func(&[
Arc::clone(&data),
Arc::clone(&regex),
Arc::clone(&start),
Arc::clone(&flags),
])
.expect("regexp_count should work on utf8"),
)
})
});

c.bench_function("regexp_count_1000 utf8view", |b| {
let mut rng = rand::thread_rng();
let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap();
let regex = cast(&regex(&mut rng), &DataType::Utf8View).unwrap();
let start = Arc::new(start(&mut rng)) as ArrayRef;
let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap();

b.iter(|| {
black_box(
regexp_count_func(&[
Arc::clone(&data),
Arc::clone(&regex),
Arc::clone(&start),
Arc::clone(&flags),
])
.expect("regexp_count should work on utf8view"),
)
})
});

c.bench_function("regexp_like_1000", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
Expand Down
27 changes: 26 additions & 1 deletion datafusion/functions/src/regex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@

use std::sync::Arc;

pub mod regexpcount;
pub mod regexplike;
pub mod regexpmatch;
pub mod regexpreplace;

// create UDFs
make_udf_function!(regexpcount::RegexpCountFunc, REGEXP_COUNT, regexp_count);
make_udf_function!(regexpmatch::RegexpMatchFunc, REGEXP_MATCH, regexp_match);
make_udf_function!(regexplike::RegexpLikeFunc, REGEXP_LIKE, regexp_like);
make_udf_function!(
Expand All @@ -35,6 +37,24 @@ make_udf_function!(
pub mod expr_fn {
use datafusion_expr::Expr;

/// Returns the number of consecutive occurrences of a regular expression in a string.
pub fn regexp_count(
values: Expr,
regex: Expr,
start: Option<Expr>,
flags: Option<Expr>,
) -> Expr {
let mut args = vec![values, regex];
if let Some(start) = start {
args.push(start);
};

if let Some(flags) = flags {
args.push(flags);
};
super::regexp_count().call(args)
}

/// Returns a list of regular expression matches in a string.
pub fn regexp_match(values: Expr, regex: Expr, flags: Option<Expr>) -> Expr {
let mut args = vec![values, regex];
Expand Down Expand Up @@ -70,5 +90,10 @@ pub mod expr_fn {

/// Returns all DataFusion functions defined in this package
///
/// The result holds one `Arc<ScalarUDF>` per regular expression function,
/// obtained by calling each function's accessor (`regexp_count()`,
/// `regexp_match()`, `regexp_like()`, `regexp_replace()`).
pub fn functions() -> Vec<Arc<datafusion_expr::ScalarUDF>> {
// NOTE(review): this view interleaves the pre-change and post-change lines of
// the diff. The single-line `vec!` directly below appears to be the removed
// version; the multi-line `vec!` after it, which adds `regexp_count()`, is
// the replacement. Only one of the two exists in the real file.
vec![regexp_match(), regexp_like(), regexp_replace()]
vec![
regexp_count(),
regexp_match(),
regexp_like(),
regexp_replace(),
]
}
Loading