Skip to content

Commit

Permalink
Move Trino RE2J fork to a new package
Browse files Browse the repository at this point in the history
There are situations where downstream dependencies require com.google.re2j:1.7, and we end up pulling in both that library and the Trino RE2J fork because of a dependency on Trino components.

These conflicts cannot be resolved while both dependencies use the same Java package (com.google.re2j).
  • Loading branch information
wendigo committed Feb 7, 2024
1 parent 356eee3 commit 216e9b1
Show file tree
Hide file tree
Showing 51 changed files with 166 additions and 218 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/parse.go

package com.google.re2j;
package io.trino.re2j;

/**
* A "builder"-style helper class for manipulating character classes
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.go

package com.google.re2j;
package io.trino.re2j;

import java.util.HashMap;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,19 @@
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/compile.go

package com.google.re2j;
package io.trino.re2j;

import java.util.LinkedList;
import java.util.List;

import static com.google.re2j.Inst.Op.BYTE;
import static com.google.re2j.Inst.Op.BYTE1;
import static com.google.re2j.RE2.FOLD_CASE;
import static com.google.re2j.Unicode.RUNE_SELF;
import static com.google.re2j.Unicode.UTF_MAX;
import static com.google.re2j.Unicode.codePointToUtf8;
import static com.google.re2j.Unicode.maxRune;
import static com.google.re2j.Unicode.simpleFold;
import static io.trino.re2j.Inst.Op.BYTE;
import static io.trino.re2j.Inst.Op.BYTE1;
import static io.trino.re2j.RE2.FOLD_CASE;
import static io.trino.re2j.Unicode.RUNE_SELF;
import static io.trino.re2j.Unicode.UTF_MAX;
import static io.trino.re2j.Unicode.codePointToUtf8;
import static io.trino.re2j.Unicode.maxRune;
import static io.trino.re2j.Unicode.simpleFold;

/**
* Compiler from {@code Regexp} (RE2 abstract syntax) to {@code RE2} (compiled regular expression).
Expand Down
42 changes: 18 additions & 24 deletions java/com/google/re2j/DFA.java → java/io/trino/re2j/DFA.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,20 @@
// Original RE2 source here:
// https://github.com/google/re2/blob/master/re2/dfa.cc

package com.google.re2j;

import com.google.re2j.RE2.MatchKind;
package io.trino.re2j;

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

import static com.google.re2j.DFAState.DEAD_STATE;
import static com.google.re2j.Inst.Op.EMPTY_WIDTH;
import static com.google.re2j.MachineInput.EOF;
import static com.google.re2j.RE2.MatchKind.FIRST_MATCH;
import static com.google.re2j.RE2.MatchKind.LONGEST_MATCH;
import static com.google.re2j.Utils.EMPTY_BEGIN_LINE;
import static com.google.re2j.Utils.EMPTY_BEGIN_TEXT;
import static com.google.re2j.Utils.EMPTY_END_LINE;
import static com.google.re2j.Utils.EMPTY_END_TEXT;
import static com.google.re2j.Utils.EMPTY_NO_WORD_BOUNDARY;
import static com.google.re2j.Utils.EMPTY_WORD_BOUNDARY;
import static com.google.re2j.Utils.isRuneStart;
import static com.google.re2j.Utils.isWordByte;
import static io.trino.re2j.MachineInput.EOF;
import static io.trino.re2j.Utils.EMPTY_BEGIN_LINE;
import static io.trino.re2j.Utils.EMPTY_BEGIN_TEXT;
import static io.trino.re2j.Utils.EMPTY_END_LINE;
import static io.trino.re2j.Utils.EMPTY_END_TEXT;
import static io.trino.re2j.Utils.EMPTY_NO_WORD_BOUNDARY;
import static io.trino.re2j.Utils.EMPTY_WORD_BOUNDARY;
import static io.trino.re2j.Utils.isRuneStart;
import static io.trino.re2j.Utils.isWordByte;
import static java.util.Arrays.sort;

class DFA {
Expand All @@ -51,7 +45,7 @@ private DFATooManyStatesException() {
private static final int START_PARAMS_CACHE_SIZE = 1 << 13;
private static final int START_PARAMS_CACHE_SHIFT = 12;

private static final StartParams DEAD_START_PARAMS = new StartParams(DEAD_STATE, new boolean[256]);
private static final StartParams DEAD_START_PARAMS = new StartParams(DFAState.DEAD_STATE, new boolean[256]);

// Info for the search
private final Prog prog;
Expand All @@ -60,7 +54,7 @@ private DFATooManyStatesException() {
private final Inst[] instructions;

// Search for longest match like egrep or POSIX or for first match like Perl, PCRE
private final MatchKind matchKind;
private final RE2.MatchKind matchKind;

// Should input bytes be read forward or backward
private final boolean runForward;
Expand All @@ -74,7 +68,7 @@ private DFATooManyStatesException() {
private final ConcurrentHashMap<DFAStateKey, DFAState> statesCache;
private final AtomicInteger availableStates;

public DFA(Prog prog, MatchKind matchKind, boolean reversed,
public DFA(Prog prog, RE2.MatchKind matchKind, boolean reversed,
ConcurrentHashMap<DFAStateKey, DFAState> statesCache, AtomicInteger availableStates) {
this.prog = prog;
this.instructions = prog.getInst();
Expand All @@ -85,7 +79,7 @@ public DFA(Prog prog, MatchKind matchKind, boolean reversed,

int progSize = prog.numInst();
int nMarks = 0;
if (matchKind == LONGEST_MATCH) {
if (matchKind == RE2.MatchKind.LONGEST_MATCH) {
nMarks = progSize;
}

Expand Down Expand Up @@ -130,7 +124,7 @@ private DFAState workQueueToCachedState(WorkQueue q, int flag) {
case MATCH:
case ALT:
instIndexes[nIndexes++] = instIndex;
if (inst.op() == EMPTY_WIDTH) {
if (inst.op() == Inst.Op.EMPTY_WIDTH) {
neededFlags |= inst.arg;
}
break;
Expand All @@ -153,13 +147,13 @@ private DFAState workQueueToCachedState(WorkQueue q, int flag) {

// No match possibilities
if (nIndexes == 0 && flag == 0) {
return DEAD_STATE;
return DFAState.DEAD_STATE;
}

// If we're in the longest match mode, the state is a sequence of
// unordered state sets separated by Marks. Sort each set to
// canonicalize, to reduce the number of distinct sets stored.
if (matchKind == LONGEST_MATCH) {
if (matchKind == RE2.MatchKind.LONGEST_MATCH) {
int ip = 0;
while (ip < nIndexes) {
int markp = ip;
Expand Down Expand Up @@ -322,7 +316,7 @@ private boolean runWorkQueueOnByte(byte b, int flag) {
break;
case MATCH:
isMatch = true;
if (matchKind == FIRST_MATCH) {
if (matchKind == RE2.MatchKind.FIRST_MATCH) {
return true;
}
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,12 @@
// Original RE2 source here:
// https://github.com/google/re2/blob/master/re2/dfa.cc

package com.google.re2j;

import com.google.re2j.RE2.Anchor;
import com.google.re2j.RE2.MatchKind;
package io.trino.re2j;

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

import static com.google.re2j.DFA.NO_MATCH;
import static com.google.re2j.RE2.Anchor.ANCHOR_START;
import static com.google.re2j.RE2.MatchKind.FIRST_MATCH;
import static com.google.re2j.RE2.MatchKind.LONGEST_MATCH;
import static io.trino.re2j.DFA.NO_MATCH;

/**
* A {@link Machine} implementation using a DFA.
Expand All @@ -40,14 +34,14 @@ class DFAMachine implements Machine {
stateCache[i] = new ConcurrentHashMap<>();
}

setDfaThreadLocal(LONGEST_MATCH, true);
setDfaThreadLocal(LONGEST_MATCH, false);
setDfaThreadLocal(FIRST_MATCH, true);
setDfaThreadLocal(FIRST_MATCH, false);
setDfaThreadLocal(RE2.MatchKind.LONGEST_MATCH, true);
setDfaThreadLocal(RE2.MatchKind.LONGEST_MATCH, false);
setDfaThreadLocal(RE2.MatchKind.FIRST_MATCH, true);
setDfaThreadLocal(RE2.MatchKind.FIRST_MATCH, false);
}

@Override
public boolean match(MachineInput in, int pos, Anchor anchor, int[] submatches) {
public boolean match(MachineInput in, int pos, RE2.Anchor anchor, int[] submatches) {
// Don't ask for the location if we won't use it. SearchDFA can do extra optimizations in that case.
boolean wantMatchPosition = true;
if (submatches.length == 0) {
Expand All @@ -71,7 +65,7 @@ public boolean match(MachineInput in, int pos, Anchor anchor, int[] submatches)

// SearchDFA gives match end position but we don't know where the match started. Run the
// regexp backwards from end position to find the longest possible match -- that's where it started.
matchStart = searchDFA(in, pos, matchEnd, ANCHOR_START, true, LONGEST_MATCH, true);
matchStart = searchDFA(in, pos, matchEnd, RE2.Anchor.ANCHOR_START, true, RE2.MatchKind.LONGEST_MATCH, true);
if (matchStart == NO_MATCH) {
throw new IllegalStateException("reverse DFA did not found a match");
}
Expand Down Expand Up @@ -101,7 +95,7 @@ public boolean match(MachineInput in, int pos, Anchor anchor, int[] submatches)
return true;
}

private int searchDFA(MachineInput in, int startPos, int endPos, Anchor anchor, boolean wantMatchPosition, MatchKind matchKind, boolean reversed) {
private int searchDFA(MachineInput in, int startPos, int endPos, RE2.Anchor anchor, boolean wantMatchPosition, RE2.MatchKind matchKind, boolean reversed) {
boolean hasCarat = reversed ? anchor.isAnchorEnd() : anchor.isAnchorStart();
if (hasCarat && startPos != 0) {
return NO_MATCH;
Expand All @@ -112,7 +106,7 @@ private int searchDFA(MachineInput in, int startPos, int endPos, Anchor anchor,
boolean endMatch = false;
if (anchor.isAnchorEnd()) {
endMatch = true;
matchKind = LONGEST_MATCH;
matchKind = RE2.MatchKind.LONGEST_MATCH;
}

// If the caller doesn't care where the match is (just whether one exists),
Expand All @@ -121,7 +115,7 @@ private int searchDFA(MachineInput in, int startPos, int endPos, Anchor anchor,
boolean wantEarliestMatch = false;
if (!wantMatchPosition && !endMatch) {
wantEarliestMatch = true;
matchKind = LONGEST_MATCH;
matchKind = RE2.MatchKind.LONGEST_MATCH;
}

DFA dfa = getDfa(matchKind, reversed);
Expand All @@ -140,17 +134,17 @@ private int searchDFA(MachineInput in, int startPos, int endPos, Anchor anchor,
return match;
}

private DFA getDfa(MatchKind matchKind, boolean reversed) {
private DFA getDfa(RE2.MatchKind matchKind, boolean reversed) {
return dfaCache[dfaKey(matchKind, reversed)].get();
}

private int dfaKey(MatchKind matchKind, boolean reversed) {
int longestInt = matchKind == LONGEST_MATCH ? 1 : 0;
private int dfaKey(RE2.MatchKind matchKind, boolean reversed) {
int longestInt = matchKind == RE2.MatchKind.LONGEST_MATCH ? 1 : 0;
int reversedInt = reversed ? 1 : 0;
return longestInt | (reversedInt << 1);
}

private void setDfaThreadLocal(MatchKind matchKind, boolean reversed) {
private void setDfaThreadLocal(RE2.MatchKind matchKind, boolean reversed) {
int dfaKey = dfaKey(matchKind, reversed);
Prog prog = reversed ? re2.reverseProg : re2.prog;
dfaCache[dfaKey] = new ThreadLocal<DFA>() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
// Original RE2 source here:
// https://github.com/google/re2/blob/master/re2/dfa.cc

package com.google.re2j;
package io.trino.re2j;

import static com.google.re2j.DFA.FLAG_MATCH;
import static com.google.re2j.DFAState.StateType.DEAD;
import static com.google.re2j.DFAState.StateType.REGULAR;
import static io.trino.re2j.DFA.FLAG_MATCH;
import static io.trino.re2j.DFAState.StateType.DEAD;
import static io.trino.re2j.DFAState.StateType.REGULAR;
import static java.lang.System.arraycopy;

final class DFAState {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
// Original RE2 source here:
// https://github.com/google/re2/blob/master/re2/dfa.cc

package com.google.re2j;
package io.trino.re2j;

import java.util.Arrays;

import static com.google.re2j.Utils.arrayFirstElementsEqual;
import static io.trino.re2j.Utils.arrayFirstElementsEqual;

final class DFAStateKey {
private final int[] instIndexes;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/prog.go

package com.google.re2j;
package io.trino.re2j;

import static com.google.re2j.Inst.Op.BYTE;
import static io.trino.re2j.Inst.Op.BYTE;

/**
* A single instruction in the regular expression virtual machine.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/exec.go

package com.google.re2j;
package io.trino.re2j;

import com.google.re2j.RE2.Anchor;
import io.trino.re2j.RE2.Anchor;

/**
* A Machine matches an input string of Unicode characters against an RE2 instance.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/regexp.go

package com.google.re2j;
package io.trino.re2j;

import io.airlift.slice.Slice;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,11 @@
// Copyright 2010 Google Inc. All Rights Reserved.

package com.google.re2j;

import com.google.re2j.RE2.Anchor;
package io.trino.re2j;

import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceOutput;

import static com.google.re2j.RE2.Anchor.ANCHOR_BOTH;
import static com.google.re2j.RE2.Anchor.ANCHOR_START;
import static com.google.re2j.RE2.Anchor.UNANCHORED;

/**
* A stateful iterator that interprets a regex {@code Pattern} on a
* specific input. Its interface mimics the JDK 1.4.2
Expand Down Expand Up @@ -64,7 +58,7 @@ public final class Matcher {
private boolean hasGroups;

// The anchor flag to use when repeating the match to find subgroups.
private Anchor anchorFlag;
private RE2.Anchor anchorFlag;

private Matcher(Pattern pattern) {
if (pattern == null) {
Expand Down Expand Up @@ -236,7 +230,7 @@ private void loadGroup(int group) {
* @return true if the entire input matches the pattern
*/
public boolean matches() {
return genMatch(0, ANCHOR_BOTH);
return genMatch(0, RE2.Anchor.ANCHOR_BOTH);
}

/**
Expand All @@ -246,7 +240,7 @@ public boolean matches() {
* @return true if the beginning of the input matches the pattern
*/
public boolean lookingAt() {
return genMatch(0, ANCHOR_START);
return genMatch(0, RE2.Anchor.ANCHOR_START);
}

/**
Expand All @@ -265,7 +259,7 @@ public boolean find() {
start++;
}
}
return genMatch(start, UNANCHORED);
return genMatch(start, RE2.Anchor.UNANCHORED);
}

/**
Expand All @@ -283,11 +277,11 @@ public boolean find(int start) {
"start index out of bounds: " + start);
}
reset();
return genMatch(start, UNANCHORED);
return genMatch(start, RE2.Anchor.UNANCHORED);
}

/** Helper: does match starting at start, with RE2 anchor flag. */
private boolean genMatch(int startByte, Anchor anchor) {
private boolean genMatch(int startByte, RE2.Anchor anchor) {
// TODO(rsc): Is matches/lookingAt supposed to reset the append or input positions?
// From the JDK docs, looks like no.
boolean ok = pattern.re2().match(input, startByte,
Expand Down
Loading

0 comments on commit 216e9b1

Please sign in to comment.