From 2da2646dd65e0458311a2dccfb04850c0b7d9e3c Mon Sep 17 00:00:00 2001 From: Andres Noetzli Date: Wed, 10 Jun 2020 12:50:52 -0700 Subject: [PATCH] Add support for str.replace_re/str.replace_re_all (#4594) This commit adds support for the last remaining string operators from the new SMT-LIB standard for the theory of strings. The commit adds the kinds, type checking, reductions, and evaluation rewrites for `str.replace_re` and `str.replace_re_all`. --- NEWS | 8 + src/api/cvc4cpp.cpp | 4 + src/api/cvc4cppkind.h | 27 +++ src/parser/smt2/smt2.cpp | 2 + src/printer/smt2/smt2_printer.cpp | 4 + src/theory/strings/extf_solver.cpp | 7 +- src/theory/strings/kinds | 4 + src/theory/strings/rewrites.cpp | 2 + src/theory/strings/rewrites.h | 2 + src/theory/strings/sequences_rewriter.cpp | 116 +++++++++ src/theory/strings/sequences_rewriter.h | 27 +++ src/theory/strings/skolem_cache.h | 18 ++ src/theory/strings/term_registry.cpp | 7 +- src/theory/strings/theory_strings.cpp | 3 + .../strings/theory_strings_preprocess.cpp | 169 +++++++++++++ test/regress/CMakeLists.txt | 4 +- test/regress/regress2/strings/replace_re.smt2 | 42 ++++ .../regress2/strings/replace_re_all.smt2 | 31 +++ test/unit/theory/sequences_rewriter_white.h | 222 ++++++++++++++++++ 19 files changed, 693 insertions(+), 6 deletions(-) create mode 100644 test/regress/regress2/strings/replace_re.smt2 create mode 100644 test/regress/regress2/strings/replace_re_all.smt2 diff --git a/NEWS b/NEWS index ac9f0747e..8c6ec7bc9 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,14 @@ This file contains a summary of important user-visible changes. Changes since 1.7 ================= +New Features: +* Strings: Full support of the new SMT-LIB standard for the theory of strings, + including: + * Support for `str.replace_re`, `str.replace_re_all`, `str.is_digit`, + `str.from_code`, `re.diff`, and `re.comp` + * Support for new operator names (e.g. `str.in_re` instead of `str.in.re`), + new escape sequences. The new syntax is enabled by default for smt2 files. + Improvements: * API: Function definitions can now be requested to be global. If the `global` parameter is set to true, they persist after popping the user context. diff --git a/src/api/cvc4cpp.cpp b/src/api/cvc4cpp.cpp index 325da8cdb..f0d28401e 100644 --- a/src/api/cvc4cpp.cpp +++ b/src/api/cvc4cpp.cpp @@ -257,6 +257,8 @@ const static std::unordered_map s_kinds{ {STRING_INDEXOF, CVC4::Kind::STRING_STRIDOF}, {STRING_REPLACE, CVC4::Kind::STRING_STRREPL}, {STRING_REPLACE_ALL, CVC4::Kind::STRING_STRREPLALL}, + {STRING_REPLACE_RE, CVC4::Kind::STRING_REPLACE_RE}, + {STRING_REPLACE_RE_ALL, CVC4::Kind::STRING_REPLACE_RE_ALL}, {STRING_TOLOWER, CVC4::Kind::STRING_TOLOWER}, {STRING_TOUPPER, CVC4::Kind::STRING_TOUPPER}, {STRING_REV, CVC4::Kind::STRING_REV}, @@ -526,6 +528,8 @@ const static std::unordered_map {CVC4::Kind::STRING_STRIDOF, STRING_INDEXOF}, {CVC4::Kind::STRING_STRREPL, STRING_REPLACE}, {CVC4::Kind::STRING_STRREPLALL, STRING_REPLACE_ALL}, + {CVC4::Kind::STRING_REPLACE_RE, STRING_REPLACE_RE}, + {CVC4::Kind::STRING_REPLACE_RE_ALL, STRING_REPLACE_RE_ALL}, {CVC4::Kind::STRING_TOLOWER, STRING_TOLOWER}, {CVC4::Kind::STRING_TOUPPER, STRING_TOUPPER}, {CVC4::Kind::STRING_REV, STRING_REV}, diff --git a/src/api/cvc4cppkind.h b/src/api/cvc4cppkind.h index 02f13310a..82f1eb379 100644 --- a/src/api/cvc4cppkind.h +++ b/src/api/cvc4cppkind.h @@ -2002,6 +2002,33 @@ enum CVC4_PUBLIC Kind : int32_t * mkTerm(Kind kind, const std::vector& children) */ STRING_REPLACE_ALL, + /** + * String replace regular expression match. + * Replaces the first match of a regular expression r in string s1 with + * string s2. If r does not match a substring of s1, s1 is returned + * unmodified. + * Parameters: 3 + * -[1]: Term of sort String (string s1) + * -[2]: Term of sort Regexp (regexp r) + * -[3]: Term of sort String (string s2) + * Create with: + * mkTerm(Kind kind, Term child1, Term child2, Term child3) + * mkTerm(Kind kind, const std::vector& children) + */ + STRING_REPLACE_RE, + /** + * String replace all regular expression matches. + * Replaces all matches of a regular expression r in string s1 with string + * s2. If r does not match a substring of s1, s1 is returned unmodified. + * Parameters: 3 + * -[1]: Term of sort String (string s1) + * -[2]: Term of sort Regexp (regexp r) + * -[3]: Term of sort String (string s2) + * Create with: + * mkTerm(Kind kind, Term child1, Term child2, Term child3) + * mkTerm(Kind kind, const std::vector& children) + */ + STRING_REPLACE_RE_ALL, /** * String to lower case. * Parameters: 1 diff --git a/src/parser/smt2/smt2.cpp b/src/parser/smt2/smt2.cpp index 3f25e3776..5f328d462 100644 --- a/src/parser/smt2/smt2.cpp +++ b/src/parser/smt2/smt2.cpp @@ -156,6 +156,8 @@ void Smt2::addStringOperators() { addOperator(api::STRING_CHARAT, "str.at"); addOperator(api::STRING_INDEXOF, "str.indexof"); addOperator(api::STRING_REPLACE, "str.replace"); + addOperator(api::STRING_REPLACE_RE, "str.replace_re"); + addOperator(api::STRING_REPLACE_RE_ALL, "str.replace_re_all"); if (!strictModeEnabled()) { addOperator(api::STRING_TOLOWER, "str.tolower"); diff --git a/src/printer/smt2/smt2_printer.cpp b/src/printer/smt2/smt2_printer.cpp index 6670fa065..18d60d779 100644 --- a/src/printer/smt2/smt2_printer.cpp +++ b/src/printer/smt2/smt2_printer.cpp @@ -632,6 +632,8 @@ void Smt2Printer::toStream(std::ostream& out, case kind::STRING_STRIDOF: case kind::STRING_STRREPL: case kind::STRING_STRREPLALL: + case kind::STRING_REPLACE_RE: + case kind::STRING_REPLACE_RE_ALL: case kind::STRING_TOLOWER: case kind::STRING_TOUPPER: case kind::STRING_REV: @@ -1191,6 +1193,8 @@ static string smtKindString(Kind k, Variant v) case kind::STRING_STRIDOF: return "str.indexof" ; case kind::STRING_STRREPL: return "str.replace" ; case kind::STRING_STRREPLALL: return "str.replace_all"; + case kind::STRING_REPLACE_RE: return "str.replace_re"; + case kind::STRING_REPLACE_RE_ALL: return "str.replace_re_all"; case kind::STRING_TOLOWER: return "str.tolower"; case kind::STRING_TOUPPER: return "str.toupper"; case kind::STRING_REV: return "str.rev"; diff --git a/src/theory/strings/extf_solver.cpp b/src/theory/strings/extf_solver.cpp index dd4144313..9b7af4f13 100644 --- a/src/theory/strings/extf_solver.cpp +++ b/src/theory/strings/extf_solver.cpp @@ -56,6 +56,8 @@ ExtfSolver::ExtfSolver(context::Context* c, d_extt.addFunctionKind(kind::STRING_STOI); d_extt.addFunctionKind(kind::STRING_STRREPL); d_extt.addFunctionKind(kind::STRING_STRREPLALL); + d_extt.addFunctionKind(kind::STRING_REPLACE_RE); + d_extt.addFunctionKind(kind::STRING_REPLACE_RE_ALL); d_extt.addFunctionKind(kind::STRING_STRCTN); d_extt.addFunctionKind(kind::STRING_IN_REGEXP); d_extt.addFunctionKind(kind::STRING_LEQ); @@ -180,8 +182,9 @@ bool ExtfSolver::doReduction(int effort, Node n) NodeManager* nm = NodeManager::currentNM(); Assert(k == STRING_SUBSTR || k == STRING_STRCTN || k == STRING_STRIDOF || k == STRING_ITOS || k == STRING_STOI || k == STRING_STRREPL - || k == STRING_STRREPLALL || k == STRING_LEQ || k == STRING_TOLOWER - || k == STRING_TOUPPER || k == STRING_REV); + || k == STRING_STRREPLALL || k == STRING_REPLACE_RE + || k == STRING_REPLACE_RE_ALL || k == STRING_LEQ + || k == STRING_TOLOWER || k == STRING_TOUPPER || k == STRING_REV); std::vector new_nodes; Node res = d_preproc.simplify(n, new_nodes); Assert(res != n); diff --git a/src/theory/strings/kinds b/src/theory/strings/kinds index 800847ffe..96ba82a44 100644 --- a/src/theory/strings/kinds +++ b/src/theory/strings/kinds @@ -23,6 +23,10 @@ operator STRING_LEQ 2 "string less than or equal" operator STRING_STRIDOF 3 "string indexof" operator STRING_STRREPL 3 "string replace" operator STRING_STRREPLALL 3 "string replace all" +operator STRING_REPLACE_RE 3 "string replace regular expression match" +operator STRING_REPLACE_RE_ALL 3 "string replace all regular expression matches" +typerule STRING_REPLACE_RE "SimpleTypeRule" +typerule STRING_REPLACE_RE_ALL "SimpleTypeRule" operator STRING_PREFIX 2 "string prefixof" operator STRING_SUFFIX 2 "string suffixof" operator STRING_IS_DIGIT 1 "string isdigit, returns true if argument is a string of length one that represents a digit" diff --git a/src/theory/strings/rewrites.cpp b/src/theory/strings/rewrites.cpp index a4055c4f9..5f3c83869 100644 --- a/src/theory/strings/rewrites.cpp +++ b/src/theory/strings/rewrites.cpp @@ -111,6 +111,8 @@ const char* toString(Rewrite r) case Rewrite::RPL_RPL_EMPTY: return "RPL_RPL_EMPTY"; case Rewrite::RPL_RPL_LEN_ID: return "RPL_RPL_LEN_ID"; case Rewrite::RPL_X_Y_X_SIMP: return "RPL_X_Y_X_SIMP"; + case Rewrite::REPLACE_RE_EVAL: return "REPLACE_RE_EVAL"; + case Rewrite::REPLACE_RE_ALL_EVAL: return "REPLACE_RE_ALL_EVAL"; case Rewrite::SPLIT_EQ: return "SPLIT_EQ"; case Rewrite::SPLIT_EQ_STRIP_L: return "SPLIT_EQ_STRIP_L"; case Rewrite::SPLIT_EQ_STRIP_R: return "SPLIT_EQ_STRIP_R"; diff --git a/src/theory/strings/rewrites.h b/src/theory/strings/rewrites.h index 96a3b65fd..62350c403 100644 --- a/src/theory/strings/rewrites.h +++ b/src/theory/strings/rewrites.h @@ -116,6 +116,8 @@ enum class Rewrite : uint32_t RPL_RPL_EMPTY, RPL_RPL_LEN_ID, RPL_X_Y_X_SIMP, + REPLACE_RE_EVAL, + REPLACE_RE_ALL_EVAL, SPLIT_EQ, SPLIT_EQ_STRIP_L, SPLIT_EQ_STRIP_R, diff --git a/src/theory/strings/sequences_rewriter.cpp b/src/theory/strings/sequences_rewriter.cpp index 4f74d7c15..3c40062f5 100644 --- a/src/theory/strings/sequences_rewriter.cpp +++ b/src/theory/strings/sequences_rewriter.cpp @@ -1413,6 +1413,14 @@ RewriteResponse SequencesRewriter::postRewrite(TNode node) { retNode = rewriteReplaceAll(node); } + else if (nk == kind::STRING_REPLACE_RE) + { + retNode = rewriteReplaceRe(node); + } + else if (nk == kind::STRING_REPLACE_RE_ALL) + { + retNode = rewriteReplaceReAll(node); + } else if (nk == STRING_REV) { retNode = rewriteStrReverse(node); @@ -2914,6 +2922,114 @@ Node SequencesRewriter::rewriteReplaceInternal(Node node) return Node::null(); } +Node SequencesRewriter::rewriteReplaceRe(Node node) +{ + Assert(node.getKind() == STRING_REPLACE_RE); + NodeManager* nm = NodeManager::currentNM(); + Node x = node[0]; + Node y = node[1]; + Node z = node[2]; + + if (x.isConst()) + { + // str.replace_re("ZABCZ", re.++("A", _*, "C"), y) ---> "Z" ++ y ++ "Z" + std::pair match = firstMatch(x, y); + if (match.first != string::npos) + { + String s = x.getConst(); + Node ret = nm->mkNode(STRING_CONCAT, + nm->mkConst(s.substr(0, match.first)), + z, + nm->mkConst(s.substr(match.second))); + return returnRewrite(node, ret, Rewrite::REPLACE_RE_EVAL); + } + else + { + return returnRewrite(node, x, Rewrite::REPLACE_RE_EVAL); + } + } + return node; +} + +Node SequencesRewriter::rewriteReplaceReAll(Node node) +{ + Assert(node.getKind() == STRING_REPLACE_RE_ALL); + NodeManager* nm = NodeManager::currentNM(); + Node x = node[0]; + Node y = node[1]; + Node z = node[2]; + + if (x.isConst()) + { + // str.replace_re_all("ZABCZAB", re.++("A", _*, "C"), y) ---> + // "Z" ++ y ++ "Z" ++ y + TypeNode t = x.getType(); + Node emp = Word::mkEmptyWord(t); + Node yp = Rewriter::rewrite( + nm->mkNode(REGEXP_DIFF, y, nm->mkNode(STRING_TO_REGEXP, emp))); + std::vector res; + String rem = x.getConst(); + std::pair match(0, 0); + while (rem.size() >= 0) + { + match = firstMatch(nm->mkConst(rem), yp); + if (match.first == string::npos) + { + break; + } + res.push_back(nm->mkConst(rem.substr(0, match.first))); + res.push_back(z); + rem = rem.substr(match.second); + } + res.push_back(nm->mkConst(rem)); + Node ret = utils::mkConcat(res, t); + return returnRewrite(node, ret, Rewrite::REPLACE_RE_ALL_EVAL); + } + + return node; +} + +std::pair SequencesRewriter::firstMatch(Node n, Node r) +{ + Assert(n.isConst() && n.getType().isStringLike()); + Assert(r.getType().isRegExp()); + NodeManager* nm = NodeManager::currentNM(); + + std::vector emptyVec; + Node sigmaStar = nm->mkNode(REGEXP_STAR, nm->mkNode(REGEXP_SIGMA, emptyVec)); + Node re = nm->mkNode(REGEXP_CONCAT, r, sigmaStar); + String s = n.getConst(); + + if (s.size() == 0) + { + if (RegExpEntail::testConstStringInRegExp(s, 0, r)) + { + return std::make_pair(0, 0); + } + else + { + return std::make_pair(string::npos, string::npos); + } + } + + for (size_t i = 0, size = s.size(); i < size; i++) + { + if (RegExpEntail::testConstStringInRegExp(s, i, re)) + { + for (size_t j = i; j <= size; j++) + { + String substr = s.substr(i, j - i); + if (RegExpEntail::testConstStringInRegExp(substr, 0, r)) + { + return std::make_pair(i, j); + } + } + } + } + + return std::make_pair(string::npos, string::npos); +} + Node SequencesRewriter::rewriteStrReverse(Node node) { Assert(node.getKind() == STRING_REV); diff --git a/src/theory/strings/sequences_rewriter.h b/src/theory/strings/sequences_rewriter.h index 490dd8b3c..00ae21634 100644 --- a/src/theory/strings/sequences_rewriter.h +++ b/src/theory/strings/sequences_rewriter.h @@ -204,6 +204,33 @@ class SequencesRewriter : public TheoryRewriter * str.replaceall. If it returns a non-null ret, then node rewrites to ret. */ Node rewriteReplaceInternal(Node node); + /** rewrite regular expression replace + * + * This method implements rewrite rules that apply to terms of the form + * str.replace_re(s, r, t). + * + * @param node The node to rewrite + * @return The rewritten node + */ + Node rewriteReplaceRe(Node node); + /** rewrite regular expression replace + * + * This method implements rewrite rules that apply to terms of the form + * str.replace_re_all(s, r, t). + * + * @param node The node to rewrite + * @return The rewritten node + */ + Node rewriteReplaceReAll(Node node); + /** + * Returns the first, shortest sequence in n that matches r. + * + * @param n The constant string or sequence to search in. + * @param r The regular expression to search for. + * @return A pair holding the start position and the end position of the + * match or a pair of string::npos if r does not appear in n. + */ + std::pair firstMatch(Node n, Node r); /** rewrite string reverse * * This is the entry point for post-rewriting terms n of the form diff --git a/src/theory/strings/skolem_cache.h b/src/theory/strings/skolem_cache.h index 7ce507f29..bb4d0de55 100644 --- a/src/theory/strings/skolem_cache.h +++ b/src/theory/strings/skolem_cache.h @@ -102,6 +102,16 @@ class SkolemCache // of b in a as the witness for contains( a, b ). SK_FIRST_CTN_PRE, SK_FIRST_CTN_POST, + // For sequence a and regular expression b, + // in_re(a, re.++(_*, b, _*)) => + // exists k_pre, k_match, k_post. + // a = k_pre ++ k_match ++ k_post ^ + // ~in_re(k_pre ++ substr(k_match, 0, str.len(k_match) - 1), + // re.++(_*, b, _*)) ^ + // in_re(k2, y) + SK_FIRST_MATCH_PRE, + SK_FIRST_MATCH, + SK_FIRST_MATCH_POST, // For integer b, // len( a ) > b => // exists k. a = k ++ a' ^ len( k ) = b @@ -120,6 +130,14 @@ class SkolemCache // k(x) is the end index of the x^th occurrence of b in a // where n is the number of occurrences of b in a, and k(0)=0. SK_OCCUR_INDEX, + // For function k: Int -> Int + // exists k. + // forall 0 <= x < n, + // k(x) is the length of the x^th occurrence of b in a (excluding + // matches of empty strings) + // where b is a regular expression, n is the number of occurrences of b + // in a, and k(0)=0. + SK_OCCUR_LEN, }; /** * Returns a skolem of type string that is cached for (a,b,id) and has diff --git a/src/theory/strings/term_registry.cpp b/src/theory/strings/term_registry.cpp index 6330d7c10..395604f76 100644 --- a/src/theory/strings/term_registry.cpp +++ b/src/theory/strings/term_registry.cpp @@ -76,9 +76,10 @@ void TermRegistry::preRegisterTerm(TNode n) if (!options::stringExp()) { if (k == STRING_STRIDOF || k == STRING_ITOS || k == STRING_STOI - || k == STRING_STRREPL || k == STRING_STRREPLALL || k == STRING_STRCTN - || k == STRING_LEQ || k == STRING_TOLOWER || k == STRING_TOUPPER - || k == STRING_REV) + || k == STRING_STRREPL || k == STRING_STRREPLALL + || k == STRING_REPLACE_RE || k == STRING_REPLACE_RE_ALL + || k == STRING_STRCTN || k == STRING_LEQ || k == STRING_TOLOWER + || k == STRING_TOUPPER || k == STRING_REV) { std::stringstream ss; ss << "Term of kind " << k diff --git a/src/theory/strings/theory_strings.cpp b/src/theory/strings/theory_strings.cpp index 5107fa3f9..d83d5ca49 100644 --- a/src/theory/strings/theory_strings.cpp +++ b/src/theory/strings/theory_strings.cpp @@ -118,6 +118,9 @@ TheoryStrings::TheoryStrings(context::Context* c, d_equalityEngine.addFunctionKind(kind::STRING_STRIDOF); d_equalityEngine.addFunctionKind(kind::STRING_STRREPL); d_equalityEngine.addFunctionKind(kind::STRING_STRREPLALL); + d_equalityEngine.addFunctionKind(kind::STRING_REPLACE_RE); + d_equalityEngine.addFunctionKind(kind::STRING_REPLACE_RE_ALL); + d_equalityEngine.addFunctionKind(kind::STRING_STRREPLALL); d_equalityEngine.addFunctionKind(kind::STRING_TOLOWER); d_equalityEngine.addFunctionKind(kind::STRING_TOUPPER); d_equalityEngine.addFunctionKind(kind::STRING_REV); diff --git a/src/theory/strings/theory_strings_preprocess.cpp b/src/theory/strings/theory_strings_preprocess.cpp index 50c6ede62..550a7ba79 100644 --- a/src/theory/strings/theory_strings_preprocess.cpp +++ b/src/theory/strings/theory_strings_preprocess.cpp @@ -488,6 +488,175 @@ Node StringsPreprocess::reduce(Node t, // Thus, replaceall( x, y, z ) = rpaw retNode = rpaw; } + else if (t.getKind() == STRING_REPLACE_RE) + { + Node x = t[0]; + Node y = t[1]; + Node z = t[2]; + Node k = sc->mkSkolemCached(t, SkolemCache::SK_PURIFY, "k"); + + std::vector emptyVec; + Node sigmaStar = + nm->mkNode(REGEXP_STAR, nm->mkNode(REGEXP_SIGMA, emptyVec)); + Node re = nm->mkNode(REGEXP_CONCAT, sigmaStar, y, sigmaStar); + // in_re(x, re.++(_*, y, _*)) + Node hasMatch = nm->mkNode(STRING_IN_REGEXP, x, re); + + // in_re("", y) + Node matchesEmpty = + nm->mkNode(STRING_IN_REGEXP, nm->mkConst(String("")), y); + // k = z ++ x + Node res1 = k.eqNode(nm->mkNode(STRING_CONCAT, z, x)); + + Node k1 = + sc->mkSkolemCached(x, y, SkolemCache::SK_FIRST_MATCH_PRE, "rre_pre"); + Node k2 = + sc->mkSkolemCached(x, y, SkolemCache::SK_FIRST_MATCH, "rre_match"); + Node k3 = + sc->mkSkolemCached(x, y, SkolemCache::SK_FIRST_MATCH_POST, "rre_post"); + // x = k1 ++ k2 ++ k3 + Node splitX = x.eqNode(nm->mkNode(STRING_CONCAT, k1, k2, k3)); + // ~in_re(k1 ++ str.substr(k2, 0, str.len(k2) - 1), re.++(_*, y, _*)) + Node k2len = nm->mkNode(STRING_LENGTH, k2); + Node firstMatch = + nm->mkNode( + STRING_IN_REGEXP, + nm->mkNode( + STRING_CONCAT, + k1, + nm->mkNode( + STRING_SUBSTR, k2, zero, nm->mkNode(MINUS, k2len, one))), + re) + .negate(); + // in_re(k2, y) + Node k2Match = nm->mkNode(STRING_IN_REGEXP, k2, y); + // k = k1 ++ z ++ k3 + Node res2 = k.eqNode(nm->mkNode(STRING_CONCAT, k1, z, k3)); + + // IF in_re(x, re.++(_*, y, _*)) + // THEN: + // IF in_re("", y) + // THEN: k = z ++ x + // ELSE: + // x = k1 ++ k2 ++ k3 ^ + // ~in_re(k1 ++ substr(k2, 0, str.len(k2) - 1), re.++(_*, y, _*)) ^ + // in_re(k2, y) ^ k = k1 ++ z ++ k3 + // ELSE: k = x + asserts.push_back(nm->mkNode( + ITE, + hasMatch, + nm->mkNode(ITE, + matchesEmpty, + res1, + nm->mkNode(AND, splitX, firstMatch, k2Match, res2)), + k.eqNode(x))); + retNode = k; + } + else if (t.getKind() == STRING_REPLACE_RE_ALL) + { + Node x = t[0]; + Node y = t[1]; + Node z = t[2]; + Node k = sc->mkSkolemCached(t, SkolemCache::SK_PURIFY, "k"); + + Node numOcc = sc->mkTypedSkolemCached( + nm->integerType(), x, y, SkolemCache::SK_NUM_OCCUR, "numOcc"); + std::vector argTypes; + argTypes.push_back(nm->integerType()); + Node us = nm->mkSkolem("Us", nm->mkFunctionType(argTypes, t.getType())); + TypeNode ufType = nm->mkFunctionType(argTypes, nm->integerType()); + Node uf = sc->mkTypedSkolemCached( + ufType, x, y, SkolemCache::SK_OCCUR_INDEX, "Uf"); + Node ul = + sc->mkTypedSkolemCached(ufType, x, y, SkolemCache::SK_OCCUR_LEN, "Ul"); + + Node emp = Word::mkEmptyWord(t.getType()); + + std::vector emptyVec; + Node sigmaStar = + nm->mkNode(REGEXP_STAR, nm->mkNode(REGEXP_SIGMA, emptyVec)); + Node yp = nm->mkNode(REGEXP_DIFF, y, nm->mkNode(STRING_TO_REGEXP, emp)); + Node re = nm->mkNode(REGEXP_CONCAT, sigmaStar, yp, sigmaStar); + // in_re(x, _* ++ y' ++ _*) + Node hasMatch = nm->mkNode(STRING_IN_REGEXP, x, re); + + Node ufno = nm->mkNode(APPLY_UF, uf, numOcc); + Node usno = nm->mkNode(APPLY_UF, us, numOcc); + Node rem = nm->mkNode(STRING_SUBSTR, x, ufno, nm->mkNode(STRING_LENGTH, x)); + + std::vector lemmas; + // numOcc > 0 + lemmas.push_back(nm->mkNode(GT, numOcc, zero)); + // k = Us(0) + lemmas.push_back(k.eqNode(nm->mkNode(APPLY_UF, us, zero))); + // Us(numOcc) = substr(x, Uf(numOcc)) + lemmas.push_back(usno.eqNode(rem)); + // Uf(0) = 0 + lemmas.push_back(nm->mkNode(APPLY_UF, uf, zero).eqNode(zero)); + // not(in_re(substr(x, Uf(numOcc)), re.++(_*, y', _*))) + lemmas.push_back(nm->mkNode(STRING_IN_REGEXP, rem, re).negate()); + + Node i = SkolemCache::mkIndexVar(t); + Node bvli = nm->mkNode(BOUND_VAR_LIST, i); + Node bound = + nm->mkNode(AND, nm->mkNode(GEQ, i, zero), nm->mkNode(LT, i, numOcc)); + Node ip1 = nm->mkNode(PLUS, i, one); + Node ufi = nm->mkNode(APPLY_UF, uf, i); + Node uli = nm->mkNode(APPLY_UF, ul, i); + Node ufip1 = nm->mkNode(APPLY_UF, uf, ip1); + Node ii = nm->mkNode(MINUS, ufip1, uli); + Node match = nm->mkNode(STRING_SUBSTR, x, ii, uli); + Node pfxMatch = + nm->mkNode(STRING_SUBSTR, x, ufi, nm->mkNode(MINUS, ii, ufi)); + Node nonMatch = + nm->mkNode(STRING_SUBSTR, + x, + ufi, + nm->mkNode(MINUS, nm->mkNode(MINUS, ufip1, one), ufi)); + + std::vector flem; + // Ul(i) > 0 + flem.push_back(nm->mkNode(GT, uli, zero)); + // Uf(i + 1) >= Uf(i) + Ul(i) + flem.push_back(nm->mkNode(GEQ, ufip1, nm->mkNode(PLUS, ufi, uli))); + // in_re(substr(x, ii, Ul(i)), y') + flem.push_back(nm->mkNode(STRING_IN_REGEXP, match, yp)); + // ~in_re(substr(x, Uf(i), Uf(i + 1) - 1 - Uf(i)), re.++(_*, y', _*)) + flem.push_back(nm->mkNode(STRING_IN_REGEXP, nonMatch, re).negate()); + // Us(i) = substr(x, Uf(i), ii - Uf(i)) ++ z ++ Us(i + 1) + flem.push_back( + nm->mkNode(APPLY_UF, us, i) + .eqNode(nm->mkNode( + STRING_CONCAT, pfxMatch, z, nm->mkNode(APPLY_UF, us, ip1)))); + + Node forall = nm->mkNode( + FORALL, bvli, nm->mkNode(OR, bound.negate(), nm->mkNode(AND, flem))); + lemmas.push_back(forall); + + // IF in_re(x, re.++(_*, y', _*)) + // THEN: + // numOcc > 0 ^ + // k = Us(0) ^ Us(numOcc) = substr(x, Uf(numOcc)) ^ + // Uf(0) = 0 ^ not(in_re(substr(x, Uf(numOcc)), re.++(_*, y', _*))) + // forall i. 0 <= i < nummOcc => + // Ul(i) > 0 ^ + // Uf(i + 1) >= Uf(i) + Ul(i) ^ + // in_re(substr(x, ii, Ul(i)), y') ^ + // ~in_re(substr(x, Uf(i), Uf(i + 1) - 1 - Uf(i)), re.++(_*, y', _*)) ^ + // Us(i) = substr(x, Uf(i), ii - Uf(i)) ++ z ++ Us(i + 1) + // where ii = Uf(i + 1) - Ul(i) + // ELSE: k = x + // where y' = re.diff(y, "") + // + // Conceptually, y' is the regex y but excluding the empty string (because + // we do not want to match empty strings), numOcc is the number of shortest + // matches of y' in x, Uf(i) is the end position of the i-th match, Ul(i) + // is the length of the i^th match, and Us(i) is the result of processing + // the remainder after processing the i^th occurrence of y in x. + asserts.push_back( + nm->mkNode(ITE, hasMatch, nm->mkNode(AND, lemmas), k.eqNode(x))); + retNode = k; + } else if (t.getKind() == STRING_TOLOWER || t.getKind() == STRING_TOUPPER) { Node x = t[0]; diff --git a/test/regress/CMakeLists.txt b/test/regress/CMakeLists.txt index 29224803a..ea1021d89 100644 --- a/test/regress/CMakeLists.txt +++ b/test/regress/CMakeLists.txt @@ -2082,8 +2082,10 @@ set(regress_2_tests regress2/strings/non_termination_regular_expression6.smt2 regress2/strings/norn-dis-0707-3.smt2 regress2/strings/range-perf.smt2 - regress2/strings/repl-repl.smt2 regress2/strings/repl-repl-i-no-push.smt2 + regress2/strings/repl-repl.smt2 + regress2/strings/replace_re.smt2 + regress2/strings/replace_re_all.smt2 regress2/strings/replaceall-diffrange.smt2 regress2/strings/replaceall-len-c.smt2 regress2/strings/small-1.smt2 diff --git a/test/regress/regress2/strings/replace_re.smt2 b/test/regress/regress2/strings/replace_re.smt2 new file mode 100644 index 000000000..1f9b2a2ee --- /dev/null +++ b/test/regress/regress2/strings/replace_re.smt2 @@ -0,0 +1,42 @@ +; COMMAND-LINE: --strings-exp +(set-option :incremental true) +(set-logic SLIA) +(declare-const x String) + +(push) +(assert (= "AFOOE" (str.replace_re x (re.++ (str.to_re "B") re.allchar (str.to_re "C")) "FOO"))) +(assert (not (= x "AFOOE"))) +(set-info :status sat) +(check-sat) +(pop) + +(push) +(assert (= "FOOA" (str.replace_re x re.all "FOO"))) +(set-info :status sat) +(check-sat) +(pop) + +(push) +(assert (= "FOOA" (str.replace_re x (re.++ (str.to_re "A") re.all) "FOO"))) +(set-info :status sat) +(check-sat) +(pop) + +(push) +(assert (= "FOOA" (str.replace_re x (re.++ (str.to_re "A") re.all) "FOO"))) +(assert (> (str.len x) 2)) +(set-info :status unsat) +(check-sat) +(pop) + +(push) +(assert (= "FOOA" (str.replace_re "A" re.all "FOO"))) +(set-info :status sat) +(check-sat) +(pop) + +(push) +(assert (not (= "A" (str.replace_re "A" re.none "FOO")))) +(set-info :status unsat) +(check-sat) +(pop) diff --git a/test/regress/regress2/strings/replace_re_all.smt2 b/test/regress/regress2/strings/replace_re_all.smt2 new file mode 100644 index 000000000..cf2b674c3 --- /dev/null +++ b/test/regress/regress2/strings/replace_re_all.smt2 @@ -0,0 +1,31 @@ +; COMMAND-LINE: --strings-exp +(set-option :incremental true) +(set-logic SLIA) +(declare-const x String) +(declare-const y String) + +(push) +(assert (= x (str.replace_re_all "ZABCZACZADDC" (re.++ (str.to_re "A") re.all (str.to_re "C")) y))) +(assert (= x "ZFOOZFXOZFOO")) +(set-info :status unsat) +(check-sat) +(pop) + +(push) +(assert (= "ZFOOZFXOZFOO" (str.replace_re_all x (re.++ (str.to_re "A") re.all (str.to_re "C")) "FOO"))) +(assert (not (= x "ZFOOZFXOZFOO"))) +(set-info :status sat) +(check-sat) +(pop) + +(push) +(assert (= "ZFOOZZFOO" (str.replace_re_all (str.++ "ZACZ" x "ZADDC") (re.++ (str.to_re "A") re.all (str.to_re "C")) "FOO"))) +(set-info :status sat) +(check-sat) +(pop) + +(push) +(assert (= "ZFOOZZFOO" (str.replace_re_all (str.++ "ZACXZ" x "ZADDC") (re.++ (str.to_re "A") re.all (str.to_re "C")) "FOO"))) +(set-info :status unsat) +(check-sat) +(pop) diff --git a/test/unit/theory/sequences_rewriter_white.h b/test/unit/theory/sequences_rewriter_white.h index 7e45296a9..ba3070bff 100644 --- a/test/unit/theory/sequences_rewriter_white.h +++ b/test/unit/theory/sequences_rewriter_white.h @@ -707,6 +707,228 @@ class SequencesRewriterWhite : public CxxTest::TestSuite } } + void testRewriteReplaceRe() + { + TypeNode intType = d_nm->integerType(); + TypeNode strType = d_nm->stringType(); + + std::vector emptyVec; + Node sigStar = d_nm->mkNode(kind::REGEXP_STAR, + d_nm->mkNode(kind::REGEXP_SIGMA, emptyVec)); + Node foo = d_nm->mkConst(String("FOO")); + Node a = d_nm->mkConst(String("A")); + Node b = d_nm->mkConst(String("B")); + Node re = d_nm->mkNode(kind::REGEXP_CONCAT, + d_nm->mkNode(kind::STRING_TO_REGEXP, a), + sigStar, + d_nm->mkNode(kind::STRING_TO_REGEXP, b)); + + // Same normal form: + // + // (str.replace_re + // "AZZZB" + // (re.++ (str.to_re "A") re.all (str.to_re "B")) + // "FOO") + // + // "FOO" + { + Node t = d_nm->mkNode( + kind::STRING_REPLACE_RE, d_nm->mkConst(String("AZZZB")), re, foo); + Node res = d_nm->mkConst(::CVC4::String("FOO")); + sameNormalForm(t, res); + } + + // Same normal form: + // + // (str.replace_re + // "ZAZZZBZZB" + // (re.++ (str.to_re "A") re.all (str.to_re "B")) + // "FOO") + // + // "ZFOOZZB" + { + Node t = d_nm->mkNode( + kind::STRING_REPLACE_RE, d_nm->mkConst(String("ZAZZZBZZB")), re, foo); + Node res = d_nm->mkConst(::CVC4::String("ZFOOZZB")); + sameNormalForm(t, res); + } + + // Same normal form: + // + // (str.replace_re + // "ZAZZZBZAZB" + // (re.++ (str.to_re "A") re.all (str.to_re "B")) + // "FOO") + // + // "ZFOOZAZB" + { + Node t = d_nm->mkNode(kind::STRING_REPLACE_RE, + d_nm->mkConst(String("ZAZZZBZAZB")), + re, + foo); + Node res = d_nm->mkConst(::CVC4::String("ZFOOZAZB")); + sameNormalForm(t, res); + } + + // Same normal form: + // + // (str.replace_re + // "ZZZ" + // (re.++ (str.to_re "A") re.all (str.to_re "B")) + // "FOO") + // + // "ZZZ" + { + Node t = d_nm->mkNode( + kind::STRING_REPLACE_RE, d_nm->mkConst(String("ZZZ")), re, foo); + Node res = d_nm->mkConst(::CVC4::String("ZZZ")); + sameNormalForm(t, res); + } + + // Same normal form: + // + // (str.replace_re + // "ZZZ" + // re.all + // "FOO") + // + // "FOOZZZ" + { + Node t = d_nm->mkNode( + kind::STRING_REPLACE_RE, d_nm->mkConst(String("ZZZ")), sigStar, foo); + Node res = d_nm->mkConst(::CVC4::String("FOOZZZ")); + sameNormalForm(t, res); + } + + // Same normal form: + // + // (str.replace_re + // "" + // re.all + // "FOO") + // + // "FOO" + { + Node t = d_nm->mkNode( + kind::STRING_REPLACE_RE, d_nm->mkConst(String("")), sigStar, foo); + Node res = d_nm->mkConst(::CVC4::String("FOO")); + sameNormalForm(t, res); + } + } + + void testRewriteReplaceReAll() + { + TypeNode intType = d_nm->integerType(); + TypeNode strType = d_nm->stringType(); + + std::vector emptyVec; + Node sigStar = d_nm->mkNode(kind::REGEXP_STAR, + d_nm->mkNode(kind::REGEXP_SIGMA, emptyVec)); + Node foo = d_nm->mkConst(String("FOO")); + Node a = d_nm->mkConst(String("A")); + Node b = d_nm->mkConst(String("B")); + Node re = d_nm->mkNode(kind::REGEXP_CONCAT, + d_nm->mkNode(kind::STRING_TO_REGEXP, a), + sigStar, + d_nm->mkNode(kind::STRING_TO_REGEXP, b)); + + // Same normal form: + // + // (str.replace_re + // "AZZZB" + // (re.++ (str.to_re "A") re.all (str.to_re "B")) + // "FOO") + // + // "FOO" + { + Node t = d_nm->mkNode( + kind::STRING_REPLACE_RE_ALL, d_nm->mkConst(String("AZZZB")), re, foo); + Node res = d_nm->mkConst(::CVC4::String("FOO")); + sameNormalForm(t, res); + } + + // Same normal form: + // + // (str.replace_re + // "ZAZZZBZZB" + // (re.++ (str.to_re "A") re.all (str.to_re "B")) + // "FOO") + // + // "ZFOOZZB" + { + Node t = d_nm->mkNode(kind::STRING_REPLACE_RE_ALL, + d_nm->mkConst(String("ZAZZZBZZB")), + re, + foo); + Node res = d_nm->mkConst(::CVC4::String("ZFOOZZB")); + sameNormalForm(t, res); + } + + // Same normal form: + // + // (str.replace_re + // "ZAZZZBZAZB" + // (re.++ (str.to_re "A") re.all (str.to_re "B")) + // "FOO") + // + // "ZFOOZFOO" + { + Node t = d_nm->mkNode(kind::STRING_REPLACE_RE_ALL, + d_nm->mkConst(String("ZAZZZBZAZB")), + re, + foo); + Node res = d_nm->mkConst(::CVC4::String("ZFOOZFOO")); + sameNormalForm(t, res); + } + + // Same normal form: + // + // (str.replace_re + // "ZZZ" + // (re.++ (str.to_re "A") re.all (str.to_re "B")) + // "FOO") + // + // "ZZZ" + { + Node t = d_nm->mkNode( + kind::STRING_REPLACE_RE_ALL, d_nm->mkConst(String("ZZZ")), re, foo); + Node res = d_nm->mkConst(::CVC4::String("ZZZ")); + sameNormalForm(t, res); + } + + // Same normal form: + // + // (str.replace_re + // "ZZZ" + // re.all + // "FOO") + // + // "FOOFOOFOO" + { + Node t = d_nm->mkNode(kind::STRING_REPLACE_RE_ALL, + d_nm->mkConst(String("ZZZ")), + sigStar, + foo); + Node res = d_nm->mkConst(::CVC4::String("FOOFOOFOO")); + sameNormalForm(t, res); + } + + // Same normal form: + // + // (str.replace_re + // "" + // re.all + // "FOO") + // + // "" + { + Node t = d_nm->mkNode( + kind::STRING_REPLACE_RE_ALL, d_nm->mkConst(String("")), sigStar, foo); + Node res = d_nm->mkConst(::CVC4::String("")); + sameNormalForm(t, res); + } + } + void testRewriteContains() { TypeNode intType = d_nm->integerType();