From 976df480c918f050608f7a23a4a21415c43475c3 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 25 Nov 2014 11:47:06 +0100 Subject: [PATCH] Add a primop for regular expression pattern matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function ‘builtins.match’ takes a POSIX extended regular expression and an arbitrary string. It returns ‘null’ if the string does not match the regular expression. Otherwise, it returns a list containing substring matches corresponding to parenthesis groups in the regex. The regex must match the entire string (i.e. there is an implied "^$" around the regex). For example: match "foo" "foobar" => null match "foo" "foo" => [] match "f(o+)(.*)" "foooobar" => ["oooo" "bar"] match "(.*/)?([^/]*)" "/dir/file.nix" => ["/dir/" "file.nix"] match "(.*/)?([^/]*)" "file.nix" => [null "file.nix"] The following example finds all regular files with extension .nix or .patch underneath the current directory: let findFiles = pat: dir: concatLists (mapAttrsToList (name: type: if type == "directory" then findFiles pat (dir + "/" + name) else if type == "regular" && match pat name != null then [(dir + "/" + name)] else []) (readDir dir)); in findFiles ".*\\.(nix|patch)" (toString ./.) 
--- src/libexpr/primops.cc | 30 +++++++++++++++++++++++++++- src/libutil/regex.cc | 23 ++++++++++++++++++--- src/libutil/regex.hh | 9 ++++++++- tests/lang/eval-okay-regex-match.exp | 1 + tests/lang/eval-okay-regex-match.nix | 26 ++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 tests/lang/eval-okay-regex-match.exp create mode 100644 tests/lang/eval-okay-regex-match.nix diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc index ed50c8091..b0596dad9 100644 --- a/src/libexpr/primops.cc +++ b/src/libexpr/primops.cc @@ -1430,7 +1430,34 @@ static void prim_hashString(EvalState & state, const Pos & pos, Value * * args, string s = state.forceString(*args[1], context, pos); mkString(v, printHash(hashString(ht, s)), context); -}; +} + + +/* Match a regular expression against a string and return either + ‘null’ or a list containing substring matches. */ +static void prim_match(EvalState & state, const Pos & pos, Value * * args, Value & v) +{ + Regex regex(state.forceStringNoCtx(*args[0], pos), true); + + PathSet context; + string s = state.forceString(*args[1], context, pos); + + Regex::Subs subs; + if (!regex.matches(s, subs)) { + mkNull(v); + return; + } + + unsigned int len = subs.empty() ? 
0 : subs.rbegin()->first + 1; + state.mkList(v, len); + for (unsigned int n = 0; n < len; ++n) { + auto i = subs.find(n); + if (i == subs.end()) + mkNull(*(v.list.elems[n] = state.allocValue())); + else + mkString(*(v.list.elems[n] = state.allocValue()), i->second); + } +} /************************************************************* @@ -1584,6 +1611,7 @@ void EvalState::createBaseEnv() addPrimOp("__unsafeDiscardStringContext", 1, prim_unsafeDiscardStringContext); addPrimOp("__unsafeDiscardOutputDependency", 1, prim_unsafeDiscardOutputDependency); addPrimOp("__hashString", 2, prim_hashString); + addPrimOp("__match", 2, prim_match); // Versions addPrimOp("__parseDrvName", 1, prim_parseDrvName); diff --git a/src/libutil/regex.cc b/src/libutil/regex.cc index 36c8458ce..84274b3e1 100644 --- a/src/libutil/regex.cc +++ b/src/libutil/regex.cc @@ -1,13 +1,16 @@ #include "regex.hh" #include "types.hh" +#include <algorithm> + namespace nix { -Regex::Regex(const string & pattern) +Regex::Regex(const string & pattern, bool subs) { /* Patterns must match the entire string. */ - int err = regcomp(&preg, ("^(" + pattern + ")$").c_str(), REG_NOSUB | REG_EXTENDED); - if (err) throw Error(format("compiling pattern ‘%1%’: %2%") % pattern % showError(err)); + int err = regcomp(&preg, ("^(" + pattern + ")$").c_str(), (subs ? 0 : REG_NOSUB) | REG_EXTENDED); + if (err) throw RegexError(format("compiling pattern ‘%1%’: %2%") % pattern % showError(err)); + nrParens = subs ? 
std::count(pattern.begin(), pattern.end(), '(') : 0; } Regex::~Regex() @@ -23,6 +26,20 @@ bool Regex::matches(const string & s) throw Error(format("matching string ‘%1%’: %2%") % s % showError(err)); } +bool Regex::matches(const string & s, Subs & subs) +{ + regmatch_t pmatch[nrParens + 2]; + int err = regexec(&preg, s.c_str(), nrParens + 2, pmatch, 0); + if (err == 0) { + for (unsigned int n = 2; n < nrParens + 2; ++n) + if (pmatch[n].rm_eo != -1) + subs[n - 2] = string(s, pmatch[n].rm_so, pmatch[n].rm_eo - pmatch[n].rm_so); + return true; + } + else if (err == REG_NOMATCH) return false; + throw Error(format("matching string ‘%1%’: %2%") % s % showError(err)); +} + string Regex::showError(int err) { char buf[256]; diff --git a/src/libutil/regex.hh b/src/libutil/regex.hh index aa012b721..53e31f4ed 100644 --- a/src/libutil/regex.hh +++ b/src/libutil/regex.hh @@ -5,16 +5,23 @@ #include <sys/types.h> #include <regex.h> +#include <map> + namespace nix { +MakeError(RegexError, Error) + class Regex { public: - Regex(const string & pattern); + Regex(const string & pattern, bool subs = false); ~Regex(); bool matches(const string & s); + typedef std::map<unsigned int, string> Subs; + bool matches(const string & s, Subs & subs); private: + unsigned nrParens; regex_t preg; string showError(int err); }; diff --git a/tests/lang/eval-okay-regex-match.exp b/tests/lang/eval-okay-regex-match.exp new file mode 100644 index 000000000..27ba77dda --- /dev/null +++ b/tests/lang/eval-okay-regex-match.exp @@ -0,0 +1 @@ +true diff --git a/tests/lang/eval-okay-regex-match.nix b/tests/lang/eval-okay-regex-match.nix new file mode 100644 index 000000000..ae6501532 --- /dev/null +++ b/tests/lang/eval-okay-regex-match.nix @@ -0,0 +1,26 @@ +with builtins; + +let + + matches = pat: s: match pat s != null; + + splitFN = match "((.*)/)?([^/]*)\\.(nix|cc)"; + +in + +assert matches "foobar" "foobar"; +assert matches "fo*" "f"; +assert !matches "fo+" "f"; +assert matches "fo*" "fo"; +assert matches "fo*" "foo"; +assert matches "fo+" "foo"; +assert 
matches "fo{1,2}" "foo"; +assert !matches "fo{1,2}" "fooo"; +assert !matches "fo*" "foobar"; + +assert match "(.*)\\.nix" "foobar.nix" == [ "foobar" ]; + +assert splitFN "/path/to/foobar.nix" == [ "/path/to/" "/path/to" "foobar" "nix" ]; +assert splitFN "foobar.cc" == [ null null "foobar" "cc" ]; + +true