blocxx

PerlRegEx.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002 * Copyright (C) 2005 Novell, Inc. All rights reserved.
00003 *
00004 * Redistribution and use in source and binary forms, with or without
00005 * modification, are permitted provided that the following conditions are met:
00006 *
00007 *  - Redistributions of source code must retain the above copyright notice,
00008 *    this list of conditions and the following disclaimer.
00009 *
00010 *  - Redistributions in binary form must reproduce the above copyright notice,
00011 *    this list of conditions and the following disclaimer in the documentation
00012 *    and/or other materials provided with the distribution.
00013 *
00014 *  - Neither the name of Vintela, Inc., Novell, Inc., nor the names of its
00015 *    contributors may be used to endorse or promote products derived from this
00016 *    software without specific prior written permission.
00017 *
00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
00019 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 * ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc., Novell, Inc., OR THE 
00022 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
00025 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
00026 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
00027 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
00028 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 *******************************************************************************/
00034 #include "blocxx/PerlRegEx.hpp"
00035 
00036 #ifdef BLOCXX_HAVE_PCRE
00037 #ifdef BLOCXX_HAVE_PCRE_H
00038 
#include "blocxx/ExceptionIds.hpp"
#include "blocxx/Assertion.hpp"
#include "blocxx/Format.hpp"
#include <climits> // for INT_MAX
#include <vector>  // for std::vector
00043 
00044 
00045 namespace BLOCXX_NAMESPACE
00046 {
00047 
00048 
00049 // -------------------------------------------------------------------
00050 static String
00051 substitute_caps(const PerlRegEx::MatchArray &sub,
00052                 const String &str, const String &rep)
00053 {
00054    static const char *cap_refs[] = {
00055       NULL,  "\\1", "\\2", "\\3", "\\4",
00056       "\\5", "\\6", "\\7", "\\8", "\\9", NULL
00057    };
00058 
00059    String res( rep);
00060    size_t pos;
00061 
00062    for(size_t i=1; cap_refs[i] != NULL; i++)
00063    {
00064       String cap;
00065       if( i < sub.size() && sub[i].rm_so >= 0 && sub[i].rm_eo >= 0)
00066       {
00067          cap = str.substring(sub[i].rm_so, sub[i].rm_eo
00068                                          - sub[i].rm_so);
00069       }
00070 
00071       pos = res.indexOf(cap_refs[i]);
00072       while( pos != String::npos)
00073       {
00074          size_t quotes = 0;
00075          size_t at = pos;
00076 
00077          while( at > 0 && res.charAt(--at) == '\\')
00078             quotes++;
00079 
00080          if( quotes % 2)
00081          {
00082             quotes = (quotes + 1) / 2;
00083 
00084             res = res.erase(pos - quotes, quotes);
00085 
00086             pos = res.indexOf(cap_refs[i],
00087                               pos + 2 - quotes);
00088          }
00089          else
00090          {
00091             quotes = quotes / 2;
00092 
00093             res = res.substring(0, pos - quotes) +
00094                   cap +
00095                   res.substring(pos + 2);
00096 
00097             pos = res.indexOf(cap_refs[i],
00098                               pos + cap.length() - quotes);
00099          }
00100       }
00101    }
00102    return res;
00103 }
00104 
00105 
00106 // -------------------------------------------------------------------
00107 static inline String
00108 getError(const int errcode)
00109 {
00110    const char *ptr;
00111    switch(errcode)
00112    {
00113       case 0:
00114          ptr = "match vector to small";
00115       break;
00116 
00117       case PCRE_ERROR_NOMATCH:
00118          ptr = "match failed";
00119       break;
00120 
00121       case PCRE_ERROR_NULL:
00122          ptr = "invalid argument";
00123       break;
00124 
00125       case PCRE_ERROR_BADOPTION:
00126          ptr = "unrecognized option";
00127       break;
00128 
00129       case PCRE_ERROR_BADMAGIC:
00130          ptr = "invalid magic number";
00131       break;
00132 
00133       case PCRE_ERROR_UNKNOWN_NODE:
00134          ptr = "unknown item in the compiled pattern";
00135       break;
00136 
00137       case PCRE_ERROR_NOMEMORY:
00138          ptr = "failed to allocate memory";
00139       break;
00140 
00141       case PCRE_ERROR_NOSUBSTRING:
00142          // .*_substring.* functions only
00143          ptr = "failed to retrieve substring";
00144       break;
00145 
00146       case PCRE_ERROR_MATCHLIMIT:
00147          // match_limit in pcre_extra struct
00148          ptr = "recursion or backtracking limit reached";
00149       break;
00150 
00151       case PCRE_ERROR_CALLOUT:
00152          // reserved for pcrecallout functions
00153          ptr = "callout failure";
00154       break;
00155 
00156       case PCRE_ERROR_BADUTF8:
00157          ptr = "invalid UTF-8 byte sequence found";
00158       break;
00159 
00160       case PCRE_ERROR_BADUTF8_OFFSET:
00161          ptr = "not a UTF-8 character at specified index";
00162       break;
00163 
00164       case PCRE_ERROR_PARTIAL:
00165          ptr = "partial match";
00166       break;
00167 
00168       case PCRE_ERROR_BADPARTIAL:
00169          ptr = "pattern item not supported for partial matching";
00170       break;
00171 
00172       case PCRE_ERROR_INTERNAL:
00173          ptr = "unexpected internal error occurred";
00174       break;
00175 
00176       case PCRE_ERROR_BADCOUNT:
00177          ptr = "invalid (negative) match vector count";
00178       break;
00179 
00180       default:
00181          ptr = "unknown error code";
00182       break;
00183    }
00184    return String(ptr);
00185 }
00186 
00187 // -------------------------------------------------------------------
00188 PerlRegEx::PerlRegEx()
00189    : m_pcre(NULL)
00190    , m_flags(0)
00191    , m_ecode(0)
00192 {
00193 }
00194 
00195 
00196 // -------------------------------------------------------------------
00197 PerlRegEx::PerlRegEx(const String &regex, int cflags)
00198    : m_pcre(NULL)
00199    , m_flags(0)
00200    , m_ecode(0)
00201 {
00202    if( !compile(regex, cflags))
00203    {
00204       BLOCXX_THROW_ERR(RegExCompileException,
00205          errorString().c_str(), m_ecode);
00206    }
00207 }
00208 
00209 
00210 // -------------------------------------------------------------------
00211 PerlRegEx::PerlRegEx(const PerlRegEx &ref)
00212    : m_pcre(NULL)
00213    , m_flags(ref.m_flags)
00214    , m_ecode(0)
00215    , m_rxstr(ref.m_rxstr)
00216 {
00217    if( ref.m_pcre != NULL && !compile(ref.m_rxstr, ref.m_flags))
00218    {
00219       BLOCXX_THROW_ERR(RegExCompileException,
00220          errorString().c_str(), m_ecode);
00221    }
00222 }
00223 
00224 // -------------------------------------------------------------------
00225 PerlRegEx::~PerlRegEx()
00226 {
00227    if( m_pcre)
00228    {
00229       free(m_pcre);
00230       m_pcre = NULL;
00231    }
00232 }
00233 
00234 
00235 // -------------------------------------------------------------------
00236 PerlRegEx &
00237 PerlRegEx::operator = (const PerlRegEx &ref)
00238 {
00239    if( ref.m_pcre == NULL)
00240    {
00241       m_ecode = 0;
00242       m_error.erase();
00243       m_flags = ref.m_flags;
00244       m_rxstr = ref.m_rxstr;
00245       if( m_pcre != NULL)
00246       {
00247          free(m_pcre);
00248          m_pcre = NULL;
00249       }
00250    }
00251    else if( !compile(ref.m_rxstr, ref.m_flags))
00252    {
00253       BLOCXX_THROW_ERR(RegExCompileException,
00254          errorString().c_str(), m_ecode);
00255    }
00256    return *this;
00257 }
00258 
00259 
00260 // -------------------------------------------------------------------
00261 bool
00262 PerlRegEx::compile(const String &regex, int cflags)
00263 {
00264    if( m_pcre)
00265    {
00266       free(m_pcre);
00267       m_pcre = NULL;
00268    }
00269 
00270    const char *errptr = NULL;
00271 
00272    m_ecode = 0;
00273    m_pcre  = ::pcre_compile(regex.c_str(), cflags,
00274                             &errptr, &m_ecode, NULL);
00275    if( m_pcre == NULL)
00276    {
00277       m_error = String(errptr ? errptr : "");
00278       m_rxstr.erase();
00279       m_flags = 0;
00280       return false;
00281    }
00282    else
00283    {
00284       m_error.erase();
00285       m_rxstr = regex;
00286       m_flags = cflags;
00287       return true;
00288    }
00289 }
00290 
00291 
00292 // -------------------------------------------------------------------
00293 int
00294 PerlRegEx::errorCode()
00295 {
00296    return m_ecode;
00297 }
00298 
00299 
00300 // -------------------------------------------------------------------
00301 String
00302 PerlRegEx::errorString() const
00303 {
00304    return m_error;
00305 }
00306 
00307 
00308 // -------------------------------------------------------------------
00309 String
00310 PerlRegEx::patternString() const
00311 {
00312    return m_rxstr;
00313 }
00314 
00315 
00316 // -------------------------------------------------------------------
00317 int
00318 PerlRegEx::compileFlags() const
00319 {
00320    return m_flags;
00321 }
00322 
00323 
00324 // -------------------------------------------------------------------
00325 bool
00326 PerlRegEx::isCompiled() const
00327 {
00328    return (m_pcre != NULL);
00329 }
00330 
00331 
00332 // -------------------------------------------------------------------
00333 bool
00334 PerlRegEx::execute(MatchArray &sub, const String &str,
00335                size_t index, size_t count, int eflags)
00336 {
00337    if( m_pcre == NULL)
00338    {
00339       BLOCXX_THROW(RegExCompileException,
00340          "Regular expression is not compiled");
00341    }
00342    if( count >= size_t(INT_MAX / 3))
00343    {
00344       BLOCXX_THROW(AssertionException,
00345          "Match count limit exceeded");
00346    }
00347 
00348    if( index > str.length())
00349    {
00350       BLOCXX_THROW(OutOfBoundsException,
00351          Format("String index out of bounds ("
00352                 "length = %1, index = %2).",
00353                 str.length(), index
00354          ).c_str());
00355    }
00356 
00357    if( count == 0)
00358    {
00359       int cnt = 0;
00360       int ret = ::pcre_fullinfo(m_pcre, NULL,
00361                                 PCRE_INFO_CAPTURECOUNT, &cnt);
00362       if( ret)
00363       {
00364          m_error = getError(m_ecode);
00365          return false;
00366       }
00367       count = cnt > 0 ? cnt + 1 : 1;
00368    }
00369    int vsub[count * 3];
00370 
00371    sub.clear();
00372    m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
00373                          index, eflags, vsub, count * 3);
00374    //
00375    // pcre_exec returns 0 if vector too small, negative value
00376    // on errors or the number of matches (number of int pairs)
00377    //
00378    if( m_ecode > 0)
00379    {
00380       sub.resize(count); // as specified by user
00381       for(size_t i = 0, n = 0; i < count; i++, n += 2)
00382       {
00383          match_t  m = { vsub[n], vsub[n+1] };
00384 
00385          // if user wants more than detected
00386          if( i >= (size_t)m_ecode)
00387             m.rm_so = m.rm_eo = -1;
00388 
00389          sub[i] = m;
00390       }
00391       m_error.erase();
00392       return true;
00393    }
00394    else
00395    {
00396       m_error = getError(m_ecode);
00397       return false;
00398    }
00399 }
00400 
00401 
00402 // -------------------------------------------------------------------
00403 bool
00404 PerlRegEx::execute(MatchVector &sub, const String &str,
00405                size_t index, size_t count, int eflags)
00406 {
00407    if( m_pcre == NULL)
00408    {
00409       BLOCXX_THROW(RegExCompileException,
00410          "Regular expression is not compiled");
00411    }
00412    if( count >= size_t(INT_MAX / 3))
00413    {
00414       BLOCXX_THROW(AssertionException,
00415          "Match count limit exceeded");
00416    }
00417 
00418    if( index > str.length())
00419    {
00420       BLOCXX_THROW(OutOfBoundsException,
00421          Format("String index out of bounds ("
00422                 "length = %1, index = %2)",
00423                 str.length(), index
00424          ).c_str());
00425    }
00426 
00427    if( count == 0)
00428    {
00429       int cnt = 0;
00430       int ret = ::pcre_fullinfo(m_pcre, NULL,
00431                                 PCRE_INFO_CAPTURECOUNT, &cnt);
00432       if( ret)
00433       {
00434          m_error = getError(m_ecode);
00435          return false;
00436       }
00437       count = cnt > 0 ? cnt + 1 : 1;
00438    }
00439    int vsub[count * 3];
00440 
00441    sub.clear();
00442    m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
00443                          index, eflags, vsub, count * 3);
00444    //
00445    // pcre_exec returns 0 if vector too small, negative value
00446    // on errors or the number of matches (number of int pairs)
00447    //
00448    if( m_ecode > 0)
00449    {
00450       count   *= 2;
00451       m_ecode *= 2;
00452       sub.resize(count); // as specified by user
00453       for(size_t i = 0; i < count; i++)
00454       {
00455          // if user wants more than detected
00456          if( i >= (size_t)m_ecode)
00457             vsub[i] = -1;
00458 
00459          sub[i] = vsub[i];
00460       }
00461       return true;
00462    }
00463    else
00464    {
00465       m_error = getError(m_ecode);
00466       return false;
00467    }
00468 }
00469 
00470 
00471 // -------------------------------------------------------------------
00472 StringArray
00473 PerlRegEx::capture(const String &str, size_t index, size_t count, int eflags)
00474 {
00475    if( m_pcre == NULL)
00476    {
00477       BLOCXX_THROW(RegExCompileException,
00478          "Regular expression is not compiled");
00479    }
00480 
00481    MatchArray  rsub;
00482    StringArray ssub;
00483 
00484    bool match = execute(rsub, str, index, count, eflags);
00485    if( match)
00486    {
00487       if( rsub.empty())
00488       {
00489          BLOCXX_THROW(RegExCompileException,
00490             "Non-capturing regular expression");
00491       }
00492 
00493       MatchArray::const_iterator i=rsub.begin();
00494       for( ; i != rsub.end(); ++i)
00495       {
00496          if( i->rm_so >= 0 && i->rm_eo >= 0)
00497          {
00498             ssub.push_back(str.substring(i->rm_so,
00499                            i->rm_eo - i->rm_so));
00500          }
00501          else
00502          {
00503             ssub.push_back(String(""));
00504          }
00505       }
00506    }
00507    else if(m_ecode != PCRE_ERROR_NOMATCH)
00508    {
00509       BLOCXX_THROW_ERR(RegExExecuteException,
00510          errorString().c_str(), m_ecode);
00511    }
00512    return ssub;
00513 }
00514 
00515 
00516 // -------------------------------------------------------------------
00517 blocxx::String
00518 PerlRegEx::replace(const String &str, const String &rep,
00519                    bool global, int eflags)
00520 {
00521    if( m_pcre == NULL)
00522    {
00523       BLOCXX_THROW(RegExCompileException,
00524          "Regular expression is not compiled");
00525    }
00526 
00527    MatchArray  rsub;
00528    bool        match;
00529    size_t      off = 0;
00530    String      out = str;
00531 
00532    do
00533    {
00534       match = execute(rsub, out, off, 0, eflags);
00535       if( match)
00536       {
00537          if( rsub.empty()      ||
00538              rsub[0].rm_so < 0 ||
00539              rsub[0].rm_eo < 0)
00540          {
00541             // only if empty (missused as guard).
00542             BLOCXX_THROW(RegExCompileException,
00543                "Non-capturing regular expression");
00544          }
00545 
00546          String res = substitute_caps(rsub, out, rep);
00547 
00548          out = out.substring(0, rsub[0].rm_so) +
00549                res + out.substring(rsub[0].rm_eo);
00550 
00551          off = rsub[0].rm_so + res.length();
00552       }
00553       else if(m_ecode == PCRE_ERROR_NOMATCH)
00554       {
00555          m_ecode = 0;
00556          m_error.erase();
00557       }
00558       else
00559       {
00560          BLOCXX_THROW_ERR(RegExExecuteException,
00561             errorString().c_str(), m_ecode);
00562       }
00563    } while(global && match && out.length() > off);
00564 
00565    return out;
00566 }
00567 
00568 
00569 // -------------------------------------------------------------------
00570 StringArray
00571 PerlRegEx::split(const String &str, bool empty, int eflags)
00572 {
00573    if( m_pcre == NULL)
00574    {
00575       BLOCXX_THROW(RegExCompileException,
00576          "Regular expression is not compiled");
00577    }
00578 
00579    MatchArray  rsub;
00580    StringArray ssub;
00581    bool        match;
00582    size_t      off = 0;
00583    size_t      len = str.length();
00584 
00585    do
00586    {
00587       match = execute(rsub, str, off, 0, eflags);
00588       if( match)
00589       {
00590          if( rsub.empty()      ||
00591              rsub[0].rm_so < 0 ||
00592              rsub[0].rm_eo < 0)
00593          {
00594             BLOCXX_THROW(RegExCompileException,
00595                "Non-capturing regular expression");
00596          }
00597 
00598          if( empty || ((size_t)rsub[0].rm_so > off))
00599          {
00600             ssub.push_back(str.substring(off,
00601                                rsub[0].rm_so - off));
00602          }
00603          off = rsub[0].rm_eo;
00604       }
00605       else if(m_ecode == PCRE_ERROR_NOMATCH)
00606       {
00607          String tmp = str.substring(off);
00608          if( empty || !tmp.empty())
00609          {
00610             ssub.push_back(tmp);
00611          }
00612          m_ecode = 0;
00613          m_error.erase();
00614       }
00615       else
00616       {
00617          BLOCXX_THROW_ERR(RegExExecuteException,
00618             errorString().c_str(), m_ecode);
00619       }
00620    } while(match && len > off);
00621 
00622    return ssub;
00623 }
00624 
00625 
00626 // -------------------------------------------------------------------
00627 StringArray
00628 PerlRegEx::grep(const StringArray &src, int eflags)
00629 {
00630    if( m_pcre == NULL)
00631    {
00632       BLOCXX_THROW(RegExCompileException,
00633          "Regular expression is not compiled");
00634    }
00635 
00636    m_ecode = 0;
00637    m_error.erase();
00638 
00639    StringArray out;
00640    if( !src.empty())
00641    {
00642       StringArray::const_iterator i=src.begin();
00643       for( ; i != src.end(); ++i)
00644       {
00645          int ret = ::pcre_exec(m_pcre, NULL, i->c_str(),
00646                    i->length(), 0, eflags, NULL, 0);
00647          if( ret >= 0)
00648          {
00649             out.push_back(*i);
00650          }
00651          else if( ret != PCRE_ERROR_NOMATCH)
00652          {
00653             m_ecode = ret;
00654             m_error = getError(m_ecode);
00655             BLOCXX_THROW_ERR(RegExExecuteException,
00656                errorString().c_str(), m_ecode);
00657          }
00658       }
00659    }
00660    return out;
00661 }
00662 
00663 
00664 // -------------------------------------------------------------------
00665 bool
00666 PerlRegEx::match(const String &str, size_t index, int eflags) const
00667 {
00668    if( m_pcre == NULL)
00669    {
00670       BLOCXX_THROW(RegExCompileException,
00671          "Regular expression is not compiled");
00672    }
00673 
00674    if( index > str.length())
00675    {
00676       BLOCXX_THROW(OutOfBoundsException,
00677          Format("String index out of bounds."
00678                 "length = %1, index = %2",
00679                 str.length(), index
00680          ).c_str());
00681    }
00682 
00683    m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(),
00684              str.length(), 0, eflags, NULL, 0);
00685    if( m_ecode >= 0)
00686    {
00687       m_error.erase();
00688       return true;
00689    }
00690    else if( m_ecode == PCRE_ERROR_NOMATCH)
00691    {
00692       m_error = getError(m_ecode);
00693       return false;
00694    }
00695    else
00696    {
00697       m_error = getError(m_ecode);
00698       BLOCXX_THROW_ERR(RegExExecuteException,
00699          errorString().c_str(), m_ecode);
00700    }
00701 }
00702 
00703 
00704 // -------------------------------------------------------------------
00705 } // namespace BLOCXX_NAMESPACE
00706 
00707 #endif // BLOCXX_HAVE_PCRE_H
00708 #endif // BLOCXX_HAVE_PCRE
00709 
00710 /* vim: set ts=8 sts=8 sw=8 ai noet: */
00711