blocxx
|
00001 /******************************************************************************* 00002 * Copyright (C) 2005 Novell, Inc. All rights reserved. 00003 * 00004 * Redistribution and use in source and binary forms, with or without 00005 * modification, are permitted provided that the following conditions are met: 00006 * 00007 * - Redistributions of source code must retain the above copyright notice, 00008 * this list of conditions and the following disclaimer. 00009 * 00010 * - Redistributions in binary form must reproduce the above copyright notice, 00011 * this list of conditions and the following disclaimer in the documentation 00012 * and/or other materials provided with the distribution. 00013 * 00014 * - Neither the name of Vintela, Inc., Novell, Inc., nor the names of its 00015 * contributors may be used to endorse or promote products derived from this 00016 * software without specific prior written permission. 00017 * 00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' 00019 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00020 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00021 * ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc., Novell, Inc., OR THE 00022 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 00025 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 00026 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 00027 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 00028 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00029 *******************************************************************************/ 00034 #include "blocxx/PerlRegEx.hpp" 00035 00036 #ifdef BLOCXX_HAVE_PCRE 00037 #ifdef BLOCXX_HAVE_PCRE_H 00038 00039 #include "blocxx/ExceptionIds.hpp" 00040 #include "blocxx/Assertion.hpp" 00041 #include "blocxx/Format.hpp" 00042 #include <climits> // for INT_MAX 00043 00044 00045 namespace BLOCXX_NAMESPACE 00046 { 00047 00048 00049 // ------------------------------------------------------------------- 00050 static String 00051 substitute_caps(const PerlRegEx::MatchArray &sub, 00052 const String &str, const String &rep) 00053 { 00054 static const char *cap_refs[] = { 00055 NULL, "\\1", "\\2", "\\3", "\\4", 00056 "\\5", "\\6", "\\7", "\\8", "\\9", NULL 00057 }; 00058 00059 String res( rep); 00060 size_t pos; 00061 00062 for(size_t i=1; cap_refs[i] != NULL; i++) 00063 { 00064 String cap; 00065 if( i < sub.size() && sub[i].rm_so >= 0 && sub[i].rm_eo >= 0) 00066 { 00067 cap = str.substring(sub[i].rm_so, sub[i].rm_eo 00068 - sub[i].rm_so); 00069 } 00070 00071 pos = res.indexOf(cap_refs[i]); 00072 while( pos != String::npos) 00073 { 00074 size_t quotes = 0; 00075 size_t at = pos; 00076 00077 while( at > 0 && res.charAt(--at) == '\\') 00078 quotes++; 00079 00080 if( quotes % 2) 00081 { 00082 quotes = (quotes + 1) / 2; 00083 00084 res = res.erase(pos - quotes, quotes); 00085 00086 pos = res.indexOf(cap_refs[i], 00087 pos + 2 - quotes); 00088 } 00089 else 00090 { 00091 quotes = quotes / 2; 00092 00093 res = res.substring(0, pos - quotes) + 00094 cap + 00095 res.substring(pos + 2); 00096 00097 pos = res.indexOf(cap_refs[i], 00098 pos + cap.length() - quotes); 00099 } 00100 } 00101 } 00102 return res; 00103 } 00104 00105 00106 // ------------------------------------------------------------------- 00107 static inline String 00108 getError(const int errcode) 00109 { 00110 const char *ptr; 00111 switch(errcode) 00112 { 00113 case 0: 00114 ptr = "match vector to small"; 00115 break; 00116 00117 case PCRE_ERROR_NOMATCH: 00118 ptr = "match failed"; 00119 break; 00120 00121 case PCRE_ERROR_NULL: 00122 ptr = "invalid argument"; 00123 break; 00124 00125 case PCRE_ERROR_BADOPTION: 00126 ptr = "unrecognized option"; 00127 break; 00128 00129 case PCRE_ERROR_BADMAGIC: 00130 ptr = "invalid magic number"; 00131 break; 00132 00133 case PCRE_ERROR_UNKNOWN_NODE: 00134 ptr = "unknown item in the compiled pattern"; 00135 break; 00136 00137 case PCRE_ERROR_NOMEMORY: 00138 ptr = "failed to allocate memory"; 00139 break; 00140 00141 case PCRE_ERROR_NOSUBSTRING: 00142 // .*_substring.* functions only 00143 ptr = "failed to retrieve substring"; 00144 break; 00145 00146 case PCRE_ERROR_MATCHLIMIT: 00147 // match_limit in pcre_extra struct 00148 ptr = "recursion or backtracking limit reached"; 00149 break; 00150 00151 case PCRE_ERROR_CALLOUT: 00152 // reserved for pcrecallout functions 00153 ptr = "callout failure"; 00154 break; 00155 00156 case PCRE_ERROR_BADUTF8: 00157 ptr = "invalid UTF-8 byte sequence found"; 00158 break; 00159 00160 case PCRE_ERROR_BADUTF8_OFFSET: 00161 ptr = "not a UTF-8 character at specified index"; 00162 break; 00163 00164 case PCRE_ERROR_PARTIAL: 00165 ptr = "partial match"; 00166 break; 00167 00168 case PCRE_ERROR_BADPARTIAL: 00169 ptr = "pattern item not supported for partial matching"; 00170 break; 00171 00172 case PCRE_ERROR_INTERNAL: 00173 ptr = "unexpected internal error occurred"; 00174 break; 00175 00176 case PCRE_ERROR_BADCOUNT: 00177 ptr = "invalid (negative) match vector count"; 00178 break; 00179 00180 default: 00181 ptr = "unknown error code"; 00182 break; 00183 } 00184 return String(ptr); 00185 } 00186 00187 // ------------------------------------------------------------------- 00188 PerlRegEx::PerlRegEx() 00189 : m_pcre(NULL) 00190 , m_flags(0) 00191 , m_ecode(0) 00192 { 00193 } 00194 00195 00196 // ------------------------------------------------------------------- 00197 PerlRegEx::PerlRegEx(const String ®ex, int cflags) 00198 : m_pcre(NULL) 00199 , m_flags(0) 00200 , m_ecode(0) 00201 { 00202 if( !compile(regex, cflags)) 00203 { 00204 BLOCXX_THROW_ERR(RegExCompileException, 00205 errorString().c_str(), m_ecode); 00206 } 00207 } 00208 00209 00210 // ------------------------------------------------------------------- 00211 PerlRegEx::PerlRegEx(const PerlRegEx &ref) 00212 : m_pcre(NULL) 00213 , m_flags(ref.m_flags) 00214 , m_ecode(0) 00215 , m_rxstr(ref.m_rxstr) 00216 { 00217 if( ref.m_pcre != NULL && !compile(ref.m_rxstr, ref.m_flags)) 00218 { 00219 BLOCXX_THROW_ERR(RegExCompileException, 00220 errorString().c_str(), m_ecode); 00221 } 00222 } 00223 00224 // ------------------------------------------------------------------- 00225 PerlRegEx::~PerlRegEx() 00226 { 00227 if( m_pcre) 00228 { 00229 free(m_pcre); 00230 m_pcre = NULL; 00231 } 00232 } 00233 00234 00235 // ------------------------------------------------------------------- 00236 PerlRegEx & 00237 PerlRegEx::operator = (const PerlRegEx &ref) 00238 { 00239 if( ref.m_pcre == NULL) 00240 { 00241 m_ecode = 0; 00242 m_error.erase(); 00243 m_flags = ref.m_flags; 00244 m_rxstr = ref.m_rxstr; 00245 if( m_pcre != NULL) 00246 { 00247 free(m_pcre); 00248 m_pcre = NULL; 00249 } 00250 } 00251 else if( !compile(ref.m_rxstr, ref.m_flags)) 00252 { 00253 BLOCXX_THROW_ERR(RegExCompileException, 00254 errorString().c_str(), m_ecode); 00255 } 00256 return *this; 00257 } 00258 00259 00260 // ------------------------------------------------------------------- 00261 bool 00262 PerlRegEx::compile(const String ®ex, int cflags) 00263 { 00264 if( m_pcre) 00265 { 00266 free(m_pcre); 00267 m_pcre = NULL; 00268 } 00269 00270 const char *errptr = NULL; 00271 00272 m_ecode = 0; 00273 m_pcre = ::pcre_compile(regex.c_str(), cflags, 00274 &errptr, &m_ecode, NULL); 00275 if( m_pcre == NULL) 00276 { 00277 m_error = String(errptr ? errptr : ""); 00278 m_rxstr.erase(); 00279 m_flags = 0; 00280 return false; 00281 } 00282 else 00283 { 00284 m_error.erase(); 00285 m_rxstr = regex; 00286 m_flags = cflags; 00287 return true; 00288 } 00289 } 00290 00291 00292 // ------------------------------------------------------------------- 00293 int 00294 PerlRegEx::errorCode() 00295 { 00296 return m_ecode; 00297 } 00298 00299 00300 // ------------------------------------------------------------------- 00301 String 00302 PerlRegEx::errorString() const 00303 { 00304 return m_error; 00305 } 00306 00307 00308 // ------------------------------------------------------------------- 00309 String 00310 PerlRegEx::patternString() const 00311 { 00312 return m_rxstr; 00313 } 00314 00315 00316 // ------------------------------------------------------------------- 00317 int 00318 PerlRegEx::compileFlags() const 00319 { 00320 return m_flags; 00321 } 00322 00323 00324 // ------------------------------------------------------------------- 00325 bool 00326 PerlRegEx::isCompiled() const 00327 { 00328 return (m_pcre != NULL); 00329 } 00330 00331 00332 // ------------------------------------------------------------------- 00333 bool 00334 PerlRegEx::execute(MatchArray &sub, const String &str, 00335 size_t index, size_t count, int eflags) 00336 { 00337 if( m_pcre == NULL) 00338 { 00339 BLOCXX_THROW(RegExCompileException, 00340 "Regular expression is not compiled"); 00341 } 00342 if( count >= size_t(INT_MAX / 3)) 00343 { 00344 BLOCXX_THROW(AssertionException, 00345 "Match count limit exceeded"); 00346 } 00347 00348 if( index > str.length()) 00349 { 00350 BLOCXX_THROW(OutOfBoundsException, 00351 Format("String index out of bounds (" 00352 "length = %1, index = %2).", 00353 str.length(), index 00354 ).c_str()); 00355 } 00356 00357 if( count == 0) 00358 { 00359 int cnt = 0; 00360 int ret = ::pcre_fullinfo(m_pcre, NULL, 00361 PCRE_INFO_CAPTURECOUNT, &cnt); 00362 if( ret) 00363 { 00364 m_error = getError(m_ecode); 00365 return false; 00366 } 00367 count = cnt > 0 ? cnt + 1 : 1; 00368 } 00369 int vsub[count * 3]; 00370 00371 sub.clear(); 00372 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(), 00373 index, eflags, vsub, count * 3); 00374 // 00375 // pcre_exec returns 0 if vector too small, negative value 00376 // on errors or the number of matches (number of int pairs) 00377 // 00378 if( m_ecode > 0) 00379 { 00380 sub.resize(count); // as specified by user 00381 for(size_t i = 0, n = 0; i < count; i++, n += 2) 00382 { 00383 match_t m = { vsub[n], vsub[n+1] }; 00384 00385 // if user wants more than detected 00386 if( i >= (size_t)m_ecode) 00387 m.rm_so = m.rm_eo = -1; 00388 00389 sub[i] = m; 00390 } 00391 m_error.erase(); 00392 return true; 00393 } 00394 else 00395 { 00396 m_error = getError(m_ecode); 00397 return false; 00398 } 00399 } 00400 00401 00402 // ------------------------------------------------------------------- 00403 bool 00404 PerlRegEx::execute(MatchVector &sub, const String &str, 00405 size_t index, size_t count, int eflags) 00406 { 00407 if( m_pcre == NULL) 00408 { 00409 BLOCXX_THROW(RegExCompileException, 00410 "Regular expression is not compiled"); 00411 } 00412 if( count >= size_t(INT_MAX / 3)) 00413 { 00414 BLOCXX_THROW(AssertionException, 00415 "Match count limit exceeded"); 00416 } 00417 00418 if( index > str.length()) 00419 { 00420 BLOCXX_THROW(OutOfBoundsException, 00421 Format("String index out of bounds (" 00422 "length = %1, index = %2)", 00423 str.length(), index 00424 ).c_str()); 00425 } 00426 00427 if( count == 0) 00428 { 00429 int cnt = 0; 00430 int ret = ::pcre_fullinfo(m_pcre, NULL, 00431 PCRE_INFO_CAPTURECOUNT, &cnt); 00432 if( ret) 00433 { 00434 m_error = getError(m_ecode); 00435 return false; 00436 } 00437 count = cnt > 0 ? cnt + 1 : 1; 00438 } 00439 int vsub[count * 3]; 00440 00441 sub.clear(); 00442 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(), 00443 index, eflags, vsub, count * 3); 00444 // 00445 // pcre_exec returns 0 if vector too small, negative value 00446 // on errors or the number of matches (number of int pairs) 00447 // 00448 if( m_ecode > 0) 00449 { 00450 count *= 2; 00451 m_ecode *= 2; 00452 sub.resize(count); // as specified by user 00453 for(size_t i = 0; i < count; i++) 00454 { 00455 // if user wants more than detected 00456 if( i >= (size_t)m_ecode) 00457 vsub[i] = -1; 00458 00459 sub[i] = vsub[i]; 00460 } 00461 return true; 00462 } 00463 else 00464 { 00465 m_error = getError(m_ecode); 00466 return false; 00467 } 00468 } 00469 00470 00471 // ------------------------------------------------------------------- 00472 StringArray 00473 PerlRegEx::capture(const String &str, size_t index, size_t count, int eflags) 00474 { 00475 if( m_pcre == NULL) 00476 { 00477 BLOCXX_THROW(RegExCompileException, 00478 "Regular expression is not compiled"); 00479 } 00480 00481 MatchArray rsub; 00482 StringArray ssub; 00483 00484 bool match = execute(rsub, str, index, count, eflags); 00485 if( match) 00486 { 00487 if( rsub.empty()) 00488 { 00489 BLOCXX_THROW(RegExCompileException, 00490 "Non-capturing regular expression"); 00491 } 00492 00493 MatchArray::const_iterator i=rsub.begin(); 00494 for( ; i != rsub.end(); ++i) 00495 { 00496 if( i->rm_so >= 0 && i->rm_eo >= 0) 00497 { 00498 ssub.push_back(str.substring(i->rm_so, 00499 i->rm_eo - i->rm_so)); 00500 } 00501 else 00502 { 00503 ssub.push_back(String("")); 00504 } 00505 } 00506 } 00507 else if(m_ecode != PCRE_ERROR_NOMATCH) 00508 { 00509 BLOCXX_THROW_ERR(RegExExecuteException, 00510 errorString().c_str(), m_ecode); 00511 } 00512 return ssub; 00513 } 00514 00515 00516 // ------------------------------------------------------------------- 00517 blocxx::String 00518 PerlRegEx::replace(const String &str, const String &rep, 00519 bool global, int eflags) 00520 { 00521 if( m_pcre == NULL) 00522 { 00523 BLOCXX_THROW(RegExCompileException, 00524 "Regular expression is not compiled"); 00525 } 00526 00527 MatchArray rsub; 00528 bool match; 00529 size_t off = 0; 00530 String out = str; 00531 00532 do 00533 { 00534 match = execute(rsub, out, off, 0, eflags); 00535 if( match) 00536 { 00537 if( rsub.empty() || 00538 rsub[0].rm_so < 0 || 00539 rsub[0].rm_eo < 0) 00540 { 00541 // only if empty (missused as guard). 00542 BLOCXX_THROW(RegExCompileException, 00543 "Non-capturing regular expression"); 00544 } 00545 00546 String res = substitute_caps(rsub, out, rep); 00547 00548 out = out.substring(0, rsub[0].rm_so) + 00549 res + out.substring(rsub[0].rm_eo); 00550 00551 off = rsub[0].rm_so + res.length(); 00552 } 00553 else if(m_ecode == PCRE_ERROR_NOMATCH) 00554 { 00555 m_ecode = 0; 00556 m_error.erase(); 00557 } 00558 else 00559 { 00560 BLOCXX_THROW_ERR(RegExExecuteException, 00561 errorString().c_str(), m_ecode); 00562 } 00563 } while(global && match && out.length() > off); 00564 00565 return out; 00566 } 00567 00568 00569 // ------------------------------------------------------------------- 00570 StringArray 00571 PerlRegEx::split(const String &str, bool empty, int eflags) 00572 { 00573 if( m_pcre == NULL) 00574 { 00575 BLOCXX_THROW(RegExCompileException, 00576 "Regular expression is not compiled"); 00577 } 00578 00579 MatchArray rsub; 00580 StringArray ssub; 00581 bool match; 00582 size_t off = 0; 00583 size_t len = str.length(); 00584 00585 do 00586 { 00587 match = execute(rsub, str, off, 0, eflags); 00588 if( match) 00589 { 00590 if( rsub.empty() || 00591 rsub[0].rm_so < 0 || 00592 rsub[0].rm_eo < 0) 00593 { 00594 BLOCXX_THROW(RegExCompileException, 00595 "Non-capturing regular expression"); 00596 } 00597 00598 if( empty || ((size_t)rsub[0].rm_so > off)) 00599 { 00600 ssub.push_back(str.substring(off, 00601 rsub[0].rm_so - off)); 00602 } 00603 off = rsub[0].rm_eo; 00604 } 00605 else if(m_ecode == PCRE_ERROR_NOMATCH) 00606 { 00607 String tmp = str.substring(off); 00608 if( empty || !tmp.empty()) 00609 { 00610 ssub.push_back(tmp); 00611 } 00612 m_ecode = 0; 00613 m_error.erase(); 00614 } 00615 else 00616 { 00617 BLOCXX_THROW_ERR(RegExExecuteException, 00618 errorString().c_str(), m_ecode); 00619 } 00620 } while(match && len > off); 00621 00622 return ssub; 00623 } 00624 00625 00626 // ------------------------------------------------------------------- 00627 StringArray 00628 PerlRegEx::grep(const StringArray &src, int eflags) 00629 { 00630 if( m_pcre == NULL) 00631 { 00632 BLOCXX_THROW(RegExCompileException, 00633 "Regular expression is not compiled"); 00634 } 00635 00636 m_ecode = 0; 00637 m_error.erase(); 00638 00639 StringArray out; 00640 if( !src.empty()) 00641 { 00642 StringArray::const_iterator i=src.begin(); 00643 for( ; i != src.end(); ++i) 00644 { 00645 int ret = ::pcre_exec(m_pcre, NULL, i->c_str(), 00646 i->length(), 0, eflags, NULL, 0); 00647 if( ret >= 0) 00648 { 00649 out.push_back(*i); 00650 } 00651 else if( ret != PCRE_ERROR_NOMATCH) 00652 { 00653 m_ecode = ret; 00654 m_error = getError(m_ecode); 00655 BLOCXX_THROW_ERR(RegExExecuteException, 00656 errorString().c_str(), m_ecode); 00657 } 00658 } 00659 } 00660 return out; 00661 } 00662 00663 00664 // ------------------------------------------------------------------- 00665 bool 00666 PerlRegEx::match(const String &str, size_t index, int eflags) const 00667 { 00668 if( m_pcre == NULL) 00669 { 00670 BLOCXX_THROW(RegExCompileException, 00671 "Regular expression is not compiled"); 00672 } 00673 00674 if( index > str.length()) 00675 { 00676 BLOCXX_THROW(OutOfBoundsException, 00677 Format("String index out of bounds." 00678 "length = %1, index = %2", 00679 str.length(), index 00680 ).c_str()); 00681 } 00682 00683 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), 00684 str.length(), 0, eflags, NULL, 0); 00685 if( m_ecode >= 0) 00686 { 00687 m_error.erase(); 00688 return true; 00689 } 00690 else if( m_ecode == PCRE_ERROR_NOMATCH) 00691 { 00692 m_error = getError(m_ecode); 00693 return false; 00694 } 00695 else 00696 { 00697 m_error = getError(m_ecode); 00698 BLOCXX_THROW_ERR(RegExExecuteException, 00699 errorString().c_str(), m_ecode); 00700 } 00701 } 00702 00703 00704 // ------------------------------------------------------------------- 00705 } // namespace BLOCXX_NAMESPACE 00706 00707 #endif // BLOCXX_HAVE_PCRE_H 00708 #endif // BLOCXX_HAVE_PCRE 00709 00710 /* vim: set ts=8 sts=8 sw=8 ai noet: */ 00711