00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #include "text/RegExp.h"
00020 #include <assert.h>
00021
00022 #include "text/strings.h"
00023
00024 using namespace std;
00025
00026 namespace Tanl {
00027 namespace Text {
00028 namespace RegExp {
00029
00030 const unsigned char* Pattern::setLocale(char const* locale)
00031 {
00032 free((void*)CharTables);
00033 char* oldLocale = setlocale(LC_CTYPE, NULL);
00034 setlocale(LC_CTYPE, locale);
00035 CharTables = pcre_maketables();
00036 setlocale(LC_CTYPE, oldLocale);
00037 return CharTables;
00038 }
00039
00040 const unsigned char* Pattern::CharTables = Pattern::setLocale(setlocale(LC_CTYPE, 0));
00041
00042 Pattern::Pattern(char const* regExpr, int cflags) :
00043 _errorCode(0), _pcre_extra(0)
00044 {
00045 cflags |= PCRE_UTF8;
00046 const char* errtext = 0;
00047 int offset = -1;
00048 _pcre = pcre_compile(regExpr, cflags, &errtext, &offset, CharTables);
00049 if (_pcre == 0)
00050 throw RegExpException(string("Error compiling Pattern: [") + regExpr +
00051 "] at " + offset + ": " + errtext);
00052 pcre_refcount(_pcre, 1);
00053 if (!(cflags & Anchored))
00054
00055 _pcre_extra = pcre_study(_pcre, 0, &errtext);
00056 pcre_fullinfo(_pcre, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns);
00057 ++subpatterns;
00058 }
00059
00060 Pattern::Pattern(string const& regExpr, int cflags) :
00061 _errorCode(0), _pcre_extra(0)
00062 {
00063 cflags |= PCRE_UTF8;
00064 const char* errtext = 0;
00065 int offset = -1;
00066 _pcre = pcre_compile(regExpr.c_str(), cflags, &errtext, &offset, CharTables);
00067 if (_pcre == 0)
00068 throw RegExpException("Error compiling Pattern: [" + regExpr + "] at " + offset + ": " + errtext);
00069 pcre_refcount(_pcre, 1);
00070 if (!(cflags & Anchored))
00071
00072 _pcre_extra = pcre_study(_pcre, 0, &errtext);
00073 pcre_fullinfo(_pcre, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns);
00074 ++subpatterns;
00075 }
00076
00077 Pattern::~Pattern()
00078 {
00079 if (_pcre && pcre_refcount(_pcre, -1) == 0) {
00080 if (pcre_refcount(_pcre, -1) == 0) {
00081 pcre_free(_pcre);
00082 _pcre = 0;
00083 if (_pcre_extra) {
00084 pcre_free(_pcre_extra);
00085 _pcre_extra = 0;
00086 }
00087 }
00088 }
00089 }
00090
00091 bool Pattern::test(string const& str, int eflags) const
00092 {
00093 MatchGroups matches(subpatterns);
00094 return pcre_exec(_pcre, _pcre_extra, (const char*)&str[0], str.length(),
00095 0, eflags, (int*)&matches[0], 3 * subpatterns) >= 0;
00096 }
00097
00098 bool Pattern::test(char const* str, size_t len, int eflags)
00099 {
00100 MatchGroups matches(subpatterns);
00101 if (len == 0)
00102 len = strlen(str);
00103 return pcre_exec(_pcre, _pcre_extra, str, len,
00104 0, eflags, (int*)&matches[0], 3 * subpatterns) >= 0;
00105 }
00106
00107 int Pattern::matchSize(string const& text, int eflags)
00108 {
00109 const char* it = (const char*)&*text.begin();
00110 const char* end = (const char*)&*text.end();
00111
00112 MatchGroups matches(subpatterns);
00113
00114 int mcount = pcre_exec(_pcre, _pcre_extra, it, end - it, 0, eflags,
00115 (int*)&matches[0], 3 * subpatterns);
00116 if (mcount < 0)
00117 return 0;
00118 return matches[0].second - matches[0].first;
00119 }
00120
00121 int Pattern::match(string const& text, MatchGroups& pos, int eflags)
00122 {
00123 const char* it = text.c_str();
00124 const char* end = it + text.length();
00125 return match(it, end, pos, eflags);
00126 }
00127
00128 int Pattern::match(const char* it, const char* end, MatchGroups& pos,
00129 int eflags)
00130 {
00131 int slotnum = pos.size() * 3;
00132 int mcount = pcre_exec(_pcre, _pcre_extra, it, end - it, 0, eflags,
00133 (int*)&pos[0], slotnum);
00134 if (mcount < 0)
00135 return 0;
00136 return mcount;
00137 }
00138
00139
00140 vector<string> Pattern::match(string const& str, int eflags)
00141 {
00142 MatchGroups matches(subpatterns);
00143 int mcount = match(str, matches, eflags);
00144 if (mcount < 0)
00145 return vector<string>(0);
00146 vector<string> sa(mcount);
00147 for (int i = 0; i < mcount; ++i)
00148 sa[i] = str.substr(matches[i].first, matches[i].second);
00149 return sa;
00150 }
00151
00152
00153 static const int maxReplaceGroups = 16;
00154
00161 static bool rewrite(string& out, const string& text, string& rewrite,
00162 MatchGroups& m, int mcount)
00163 {
00164 assert (mcount);
00165 for (string::const_iterator s = rewrite.begin(), end = rewrite.end(); s < end; s++) {
00166 int c = *s;
00167 if (c == '\\') {
00168 c = *++s;
00169 if (isdigit(c)) {
00170 int n = c - '0';
00171 if (n >= mcount)
00172 return false;
00173 int start = m[n].first;
00174 if (start >= 0)
00175 out.append(text.data() + start, m[n].second);
00176 } else if (c == '\\')
00177 out += '\\';
00178 else
00179 return false;
00180 } else
00181 out += c;
00182 }
00183 return true;
00184 }
00185
00186 string Pattern::replace(string& text, string& replwith, bool replaceAll)
00187 {
00188 string rest = text;
00189 MatchGroups sa(maxReplaceGroups);
00190
00191 string ret;
00192 int mcount;
00193 while ((mcount = match(rest, sa, 0))) {
00194 ret.append(rest.data(), sa[0].first);
00195 if (!rewrite(ret, rest, replwith, sa, mcount))
00196 break;
00197 rest = rest.substr(sa[0].second);
00198 if (!replaceAll)
00199 break;
00200 }
00201 ret += rest;
00202 return ret;
00203 }
00204
00205
00206 string Pattern::escape(string& str)
00207 {
00208 string sb;
00209 string::iterator end = str.end();
00210 for (string::iterator it = str.begin(); it < end; it++) {
00211 char c = *it;
00212 if (!((c >= 'a' && c <= 'z') ||
00213 (c >= 'A' && c <= 'Z') ||
00214 (c >= '0' && c <= '9')))
00215 sb += '\\';
00216 sb += c;
00217 }
00218 return sb;
00219 }
00220
00221 }
00222 }
00223 }
00224