00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "TokenCategorizer.h"
00025
00026 using namespace Tanl::Text;
00027
00028 namespace Tanl {
00029 namespace SST {
00030
00057
00058
/**
 * English clitic strings: possessive "'s", negation "n't", and the
 * contractions "'ll" and "'d". The list ends with a null pointer so it can
 * be walked without a separate length.
 * NOTE(review): presumably the suffixes split off English tokens; the code
 * that consumes this table is not visible here.
 */
const char* engSplit[] = {
  "'s",
  "n't",
  "'ll",
  "'d",
  0   // end-of-list sentinel
};
00060
00061
00062
00063
00064
/**
 * Word pattern compiled for English and for every language without a
 * dedicated case in the TokenCategorizer constructor.
 * The apostrophe is accepted inside the trailing run.
 */
const char WORD_EN[] =
  "[[:alnum:]]+"            // leading run of alphanumerics
  "-?"                      // optional single hyphen
  "[[:alnum:]'/@\\.#&]*";   // trailing run: alphanumerics and ' / @ . # &

/**
 * Word pattern compiled for Italian, German, Spanish and French.
 * Here the apostrophe is only accepted once, as the very last character.
 */
const char WORD_EU[] =
  "[[:alnum:]]+"            // leading run of alphanumerics
  "-?"                      // optional single hyphen
  "[[:alnum:]/@\\.#&]*"     // trailing run: alphanumerics and / @ . # &
  "'?";                     // optional closing apostrophe
00071
00072
00073
00074
00075
/**
 * Number pattern in English notation: ',' groups thousands and '.' starts
 * the decimal fraction. An exponent or a percent sign may follow.
 */
const char QUANTITY_EN[] =
  "[+-]?[0-9]+"                                // optional sign, integer digits
  "(?:,[0-9]{3})*"                             // ",ddd" thousands groups
  "(?:\\.[0-9]+)?"                             // optional ".ddd" fraction
  "(?:(?:[eE](?:[+-]?[0-9]+))|(?:\\s*%))?";    // optional exponent or percent

/**
 * Number pattern in continental European notation: '.' groups thousands and
 * ',' starts the decimal fraction. An exponent or a percent sign may follow.
 */
const char QUANTITY_EU[] =
  "[+-]?[0-9]+"                                // optional sign, integer digits
  "(?:\\.[0-9]{3})*"                           // ".ddd" thousands groups
  "(?:,[0-9]+)?"                               // optional ",ddd" fraction
  "(?:(?:[eE](?:[+-]?[0-9]+))|(?:\\s*%))?";    // optional exponent or percent
00082
00083
00084
00085
/**
 * US dollar amount: a literal '$', an optional whitespace character, then
 * the amount, which is captured as group 1.
 *
 * The dollar sign is escaped because a bare '$' is the end-of-subject
 * assertion and would make the pattern unmatchable. The ",ddd" thousands
 * groups are optional, as in QUANTITY_EN, so amounts without a thousands
 * separator ("$100", "$5.00") are recognised too.
 */
char const CURRENCY_US[] =
  "\\$[\\s]?"               // literal dollar sign, optional whitespace
  "([0-9]+"                 // group 1: integer digits
  "(?:\\,[0-9]{3})*"        //          optional ",ddd" thousands groups
  "(?:\\.[0-9]+)?)";        //          optional ".dd" fraction
00087
00088
00089
00090
00091
00092
00093
00094
00095
/**
 * Numeric date in one of three layouts:
 *   - n/n/yy or n/n/yyyy
 *   - n-n-yy or n-n-yyyy
 *   - yyyy-n-n
 * where n is one or two digits.
 *
 * The year is written as \d{2}(?:\d{2})? so that both two- and four-digit
 * years match. The previous form \d{2}\d{2}? applied a lazy modifier to a
 * fixed repeat count, which has no effect, so only four-digit years matched.
 */
char const DATE_EN[] =
  "(?:\\d{1,2}/\\d{1,2}(?:/\\d{2}(?:\\d{2})?))"     // slash-separated
  "|(?:\\d{1,2}-\\d{1,2}(?:-\\d{2}(?:\\d{2})?))"    // dash-separated
  "|(?:\\d{4}-\\d{1,2}-\\d{1,2})";                  // year first
00097
00098
00099
00100
00101
/** Clock time: one or two digits, a colon, one or two digits. */
const char TIME_EN[] =
  "\\d{1,2}"    // first field
  ":"
  "\\d{1,2}";   // second field
00103
00104
00105
00106
/**
 * Web address: either an "http://" scheme or a leading "www.", followed by
 * one or more dot-terminated host labels, a 2-3 letter final label, an
 * optional ":port" and any number of "/segment" path components.
 *
 * Host labels and path segments may be any length. The previous pattern
 * allowed exactly one character for each, so ordinary URLs never matched.
 * The dot after "www" is escaped so that it only matches a literal '.'.
 * The capture groups are unchanged in number and order.
 */
char const URL[] =
  "(?:((http:)//)|www\\.)"        // groups 1-2: scheme, or a literal "www."
  "([\\w+\\-]+\\.)+"              // group 3: host labels, each ending in '.'
  "[[:alpha:]]{2,3}"              // final label
  "(:\\d{1,4})?"                  // group 4: optional port
  "(/[\\w.%~!#?+\\-]*)*";         // group 5: path segments
00108
00109
00110
00111
/**
 * E-mail address: letters and dots, an '@', then letters and dots.
 * Digits, hyphens and underscores are not accepted on either side.
 */
const char EMAIL[] =
  "[[:alpha:]\\.]+"    // local part
  "@"
  "[[:alpha:]\\.]+";   // domain part
00113
00114
00115
00116
/**
 * Opening <ENAMEX ...> tag with an optional ALT attribute before TYPE.
 * Group 1 captures the TYPE value.
 */
char const MUC_NAME[] = "<ENAMEX\\s(?:ALT=.*?\\s)?TYPE='(\\w+)'>";

/** Opening <NUMEX TYPE='...'> tag. Group 1 captures the TYPE value. */
char const MUC_NUM[] = "<NUMEX\\sTYPE='(\\w+)'>";

/** Opening <TIMEX TYPE='...'> tag. Group 1 captures the TYPE value. */
char const MUC_TIME[] = "<TIMEX\\sTYPE='(\\w+)'>";

/**
 * Opening tag marked STATUS='OPT'. Group 1 captures the TYPE value.
 *
 * The separator before STATUS is "\\s" (whitespace). It used to be written
 * "\\STATUS", which is "\\S" (a non-whitespace character) followed by
 * "TATUS" and therefore could never match the blank that precedes STATUS.
 *
 * NOTE(review): "..MEX" matches exactly two characters before "MEX", so it
 * covers NUMEX and TIMEX but not the six-letter ENAMEX -- confirm whether
 * optional ENAMEX tags should be accepted as well.
 */
char const MUC_OPT[] = "<..MEX\\sSTATUS='OPT'\\sTYPE='(\\w+)'>";
00121
00122
00123
00124
/**
 * Generic opening or closing XML tag: '<', an optional '/', a tag name, then
 * as few non-'<' characters as possible up to the closing '>'.
 */
const char XML_TAG[] =
  "</?"       // '<' with optional '/' for a closing tag
  "\\w+"      // tag name
  "[^<]*?"    // attributes, matched lazily
  ">";
00126
00127 static map<string, Language> localeLang;
00128
00133 static int NameTypeBuilder()
00134 {
00135 struct LocaleLang
00136 {
00137 char const* locale;
00138 Language language;
00139 };
00140
00141 static LocaleLang const localeLangTable[] = {
00142 { "english", en },
00143 { "czech", cz },
00144 { "deutsch", de },
00145 { "french", fr },
00146 { "italian", it },
00147 { "spanish", es },
00148 { "portuguese", pt }
00149 };
00150
00151 int i = 0;
00152 for (register LocaleLang const* e = localeLangTable;
00153 i < sizeof(localeLangTable)/sizeof(LocaleLang);
00154 ++i, ++e)
00155 localeLang[string(e->locale)] = e->language;
00156 return 0;
00157 }
00158
00159 static int ntBuilt = NameTypeBuilder();
00160
00161 TokenCategorizer::TokenCategorizer(char const* locale) :
00162 language(localeLang[locale])
00163 {
00164 switch (language) {
00165 case it:
00166 case de:
00167 case es:
00168 case fr:
00169 wordPattern = RegExp::Pattern(WORD_EU, RegExp::Anchored);
00170 quantityPattern = RegExp::Pattern(QUANTITY_EU, RegExp::Anchored);
00171 break;
00172 case en:
00173 default:
00174 wordPattern = RegExp::Pattern(WORD_EN, RegExp::Anchored);
00175 quantityPattern = RegExp::Pattern(QUANTITY_EN, RegExp::Anchored);
00176 break;
00177 }
00178 currencyPattern = RegExp::Pattern(CURRENCY_US, RegExp::Anchored);
00179 datePattern = RegExp::Pattern(DATE_EN, RegExp::Anchored);
00180 timePattern = RegExp::Pattern(TIME_EN, RegExp::Anchored);
00181 urlPattern = RegExp::Pattern(URL, RegExp::Anchored);
00182 emailPattern = RegExp::Pattern(EMAIL, RegExp::Anchored);
00183
00184
00185 MUC_name = RegExp::Pattern(MUC_NAME, RegExp::Anchored);
00186 MUC_num = RegExp::Pattern(MUC_NUM, RegExp::Anchored);
00187 MUC_time = RegExp::Pattern(MUC_TIME, RegExp::Anchored);
00188 MUC_opt = RegExp::Pattern(MUC_OPT, RegExp::Anchored);
00189
00190 tagPattern = RegExp::Pattern(XML_TAG, RegExp::Anchored);
00191 }
00192
00193 EntityType TokenCategorizer::analyze(char const* token)
00194 {
00195 EntityType res;
00196 RegExp::MatchGroups matches(3);
00197
00198 if (quantityPattern.match(token, matches)) {
00199 res.type = EntityType::Quantity;
00200 res.subtype = EntityType::Number;
00201 } else if (currencyPattern.match(token, matches)) {
00202 res.type = EntityType::Quantity;
00203 res.subtype = EntityType::Money;
00204 } else if (datePattern.match(token, matches)) {
00205 res.type = EntityType::Time;
00206 res.subtype = EntityType::Date;
00207 } else if (timePattern.match(token, matches)) {
00208 res.type = EntityType::Time;
00209 res.subtype = EntityType::Instant;
00210 } else if (urlPattern.match(token, matches)) {
00211 res.type = EntityType::Address;
00212 res.subtype = EntityType::Url;
00213 } else if (emailPattern.match(token, matches)) {
00214 res.type = EntityType::Address;
00215 res.subtype = EntityType::Email;
00216 } else if (wordPattern.match(token, matches)) {
00217 res.type = EntityType::Word;
00218 res.subtype = (EntityType::Subtype)EntityType::NoType;
00219 } else if (tagPattern.match(token, matches)) {
00220
00221 res.type = EntityType::NoType;
00222 res.subtype = (EntityType::Subtype)EntityType::NoType;
00223 } else if (!strcmp(token, "'s")) {
00224
00225 res.type = EntityType::Word;
00226 res.subtype = (EntityType::Subtype)EntityType::NoType;
00227 }
00228 return res;
00229 }
00230
00231 }
00232 }