tanl: tanl: tag/SST/TokenCategorizer.cpp Source File

00001 /*
00002 **  Tanl
00003 **  SST/TokenCategorizer.cpp
00004 **  ----------------------------------------------------------------------
00005 **  Copyright (c) 2005  Giuseppe Attardi (attardi@di.unipi.it).
00006 **  ----------------------------------------------------------------------
00007 **
00008 **  This file is part of Tanl.
00009 **
00010 **  Tanl is free software; you can redistribute it and/or modify it
00011 **  under the terms of the GNU General Public License, version 3,
00012 **  as published by the Free Software Foundation.
00013 **
00014 **  Tanl is distributed in the hope that it will be useful,
00015 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017 **  GNU General Public License for more details.
00018 **
00019 **  You should have received a copy of the GNU General Public License
00020 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00021 **  ----------------------------------------------------------------------
00022 */
00023 
00024 #include "TokenCategorizer.h"
00025 
00026 using namespace Tanl::Text;
00027 
00028 namespace Tanl {
00029 namespace SST {
00030 
// Apostrophe-bearing suffixes of English tokens.
// The list is terminated by a null pointer.
const char* engSplit[] = {"'s", "n't", "'ll", "'d", 0 };

// None of the patterns in this file starts with '^': they are all
// compiled with the Anchored option of the RegExp constructor, which
// already ties the match to the start of the subject.

// English word: a run of alphanumerics, an optional hyphen, then any
// mix of alphanumerics and the characters ' / @ . # &
const char WORD_EN[] = "[[:alnum:]]+-?[[:alnum:]'/@\\.#&]*";

// Word in the other European languages: same shape as the English
// one, except that an apostrophe may appear only as the last character.
const char WORD_EU[] = "[[:alnum:]]+-?[[:alnum:]/@\\.#&]*'?";
00071 
// English number: optional sign, integer part with optional ','
// thousands groups, optional '.' decimal part, then optionally either
// an exponent or a percent sign (possibly preceded by white space).
// (?: ...) denotes a non-capturing subpattern.
const char QUANTITY_EN[] = "[+-]?[0-9]+(?:,[0-9]{3})*(?:\\.[0-9]+)?(?:(?:[eE](?:[+-]?[0-9]+))|(?:\\s*%))?";

// European number: same structure with the separators swapped, i.e.
// '.' between thousands groups and ',' before the decimal part.
const char QUANTITY_EU[] = "[+-]?[0-9]+(?:\\.[0-9]{3})*(?:,[0-9]+)?(?:(?:[eE](?:[+-]?[0-9]+))|(?:\\s*%))?";
00082 
/*
 * US currency amount: a dollar sign, an optional white space character,
 * then the amount, which is captured as group 1.
 * The amount is an integer part with optional ',' thousands groups and
 * an optional '.' decimal part, e.g. "$5", "$ 20", "$1,000.50".
 *
 * The dollar sign must be escaped: a bare '$' is the end-of-subject
 * assertion, which would make the pattern impossible to match.
 */
char const CURRENCY_US[] = "\\$\\s?([0-9]+(?:,[0-9]{3})*(?:\\.[0-9]+)?)";
00087 
/*
 * Numeric dates. Matches
 * dd/dd[/dd]
 * dd/dd[/dddd]
 * dd-dd[-dd]
 * dd-dd[-dddd]
 * dddd-d[d]-d[d]
 * where each dd may also be a single digit.
 *
 * The trailing year is optional and has either two or four digits;
 * this is spelled \d{2}(?:\d{2})? because \d{2}? would only make the
 * exact-count quantifier lazy instead of making the digits optional.
 */
char const DATE_EN[] = "(?:\\d{1,2}/\\d{1,2}(?:/\\d{2}(?:\\d{2})?)?)|(?:\\d{1,2}-\\d{1,2}(?:-\\d{2}(?:\\d{2})?)?)|(?:\\d{4}-\\d{1,2}-\\d{1,2})";
00097 
// Time of day written as d[d]:d[d], i.e. one or two digits on each
// side of a colon.
const char TIME_EN[] = "\\d{1,2}:\\d{1,2}";
00103 
/*
 * URL: either an explicit http:// or https:// scheme or a leading
 * "www.", followed by one or more dot-terminated host labels, an
 * alphabetic top-level domain of at least two letters, an optional
 * port of up to five digits, and any number of '/'-introduced path
 * segments.
 *
 * Host labels and path segments may be several characters long, and
 * the dot after "www" is escaped so that it only matches a real dot.
 * The capturing groups keep the same numbering as before:
 * 1 = scheme with "//", 2 = scheme, 3 = last host label,
 * 4 = port, 5 = last path segment.
 */
char const URL[] = "(?:((https?:)//)|www\\.)([\\w+\\-]+\\.)+[[:alpha:]]{2,}(:\\d{1,5})?(/[\\w.%~!#?+\\-]*)*";
00108 
/*
 * Email address: a local part, '@', and a domain part.
 * The local part accepts letters, digits and the characters . _ % + -
 * The domain part accepts letters, digits, '.' and '-'.
 * Digits, '_' and '-' are included so that ordinary addresses such as
 * "john_doe123@mail-server.example.com" are recognized.
 */
char const EMAIL[] = "[[:alnum:]._%+\\-]+@[[:alnum:].\\-]+";
00113 
/*
 * MUC-6 annotations: opening tags whose attribute values are written
 * in single quotes. In every pattern group 1 captures the TYPE value.
 */
// <ENAMEX TYPE='...'>, with an optional ALT=... attribute before TYPE.
char const MUC_NAME[] = "<ENAMEX\\s(?:ALT=.*?\\s)?TYPE='(\\w+)'>";
// <NUMEX TYPE='...'>
char const MUC_NUM[] = "<NUMEX\\sTYPE='(\\w+)'>";
// <TIMEX TYPE='...'>
char const MUC_TIME[] = "<TIMEX\\sTYPE='(\\w+)'>";
// Optional annotation: <..MEX STATUS='OPT' TYPE='...'>.
// The tag name is separated from STATUS by a white space character
// (\s followed by the literal "STATUS").
// NOTE(review): "..MEX" allows exactly two characters before "MEX", so
// it covers NUMEX and TIMEX but not ENAMEX -- confirm this is intended.
char const MUC_OPT[] = "<..MEX\\sSTATUS='OPT'\\sTYPE='(\\w+)'>";
00121 
// Any XML-like tag, opening or closing: '<', an optional '/', a name
// made of word characters, then the shortest run of characters other
// than '<' up to the closing '>'.
const char XML_TAG[] = "</?\\w+[^<]*?>";
00126 
00127 static map<string, Language>            localeLang;
00128 
00133 static int NameTypeBuilder()
00134 {
00135   struct LocaleLang
00136   {
00137     char const* locale;
00138     Language    language;
00139   };
00140 
00141   static LocaleLang const localeLangTable[] = {
00142     { "english",        en },
00143     { "czech",          cz },
00144     { "deutsch",        de },
00145     { "french",         fr },
00146     { "italian",        it },
00147     { "spanish",        es },
00148     { "portuguese",     pt }
00149   };
00150 
00151   int i = 0;
00152   for (register LocaleLang const* e = localeLangTable;
00153        i < sizeof(localeLangTable)/sizeof(LocaleLang);
00154        ++i, ++e)
00155     localeLang[string(e->locale)] = e->language;
00156   return 0;
00157 }
00158 
00159 static int ntBuilt = NameTypeBuilder();
00160 
00161 TokenCategorizer::TokenCategorizer(char const* locale) :
00162   language(localeLang[locale])
00163 {
00164   switch (language) {
00165   case it:
00166   case de:
00167   case es:
00168   case fr:
00169     wordPattern = RegExp::Pattern(WORD_EU, RegExp::Anchored);
00170     quantityPattern = RegExp::Pattern(QUANTITY_EU, RegExp::Anchored);
00171     break;
00172   case en:
00173   default:
00174     wordPattern = RegExp::Pattern(WORD_EN, RegExp::Anchored);
00175     quantityPattern = RegExp::Pattern(QUANTITY_EN, RegExp::Anchored);
00176     break;
00177   }
00178   currencyPattern = RegExp::Pattern(CURRENCY_US, RegExp::Anchored);
00179   datePattern = RegExp::Pattern(DATE_EN, RegExp::Anchored);
00180   timePattern = RegExp::Pattern(TIME_EN, RegExp::Anchored);
00181   urlPattern = RegExp::Pattern(URL, RegExp::Anchored);
00182   emailPattern = RegExp::Pattern(EMAIL, RegExp::Anchored);
00183 
00184   // MUC-6
00185   MUC_name = RegExp::Pattern(MUC_NAME, RegExp::Anchored);
00186   MUC_num = RegExp::Pattern(MUC_NUM, RegExp::Anchored);
00187   MUC_time = RegExp::Pattern(MUC_TIME, RegExp::Anchored);
00188   MUC_opt = RegExp::Pattern(MUC_OPT, RegExp::Anchored);
00189 
00190   tagPattern = RegExp::Pattern(XML_TAG, RegExp::Anchored);
00191 }
00192 
00193 EntityType TokenCategorizer::analyze(char const* token)
00194 {
00195   EntityType    res;
00196   RegExp::MatchGroups matches(3);
00197 
00198   if (quantityPattern.match(token, matches)) {
00199     res.type = EntityType::Quantity;
00200     res.subtype = EntityType::Number;
00201   } else if (currencyPattern.match(token, matches)) {
00202     res.type = EntityType::Quantity;
00203     res.subtype = EntityType::Money;
00204   } else if (datePattern.match(token, matches)) {
00205     res.type = EntityType::Time;
00206     res.subtype = EntityType::Date;
00207   } else if (timePattern.match(token, matches)) {
00208     res.type = EntityType::Time;
00209     res.subtype = EntityType::Instant;
00210   } else if (urlPattern.match(token, matches)) {
00211     res.type = EntityType::Address;
00212     res.subtype = EntityType::Url;
00213   } else if (emailPattern.match(token, matches)) {
00214     res.type = EntityType::Address;
00215     res.subtype = EntityType::Email;
00216   } else if (wordPattern.match(token, matches)) {
00217     res.type = EntityType::Word;
00218     res.subtype = (EntityType::Subtype)EntityType::NoType;
00219   } else if (tagPattern.match(token, matches)) {
00220     // skip closing or unknown tags
00221     res.type = EntityType::NoType;
00222     res.subtype = (EntityType::Subtype)EntityType::NoType;
00223   } else if (!strcmp(token, "'s")) {
00224     // special case of genitive after tag: </NAMEX>'s
00225     res.type = EntityType::Word;
00226     res.subtype = (EntityType::Subtype)EntityType::NoType;
00227   }
00228   return res;
00229 }
00230 
00231 } // SST
00232 } // Tanl