00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef Tanl_Text_WordSet_H
00025 #define Tanl_Text_WordSet_H
00026
00027 #include "include/unordered_set.h"
00028 #include "text/strings.h"
00029
00030
00031 #include <fstream>
00032 #include <stdlib.h>
00033
00034 namespace Tanl {
00035 namespace Text {
00036
00037 template <class _Hash = hash<const char *>,
00038 class _Pred = std::equal_to<const char*> >
00039 class WordSetBase :
00040 public unordered_set<const char*, _Hash, _Pred>
00041 {
00042 protected:
00043 typedef unordered_set<const char*, _Hash, _Pred> _Base;
00044
00045 public:
00046
00047 WordSetBase() { }
00048
00052 WordSetBase(char const* file);
00053
00054 WordSetBase(std::string& file);
00055
00056 ~WordSetBase() { clear(); }
00057
00058 void clear() {
00059 for (typename _Base::iterator it = this->begin(); it != this->end(); ++it) {
00060 free((void*)*it);
00061 }
00062 _Base::clear();
00063 }
00064
00068 bool contains(char const* word) {
00069 return this->find(word) != this->end();
00070 }
00071
00075 bool contains(std::string const& word) {
00076 return this->find(word.c_str()) != this->end();
00077 }
00078
00082 std::pair<typename _Base::iterator, bool> insert(const char* word) {
00083 typename _Base::iterator fit = this->find(word);
00084 return (fit != this->end()) ?
00085 std::pair<typename _Base::iterator, bool>(fit, false) :
00086 _Base::insert(::strdup(word));
00087 }
00088
00092 void store(char const* file);
00093
00097 void load(char const* file);
00098
00099 protected:
00100 void load(std::ifstream& ifs);
00101 };
00102
00107 class WordSet : public WordSetBase<>
00108 {
00109 public:
00110 WordSet() { }
00111 WordSet(char const* file) : WordSetBase<>(file) { }
00112 WordSet(std::string& file) : WordSetBase<>(file) { }
00113 };
00114
/**
 *  Equality predicate on C strings that ignores letter case and skips
 *  '.' characters, so that e.g. "U.S.A." and "usa" compare equal.
 */
struct NormEqual
{
  /**
   *  @param s1 first NUL-terminated string.
   *  @param s2 second NUL-terminated string.
   *  @return true if the two strings match once case differences and
   *          periods are disregarded.
   */
  bool operator()(const char* s1, const char* s2) const
  {
    char n1, n2;
    while (n1 = *s1, n2 = *s2, n1 || n2) {
      // tolower() is only defined for values representable as unsigned char
      // (or EOF): convert first so that bytes >= 0x80 are not passed as
      // negative ints on platforms where char is signed.
      if (n1 && n2 &&
          tolower(static_cast<unsigned char>(n1)) ==
          tolower(static_cast<unsigned char>(n2))) {
        ++s1;
        ++s2;
      } else if (n1 == '.') {
        // unmatched period in the first string: skip it
        ++s1;
      } else if (n2 == '.') {
        // unmatched period in the second string: skip it
        ++s2;
      } else
        break;
    }
    // equal only if both strings were consumed entirely
    return (n1 == '\0') && (n2 == '\0');
  }
};
00139
00140 struct NormHash
00141 {
00142 size_t operator()(char const* str) const {
00143 std::string s = str;
00144 to_lower(s);
00145 return hash<char const *>()(s.c_str());
00146 }
00147 };
00148
00149 class NormWordSet : public WordSetBase<NormHash, NormEqual>
00150 {
00151 public:
00152 NormWordSet() { }
00153 NormWordSet(char const* file) : WordSetBase<NormHash, NormEqual>(file) { }
00154 NormWordSet(std::string& file) : WordSetBase<NormHash, NormEqual>(file) { }
00155
00159 std::pair<_Base::iterator, bool> insert(const char* word) {
00160 char* lower = (char*)malloc(strlen(word)+1);
00161 to_lower(lower, word);
00162 _Base::iterator fit = this->find(lower);
00163 return (fit != this->end()) ?
00164 std::pair<_Base::iterator, bool>(fit, false) :
00165 _Base::insert(lower);
00166 }
00167
00171 bool contains(char const* word) {
00172 return this->find(word) != this->end();
00173 }
00174
00178 bool contains(std::string const& word) {
00179 return this->find(word.c_str()) != this->end();
00180 }
00181
00182 };
00183
00184 }
00185 }
00186
00187
00188 #ifndef NON_TEMPLATE
00189 # include "WordSet.cpp"
00190 #endif
00191
00192 #endif // Tanl_Text_WordSet_H