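/*
 * Tokenizer pipes: expose a Quex-generated lexer as an Enumerator<Token*>
 * over three kinds of input: a std::istream, an Enumerator<std::string*>
 * and a Python iterator of strings.
 * XML tags occurring in the input are not emitted as tokens: they open and
 * close reference-counted Context objects attached to each emitted Token.
 */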
#include "Tokenizer.h"

#include <cstring>	// strlen()

using namespace std;

namespace Tanl {

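// Default pipe: the lexer starts with an empty buffer. This constructor is
// used by the derived pipes, which append their input incrementally.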
TokenizerPipe::TokenizerPipe() :
  qlex((QUEX_TYPE_CHARACTER*)0, 0, 0, "UTF-8"),
  len(0),
  currContext(0)
{
  qlex.token_p_switch(&token);
}

TokenizerPipe::~TokenizerPipe()
{
  if (currContext)
    currContext->decRef();
}

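// Factory methods creating the appropriate pipe for each input source.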
Enumerator<Token*>* Tokenizer::pipe(istream& is)
{
  return new TokenizerPipe(*this, is);
}

Enumerator<Token*>* Tokenizer::pipe(Enumerator<std::string*>& se)
{
  return new TokenizerPipeEnum(*this, se);
}

Enumerator<Token*>* Tokenizer::pipe(PyObject* pit)
{
  return new TokenizerPipePython(*this, pit);
}

TokenizerPipe::TokenizerPipe(Tokenizer& tokenizer, istream& is) :
  qlex(&is, "UTF-8"),
  len(0),
  currContext(0)
{
  qlex.token_p_switch(&token);
}

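// Scan tokens from the input, consuming XML context tags and resetting
// at each EOS the counter of tokens in the current sentence.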
// Update the current XML context: a closing tag (</...>) pops back to the
// parent context, any other tag opens a new one. Contexts are reference
// counted, since emitted tokens keep pointers to them.
static void switchContext(Context*& currContext, string const& xmlTag)
{
  if (xmlTag[1] == '/') {
    if (currContext) {
      Context* prev = currContext->parent;
      if (prev)
        prev->incRef();
      currContext->decRef();
      currContext = prev;
    }
  } else {
    Context* prevContext = currContext;
    currContext = new Context(xmlTag, currContext);
    currContext->incRef();
    if (prevContext)
      prevContext->decRef();
  }
}

bool TokenizerPipe::MoveNext()
{
  while (true) {
    const QUEX_TYPE_TOKEN_ID TokenID = qlex.receive();

    if (TokenID == QUEX_TKN_TERMINATION)
      return false;
    if (TokenID == QUEX_TKN_EOS) {
      len = 0;		// sentence boundary: reset the token counter
    } else if (TokenID == QUEX_TKN_XML) {
      switchContext(currContext, token.utf8_text());
      continue;		// XML tags are consumed, not emitted
    } else
      len++;
    return true;
  }
}

// Attribute index shared by all tokens, holding just the surface form.
static AttributeIndex* createSimpleAI()
{
  AttributeIndex* ai = new AttributeIndex();
  ai->insert("FORM");
  return ai;
}

AttributeIndex* simpleAI = createSimpleAI();

Token* TokenizerPipe::Current()
{
  string form = token.utf8_text();
  Token* t = new Token(form, simpleAI, currContext);
  t->set("FORM", form);
  return t;		// caller takes ownership
}

// Text for the EOS token synthesized when the lexer buffer is exhausted
// while a sentence is still pending.
static const QUEX_TYPE_CHARACTER eos[] = { (QUEX_TYPE_CHARACTER)'\n', 0 };

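// TokenizerPipeEnum: tokenize lines drawn from an Enumerator<std::string*>,
// refilling the lexer buffer one line at a time.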
TokenizerPipeEnum::TokenizerPipeEnum(Tokenizer& tokenizer, Enumerator<std::string*>& senum) :
  senum(senum)
{
  // Prime the lexer with an empty fill region; real input is appended
  // in MoveNext() as lines are drawn from the enumerator.
  char const* null = "";
  qlex.buffer_fill_region_append((void*)null, (void*)null);
  qlex.token_p_switch(&token);
}

bool TokenizerPipeEnum::MoveNext()
{
  while (true) {
    const QUEX_TYPE_TOKEN_ID TokenID = qlex.receive();

    if (TokenID == QUEX_TKN_TERMINATION) {
      // Buffer exhausted: flush a pending end of sentence first, then
      // refill the lexer buffer with the next line from the enumerator.
      if (len) { len = 0; token.set(QUEX_TKN_EOS, eos); return true; }
      if (senum.MoveNext()) {
        string& line = *senum.Current();
        char* begin = const_cast<char*>(line.c_str());
        qlex.buffer_fill_region_append_conversion(begin, begin + line.size());
        continue;
      } else
        return false;
    }
    if (TokenID == QUEX_TKN_EOS) {
      len = 0;
    } else if (TokenID == QUEX_TKN_XML) {
      switchContext(currContext, token.utf8_text());
      continue;
    } else
      len++;
    return true;
  }
}

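// TokenizerPipePython: tokenize lines drawn from a Python iterator,
// refilling the lexer buffer one item at a time.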
TokenizerPipePython::TokenizerPipePython(Tokenizer& tokenizer, PyObject* pit) :
  pit(pit)
{
  Py_INCREF(pit);	// hold a reference to the iterator for our lifetime
  char const* null = "";
  qlex.buffer_fill_region_append((void*)null, (void*)null);
  qlex.token_p_switch(&token);
}

TokenizerPipePython::~TokenizerPipePython()
{
  Py_DECREF(pit);
}

bool TokenizerPipePython::MoveNext()
{
  while (true) {
    const QUEX_TYPE_TOKEN_ID TokenID = qlex.receive();

    if (TokenID == QUEX_TKN_TERMINATION) {
      if (len) { len = 0; token.set(QUEX_TKN_EOS, eos); return true; }
      PyObject* next = PyIter_Next(pit);
      if (next) {
        char* line = PyString_AsString(next);
        if (line)	// skip items that are not plain strings
          qlex.buffer_fill_region_append_conversion(line, line + strlen(line));
        Py_DECREF(next); // the lexer has copied the text; drop the item
        continue;
      } else
        return false;
    }
    if (TokenID == QUEX_TKN_EOS) {
      len = 0;
    } else if (TokenID == QUEX_TKN_XML) {
      switchContext(currContext, token.utf8_text());
      continue;
    } else
      len++;
    return true;
  }
}

} // namespace Tanl

#ifdef UNIT_TEST

#include <iostream>

using namespace Tanl;

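// Self-test: run the same three lines through the string-enumerator pipe
// and through a Python iterator, printing each token produced.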
static const char* lines[] = { "la prima riga", "la seconda", "ultima riga", 0 };

struct EnumTest : public Enumerator<string*>
{
  EnumTest() :
    current(-1)
  {
    for (const char** scan = lines; *scan; ++scan)
      strings.push_back(*scan);
  }
  bool MoveNext() { return ++current < (int)strings.size(); }
  string* Current() { return &strings[current]; }

  int current;
  vector<string> strings;
};

int main(int argc, char* argv[])
{
  EnumTest et;
  Tokenizer l;
  Enumerator<Token*>* lpe = l.pipe(et);
  while (lpe->MoveNext()) {
    Token* t = lpe->Current();	// Current() allocates a fresh Token
    cout << *t << endl;
    delete t;
  }
  delete lpe;

  Py_Initialize();
  PyObject* list = PyList_New(0);
  for (const char** scan = lines; *scan; ++scan) {
    PyObject* str = PyString_FromString(*scan);
    PyList_Append(list, str);	// the list adds its own reference
    Py_DECREF(str);
  }
  PyObject* pit = PyObject_GetIter(list);
  Enumerator<Token*>* lpp = l.pipe(pit);
  while (lpp->MoveNext()) {
    Token* t = lpp->Current();
    cout << *t << endl;
    delete t;
  }
  delete lpp;	// releases the pipe's reference to pit
  Py_DECREF(pit);
  Py_DECREF(list);
  Py_Finalize();
  return 0;
}
#endif