00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "Corpus.h"
00025 #include "text/XmlReader.h"
00026
00027
00028 #include <sstream>
00029
00030
00031 #include "io/File.h"
00032
00033 using namespace std;
00034
00035 namespace Tanl {
00036
00037
00038
00039
00040 Corpus::Corpus(Language const& lang, char const* formatFile) :
00041 language(lang)
00042 {
00043 CorpusFormat* cf = parseFormat(formatFile);
00044 if (cf) {
00045 tokenFields = cf->tokenFields;
00046 index = cf->index;
00047 delete cf;
00048 } else
00049 throw CorpusFormatError(string("Reading file: ") + formatFile);
00050 }
00051
00052 Corpus* Corpus::create(Language const& language, char const* inputFormat)
00053 {
00054 Corpus* corpus;
00055 CorpusFactory* cfactory = CorpusMap::get(inputFormat);
00056 if (cfactory)
00057 corpus = cfactory(language, 0);
00058 else if (IXE::io::File(inputFormat).exists()) {
00059 CorpusFormat* inFormat = Corpus::parseFormat(inputFormat);
00060 if (!inFormat) {
00061 cerr << "Bad format file: " << *inputFormat << endl;
00062 return 0;
00063 }
00064 cfactory = CorpusMap::get(inFormat->name.c_str());
00065 if (cfactory)
00066 corpus = cfactory(language, inFormat);
00067 else
00068 corpus = new Corpus(language, *inFormat);
00069 }
00070 return corpus;
00071 }
00072
00073 Corpus* Corpus::create(char const* language, char const* inputFormat)
00074 {
00075 Language const* lang = Language::get(language);
00076 if (!lang)
00077 lang = Language::get("en");
00078 return create(*lang, inputFormat);
00079 }
00080
00081 CorpusFormat* Corpus::parseFormat(char const* formatFile)
00082 {
00083 ifstream fmt(formatFile);
00084 return parseFormat(fmt);
00085 }
00086
00087 CorpusFormat* Corpus::parseFormat(istream& fmt)
00088 {
00089 Tanl::XML::XmlReader reader(fmt);
00090 CorpusFormat* format = new CorpusFormat();
00091 AttributeIndex& index = format->index;
00092 while (reader.Read()) {
00093 switch (reader.NodeType) {
00094 case Tanl::XML::Element:
00095 string& name = reader.Name;
00096 if (name == "CorpusFormat") {
00097 if (reader.MoveToFirstAttribute()) {
00098 do {
00099 string& name = reader.Name;
00100 string& value = reader.Value;
00101 if (name == "name") {
00102 format->name = value;
00103 }
00104 } while (reader.MoveToNextAttribute());
00105 }
00106 } else if (name == "field") {
00107
00108 TokenField field;
00109 if (reader.MoveToFirstAttribute()) {
00110 do {
00111 string& name = reader.Name;
00112 string& value = reader.Value;
00113 if (name == "name") {
00114 field.name = value;
00115 index.insert(value.c_str());
00116 } else if (name == "use")
00117 field.use = (value == "INPUT" ? TokenField::input :
00118 (value == "OUTPUT" ? TokenField::output :
00119 (value == "ECHO" ? TokenField::echo :
00120 (value == "IGNORE" ? TokenField::ignore :
00121 TokenField::input))));
00122 else if (name == "value")
00123 field.value = (value == "STRING" ? TokenField::string :
00124 (value == "INTEGER" ? TokenField::integer : TokenField::string));
00125 else if (name == "role")
00126 field.role = (value == "FORM" ? TokenField::form :
00127 (value == "HEAD" ? TokenField::head :
00128 (value == "DEPREL" ? TokenField::deprel :
00129 (value == "PREDICATE" ? TokenField::predicate :
00130 TokenField::none))));
00131 else if (name == "link")
00132 field.link = value;
00133 else if (name == "label")
00134 field.label = value;
00135 else if (name == "default")
00136 field.default_ = value;
00137 } while (reader.MoveToNextAttribute());
00138 }
00139 format->tokenFields.push_back(field);
00140 } else {
00141 delete format;
00142 return 0;
00143 }
00144 }
00145 }
00146 return format;
00147 }
00148
00149 SentenceReader* Corpus::sentenceReader(istream* is) {
00150 return new SentenceReader(is, this);
00151 }
00152
00153 void Corpus::print(ostream& os, Sentence const& sent) const
00154 {
00155 for (Sentence::const_iterator tit = sent.begin(); tit != sent.end(); ++tit) {
00156 for (Attributes::const_iterator fit = (*tit)->attributes.begin();
00157 fit != (*tit)->attributes.end(); ++fit)
00158 os << ((*(*fit).second == "") ? "_" : *(*fit).second) << "\t";
00159 os << endl;
00160 }
00161 }
00162
00163 std::string Corpus::toString(Sentence const& sent) const
00164 {
00165 std::ostringstream ss;
00166 FOR_EACH (Sentence, sent, tit) {
00167 (*tit)->printTab(ss, *this);
00168 ss << endl;
00169 }
00170 return ss.str().c_str();
00171 }
00172
00173
00174
00175
00183 std::map<char const*, CorpusFactory*>& CorpusMap::get()
00184 {
00185 static std::map<char const*, CorpusFactory*> corpusFor;
00186 return corpusFor;
00187 }
00188
00189 CorpusFactory* CorpusMap::get(char const* type)
00190 {
00191 std::map<char const*, CorpusFactory*>& corpusMap = get();
00192 map<char const*, CorpusFactory*>::const_iterator rit = corpusMap.find(type);
00193 return (rit == corpusMap.end()) ? 0 : rit->second;
00194 }
00195
00196 }