Tanl Linguistic Pipeline |
Represents common aspects of a Corpus. More...
#include <Corpus.h>
Public Member Functions | |
Corpus (Language const &lang) | |
Corpus (Language const &lang, CorpusFormat &format) | |
Create from specified CorpusFormat. | |
Corpus (Language const &lang, char const *formatFile) | |
Read the corpus format from file formatFile . | |
AttributeId | attributeId (const char *name) |
virtual SentenceReader * | sentenceReader (std::istream *is) |
virtual void | print (std::ostream &os, Sentence const &sent) const |
Print the sentence in the standard format for the corpus. | |
virtual std::string | toString (Sentence const &sent) const |
Corpus (Language const &lang) | |
Corpus (Language const &lang, CorpusFormat &format) | |
Create from specified CorpusFormat. | |
Corpus (Language const &lang, char const *formatFile) | |
Read the corpus format from file formatFile . | |
AttributeId | attributeId (const char *name) |
virtual SentenceReader * | sentenceReader (std::istream *is) |
virtual void | print (std::ostream &os, Sentence const &sent) const |
Print the sentence in the standard format for the corpus. | |
virtual std::string | toString (Sentence const &sent) const |
Static Public Member Functions | |
static Corpus * | create (Language const &language, char const *inputFormat) |
Factory pattern for creating a Corpus based on the provided format. | |
static Corpus * | create (char const *language, char const *inputFormat) |
static CorpusFormat * | parseFormat (char const *formatFile) |
Read the corpus format from file formatFile . | |
static Corpus * | create (Language const &language, char const *inputFormat) |
Factory pattern for creating a Corpus based on the provided format. | |
static Corpus * | create (char const *language, char const *inputFormat) |
static CorpusFormat * | parseFormat (char const *formatFile) |
Read the corpus format from file formatFile . | |
Public Attributes | |
Language const & | language |
AttributeIndex | index |
associates an index to field names | |
TokenFields | tokenFields |
describes properties of fields in tokens | |
Static Protected Member Functions | |
static CorpusFormat * | parseFormat (std::istream &is) |
static CorpusFormat * | parseFormat (std::istream &is) |
Represents common aspects of a Corpus.
Tanl::Corpus::Corpus | ( | Language const & | lang | ) | [inline] |
lang | the default language for sentences in the corpus. |
Tanl::Corpus::Corpus | ( | Language const & | lang, | |
CorpusFormat & | format | |||
) | [inline] |
Create from specified CorpusFormat.
lang | the default language for sentences in the corpus. |
Tanl::Corpus::Corpus | ( | Language const & | lang, | |
char const * | formatFile | |||
) |
Read the corpus format from file formatFile.
lang | the default language for sentences in the corpus. |
References Tanl::CorpusFormat::index, index, parseFormat(), Tanl::CorpusFormat::tokenFields, and tokenFields.
Tanl::Corpus::Corpus | ( | Language const & | lang | ) | [inline] |
lang | the default language for sentences in the corpus. |
Tanl::Corpus::Corpus | ( | Language const & | lang, | |
CorpusFormat & | format | |||
) | [inline] |
Create from specified CorpusFormat.
lang | the default language for sentences in the corpus. |
Tanl::Corpus::Corpus | ( | Language const & | lang, | |
char const * | formatFile | |||
) |
Read the corpus format from file formatFile.
lang | the default language for sentences in the corpus. |
AttributeId Tanl::Corpus::attributeId | ( | const char * | name | ) | [inline] |
Returns the AttributeId associated with name. |
References index, and Tanl::AttributeIndex::insert().
AttributeId Tanl::Corpus::attributeId | ( | const char * | name | ) | [inline] |
Returns the AttributeId associated with name. |
static Corpus* Tanl::Corpus::create | ( | Language const & | language, | |
char const * | inputFormat | |||
) | [static] |
Factory pattern for creating a Corpus based on the provided format.
language | the default language for sentences in the corpus. | |
inputFormat | is either the name of a builtin format (one of CoNLL, conll08, DgaXML, Text, TokenizedText) or the name of a file containing the specifications of the format. |
Factory pattern for creating a Corpus based on the provided format.
language | the default language for sentences in the corpus. | |
inputFormat | is either the name of a builtin format (one of CoNLL, conll08, DgaXML, Text, TokenizedText) or the name of a file containing the specifications of the format. |
References parseFormat().
static CorpusFormat* Tanl::Corpus::parseFormat | ( | char const * | formatFile | ) | [static] |
Read the corpus format from file formatFile.
CorpusFormat * Tanl::Corpus::parseFormat | ( | char const * | formatFile | ) | [static] |
virtual SentenceReader* Tanl::Corpus::sentenceReader | ( | std::istream * | is | ) | [virtual] |
Returns a SentenceReader for the input stream is. Reimplemented in Tanl::ConllXCorpus, Tanl::DgaCorpus, Tanl::TextCorpus, and Tanl::TokenizedTextCorpus.
virtual SentenceReader* Tanl::Corpus::sentenceReader | ( | std::istream * | is | ) | [virtual] |
Returns a SentenceReader for the input stream is. Reimplemented in Tanl::ConllXCorpus, Tanl::DgaCorpus, Tanl::TextCorpus, and Tanl::TokenizedTextCorpus.