tanl: tanl: parse/desr/src/ap.cpp Source File

00001 /*
00002 **  DeSR
00003 **  src/ap.cpp
00004 **  Average Perceptron Sparse/Dense.
00005 **  ----------------------------------------------------------------------
00006 **  Copyright (c) 2002-2006   Massimiliano Ciaramita (massi@yahoo-inc.com)
00007 **  Copyright (c) 2006        Giuseppe Attardi (attardi@di.unipi.it).
00008 **  ----------------------------------------------------------------------
00009 **
00010 **  This file is part of DeSR.
00011 **
00012 **  DeSR is free software; you can redistribute it and/or modify it
00013 **  under the terms of the GNU General Public License, version 3,
00014 **  as published by the Free Software Foundation.
00015 **
00016 **  DeSR is distributed in the hope that it will be useful,
00017 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00018 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00019 **  GNU General Public License for more details.
00020 **
00021 **  You should have received a copy of the GNU General Public License
00022 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00023 **  ----------------------------------------------------------------------
00024 */
00025 
00026 #include "ap.h"
00027 
00028 // standard
00029 #include <cmath>
00030 #include <fstream>
00031 #include <iostream>
00032 #include <set>
00033 
00034 using namespace std;
00035 
00036 #define BINARY
00037 
00038 bool    AP::verbose = false;
00039 int     AP::partitionSize = 500000;
00040 float   AP::updatePercent = 0.1F;
00041 
00043 // AP INITIALIZATION:
00044 
00045 APS::APS(int k, int d) :
00046   AP(k, d)
00047 {
00048   M.rehash(k * d / 3);
00049 }
00050 
00051 APSV::APSV(int k, int d) :
00052   AP(k, d),
00053   M(k)
00054 { }
00055 
00056 APD::APD(int k, int d) :
00057   AP(k, d),
00058   M(k)
00059 {
00060   for (int i = 0; i < k; i++)
00061     M[i].resize(d);
00062 }
00063 
00065 // AP TRAIN:
00066 
00071 Float APS::score(unsigned r, X& v) {
00072   Float ans = 0;
00073   for (unsigned i = 0; i < v.size(); ++i) {
00074     pair<unsigned, unsigned> p(r, v[i]);
00075     Matrix::const_iterator m_i = M.find(p);
00076     if (m_i != M.end())
00077       ans += m_i->second.alpha;
00078   }
00079   return ans;
00080 }
00081 
00082 Float APSV::score(unsigned r, X& v) {
00083   Float ans = 0;
00084   Row& Mr = M[r];
00085   for (unsigned i = 0; i < v.size(); ++i) {
00086     Row::const_iterator r_i = Mr.find(v[i]);
00087     if (r_i != Mr.end())
00088       ans += r_i->second.alpha;
00089   }
00090   return ans;
00091 }
00092 
00093 Float APD::score(unsigned r, X& v) {
00094   vector<Fs>& m = M[r];
00095   Float ans = 0;
00096   for (unsigned i = 0; i < v.size(); ++i)
00097     ans += m[v[i]].alpha;
00098   return ans;
00099 }
00100 
00101 // update
00102 void APS::update(X& x, unsigned yt, vector<unsigned>& E) {
00103   unsigned _E_ = E.size();
00104   Float tau = -1.0 / Float(_E_);
00105   for (unsigned i = 0; i < x.size(); ++i) {
00106     pair<unsigned, unsigned> p(yt, x[i]);
00107     M[p].update(t, 1.0);
00108     for (unsigned j = 0; j < _E_; ++j) {
00109       p = make_pair(E[j], x[i]);
00110       M[p].update(t, tau);
00111     }
00112   }
00113 }
00114 
00115 void APSV::update(X& x, unsigned yt, vector<unsigned>& E) {
00116   unsigned _E_ = E.size();
00117   Float tau = -1.0 / Float(_E_);
00118   for (unsigned i = 0; i < x.size(); ++i) {
00119     unsigned xi = x[i];
00120     M[yt][xi].update(t, 1.0);
00121     for (unsigned j = 0; j < _E_; ++j) {
00122       M[E[j]][xi].update(t, tau);
00123     }
00124   }
00125 }
00126 
00127 void APD::update(X& x, unsigned yt, vector<unsigned>& E) {
00128   unsigned _E_ = E.size();
00129   Float tau = -1.0 / Float(_E_);
00130   for (unsigned i = 0; i < x.size(); ++i) {
00131     M[yt][x[i]].update(t, 1.0);
00132     for (unsigned j = 0; j < _E_; ++j)
00133       M[E[j]][x[i]].update(t, tau);
00134   }
00135 }
00136 
00141 int AP::train(Case& cas) {
00142   ++t;                          // increment epoch
00143   X& x = cas.first;
00144   Y y = cas.second;
00145   vector<unsigned> E; // { r != y | Mr x >= My x }
00146   Float score_y = score(y, x);
00147   for (unsigned r = 0; r < k; ++r) {
00148     if (r != y) {
00149       Float scor = score(r, x);
00150       if (scor >= score_y)
00151         E.push_back(r);
00152     }
00153   }
00154   if (E.size()) {
00155     update(x, y, E);
00156     return E.size();
00157   }
00158   return 0;
00159 }
00160 
00161 
00167 // FIXME: share this with MlpParser.cpp
00168 static void rand_permutation(vector<int>& v) {
00169   size_t N = v.size();
00170   for (size_t i = 0; i < N; i++)
00171     v[i] = i;                   // initialize to a sequence
00172   for (size_t i = 0; i < N-1; i++) {
00173     int r = i + rand() % (N-i); // no need to use high-order bits
00174     std::swap(v[i], v[r]);
00175   }
00176 }
00177 
00178 #define ONE_PERM
00179 // train(S, T) train M on S, T times
00180 #ifdef ONE_PERM
00181 void AP::train(Cases& cases, int T) {
00182   unsigned _S_ = cases.size();
00183   if (verbose)
00184     cerr << "AP::AP(" << k << ", " << d << ")"
00185          << " cases: " << _S_ << endl;
00186   vector<int> perm(_S_);
00187   for (int it = 0; it < T; ++it) {
00188     rand_permutation(perm);
00189     int updates = 0;
00190     int errors = 0;
00191     for (unsigned i = 0; i < _S_; ++i) {
00192       int delta = train(cases[perm[i]]);
00193       errors += delta;
00194       updates += delta > 0;
00195     }
00196     float updPercent = (100.0 * updates) / _S_;
00197     if (verbose)
00198       cerr << "\tupds_" << it << " = " << updates
00199            << " (" << updPercent << "%), errors = " << errors << endl;
00200     if (updates == 0 || updPercent < updatePercent)
00201       break;
00202   }
00203 }
00204 #else
00205 void AP::train(Cases& cases, int T) {
00206   unsigned _S_ = cases.size();
00207   if (partitionSize <= 0)
00208     partitionSize = _S_ + 1;
00209   int parts = (_S_ + partitionSize - 1) / partitionSize;
00210   if (verbose)
00211     cerr << "AP::train() |S| = " << _S_ << " partitions = " << parts << endl;
00212   int lastPartitionSize = _S_ % partitionSize;
00213   vector<int> partIndex(parts);
00214   vector<int> partition(partitionSize);
00215   vector<int> lastPartition(lastPartitionSize);
00216   for (int t = 0; t < T; ++t) {
00217     if (parts == 1)
00218       partIndex[0] = 0;         // to get an identical random partition series
00219     else
00220       rand_permutation(partIndex);
00221     int updates = 0;
00222     for (int p = 0; p < parts; p++) {
00223       int part = partIndex[p];  // select partition
00224       int partBase = part * partitionSize;
00225       if (part + 1 == parts) {
00226         // handle last partition separately
00227         rand_permutation(lastPartition);
00228         for (int i = 0; i < lastPartitionSize; ++i)
00229           updates += train(cases[partBase + lastPartition[i]]);
00230       } else {
00231         rand_permutation(partition);
00232         for (int i = 0; i < partitionSize; ++i)
00233           updates += train(cases[partBase + partition[i]]);
00234       }
00235     }
00236     float updPercent = 100.0 * updates / _S_;
00237     if (verbose)
00238       cerr << "\tupds_" << t << " = " << updates
00239            << " (" << updPercent << "%)" << endl;
00240     if (updates == 0 || updPercent < updatePercent)
00241       break;
00242   }
00243 }
00244 #endif
00245 
00247 // AP PREDICT:
00248 
00249 Y AP::predict(X& x)
00250 {
00251   Y best_k = 0;
00252   Float best_score = score(0, x);
00253 
00254   for (unsigned r = 1; r < k; r++) {
00255     Float score_r = score(r, x);
00256     if (score_r > best_score) {
00257       best_score = score_r;
00258       best_k = r;
00259     }
00260   }
00261   return best_k;
00262 }
00263 
00264 vector<Float> AP::scores(X& x)
00265 {
00266   vector<Float> S;
00267   for (unsigned r = 0; r < k; r++)
00268     S.push_back(score(r, x));
00269   return S;
00270 }
00271 
00273 // SAVE/LOAD
00274 
00275 #define MAX_LINE_LEN 8196
00276 
00277 bool AP::load(istream& ifs)
00278 {
00279   char line[MAX_LINE_LEN];
00280   if (!ifs.getline(line, MAX_LINE_LEN)) {
00281     cerr << "Bad model file" << endl;
00282     return false;
00283   }
00284   // read labels
00285   int nl = atoi(line);          // number of labels
00286   while (nl-- && ifs.getline(line, MAX_LINE_LEN)) {
00287     labels.push_back(line);
00288   }
00289   // read predicates
00290   if (!ifs.getline(line, MAX_LINE_LEN)) {
00291     cerr << "Bad model file" << endl;
00292     return false;
00293   }
00294   int np = atoi(line);          // number of predicates
00295 # ifdef AFFINE
00296   int n = 1; np++;
00297 # else
00298   int n = 0;
00299 # endif
00300   while (n < np && ifs.getline(line, MAX_LINE_LEN))
00301     predIndex[(char const*)line] = n++;
00302   return true;
00303 }
00304 
00305 bool APS::load(istream& is)
00306 {
00307   if (!AP::load(is))
00308     return false;
00309   k = labels.size();
00310   d = predIndex.size();
00311   string line;
00312   while (getline(is, line)) {
00313     char* cline = (char *)line.c_str();
00314     int c = strtol(cline, &cline, 10);
00315     int i;
00316     float a;
00317     while (cline && sscanf(cline, " %d:%f", &i, &a)) {
00318       M[make_pair((unsigned)c, (unsigned)i)].alpha = a;
00319       cline = strchr(cline + 1, ' ');
00320     }
00321   }
00322   return true;
00323 }
00324 
00325 void APS::save(ostream& os)
00326 {
00327   os.precision(20);
00328   for (unsigned i = 0; i < k; ++i) {
00329     os << i;
00330     for (unsigned j = 0; j < d; ++j) {
00331       Matrix::const_iterator mit = M.find(make_pair(i, j));
00332       if (mit != M.end())
00333         os << ' ' << j << ':' << mit->second.weight(t);
00334     }
00335     os << endl;
00336   }
00337 }
00338 
00339 bool APSV::load(istream& is)
00340 {
00341   if (!AP::load(is))
00342     return false;
00343   k = labels.size();
00344   d = predIndex.size();
00345   M.resize(k);
00346 #ifdef BINARY
00347   int c;
00348   while (is.read((char*)&c, sizeof(c))) {
00349     if (c == -1)                // terminating -1
00350       break;
00351     int count;
00352     if (!is.read((char*)&count, sizeof(count))) {
00353       cerr << "bad file format: count" << endl;
00354       return false;
00355     }
00356     int i;
00357     float a;
00358     while (count--) {
00359       if (!is.read((char*)&i, sizeof(i))) {
00360         cerr << "bad file format: i" << endl;
00361         return false;
00362       }
00363       if (!is.read((char*)&a, sizeof(a))) {
00364         cerr << "bad file format: a" << endl;
00365         return false;
00366       }
00367       M[c][i].alpha = a;
00368     }
00369   }
00370   return true;
00371 #else
00372   string line;
00373   while (getline(is, line)) {
00374     char* cline = (char *)line.c_str();
00375     int c = strtol(cline, &cline, 10);
00376     int i;
00377     float a;
00378     while (cline && sscanf(cline, " %d:%f", &i, &a)) {
00379       M[c][i].alpha = a;
00380       cline = strchr(cline + 1, ' ');
00381     }
00382   }
00383   return true;
00384 #endif
00385 }
00386 
00387 void APSV::save(ostream& os)
00388 {
00389 #ifdef BINARY
00390   for (unsigned i = 0; i < k; ++i) {
00391     // count non-zero items
00392     int count = 0;
00393     for (Row::const_iterator it = M[i]. begin(); it != M[i].end(); ++it) {
00394       float w = it->second.weight(t); // single float
00395       if (w != 0.0)
00396         count++;
00397     }
00398     if (count == 0)
00399       continue;
00400     os.write((char const*)&i, sizeof(i));
00401     os.write((char const*)&count, sizeof(count));
00402     for (unsigned j = 0; j < d; ++j) {
00403       if (M[i].find(j) != M[i].end()) {
00404         float w = M[i][j].weight(t); // single float
00405         if (w != 0.0) {
00406           os.write((char const*)&j, sizeof(j));
00407           os.write((char const*)&w, sizeof(w));
00408         }
00409       }
00410     }
00411   }
00412   // write terminating -1
00413   int z = -1;
00414   os.write((char const*)&z, sizeof(z));
00415 #else
00416   os.precision(20);
00417   for (unsigned i = 0; i < k; ++i) {
00418     for (unsigned j = 0; j < d; ++j) {
00419       if (j % 100 == 0) {       // break long lines
00420         if (j)
00421           os << endl;
00422         os << i;
00423       }
00424       if (M[i].find(j) != M[i].end()) {
00425         Float w = M[i][j].weight(t);
00426         if (w != 0.0)
00427           os << ' ' << j << ':' << w;
00428       }
00429     }
00430     os << endl;
00431   }
00432 #endif
00433 }
00434 
00435 bool APD::load(istream& is)
00436 {
00437   if (!AP::load(is))
00438     return false;
00439   string line;
00440   while (getline(is, line)) {
00441     char* cline = (char *)line.c_str();
00442     int c = strtol(cline, &cline, 10);
00443     int i;
00444     float a;
00445     while (cline && sscanf(cline, " %d:%f", &i, &a)) {
00446       M[c][i].alpha = a;
00447       cline = strchr(cline + 1, ' ');
00448     }
00449   }
00450   return true;
00451 }
00452 
00453 void APD::save(ostream& os)
00454 {
00455   os.precision(20);
00456   for (unsigned i = 0; i < k; ++i) {
00457     os << i;
00458     for (unsigned j = 0; j < d; ++j) {
00459       Float w = M[i][j].weight(t);
00460       if (w != 0.0)
00461         os << ' ' << j << ':' << w;
00462     }
00463     os << endl;
00464   }
00465 }