ReySajju742 committed on May 6

Commit

ff2da0f

verified ·

1 Parent(s): c60d77d

Upload 51 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +1 -0
Makefile +16 -0
README +52 -0
basic/city.cc +466 -0
basic/city.h +90 -0
basic/hard-ofstream.h +32 -0
basic/indent.cc +3 -0
basic/indent.h +18 -0
basic/lisp.cc +129 -0
basic/lisp.h +34 -0
basic/logging.cc +145 -0
basic/logging.h +122 -0
basic/mem-tracker.cc +53 -0
basic/mem-tracker.h +132 -0
basic/mem.h +14 -0
basic/multi-ostream.cc +61 -0
basic/multi-ostream.h +67 -0
basic/opt.cc +189 -0
basic/opt.h +100 -0
basic/pipe.h +46 -0
basic/prob-utils.cc +75 -0
basic/prob-utils.h +19 -0
basic/stats.cc +1 -0
basic/stats.h +71 -0
basic/std.cc +111 -0
basic/std.h +115 -0
basic/stl-basic.cc +1 -0
basic/stl-basic.h +113 -0
basic/stl-utils.cc +1 -0
basic/stl-utils.h +232 -0
basic/str-str-db.cc +35 -0
basic/str-str-db.h +19 -0
basic/str.cc +91 -0
basic/str.h +22 -0
basic/strdb.cc +209 -0
basic/strdb.h +101 -0
basic/timer.cc +11 -0
basic/timer.h +35 -0
basic/union-set.cc +29 -0
basic/union-set.h +22 -0
cluster-viewer/LICENSE +22 -0
cluster-viewer/README.md +26 -0
cluster-viewer/build-viewer.sh +32 -0
cluster-viewer/code/final.py +8 -0
cluster-viewer/code/htmlrows.html +18 -0
cluster-viewer/code/make_html.py +75 -0
cluster-viewer/code/style.css +9 -0
cluster-viewer/code/template.html +22 -0
input.txt +3 -0
output.txt +5 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.o

Makefile ADDED Viewed

	@@ -0,0 +1,16 @@

+# Build rules for wcluster.
+# 1.2: need to make sure opt.o goes in the right order to get the right scope on the command-line arguments.
+# The object that has to be linked first differs by platform: basic/logging.o
+# on Linux, basic/opt.o everywhere else.
+ifeq ($(shell uname),Linux)
+first_src := basic/logging.cc
+else
+first_src := basic/opt.cc
+endif
+# Flags the build needs; anything the user passes in CXXFLAGS is appended.
+cxxflags_all := -Wall -g -O3 -std=c++0x $(CXXFLAGS)
+# first_src, then the top-level sources, then the rest of basic/.  filter-out
+# removes exactly first_src (the old grep -v matched it as a regex), and
+# $(sort) keeps the order independent of the make version's globbing.
+srcs := $(first_src) $(sort $(wildcard *.cc)) \
+        $(filter-out $(first_src),$(sort $(wildcard basic/*.cc)))
+# Expanded once here; only a trailing .cc is rewritten to .o.
+files := $(srcs:.cc=.o)
+wcluster: $(files)
+	$(CXX) $(cxxflags_all) -o $@ $(files) $(LDFLAGS) -lpthread
+%.o: %.cc
+	$(CXX) $(cxxflags_all) -o $@ -c $<
+# clean is a command, not a file; $(RM) is "rm -f", so cleaning an already
+# clean tree is not an error.
+.PHONY: clean
+clean:
+	$(RM) wcluster basic/*.o *.o

README ADDED Viewed

	@@ -0,0 +1,52 @@

+Implementation of the Brown hierarchical word clustering algorithm.
+Percy Liang
+Release 1.3
+2012.07.24
+Input: a sequence of words separated by whitespace (see input.txt for an example).
+Output: for each word type, its cluster (see output.txt for an example).
+        In particular, each line is:
+  <cluster represented as a bit string> <word> <number of times word occurs in input>
+Runs in $O(N C^2)$, where $N$ is the number of word types and $C$
+is the number of clusters.
+References:
+  Brown, et al.: Class-Based n-gram Models of Natural Language
+    http://acl.ldc.upenn.edu/J/J92/J92-4003.pdf
+  Liang: Semi-supervised learning for natural language processing
+    http://cs.stanford.edu/~pliang/papers/meng-thesis.pdf
+Compile:
+  make
+Run:
+  # Clusters input.txt into 50 clusters:
+  ./wcluster --text input.txt --c 50
+  # Output in input-c50-p1.out/paths
+============================================================
+Change Log
+1.3: compatibility updates for newer versions of g++ (courtesy of Chris Dyer).
+1.2: make compatible with MacOS (replaced timespec with timeval and changed order of linking).
+1.1: Removed deprecated operators so it works with GCC 4.3.
+============================================================
+(C) Copyright 2007-2012, Percy Liang
+http://cs.stanford.edu/~pliang
+Permission is granted for anyone to copy, use, or modify these programs and
+accompanying documents for purposes of research or education, provided this
+copyright notice is retained, and note is made of any changes that have been
+made.
+These programs and documents are distributed without any warranty, express or
+implied.  As the programs were written for research purposes only, they have
+not been tested to the degree that would be advisable in any important
+application.  All use of these programs is entirely at the user's own risk.

basic/city.cc ADDED Viewed

	@@ -0,0 +1,466 @@

+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file provides CityHash64() and related functions.
+//
+// It's probably possible to create even faster hash functions by
+// writing a program that systematically explores some of the space of
+// possible hash functions, by using SIMD instructions, or by
+// compromising on hash quality.
+#include "city.h"
+#include <algorithm>
+#include <string.h>  // for memcpy and memset
+using namespace std;
+// Copies sizeof(uint64) bytes starting at p into a local with memcpy, so p
+// does not have to be aligned for a direct 64-bit read.
+static uint64 UNALIGNED_LOAD64(const char *p) {
+  uint64 word;
+  memcpy(&word, p, sizeof(uint64));
+  return word;
+}
+// 32-bit counterpart of UNALIGNED_LOAD64.
+static uint32 UNALIGNED_LOAD32(const char *p) {
+  uint32 word;
+  memcpy(&word, p, sizeof(uint32));
+  return word;
+}
+#if !defined(WORDS_BIGENDIAN)
+#define uint32_in_expected_order(x) (x)
+#define uint64_in_expected_order(x) (x)
+#else
+#ifdef _MSC_VER
+#include <stdlib.h>
+#define bswap_32(x) _byteswap_ulong(x)
+#define bswap_64(x) _byteswap_uint64(x)
+#elif defined(__APPLE__)
+// Mac OS X / Darwin features
+#include <libkern/OSByteOrder.h>
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+#else
+#include <byteswap.h>
+#endif
+#define uint32_in_expected_order(x) (bswap_32(x))
+#define uint64_in_expected_order(x) (bswap_64(x))
+#endif  // WORDS_BIGENDIAN
+#if !defined(LIKELY)
+#if HAVE_BUILTIN_EXPECT
+#define LIKELY(x) (__builtin_expect(!!(x), 1))
+#else
+#define LIKELY(x) (x)
+#endif
+#endif
+// Loads 8 bytes at p and passes them through uint64_in_expected_order
+// (the identity unless WORDS_BIGENDIAN is defined).
+static uint64 Fetch64(const char *p) {
+  const uint64 raw = UNALIGNED_LOAD64(p);
+  return uint64_in_expected_order(raw);
+}
+// Loads 4 bytes at p and passes them through uint32_in_expected_order.
+static uint32 Fetch32(const char *p) {
+  const uint32 raw = UNALIGNED_LOAD32(p);
+  return uint32_in_expected_order(raw);
+}
+// Some primes between 2^63 and 2^64 for various uses (the hash routines in this file use them as multipliers and as additive/xor constants).
+static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
+static const uint64 k1 = 0xb492b66fbe98f273ULL;
+static const uint64 k2 = 0x9ae16a3b2f90404fULL;
+static const uint64 k3 = 0xc949d7c7509e6557ULL;
+// Rotates the 64 bits of val to the right by shift positions.  With a
+// constant shift this normally compiles down to one instruction.
+static uint64 Rotate(uint64 val, int shift) {
+  // A zero shift would require val << 64, which is undefined; return early.
+  if (shift == 0) {
+    return val;
+  }
+  return (val >> shift) | (val << (64 - shift));
+}
+// Same right rotation as Rotate(), minus the zero check: the caller must
+// pass a non-zero shift.  That lets it compile to a single instruction on
+// x86-64 (and probably others) when both arguments are in registers.
+static uint64 RotateByAtLeast1(uint64 val, int shift) {
+  const uint64 low_part = val >> shift;
+  const uint64 high_part = val << (64 - shift);
+  return low_part | high_part;
+}
+// Folds the top 17 bits of val into its low bits: val ^ (val >> 47).
+static uint64 ShiftMix(uint64 val) {
+  const uint64 folded = val >> 47;
+  return val ^ folded;
+}
+// Hashes the two 64-bit words u and v down to one by packing them into a
+// uint128 (u first, v second) and calling Hash128to64.
+static uint64 HashLen16(uint64 u, uint64 v) {
+  const uint128 packed(u, v);
+  return Hash128to64(packed);
+}
+static uint64 HashLen0to16(const char *s, size_t len) {  // Hashes s[0..len) for len in 0..16; each branch below covers one length range.
+  if (len > 8) {  // 9..16 bytes: first 8 and last 8 bytes (the reads overlap when len < 16).
+    uint64 a = Fetch64(s);
+    uint64 b = Fetch64(s + len - 8);
+    return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b;  // len is 9..16 here, so the rotate amount is never 0.
+  }
+  if (len >= 4) {  // 4..8 bytes: first 4 and last 4 bytes.
+    uint64 a = Fetch32(s);
+    return HashLen16(len + (a << 3), Fetch32(s + len - 4));
+  }
+  if (len > 0) {  // 1..3 bytes: first byte, byte at index len/2, last byte.
+    uint8 a = s[0];
+    uint8 b = s[len >> 1];
+    uint8 c = s[len - 1];
+    uint32 y = static_cast<uint32>(a) + (static_cast<uint32>(b) << 8);
+    uint32 z = len + (static_cast<uint32>(c) << 2);
+    return ShiftMix(y * k2 ^ z * k3) * k2;
+  }
+  return k2;  // Empty input: s is never dereferenced.
+}
+// This probably works well for 16-byte strings as well, but it may be overkill
+// in that case.  CityHash64 calls it for 17 <= len <= 32.
+static uint64 HashLen17to32(const char *s, size_t len) {
+  uint64 a = Fetch64(s) * k1;  // Bytes 0..7.
+  uint64 b = Fetch64(s + 8);  // Bytes 8..15.
+  uint64 c = Fetch64(s + len - 8) * k2;  // Last 8 bytes.
+  uint64 d = Fetch64(s + len - 16) * k0;  // The 8 bytes before those; overlaps the first 16 bytes when len < 32.
+  return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,
+                   a + Rotate(b ^ k3, 20) - c + len);
+}
+// Return a 16-byte hash for 48 bytes.  Quick and dirty.
+// Callers do best to use "random-looking" values for a and b.
+// The 48 bytes are the four data words w, x, y, z plus the two seeds a, b.
+static pair<uint64, uint64> WeakHashLen32WithSeeds(
+    uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) {
+  a += w;
+  b = Rotate(b + a + z, 21);
+  uint64 c = a;  // Snapshot of a (seed + w) before x and y are folded in.
+  a += x;
+  a += y;
+  b += Rotate(a, 44);
+  return make_pair(a + z, b + c);  // first = a + w + x + y + z (mod 2^64).
+}
+// Convenience overload: reads the four 64-bit words s[0..31] and forwards
+// them, together with seeds a and b, to the six-argument version above.
+static pair<uint64, uint64> WeakHashLen32WithSeeds(
+    const char* s, uint64 a, uint64 b) {
+  const uint64 word0 = Fetch64(s);
+  const uint64 word1 = Fetch64(s + 8);
+  const uint64 word2 = Fetch64(s + 16);
+  const uint64 word3 = Fetch64(s + 24);
+  return WeakHashLen32WithSeeds(word0, word1, word2, word3, a, b);
+}
+// Return an 8-byte hash for 33 to 64 bytes.  The same mixing steps run twice: once anchored at the start of s, once at its end.
+static uint64 HashLen33to64(const char *s, size_t len) {
+  uint64 z = Fetch64(s + 24);
+  uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;
+  uint64 b = Rotate(a + z, 52);
+  uint64 c = Rotate(a, 37);
+  a += Fetch64(s + 8);
+  c += Rotate(a, 7);
+  a += Fetch64(s + 16);
+  uint64 vf = a + z;  // (vf, vs): result of the pass anchored at the front.
+  uint64 vs = b + Rotate(a, 31) + c;
+  a = Fetch64(s + 16) + Fetch64(s + len - 32);  // Second pass: same steps over the last 32 bytes.
+  z = Fetch64(s + len - 8);
+  b = Rotate(a + z, 52);
+  c = Rotate(a, 37);
+  a += Fetch64(s + len - 24);
+  c += Rotate(a, 7);
+  a += Fetch64(s + len - 16);
+  uint64 wf = a + z;  // (wf, ws): result of the pass anchored at the back.
+  uint64 ws = b + Rotate(a, 31) + c;
+  uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);
+  return ShiftMix(r * k0 + vs) * k2;
+}
+uint64 CityHash64(const char *s, size_t len) {  // 64-bit hash of s[0..len); inputs of up to 64 bytes go to the fixed-range helpers.
+  if (len <= 32) {
+    if (len <= 16) {
+      return HashLen0to16(s, len);
+    } else {
+      return HashLen17to32(s, len);
+    }
+  } else if (len <= 64) {
+    return HashLen33to64(s, len);
+  }
+  // For strings over 64 bytes we hash the end first, and then as we
+  // loop we keep 56 bytes of state: v, w, x, y, and z.
+  uint64 x = Fetch64(s + len - 40);
+  uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
+  uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
+  pair<uint64, uint64> v = WeakHashLen32WithSeeds(s + len - 64, len, z);
+  pair<uint64, uint64> w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x);
+  x = x * k1 + Fetch64(s);
+  // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
+  len = (len - 1) & ~static_cast<size_t>(63);  // len > 64 here, so this is at least 64 and the loop below runs at least once.
+  do {
+    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
+    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+    x ^= w.second;
+    y += v.first + Fetch64(s + 40);
+    z = Rotate(z + w.first, 33) * k1;
+    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
+    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
+    std::swap(z, x);
+    s += 64;  // Advance to the next 64-byte chunk.
+    len -= 64;
+  } while (len != 0);
+  return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
+                   HashLen16(v.second, w.second) + x);
+}
+// Single-seed variant: uses k2 as the first seed and the caller's value as
+// the second.
+uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) {
+  const uint64 fixed_seed0 = k2;
+  return CityHash64WithSeeds(s, len, fixed_seed0, seed);
+}
+// Two-seed variant: subtracts seed0 from the unseeded hash, then mixes the
+// difference with seed1 through HashLen16.
+uint64 CityHash64WithSeeds(const char *s, size_t len,
+                           uint64 seed0, uint64 seed1) {
+  const uint64 unseeded = CityHash64(s, len);
+  return HashLen16(unseeded - seed0, seed1);
+}
+// A subroutine for CityHash128().  Returns a decent 128-bit hash for strings
+// of any length representable in signed long.  Based on City and Murmur.
+static uint128 CityMurmur(const char *s, size_t len, uint128 seed) {
+  uint64 a = Uint128Low64(seed);
+  uint64 b = Uint128High64(seed);
+  uint64 c = 0;
+  uint64 d = 0;
+  signed long l = len - 16;  // Signed on purpose: inputs of 16 bytes or fewer give l <= 0.
+  if (l <= 0) {  // len <= 16
+    a = ShiftMix(a * k1) * k1;
+    c = b * k1 + HashLen0to16(s, len);
+    d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c));  // s is only read directly when at least 8 bytes exist.
+  } else {  // len > 16
+    c = HashLen16(Fetch64(s + len - 8) + k1, a);
+    d = HashLen16(b + len, c + Fetch64(s + len - 16));
+    a += d;
+    do {  // Consumes 16 bytes per iteration until l drops to 0 or below.
+      a ^= ShiftMix(Fetch64(s) * k1) * k1;
+      a *= k1;
+      b ^= a;
+      c ^= ShiftMix(Fetch64(s + 8) * k1) * k1;
+      c *= k1;
+      d ^= c;
+      s += 16;
+      l -= 16;
+    } while (l > 0);
+  }
+  a = HashLen16(a, c);
+  b = HashLen16(d, b);
+  return uint128(a ^ b, HashLen16(b, a));
+}
+uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) {  // 128-bit hash of s[0..len) mixed with seed.
+  if (len < 128) {
+    return CityMurmur(s, len, seed);  // Inputs under 128 bytes are handled entirely by CityMurmur.
+  }
+  // We expect len >= 128 to be the common case.  Keep 56 bytes of state:
+  // v, w, x, y, and z.
+  pair<uint64, uint64> v, w;
+  uint64 x = Uint128Low64(seed);
+  uint64 y = Uint128High64(seed);
+  uint64 z = len * k1;
+  v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s);
+  v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8);
+  w.first = Rotate(y + z, 35) * k1 + x;
+  w.second = Rotate(x + Fetch64(s + 88), 53) * k1;
+  // This is the same inner loop as CityHash64(), manually unrolled.
+  do {
+    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
+    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+    x ^= w.second;
+    y += v.first + Fetch64(s + 40);
+    z = Rotate(z + w.first, 33) * k1;
+    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
+    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
+    std::swap(z, x);
+    s += 64;  // End of the first unrolled 64-byte step.
+    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
+    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+    x ^= w.second;
+    y += v.first + Fetch64(s + 40);
+    z = Rotate(z + w.first, 33) * k1;
+    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
+    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
+    std::swap(z, x);
+    s += 64;
+    len -= 128;  // Each iteration consumes 128 bytes; after the loop len is the unprocessed tail (< 128).
+  } while (LIKELY(len >= 128));
+  x += Rotate(v.first + z, 49) * k0;
+  z += Rotate(w.first, 37) * k0;
+  // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.
+  for (size_t tail_done = 0; tail_done < len; ) {
+    tail_done += 32;
+    y = Rotate(x + y, 42) * k0 + v.second;
+    w.first += Fetch64(s + len - tail_done + 16);  // When len is not a multiple of 32 the last chunk starts before s, i.e. inside bytes the main loop already consumed.
+    x = x * k0 + w.first;
+    z += w.second + Fetch64(s + len - tail_done);
+    w.second += v.first;
+    v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second);
+  }
+  // At this point our 56 bytes of state should contain more than
+  // enough information for a strong 128-bit hash.  We use two
+  // different 56-byte-to-8-byte hashes to get a 16-byte final result.
+  x = HashLen16(x, v.first);
+  y = HashLen16(y + z, w.first);
+  return uint128(HashLen16(x + v.second, w.second) + y,
+                 HashLen16(x + w.second, y + v.second));
+}
+uint128 CityHash128(const char *s, size_t len) {  // Unseeded 128-bit hash: derives a seed from the input and delegates to CityHash128WithSeed.
+  if (len >= 16) {  // First 16 bytes become the seed; the remainder is hashed.
+    return CityHash128WithSeed(s + 16,
+                               len - 16,
+                               uint128(Fetch64(s) ^ k3,
+                                       Fetch64(s + 8)));
+  } else if (len >= 8) {  // 8..15 bytes: the whole input goes into the seed and an empty string is hashed.
+    return CityHash128WithSeed(NULL,
+                               0,
+                               uint128(Fetch64(s) ^ (len * k0),
+                                       Fetch64(s + len - 8) ^ k1));
+  } else {  // Fewer than 8 bytes: fixed seed (k0, k1).
+    return CityHash128WithSeed(s, len, uint128(k0, k1));
+  }
+}
+#ifdef __SSE4_2__
+#include <citycrc.h>
+#include <nmmintrin.h>
+// Requires len >= 240.  Writes four 64-bit words to result[0..3].
+static void CityHashCrc256Long(const char *s, size_t len,
+                               uint32 seed, uint64 *result) {
+  uint64 a = Fetch64(s + 56) + k0;
+  uint64 b = Fetch64(s + 96) + k0;
+  uint64 c = result[0] = HashLen16(b, len);  // result[0] and result[1] double as scratch storage until the final mixing.
+  uint64 d = result[1] = Fetch64(s + 120) * k0 + len;
+  uint64 e = Fetch64(s + 184) + seed;
+  uint64 f = seed;
+  uint64 g = 0;
+  uint64 h = 0;
+  uint64 i = 0;
+  uint64 j = 0;
+  uint64 t = c + d;
+  // 240 bytes of input per iter.
+  size_t iters = len / 240;
+  len -= iters * 240;  // len now holds the tail left after the whole 240-byte iterations.
+  do {
+#define CHUNK(multiplier, z)                                    \
+    {                                                           \
+      uint64 old_a = a;                                         \
+      a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s);          \
+      b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8);      \
+      c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16);     \
+      d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24);     \
+      e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32);     \
+      t = old_a;                                                \
+    }                                                           \
+    f = _mm_crc32_u64(f, a);                                    \
+    g = _mm_crc32_u64(g, b);                                    \
+    h = _mm_crc32_u64(h, c);                                    \
+    i = _mm_crc32_u64(i, d);                                    \
+    j = _mm_crc32_u64(j, e);                                    \
+    s += 40
+    CHUNK(1, 1); CHUNK(k0, 0);  // Each CHUNK consumes 40 bytes and advances s; six of them make one 240-byte iteration.
+    CHUNK(1, 1); CHUNK(k0, 0);
+    CHUNK(1, 1); CHUNK(k0, 0);
+  } while (--iters > 0);
+  while (len >= 40) {  // Whole 40-byte chunks of the tail.
+    CHUNK(k0, 0);
+    len -= 40;
+  }
+  if (len > 0) {  // Final partial chunk: step back so that the last 40 bytes of the input are read.
+    s = s + len - 40;
+    CHUNK(k0, 0);
+  }
+  j += i << 32;
+  a = HashLen16(a, j);
+  h += g << 32;
+  b += h;
+  c = HashLen16(c, f) + i;
+  d = HashLen16(d, e + result[0]);
+  j += e;
+  i += HashLen16(h, t);
+  e = HashLen16(a, d) + j;
+  f = HashLen16(b, c) + a;
+  g = HashLen16(j, i) + c;
+  result[0] = e + f + g + h;
+  a = ShiftMix((a + g) * k0) * k0 + b;
+  result[1] += a + result[0];
+  a = ShiftMix(a * k0) * k0 + c;
+  result[2] = a + result[1];
+  a = ShiftMix((a + e) * k0) * k0;
+  result[3] = a + result[2];
+}
+// Requires len < 240.  Pads the input with zero bytes to exactly 240 bytes
+// and hashes the padded copy, seeding with the bitwise complement of len.
+static void CityHashCrc256Short(const char *s, size_t len, uint64 *result) {
+  char padded[240];
+  memset(padded, 0, sizeof(padded));
+  memcpy(padded, s, len);
+  const uint32 seed = ~static_cast<uint32>(len);
+  CityHashCrc256Long(padded, 240, seed, result);
+}
+// 256-bit hash of s[0..len), written to result[0..3].  Inputs of at least
+// 240 bytes take the long path with seed 0; shorter ones are padded first.
+void CityHashCrc256(const char *s, size_t len, uint64 *result) {
+  if (!LIKELY(len >= 240)) {
+    CityHashCrc256Short(s, len, result);
+    return;
+  }
+  CityHashCrc256Long(s, len, 0, result);
+}
+// Seeded 128-bit hash.  Inputs of at most 900 bytes use CityHash128WithSeed;
+// longer ones are reduced from the 256-bit CRC-based hash and the seed.
+uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed) {
+  if (len <= 900) {
+    return CityHash128WithSeed(s, len, seed);
+  }
+  uint64 crc256[4];
+  CityHashCrc256(s, len, crc256);
+  const uint64 u = Uint128High64(seed) + crc256[0];
+  const uint64 v = Uint128Low64(seed) + crc256[1];
+  const uint64 first = HashLen16(u, v + crc256[2]);
+  const uint64 second = HashLen16(Rotate(v, 32), u * k0 + crc256[3]);
+  return uint128(first, second);
+}
+// Unseeded 128-bit hash.  Inputs of at most 900 bytes use CityHash128;
+// longer ones return words 2 and 3 of the 256-bit CRC-based hash.
+uint128 CityHashCrc128(const char *s, size_t len) {
+  if (len <= 900) {
+    return CityHash128(s, len);
+  }
+  uint64 crc256[4];
+  CityHashCrc256(s, len, crc256);
+  return uint128(crc256[2], crc256[3]);
+}
+#endif

basic/city.h ADDED Viewed

	@@ -0,0 +1,90 @@

+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file provides a few functions for hashing strings. On x86-64
+// hardware in 2011, CityHash64() is faster than other high-quality
+// hash functions, such as Murmur.  This is largely due to higher
+// instruction-level parallelism.  CityHash64() and CityHash128() also perform
+// well on hash-quality tests.
+//
+// CityHash128() is optimized for relatively long strings and returns
+// a 128-bit hash.  For strings more than about 2000 bytes it can be
+// faster than CityHash64().
+//
+// Functions in the CityHash family are not suitable for cryptography.
+//
+// WARNING: This code has not been tested on big-endian platforms!
+// It is known to work well on little-endian platforms that have a small penalty
+// for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.
+//
+// By the way, for some hash functions, given strings a and b, the hash
+// of a+b is easily derived from the hashes of a and b.  This property
+// doesn't hold for any hash functions in this file.
+#ifndef CITY_HASH_H_
+#define CITY_HASH_H_
+#include <stdlib.h>  // for size_t.
+#include <stdint.h>
+#include <utility>
+typedef uint8_t uint8;
+typedef uint32_t uint32;
+typedef uint64_t uint64;
+typedef std::pair<uint64, uint64> uint128;
+// The low 64 bits of a uint128 are stored in the pair's first member.
+inline uint64 Uint128Low64(const uint128& x) {
+  return x.first;
+}
+// The high 64 bits of a uint128 are stored in the pair's second member.
+inline uint64 Uint128High64(const uint128& x) {
+  return x.second;
+}
+// Hash function for a byte array.
+uint64 CityHash64(const char *buf, size_t len);
+// Hash function for a byte array.  For convenience, a 64-bit seed is also
+// hashed into the result.
+uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed);
+// Hash function for a byte array.  For convenience, two seeds are also
+// hashed into the result.
+uint64 CityHash64WithSeeds(const char *buf, size_t len,
+                           uint64 seed0, uint64 seed1);
+// Hash function for a byte array.
+uint128 CityHash128(const char *s, size_t len);
+// Hash function for a byte array.  For convenience, a 128-bit seed is also
+// hashed into the result.
+uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed);
+// Reduces a 128-bit value to 64 bits.  Meant to be a reasonably good hash
+// function in its own right.
+inline uint64 Hash128to64(const uint128& x) {
+  // Murmur-inspired hashing.
+  const uint64 kMul = 0x9ddfea08eb382d69ULL;
+  const uint64 lo = Uint128Low64(x);
+  const uint64 hi = Uint128High64(x);
+  uint64 a = (lo ^ hi) * kMul;
+  a ^= (a >> 47);
+  uint64 b = (hi ^ a) * kMul;
+  b ^= (b >> 47);
+  return b * kMul;
+}
+#endif  // CITY_HASH_H_

basic/hard-ofstream.h ADDED Viewed

	@@ -0,0 +1,32 @@

+#ifndef __HARD_OFSTREAM_H__
+#define __HARD_OFSTREAM_H__
+// On AFS, flushing a file writes it to the local disk but not AFS.
+// Hard flushing ensures that the file will be written, by closing
+// and re-opening the file.
+#include <fstream>
+#include <string>
+using namespace std;
+// An ofstream that remembers the path it was opened with, so that
+// hard_flush() can close it and re-open the same file in append mode.
+class hard_ofstream : public ofstream {
+public:
+  hard_ofstream() { }
+  hard_ofstream(const char *file, ofstream::openmode mode = ofstream::trunc) {
+    this->open(file, mode);
+  }
+  // Opens file with the given mode and records its name.  The name is stored
+  // after the open call because hard_flush() passes a pointer into the stored
+  // string itself.
+  void open(const char *file, ofstream::openmode mode = ofstream::trunc) {
+    ofstream::open(file, mode);
+    this->file = file;
+  }
+  // Closes the stream, then re-opens the recorded file for appending.
+  void hard_flush() {
+    ofstream::close();
+    this->open(this->file.c_str(), ofstream::app);
+  }
+private:
+  string file;  // Path given to the most recent open().
+};
+#endif

basic/indent.cc ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ #include "indent.h"
2	+
3	+ #include "opt.h"

basic/indent.h ADDED Viewed

	@@ -0,0 +1,18 @@

+#ifndef __INDENT_H__
+#define __INDENT_H__
+#include <iostream>
+using namespace std;
+// Value type for indentation: streaming an Indent writes two spaces per level.
+struct Indent {
+  int level;
+  Indent(int level) : level(level) { }
+};
+// Writes ind.level copies of "  " to out; nothing for a level of 0 or less.
+inline ostream &operator<<(ostream &out, const Indent &ind) {
+  int remaining = ind.level;
+  while(remaining-- > 0) out << "  ";
+  return out;
+}
+#endif

basic/lisp.cc ADDED Viewed

	@@ -0,0 +1,129 @@

+#include "lisp.h"
+#include "std.h"
+#include "indent.h"
+// Recursively frees every descendant of this node; the node itself is left
+// for the caller to delete.  The child list is emptied afterwards so that a
+// repeated call, or a later print(), never touches freed pointers.
+void LispNode::destroy() {
+  for(size_t i = 0; i < children.size(); i++) {
+    LispNode *node = children[i];
+    node->destroy();
+    delete node;
+  }
+  children.clear();
+}
+// Prints this node's value, or "(empty)" when it has none, indented by ind
+// levels, followed by each child one level deeper.
+void LispNode::print(intIndex ind) const {
+  cout << Indent(ind);
+  if(value.empty())
+    cout << "(empty)";
+  else
+    cout << value;
+  cout << endl;
+  forvec(_, LispNode *, child, children)
+    child->print(ind+1);
+}
+////////////////////////////////////////////////////////////
+// Frees the whole tree.  The constructor leaves root NULL until a tree has
+// been read, so a LispTree that was never read has nothing to free and must
+// not be dereferenced.
+LispTree::~LispTree() {
+  if(root == NULL) return;
+  root->destroy();
+  delete root;
+  root = NULL;
+}
+// True when c is one of the four bracket characters ( ) [ ].
+bool is_paren(char c) {
+  switch(c) {
+    case '(': case ')': case '[': case ']':
+      return true;
+    default:
+      return false;
+  }
+}
+// True when s consists of exactly one bracket character.
+bool is_paren(string s) {
+  return s.size() == 1 && is_paren(s[0]);
+}
+// True when s is exactly "(" or "[".
+bool is_left_paren(string s) {
+  if(s.size() != 1) return false;
+  return s[0] == '(' || s[0] == '[';
+}
+// True when s is exactly ")" or "]".
+bool is_right_paren(string s) {
+  if(s.size() != 1) return false;
+  return s[0] == ')' || s[0] == ']';
+}
+// Closing bracket that pairs with the opening bracket c, as a string;
+// empty when c is not an opening bracket.
+string matching_right_paren(char c) {
+  switch(c) {
+    case '(': return ")";
+    case '[': return "]";
+    default:  return "";
+  }
+}
+// Consumes leading whitespace from in and returns the first non-space
+// character without consuming it.  At end of input the value of EOF narrowed
+// to char is returned, as before.
+char skip_space(istream &in) {
+  while(true) {
+    const int next = in.peek();
+    // isspace() is only defined for EOF and unsigned-char values.  peek()
+    // returns exactly that range, so test the int rather than a char that
+    // may be negative for bytes >= 0x80.
+    if(next == EOF || !isspace(next)) return static_cast<char>(next);
+    in.get();
+  }
+}
+// Comments start with # and end with the line.
+// There must be a space before the #.
+// Skips whitespace and whole comments, and returns the first character that
+// belongs to neither, left unread on the stream.
+char skip_comments(istream &in) {
+  while(true) {
+    char c = skip_space(in);
+    if(c != '#') return c;
+    // Discard the rest of the comment line.  Also stop at end of input:
+    // peek() keeps returning EOF there, so waiting only for '\n' would spin
+    // forever on a comment that ends the file without a newline.
+    for(int next = in.peek(); next != EOF && next != '\n'; next = in.peek())
+      in.get();
+  }
+}
+// Reads the next token from in into s: either a single bracket character or
+// a maximal run of characters that are neither whitespace nor brackets.
+// Returns false only when the input ends before any token character is read.
+bool LispTree::read_token(istream &in, string &s) {
+  char c = skip_comments(in);
+  if(is_paren(c)) {
+    s = static_cast<char>(in.get());
+    return true;
+  }
+  s = "";
+  while(true) {
+    // Keep the int from peek(): narrowed to char, EOF cannot be told apart
+    // from byte 0xFF where char is signed and never matches where char is
+    // unsigned.
+    const int next = in.peek();
+    // A last token with no whitespace after it still counts as a token.
+    if(next == EOF) return !s.empty();
+    if(isspace(next) || is_paren(static_cast<char>(next))) break;
+    s += static_cast<char>(in.get());
+  }
+  return true;
+}
+LispNode *LispTree::read_node(const vector<string> &tokens, intIndex &i) {  // Builds the node that starts at tokens[i] and advances i past it; the caller owns the result.
+  LispNode *node = new LispNode();
+  assert(i < len(tokens));
+  string s = tokens[i++];
+  if(is_left_paren(s)) {
+    char left_paren = s[0];
+    if(left_paren == '(') {  // "(" must be followed by the node's value; "[" opens a node with an empty value.
+      assert(i < len(tokens) && !is_paren(tokens[i]));
+      node->value = tokens[i++];
+    }
+    while(i < len(tokens) && !is_right_paren(tokens[i])) {  // Everything up to the closing bracket is a child.
+      node->children.push_back(read_node(tokens, i));
+    }
+    assert(i < len(tokens));
+    s = tokens[i++];
+    assert(s == matching_right_paren(left_paren));  // Bracket kinds must match: ( with ), [ with ].
+  }
+  else if(is_right_paren(s))
+    assert(false);  // A closing bracket cannot start a node.
+  else
+    node->value = s;  // Bare token: leaf node.
+  return node;
+}
+// Parses file into a tree and stores it in root, freeing any tree left over
+// from an earlier read() first.
+void LispTree::read(const char *file) {
+  ifstream in(file);
+  // Fail here with a clear condition instead of on the empty token list below.
+  assert(in.is_open());
+  vector<string> tokens;
+  string token;
+  while(read_token(in, token))
+    tokens.push_back(token);
+  if(root != NULL) {  // Do not leak the previously read tree.
+    root->destroy();
+    delete root;
+    root = NULL;
+  }
+  intIndex i = 0;
+  root = read_node(tokens, i);
+  assert(i == len(tokens));  // Every token must belong to the single root node.
+}
+void LispTree::print() const {  // Prints the whole tree starting at indent level 0.
+  assert(root);  // root is NULL until a tree has been read.
+  root->print(0);
+}

basic/lisp.h ADDED Viewed

	@@ -0,0 +1,34 @@

+#ifndef __LISP_H__
+#define __LISP_H__
+#include <vector>
+#include <string>
+#include "std.h"
+using namespace std;
+////////////////////////////////////////////////////////////
+struct LispNode {  // One node of a parsed tree: a value plus child nodes.
+  void destroy();  // Recursively destroys and deletes all children (not this node).
+  void print(intIndex ind) const;  // Prints this subtree to cout, indented by ind levels.
+  string value;  // Token text; may be empty (printed as "(empty)").
+  vector<LispNode *> children;  // Heap-allocated children, released by destroy().
+};
+////////////////////////////////////////////////////////////
+struct LispTree {  // Owns a LispNode tree read from a file.
+  LispTree() : root(NULL) { }
+  ~LispTree();  // NOTE(review): the implicit copy constructor and assignment copy root, so copying a LispTree would free the tree twice — confirm no caller copies.
+  bool read_token(istream &in, string &s);  // Reads one token from in into s.
+  LispNode *read_node(const vector<string> &tokens, intIndex &i);  // Builds the node starting at tokens[i], advancing i past it.
+  void read(const char *file);  // Tokenizes file and stores the parsed tree in root.
+  void print() const;  // Prints the tree from root; asserts that root is set.
+  LispNode *root;  // NULL until read() has been called.
+};
+#endif

basic/logging.cc ADDED Viewed

	@@ -0,0 +1,145 @@

+#include "logging.h"
+#include "opt.h"
+#include "mem.h"
+// The logging output has a tree structure, where each node is a
+// line of output, and the depth of a node is its indent level.
+// A run is the sequence of children of some node.
+// A subset of the lines in the run will get printed.
+////////////////////////////////////////////////////////////
+// Puts the run back into its initial state and starts its timer.
+void Run::init() {
+  print_all_lines = false;
+  next_line_to_print = 0;
+  num_lines_printed = 0;
+  num_lines = 0;
+  timer.start();
+}
+// Ends the run and stops its timer.
+void Run::finish() {
+  timer.stop();
+  // No line number ever equals -1, so print() is false from now on.  This
+  // marks the run as not printed; otherwise logss might think its parent
+  // was printed when it really wasn't.
+  next_line_to_print = -1;
+}
+// Accounts for one more line in this run.  Returns whether that line is to be
+// printed, and if so schedules the index of the next line to print.
+bool Run::new_line() {
+  const bool printing = print();
+  ++num_lines;
+  if(!printing) return false;
+  ++num_lines_printed;
+  // This line is printed; decide which line is printed next.
+  const int ms_per_line = log_info.ms_per_line;
+  const bool every_line = num_lines <= 2 ||   // First few lines are always printed.
+                          ms_per_line == 0 || // Throttling is off.
+                          print_all_lines;    // This run prints everything.
+  if(every_line) {
+    ++next_line_to_print;
+    return true;
+  }
+  timer.stop();
+  if(timer.ms == 0) {
+    // No time has elapsed: exponentially increase the gap between lines.
+    next_line_to_print *= 2;
+  }
+  else {
+    // Skip ahead by the number of lines that fit into ms_per_line at the
+    // rate observed so far, but by at least one line.
+    const int stride = int((double)num_lines * ms_per_line / timer.ms);
+    next_line_to_print += max(stride, 1);
+  }
+  return true;
+}
+////////////////////////////////////////////////////////////
+// Global information about logging.
+// Sets the default parameters, allocates 128 run slots (one per indent
+// level) and starts the program-wide timer.
+LogInfo::LogInfo() {
+  // Parameters.
+  max_ind_level = 3;
+  ms_per_line   = 0; //1000; // 1 second
+  // State.
+  buf       = "";
+  ind_level = 0;
+  runs.resize(128);
+  timer.start();
+}
+LogInfo::~LogInfo() {  // Flushes any buffered log output before the stream member is destroyed.
+  out.flush();
+}
+// Opens the log stream: the file named by log_file when one is set (after
+// announcing it on cout), otherwise /dev/stdout.
+void LogInfo::init() {
+  if (!log_file.empty()) {
+    cout << "Logging to " << log_file << endl;
+    out.open(log_file.c_str());
+    return;
+  }
+  out.open("/dev/stdout");
+}
+LogInfo log_info;
+////////////////////////////////////////////////////////////
+// LogTracker:: For tracking functions or blocks.
+void LogTracker::begin(bool print_all_lines) {  // Opens a tracked block: writes its header line if the current run is printing, then goes one indent level deeper.
+  if(_ind_within) {  // NOTE(review): _ind_within is defined outside this view; presumably an indent-limit check — confirm.
+    if(log_info.this_run().print()) {
+      const string &s = descrip.str();
+      _logs(name);
+      if(s.size() > 0 && name[0])  // Separator only when both a name and a description are present.
+        lout << ": ";
+      lout << s;
+      lout.flush();
+      log_info.buf = " {\n"; // Open the block.
+      log_info.child_run().init();
+      log_info.child_run().print_all_lines = print_all_lines;
+    }
+    else {
+      log_info.max_ind_level = -log_info.max_ind_level; // Prevent children from outputting.
+      output_stopped = true;  // Tells the destructor to flip max_ind_level back.
+    }
+  }
+  log_info.ind_level++;
+}
+LogTracker::~LogTracker() {  // Closes the tracked block: undoes begin()'s indent change and, if the block was printed, writes its footer and timing.
+  log_info.ind_level--;
+  if(output_stopped)
+    log_info.max_ind_level = -log_info.max_ind_level; // Restore indent level.
+  if(_ind_within) {
+    if(log_info.this_run().new_line()) {
+      // Finish up child level.
+      log_info.ind_level++;
+      int n = log_info.this_run().num_omitted();  // ind_level was just raised, so this_run() is the child run here.
+      if(n > 0)
+        _logs("... " << n << " lines omitted ...\n");
+      log_info.ind_level--;
+      log_info.child_run().finish();
+      if(log_info.buf[0]) // Nothing was printed, because buf hasn't been emptied.
+        log_info.buf = ""; // Just pretend we didn't open the block.
+      else // Something indented was printed.
+        _logs("}"); // Close the block.
+      // Print time
+      Timer &ct = log_info.child_run().timer;
+      lout << " [" << ct;
+      if(log_info.ind_level > 0) {  // Nested blocks also report the enclosing run's time so far.
+        Timer &tt = log_info.this_run().timer;
+        tt.stop();
+        lout << ", cumulative " << tt;
+      }
+      lout << "]\n";
+    }
+  }
+}
+// Options for logging.  Each global below is initialized by an opt_define_*_wrap call that is handed the option name, a pointer to the log_info field, that field's current value and a help string; NOTE(review): the wrappers live in opt.h, outside this view — presumably they register the option at static-initialization time, and the meaning of the final `false` is defined there.
+int _log_info_max_ind_level = opt_define_int_wrap("max-ind-level", &log_info.max_ind_level, log_info.max_ind_level, "Maximum indent level for logging", false);
+int _log_info_ms_per_line = opt_define_int_wrap("ms-per-line", &log_info.ms_per_line, log_info.ms_per_line, "Print a line out every this many milliseconds", false);
+string _log_info_log_file = opt_define_string_wrap("log", &log_info.log_file, log_info.log_file, "File to write log to (\"\" for stdout)", false);

basic/logging.h ADDED Viewed

	@@ -0,0 +1,122 @@

+#ifndef __LOGGING_H__
+#define __LOGGING_H__
+#include "std.h"
+#include "mem.h"
+#include "timer.h"
+#include "indent.h"
+////////////////////////////////////////////////////////////
+// State associated with a run.
+// One Run exists per indent level (see LogInfo::runs) and decides which of the
+// lines logged at that level are actually printed.
+struct Run {
+  Run() { init(); }
+  // True when the line about to be logged is the next one scheduled to print.
+  bool print() const { return num_lines == next_line_to_print; }
+  // Number of lines seen so far that were not printed.
+  int num_omitted() { return num_lines - num_lines_printed; }
+  // Defined in logging.cc; its result gates printing in logs().
+  bool new_line();
+  void init();
+  void finish();
+  int num_lines;          // Number of lines that we've gone through so far in this run.
+  int num_lines_printed;  // Number of lines actually printed.
+  int next_line_to_print; // Next line to be printed (lines are 0-based).
+  Timer timer;            // Keeps track of time spent on this run.
+  bool print_all_lines;   // Whether or not to force the printing of each line.
+};
+////////////////////////////////////////////////////////////
+// Global information about logging.
+// Single global logging state (see log_info below): user parameters, the
+// output stream and one Run per indent level.
+struct LogInfo {
+  LogInfo();
+  ~LogInfo();
+  void init();
+  void hard_flush() { out.flush(); }
+  // Accessors relative to the current indent level.  No bounds checking is
+  // done here: parent_run() at ind_level 0 would index runs[-1].
+  Run &parent_run() { return runs[ind_level-1]; }
+  Run &this_run()   { return runs[ind_level]; }
+  Run &child_run()  { return runs[ind_level+1]; }
+  // Parameters.
+  int max_ind_level; // Maximum indent level.
+  int ms_per_line;   // Number of milliseconds between consecutive lines of output.
+  string log_file;
+  // State.
+  ofstream out;
+  int ind_level; // Current indent level.
+  const char *buf; // The buffer to be flushed out the next time _logs is called.
+  vector<Run> runs; // Indent level -> state
+  Timer timer; // Timer that starts at the beginning of the program
+};
+extern LogInfo log_info;
+////////////////////////////////////////////////////////////
+// The stream every logging macro writes to.
+#define lout (log_info.out)
+// Debugging aid: prints the current source location.
+#define here lout << "HERE " << __FILE__ << ':' << __LINE__ << endl
+// Whether the current (resp. parent) indent level may still produce output.
+#define _ind_within        (log_info.ind_level   <= log_info.max_ind_level)
+#define _parent_ind_within (log_info.ind_level-1 <= log_info.max_ind_level)
+// Unconditional output: first emits the pending buffer (e.g. the " {\n" queued
+// by LogTracker::begin), then the indentation, then x; clears the buffer.
+#define _logs(x) \
+  do { lout << log_info.buf << Indent(log_info.ind_level) << x; log_info.buf = ""; } while(0)
+// Normal logging: prints x on its own line if the indent level is within the
+// limit and the current run accepts the line.
+#define logs(x) \
+  do { \
+    if(_ind_within && log_info.this_run().new_line()) { \
+      _logs(x << endl); \
+    } \
+  } while(0)
+// Output something if parent outputted something.
+// Subtle note: parent must have been a track, not logs, so its run
+// information has not been updated yet until it closes.
+// Therefore, calling print() on it is valid.
+#define logss(x) \
+  do { \
+    if(_parent_ind_within && log_info.parent_run().print()) { \
+      log_info.this_run().new_line(); \
+      _logs(x << endl); \
+    } \
+  } while(0)
+// Always prints x on its own line, bypassing the indent limit and run state.
+#define LOGS(x) _logs(x << endl)
+////////////////////////////////////////////////////////////
+// For tracking functions or blocks.
+// Scope guard behind the track()/track_block() macros: begin() opens a log
+// block, the destructor closes it.  The name pointer is stored as is, not
+// copied, so it must stay valid for the tracker's lifetime.
+struct LogTracker {
+  LogTracker(const char *name) : b(true), output_stopped(false), name(name) { }
+  void begin(bool print_all_lines);
+  ~LogTracker();
+  bool b; // Trick used in track_block to execute the for loop exactly once.
+  bool output_stopped; // Set by begin() when it suppressed nested output; undone in the destructor.
+  const char *name;
+  ostringstream descrip; // Description text, filled by the track macros before begin().
+};
+// track(): tracks the rest of the enclosing scope.  Expands to a declaration
+// followed by an expression, so it cannot be the sole body of an unbraced
+// if/for, and only one track() fits in a scope (the variable is always _lt).
+// The description x is only streamed when the block is going to be printed.
+#define track(name, x, all) \
+  LogTracker _lt(name); \
+  (_ind_within && log_info.this_run().print() && _lt.descrip << x), _lt.begin(all)
+// track_block(): tracks exactly the statement or block that follows; the for
+// loop runs once thanks to _lt.b.
+#define track_block(name, x, all) \
+  for(LogTracker _lt(name); \
+      _lt.b && ((_ind_within && log_info.this_run().print() && _lt.descrip << x), _lt.begin(all), true); \
+      _lt.b = false)
+// Loops in which every iteration is its own tracked block labelled "i/n".
+#define track_foridx(i, n, s, all) \
+  foridx(i, n) track_block(s, i << '/' << n, all)
+#define track_forvec(i, tx, x, vec, s, all) \
+  forvec(i, tx, x, vec) track_block(s, i << '/' << len(vec), all)
+// To be used in a scope where argc and argv are visible (it references both):
+// initializes logging, opens the "main" block and logs time, host and CPU speed.
+#define init_log \
+  log_info.init(); \
+  track("main", to_vector(argv, argc), true); \
+  logs(now() << " on " << hostname() << " (" << cpu_speed_mhz() << "MHz)");
+// Stream fragment with elapsed time and memory; mem_usage() reports kB, hence
+// the factor 1024 to get the bytes Mem expects.
+#define prog_status \
+  "PROG_STATUS: " << \
+  "time = " << log_info.timer.stop() << \
+  ", memory = " << Mem(mem_usage()*1024)
+#endif

basic/mem-tracker.cc ADDED Viewed

	@@ -0,0 +1,53 @@

+#include "mem-tracker.h"
+#include "mem.h"
+/*
+ * Currently, memory tracking is not accurate.
+ * It always underestimates the real usage.
+ */
+////////////////////////////////////////////////////////////
+// Returns the estimated memory of the object record r points to.
+// list_types(define_case) expands to one "case T_<type>:" per tracked type;
+// each case casts r.data back to that type and returns mem_usage() of it.
+long MemTracker::compute_mem_usage(const MemRecord &r) {
+  switch(r.type) {
+    list_types(define_case);
+    // T_RAWNUMBER has no case above, so callers must not pass such records.
+    default: assert(0);
+  }
+  return 0;
+}
+// Refreshes the size of every record that refers to a live object (raw-number
+// records keep the size they were registered with) and returns the sum.
+long MemTracker::compute_mem_usage() {
+  long grand_total = 0;
+  for(size_t idx = 0; idx < records.size(); idx++) {
+    MemRecord &rec = records[idx];
+    if(rec.type != T_RAWNUMBER)
+      rec.mem = compute_mem_usage(rec);
+    grand_total += rec.mem;
+  }
+  return grand_total;
+}
+// Sort predicate that puts the record using MORE memory first, i.e. sorting
+// with it yields descending order of size.
+static bool record_less_than(const MemRecord &r1, const MemRecord &r2) {
+  return r2.mem < r1.mem;
+}
+// Logs one line per record (largest first) with its size and its share of
+// the total, followed by the total.  Reorders `records` as a side effect.
+void MemTracker::report_mem_usage() {
+  track("report_mem_usage()", "", true);
+  long total_mem = compute_mem_usage();
+  sort(records.begin(), records.end(), record_less_than);
+  forvec(_, const MemRecord &, r, records) {
+    // When every record is empty the total is 0; report a share of 0 rather
+    // than the NaN that 0/0 would produce.
+    double share = total_mem > 0 ? (double)r.mem/total_mem : 0.0;
+    logs(type_names[r.type] << ' ' << r.name << ": " <<
+         Mem(r.mem) << " (" << share << ')');
+  }
+  logs("Total: " << Mem(total_mem));
+}
+////////////////////////////////////////////////////////////
+// The global tracker used by track_mem().
+MemTracker mem_tracker;
+// Printable names indexed by MemType: "?" stands for T_RAWNUMBER (enum value
+// 0), the remaining entries are generated in the same order as the enum.
+const char *MemTracker::type_names[] = {
+  "?",
+  list_types(define_str)
+};

basic/mem-tracker.h ADDED Viewed

	@@ -0,0 +1,132 @@

+#ifndef __MEM_TRACKER_H__
+#define __MEM_TRACKER_H__
+#include "std.h"
+#include "stl-basic.h"
+#include "union-set.h"
+#include "strdb.h"
+// Currently, memory tracking is not accurate.
+// It always underestimates the real usage.
+// Register objects with track_mem(); don't use anything else directly.
+// Registers x under its own source text as name.
+// NOTE(review): __STRING is not defined in this file; presumably it comes
+// from a system header -- confirm on non-glibc platforms.
+#define track_mem(x) mem_tracker.add(__STRING(x), x)
+// X-macro: the single list of trackable types.  f is applied to every entry,
+// which keeps the enum, the name table, the add() overloads and the switch
+// in compute_mem_usage() in sync.
+#define list_types(f) \
+  f(IntVec) \
+  f(IntMat) \
+  f(IntIntMap) \
+  f(IntDoubleMap) \
+  f(IntIntPairMap) \
+  f(IntPairDoubleMap) \
+  f(IntSet) \
+  f(DoubleVec) \
+  f(DoubleVecVec) \
+  f(StrVec) \
+  f(StrIntMap) \
+  f(UnionSet) \
+  f(StrDB)
+// Generators to be passed to list_types():
+// enum constant T_<type>
+#define prefix_t(type) T_##type,
+// the type name as a string literal
+#define define_str(type) __STRING(type),
+// an add() overload that records the address of the object
+#define define_add(type) \
+  void add(const char *name, const type &data) { \
+    records.push_back(MemRecord(name, T_##type, &data)); \
+  }
+// a switch case that measures the recorded object
+#define define_case(type) \
+  case T_##type: return mem_usage(*((const type *)r.data));
+// T_RAWNUMBER (value 0) marks records that carry a fixed byte count only.
+enum MemType { T_RAWNUMBER, list_types(prefix_t) };
+// One tracked item: either a fixed number of bytes or a pointer to a live
+// object whose size is recomputed on demand.
+struct MemRecord {
+  // Raw-number record: mem is final, there is no object behind it.
+  MemRecord(const char *name, long mem) :
+    name(name), type(T_RAWNUMBER), data(NULL), mem(mem) { }
+  // Object record: mem starts at 0 and is filled in by compute_mem_usage().
+  MemRecord(const char *name, MemType type, const void *data) :
+    name(name), type(type), data(data), mem(0) { }
+  string name;
+  MemType type;
+  const void *data; // Not owned; points at the caller's object.
+  long mem;         // Size in bytes.
+};
+// Track amount of memory used.
+// Track amount of memory used.
+// Objects are registered by address (see define_add), so they have to outlive
+// every later compute_mem_usage()/report_mem_usage() call.
+class MemTracker {
+public:
+  static const char *type_names[];
+  // One add(name, object) overload per type in list_types.
+  list_types(define_add)
+  // Registers a fixed amount of memory (in bytes) under the given name.
+  void add(const char *name, long mem) {
+    records.push_back(MemRecord(name, mem));
+  }
+  long compute_mem_usage(const MemRecord &r);
+  long compute_mem_usage();
+  void report_mem_usage();
+private:
+  vector<MemRecord> records;
+};
+extern MemTracker mem_tracker;
+////////////////////////////////////////////////////////////
+// Various mem_usage() functions on various data types.
+// Estimate for a four-level nested vector: payload bytes of the innermost
+// vectors plus one sizeof(vector<T>) header per vector stored at each level.
+template<class T> long mem_usage(const vector< vector< vector< vector<T> > > > &mat) { // matrix
+  long total = 0;
+  for(size_t a = 0; a < mat.size(); a++) {
+    for(size_t b = 0; b < mat[a].size(); b++) {
+      for(size_t c = 0; c < mat[a][b].size(); c++)
+        total += mat[a][b][c].size() * sizeof(T);
+      total += mat[a][b].size() * sizeof(vector<T>);
+    }
+    total += mat[a].size() * sizeof(vector<T>);
+  }
+  total += mat.size() * sizeof(vector<T>);
+  return total;
+}
+// Estimate for a three-level nested vector: payload of the innermost vectors
+// plus one sizeof(vector<T>) header per vector stored at each level.
+template<class T> long mem_usage(const vector< vector< vector<T> > > &mat) { // matrix
+  long total = 0;
+  for(size_t a = 0; a < mat.size(); a++) {
+    for(size_t b = 0; b < mat[a].size(); b++)
+      total += mat[a][b].size() * sizeof(T);
+    total += mat[a].size() * sizeof(vector<T>);
+  }
+  total += mat.size() * sizeof(vector<T>);
+  return total;
+}
+// Estimate for a vector of vectors: payload of every row plus one
+// sizeof(vector<T>) header per row.
+template<class T> long mem_usage(const vector< vector<T> > &mat) { // matrix
+  long total = mat.size() * sizeof(vector<T>);
+  for(size_t row = 0; row < mat.size(); row++)
+    total += mat[row].size() * sizeof(T);
+  return total;
+}
+// Payload bytes of a flat vector (its size, not its capacity).
+template<class T> long mem_usage(const vector<T> &vec) { // vector
+  const long payload = vec.size() * sizeof(T);
+  return payload;
+}
+// Estimate for a hash set: one pointer-sized slot per bucket plus, for every
+// element, its value and one link pointer.  The bucket slot is counted as
+// sizeof(void *) instead of a fixed 4 bytes so the figure is also right on
+// 64-bit builds.
+template<class T> long mem_usage(const unordered_set<T> &set) { // hash_set
+  return (long)(set.bucket_count()*sizeof(void *) + set.size()*(sizeof(T)+sizeof(void *)));
+}
+// Estimate for a hash map: one pointer-sized slot per bucket plus, for every
+// entry, key, value and one link pointer.  The bucket slot is counted as
+// sizeof(void *) instead of a fixed 4 bytes so the figure is also right on
+// 64-bit builds.
+template<class Tx, class Ty, class Hf, class Eq> long mem_usage(const unordered_map<Tx, Ty, Hf, Eq> &map) { // hash_map
+  return (long)(map.bucket_count()*sizeof(void *) + map.size()*(sizeof(Tx)+sizeof(Ty)+sizeof(void *)));
+}
+// A UnionSet is measured by its parent array alone.
+inline long mem_usage(const UnionSet &u) { // UnionSet
+  const long parent_bytes = mem_usage(u.parent);
+  return parent_bytes;
+}
+// A StrDB is measured as its two index structures plus the characters
+// (including the terminating NUL) of every stored string.
+inline long mem_usage(const StrDB &db) { // StrDB
+  long total = mem_usage(db.s2i);
+  total += mem_usage(db.i2s);
+  for(intIndex k = 0; k < len(db); k++) {
+    const size_t chars = strlen(db[k]) + 1;
+    total += chars * sizeof(char);
+  }
+  return total;
+}
+#endif

basic/mem.h ADDED Viewed

	@@ -0,0 +1,14 @@

+#ifndef __MEM_H__
+#define __MEM_H__
+// Wraps an amount of memory given in bytes so that streaming it with <<
+// prints it in a compact, human-readable form.
+struct Mem { Mem(long mem) : mem(mem) { } long mem; };
+// Prints the amount as plain bytes below 1 KiB, as whole "K" below 1 MiB and
+// as whole "M" otherwise (integer division, no rounding).  The value is read
+// as unsigned, so a negative amount shows up as a very large number.
+inline ostream &operator<<(ostream &out, const Mem &m) {
+  const unsigned long kilo = 1024;
+  const unsigned long mega = 1024*1024;
+  const unsigned long bytes = m.mem;
+  if(bytes >= mega) return out << bytes/mega << 'M';
+  if(bytes >= kilo) return out << bytes/kilo << 'K';
+  return out << bytes;
+}
+#endif

basic/multi-ostream.cc ADDED Viewed

	@@ -0,0 +1,61 @@

+#include "multi-ostream.h"
+/*
+ * Create a multi_ostream, and you can add many files or any ostream objects
+ * to it.  The output sent to the multi_ostream will be redirected to the many
+ * destinations.
+ * Useful for logging to a file and stdout.
+ */
+#include <iostream>
+#include <fstream>
+#include <vector>
+using namespace std;
+// Writes out whatever is still buffered, then deletes the streams we own.
+multi_buf::~multi_buf() {
+  flush();
+  for(vector<ostream_info>::iterator it = infos.begin(); it != infos.end(); ++it)
+    it->destroy();
+}
+// Appends a destination.  `own` hands over ownership of the stream, `hard`
+// marks it as a hard_ofstream (see hard_flush()).
+void multi_buf::add(ostream *out, bool own, bool hard) {
+  ostream_info entry(out, own, hard);
+  infos.push_back(entry);
+}
+// Copies the buffered bytes to every destination, flushes each of them and
+// empties the buffer.
+void multi_buf::flush() {
+  for(vector<ostream_info>::iterator it = infos.begin(); it != infos.end(); ++it) {
+    ostream *dest = it->out;
+    dest->write(buf, buf_i);
+    dest->flush();
+  }
+  buf_i = 0;
+}
+// Like flush(), but destinations registered as "hard" get hard_flush()
+// called on them instead of a plain flush().
+void multi_buf::hard_flush() {
+  for(vector<ostream_info>::iterator it = infos.begin(); it != infos.end(); ++it) {
+    it->out->write(buf, buf_i);
+    if(!it->hard) {
+      it->out->flush();
+      continue;
+    }
+    ((hard_ofstream *)it->out)->hard_flush();
+  }
+  buf_i = 0;
+}
+// Called by the stream for every character (no put area is installed).
+// Stores the character and forwards the buffer to all destinations when it
+// is full or a newline was written.
+// Returns the character on success.  An EOF argument means "nothing to
+// append"; it is acknowledged with a non-EOF value and not stored.
+int multi_buf::overflow(int ch) {
+  if(traits_type::eq_int_type(ch, traits_type::eof()))
+    return traits_type::not_eof(ch);
+  buf[buf_i++] = traits_type::to_char_type(ch);
+  if(buf_i == (int)sizeof(buf) || ch == '\n') flush();
+  return ch;
+}
+// Forwards everything buffered so far to all destinations and flushes them.
+// Note that std::ostream::flush() is not virtual: this version is only
+// reached when called on a multi_ostream, not through an ostream reference.
+ostream &multi_ostream::flush() {
+  sbuf.flush();
+  return *this;
+}
+// Same as flush(), but uses hard_flush() on destinations registered as hard.
+ostream &multi_ostream::hard_flush() {
+  sbuf.hard_flush();
+  return *this;
+}

basic/multi-ostream.h ADDED Viewed

	@@ -0,0 +1,67 @@

+#ifndef __MULTI_OSTREAM_H__
+#define __MULTI_OSTREAM_H__
+/*
+ * Create a multi_ostream, and you can add many files or any ostream objects
+ * to it.  The output sent to the multi_ostream will be redirected to the many
+ * destinations.
+ * Useful for logging to a file and stdout.
+ */
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include "hard-ofstream.h"
+using namespace std;
+// One destination of a multi_buf together with its ownership flags.
+struct ostream_info {
+  ostream_info(ostream *out, bool own, bool hard) : out(out), own(own), hard(hard) { }
+  ostream *out;
+  bool own; // Whether we own the ostream and should destroy it at the end.
+  bool hard; // Whether this is a hard_ofstream.
+  // Deletes the stream if it is owned.  Not a destructor on purpose: the
+  // struct is copied around inside a vector.
+  void destroy() { if(own) delete out; }
+};
+// Stream buffer that collects output in a fixed-size buffer and copies it to
+// every registered destination on newline, when full, or on flush.
+class multi_buf : public streambuf {
+public:
+  multi_buf() : buf_i(0) { }
+  ~multi_buf();
+  void flush();
+  void hard_flush();
+  void add(ostream *out, bool own, bool hard);
+  // Drops the most recently added destination after delivering pending
+  // output to it.  Does nothing (besides flushing) when there is none.
+  void remove_last() {
+    flush();
+    if(infos.empty()) return;
+    infos.back().destroy();
+    infos.pop_back();
+  }
+protected:
+  virtual int overflow(int ch);
+  // Makes std::flush / std::endl work when the stream is used through a
+  // plain ostream reference: the standard flush path ends up in sync().
+  virtual int sync() { flush(); return 0; }
+private:
+  vector<ostream_info> infos;
+  char buf[16384];
+  int buf_i; // Number of bytes currently held in buf.
+};
+// Output stream that duplicates everything written to it to all added
+// destinations.
+class multi_ostream : public basic_ostream<char, char_traits<char> > {
+public:
+  // The base class receives the address of sbuf before sbuf itself has been
+  // constructed.
+  multi_ostream() : basic_ostream<char, char_traits<char> >(&sbuf) { }
+  // These hide (they do not override) the non-virtual ostream::flush().
+  virtual ostream &flush();
+  virtual ostream &hard_flush();
+  // Opens `file` as a new destination owned by this stream.  The result of
+  // opening the file is not checked here.
+  void add(const char *file, bool hard = false) {
+    ostream *out = hard ? new hard_ofstream(file) : new ofstream(file);
+    sbuf.add(out, true, hard);
+  }
+  // Adds an existing stream; the caller keeps ownership.
+  void add(ostream *out) { sbuf.add(out, false, false); }
+  void remove_last() { sbuf.remove_last(); }
+private:
+  multi_buf sbuf;
+};
+#endif

basic/opt.cc ADDED Viewed

	@@ -0,0 +1,189 @@

+#include "opt.h"
+#include "std.h"
+#include "logging.h"
+#include <getopt.h>
+////////////////////////////////////////////////////////////////////////
+// command-line arguments
+// Registers the long option --name; has_arg tells whether it takes a value.
+void GetOpt::AddOpt(const string &name, bool has_arg) {
+  pair<string, bool> entry(name, has_arg);
+  opts.push_back(entry);
+}
+// Parses argv with getopt_long() against the registered options and stores
+// the value of every option that was given ("1" for options without value).
+// Options that were not given keep an empty value.
+// An unknown or malformed option terminates the program with exit code 1
+// (getopt_long has already printed the reason at that point).
+void GetOpt::Parse(int argc, char *argv[]) {
+  // Table for getopt_long: one entry per option plus the all-zero terminator
+  // entry, which value-initialization of the vector already provides.
+  vector<option> opt_list(opts.size()+1);
+  for(size_t k = 0; k < opts.size(); k++) {
+    opt_list[k].name = opts[k].first.c_str();
+    opt_list[k].has_arg = opts[k].second ? required_argument : no_argument;
+    opt_list[k].flag = NULL;
+    opt_list[k].val = 0;
+  }
+  values.clear();
+  values.resize(opts.size());
+  while(true) {
+    int i = -1;
+    int status = getopt_long(argc, argv, "", &opt_list[0], &i);
+    if(status == -1) break;
+    // Every entry has flag == NULL and val == 0, so a recognized option
+    // yields 0.  Anything else ('?' or ':') is a usage error.
+    if(status != 0 || i < 0 || i >= (int)opts.size()) {
+      fprintf(stderr, "Try `%s --help' for the list of options.\n", argv[0]);
+      exit(1);
+    }
+    // put a 1 to signify that the argument exists
+    values[i] = optarg ? optarg : "1";
+  }
+}
+// Returns the index of the first registered option called `name`, or -1.
+int GetOpt::Lookup(const string &name) const {
+  const int count = (int)opts.size();
+  int idx = 0;
+  while(idx < count && opts[idx].first != name) idx++;
+  return idx < count ? idx : -1;
+}
+// Returns the parsed value of `name`, or default_value when the option is
+// unknown or has no (non-empty) value.
+string GetOpt::Get(const string &name, const string &default_value) const {
+  const int idx = Lookup(name);
+  if(idx == -1) return default_value;
+  if(values[idx].empty()) return default_value;
+  return values[idx];
+}
+// Returns the value of a mandatory option; prints a message and exits with
+// code 1 when it is missing.
+string GetOpt::Get(const string &name) const {
+  const string value = Get(name, "");
+  if(!value.empty()) return value;
+  fprintf(stderr, "Missing required parameter `%s'.\n", name.c_str());
+  exit(1);
+}
+// Whether a non-empty value was parsed for `name`.
+bool GetOpt::Exists(const string &name) const {
+  const string value = Get(name, "");
+  return value.size() > 0;
+}
+// Returns the value of a mandatory integer option.  A missing option or a
+// value that does not start with an integer ends the program with exit code 1
+// and a message on stderr (previously a bad value was only caught by assert).
+int GetOpt::GetInt(const string &name) const {
+  const string text = Get(name);
+  int x = 0;
+  if(sscanf(text.c_str(), "%d", &x) != 1) {
+    fprintf(stderr, "Parameter `%s' expects an integer, got `%s'.\n", name.c_str(), text.c_str());
+    exit(1);
+  }
+  return x;
+}
+// Integer value of `name` if it was given, default_value otherwise.
+int GetOpt::GetInt(const string &name, int default_value) const {
+  if(!Exists(name)) return default_value;
+  return GetInt(name);
+}
+// Returns the value of a mandatory floating-point option.  A missing option
+// or a value that does not start with a number ends the program with exit
+// code 1 and a message on stderr (previously only caught by assert).
+double GetOpt::GetDouble(const string &name) const {
+  const string text = Get(name);
+  double x = 0;
+  if(sscanf(text.c_str(), "%lf", &x) != 1) {
+    fprintf(stderr, "Parameter `%s' expects a number, got `%s'.\n", name.c_str(), text.c_str());
+    exit(1);
+  }
+  return x;
+}
+// Floating-point value of `name` if it was given, default_value otherwise.
+double GetOpt::GetDouble(const string &name, double default_value) const {
+  if(!Exists(name)) return default_value;
+  return GetDouble(name);
+}
+////////////////////////////////////////////////////////////
+// Parses the command line against all options registered in bool_opts,
+// int_opts, double_opts and string_opts and writes the results into the
+// variables those registrations point to.  Prints the usage text and exits
+// with code 1 when --help is given or the "text" option is absent.
+void process_opt(int argc, char *argv[]) {
+  GetOpt opt;
+  // set up GetOpt to parse
+  // Every boolean option --x also gets its negation --nox.
+  for(int i = 0; i < (int)bool_opts.size(); i++) {
+    opt.AddOpt(bool_opts[i].name, false);
+    opt.AddOpt("no" + bool_opts[i].name, false);
+  }
+  for(int i = 0; i < (int)int_opts.size(); i++)
+    opt.AddOpt(int_opts[i].name, true);
+  for(int i = 0; i < (int)double_opts.size(); i++)
+    opt.AddOpt(double_opts[i].name, true);
+  for(int i = 0; i < (int)string_opts.size(); i++)
+    opt.AddOpt(string_opts[i].name, true);
+  opt.AddOpt("help", false);
+  // parse
+  opt.Parse(argc, argv);
+  // print help if called for
+  // NOTE(review): "text" is hard-coded here as the one option without which
+  // the program only prints its usage -- confirm this is intended for every
+  // program using this file.
+  if(opt.Exists("help") || !opt.Exists("text")) {
+    printf("usage: %s\n", argv[0]);
+    // " *"[o.required] selects '*' for required options and ' ' otherwise;
+    // the current default is shown in brackets for optional ones.
+    for(int i = 0; i < (int)bool_opts.size(); i++) {
+      const OptInfo<bool> &o = bool_opts[i];
+      printf(" %c%-20s: %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
+      if(!o.required) printf(" [%s]", *(o.var) ? "true" : "false");
+      printf("\n");
+    }
+    for(int i = 0; i < (int)int_opts.size(); i++) {
+      const OptInfo<int> &o = int_opts[i];
+      printf(" %c%-13s <int> : %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
+      if(!o.required) printf(" [%d]", *(o.var));
+      printf("\n");
+    }
+    for(int i = 0; i < (int)double_opts.size(); i++) {
+      const OptInfo<double> &o = double_opts[i];
+      printf(" %c%-13s <dbl> : %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
+      if(!o.required) printf(" [%f]", *(o.var));
+      printf("\n");
+    }
+    for(int i = 0; i < (int)string_opts.size(); i++) {
+      const OptInfo<string> &o = string_opts[i];
+      printf(" %c%-13s <str> : %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
+      if(!o.required) printf(" [%s]", (o.var)->c_str());
+      printf("\n");
+    }
+    exit(1);
+  }
+  // retrieve data; store the variables
+  for(int i = 0; i < (int)bool_opts.size(); i++) {
+    const OptInfo<bool> &o = bool_opts[i];
+    bool yes = opt.Exists(o.name);
+    bool no = opt.Exists("no" + o.name);
+    // A required flag must be given in one form; both forms at once are
+    // contradictory.  These checks are asserts only.
+    assert(!o.required || (yes || no));
+    assert(!yes || !no);
+    if(yes) *(o.var) = true;
+    if(no) *(o.var) = false;
+  }
+  // For optional options the variable's current value serves as default.
+  for(int i = 0; i < (int)int_opts.size(); i++) {
+    const OptInfo<int> &o = int_opts[i];
+    *(o.var) = o.required ? opt.GetInt(o.name) : opt.GetInt(o.name, *(o.var));
+  }
+  for(int i = 0; i < (int)double_opts.size(); i++) {
+    const OptInfo<double> &o = double_opts[i];
+    *(o.var) = o.required ? opt.GetDouble(o.name) : opt.GetDouble(o.name, *(o.var));
+  }
+  for(int i = 0; i < (int)string_opts.size(); i++) {
+    const OptInfo<string> &o = string_opts[i];
+    *(o.var) = o.required ? opt.Get(o.name) : opt.Get(o.name, *(o.var));
+  }
+}
+// Entry point for option handling: parses the command line, then seeds the
+// C random number generator with the rand_seed option.
+void init_opt(int argc, char *argv[]) {
+  process_opt(argc, argv);
+  srand(rand_seed);
+}
+// Logs the current value of every registered option as "name = value",
+// grouped by type, inside a tracked "print_opts()" block.
+void print_opts() {
+  track("print_opts()", "", true);
+  forvec(_, const OptInfo<bool> &, o, bool_opts)
+    logs(o.name << " = " << (*o.var ? "true" : "false"));
+  forvec(_, const OptInfo<int> &, o, int_opts)
+    logs(o.name << " = " << *o.var);
+  forvec(_, const OptInfo<double> &, o, double_opts)
+    logs(o.name << " = " << *o.var);
+  forvec(_, const OptInfo<string> &, o, string_opts)
+    logs(o.name << " = " << *o.var);
+}
+////////////////////////////////////////////////////////////
+// Pre defined options.
+// allow user to specify a comment always, so some arbitrary description
+// of this program execution can be embedded in the command-line

basic/opt.h ADDED Viewed

	@@ -0,0 +1,100 @@

+#ifndef __OPT_H__
+#define __OPT_H__
+#include <vector>
+#include <string>
+#include <stdio.h>
+using namespace std;
+// First thing to call in main().
+void init_opt(int argc, char *argv[]);
+////////////////////////////////////////////////////////////////////////
+// command-line arguments
+// Thin wrapper around getopt_long() for long options only.
+// Usage: AddOpt() for every option, Parse() once, then query with the getters.
+class GetOpt {
+public:
+  GetOpt() { }
+  void AddOpt(const string &name, bool has_arg);
+  void Parse(int argc, char *argv[]);
+  // Index of the option or -1.
+  int Lookup(const string &name) const;
+  // Whether a non-empty value was parsed for the option.
+  bool Exists(const string &name) const;
+  // The one-argument getters treat the option as mandatory; the variants
+  // with default_value fall back to it when the option was not given.
+  string Get(const string &name, const string &default_value) const;
+  string Get(const string &name) const;
+  int GetInt(const string &name) const;
+  int GetInt(const string &name, int default_value) const;
+  double GetDouble(const string &name) const;
+  double GetDouble(const string &name, double default_value) const;
+private:
+  vector< pair<string, bool> > opts; // (name, takes a value)
+  vector<string> values;             // Parsed value per option, "" if absent.
+};
+// Description of one registered option of type T.
+template<class T> struct OptInfo {
+  OptInfo(const string &name, T *var, const string &msg, bool required)
+    : name(name), var(var), msg(msg), required(required) { }
+  string name;
+  T *var; // location of the variable that stores this value
+  string msg; // Help text shown in the usage listing.
+  bool required;
+};
+// Registries filled by the opt_define_* macros during static initialization
+// and consumed by process_opt() and print_opts().
+extern vector< OptInfo<bool> > bool_opts;
+extern vector< OptInfo<int> > int_opts;
+extern vector< OptInfo<double> > double_opts;
+extern vector< OptInfo<string> > string_opts;
+////////////////////////////////////////////////////////////
+// two versions: in one, option is required
+// Each macro defines a variable `var` and, as a side effect of initializing
+// it, registers the option --name that process_opt() later stores into it.
+// The *_req forms mark the option as required and use a neutral initial
+// value; the other forms take the default value `val`.
+#define opt_define_bool_req(var, name, msg) \
+  bool var = opt_define_bool_wrap(name, &var, false, msg, true)
+#define opt_define_bool(var, name, val, msg) \
+  bool var = opt_define_bool_wrap(name, &var, val, msg, false)
+#define opt_define_int_req(var, name, msg) \
+  int var = opt_define_int_wrap(name, &var, 0, msg, true)
+#define opt_define_int(var, name, val, msg) \
+  int var = opt_define_int_wrap(name, &var, val, msg, false)
+#define opt_define_double_req(var, name, msg) \
+  double var = opt_define_double_wrap(name, &var, 0.0, msg, true)
+#define opt_define_double(var, name, val, msg) \
+  double var = opt_define_double_wrap(name, &var, val, msg, false)
+#define opt_define_string_req(var, name, msg) \
+  string var = opt_define_string_wrap(name, &var, "", msg, true)
+#define opt_define_string(var, name, val, msg) \
+  string var = opt_define_string_wrap(name, &var, val, msg, false)
+// Registration helpers behind the opt_define_* macros.  Each one appends a
+// description of the option to the registry of its type and hands back `val`
+// unchanged so it can be used as the initializer of the option variable.
+// They run during static initialization, so the registries have to be
+// constructed first (the Makefile orders the object files for that reason).
+inline bool opt_define_bool_wrap(const string &name, bool *var, bool val, const string &msg, bool required) {
+  OptInfo<bool> info(name, var, msg, required);
+  bool_opts.push_back(info);
+  return val;
+}
+inline int opt_define_int_wrap(const string &name, int *var, int val, const string &msg, bool required) {
+  OptInfo<int> info(name, var, msg, required);
+  int_opts.push_back(info);
+  return val;
+}
+inline double opt_define_double_wrap(const string &name, double *var, double val, const string &msg, bool required) {
+  OptInfo<double> info(name, var, msg, required);
+  double_opts.push_back(info);
+  return val;
+}
+inline string opt_define_string_wrap(const string &name, string *var, const string &val, const string &msg, bool required) {
+  OptInfo<string> info(name, var, msg, required);
+  string_opts.push_back(info);
+  return val;
+}
+////////////////////////////////////////////////////////////
+void print_opts();
+extern int rand_seed;
+extern string comment;
+extern int initC;
+#endif

basic/pipe.h ADDED Viewed

	@@ -0,0 +1,46 @@

+/*
+Execute another application, piping input to and from its stdin and stdout.
+*/
+#ifndef __PIPE_H__
+#define __PIPE_H__
+typedef pair<FILE *, FILE *> FILEPair;
+// Return input and output file pointers.
+// User is responsible for closing them.
+// May have to close out before reading from in.
+// Starts cmd[0] (searched on PATH by execvp) with the argument vector cmd and
+// returns (in, out): `in` reads what the child writes to its stdout, `out`
+// feeds the child's stdin.
+// Failures trip an assert; when asserts are compiled out, (NULL, NULL) is
+// returned instead.  The system calls are deliberately made outside of
+// assert() so that they are still executed under NDEBUG.
+// Declared inline because this definition lives in a header.
+inline FILEPair create_pipe(char *const cmd[]) {
+  int p2c_fds[2], c2p_fds[2];
+  if(pipe(p2c_fds) != 0) {
+    assert(0);
+    return FILEPair(NULL, NULL);
+  }
+  if(pipe(c2p_fds) != 0) {
+    close(p2c_fds[0]);
+    close(p2c_fds[1]);
+    assert(0);
+    return FILEPair(NULL, NULL);
+  }
+  int pid = fork();
+  if(pid == -1) {
+    close(p2c_fds[0]);
+    close(p2c_fds[1]);
+    close(c2p_fds[0]);
+    close(c2p_fds[1]);
+    assert(0);
+    return FILEPair(NULL, NULL);
+  }
+  if(pid != 0) { // parent
+    // Keep only our ends: read from the child, write to the child.
+    close(p2c_fds[0]);
+    close(c2p_fds[1]);
+    FILE *in = fdopen(c2p_fds[0], "r");
+    FILE *out = fdopen(p2c_fds[1], "w");
+    assert(in && out);
+    return FILEPair(in, out);
+  }
+  // child
+  close(p2c_fds[1]);
+  close(c2p_fds[0]);
+  if(dup2(p2c_fds[0], fileno(stdin)) == -1 ||
+     dup2(c2p_fds[1], fileno(stdout)) == -1) {
+    assert(0);
+    _exit(127);
+  }
+  // The originals are no longer needed once duplicated onto stdin/stdout.
+  if(p2c_fds[0] != fileno(stdin)) close(p2c_fds[0]);
+  if(c2p_fds[1] != fileno(stdout)) close(c2p_fds[1]);
+  execvp(cmd[0], cmd);
+  // Only reached when execvp failed.  The child must never return into the
+  // caller's code, so it terminates here even without asserts.
+  assert(0);
+  _exit(127);
+  return FILEPair(NULL, NULL);
+}
+#endif

basic/prob-utils.cc ADDED Viewed

	@@ -0,0 +1,75 @@

+#include "prob-utils.h"
+// Draws from a normal distribution with the given mean and variance.
+double rand_gaussian(double mean, double var) {
+  // Use the Box-Muller Transformation
+  // if x_1 and x_2 are independent uniform [0, 1],
+  // then sqrt(-2 ln x_1) * cos(2*pi*x_2) is Gaussian with mean 0 and variance 1
+  double x1 = rand_double(), x2 = rand_double();
+  // rand_double() can return exactly 0, and log(0) is -infinity.  Redraw
+  // until x1 is positive so the result is always finite.
+  while(x1 <= 0) x1 = rand_double();
+  double z = sqrt(-2*log(x1))*cos(2*M_PI*x2);
+  return z * sqrt(var) + mean;
+}
+// Simulates n independent coin tosses, each landing heads with probability p,
+// and returns how many came up heads.
+int rand_binomial(int n, double p) {
+  int heads = 0;
+  for(; n != 0; n--)
+    if(rand_double() < p) heads++;
+  return heads;
+}
+// n! as a double; 1 for every n below 2 (including negative n).
+inline double factorial(int n) {
+  double product = 1;
+  for(int factor = n; factor > 1; factor--)
+    product *= factor;
+  return product;
+}
+// Binomial coefficient "n choose k" as a double.  Uses the smaller of k and
+// n-k so that fewer factors are multiplied.
+inline double choose(int n, int k) {
+  const int m = (n-k < k) ? n-k : k;
+  double numerator = 1;
+  for(int i = 0; i < m; i++)
+    numerator *= n-i;
+  return numerator / factorial(m);
+}
+// Probability of exactly k heads in n tosses with head probability p.
+double binomial_prob(int n, int k, double p) {
+  const double ways = choose(n, k);
+  return ways * pow(p, k) * pow(1-p, n-k);
+}
+// Samples an index i with probability probs[i].  probs is expected to sum
+// to 1; if the cumulative sum never exceeds the drawn number the assert
+// fires.  With asserts compiled out the last index is returned, so the
+// function never ends without a return value.
+int rand_index(const fvector &probs) {
+  double v = rand_double();
+  double sum = 0;
+  foridx(i, len(probs)) {
+    sum += probs[i];
+    if(v < sum) return i;
+  }
+  assert(0);
+  return len(probs) - 1;
+}
+// Scales vec in place so that its entries sum to 1.  The sum is not checked
+// for being zero.
+void norm_distrib(fvector &vec) {
+  const intIndex count = len(vec);
+  double total = 0;
+  for(intIndex k = 0; k < count; k++) total += vec[k];
+  for(intIndex k = 0; k < count; k++) vec[k] /= total;
+}
+// Same for column c of mat: afterwards the entries mat[*][c] sum to 1.
+void norm_distrib(fmatrix &mat, int c) {
+  const intIndex rows = len(mat);
+  double total = 0;
+  for(intIndex k = 0; k < rows; k++) total += mat[k][c];
+  for(intIndex k = 0; k < rows; k++) mat[k][c] /= total;
+}
+// Fills probs with a random distribution over n outcomes: n raw rand()
+// values, normalized to sum to 1.
+void rand_distrib(fvector &probs, int n) {
+  probs.resize(n);
+  for(intIndex k = 0; k < n; k++)
+    probs[k] = rand();
+  norm_distrib(probs);
+}
+// Returns a random permutation of 0..n-1: starts from the identity and swaps
+// every position with a randomly chosen position at or after it.
+IntVec rand_permutation(int n) {
+  IntVec order(n);
+  for(intIndex pos = 0; pos < n; pos++)
+    order[pos] = pos;
+  for(intIndex pos = 0; pos < n; pos++) {
+    const int pick = mrand(pos, n);
+    const int held = order[pick];
+    order[pick] = order[pos];
+    order[pos] = held;
+  }
+  return order;
+}

basic/prob-utils.h ADDED Viewed

	@@ -0,0 +1,19 @@

+#ifndef __PROB_UTILS__
+#define __PROB_UTILS__
+#include "stl-basic.h"
+// Random draws.
+int rand_binomial(int n, double p);
+int rand_index(const fvector &probs);
+double rand_gaussian(double mean, double var);
+// NOTE(review): these two are declared inline here but defined in
+// prob-utils.cc; an inline function has to be defined in every file that
+// uses it, so they are effectively usable from prob-utils.cc only.
+inline double factorial(int n);
+inline double choose(int n, int k);
+double binomial_prob(int n, int k, double p);
+// In-place normalization (whole vector / column c of a matrix).
+void norm_distrib(fvector &vec);
+void norm_distrib(fmatrix &mat, int c);
+void rand_distrib(fvector &probs, int n);
+IntVec rand_permutation(int n);
+#endif

basic/stats.cc ADDED Viewed

	@@ -0,0 +1 @@


1	+ #include "stats.h"

basic/stats.h ADDED Viewed

	@@ -0,0 +1,71 @@

+#ifndef __STATS_H__
+#define __STATS_H__
+#include "std.h"
+#include "stl-basic.h"
+// Sentinels used to initialize running minima and maxima below.
+// NOTE(review): these names are also defined by <float.h>, where DBL_MIN is
+// the smallest positive normalized double, not a large negative number.
+// Including that header together with this one redefines the macros.
+#define DBL_MAX 1e300
+#define DBL_MIN (-1e300)
+// Running sum and count of a series of values; val() is their average.
+// With no values added, val() divides by zero.
+struct StatFig {
+  StatFig() { clear(); }
+  StatFig(double sum, int n) : sum(sum), n(n) { }
+  virtual ~StatFig() { }
+  // Harmonic mean of the two averages (F1 score); 0 if either one is empty.
+  static double F1(const StatFig &fig1, const StatFig &fig2) {
+    if(fig1.n == 0 || fig2.n == 0) return 0;
+    const double a = fig1.val();
+    const double b = fig2.val();
+    return 2*a*b / (a+b);
+  }
+  // Counts one occurrence of the value 1.
+  void add() { add(1); }
+  virtual void add(double v) {
+    sum += v;
+    n++;
+  }
+  virtual void clear() {
+    n = 0;
+    sum = 0;
+  }
+  int size() const { return n; }
+  double val() const { return sum / n; }
+  double mean() const { return val(); }
+  double sum;
+  int n;
+};
+// Prints "sum/n=average".
+inline ostream &operator<<(ostream &out, const StatFig &fig) {
+  out << fig.sum << '/' << fig.n << '=' << fig.val();
+  return out;
+}
+////////////////////////////////////////////////////////////
+// Additionally stores the min and the max of the values added.
+struct BigStatFig : public StatFig {
+  BigStatFig() { clear(); }
+  void add(double v) {
+    if(v < min) min = v;
+    if(v > max) max = v;
+    StatFig::add(v);
+  }
+  // Resets the extremes to the sentinels so that the first value added
+  // replaces both of them.
+  void clear() {
+    min = DBL_MAX;
+    max = DBL_MIN;
+    StatFig::clear();
+  }
+  double min, max;
+};
+// Prints "n:min/<< average >>/max".
+inline ostream &operator<<(ostream &out, const BigStatFig &fig) {
+  out << fig.n << ':' << fig.min;
+  out << "/<< "  << fig.val() << " >>/" << fig.max;
+  return out;
+}
+////////////////////////////////////////////////////////////
+// Keeps every value added, which makes variance and standard deviation
+// available on top of what BigStatFig offers.
+struct FullStatFig : public BigStatFig {
+  FullStatFig() { clear(); }
+  virtual ~FullStatFig() { }
+  void add(double v) {
+    data.push_back(v);
+    BigStatFig::add(v);
+  }
+  void clear() {
+    data.clear();
+    BigStatFig::clear();
+  }
+  // Population variance: mean squared deviation from the average (divides
+  // by n, not n-1).
+  double variance() const {
+    const double center = val();
+    double acc = 0;
+    for(size_t k = 0; k < data.size(); k++)
+      acc += sq(data[k]-center);
+    return acc / n;
+  }
+  double stddev() const { return sqrt(variance()); }
+  DoubleVec data;
+};
+// Prints the BigStatFig representation followed by "~stddev".
+inline ostream &operator<<(ostream &out, const FullStatFig &fig) {
+  const BigStatFig &base = fig;
+  out << base;
+  out << '~' << fig.stddev();
+  return out;
+}
+#endif

basic/std.cc ADDED Viewed

	@@ -0,0 +1,111 @@

+#include <sys/stat.h>
+#include <dirent.h>
+#include <unistd.h>
+#include "std.h"
+#include "str.h"
+#include "timer.h"
+// Return the current date/time as formatted by ctime(), passed through
+// substr(…, 0, -1), which is presumably there to drop ctime()'s trailing
+// newline.
+string now() {
+  time_t stamp = time(NULL);
+  char *text = ctime(&stamp);
+  return substr(text, 0, -1);
+}
+// Returns the host name of this machine, or "" if it cannot be determined.
+string hostname() {
+  char buf[1024];
+  if(gethostname(buf, sizeof(buf)) != 0) return "";
+  // If the name had to be truncated, gethostname() is not guaranteed to
+  // NUL-terminate the buffer, so terminate it ourselves.
+  buf[sizeof(buf)-1] = '\0';
+  return buf;
+}
+// Return the amount of memory (kB) used by this process, taken from the
+// VmRSS line of /proc/self/status.
+// Returns 0 if the file cannot be opened or the line cannot be parsed, and
+// -1 if the file has no VmRSS line.
+long mem_usage() {
+  ifstream in("/proc/self/status");
+  if(!in) return 0;
+  char buf[1024];
+  static const char *key = "VmRSS";
+  while(in.getline(buf, sizeof(buf))) {
+    if(strncmp(buf, key, strlen(key)) != 0) continue;
+    char *s = strchr(buf, ':');
+    if(!s) return 0;
+    long x = 0;
+    // Do not hand back an uninitialized value when no number follows.
+    if(sscanf(s+1, "%ld", &x) != 1) return 0;
+    return x;
+  }
+  return -1;
+}
+// Return whether the file exists (as far as access() with F_OK can tell).
+bool file_exists(const char *file) {
+  const int status = access(file, F_OK);
+  return status == 0;
+}
+// Create an empty file (an existing file is truncated by opening it for
+// writing).  Return whether the file could be opened.
+bool create_file(const char *file) {
+  ofstream out(file);
+  if(out) {
+    out.close();
+    return true;
+  }
+  return false;
+}
+// Return the modification time of the file, or 0 if stat() fails on it.
+time_t file_modified_time(const char *file) {
+  struct stat info;
+  return stat(file, &info) == 0 ? info.st_mtime : 0;
+}
+// Return the cpu speed in MHz, taken from the first "cpu MHz" line of
+// /proc/cpuinfo and truncated to an integer.  Returns 0 if the file or the
+// line is missing or cannot be parsed.
+int cpu_speed_mhz() {
+  ifstream in("/proc/cpuinfo");
+  if(!in) return 0;
+  char buf[1024];
+  static const char *key = "cpu MHz";
+  while(in.getline(buf, sizeof(buf))) {
+    if(strncmp(buf, key, strlen(key)) != 0) continue;
+    char *s = strchr(buf, ':');
+    if(!s) return 0;
+    double x = 0;
+    // Do not hand back an uninitialized value when no number follows.
+    if(sscanf(s+1, "%lf", &x) != 1) return 0;
+    return (int)x;
+  }
+  return 0;
+}
+// Removes the directory part of a path:
+// "file" -> "file"
+// "dir/file" -> "file"
+string strip_dir(string s) {
+  // Without a slash rfind() yields npos, and npos+1 wraps around to 0.
+  const size_t start = s.rfind('/')+1;
+  return substr(s, start);
+}
+// Returns the directory part of a path:
+// "file" -> "."
+// "dir/file" -> "dir"
+string get_dir(string s) {
+  // npos turns into -1 when stored in an int.
+  const int slash = s.rfind('/');
+  if(slash == -1) return ".";
+  return substr(s, 0, s.rfind('/'));
+}
+// Removes the extension of the file name:
+// "base" -> "base"
+// "base.ext" -> "base"
+// "dir.d/base" -> "dir.d/base" (a dot inside a directory name is not an
+// extension and is left alone)
+string file_base(string s) {
+  // rfind() yields npos when nothing is found; stored in an int that is -1.
+  int dot = s.rfind('.');
+  int slash = s.rfind('/');
+  if(dot == -1 || dot < slash) return s;
+  return substr(s, 0, dot);
+}
+// Appends the names of all entries of dirname that are not reported as
+// directories to `files`, prefixed with "dirname/" if fullpath is set.
+// Returns false if the directory cannot be opened.
+bool get_files_in_dir(string dirname, bool fullpath, vector<string> &files) {
+  DIR *handle = opendir(dirname.c_str());
+  if(handle == NULL) return false;
+  const string prefix = fullpath ? dirname+"/" : string();
+  for(dirent *entry = readdir(handle); entry != NULL; entry = readdir(handle)) {
+    // For some reason, sometimes files show up as d_type == DT_UNKNOWN, I
+    // think due to AFS issues.  Such entries are kept; only entries
+    // positively identified as directories are skipped.
+    if(entry->d_type == DT_DIR) continue;
+    files.push_back(prefix + entry->d_name);
+  }
+  closedir(handle);
+  return true;
+}

basic/std.h ADDED Viewed

	@@ -0,0 +1,115 @@

+#ifndef __STD_H__
+#define __STD_H__
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+//#include <values.h>
+#include <limits.h>
+#include <string.h>
+#include <iostream>
+#include <algorithm>
+#include <iomanip>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <string>
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
+using namespace std;
+// Index type used by the loop macros and by len().
+typedef long intIndex;
+// Asserts that x is below 2^31.
+#define INT_SIZED(x)    assert((x) < 2147483648L)
+////////////////////////////////////////////////////////////
+// Size of a container as intIndex.  The expansion is not parenthesized as a
+// whole, so take care when using it inside larger expressions.
+#define len(vec) (intIndex)(vec).size()
+// Square of x; x is evaluated twice.
+#define sq(x) ((x)*(x))
+// For loop sugar.  This is such a hack!
+// foridx/forsidx: i runs from 0 to n-1 (n is re-evaluated every iteration).
+#define foridx(i, n)                  for(intIndex i = 0; i < n; i++)
+#define forsidx(i, n)                 for(int i = 0; i < n; i++)
+// forvec: iterates over vec with index i and element x of type tx.  The
+// inner for loop only declares x and runs exactly once per index (the
+// shadow counter _i catches up with i after one pass).
+#define forvec(i, tx, x, vec)         for(intIndex i = 0, _##i = 0; i < len(vec); i++) \
+                                      for(tx x = (vec)[i]; i == _##i; _##i++)
+// formap/forcmap: iterate over a map of type t, binding key x (type tx) and
+// value y (type ty) for every entry; forcmap uses a const_iterator.
+#define formap(tx, x, ty, y, t, map)  forstl(t, _##x##y, map) _mapvars(tx, x, ty, y)
+#define forcmap(tx, x, ty, y, t, map) forcstl(t, _##x##y, map) _mapvars(tx, x, ty, y)
+// forstl/forcstl: plain iterator loops over a container of type t.
+#define forstl(t, x, container)       for(t::iterator x = (container).begin(); x != (container).end(); x++)
+#define forcstl(t, x, container)      for(t::const_iterator x = (container).begin(); x != (container).end(); x++)
+// Helper for formap: two single-pass for loops that merely declare x and y.
+#define _mapvars(tx, x, ty, y)        for(tx x = _##x##y->first, *_##x = &x; _##x; _##x = NULL) \
+                                      for(ty y = _##x##y->second, *_##y = &y; _##y; _##y = NULL)
+////////////////////////////////////////////////////////////
+// Generate random numbers (all based on rand()).
+// mrand(a): integer in [0, a).
+inline intIndex mrand(intIndex a) {
+  const intIndex draw = rand();
+  return draw % a;
+}
+// mrand(a, b): integer in [a, b).
+inline intIndex mrand(intIndex a, intIndex b) {
+  const intIndex span = b - a;
+  return rand() % span + a;
+}
+// rand_double(): multiple of 1/100000 in [0, 1); may be exactly 0.
+inline double rand_double() {
+  static const intIndex BASE = 100000;
+  const intIndex bucket = rand() % BASE;
+  return (double)bucket / BASE;
+}
+////////////////////////////////////////////////////////////
+// Floating point comparisons with a tolerance.
+const double TOL = 1e-10;
+// u is smaller / greater than v by more than TOL.
+inline bool flt(double u, double v) { return v > u + TOL; }
+inline bool fgt(double u, double v) { return v < u - TOL; }
+// u and v differ by less than tol.
+inline bool feq(double u, double v, double tol = TOL) {
+  const double gap = fabs(u-v);
+  return gap < tol;
+}
+// -1, 0 or 1 depending on the sign of u.
+template <class T> inline intIndex sign(T u) {
+  if(u > 0) return 1;
+  if(u < 0) return -1;
+  return 0;
+}
+// Floating-point assertion wrappers: forward to the _assert_feq/_assert_fneq
+// helpers together with the call site's __FILE__ and __LINE__.
+// NOTE(review): each expansion ends in `while(0);` with its own semicolon, so
+// `if(c) assert_feq(u, v); else ...` does not compile (the caller's `;` becomes
+// an empty statement that detaches the else).
+#define assert_feq(u, v) do { _assert_feq(u, v, __FILE__, __LINE__); } while(0);
+#define assert_feq2(u, v, tol) do { _assert_feq(u, v, tol, __FILE__, __LINE__); } while(0);
+#define assert_fneq(u, v) do { _assert_fneq(u, v, __FILE__, __LINE__); } while(0);
+inline void _assert_feq(double u, double v, const char *file, int line) {
+  if(!feq(u, v)) { printf("At %s:%d, %f != %f\n", file, line, u, v); assert(0); }
+}
+inline void _assert_feq(double u, double v, double tol, const char *file, int line) {
+  if(!feq(u, v, tol)) { printf("At %s:%d, %f != %f\n", file, line, u, v); assert(0); }
+}
+inline void _assert_fneq(double u, double v, const char *file, int line) {
+  if(feq(u, v)) { printf("At %s:%d, %f == %f\n", file, line, u, v); assert(0); }
+}
+#define assert_eq(u, v) do { _assert_eq(u, v, __STRING(u), __STRING(v), __FILE__, __LINE__); } while(0)
+template<class T> inline void _assert_eq(const T &u, const T &v, const char *us, const char *vs, const char *file, int line) {
+  if(u != v) {
+    cout << "At " << file << ':' << line << ", " <<
+            us << '(' << u << ')' << " != " <<
+            vs << '(' << v << ')' << endl;
+    assert(0);
+  }
+}
+#define assert2(x, reason) \
+  do { \
+    if(!(x)) { \
+      cout << "\nFAILURE REASON: " << reason << endl; \
+      assert(x); \
+    } \
+  } while(0)
+string now();
+string hostname();
+int cpu_speed_mhz();
+long mem_usage(); // in kB
+bool create_file(const char *file);
+bool file_exists(const char *file);
+time_t file_modified_time(const char *file);
+string strip_dir(string s);
+string get_dir(string s);
+string file_base(string s);
+bool get_files_in_dir(string dirname, bool fullpath, vector<string> &files);
+#endif

basic/stl-basic.cc ADDED Viewed

	@@ -0,0 +1 @@


1	+ #include "stl-basic.h"

basic/stl-basic.h ADDED Viewed

	@@ -0,0 +1,113 @@

+#ifndef __STL_BASIC_H__
+#define __STL_BASIC_H__
+#include "std.h"
+#include "city.h"
+////////////////////////////////////////////////////////////
+typedef double real;
+//typedef float real;
+typedef pair<int, int> IntPair;
+typedef pair<int, real> IntDouble;
+typedef pair<real, int> DoubleInt;
+typedef pair<real, real> DoublePair;
+typedef vector<IntPair> IntPairVec;
+typedef vector<DoubleInt> DoubleIntVec;
+typedef vector<bool> BoolVec;
+typedef vector<int> IntVec;
+typedef vector<string> StringVec;
+typedef vector<IntVec> IntMat;
+typedef vector<IntVec> IntVecVec;
+typedef vector<IntVecVec> IntVecVecVec;
+typedef vector<IntVecVecVec> IntVecVecVecVec;
+typedef vector<real> DoubleVec;
+typedef vector<DoubleVec> DoubleVecVec;
+typedef vector<DoubleVecVec> DoubleVecVecVec;
+typedef vector<DoubleVecVecVec> DoubleVecVecVecVec;
+typedef vector<IntDouble> IntDoubleVec;
+typedef vector<IntDoubleVec> IntDoubleVecVec;
+typedef vector<IntDoubleVecVec> IntDoubleVecVecVec;
+typedef vector<IntDoubleVecVecVec> IntDoubleVecVecVecVec;
+typedef IntVec ivector;
+typedef DoubleVec fvector;
+typedef DoubleVecVec fmatrix;
+////////////////////////////////////////////////////////////
+struct vector_eq {
+  bool operator()(const IntVec &v1, const IntVec &v2) const {
+    return v1 == v2;
+  }
+};
+// Hash functor for IntVec keys: feeds the vector's raw int storage
+// (sizeof(int) * size() bytes) to CityHash64.
+struct vector_hf {
+  size_t operator()(const IntVec &v) const {
+    // &v[0] is undefined for an empty vector, so hand CityHash64 a valid
+    // (unread, zero-length) buffer in that case.
+    const char *bytes = v.empty() ? "" : reinterpret_cast<const char*>(v.data());
+    return CityHash64(bytes, sizeof(int) * v.size());
+  }
+};
+struct pair_eq {
+  bool operator()(const IntPair &p1, const IntPair &p2) const {
+    return p1 == p2;
+  }
+};
+// Hash functor for IntPair keys: mixes `first` shifted left by 4 with its
+// bits above position 28, then XORs in `second`.
+struct pair_hf {
+  size_t operator()(const IntPair &p) const {
+    // Mix in unsigned arithmetic: left-shifting a negative (or large) signed
+    // int is undefined, and right-shifting a negative one is
+    // implementation-defined.  For small non-negative inputs the result is
+    // the same value the signed expression produced.
+    const unsigned int a = static_cast<unsigned int>(p.first);
+    const unsigned int b = static_cast<unsigned int>(p.second);
+    return static_cast<size_t>((a << 4) ^ (a >> 28) ^ b);
+  }
+};
+struct str_eq {
+  bool operator()(const char *s1, const char *s2) const {
+    return strcmp(s1, s2) == 0;
+  }
+};
+struct str_hf {
+  size_t operator()(const char *s) const {
+    return CityHash64(s, strlen(s));
+  }
+};
+struct string_eq {
+  bool operator()(const string &s1, const string &s2) const {
+    return s1 == s2;
+  }
+};
+struct string_hf {
+  size_t operator()(const string &s) const {
+    return CityHash64(s.c_str(), s.size());
+  }
+};
+////////////////////////////////////////////////////////////
+typedef unordered_set<int> IntSet;
+typedef unordered_set<IntPair, pair_hf, pair_eq> IntPairSet;
+typedef unordered_set<IntVec, vector_hf, vector_eq> IntVecSet;
+typedef unordered_map<IntVec, real, vector_hf, vector_eq> IntVecDoubleMap;
+typedef unordered_map<IntVec, int, vector_hf, vector_eq> IntVecIntMap;
+typedef unordered_map<int, int> IntIntMap;
+typedef unordered_map<int, real> IntDoubleMap;
+typedef unordered_map<int, IntPair> IntIntPairMap;
+typedef unordered_map<int, IntVec> IntIntVecMap;
+typedef unordered_map<int, IntIntMap> IntIntIntMapMap;
+typedef unordered_map<IntPair, int, pair_hf, pair_eq> IntPairIntMap;
+typedef unordered_map<IntPair, real, pair_hf, pair_eq> IntPairDoubleMap;
+typedef unordered_map<IntPair, DoubleVec, pair_hf, pair_eq> IntPairDoubleVecMap;
+typedef unordered_map<IntVec, IntVec, vector_hf, vector_eq> IntVecIntVecMap;
+typedef unordered_map<IntVec, DoubleVec, vector_hf, vector_eq> IntVecDoubleVecMap;
+typedef vector<IntIntMap> IntIntMapVec;
+typedef vector<const char *> StrVec;
+typedef unordered_map<const char *, int, str_hf, str_eq> StrIntMap;
+typedef unordered_map<const char *, const char *, str_hf, str_eq> StrStrMap;
+#endif

basic/stl-utils.cc ADDED Viewed

	@@ -0,0 +1 @@


1	+ #include "stl-utils.h"

basic/stl-utils.h ADDED Viewed

	@@ -0,0 +1,232 @@

+#ifndef __STL_UTILS__
+#define __STL_UTILS__
+#include "stl-basic.h"
+#include <stdarg.h>
+#define contains(X, x) ((X).find(x) != (X).end())
+inline void improve(DoubleInt &x, const DoubleInt &y) {
+  if(y.first > x.first) x = y; // Bigger is better.
+}
+template<class Compare> inline void improve(DoubleInt &x, const DoubleInt &y, Compare compare) {
+  if(compare(y.first, x.first)) x = y;
+}
+// Free up the memory in a vector or hash_map.
+template<class T> void destroy(T &obj) {
+  T empty_obj;
+  obj.swap(empty_obj);
+}
+template<class T> int index_of(const vector<T> &vec, const T &x, int i0 = 0) {
+  for(int i = i0; i < len(vec); i++)
+    if(vec[i] == x) return i;
+  return -1;
+}
+template<class T> int count_of(const vector<T> &vec, const T &x) {
+  int n = 0;
+  forvec(_, const T &, y, vec)
+    if(x == y) n++;
+  return n;
+}
+// Get vec[i]; if i is past the end, first grow the vector to i+1 elements,
+// filling every newly created slot with x.  Existing elements are untouched.
+// i must be non-negative.
+template<class T> T &expand_get(vector<T> &vec, int i, const T &x) {
+  // resize(n, x) copy-constructs the new slots from x directly instead of
+  // default-constructing them and then overwriting each one.
+  if(i >= len(vec)) vec.resize(i+1, x);
+  return vec[i];
+}
+template<class T> T &expand_get(vector< vector<T> > &mat, int i, int j, const T &x) {
+  int n = len(mat);
+  if(i >= n) mat.resize(i+1);
+  return expand_get(mat[i], j, x);
+}
+template<class T> T &expand_get(vector< vector< vector<T> > > &mat, int i, int j, int k, const T &x) {
+  int n = len(mat);
+  if(i >= n) mat.resize(i+1);
+  return expand_get(mat[i], j, k, x);
+}
+// Assuming this vector/matrix will not grow any more,
+// we can safely call compact to reduce the memory usage.
+// This is only effective after deletions.
+// This isn't necessary if we haven't actually touched
+// the memory past size (i.e., we didn't have a bigger
+// structure).
+// Replace vec's storage with that of a freshly made copy, releasing any
+// spare capacity the old buffer was holding on to.
+template<class T> void vector_compact(vector<T> &vec) {
+  // Copy-construct directly; building a vector of len(vec) default elements
+  // first and then assigning over them did the work twice.
+  vector<T> new_vec(vec);
+  vec.swap(new_vec);
+}
+// Compact every row of mat and then the outer vector itself.
+template<class T> void matrix_compact(vector< vector<T> > &mat) {
+  // The row helper is vector_compact(); the previous call to an undeclared
+  // compact() made this template fail to compile when instantiated.
+  foridx(i, len(mat)) vector_compact(mat[i]);
+  vector< vector<T> > new_mat(mat);
+  mat.swap(new_mat);
+}
+// Append to a vector and return the value type.
+template<class T> inline T &push_back(vector<T> &vec, const T &x = T()) {
+  vec.push_back(x);
+  return vec[len(vec)-1];
+}
+template<class T> inline void matrix_resize(vector< vector<T> > &mat, int nr, int nc) {
+  mat.resize(nr);
+  foridx(r, nr) mat[r].resize(nc);
+}
+template<class T> inline void matrix_resize(vector< vector< vector<T> > > &mat, int n1, int n2, int n3) {
+  mat.resize(n1);
+  foridx(i, n1) {
+    mat[i].resize(n2);
+    foridx(j, n2)
+      mat[i][j].resize(n3);
+  }
+}
+template<class T> inline vector< vector<T> > new_matrix(int nr, int nc, T v) {
+  vector< vector<T> > mat;
+  mat.resize(nr);
+  foridx(r, nr) {
+    mat[r].resize(nc);
+    foridx(c, nc)
+      mat[r][c] = v;
+  }
+  return mat;
+}
+// Set every element of vec to v.
+template<class T> inline void vector_fill(vector<T> &vec, T v) {
+  foridx(i, len(vec)) vec[i] = v;
+}
+// Set every element of every row of mat to v.
+// vector_fill must be declared before this template: it is a global function
+// called on std::vector arguments, so argument-dependent lookup at
+// instantiation time would not find a later declaration.
+template<class T> inline void matrix_fill(vector< vector<T> > &mat, T v) {
+  foridx(i, len(mat)) vector_fill(mat[i], v);
+}
+template<class T> inline T vector_sum(const vector<T> &vec) {
+  T sum = 0;
+  foridx(i, len(vec)) sum += vec[i];
+  return sum;
+}
+// Returns the index of the minimum element in vec.
+template<class T> inline int vector_index_min(const vector<T> &vec) {
+  T min = vec[0];
+  int best_i = 0;
+  foridx(i, len(vec)) {
+    if(vec[i] < min) {
+      min = vec[i];
+      best_i = i;
+    }
+  }
+  return best_i;
+}
+// Returns the minimum element of vec (vec must be non-empty).
+// The result type is T rather than int so that non-int element types
+// (e.g. double) are returned without truncation.
+template<class T> inline T vector_min(const vector<T> &vec) {
+  return vec[vector_index_min(vec)];
+}
+// Returns the index of the maximum element in vec.
+template<class T> inline intIndex vector_index_max(const vector<T> &vec) {
+  T max = vec[0];
+  int best_i = 0;
+  forsidx(i, len(vec)) {
+    if(vec[i] > max) {
+      max = vec[i];
+      best_i = i;
+    }
+  }
+  return best_i;
+}
+// Returns the maximum element of vec (vec must be non-empty).
+// The result type is T rather than int so that non-int element types
+// (e.g. double) are returned without truncation.
+template<class T> inline T vector_max(const vector<T> &vec) {
+  return vec[vector_index_max(vec)];
+}
+// Returns the (row, column) index pair of the maximum element in mat.
+// Only a strictly greater element replaces the current best, so on ties the
+// first maximum in row-major order is returned.
+// mat[0][0] is read unconditionally: mat and its first row must be non-empty.
+template<class T> inline IntPair matrix_index_max(const vector< vector<T> > &mat) {
+  T max = mat[0][0];
+  IntPair best_ij = IntPair(0, 0);
+  forsidx(i, len(mat)) {
+    forsidx(j, len(mat[i])) {
+      if(mat[i][j] > max) {
+        max = mat[i][j];
+        best_ij = IntPair(i, j);
+      }
+    }
+  }
+  return best_ij;
+}
+// Returns the sum of the elements in column c.
+template<class T> inline T matrix_col_sum(const vector< vector<T> > &mat, int c) {
+  T sum = 0;
+  foridx(r, len(mat)) sum += mat[r][c];
+  return sum;
+}
+template<class T1, class T2> ostream &operator<<(ostream &out, const pair<T1, T2> &p) {
+  return out << p.first << ' ' << p.second;
+}
+template<class T> ostream &operator<<(ostream &out, const vector<T> &vec) {
+  foridx(i, len(vec)) {
+    if(i > 0) out << ' ';
+    out << vec[i];
+  }
+  return out;
+}
+template<class T> ostream &operator<<(ostream &out, const vector< vector<T> > &mat) {
+  foridx(r, len(mat)) out << mat[r] << endl;
+  return out;
+}
+// Returns a copy of vec[i..j) as a new vector.
+// A negative j counts back from the end (size is added to it), so the default
+// j = -1 stops one element short of the end.  If the resulting end is below
+// i, the result is empty.
+template<class T> vector<T> subvector(const vector<T> &vec, intIndex i, intIndex j = -1) {
+  const intIndex total = len(vec);
+  intIndex stop = (j < 0) ? j + total : j;
+  if(stop <= i) return vector<T>();
+  // Iterator-range construction copies the slice in one step.
+  return vector<T>(vec.begin() + i, vec.begin() + stop);
+}
+template<class T> vector<T> to_vector(T arr[], int n) {
+  vector<T> vec(n);
+  foridx(i, n) vec[i] = arr[i];
+  return vec;
+}
+inline IntVec to_vector(int n, ...) {
+  va_list ap;
+  IntVec vec;
+  va_start(ap, n);
+  foridx(i, n) vec.push_back(va_arg(ap, int));
+  va_end(ap);
+  return vec;
+}
+inline DoubleVec to_fvector(int n, ...) {
+  va_list ap;
+  DoubleVec vec;
+  va_start(ap, n);
+  foridx(i, n) vec.push_back(va_arg(ap, double));
+  va_end(ap);
+  return vec;
+}
+template<class T> inline void operator+=(vector<T> &vec1, const vector<T> &vec2) {
+  foridx(i, len(vec1)) vec1[i] += vec2[i];
+}
+#endif

basic/str-str-db.cc ADDED Viewed

	@@ -0,0 +1,35 @@

+#include "str-str-db.h"
+#include "std.h"
+#include "str.h"
+#include "strdb.h"
+StrStrDB::~StrStrDB() {
+  destroy_strings(s2t);
+}
+// Load the s -> t mapping from `file`, adding to whatever s2t already holds.
+// File format: lines of <t>\t<s>\t<...junk...>
+// Note the column order: the first field is the value (t), the second is the
+// key (s).  Both are duplicated with copy_str() before being stored.
+// Asserts that the file opens, that every line has both fields, and that no
+// key s appears twice.  Resets max_t_len and leaves it at the longest t seen.
+void StrStrDB::read(const char *file) {
+  track("StrStrDB::read()", file, true);
+  char buf[1024];
+  ifstream in(file);
+  assert2(in, file);
+  // Read the s2t for each word.
+  max_t_len = 0;
+  // NOTE(review): istream::getline sets failbit when a line does not fit in
+  // buf, so a line of 1024+ characters ends this loop without any diagnostic.
+  while(in.getline(buf, sizeof(buf))) {
+    // strtok writes NULs into buf; t and s point into it until the next getline.
+    char *t = strtok(buf, "\t");
+    char *s = strtok(NULL, "\t");
+    assert(s && t);
+    assert2(!contains(s2t, s), s << " appears twice");
+    s2t[copy_str(s)] = copy_str(t);
+    max_t_len = max(max_t_len, (int)strlen(t));
+  }
+  logs("Read " << len(s2t) << " strings");
+  logs("Longest mapped string is " << max_t_len << " characters.");
+}
+const char *StrStrDB::operator[](const char *word) const {
+  StrStrMap::const_iterator it = s2t.find(word);
+  return it == s2t.end() ? "" : it->second;
+}

basic/str-str-db.h ADDED Viewed

	@@ -0,0 +1,19 @@

+#ifndef __STR_STR_DB_H__
+#define __STR_STR_DB_H__
+#include "stl-basic.h"
+// Maps strings (s) to strings (t).
+class StrStrDB {
+public:
+  // Start with an empty map; max_t_len stays 0 until read() sees a mapping,
+  // instead of holding an indeterminate value.
+  StrStrDB() : max_t_len(0) { }
+  ~StrStrDB();
+  // Load mappings from a tab-separated file (see the definition for the format).
+  void read(const char *file);
+  // Look up the string mapped to s.
+  const char *operator[](const char *s) const;
+  // Length of the longest mapped string (t) seen by the last read().
+  int max_t_len;
+private:
+  StrStrMap s2t;
+};
+#endif

basic/str.cc ADDED Viewed

	@@ -0,0 +1,91 @@

+#include "stl-basic.h"
+#include <stdarg.h>
+string substr(const string &s, int i, int j) {
+  if(i < 0) i += len(s);
+  if(j < 0) j += len(s);
+  i = max(i, 0);
+  j = max(j, i);
+  return s.substr(i, j-i);
+}
+string substr(const string &s, int i) {
+  return substr(s, i, len(s));
+}
+string str_printf(const char *fmt, ...) {
+  char buf[16384];
+  va_list ap;
+  va_start(ap, fmt);
+  vsnprintf(buf, sizeof(buf), fmt, ap);
+  va_end(ap);
+  return buf;
+}
+char *copy_str(const char *s) {
+  char *t = new char[strlen(s)+1];
+  strcpy(t, s);
+  return t;
+}
+string int2str(int x) {
+  return str_printf("%d", x);
+}
+string double2str(double x) {
+  ostringstream os;
+  os << x;
+  return os.str();
+}
+// Split str at any character found in delims and return the pieces in order.
+// When keep_empty is false, zero-length pieces (adjacent delimiters, or a
+// delimiter at either end) are dropped; otherwise they are kept as "".
+StringVec split(const char *str, const char *delims, bool keep_empty) {
+  StringVec vec; // Store the result.
+  // Build quick lookup table, indexed by byte value.  Plain char may be
+  // signed, so bytes >= 0x80 (e.g. UTF-8 text) must be converted to
+  // unsigned char before indexing or they become negative subscripts.
+  BoolVec is_delim(256);
+  for(const char *p = delims; *p; p++) is_delim[(unsigned char)*p] = true;
+  is_delim['\0'] = true; // The terminator ends the final token.
+  const char *end = str;
+  while(true) {
+    if(is_delim[(unsigned char)*end]) {
+      if(keep_empty || end-str > 0) // Extract token.
+        vec.push_back(string(str, end-str));
+      str = end+1;
+    }
+    if(!*end) break;
+    end++;
+  }
+  return vec;
+}
+StrVec mutate_split(char *str, const char *delims) {
+  StrVec vec;
+  for(char *p = strtok(str, delims); p; p = strtok(NULL, delims))
+    vec.push_back(p);
+  return vec;
+}
+// Remove leading and trailing white space.
+// Works in place: returns a pointer to the first non-space character inside
+// the caller's buffer and writes a NUL just past the last non-space one.
+// An empty or all-white-space string yields an empty string.
+char *trim(char *s) {
+  // Skip leading spaces.  isspace() requires an unsigned char value.
+  while(*s && isspace((unsigned char)*s)) s++;
+  // Walk back from the terminator over trailing spaces.  `end` never moves
+  // below s, so an empty remainder no longer reads or writes before the text.
+  char *end = s + strlen(s);
+  while(end != s && isspace((unsigned char)end[-1])) end--;
+  *end = '\0';
+  return s;
+}
+// Returns a copy of s with every character passed through the C tolower().
+string tolower(const char *s) {
+  string t = s;
+  // The C tolower() is only defined for unsigned char values (or EOF); plain
+  // char may be negative for bytes >= 0x80, so convert before the call.
+  foridx(i, len(t)) t[i] = tolower((unsigned char)t[i]);
+  return t;
+}
+// Offset of the first occurrence of t inside s, or -1 if t does not occur.
+// An empty t matches at offset 0.
+int index_of(const char *s, const char *t) {
+  const char *hit = strstr(s, t);
+  return hit ? (int)(hit - s) : -1;
+}

basic/str.h ADDED Viewed

	@@ -0,0 +1,22 @@

+#ifndef __STR_H__
+#define __STR_H__
+#include "stl-basic.h"
+string substr(const string &s, int i, int j);
+string substr(const string &s, int i);
+string str_printf(const char *fmt, ...);
+char *copy_str(const char *s);
+string int2str(int x);
+string double2str(double x);
+StringVec split(const char *str, const char *delims, bool keep_empty);
+StrVec mutate_split(char *str, const char *delims);
+char *trim(char *s);
+string tolower(const char *s);
+int index_of(const char *s, const char *t);
+#endif

basic/strdb.cc ADDED Viewed

	@@ -0,0 +1,209 @@

+#include "strdb.h"
+#include "str.h"
+void destroy_strings(StrVec &vec) {
+  foridx(i, len(vec))
+    delete [] vec[i];
+}
+void destroy_strings(StrStrMap &map) {
+  typedef const char *const_char_ptr;
+  StrVec strs;
+  formap(const_char_ptr, s, const_char_ptr, t, StrStrMap, map) {
+    strs.push_back(s);
+    strs.push_back(t);
+  }
+  destroy_strings(strs);
+}
+////////////////////////////////////////////////////////////
+// Clear the database, then read up to N whitespace-separated strings from in.
+// With one_way set, strings are only appended to i2s (no s2i entry is made);
+// otherwise each string goes through operator[], which adds it to both maps
+// if it is new.  Returns the number of strings held afterwards.
+int StrDB::read(istream &in, int N, bool one_way) {
+  char s[16384];
+  clear();
+  // setw caps each extraction at sizeof(s)-1 characters plus the terminator;
+  // without it a longer token would be written past the end of s.  The width
+  // resets after every extraction, so it is re-applied on each iteration.
+  while(size() < N && in >> setw(sizeof(s)) >> s) {
+    if(one_way) i2s.push_back(copy_str(s));
+    else (*this)[s];
+  }
+  logs(size() << " strings read");
+  return size();
+}
+int StrDB::read(const char *file, bool one_way) {
+  track("StrDB::read()", file << ", one_way=" << one_way, true);
+  ifstream in(file);
+  assert(in);
+  return read(in, INT_MAX, one_way);
+}
+void StrDB::write(ostream &out) {
+  foridx(i, size())
+    out << i2s[i] << endl;
+  logs(size() << " strings written");
+}
+void StrDB::write(const char *file) {
+  track("StrDB::write()", file, true);
+  ofstream out(file);
+  write(out);
+}
+const char *StrDB::operator[](int i) const {
+  assert(i >= 0 && i < len(i2s));
+  return i2s[i];
+}
+int StrDB::lookup(const char *s, bool incorp_new, int default_i) {
+  StrIntMap::const_iterator it = s2i.find(s);
+  if(it != s2i.end()) return it->second;
+  if(incorp_new) {
+    char *t = copy_str(s);
+    int i = s2i[t] = len(i2s);
+    i2s.push_back(t);
+    return i;
+  }
+  else
+    return default_i;
+}
+IntVec StrDB::lookup(const StrVec &svec) {
+  IntVec ivec(len(svec));
+  foridx(i, len(svec))
+    ivec[i] = lookup(svec[i], true, -1);
+  return ivec;
+}
+int StrDB::operator[](const char *s) const {
+  StrIntMap::const_iterator it = s2i.find(s);
+  if(it != s2i.end()) return it->second;
+  return -1;
+}
+int StrDB::operator[](const char *s) {
+  return lookup(s, true, -1);
+}
+ostream &operator<<(ostream &out, const StrDB &db) {
+  foridx(i, len(db)) out << db[i] << endl;
+  return out;
+}
+////////////////////////////////////////////////////////////
+int IntPairIntDB::lookup(const IntPair &p, bool incorp_new, int default_i) {
+  IntPairIntMap::const_iterator it = p2i.find(p);
+  if(it != p2i.end()) return it->second;
+  if(incorp_new) {
+    int i = p2i[p] = len(i2p);
+    i2p.push_back(p);
+    return i;
+  }
+  else
+    return default_i;
+}
+int IntPairIntDB::read(istream &in, int N) {
+  assert(size() == 0);
+  int a, b;
+  while(size() < N && in >> a >> b)
+    (*this)[IntPair(a, b)];
+  return size();
+}
+void IntPairIntDB::write(ostream &out) {
+  forvec(_, const IntPair &, p, i2p)
+    out << p.first << ' ' << p.second << endl;
+}
+////////////////////////////////////////////////////////////
+int IntVecIntDB::lookup(const IntVec &v, bool incorp_new, int default_i) {
+  IntVecIntMap::const_iterator it = v2i.find(v);
+  if(it != v2i.end()) return it->second;
+  if(incorp_new) {
+    int i = v2i[v] = len(i2v);
+    i2v.push_back(v);
+    return i;
+  }
+  else
+    return default_i;
+}
+////////////////////////////////////////////////////////////
+// A text is basically a string of words.
+// Normally, we just read the strings from file, put them in db,
+// and call back func.
+// But if the db already exists and the strings have been converted
+// into integers (i.e., <file>.{strdb,int} exist), then use those.
+// If incorp_new is false, then words not in db will just get passed -1.
+typedef void int_func(int a);
+// Stream the words of `file` through `func` as integer ids from `db`.
+//   file         - text file of whitespace-separated words.
+//   func         - callback invoked once per word id; may be NULL.
+//   db           - string <-> id database; must be empty if the cache is used.
+//   read_cached  - allow reading <file>.strdb / <file>.int instead of the text.
+//   write_cached - when reading the text, also write those two cache files.
+//   incorp_new   - add unseen words to db; if false they are passed as -1.
+// NOTE(review): ids are cached verbatim, including -1, while the cached-read
+// path asserts every id is in [0, db.size()).
+void read_text(const char *file, int_func *func, StrDB &db, bool read_cached, bool write_cached, bool incorp_new) {
+  track("read_text()", file, true);
+  string strdb_file = string(file)+".strdb";
+  string int_file = string(file)+".int";
+  // Use the cached strdb and int files only if they exist and they are
+  // newer than the text file.
+  read_cached &= file_exists(strdb_file.c_str()) &&
+                 file_exists(int_file.c_str()) &&
+                 file_modified_time(strdb_file.c_str()) > file_modified_time(file) &&
+                 file_modified_time(int_file.c_str()) > file_modified_time(file);
+  if(read_cached) {
+    // Read from strdb and int.
+    assert(db.size() == 0); // db must be empty because we're going to clobber it all
+    db.read(strdb_file.c_str(), true);
+    track_block("", "Reading from " << int_file, false) {
+      // The .int file is raw ints, so open it in binary mode.
+      ifstream in(int_file.c_str(), ios::binary);
+      char buf[16384];
+      while(true) {
+        in.read(buf, sizeof(buf));
+        const streamsize n_read = in.gcount();
+        if(n_read == 0) break;
+        assert(n_read % sizeof(int) == 0);
+        // Step by sizeof(int) (matching the assert above and the writer)
+        // rather than a hard-coded 4.
+        for(streamsize buf_i = 0; buf_i < n_read; buf_i += sizeof(int)) {
+          // memcpy avoids reading an int through a char buffer pointer.
+          int a;
+          memcpy(&a, buf+buf_i, sizeof(a));
+          assert(a >= 0 && a < db.size());
+          if(func) func(a); // func is optional here too, as in the text path.
+        }
+      }
+    }
+  }
+  else {
+    track_block("", "Reading from " << file, false) {
+      // Write to strdb and int.
+      ifstream in(file);
+      ofstream out;
+      if(write_cached) {
+        out.open(int_file.c_str(), ios::binary);
+        if(!out) write_cached = false; // Cannot cache; still process the text.
+      }
+      if(write_cached) logs("Writing to " << int_file);
+      char s[16384];
+      char buf[16384]; int buf_i = 0; // Output buffer
+      // setw bounds each extraction to the size of s so that an over-long
+      // token cannot overflow it.
+      while(in >> setw(sizeof(s)) >> s) { // Read a string
+        int a = db.lookup(s, incorp_new, -1);
+        if(func) func(a);
+        if(write_cached) {
+          if(buf_i + sizeof(a) > sizeof(buf)) { // Flush buffer if full
+            out.write(buf, buf_i);
+            buf_i = 0;
+          }
+          memcpy(buf+buf_i, &a, sizeof(a));
+          buf_i += sizeof(a);
+        }
+      }
+      if(write_cached) // Final flush
+        out.write(buf, buf_i);
+    }
+    if(write_cached && create_file(strdb_file.c_str()))
+      db.write(strdb_file.c_str());
+  }
+}

basic/strdb.h ADDED Viewed

	@@ -0,0 +1,101 @@

+#ifndef __STRDB_H__
+#define __STRDB_H__
+#include "std.h"
+#include "stl-basic.h"
+#include "stl-utils.h"
+#include "logging.h"
+void destroy_strings(StrVec &vec);
+void destroy_strings(StrStrMap &map);
+// Map between strings and integers.
+// Strings must not have spaces in them.
+// File format: strings, one per line.  Assume strings are distinct.
+// i2s maps id -> string and s2i maps string -> id.
+// NOTE(review): the destructor delete[]s every pointer in i2s and no copy
+// constructor or assignment operator is declared, so copying a StrDB would
+// free the same strings twice.
+struct StrDB {
+  StrDB() { }
+  ~StrDB() { destroy_strings(); }
+  // Replace the contents with up to n strings read from a stream / a file.
+  // one_way: only fill i2s (id -> string), leaving s2i empty.
+  int read(istream &in, int n, bool one_way);
+  int read(const char *file, bool one_way);
+  // Write the strings in id order, one per line.
+  void write(ostream &out);
+  void write(const char *file);
+  intIndex size() const   { return len(i2s); }
+  // clear(): free the strings and empty both maps.
+  // destroy(): as clear(), but swaps the containers with empty ones as well.
+  void clear()       { destroy_strings(); i2s.clear(); s2i.clear(); }
+  void destroy()     { destroy_strings(); ::destroy(i2s); ::destroy(s2i); }
+  // Drop only the string -> id map; i2s and the strings stay.
+  void destroy_s2i() { ::destroy(s2i); }
+  // Empty both maps without freeing the strings they point to.
+  void clear_keep_strings() { i2s.clear(); s2i.clear(); }
+  // id -> string (asserts that i is in range).
+  const char *operator[](int i) const;
+  // string -> id.  The const overload returns -1 for an unknown string; the
+  // non-const overload adds the string and returns its new id.
+  int operator[](const char *s) const;
+  int operator[](const char *s);
+  // string -> id; if s is unknown, add it when incorp_new is set, otherwise
+  // return default_i.
+  int lookup(const char *s, bool incorp_new, int default_i);
+  // Look up every string of svec, adding the unknown ones.
+  IntVec lookup(const StrVec &svec);
+  bool exists(const char *s) const { return s2i.find(s) != s2i.end(); }
+  // /usr/bin/top might not show the memory reduced.
+  // Frees every string in i2s but leaves the (now dangling) pointers in
+  // place; clear() and destroy() empty the containers right afterwards.
+  void destroy_strings() { ::destroy_strings(i2s); }
+  StrVec i2s;
+  StrIntMap s2i;
+};
+ostream &operator<<(ostream &out, const StrDB &db);
+////////////////////////////////////////////////////////////
+// Map between IntPairs and ints.
+struct IntPairIntDB {
+  IntPair operator[](int i) const { return i2p[i]; }
+  int operator[](const IntPair &p) { return lookup(p, true, -1); }
+  int lookup(const IntPair &p, bool incorp_new, int default_i);
+  intIndex size() const { return len(i2p); }
+  int read(istream &in, int N);
+  void write(ostream &out);
+  IntPairIntMap p2i;
+  IntPairVec i2p;
+};
+////////////////////////////////////////////////////////////
+// Map between IntVecs and ints.
+struct IntVecIntDB {
+  const IntVec &operator[](int i) const { return i2v[i]; }
+  int operator[](const IntVec &v) { return lookup(v, true, -1); }
+  int lookup(const IntVec &v, bool incorp_new, int default_i);
+  intIndex size() const { return len(i2v); }
+  IntVecIntMap v2i;
+  IntVecVec i2v;
+};
+////////////////////////////////////////////////////////////
+#if 0
+// Map between IntArrays and ints.  Arrays terminate with -1.
+struct IntArrayIntDB {
+  int *operator[](int i) const { return i2a[i]; }
+  int operator[](const IntArray &a) { return lookup(a, true, -1); }
+  int lookup(const IntArray &a, bool incorp_new, int default_i);
+  int size() const { return len(i2a); }
+  int read(istream &in, int N);
+  void write(ostream &out);
+  hash_map<int *, int, intarray_hf, intarray_eq> p2i;
+  vector<int *> i2a;
+};
+#endif
+////////////////////////////////////////////////////////////
+typedef void int_func(int a);
+void read_text(const char *file, int_func *func, StrDB &db, bool read_cached, bool write_cached, bool incorp_new);
+#endif

basic/timer.cc ADDED Viewed

	@@ -0,0 +1,11 @@

+#include "timer.h"
+ostream &operator<<(ostream &out, const Timer &timer) {
+  int ms = timer.ms;
+  int m = ms / 60000; ms %= 60000;
+  int h = m / 60; m %= 60;
+  if(h > 0) out << h << 'h';
+  if(h > 0 || m > 0) out << m << 'm';
+  out << ms/1000.0 << 's';
+  return out;
+}

basic/timer.h ADDED Viewed

	@@ -0,0 +1,35 @@

+#ifndef __TIMER_H__
+#define __TIMER_H__
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <iostream>
+using namespace std;
+// Wall-clock stopwatch with millisecond resolution, based on gettimeofday().
+// Call start(), then stop(); the elapsed time is left in `ms`.
+struct Timer {
+  // ms starts at 0 so that a timer which was never stopped prints as 0s
+  // instead of an indeterminate value.
+  Timer() : ms(0) { }
+  Timer(int ms) : ms(ms) { }
+  void start() { gettimeofday(&start_time, NULL); }
+  // Record the end time, store the elapsed milliseconds in ms and return
+  // *this so the result can be streamed directly.
+  Timer &stop() {
+    gettimeofday(&end_time, NULL);
+    // Subtract the seconds before scaling: epoch time in milliseconds does
+    // not fit in an int, so converting each endpoint with to_ms() first and
+    // subtracting afterwards relied on integer wrap-around.
+    ms = (int)((long long)(end_time.tv_sec - start_time.tv_sec)*1000 +
+               (end_time.tv_usec/1000 - start_time.tv_usec/1000));
+    return *this;
+  }
+  // Milliseconds represented by tv.  Kept for existing callers; only
+  // meaningful for small tv values, since the result is narrowed to int.
+  static int to_ms(const timeval &tv) { return tv.tv_sec*1000 + tv.tv_usec/1000; }
+  timeval start_time;
+  timeval end_time;
+  int ms; // Elapsed milliseconds measured by the last stop().
+};
+ostream &operator<<(ostream &out, const Timer &timer);
+#endif

basic/union-set.cc ADDED Viewed

	@@ -0,0 +1,29 @@

+#include "union-set.h"
+void UnionSet::Init(int n) {
+  parent.resize(n);
+  for(int v = 0; v < n; v++)
+    parent[v] = v;
+}
+// return whether u and v are in the same connected component;
+// connect them if they aren't
+bool UnionSet::Do(int u, int v, bool doit) {
+  int ru = GetRoot(u);
+  int rv = GetRoot(v);
+  if(ru == rv) return true;
+  if(doit) parent[ru] = rv;
+  return false;
+}
+// Return the representative of the set containing v, and point every node
+// visited on the way directly at that representative.
+int UnionSet::GetRoot(int v) {
+  // Pass 1: climb parent links until reaching a node that is its own parent.
+  int root = v;
+  for(; parent[root] != root; root = parent[root]) { }
+  // Pass 2: walk the same path again, re-parenting each node to the root.
+  for(int cur = v; cur != root; ) {
+    const int next = parent[cur];
+    parent[cur] = root;
+    cur = next;
+  }
+  return root;
+}

basic/union-set.h ADDED Viewed

	@@ -0,0 +1,22 @@

+#ifndef __UNION_SET_H__
+#define __UNION_SET_H__
+#include <vector>
+using namespace std;
+struct UnionSet {
+  UnionSet() { }
+  UnionSet(int n) { Init(n); }
+  void Init(int n);
+  bool Join(int u, int v) { return Do(u, v, true); }
+  bool InSameSet(int u, int v) { return Do(u, v, false); }
+  bool Do(int u, int v, bool doit);
+  int GetRoot(int v);
+  vector<int> parent;
+};
+#endif

cluster-viewer/LICENSE ADDED Viewed

	@@ -0,0 +1,22 @@

+The MIT License (MIT)
+Copyright (c) 2014 Chris Dyer and Brendan O'Connor
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

cluster-viewer/README.md ADDED Viewed

	@@ -0,0 +1,26 @@

+This code generates an HTML viewer for the clustering tree generated, similar to [this clustering of the words in a corpus of English Twitter data](http://www.ark.cs.cmu.edu/TweetNLP/cluster_viewer.html).
+## Instructions
+The `wcluster` tool generates a directory with a file called `paths` that contains the bit string representations of the clustering tree, e.g.
+        000000  Westfalenpokalfinale    10
+        000000  Heimpunktspiel  10
+        000000  Jugendhallenturnier     10
+        ...
+The script `cluster-viewer/build-viewer.sh` creates an HTML visualization of the contents of this file. You can run it as follows:
+	./cluster-viewer/build-viewer.sh corpus.out/paths
+This command creates a directory called `clusters/` containing the HTML viewer. Specify an alternative directory as follows:
+	./cluster-viewer/build-viewer.sh corpus.out/paths /some/other/output-dir
+## Requirements
+ * Python 3 must be in your path as `python` (the scripts use the Python 3 `print(..., file=f)` syntax)
+## Acknowledgements
+These scripts were originally written by [Brendan O'Connor](http://brenocon.com/) and extended by [Chris Dyer](http://www.cs.cmu.edu/~cdyer/).

cluster-viewer/build-viewer.sh ADDED Viewed

	@@ -0,0 +1,32 @@

+#!/bin/bash
+set -e
+CODEDIR=`dirname $0`/code
+if [ "$#" -lt "1" ] || [ "$#" -gt "2" ]
+then
+  echo "Usage: $0 path/to/clusters.out/paths [outdir]" 1>&2
+  echo 1>&2
+  echo "Builds an HTML cluster viewer." 1>&2
+  echo 1>&2
+  exit
+fi
+MAPFILE=$1
+CATCMD=cat
+if [[ "$MAPFILE" == *.gz ]]
+then
+  CATCMD='gunzip -c'
+fi
+OUTDIR=clusters
+if [ $# -eq 2 ]
+then
+  OUTDIR=$2
+fi
+echo "Creating output in $OUTDIR ..." 1>&2
+mkdir -p $OUTDIR
+mkdir -p $OUTDIR/paths
+$CATCMD $MAPFILE | python $CODEDIR/make_html.py $CODEDIR $OUTDIR > $OUTDIR/htmlrows.html
+python $CODEDIR/final.py $CODEDIR $OUTDIR > $OUTDIR/cluster_viewer.html
+echo "Done. View clusters in $OUTDIR/cluster_viewer.html" 1>&2

cluster-viewer/code/final.py ADDED Viewed

	@@ -0,0 +1,8 @@

+import sys
+template = open(sys.argv[1] + '/template.html').read()
+final = template
+final = final.replace('STYLE', open(sys.argv[1] + '/style.css').read())
+htmlrows = open(sys.argv[2] + '/htmlrows.html').read()
+final = final.replace('TABLE', htmlrows)
+print(final)

cluster-viewer/code/htmlrows.html ADDED Viewed

	@@ -0,0 +1,18 @@

+    <tr>
+    <td class=path>^<a target=_blank href="paths/000000.html">000000</a> <span class=count>(3)</span>
+    <td class=words><span class=w>Westfalenpokalfinale</span> <span class=w>Heimpunktspiel</span> <span class=w>Jugendhallenturnier</span>
+</tr>
+    <tr>
+    <td class=path>^<a target=_blank href="paths/0000010.html">0000010</a> <span class=count>(3)</span>
+    <td class=words><span class=w>Friesendorf</span> <span class=w>Fallenstellen</span> <span class=w>Strafjustizsystem</span>
+</tr>
+    <tr>
+    <td class=path>^<a target=_blank href="paths/00000110.html">00000110</a> <span class=count>(3)</span>
+    <td class=words><span class=w>Gewerbeflächenkonzept</span> <span class=w>Musikprotokoll</span> <span class=w>Familienbetreuungszentrum</span>
+</tr>

cluster-viewer/code/make_html.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import sys,itertools
+style = open(sys.argv[1] + '/style.css').read()
+def get_word_rows():
+    for line in sys.stdin:
+        path, word, count = line.split('\t')
+        count = int(count)
+        yield path,word,count
+def get_cluster_rows():
+    for path, rows in itertools.groupby(get_word_rows(), key=lambda x: x[0]):
+        wordcounts = [(w,c) for _,w,c in rows]
+        wordcounts.sort(key=lambda w_c: -w_c[1])
+        yield path, len(wordcounts), wordcounts[:50], wordcounts
+def htmlescape(s):
+    return s.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
+def wc_table(wordcounts, tdword=''):
+    r = ['<table>']
+    for i,(w,c) in enumerate(wordcounts):
+        r.append('<tr><td>{} <td class="{}">{} <td class=tdcount>{:,}'.format(i+1, tdword, htmlescape(w), c))
+    r.append('</table>')
+    return '\n'.join(r)
+def top(wc, th):
+  cutoff = int(wc[0][1] * th)
+  res = []
+  for (w,c) in wc:
+    if c > cutoff: res.append((w,c))
+  return res
+for path, nwords, wordcounts, allwc in get_cluster_rows():
+    # wc1 = ' '.join("<span class=w>{w}</span>&thinsp;<span class=c>[{c}]</span>".format(
+    #     w=htmlescape(w), c=c) for w,c in wordcounts)
+    wc1 = ' '.join("<span class=w>{w}</span>".format(
+        w=htmlescape(w)) for w,c in top(wordcounts, 0.01))
+    print("""
+    <tr>
+    <td class=path>^<a target=_blank href="paths/{path}.html">{path}</a> <span class=count>({nwords})</span>
+    <td class=words>{wc}
+    """.format(path=path, nwords=nwords, wc=wc1))
+    print("</tr>")
+    with open(sys.argv[2] + '/paths/{path}.html'.format(**locals()),'w') as f:
+        print("""<style>{style}</style>""".format(**locals()), file=f)
+        print("""<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">""", file=f)
+        print("<a href=../cluster_viewer.html>back to cluster viewer</a>", file=f)
+        print("<h1>cluster path {path}</h1>".format(path=path), file=f)
+        print("{n:,} words, {t:,} tokens".format(n=nwords, t=sum(c for w,c in allwc)), file=f)
+        print("<a href='#freq'>freq</a> <a href='#alpha'>alpha</a> <a href='#suffix'>suffix</a>", file=f)
+        print("<a name=freq><h2>Words in frequency order</h2></a>", file=f)
+        allwc.sort(key=lambda w_c: (-w_c[1],w_c[0]))
+        print(wc_table(allwc), file=f)
+        # wc1 = ' '.join("<span class=w>{w}</span>&nbsp;<span class=c>({c})</span>".format(
+        #     w=htmlescape(w), c=c) for w,c in allwc)
+        # print>>f, wc1
+        print("<a name=alpha><h2>Words in alphabetical order</h2></a>", file=f)
+        allwc.sort(key=lambda w_c1: (w_c1[0],-w_c1[1]))
+        print(wc_table(allwc), file=f)
+        print("<a name=suffix><h2>Words in suffix order</h2></a>", file=f)
+        allwc.sort(key=lambda w_c2: (list(reversed(w_c2[0])),-w_c2[1]))
+        print(wc_table(allwc, tdword='suffixsort'), file=f)
+        # wc1 = ' '.join("<span class=w>{w}</span>&nbsp;<span class=c>({c})</span>".format(
+        #     w=htmlescape(w), c=c) for w,c in allwc)
+        # print>>f, wc1

cluster-viewer/code/style.css ADDED Viewed

	@@ -0,0 +1,9 @@

+/* Styles shared by the cluster viewer index and the per-cluster pages. */
+table { border-collapse:collapse; border-spacing:0; }
+body { font-family: times; font-size: 11pt; }
+td { border: 1px solid gray; padding:2px 8px; }
+th { border: 1px solid gray; padding:2px 8px; }
+/* "solid gray" is not a valid value for the color property and the whole
+   declaration was being dropped; a plain colour keyword is what applies. */
+.count { font-size:9pt; color: gray; }
+.c { font-size:7pt; color: gray; }
+/* Right-align the numeric count column and the suffix-sorted word column. */
+.tdcount { text-align:right }
+.info { font-size: 12pt; }
+.suffixsort { text-align: right }

cluster-viewer/code/template.html ADDED Viewed

	@@ -0,0 +1,22 @@

+<html>
+  <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
+<!-- Page skeleton for the cluster viewer index. It contains two all-caps
+     placeholder words that are filled in by plain string replacement, so
+     neither word may appear anywhere else in this file, comments included.
+     The first one sits alone inside the style element below; presumably the
+     stylesheet text is substituted there (that step is not visible in the
+     part of final.py reviewed here, so confirm). -->
+<style>
+STYLE
+</style>
+<h1>Word cluster viewer</h1>
+<div class=info>
+Word cluster viewer.
+</div>
+<p>
+<!-- The second placeholder, after the header row, is replaced by final.py
+     with the contents of htmlrows.html (one row per cluster). -->
+<table>
+  <tr>
+    <th>Cluster path (and word type count)
+    <th>Words (most frequent)
+  </tr>
+  TABLE
+</table>
+</html>

input.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+the cat chased the mouse
+the dog chased the cat
+the mouse chased the dog

output.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+0	the	6
+10	chased	3
+110	dog	2
+1110	mouse	2
+1111	cat	2