meta-toolkit
Showing 325 changed files with 9,416 additions and 2,486 deletions.
diff --git a/.appveyor.yml b/.appveyor.yml
@@ -8,9 +8,11 @@ install:
     - set MSYSTEM=MINGW64
     - bash -lc ""
     - bash -lc "pacman --noconfirm --needed -Sy bash pacman pacman-mirrors msys2-runtime msys2-runtime-devel"
+    # kludge for error: "mingw-w64-x86_64-gcc: /mingw64/etc/gdbinit exists in filesystem"
+    - bash -lc "rm -f /mingw64/etc/gdbinit"
     # we don't actually need ada, fortran, libgfortran, or objc, but in
     # order to update gcc we need to also update those packages as well...
-    - bash -lc "pacman --noconfirm -S mingw-w64-x86_64-{gcc,gcc-ada,gcc-fortran,gcc-libgfortran,gcc-objc,cmake,make,icu,jemalloc,zlib}"
+    - bash -lc "pacman --noconfirm --needed -S mingw-w64-x86_64-{gcc,gcc-ada,gcc-fortran,gcc-libgfortran,gcc-objc,cmake,make,icu,jemalloc,zlib}"
 before_build:
     - set MSYSTEM=MINGW64
     - cd C:\projects\meta

diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,7 @@ doc/
 data/ceeaus
 data/breast-cancer
 data/housing
+data/cranfield
 biicode.conf
 bii/
 bin/
diff --git a/.travis.yml b/.travis.yml
@@ -65,40 +65,55 @@ matrix:
             - gcc-6
             - g++-6
 
-    # Linux/Clang 3.6
+    # Linux/Clang 3.8
     - os: linux
-      env: COMPILER=clang CLANG_VERSION=3.6
+      env: COMPILER=clang CLANG_VERSION=3.8
       addons:
         apt:
           sources:
             - ubuntu-toolchain-r-test
-            - llvm-toolchain-precise-3.6
+            - llvm-toolchain-precise-3.8
           packages:
             - *default-packages
-            - clang-3.6
-            - llvm-3.6-dev
+            - clang-3.8
 
-    # OS X 10.9 + Xcode 6.1
-    - os: osx
-      env: COMPILER=clang
+    # Linux/Clang 3.8 + libc++-3.9
+    # (I want this to be 3.9 across the board, but the apt source is not
+    # yet whitelisted for llvm 3.9)
+    - os: linux
+      env:
+        - COMPILER=clang
+        - CLANG_VERSION=3.8
+        - LLVM_TAG=RELEASE_390
+        - LIBCXX_EXTRA_CMAKE_FLAGS=-DLIBCXX_INSTALL_EXPERIMENTAL_LIBRARY=On
+        - CMAKE_VERSION=3.4.3
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+            - llvm-toolchain-precise-3.8
+          packages:
+            - *default-packages
+            - clang-3.8
 
-    # OS X 10.10 + Xcode 6.4
+    # OS X 10.10 + Xcode 6.4 (this is the only 10.10 image on Travis)
     - os: osx
       osx_image: xcode6.4
       env: COMPILER=clang
 
-    # OS X 10.10 + Xcode 7.1.1
+    # OS X 10.11 + Xcode 7.3
     - os: osx
-      osx_image: xcode7.1
+      osx_image: xcode7.3
       env: COMPILER=clang
 
-    # OS X 10.11 + Xcode 7.2
+    # OS X 10.12 + Xcode 8.2
     - os: osx
-      osx_image: xcode7.2
+      osx_image: xcode8.2
       env: COMPILER=clang
 
-    # OS X/GCC 6
+    # OS X/GCC 7
     - os: osx
+      osx_image: xcode7.3
       env: COMPILER=gcc
 
 install:
@@ -112,5 +127,4 @@ before_script:
 
 script:
   - git submodule update --init --recursive
-  - ../travis/cmake.sh Debug && make -j2 && make clean
-  - rm -rf CMake* && ../travis/cmake.sh Release && make -j2 && ./unit-test --reporter=spec
+  - ../travis/cmake.sh Debug && make -j2 && make clean && rm -rf CMake* && ../travis/cmake.sh Release && make -j2 && ./unit-test --reporter=spec
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,205 @@
+# [v3.0.2][3.0.2]
+## Bug fixes
+- Fix issues using `MAKE_NUMERIC_IDENTIFIER` instead of
+  `MAKE_NUMERIC_IDENTIFIER_UDL` on GCC 7.1.1.
+- Work around (what we assume is) a bug on MSYS2 where `cmake` would link
+  in additional exception handling libraries that would cause a crash
+  during indexing by building the `mman-win32` library as shared.
+- Silence fallthrough warnings on Clang from `murmur_hash`.
+
+# [v3.0.1][3.0.1]
+## New features
+- Add an optional `xz{i,o}fstream` to `meta::io` if compiled with liblzma
+  available.
+- `util::disk_vector<const T>` can now be used to specify a read-only view
+  of a disk-backed vector.
+
+## Bug fixes
+- `ir_eval::print_stats` now takes a `num_docs` parameter to properly
+  display evaluation metrics at a certain cutoff point, which was always 5
+  beforehand. This fixes a bug in `query-runner` where the stats were not
+  being computed according to the cutoff point specified in the
+  configuration.
+- `ir_eval::avg_p` now correctly stops computing after `num_docs`. Before,
+  if you specified `num_docs` as a smaller value than the size of the
+  result list, it would erroneously keep calculating until the end of the
+  result list instead of stopping after `num_docs` elements.
+- `{inverted,forward}_index` can now be loaded from read-only filesystems.
+
+# [v3.0.0][3.0.0]
+## New features
+- Add an `embedding_analyzer` that represents documents with their averaged word
+  vectors.
+- Add a `parallel::reduction` algorithm designed for parallelizing complex
+  accumulation operations (like an E step in an EM algorithm)
+- Parallelize feature counting in feature selector using the new
+  `parallel::reduction`
+- Add a `parallel::for_each_block` algorithm to run functions on
+  (relatively) equal sub-ranges of an iterator range in parallel
+- Add a parallel merge sort as `parallel::sort`
+- Add a `util/traits.h` header for general useful traits
+- Add a Markov model implementation in `sequence::markov_model`
+- Add a generic unsupervised HMM implementation. This implementation
+  supports HMMs with discrete observations (what is used most often) and
+  sequence observations (useful for log mining applications). The
+  forward-backward algorithm is implemented using both the scaling method
+  and the log-space method. The scaling method is used by default, but the
+  log-space method is useful for HMMs with sequence observations to avoid
+  underflow issues when the output probabilities themselves are very small.
+- Add the KL-divergence retrieval function using pseudo-relevance feedback
+  with the two-component mixture-model approach of Zhai and Lafferty,
+  called `kl_divergence_prf`. This ranker internally can use any
+  `language_model_ranker` subclass like `dirichlet_prior` or
+  `jelinek_mercer` to perform the ranking of the feedback set and the
+  result documents with respect to the modified query.
+
+  The EM algorithm used for the two-component mixture model is provided as
+  the `index::feedback::unigram_mixture` free function and returns the
+  feedback model.
+- Add the Rocchio algorithm (`rocchio`) for pseudo-relevance feedback in
+  the vector space model.
+- **Breaking Change.** To facilitate the above two changes, we have also
+  broken the `ranker` hierarchy into one more level. At the top we have
+  `ranker`, which has a pure virtual function `rank()` that can be
+  overridden to provide entirely custom ranking behavior. This is the class
+  the KL-divergence and Rocchio methods derive from, as we need to
+  re-define what it means to rank documents (first retrieving a feedback
+  set, then ranking documents with respect to an updated query).
+
+  Most of the time, however, you will want to derive from the second level
+  `ranking_function`, which is what was called `ranker` before. This class
+  provides a definition of `rank()` to perform document-at-a-time ranking,
+  and expects deriving classes to instead provide `initial_score()` and
+  `score_one()` implementations to define the scoring function used for
+  each document. **Existing code that derived from `ranker` prior to this
+  version of MeTA likely needs to be changed to instead derive from
+  `ranking_function`.**
+- Add the `util::transform_iterator` class and `util::make_transform_iterator`
+  function for providing iterators that transform their output according to
+  a unary function.
+- **Breaking Change.** `whitespace_tokenizer` now emits *only* word tokens
+  by default, suppressing all whitespace tokens. The old default was to
+  emit tokens containing whitespace in addition to actual word tokens. The
+  old behavior can be obtained by passing `false` to its constructor, or
+  setting `suppress-whitespace = false` in its configuration group in
+  `config.toml`. (Note that whitespace tokens are still needed if using a
+  `sentence_boundary` filter but, in nearly all circumstances,
+  `icu_tokenizer` should be preferred.)
+- **Breaking Change.** Co-occurrence counting for embeddings now uses
+  history that crosses sentence boundaries by default. The old behavior
+  (clearing the history when starting a new sentence) can be obtained by
+  ensuring that a tokenizer is being used that emits sentence boundary tags
+  and by setting `break-on-tags = true` in the `[embeddings]` table of
+  `config.toml`.
+- **Breaking Change.** All references in the embeddings library to "coocur"
+  have been changed to "cooccur". This means that some files and binaries
+  have been renamed. Much of the co-occurrence counting part of the
+  embeddings library has also been moved to the public API.
+- Co-occurrence counting now is performed in parallel. Behavior of its
+  merge strategy can be configured with the new `[embeddings]` config
+  parameter `merge-fanout = n`, which specifies the maximum number of
+  on-disk chunks to allow before kicking off a multi-way merge (default 8).
+
+## Enhancements
+- Add additional `packed_write` and `packed_read` overloads: for
+  `std::pair`, `stats::dirichlet`, `stats::multinomial`,
+  `util::dense_matrix`, and `util::sparse_vector`
+- Additional functions have been added to `ranker_factory` to allow
+  construction/loading of `language_model_ranker` subclasses (useful for the
+  `kl_divergence_prf` implementation)
+- Add a `util::make_fixed_heap` helper function to simplify the declaration
+  of `util::fixed_heap` classes with lambda function comparators.
+- Add regression tests for rankers MAP and NDCG scores. This adds a new
+  dataset `cranfield` that contains non-binary relevance judgments to
+  facilitate these new tests.
+- Bump bundled version of ICU to 58.2.
+
+## Bug Fixes
+- Fix bug in NDCG calculation (ideal-DCG was computed using the wrong
+  sorting order for non-binary judgments)
+- Fix bug where the final chunks to be merged in index creation were not
+  being deleted when merging completed
+- Fix bug where GloVe training would allocate the embedding matrix before
+  starting the shuffling process, causing it to exceed the "max-ram"
+  config parameter.
+- Fix bug with consuming MeTA from a build directory with `cmake` when
+  building a static ICU library. `meta-utf` is now forced to be a shared
+  library, which (1) should save on binary sizes and (2) ensures that the
+  statically built ICU is linked into the `libmeta-utf.so` library to avoid
+  undefined references to ICU functions.
+- Fix bug with consuming Release-mode MeTA libraries from another project
+  being built in Debug mode. Before, `identifiers.h` would change behavior
+  based on the `NDEBUG` macro's setting. This behavior has been removed,
+  and opaque identifiers are always on.
+
+## Deprecation
+- `disk_index::doc_name` and `disk_index::doc_path` have been deprecated in
+  favor of the more general (and less confusing) `metadata()`. They will be
+  removed in a future major release.
+- Support for 32-bit architectures is provided on a best-effort basis. MeTA
+  makes heavy use of memory mapping, which is best paired with a 64-bit
+  address space. Please move to a 64-bit platform for using MeTA if at all
+  possible (most consumer machines should support 64-bit if they were made
+  in the last 5 years or so).
+
+# [v2.4.2][2.4.2]
+## Bug Fixes
+- Properly shuffle documents when doing an even-split classification test
+- Make forward indexer listen to `indexer-num-threads` config option.
+- Use correct number of threads when deciding block sizes for
+    `parallel_for`
+- Add workaround to `filesystem::remove_all` for Windows systems to avoid
+    spurious failures caused by virus scanners keeping files open after we
+    deleted them
+- Fix invalid memory access in `gzstreambuf::underflow`
+
+# [v2.4.1][2.4.1]
+## Bug fixes
+- Eliminate excess warnings on Darwin about double preprocessor definitions
+- Fix issue finding `config.h` when used as a sub-project via
+    add_subdirectory()
+
+# [v2.4.0][2.4.0]
+## New features
+- Add a minimal perfect hashing implementation for `language_model`, and unify
+  the querying interface with the existing language model.
+- Add a CMake `install()` command to install MeTA as a library (issue #143). For
+  example, once the library is installed, users can do:
+
+    ```
+    find_package(MeTA 2.4 REQUIRED)
+
+    add_executable(my-program src/my_program.cpp)
+    target_link_libraries(my-program meta-index) # or whatever other libs you
+    # need from MeTA
+    ```
+- Feature selection functionality added to `multiclass_dataset` and
+  `binary_dataset` and views (issues #111, #149 and PR #150 thanks to @siddshuk).
+
+  ```cpp
+    auto selector = features::make_selector(*config, training_vw);
+    uint64_t total_features_selected = 20;
+    selector->select(total_features_selected);
+    auto filtered_dset = features::filter_dataset(dset, *selector);
+  ```
+- Users can now, similar to `hash_append`, declare standalone functions in the
+  same scope as their type called `packed_read` and `packed_write` which will be
+  called by `io::packed::read` and `io::packed::write`, respectively, via
+  argument-dependent lookup.
+
+## Bug fixes
+- Fix edge-case bug in the succinct data structures
+- Fix off-by-one error in `lm::diff`
+
+## Enhancements
+- Added functionality to the `meta::hashing` library: `hash_append` overload for
+  `std::vector`, manually-seeded hash function
+- Further isolate ICU in MeTA to allow CMake to `install()`
+- Updates to EWS (UIUC) build guide
+- Add `std::vector` operations to `io::packed`
+- Consolidated all variants of chunk iterators into one template
+- Add MeTA's citation to the README!
+
 # [v2.3.0][2.3.0]
 ## New features
 - Forward and inverted indexes are now stored in one directory. **To make
@@ -435,7 +637,13 @@
 # [v1.0][1.0]
 - Initial release.
 
-[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.3.0...develop
+[unreleased]: https://github.com/meta-toolkit/meta/compare/v3.0.2...develop
+[3.0.2]: https://github.com/meta-toolkit/meta/compare/v3.0.1...v3.0.2
+[3.0.1]: https://github.com/meta-toolkit/meta/compare/v3.0.0...v3.0.1
+[3.0.0]: https://github.com/meta-toolkit/meta/compare/v2.4.2...v3.0.0
+[2.4.2]: https://github.com/meta-toolkit/meta/compare/v2.4.1...v2.4.2
+[2.4.1]: https://github.com/meta-toolkit/meta/compare/v2.4.0...v2.4.1
+[2.4.0]: https://github.com/meta-toolkit/meta/compare/v2.3.0...v2.4.0
 [2.3.0]: https://github.com/meta-toolkit/meta/compare/v2.2.0...v2.3.0
 [2.2.0]: https://github.com/meta-toolkit/meta/compare/v2.1.0...v2.2.0
 [2.1.0]: https://github.com/meta-toolkit/meta/compare/v2.0.1...v2.1.0