Merge branch 'develop' for MeTA v3.0.1

meta-toolkit · Mar 13, 2017 · bfdb910 · bfdb910
2 parents 2297108 + e4a2224
commit bfdb910
Show file tree

Hide file tree

Showing 23 changed files with 512 additions and 89 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -96,19 +96,19 @@ matrix:
             - *default-packages
             - clang-3.8
 
-    # OS X 10.10 + Xcode 7.1.1
+    # OS X 10.10 + Xcode 6.4 (this is the only 10.10 image on Travis)
     - os: osx
-      osx_image: xcode7.1
+      osx_image: xcode6.4
       env: COMPILER=clang
 
     # OS X 10.11 + Xcode 7.3
     - os: osx
       osx_image: xcode7.3
       env: COMPILER=clang
 
-    # OS X 10.11 + Xcode 8
+    # OS X 10.12 + Xcode 8.2
     - os: osx
-      osx_image: xcode8
+      osx_image: xcode8.2
       env: COMPILER=clang
 
     # OS X/GCC 6

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,22 @@
+# [v3.0.1][3.0.1]
+## New features
+- Add an optional `xz{i,o}fstream` to `meta::io` if compiled with liblzma
+  available.
+- `util::disk_vector<const T>` can now be used to specify a read-only view
+  of a disk-backed vector.
+
+## Bug fixes
+- `ir_eval::print_stats` now takes a `num_docs` parameter to properly
+  display evaluation metrics at a certain cutoff point, which was always 5
+  beforehand. This fixes a bug in `query-runner` where the stats were not
+  being computed according to the cutoff point specified in the
+  configuration.
+- `ir_eval::avg_p` now correctly stops computing after `num_docs`. Before,
+  if you specified `num_docs` as a smaller value than the size of the
+  result list, it would erroneously keep calculating until the end of the
+  result list instead of stopping after `num_docs` elements.
+- `{inverted,forward}_index` can now be loaded from read-only filesystems.
+
 # [v3.0.0][3.0.0]
 ## New features
 - Add an `embedding_analyzer` that represents documents with their averaged word
@@ -609,7 +628,8 @@
 # [v1.0][1.0]
 - Initial release.
 
-[unreleased]: https://github.com/meta-toolkit/meta/compare/v3.0.0...develop
+[unreleased]: https://github.com/meta-toolkit/meta/compare/v3.0.1...develop
+[3.0.1]: https://github.com/meta-toolkit/meta/compare/v3.0.0...v3.0.1
 [3.0.0]: https://github.com/meta-toolkit/meta/compare/v2.4.2...v3.0.0
 [2.4.2]: https://github.com/meta-toolkit/meta/compare/v2.4.1...v2.4.2
 [2.4.1]: https://github.com/meta-toolkit/meta/compare/v2.4.0...v2.4.1

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -9,7 +9,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
 
 set(MeTA_VERSION_MAJOR 3)
 set(MeTA_VERSION_MINOR 0)
-set(MeTA_VERSION_PATCH 0)
+set(MeTA_VERSION_PATCH 1)
 set(MeTA_VERSION
     "${MeTA_VERSION_MAJOR}.${MeTA_VERSION_MINOR}.${MeTA_VERSION_PATCH}")
 
@@ -29,6 +29,11 @@ include(deps/meta-cmake/CompilerKludges.cmake)
 
 find_package(Threads REQUIRED)
 find_package(ZLIB REQUIRED)
+find_package(LibLZMA)
+
+if (LIBLZMA_FOUND AND LIBLZMA_HAS_EASY_ENCODER)
+  set(META_HAS_LIBLZMA ON)
+endif()
 
 cmake_push_check_state()
 
@@ -68,6 +73,10 @@ endif()
 
 target_include_directories(meta-definitions SYSTEM INTERFACE ${ZLIB_INCLUDE_DIRS})
 
+if (META_HAS_LIBLZMA)
+  target_compile_definitions(meta-definitions INTERFACE -DMETA_HAS_LIBLZMA=1)
+endif()
+
 if (LIBDL_LIBRARY)
   target_link_libraries(meta-definitions INTERFACE ${LIBDL_LIBRARY})
 endif()

diff --git a/RELEASING.md b/RELEASING.md
@@ -12,27 +12,29 @@ follow a consistent releasing process.
    changes (like enhancements) increment the Minor release number. Patch
    versions should be released only for bug fixes.
 
-2. Ensure `CHANGELOG.md` is up to date.
+2. Update the version number in `CMakeLists.txt`.
+
+3. Ensure `CHANGELOG.md` is up to date.
 
    If there are *any* breaking changes, mention these explicitly. If there
    are migration strategies to work around these breaking changes, provide
    a brief explanation (or a link to explain them).
 
-3. If there are major *or* minor API changes, ensure that the documentation
+4. If there are major *or* minor API changes, ensure that the documentation
    on the website (meta-toolkit/meta-toolkit.org) is correct.
 
    Update Doxygen as necessary.
 
-4. Ensure that the build is passing on both Travis (Linux + OS X) and
+5. Ensure that the build is passing on both Travis (Linux + OS X) and
    Appveyor (Windows/MinGW-w64).
 
-5. Merge branch `develop` into `master` with a commit message
+6. Merge branch `develop` into `master` with a commit message
 
    > Merge branch 'develop' for MeTA vX.Y.Z
 
    Use `git merge develop --no-ff` to create a merge commit.
 
-6. Tag the merge commit. The tag should be both annotated *and* signed:
+7. Tag the merge commit. The tag should be both annotated *and* signed:
 
    ```
    git tag -as vX.Y.Z
@@ -42,17 +44,17 @@ follow a consistent releasing process.
    version. Remove unnecessary markdown syntax like header markers and code
    blocks. Backticks can stay.
 
-7. Push the merge and the tags to GitHub:
+8. Push the merge and the tags to GitHub:
 
    ```
    git push --follow-tags
    ```
 
-8. Create a release on GitHub using the new tag. Its title should be "MeTA
+9. Create a release on GitHub using the new tag. Its title should be "MeTA
    vX.Y.Z".
 
    The contents of the message should be exactly the same as the CHANGELOG
    entry for that release.
 
-9. Upload the model files and include a section in the GitHub release notes
-   containing their sha256 sums.
+10. Upload the model files and include a section in the GitHub release notes
+    containing their sha256 sums.
diff --git a/include/meta/index/disk_index_impl.h b/include/meta/index/disk_index_impl.h
@@ -66,7 +66,7 @@ class disk_index::disk_index_impl
      * Loads the doc labels.
      * @param num_docs The number of documents stored in the index
      */
-    void load_labels(uint64_t num_docs = 0);
+    void load_labels();
 
     /**
      * Loads the term_id mapping.
@@ -83,13 +83,6 @@ class disk_index::disk_index_impl
      */
     void save_label_id_mapping();
 
-    /**
-     * Sets the label for a document.
-     * @param id The document id
-     * @param label The new label
-     */
-    void set_label(doc_id id, const class_label& label);
-
     /**
      * @return the total number of unique terms in the index.
      */
@@ -106,22 +99,22 @@ class disk_index::disk_index_impl
      */
     std::vector<class_label> class_labels() const;
 
-  private:
     /**
      * @param lbl the string class label to find the id for
      * @return the label_id of a class_label, creating a new one if
      * necessary
      */
     label_id get_label_id(const class_label& lbl);
 
+  private:
     /// the location of this index
     std::string index_name_;
 
     /**
      * Maps which class a document belongs to (if any).
      * Each index corresponds to a doc_id (uint64_t).
      */
-    util::optional<util::disk_vector<label_id>> labels_;
+    util::optional<util::disk_vector<const label_id>> labels_;
 
     /// Stores additional metadata for each document
     util::optional<metadata_file> metadata_;

diff --git a/include/meta/index/eval/ir_eval.h b/include/meta/index/eval/ir_eval.h
@@ -111,9 +111,11 @@ class ir_eval
      * @param results The ranked list of results
      * @param q_id The query that was run to produce these results
      * @param out The stream to print to
+     * @param num_docs The rank cutoff (the "at k" value) for each measurement
      */
     void print_stats(const result_type& results, query_id q_id,
-                     std::ostream& out = std::cout);
+                     std::ostream& out = std::cout,
+                     uint64_t num_docs = std::numeric_limits<uint64_t>::max());
 
     /**
      * Clears saved scores for MAP and gMAP.

diff --git a/include/meta/index/metadata_file.h b/include/meta/index/metadata_file.h
@@ -76,7 +76,7 @@ class metadata_file
     corpus::metadata::schema_type schema_;
 
     /// the seek positions for every document in this file
-    util::disk_vector<uint64_t> index_;
+    util::disk_vector<const uint64_t> index_;
 
     /// the mapped file for reading metadata from
     io::mmap_file md_db_;

diff --git a/include/meta/index/postings_file.h b/include/meta/index/postings_file.h
@@ -82,7 +82,7 @@ class postings_file
 
   private:
     io::mmap_file postings_;
-    util::disk_vector<uint64_t> byte_locations_;
+    util::disk_vector<const uint64_t> byte_locations_;
 };
 }
 }

diff --git a/include/meta/index/vocabulary_map.h b/include/meta/index/vocabulary_map.h
@@ -38,7 +38,7 @@ class vocabulary_map
      * Byte positions for each term in the leaves to allow for reverse
      * lookup of the string associated with a given id.
      */
-    util::disk_vector<uint64_t> inverse_;
+    util::disk_vector<const uint64_t> inverse_;
 
     /**
      * The size of the nodes in the tree.

diff --git a/include/meta/io/xzstream.h b/include/meta/io/xzstream.h
@@ -0,0 +1,104 @@
+/**
+ * @file xzstream.h
+ * @author Chase Geigle
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the files LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#ifndef META_UTIL_XZSTREAM_H_
+#define META_UTIL_XZSTREAM_H_
+
+#include <lzma.h>
+
+#include <cstdio>
+#include <istream>
+#include <ostream>
+#include <stdexcept>
+#include <streambuf>
+#include <vector>
+
+#include "meta/config.h"
+
+namespace meta
+{
+namespace io
+{
+
+class xz_exception : public std::runtime_error
+{
+  public:
+    xz_exception(const std::string& msg, lzma_ret code)
+        : std::runtime_error{msg}, code_{code}
+    {
+        // nothing
+    }
+
+    explicit operator lzma_ret() const
+    {
+        return code_;
+    }
+
+  private:
+    lzma_ret code_;
+};
+
+class xzstreambuf : public std::streambuf
+{
+  public:
+    xzstreambuf(const char* filename, const char* openmode,
+                std::size_t buffer_size = 128 * 1024);
+
+    ~xzstreambuf();
+
+    int_type underflow() override;
+
+    int_type overflow(int_type ch) override;
+
+    int sync() override;
+
+    bool is_open() const;
+
+    uint64_t bytes_read() const;
+
+  private:
+    bool reading_;
+    std::vector<char> in_buffer_;
+    std::vector<char> out_buffer_;
+    FILE* file_;
+    uint64_t bytes_read_;
+    lzma_stream stream_;
+    lzma_action action_;
+};
+
+class xzifstream : public std::istream
+{
+  public:
+    explicit xzifstream(std::string name);
+
+    xzstreambuf* rdbuf() const;
+
+    void flush();
+
+    uint64_t bytes_read() const;
+
+  private:
+    xzstreambuf buffer_;
+};
+
+class xzofstream : public std::ostream
+{
+  public:
+    explicit xzofstream(std::string name);
+
+    xzstreambuf* rdbuf() const;
+
+    void flush();
+
+  private:
+    xzstreambuf buffer_;
+};
+}
+}
+#endif
diff --git a/include/meta/parser/trees/parse_tree.h b/include/meta/parser/trees/parse_tree.h
@@ -22,12 +22,6 @@ namespace parser
  * Represents the parse tree for a sentence. This may either be a sentence
  * parsed from training data, or the output from a trained parser on test
  * data.
- *
- * @todo determine what parts of analyzers::parse_tree are worth
- * keeping---that class deals specifically with trees read from the output
- * of the Stanford parser. When we have our own, we may still want some of
- * that functionality to allow people to use parsers that are not our
- * own?
  */
 class parse_tree
 {

diff --git a/include/meta/util/disk_vector.h b/include/meta/util/disk_vector.h
@@ -70,6 +70,8 @@ class disk_vector
      * @return a reference to the element at position idx in the vector
      * container
      */
+    template <class U = T,
+              class = typename std::enable_if<!std::is_const<U>::value>::type>
     T& operator[](uint64_t idx);
 
     /**
@@ -88,6 +90,8 @@ class disk_vector
     * (i.e., if idx is greater than or equal to its size). This is in
     * contrast with member operator[], which does not check bounds.
      */
+    template <class U = T,
+              class = typename std::enable_if<!std::is_const<U>::value>::type>
     T& at(uint64_t idx);
 
     /**
@@ -112,6 +116,8 @@ class disk_vector
     /**
      * @return an iterator to the beginning of this container
      */
+    template <class U = T,
+              class = typename std::enable_if<!std::is_const<U>::value>::type>
     iterator begin();
 
     /**
@@ -123,6 +129,8 @@ class disk_vector
     /**
      * @return an iterator to the end of this container
      */
+    template <class U = T,
+              class = typename std::enable_if<!std::is_const<U>::value>::type>
     iterator end();
 
     /**