diff --git a/.travis.yml b/.travis.yml
index 7dbec261b..b99e99d6f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -96,9 +96,9 @@ matrix:
             - *default-packages
             - clang-3.8

-    # OS X 10.10 + Xcode 7.1.1
+    # OS X 10.10 + Xcode 6.4 (this is the only 10.10 image on Travis)
     - os: osx
-      osx_image: xcode7.1
+      osx_image: xcode6.4
       env: COMPILER=clang

     # OS X 10.11 + Xcode 7.3
@@ -106,9 +106,9 @@ matrix:
       osx_image: xcode7.3
       env: COMPILER=clang

-    # OS X 10.11 + Xcode 8
+    # OS X 10.12 + Xcode 8.2
     - os: osx
-      osx_image: xcode8
+      osx_image: xcode8.2
       env: COMPILER=clang

     # OS X/GCC 6
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 15a5eec81..c411600c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,22 @@
+# [v3.0.1][3.0.1]
+## New features
+- Add an optional `xz{i,o}fstream` to `meta::io` if compiled with liblzma
+  available.
+- `util::disk_vector<const T>` can now be used to specify a read-only view
+  of a disk-backed vector.
+
+## Bug fixes
+- `ir_eval::print_stats` now takes a `num_docs` parameter to properly
+  display evaluation metrics at a certain cutoff point, which was always 5
+  beforehand. This fixes a bug in `query-runner` where the stats were not
+  being computed according to the cutoff point specified in the
+  configuration.
+- `ir_eval::avg_p` now correctly stops computing after `num_docs`. Before,
+  if you specified `num_docs` as a smaller value than the size of the
+  result list, it would erroneously keep calculating until the end of the
+  result list instead of stopping after `num_docs` elements.
+- `{inverted,forward}_index` can now be loaded from read-only filesystems.
+
 # [v3.0.0][3.0.0]
 ## New features
 - Add an `embedding_analyzer` that represents documents with their averaged word
@@ -609,7 +628,8 @@
 # [v1.0][1.0]
 - Initial release.

-[unreleased]: https://github.com/meta-toolkit/meta/compare/v3.0.0...develop
+[unreleased]: https://github.com/meta-toolkit/meta/compare/v3.0.1...develop
+[3.0.1]: https://github.com/meta-toolkit/meta/compare/v3.0.0...v3.0.1
 [3.0.0]: https://github.com/meta-toolkit/meta/compare/v2.4.2...v3.0.0
 [2.4.2]: https://github.com/meta-toolkit/meta/compare/v2.4.1...v2.4.2
 [2.4.1]: https://github.com/meta-toolkit/meta/compare/v2.4.0...v2.4.1
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f72ab0e81..69175d035 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS 1)

 set(MeTA_VERSION_MAJOR 3)
 set(MeTA_VERSION_MINOR 0)
-set(MeTA_VERSION_PATCH 0)
+set(MeTA_VERSION_PATCH 1)
 set(MeTA_VERSION
     "${MeTA_VERSION_MAJOR}.${MeTA_VERSION_MINOR}.${MeTA_VERSION_PATCH}")

@@ -29,6 +29,11 @@ include(deps/meta-cmake/CompilerKludges.cmake)

 find_package(Threads REQUIRED)
 find_package(ZLIB REQUIRED)
+find_package(LibLZMA)
+
+if (LIBLZMA_FOUND AND LIBLZMA_HAS_EASY_ENCODER)
+  set(META_HAS_LIBLZMA ON)
+endif()

 cmake_push_check_state()

@@ -68,6 +73,10 @@ endif()
 target_include_directories(meta-definitions SYSTEM INTERFACE
                            ${ZLIB_INCLUDE_DIRS})

+if (META_HAS_LIBLZMA)
+  target_compile_definitions(meta-definitions INTERFACE -DMETA_HAS_LIBLZMA=1)
+endif()
+
 if (LIBDL_LIBRARY)
   target_link_libraries(meta-definitions INTERFACE ${LIBDL_LIBRARY})
 endif()
diff --git a/RELEASING.md b/RELEASING.md
index 6bdab06b2..8b1133f99 100644
--- a/RELEASING.md
+++ b/RELEASING.md
@@ -12,27 +12,29 @@ follow a consistent releasing process.
    changes (like enhancements) increment the Minor release number. Patch
    versions should be released only for bug fixes.

-2. Ensure `CHANGELOG.md` is up to date.
+2. Update the version number in `CMakeLists.txt`.
+
+3. Ensure `CHANGELOG.md` is up to date.

    If there are *any* breaking changes, mention these explicitly. If
    there are migration strategies to work around these breaking changes,
    provide a brief explanation (or a link to explain them).

-3. If there are major *or* minor API changes, ensure that the documentation
+4. If there are major *or* minor API changes, ensure that the documentation
    on the website (meta-toolkit/meta-toolkit.org) is correct. Update
    Doxygen as necessary.

-4. Ensure that the build is passing on both Travis (Linux + OS X) and
+5. Ensure that the build is passing on both Travis (Linux + OS X) and
    Appveyor (Windows/MinGW-w64).

-5. Merge branch `develop` into `master` with a commit message
+6. Merge branch `develop` into `master` with a commit message

    > Merge branch 'develop' for MeTA vX.Y.Z

    Use `git merge develop --no-ff` to create a merge commit.

-6. Tag the merge commit. The tag should be both annotated *and* signed:
+7. Tag the merge commit. The tag should be both annotated *and* signed:

    ```
    git tag -as vX.Y.Z
@@ -42,17 +44,17 @@ follow a consistent releasing process.
    version. Remove unnecessary markdown syntax like header markers and
    code blocks. Backticks can stay.

-7. Push the merge and the tags to GitHub:
+8. Push the merge and the tags to GitHub:

    ```
    git push --follow-tags
    ```

-8. Create a release on GitHub using the new tag. Its title should be "MeTA
+9. Create a release on GitHub using the new tag. Its title should be "MeTA
    vX.Y.Z". The contents of the message should be exactly the same as the
    CHANGELOG entry for that release.

-9. Upload the model files and include a section in the GitHub release notes
-   containing their sha256 sums.
+10. Upload the model files and include a section in the GitHub release notes
+    containing their sha256 sums.
diff --git a/include/meta/index/disk_index_impl.h b/include/meta/index/disk_index_impl.h
index 3d37e67d5..5ecabec41 100644
--- a/include/meta/index/disk_index_impl.h
+++ b/include/meta/index/disk_index_impl.h
@@ -66,7 +66,7 @@ class disk_index::disk_index_impl
      * Loads the doc labels.
      * @param num_docs The number of documents stored in the index
      */
-    void load_labels(uint64_t num_docs = 0);
+    void load_labels();

     /**
      * Loads the term_id mapping.
@@ -83,13 +83,6 @@ class disk_index::disk_index_impl
      */
     void save_label_id_mapping();

-    /**
-     * Sets the label for a document.
-     * @param id The document id
-     * @param label The new label
-     */
-    void set_label(doc_id id, const class_label& label);
-
     /**
      * @return the total number of unique terms in the index.
      */
@@ -106,7 +99,6 @@ class disk_index::disk_index_impl
      */
     std::vector<class_label> class_labels() const;

-  private:
     /**
      * @param lbl the string class label to find the id for
      * @return the label_id of a class_label, creating a new one if
@@ -114,6 +106,7 @@ class disk_index::disk_index_impl
      */
     label_id get_label_id(const class_label& lbl);

+  private:
     /// the location of this index
     std::string index_name_;

@@ -121,7 +114,7 @@ class disk_index::disk_index_impl
      * Maps which class a document belongs to (if any).
      * Each index corresponds to a doc_id (uint64_t).
      */
-    util::optional<util::disk_vector<label_id>> labels_;
+    util::optional<util::disk_vector<const label_id>> labels_;

     /// Stores additional metadata for each document
     util::optional<metadata_file> metadata_;
diff --git a/include/meta/index/eval/ir_eval.h b/include/meta/index/eval/ir_eval.h
index 32b9969ed..594168489 100644
--- a/include/meta/index/eval/ir_eval.h
+++ b/include/meta/index/eval/ir_eval.h
@@ -111,9 +111,11 @@ class ir_eval
      * @param results The ranked list of results
      * @param q_id The query that was run to produce these results
      * @param out The stream to print to
+     * @param num_docs the @k parameters for each measurement
      */
     void print_stats(const result_type& results, query_id q_id,
-                     std::ostream& out = std::cout);
+                     std::ostream& out = std::cout,
+                     uint64_t num_docs = std::numeric_limits<uint64_t>::max());

     /**
      * Clears saved scores for MAP and gMAP.
diff --git a/include/meta/index/metadata_file.h b/include/meta/index/metadata_file.h
index 4bfb93e69..5524e263f 100644
--- a/include/meta/index/metadata_file.h
+++ b/include/meta/index/metadata_file.h
@@ -76,7 +76,7 @@ class metadata_file
     corpus::metadata::schema_type schema_;

     /// the seek positions for every document in this file
-    util::disk_vector<uint64_t> index_;
+    util::disk_vector<const uint64_t> index_;

     /// the mapped file for reading metadata from
     io::mmap_file md_db_;
diff --git a/include/meta/index/postings_file.h b/include/meta/index/postings_file.h
index 1dac85acd..47d682195 100644
--- a/include/meta/index/postings_file.h
+++ b/include/meta/index/postings_file.h
@@ -82,7 +82,7 @@ class postings_file

   private:
     io::mmap_file postings_;
-    util::disk_vector<uint64_t> byte_locations_;
+    util::disk_vector<const uint64_t> byte_locations_;
 };
 }
 }
diff --git a/include/meta/index/vocabulary_map.h b/include/meta/index/vocabulary_map.h
index e46d8c063..120f22ddf 100644
--- a/include/meta/index/vocabulary_map.h
+++ b/include/meta/index/vocabulary_map.h
@@ -38,7 +38,7 @@ class vocabulary_map
      * Byte positions for each term in the leaves to allow for reverse
      * lookup of a the string associated with a given id.
      */
-    util::disk_vector<uint64_t> inverse_;
+    util::disk_vector<const uint64_t> inverse_;

     /**
      * The size of the nodes in the tree.
diff --git a/include/meta/io/xzstream.h b/include/meta/io/xzstream.h
new file mode 100644
index 000000000..dc2c43c5c
--- /dev/null
+++ b/include/meta/io/xzstream.h
@@ -0,0 +1,104 @@
+/**
+ * @file xzstream.h
+ * @author Chase Geigle
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#ifndef META_UTIL_XZSTREAM_H_
+#define META_UTIL_XZSTREAM_H_
+
+#include <lzma.h>
+
+#include <cstdio>
+#include <istream>
+#include <ostream>
+#include <stdexcept>
+#include <streambuf>
+#include <vector>
+
+#include "meta/config.h"
+
+namespace meta
+{
+namespace io
+{
+
+class xz_exception : public std::runtime_error
+{
+  public:
+    xz_exception(const std::string& msg, lzma_ret code)
+        : std::runtime_error{msg}, code_{code}
+    {
+        // nothing
+    }
+
+    explicit operator lzma_ret() const
+    {
+        return code_;
+    }
+
+  private:
+    lzma_ret code_;
+};
+
+class xzstreambuf : public std::streambuf
+{
+  public:
+    xzstreambuf(const char* filename, const char* openmode,
+                std::size_t buffer_size = 128 * 1024);
+
+    ~xzstreambuf();
+
+    int_type underflow() override;
+
+    int_type overflow(int_type ch) override;
+
+    int sync() override;
+
+    bool is_open() const;
+
+    uint64_t bytes_read() const;
+
+  private:
+    bool reading_;
+    std::vector<char> in_buffer_;
+    std::vector<char> out_buffer_;
+    FILE* file_;
+    uint64_t bytes_read_;
+    lzma_stream stream_;
+    lzma_action action_;
+};
+
+class xzifstream : public std::istream
+{
+  public:
+    explicit xzifstream(std::string name);
+
+    xzstreambuf* rdbuf() const;
+
+    void flush();
+
+    uint64_t bytes_read() const;
+
+  private:
+    xzstreambuf buffer_;
+};
+
+class xzofstream : public std::ostream
+{
+  public:
+    explicit xzofstream(std::string name);
+
+    xzstreambuf* rdbuf() const;
+
+    void flush();
+
+  private:
+    xzstreambuf buffer_;
+};
+}
+}
+#endif
diff --git a/include/meta/parser/trees/parse_tree.h b/include/meta/parser/trees/parse_tree.h
index 1d2c81766..5413858c4 100644
--- a/include/meta/parser/trees/parse_tree.h
+++ b/include/meta/parser/trees/parse_tree.h
@@ -22,12 +22,6 @@ namespace parser
  * Represents the parse tree for a sentence. This may either be a sentence
  * parsed from training data, or the output from a trained parser on test
  * data.
- *
- * @todo determine what parts of analyzers::parse_tree are worth
- * keeping---that class deals specifically with trees read from the output
- * of the Stanford parser. When we have our own, we may still want some of
- * that functionality to allow people to use parsers that are not our
- * own?
  */
 class parse_tree
 {
diff --git a/include/meta/util/disk_vector.h b/include/meta/util/disk_vector.h
index a9ad16359..795dc4456 100644
--- a/include/meta/util/disk_vector.h
+++ b/include/meta/util/disk_vector.h
@@ -70,6 +70,8 @@ class disk_vector
      * @return a reference to the element at position idx in the vector
      * container
      */
+    template <class U = T,
+              class = typename std::enable_if<!std::is_const<U>::value>::type>
     T& operator[](uint64_t idx);

     /**
@@ -88,6 +90,8 @@ class disk_vector
      * (i.e., if idx is greater or equal than its size). This is in contrast
      * with member operator[], that does not check against bounds.
      */
+    template <class U = T,
+              class = typename std::enable_if<!std::is_const<U>::value>::type>
     T& at(uint64_t idx);

     /**
@@ -112,6 +116,8 @@ class disk_vector
     /**
      * @return an iterator to the beginning of this container
      */
+    template <class U = T,
+              class = typename std::enable_if<!std::is_const<U>::value>::type>
     iterator begin();

     /**
@@ -123,6 +129,8 @@ class disk_vector
     /**
      * @return an iterator to the end of this container
      */
+    template <class U = T,
+              class = typename std::enable_if<!std::is_const<U>::value>::type>
     iterator end();

     /**
diff --git a/include/meta/util/disk_vector.tcc b/include/meta/util/disk_vector.tcc
index 656e01e68..7c62d4ca4 100644
--- a/include/meta/util/disk_vector.tcc
+++ b/include/meta/util/disk_vector.tcc
@@ -3,9 +3,9 @@
  * @author Sean Massung
  */

-#include <fcntl.h>
 #include "meta/io/filesystem.h"
 #include "meta/util/disk_vector.h"
+#include <fcntl.h>

 namespace meta
 {
@@ -16,7 +16,14 @@ template <class T>
 disk_vector<T>::disk_vector(const std::string& path, uint64_t size /* = 0 */)
     : path_{path}, start_{nullptr}, size_{size}, file_desc_{-1}
 {
-    file_desc_ = open(path_.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+    if (std::is_const<T>::value)
+    {
+        file_desc_ = open(path_.c_str(), O_RDONLY);
+    }
+    else
+    {
+        file_desc_ = open(path_.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+    }
     if (file_desc_ < 0)
         throw disk_vector_exception{"error obtaining file descriptor for "
                                     + path_};
@@ -30,6 +37,9 @@ disk_vector<T>::disk_vector(const std::string& path, uint64_t size /* = 0 */)
     // end and writing a byte
     if (actual_size != size_bytes)
     {
+        if (std::is_const<T>::value)
+            throw disk_vector_exception{
+                "cannot create disk vector when opened in read-only mode"};
         auto offset = static_cast<long>(size_bytes - 1);
         if (lseek(file_desc_, offset, SEEK_SET) == -1)
             throw disk_vector_exception{"error lseeking to extend file"};
@@ -45,8 +55,11 @@ disk_vector<T>::disk_vector(const std::string& path, uint64_t size /* = 0 */)
             throw disk_vector_exception{"cannot map empty file " + path};
     }

-    start_ = (T*)mmap(nullptr, sizeof(T) * size_, PROT_READ | PROT_WRITE,
-                      MAP_SHARED, file_desc_, 0);
+    int prot = PROT_READ;
+    if (!std::is_const<T>::value)
+        prot |= PROT_WRITE;
+    start_
+        = (T*)mmap(nullptr, sizeof(T) * size_, prot, MAP_SHARED, file_desc_, 0);

     if (start_ == MAP_FAILED)
         throw disk_vector_exception{"error memory-mapping the file " + path_};
@@ -69,7 +82,8 @@ disk_vector<T>& disk_vector<T>::operator=(disk_vector&& other)
 {
     if (start_)
     {
-        munmap(start_, sizeof(T) * size_);
+        munmap(const_cast<typename std::remove_const<T>::type*>(start_),
+               sizeof(T) * size_);
         close(file_desc_);
     }
     path_ = std::move(other.path_);
@@ -86,11 +100,13 @@ disk_vector<T>::~disk_vector()
 {
     if (!start_)
         return;
-    munmap(start_, sizeof(T) * size_);
+    munmap(const_cast<typename std::remove_const<T>::type*>(start_),
+           sizeof(T) * size_);
     close(file_desc_);
 }

 template <class T>
+template <class, class>
 T& disk_vector<T>::operator[](uint64_t idx)
 {
     return start_[idx];
@@ -103,6 +119,7 @@ const T& disk_vector<T>::operator[](uint64_t idx) const
 }

 template <class T>
+template <class, class>
 T& disk_vector<T>::at(uint64_t idx)
 {
     if (idx >= size_)
@@ -129,6 +146,7 @@ uint64_t disk_vector<T>::size() const
 }

 template <class T>
+template <class, class>
 auto disk_vector<T>::begin() -> iterator
 {
     return start_;
@@ -147,6 +165,7 @@ auto disk_vector<T>::end() const -> const_iterator
 }

 template <class T>
+template <class, class>
 auto disk_vector<T>::end() -> iterator
 {
     return start_ + size_;
diff --git a/src/index/disk_index.cpp b/src/index/disk_index.cpp
index 63b5d3a77..c0dcbfd14 100644
--- a/src/index/disk_index.cpp
+++ b/src/index/disk_index.cpp
@@ -6,12 +6,12 @@
 #include <numeric>
 #include <utility>

+#include "meta/analyzers/analyzer.h"
 #include "meta/index/disk_index.h"
 #include "meta/index/disk_index_impl.h"
 #include "meta/index/string_list.h"
 #include "meta/index/string_list_writer.h"
 #include "meta/index/vocabulary_map.h"
-#include "meta/analyzers/analyzer.h"
 #include "meta/util/disk_vector.h"
 #include "meta/util/mapping.h"
 #include "meta/util/optional.h"
@@ -149,15 +149,10 @@ void disk_index::disk_index_impl::initialize_metadata()
     metadata_ = {index_name_};
 }

-void disk_index::disk_index_impl::load_labels(uint64_t num_docs)
+void disk_index::disk_index_impl::load_labels()
 {
-    // clear the current label set; this is so that the disk vector can
-    // flush via munmap() if needed
-    labels_ = util::nullopt;
-
-    // load in the new mapping
-    labels_ = util::disk_vector<label_id>{index_name_ + files[DOC_LABELS],
-                                          num_docs};
+    labels_
+        = util::disk_vector<const label_id>{index_name_ + files[DOC_LABELS]};
 }

 void disk_index::disk_index_impl::load_term_id_mapping()
@@ -175,11 +170,6 @@ void disk_index::disk_index_impl::save_label_id_mapping()
     map::save_mapping(label_ids_, index_name_ + files[LABEL_IDS_MAPPING]);
 }

-void disk_index::disk_index_impl::set_label(doc_id id, const class_label& label)
-{
-    (*labels_)[id] = get_label_id(label);
-}
-
 uint64_t disk_index::disk_index_impl::total_unique_terms() const
 {
     return term_id_mapping_->size();
diff --git a/src/index/eval/ir_eval.cpp b/src/index/eval/ir_eval.cpp
index a93a8c410..5f18a4196 100644
--- a/src/index/eval/ir_eval.cpp
+++ b/src/index/eval/ir_eval.cpp
@@ -184,7 +184,8 @@ double ir_eval::avg_p(const std::vector<search_result>& results, query_id q_id,
         }
         if (num_rel - 1 == total_relevant)
             break;
-        ++i;
+        if (i++ == num_docs)
+            break;
     }

     scores_.push_back(avgp / total_relevant);
@@ -217,22 +218,21 @@ double ir_eval::gmap() const
 }

 void ir_eval::print_stats(const std::vector<search_result>& results,
-                          query_id q_id, std::ostream& out)
+                          query_id q_id, std::ostream& out, uint64_t num_docs)
 {
     auto w1 = std::setw(8);
     auto w2 = std::setw(6);
     int p = 3;
-    uint64_t max = 5;
     out << w1 << printing::make_bold(" NDCG:") << w2 << std::setprecision(p)
-        << ndcg(results, q_id, max);
+        << ndcg(results, q_id, num_docs);
     out << w1 << printing::make_bold(" Avg. P:") << w2 << std::setprecision(p)
-        << avg_p(results, q_id, max);
+        << avg_p(results, q_id, num_docs);
     out << w1 << printing::make_bold(" F1 Score:") << w2
-        << std::setprecision(p) << f1(results, q_id);
+        << std::setprecision(p) << f1(results, q_id, num_docs);
     out << w1 << printing::make_bold(" Precision:") << w2
-        << std::setprecision(p) << precision(results, q_id, max);
+        << std::setprecision(p) << precision(results, q_id, num_docs);
     out << w1 << printing::make_bold(" Recall:") << w2 << std::setprecision(p)
-        << recall(results, q_id, max);
+        << recall(results, q_id, num_docs);
     out << std::endl;
 }
diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp
index 5d7edd041..40e1528df 100644
--- a/src/index/forward_index.cpp
+++ b/src/index/forward_index.cpp
@@ -241,8 +241,6 @@ void forward_index::create_index(const cpptoml::table& config,

     metadata_writer mdata_writer{index_name(), docs.size(), docs.schema()};

-    impl_->load_labels(docs.size());
-
     auto max_threads = std::thread::hardware_concurrency();
     auto num_threads = config.get_as<std::size_t>("indexer-num-threads")
                            .value_or(max_threads);
@@ -261,7 +259,7 @@ void forward_index::create_index(const cpptoml::table& config,
         impl_->save_label_id_mapping();
         fwd_impl_->total_unique_terms_ = impl_->total_unique_terms();

-        // reload the label file to ensure it was flushed
+        // reload the label file
         impl_->load_labels();
     }
 }
@@ -309,6 +307,9 @@ void forward_index::impl::tokenize_docs(corpus::corpus& docs,
     bool exceeded_budget = false;
     std::atomic_size_t chunk_id{0};

+    util::disk_vector<label_id> labels{
+        idx_->index_name() + idx_->impl_->files[DOC_LABELS], docs.size()};
+
     parallel::thread_pool pool{num_threads};
     corpus::parallel_consume(
         docs, pool,
@@ -342,7 +343,7 @@ void forward_index::impl::tokenize_docs(corpus::corpus& docs,
                 });

             mdata_writer.write(doc.id(), length, counts.size(), doc.mdata());
-            idx_->impl_->set_label(doc.id(), doc.label());
+            labels[doc.id()] = idx_->impl_->get_label_id(doc.label());

             forward_index::postings_data_type::count_t pd_counts;
             pd_counts.reserve(counts.size());
@@ -447,10 +448,11 @@ void forward_index::impl::create_libsvm_postings(corpus::corpus& docs)
 {
     auto filename = idx_->index_name() + idx_->impl_->files[POSTINGS];
     auto num_docs = docs.size();
-    idx_->impl_->load_labels(num_docs);

     total_unique_terms_ = 0;
     {
+        util::disk_vector<label_id> labels{
+            idx_->index_name() + idx_->impl_->files[DOC_LABELS], docs.size()};
         postings_file_writer<forward_index::postings_data_type> out{filename,
                                                                     num_docs};
@@ -482,7 +484,7 @@ void forward_index::impl::create_libsvm_postings(corpus::corpus& docs)

             md_writer.write(doc.id(), static_cast<uint64_t>(length),
                             num_unique, doc.mdata());
-            idx_->impl_->set_label(doc.id(), doc.label());
+            labels[doc.id()] = idx_->impl_->get_label_id(doc.label());
         }

         // +1 since we subtracted one from each of the ids in the
@@ -490,7 +492,7 @@ void forward_index::impl::create_libsvm_postings(corpus::corpus& docs)
         ++total_unique_terms_;
     }

-    // reload the label file to ensure it was flushed
+    // load the labels
    idx_->impl_->load_labels();

     LOG(info) << "Created compressed postings file ("
diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp
index 88caeb68f..b5fb0d201 100644
--- a/src/index/inverted_index.cpp
+++ b/src/index/inverted_index.cpp
@@ -137,8 +137,6 @@ void inverted_index::create_index(const cpptoml::table& config,
     postings_inverter<inverted_index> inverter{index_name(), max_writers};
     {
         metadata_writer mdata_writer{index_name(), docs.size(), docs.schema()};
-        uint64_t num_docs = docs.size();
-        impl_->load_labels(num_docs);

         // RAM budget is given in megabytes
         inv_impl_->tokenize_docs(docs, inverter, mdata_writer,
@@ -201,6 +199,8 @@ void inverted_index::impl::tokenize_docs(
     corpus::corpus& docs, postings_inverter<inverted_index>& inverter,
     metadata_writer& mdata_writer, uint64_t ram_budget, std::size_t num_threads)
 {
+    util::disk_vector<label_id> labels{
+        idx_->index_name() + idx_->impl_->files[DOC_LABELS], docs.size()};
     std::mutex io_mutex;
     printing::progress progress{" > Tokenizing Docs: ", docs.size()};
     uint64_t local_budget = ram_budget / num_threads;
@@ -237,7 +237,7 @@ void inverted_index::impl::tokenize_docs(
                 });

             mdata_writer.write(doc.id(), length, counts.size(), doc.mdata());
-            idx_->impl_->set_label(doc.id(), doc.label());
+            labels[doc.id()] = idx_->impl_->get_label_id(doc.label());

             // update chunk
             ls.producer_(doc.id(), counts);
diff --git a/src/index/tools/query_runner.cpp b/src/index/tools/query_runner.cpp
index ec90fc601..ca0dd686f 100644
--- a/src/index/tools/query_runner.cpp
+++ b/src/index/tools/query_runner.cpp
@@ -145,7 +145,8 @@ int main(int argc, char* argv[])
                     break;
                 }
                 if (!trec_format && eval)
-                    eval->print_stats(ranking, query_id{q_id});
+                    eval->print_stats(ranking, query_id{q_id}, std::cout,
+                                      max_results);
                 ++q_id;
             }
         });
diff --git a/src/index/vocabulary_map.cpp b/src/index/vocabulary_map.cpp
index 9219bcd68..1a8b36817 100644
--- a/src/index/vocabulary_map.cpp
+++ b/src/index/vocabulary_map.cpp
@@ -4,6 +4,7 @@
  */

 #include <cstring>
+
 #include "meta/index/vocabulary_map.h"
 #include "meta/util/optional.h"
diff --git a/src/io/CMakeLists.txt b/src/io/CMakeLists.txt
index e36257f50..a44baf267 100644
--- a/src/io/CMakeLists.txt
+++ b/src/io/CMakeLists.txt
@@ -10,6 +10,10 @@ if (WIN32)
     list(APPEND META_IO_SOURCES mman-win32/mman.c)
 endif()

+if (META_HAS_LIBLZMA)
+    list(APPEND META_IO_SOURCES xzstream.cpp)
+endif()
+
 add_library(meta-io ${META_IO_SOURCES})
 target_link_libraries(meta-io meta-util ${ZLIB_LIBRARIES})

@@ -20,6 +24,11 @@ else()
                                ${META_PROJECT_SOURCE_DIR}/deps/meta-stlsoft/include)
 endif()

+if (META_HAS_LIBLZMA)
+    target_include_directories(meta-io PUBLIC ${LIBLZMA_INCLUDE_DIRS})
+    target_link_libraries(meta-io ${LIBLZMA_LIBRARIES})
+endif()
+
 install(TARGETS meta-io
         EXPORT meta-exports
         DESTINATION lib)
diff --git a/src/io/tools/compressor_test.cpp b/src/io/tools/compressor_test.cpp
index d05176c04..3d829cc06 100644
--- a/src/io/tools/compressor_test.cpp
+++ b/src/io/tools/compressor_test.cpp
@@ -3,26 +3,24 @@
 * @author Chase Geigle
 */

+#include "meta/io/gzstream.h"
 #include <array>
-#include <fstream>
 #include <iostream>
-#include "meta/io/gzstream.h"
+#include <fstream>
+#if META_HAS_LIBLZMA
+#include "meta/io/xzstream.h"
+#endif

 using namespace meta;

-int main(int argc, char** argv)
+template <class InputStream, class OutputStream>
+void test_compressor(const std::string& infile, const std::string& outfile)
 {
-    if (argc < 3)
-    {
-        std::cerr << "Usage: " << argv[0] << " input output" << std::endl;
-        return 1;
-    }
-
     std::array<char, 1024> buffer;

     {
-        std::ifstream file{argv[1], std::ios::in | std::ios::binary};
-        io::gzofstream output{argv[2]};
+        std::ifstream file{infile, std::ios::in | std::ios::binary};
+        OutputStream output{outfile};
         while (file)
         {
             file.read(&buffer[0], 1024);
@@ -31,8 +29,8 @@ int main(int argc, char** argv)
     }

     {
-        io::gzifstream input{argv[2]};
-        std::ofstream output{std::string{argv[2]} + ".decompressed",
+        InputStream input{outfile};
+        std::ofstream output{outfile + ".decompressed",
                              std::ios::out | std::ios::binary};

         while (input)
@@ -41,6 +39,22 @@ int main(int argc, char** argv)
             output.write(&buffer[0], input.gcount());
         }
     }
+}
+
+int main(int argc, char** argv)
+{
+    if (argc < 3)
+    {
+        std::cerr << "Usage: " << argv[0] << " input output" << std::endl;
+        return 1;
+    }
+
+    test_compressor<io::gzifstream, io::gzofstream>(argv[1], argv[2]);
+#if META_HAS_LIBLZMA
+    test_compressor<io::xzifstream, io::xzofstream>(
+        argv[1], std::string{argv[2]} + ".xz");
+#endif

     return 0;
 }
diff --git a/src/io/xzstream.cpp b/src/io/xzstream.cpp
new file mode 100644
index 000000000..2ff18c1ee
--- /dev/null
+++ b/src/io/xzstream.cpp
@@ -0,0 +1,256 @@
+/**
+ * @file xzstream.cpp
+ * @author Chase Geigle
+ *
+ * Based heavily upon the examples in the xz repo.
+ * @see
+ * http://git.tukaani.org/?p=xz.git;a=blob;f=doc/examples/01_compress_easy.c
+ * @see
+ * http://git.tukaani.org/?p=xz.git;a=blob;f=doc/examples/02_decompress.c
+ */
+
+#include "meta/io/xzstream.h"
+#include "meta/util/string_view.h"
+
+namespace meta
+{
+namespace io
+{
+
+namespace
+{
+
+void throw_if_error(lzma_ret code, std::string msg)
+{
+    switch (code)
+    {
+        case LZMA_OK:
+        case LZMA_STREAM_END:
+            return;
+        case LZMA_MEM_ERROR:
+            throw xz_exception{msg + ": Memory allocation failed", code};
+        case LZMA_FORMAT_ERROR:
+            throw xz_exception{msg + ": Input not in .xz format", code};
+        case LZMA_OPTIONS_ERROR:
+            throw xz_exception{msg + ": Unsupported compression options", code};
+        case LZMA_DATA_ERROR:
+            throw xz_exception{msg + ": Compressed file is corrupt", code};
+        case LZMA_BUF_ERROR:
+            throw xz_exception{
+                msg + ": Compressed file is truncated or corrupt", code};
+        case LZMA_UNSUPPORTED_CHECK:
+            throw xz_exception{
+                msg + ": Specified integrity check is not supported", code};
+        default:
+            throw xz_exception{msg + ": Unknown error", code};
+    }
+}
+}
+
+xzstreambuf::xzstreambuf(const char* filename, const char* openmode,
+                         std::size_t buffer_size)
+    : in_buffer_(buffer_size),
+      out_buffer_(buffer_size),
+      file_{std::fopen(filename, openmode)},
+      bytes_read_{0}
+{
+
+    stream_ = LZMA_STREAM_INIT;
+    action_ = LZMA_RUN;
+    stream_.next_in = nullptr;
+    stream_.avail_in = 0;
+
+    util::string_view mode{openmode};
+    if (mode == "wb")
+    {
+        reading_ = false;
+        setp(&in_buffer_.front(), &in_buffer_.back());
+        throw_if_error(lzma_easy_encoder(&stream_, 6, LZMA_CHECK_CRC64),
+                       "Failed to initialize encoder");
+    }
+    else if (mode == "rb")
+    {
+        auto end = &out_buffer_.back() + 1;
+        setg(end, end, end);
+        reading_ = true;
+
+        throw_if_error(lzma_stream_decoder(
+                           &stream_, std::numeric_limits<uint64_t>::max(), 0),
+                       "Failed to initialize decoder");
+    }
+    else
+    {
+        throw std::runtime_error{"Unrecognized open mode"};
+    }
+
+    stream_.next_out = reinterpret_cast<uint8_t*>(&out_buffer_[0]);
+    stream_.avail_out = out_buffer_.size();
+}
+
+xzstreambuf::~xzstreambuf()
+{
+    if (!reading_)
+    {
+        action_ = LZMA_FINISH;
+        sync();
+    }
+
+    fclose(file_);
+    lzma_end(&stream_);
+}
+
+auto xzstreambuf::underflow() -> int_type
+{
+    if (gptr() && (gptr() < egptr()))
+        return traits_type::to_int_type(*gptr());
+
+    // keep decompressing until we fill the output buffer, reading input
+    // from the internal file as needed
+    lzma_ret ret;
+    do
+    {
+        if (stream_.avail_in == 0 && !std::feof(file_))
+        {
+            stream_.next_in = reinterpret_cast<uint8_t*>(&in_buffer_[0]);
+            stream_.avail_in = std::fread(&in_buffer_[0], sizeof(uint8_t),
+                                          in_buffer_.size(), file_);
+            bytes_read_ += stream_.avail_in;
+
+            if (std::ferror(file_))
+            {
+                setg(&out_buffer_[0], &out_buffer_[0], &out_buffer_[0]);
+                return traits_type::eof();
+            }
+
+            if (std::feof(file_))
+            {
+                action_ = LZMA_FINISH;
+            }
+        }
+
+        ret = lzma_code(&stream_, action_);
+
+        throw_if_error(ret, "Decoder error");
+    } while (stream_.avail_out != 0 && ret != LZMA_STREAM_END);
+
+    // on LZMA_STREAM_END, we might not have filled the entire buffer, so
+    // compute the actual number of bytes we have in the get buffer
+    auto bytes = out_buffer_.size() - stream_.avail_out;
+    if (bytes > 0)
+    {
+        setg(&out_buffer_[0], &out_buffer_[0], &out_buffer_[0] + bytes);
+        stream_.next_out = reinterpret_cast<uint8_t*>(&out_buffer_[0]);
+        stream_.avail_out = out_buffer_.size();
+
+        return traits_type::to_int_type(*gptr());
+    }
+
+    // if we get here, we must have exhausted both the input file and the
+    // input buffer, so finally report EOF
+    setg(&out_buffer_[0], &out_buffer_[0], &out_buffer_[0]);
+    return traits_type::eof();
+}
+
+auto xzstreambuf::overflow(int_type ch) -> int_type
+{
+    if (ch != traits_type::eof())
+    {
+        *pptr() = traits_type::to_char_type(ch);
+        pbump(1);
+        if (sync() == 0)
+            return ch;
+    }
+
+    return traits_type::eof();
+}
+
+int xzstreambuf::sync()
+{
+    auto bytes = pptr() - pbase();
+    stream_.next_in = reinterpret_cast<uint8_t*>(pbase());
+    stream_.avail_in = static_cast<std::size_t>(bytes);
+
+    // Two cases:
+    // 1. We are still compressing the file, in which case we should pump
+    //    the loop until all of the available input bytes are consumed; or
+    //
+    // 2. We are done receiving input (action_ == LZMA_FINISH), in which
+    //    case we should pump the loop until we get the LZMA_STREAM_END
+    //    return code indicating that all input has been processed (note
+    //    that processed != read, hence this second case).
+    lzma_ret ret;
+    do
+    {
+        ret = lzma_code(&stream_, action_);
+
+        if (stream_.avail_out == 0 || ret == LZMA_STREAM_END)
+        {
+            auto size = out_buffer_.size() - stream_.avail_out;
+
+            if (std::fwrite(&out_buffer_[0], sizeof(uint8_t), size, file_)
+                != size)
+                return -1;
+
+            stream_.next_out = reinterpret_cast<uint8_t*>(&out_buffer_[0]);
+            stream_.avail_out = out_buffer_.size();
+        }
+
+        throw_if_error(ret, "Encoder error");
+
+    } while (stream_.avail_in > 0
+             || (action_ == LZMA_FINISH && ret != LZMA_STREAM_END));
+
+    if (bytes > 0)
+        pbump(-static_cast<int>(bytes));
+
+    return 0;
+}
+
+bool xzstreambuf::is_open() const
+{
+    return file_ != nullptr && !::ferror(file_);
+}
+
+uint64_t xzstreambuf::bytes_read() const
+{
+    return bytes_read_;
+}
+
+xzifstream::xzifstream(std::string name)
+    : std::istream{&buffer_}, buffer_{name.c_str(), "rb"}
+{
+    clear();
+}
+
+xzstreambuf* xzifstream::rdbuf() const
+{
+    return const_cast<xzstreambuf*>(&buffer_);
+}
+
+void xzifstream::flush()
+{
+    buffer_.sync();
+}
+
+uint64_t xzifstream::bytes_read() const
+{
+    return buffer_.bytes_read();
+}
+
+xzofstream::xzofstream(std::string name)
+    : std::ostream{&buffer_}, buffer_{name.c_str(), "wb"}
+{
+    clear();
+}
+
+xzstreambuf* xzofstream::rdbuf() const
+{
+    return const_cast<xzstreambuf*>(&buffer_);
+}
+
+void xzofstream::flush()
+{
+    buffer_.sync();
+}
+}
+}
diff --git a/tests/ir_eval_test.cpp b/tests/ir_eval_test.cpp
index 2de9f4bb8..339383807 100644
--- a/tests/ir_eval_test.cpp
+++ b/tests/ir_eval_test.cpp
@@ -108,7 +108,6 @@ go_bandit([]() {
                     1.0 / idcg);
         check_query(eval, results, qid, 0.1 / 0.6, 0.5, 0.1, 0.2,
                     1.0 / idcg_5, 5);
-
         results.emplace_back(doc_id{1}, 0.8); // relevant
         check_query(eval, results, qid,
                     (2.0 * (2.0 / 3.0) * 0.2) / (2.0 / 3.0 + 0.2),
@@ -126,7 +125,7 @@ go_bandit([]() {
         results.emplace_back(doc_id{38}, 0.2);  // relevant
         results.emplace_back(doc_id{754}, 0.1); // relevant
         auto avg_p_5
-            = (1.0 + 2.0 / 3.0 + 3.0 / 4.0 + 4.0 / 5.0 + 5.0 / 6.0) / 5.0;
+            = (1.0 + 2.0 / 3.0 + 3.0 / 4.0 + 4.0 / 5.0) / 5.0;
         auto avg_p
             = (1.0 + 2.0 / 3.0 + 3.0 / 4.0 + 4.0 / 5.0 + 5.0 / 6.0
               + 6.0 / 7.0 + 7.0 / 8.0 + 8.0 / 9.0 + 9.0 / 10.0 + 10.0 / 11.0)
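
Taken together, the patch introduces two small user-facing APIs: xz-compressed iostreams (`io::xz{i,o}fstream`) and read-only views of disk-backed vectors (`util::disk_vector<const T>`). The sketch below is not part of the diff; it assumes MeTA was configured with liblzma available (so `META_HAS_LIBLZMA` is defined), and the file paths are hypothetical:

```cpp
#include <cstddef>
#include <iostream>
#include <string>

#include "meta/io/xzstream.h"
#include "meta/util/disk_vector.h"

using namespace meta;

int main()
{
    // xzofstream is a std::ostream; the encoder is finalized and the
    // remaining bytes flushed when the stream goes out of scope
    {
        io::xzofstream out{"example.txt.xz"};
        out << "hello, xz world\n";
    }

    // xzifstream is a std::istream, so the usual extraction APIs work
    io::xzifstream in{"example.txt.xz"};
    std::string line;
    std::getline(in, line);
    std::cout << line << "\n";

    // a const element type makes disk_vector open the backing file
    // O_RDONLY and mmap it PROT_READ only, so this works on read-only
    // filesystems (hypothetical path to an existing disk_vector file)
    util::disk_vector<const uint64_t> view{"some-index/lexicon.index"};
    for (std::size_t i = 0; i < view.size(); ++i)
        std::cout << view[i] << "\n";

    return 0;
}
```

This read-only mode is what the `{inverted,forward}_index` fix builds on: `load_labels()` now maps the labels file as a `util::disk_vector<const label_id>`, so loading an existing index never requires write access, while index *construction* writes labels through a separate non-const `disk_vector<label_id>` scoped to the build.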