diff --git a/.travis.yml b/.travis.yml index 0c90f8c00..3e140dd36 100644 --- a/.travis.yml +++ b/.travis.yml @@ -62,11 +62,26 @@ matrix: - clang-3.6 - llvm-3.6-dev - # OSX/Clang (XCode) + # OS X 10.9 + Xcode 6.1 - os: osx env: COMPILER=clang - # OSX/GCC 5 + # OS X 10.10 + Xcode 6.4 + - os: osx + osx_image: xcode6.4 + env: COMPILER=clang + + # OS X 10.10 + Xcode 7.1.1 + - os: osx + osx_image: xcode7.1 + env: COMPILER=clang + + # OS X 10.11 + Xcode 7.2 + - os: osx + osx_image: xcode7.2 + env: COMPILER=clang + + # OS X/GCC 5 - os: osx env: COMPILER=gcc diff --git a/CHANGELOG.md b/CHANGELOG.md index 842e9efa0..95957989f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,21 @@ +# [v2.0.1][2.0.1] +## Bug fixes +- Fix issue where `metadata_parser` would not consume spaces in string + metadata fields. Thanks to @hopsalot on the forum for the bug report! +- Fix build issue on OS X with Xcode 6.4 and `clang` related to their + shipped version of `string_view` lacking a const `to_string()` method + +## Enhancements +- The `./profile` executable ensures that the file exists before operating on + it. Thanks to @domarps for the PR! +- Add a generic `util::multiway_merge` algorithm for performing the + merge-step of an external memory merge sort. +- Build with the following Xcode versions on Travis CI: + * Xcode 6.1 and OS X 10.9 (as before) + * Xcode 6.4 and OS X 10.10 (new) + * Xcode 7.1.1 and OS X 10.10 (new) + * Xcode 7.2 and OS X 10.11 (new) + # [v2.0.0][2.0.0] ## New features and major changes @@ -286,7 +304,8 @@ # [v1.0][1.0] - Initial release. -[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.0.0...develop +[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.0.1...develop +[2.0.1]: https://github.com/meta-toolkit/meta/compare/v2.0.0...v2.0.1 [2.0.0]: https://github.com/meta-toolkit/meta/compare/v1.3.8...v2.0.0 [1.3.8]: https://github.com/meta-toolkit/meta/compare/v1.3.7...v1.3.8 [1.3.7]: https://github.com/meta-toolkit/meta/compare/v1.3.6...v1.3.7 diff --git a/CMakeLists.txt b/CMakeLists.txt index 03bfdaca1..c9262053f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,7 +214,10 @@ endif() check_cxx_source_compiles(" #include int main() { - std::experimental::string_view sv = \"hello world\"; + const std::experimental::string_view sv = \"hello world\"; + // test that string_view has to_string() const method + // Xcode 6.4 appears to have shipped a string_view without it + auto str = sv.to_string(); return 0; }" META_HAS_EXPERIMENTAL_STRING_VIEW) diff --git a/LICENSE.mit b/LICENSE.mit index e62cf6623..a4742a015 100644 --- a/LICENSE.mit +++ b/LICENSE.mit @@ -1,4 +1,4 @@ -Copyright (c) 2015 Sean Massung, Chase Geigle +Copyright (c) 2016 Sean Massung, Chase Geigle Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/LICENSE.ncsa b/LICENSE.ncsa index 7ca35777f..5a1e3c532 100644 --- a/LICENSE.ncsa +++ b/LICENSE.ncsa @@ -1,9 +1,9 @@ -Copyright (c) 2015 Sean Massung, Chase Geigle +Copyright (c) 2016 Sean Massung, Chase Geigle All rights reserved. Developed by: MeTA Team University of Illinois at Urbana-Champaign - http://meta-toolkit.github.io/meta/ + https://meta-toolkit.org Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with diff --git a/README.md b/README.md index 97481ac78..18b75dff9 100644 --- a/README.md +++ b/README.md @@ -470,7 +470,7 @@ you should run the following commands to download dependencies and related software needed for building: ```bash -pacman -Syu git mingw-w64-x86_64-{gcc,cmake,make,icu,jemalloc} +pacman -Syu git make mingw-w64-x86_64-{gcc,cmake,icu,jemalloc} ``` Then, exit the shell and launch the "MinGW-w64 Win64" shell. You can obtain diff --git a/cmake/FindOrBuildICU.cmake b/cmake/FindOrBuildICU.cmake index e5b5871eb..4308b4bf3 100644 --- a/cmake/FindOrBuildICU.cmake +++ b/cmake/FindOrBuildICU.cmake @@ -74,7 +74,7 @@ function(FindOrBuildICU) PREFIX ${ICU_EP_PREFIX} URL ${FindOrBuildICU_URL} URL_HASH ${FindOrBuildICU_URL_HASH} - CONFIGURE_COMMAND ${ICU_EP_PREFIX}/src/ExternalICU/source/runConfigureICU ${ICU_PLATFORM} + CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${ICU_EP_PREFIX}/src/ExternalICU/source/runConfigureICU ${ICU_PLATFORM} --disable-shared --enable-static --disable-dyload --disable-extras --disable-tests --disable-samples --prefix= diff --git a/deps/cpptoml b/deps/cpptoml index 51d517ccc..3eb45d727 160000 --- a/deps/cpptoml +++ b/deps/cpptoml @@ -1 +1 @@ -Subproject commit 51d517ccc4b6e1030b15fb5305dd05ccadb94f84 +Subproject commit 3eb45d7278167f1032b8d0a4aa1add2965d4272f diff --git a/include/meta/index/chunk_reader.h b/include/meta/index/chunk_reader.h index 1551685c5..0aa615599 100644 --- a/include/meta/index/chunk_reader.h +++ b/include/meta/index/chunk_reader.h @@ -19,17 +19,75 @@ #include "meta/io/filesystem.h" #include "meta/util/progress.h" #include "meta/util/shim.h" +#include "meta/util/multiway_merge.h" namespace meta { namespace index { +/** + * Simple wrapper class to adapt PostingsData to the Record concept for + * multiway_merge. + */ +template +class postings_record +{ + public: + using primary_key_type = typename PostingsData::primary_key_type; + using count_t = typename PostingsData::count_t; + + postings_record() = default; + + operator PostingsData() && + { + PostingsData pdata{key_}; + pdata.set_counts(std::move(counts_)); + return pdata; + } + + void merge_with(postings_record&& other) + { + std::move(other.counts_.begin(), other.counts_.end(), + std::back_inserter(counts_)); + count_t{}.swap(other.counts_); + } + + template + uint64_t read(InputStream& in) + { + PostingsData pdata; + auto bytes = pdata.read_packed(in); + key_ = pdata.primary_key(); + counts_ = pdata.counts(); + return bytes; + } + + bool operator<(const postings_record& other) const + { + return key_ < other.key_; + } + + bool operator==(const postings_record& other) const + { + return key_ == other.key_; + } + + count_t& counts() const + { + return counts_; + } + + private: + primary_key_type key_; + count_t counts_; +}; + /** * Represents an on-disk chunk to be merged with multi-way merge sort. Each * chunk_reader stores the file it's reading from, the total bytes needed * to be read, and the current number of bytes read, as well as buffers in - * one postings. + * one postings_record. */ template class chunk_reader @@ -40,13 +98,15 @@ class chunk_reader /// the path to the file we're reading from std::string path_; /// the current buffered postings data - PostingsData postings_; + postings_record postings_; /// the total number of bytes in the chunk we're reading uint64_t total_bytes_; /// the total number of bytes read uint64_t bytes_read_; public: + using value_type = postings_record; + /** * Constructs a new chunk reader from the given chunk path. * @param filename The path to the chunk to be read @@ -61,61 +121,25 @@ class chunk_reader } /** - * Destroys the reader **and the chunk file it was reading from**. + * Constructs an empty chunk reader. */ - ~chunk_reader() + chunk_reader() : total_bytes_{0}, bytes_read_{0} { - if (file_) - { - file_ = nullptr; - filesystem::delete_file(path_); - } + // nothing } - /** - * chunk_reader can be move constructed. - */ chunk_reader(chunk_reader&&) = default; /** - * chunk_reader can be move assigned. - * @param rhs the right hand side of the assignment + * Destroys the reader **and the chunk file it was reading from**. */ - chunk_reader& operator=(chunk_reader&& rhs) + ~chunk_reader() { if (file_) { file_ = nullptr; filesystem::delete_file(path_); } - - file_ = std::move(rhs.file_); - path_ = std::move(rhs.path_); - postings_ = std::move(rhs.postings_); - total_bytes_ = rhs.total_bytes_; - bytes_read_ = rhs.bytes_read_; - - return *this; - } - - /** - * Whether or not the chunk_reader is in a good state. - * @return whether the underlying stream is in a good state - */ - operator bool() const - { - return static_cast(*file_); - } - - /** - * Comparison operator for sorting. - * @param other the other reader to compare with - * @return whether the current reader's postings's primary_key is less - * than other's - */ - bool operator<(const chunk_reader& other) const - { - return postings_ < other.postings_; } /** @@ -123,7 +147,7 @@ class chunk_reader */ void operator++() { - bytes_read_ += postings_.read_packed(*file_); + bytes_read_ += postings_.read(*file_); } /** @@ -145,12 +169,48 @@ class chunk_reader /** * @return the current buffered postings object */ - const PostingsData& postings() const + postings_record& operator*() + { + return postings_; + } + + /** + * @return the current buffered postings object + */ + const postings_record& operator*() const { return postings_; } + + /** + * Whether this chunk_reader is equal to another. chunk_readers are + * equal if they are both exhausted or both are reading from the same + * path and have read the same number of bytes. + */ + bool operator==(const chunk_reader& other) const + { + if (!other.file_) + { + return !file_ || !static_cast(*file_); + } + else + { + return std::tie(path_, bytes_read_) + == std::tie(other.path_, bytes_read_); + } + } }; +/** + * Whether two chunk_readers differ. Defined in terms of operator==. + */ +template +bool operator!=(const chunk_reader& a, + const chunk_reader& b) +{ + return !(a == b); +} + /** * Performs a multi-way merge sort of all of the provided chunks, writing * to the provided output stream. Currently, this function will attempt @@ -174,70 +234,11 @@ uint64_t multiway_merge(std::ostream& outstream, ForwardIterator begin, for (; begin != end; ++begin) to_merge.emplace_back(*begin); - printing::progress progress{ - " > Merging postings: ", - std::accumulate(to_merge.begin(), to_merge.end(), 0ul, - [](uint64_t acc, const input_chunk& chunk) - { - return acc + chunk.total_bytes(); - })}; - - uint64_t unique_primary_keys = 0; - - uint64_t total_read - = std::accumulate(to_merge.begin(), to_merge.end(), 0ul, - [](uint64_t acc, const input_chunk& chunk) - { - return acc + chunk.bytes_read(); - }); - while (!to_merge.empty()) - { - progress(total_read); - ++unique_primary_keys; - - std::sort(to_merge.begin(), to_merge.end()); - - // gather all postings that match the smallest primary key, reading - // a new postings from the corresponding file - auto range = std::equal_range(to_merge.begin(), to_merge.end(), - *to_merge.begin()); - auto min_pk = range.first->postings().primary_key(); - - using count_t = typename PostingsData::count_t; - std::vector to_write; - to_write.reserve( - static_cast(std::distance(range.first, range.second))); - std::for_each(range.first, range.second, [&](input_chunk& chunk) - { - to_write.emplace_back(chunk.postings().counts()); - auto before = chunk.bytes_read(); - ++chunk; - total_read += (chunk.bytes_read() - before); - }); - - // merge them all into one big counts vector - count_t counts; - std::for_each(to_write.begin(), to_write.end(), [&](count_t& pd) - { - std::move(pd.begin(), pd.end(), - std::back_inserter(counts)); - count_t{}.swap(pd); - }); - - // write out the merged counts - PostingsData output{std::move(min_pk)}; - output.set_counts(std::move(counts)); - output.write_packed(outstream); - - // remove all empty chunks from the input - to_merge.erase(std::remove_if(to_merge.begin(), to_merge.end(), - [](const input_chunk& chunk) - { - return !chunk; - }), - to_merge.end()); - } - return unique_primary_keys; + return util::multiway_merge(to_merge.begin(), to_merge.end(), + [&](PostingsData&& pdata) + { + pdata.write_packed(outstream); + }); } } } diff --git a/include/meta/util/multiway_merge.h b/include/meta/util/multiway_merge.h new file mode 100644 index 000000000..61b49da77 --- /dev/null +++ b/include/meta/util/multiway_merge.h @@ -0,0 +1,145 @@ +/** + * @file multiway_merge.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_UTIL_MULTIWAY_MERGE_H_ +#define META_UTIL_MULTIWAY_MERGE_H_ + +#include +#include +#include +#include +#include + +#include "meta/util/progress.h" + +namespace meta +{ +namespace util +{ + +/** + * A generic algorithm for performing an N-way merge on a collection of + * sorted "chunks". + * + * The following concepts are involved: + * + * - Record: + * A Record must represent the atomic items that are to be merged. They + * are comparable via operator< and operator==, and must have a member + * function merge_with(Record&&). During the merging process, Records + * will be read from the individual chunks (via + * ChunkIterator::operator++), merge_with will be called across all + * Records across all chunks that compare equal, and the final merged + * Record will be passed to the write callback. + * + * - ForwardIterator: + * A basic ForwardIterator whose value type must model the + * ChunkIterator concept. + * + * - ChunkIterator: + * A ChunkIterator is an iterator over Records within a chunk. + * Typically, these will be streamed-in from disk, and thus + * ChunkIterator can be thought of as an InputIterator with a few + * additional functions. + * + * ChunkIterators must dereference to a Record. + * + * Comparison operators for (in)equality must be provided. A + * ChunkIterator that cannot read any more Records must compare equal + * to the default-constructed ChunkIterator. + * + * ChunkIterator's operator++ shall buffer in (at least) the next + * Record, which can be obtained via the dereference operator. + * + * ChunkIterators also keep track of the state of the underlying + * InputStream from which Records are read. ChunkIterators shall + * provide two observer functions for the state of that stream: + * total_bytes(), which indicates the total number of bytes that occur + * in the stream, and bytes_read(), which indicates the total number of + * bytes that have been consumed from the stream thus far. When + * bytes_read() == total_bytes(), the stream has been exhausted and the + * iterator shall compare equal to the default-constructed + * ChunkIterator. + * + * @return the total number of unique Records that were written to the + * OutputStream + */ + +template +uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, + RecordHandler&& output) +{ + using ChunkIterator = typename ForwardIterator::value_type; + using Record = typename ChunkIterator::value_type; + + uint64_t to_read = std::accumulate( + begin, end, 0ul, [](uint64_t acc, const ChunkIterator& chunk) + { + return acc + chunk.total_bytes(); + }); + + printing::progress progress{" > Merging: ", to_read}; + + uint64_t total_read = std::accumulate( + begin, end, 0ul, [](uint64_t acc, const ChunkIterator& chunk) + { + return acc + chunk.bytes_read(); + }); + + std::vector> to_merge; + to_merge.reserve(static_cast(std::distance(begin, end))); + for (; begin != end; ++begin) + to_merge.emplace_back(*begin); + + auto chunk_iter_comp = [](const ChunkIterator& a, const ChunkIterator& b) + { + return *a < *b; + }; + + uint64_t unique_records = 0; + while (!to_merge.empty()) + { + progress(total_read); + ++unique_records; + + std::sort(to_merge.begin(), to_merge.end(), chunk_iter_comp); + + // gather all Records that match the smallest Record, reading in a + // new Record from the corresponding ChunkIterator + auto range = std::equal_range(to_merge.begin(), to_merge.end(), + to_merge.front(), chunk_iter_comp); + + Record merged{std::move(*(*range.first).get())}; + ++(*range.first).get(); + ++range.first; + std::for_each(range.first, range.second, [&](ChunkIterator& iter) + { + merged.merge_with(std::move(*iter)); + auto before = iter.bytes_read(); + ++iter; + total_read += (iter.bytes_read() - before); + }); + + // write out merged record + output(std::move(merged)); + + // remove all empty ChunkIterators from the input + to_merge.erase(std::remove_if(to_merge.begin(), to_merge.end(), + [](const ChunkIterator& iter) + { + return iter == ChunkIterator{}; + }), + to_merge.end()); + } + + return unique_records; +} +} +} +#endif diff --git a/src/corpus/metadata_parser.cpp b/src/corpus/metadata_parser.cpp index abb8293d9..f19a9ceff 100644 --- a/src/corpus/metadata_parser.cpp +++ b/src/corpus/metadata_parser.cpp @@ -3,9 +3,12 @@ * @author Chase Geigle */ +#include + #include "meta/corpus/metadata_parser.h" #include "meta/io/filesystem.h" #include "meta/util/shim.h" +#include "meta/util/string_view.h" namespace meta { @@ -14,8 +17,7 @@ namespace corpus metadata_parser::metadata_parser(const std::string& filename, metadata::schema schema) - : infile_{make_unique(filename)}, - schema_{std::move(schema)} + : infile_{make_unique(filename)}, schema_{std::move(schema)} { // nothing } @@ -26,32 +28,50 @@ std::vector metadata_parser::next() std::string str; if (*infile_) { + std::getline(*infile_, str); + util::string_view line{str}; mdata.reserve(schema_.size()); for (const auto& finfo : schema_) { - if (!*infile_) + if (!*infile_ || line.empty()) throw metadata_exception{ "metadata input file ended prematurely"}; - *infile_ >> str; + auto delim_pos = line.find('\t'); + auto token = line.substr(0, delim_pos); + + char* end = nullptr; switch (finfo.type) { case metadata::field_type::SIGNED_INT: - mdata.emplace_back(static_cast(std::stol(str))); + { + auto val = std::strtol(token.data(), &end, 10); + mdata.emplace_back(static_cast(val)); break; + } case metadata::field_type::UNSIGNED_INT: - mdata.emplace_back(static_cast(std::stoul(str))); + { + auto val = std::strtoul(token.data(), &end, 10); + mdata.emplace_back(static_cast(val)); break; + } case metadata::field_type::DOUBLE: - mdata.emplace_back(std::stod(str)); + { + auto val = std::strtod(token.data(), &end); + mdata.emplace_back(val); break; + } case metadata::field_type::STRING: - mdata.emplace_back(std::move(str)); + { + mdata.emplace_back(token.to_string()); break; + } } + + line = line.substr(std::min(delim_pos, line.size() - 1) + 1); } } return mdata; diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index ebf166c91..556b22d32 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -416,53 +416,25 @@ void forward_index::impl::merge_chunks( chunks.emplace_back(filename); } - printing::progress progress{ - " > Merging postings: ", - std::accumulate(chunks.begin(), chunks.end(), 0ul, - [](uint64_t acc, const input_chunk& chunk) - { - return acc + chunk.total_bytes(); - })}; - - uint64_t total_read - = std::accumulate(chunks.begin(), chunks.end(), 0ul, - [](uint64_t acc, const input_chunk& chunk) - { - return acc + chunk.bytes_read(); - }); - - while (!chunks.empty()) - { - progress(total_read); - - // find the lowest doc id - auto min_chunk = std::min_element(chunks.begin(), chunks.end()); - - // steal the postings and advance the chunk - auto to_write = min_chunk->postings(); - auto before = min_chunk->bytes_read(); - ++*min_chunk; - total_read += min_chunk->bytes_read() - before; - - // if there were no more postings, remove the chunk for the input - if (!*min_chunk) - chunks.erase(min_chunk); - - // renumber the postings - forward_index::postings_data_type::count_t counts; - counts.reserve(to_write.counts().size()); - for (const auto& count : to_write.counts()) - { - const auto& key = keys.at(count.first); - auto it = vocab.find(key); - assert(it != vocab.end()); - counts.emplace_back(it->value(), count.second); - } - - // set the new counts and write to the postings file - to_write.set_counts(std::move(counts)); - writer.write(to_write); - } + util::multiway_merge(chunks.begin(), chunks.end(), + [&](forward_index::postings_data_type&& to_write) + { + // renumber the postings + forward_index::postings_data_type::count_t counts; + counts.reserve(to_write.counts().size()); + for (const auto& count : to_write.counts()) + { + const auto& key = keys.at(count.first); + auto it = vocab.find(key); + assert(it != vocab.end()); + counts.emplace_back(it->value(), count.second); + } + + // set the new counts and write to the postings + // file + to_write.set_counts(std::move(counts)); + writer.write(to_write); + }); } void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) diff --git a/src/tools/profile.cpp b/src/tools/profile.cpp index 4902f70ba..5e2a6bb8f 100644 --- a/src/tools/profile.cpp +++ b/src/tools/profile.cpp @@ -313,6 +313,11 @@ int main(int argc, char* argv[]) auto config = cpptoml::parse_file(argv[1]); std::string file = argv[2]; + if (!filesystem::file_exists(file)) + { + std::cerr << "File does not exist" << std::endl; + return 1; + } std::unordered_set args{argv + 3, argv + argc}; bool all = args.find("--all") != args.end(); diff --git a/tests/metadata_test.cpp b/tests/metadata_test.cpp index b2bc72cc6..39ec8bc3d 100644 --- a/tests/metadata_test.cpp +++ b/tests/metadata_test.cpp @@ -97,5 +97,46 @@ go_bandit([]() { filesystem::delete_file(filename); }); + + it("should read string metadata with spaces", [&]() { + /// @see https://github.com/meta-toolkit/meta/issues/127 + options_type options = { + {"path", "string"}, {"title", "string"}, {"comment", "string"}}; + auto config = create_metadata_config(options); + + const std::string metadata + = "/my/path1\tWonderful Ducklings\ta great children's book\n" + "/my/path2\tSo Many Goose\tI saw their tiny little feet"; + create_metadata_file(metadata, filename); + + corpus::metadata_parser parser{filename, + corpus::metadata_schema(*config)}; + + auto fields = parser.next(); + AssertThat(fields.size(), Equals(3ul)); + AssertThat(fields[0].type, + Equals(corpus::metadata::field_type::STRING)); + AssertThat(fields[0].str, Equals("/my/path1")); + AssertThat(fields[1].type, + Equals(corpus::metadata::field_type::STRING)); + AssertThat(fields[1].str, Equals("Wonderful Ducklings")); + AssertThat(fields[2].type, + Equals(corpus::metadata::field_type::STRING)); + AssertThat(fields[2].str, Equals("a great children's book")); + + fields = parser.next(); + AssertThat(fields.size(), Equals(3ul)); + AssertThat(fields[0].type, + Equals(corpus::metadata::field_type::STRING)); + AssertThat(fields[0].str, Equals("/my/path2")); + AssertThat(fields[1].type, + Equals(corpus::metadata::field_type::STRING)); + AssertThat(fields[1].str, Equals("So Many Goose")); + AssertThat(fields[2].type, + Equals(corpus::metadata::field_type::STRING)); + AssertThat(fields[2].str, Equals("I saw their tiny little feet")); + + filesystem::delete_file(filename); + }); }); }); diff --git a/travis/install_osx.sh b/travis/install_osx.sh index fbcaebc85..99dd5651e 100755 --- a/travis/install_osx.sh +++ b/travis/install_osx.sh @@ -1,7 +1,13 @@ set -v brew update brew install icu4c jemalloc -brew outdated cmake || brew upgrade cmake + +# Travis Xcode 7.1+ images don't have cmake installed for some reason +if ! [ -x "$(command -v cmake)" ]; then + brew install cmake +else + brew outdated cmake || brew upgrade cmake +fi if [ "$COMPILER" == "gcc" ]; then brew tap homebrew/versions