Skip to content

Commit

Permalink
Merge branch 'develop' for MeTA 2.0.0 release
Browse files Browse the repository at this point in the history
  • Loading branch information
skystrife committed Jan 30, 2016
2 parents 935af7d + 189240e commit 6c062bf
Show file tree
Hide file tree
Showing 633 changed files with 61,333 additions and 11,076 deletions.
20 changes: 20 additions & 0 deletions .appveyor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# AppVeyor CI configuration: builds and tests the project on Windows with
# the MSYS2 / MinGW-w64 toolchain.
version: '{build}'
platform: x64
configuration: Release
os: Visual Studio 2015

install:
# put the MSYS2 tools (bash, pacman, ...) first on PATH
- set PATH=C:\msys64\usr\bin;%PATH%
# run an empty command in a login shell before doing any real work
# NOTE(review): presumably this lets MSYS2 finish its first-run setup -- confirm
- bash -lc ""
# refresh the package database and update the core MSYS2 packages
- bash -lc "pacman --noconfirm --needed -Sy bash pacman pacman-mirrors msys2-runtime msys2-runtime-devel"
# we don't actually need ada, fortran, libgfortran, or objc, but in
# order to update gcc we need to also update those packages as well...
- bash -lc "pacman --noconfirm -S mingw-w64-x86_64-{gcc,gcc-ada,gcc-fortran,gcc-libgfortran,gcc-objc,cmake,make,icu,jemalloc}"
before_build:
# use the clone directory reported by AppVeyor instead of a hard-coded
# path, so this step agrees with the bash steps below and keeps working
# when the project is cloned somewhere else (/d also switches drives)
- cd /d %APPVEYOR_BUILD_FOLDER%
- git submodule update --init --recursive
# configure an out-of-source build using the MinGW-w64 toolchain
- bash -lc "export PATH=/mingw64/bin:$PATH && cd $APPVEYOR_BUILD_FOLDER && mkdir build && cd build && cmake .. -G \"MSYS Makefiles\""
build_script:
- bash -lc "export PATH=/mingw64/bin:$PATH && cd $APPVEYOR_BUILD_FOLDER/build && make"
test_script:
# config.toml is copied into the build directory before the tests are run
- bash -lc "export PATH=/mingw64/bin:$PATH && cd $APPVEYOR_BUILD_FOLDER/build && cp ../config.toml . && ctest --output-on-failure"
6 changes: 1 addition & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,10 @@ doc/
.*.swp
*.o
*.class
*.pyc
learn
features
search
tester
.*
data/ceeaus
data/breast-cancer
data/housing
biicode.conf
bii/
bin/
10 changes: 6 additions & 4 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
path = deps/cpptoml
url = https://github.com/skystrife/cpptoml.git
branch = master
[submodule "deps/porter2_stemmer"]
path = deps/porter2_stemmer
url = https://github.com/meta-toolkit/porter2_stemmer.git
branch = meta
[submodule "deps/libsvm-modules"]
path = deps/libsvm-modules
url = https://github.com/meta-toolkit/meta-libsvm.git
[submodule "deps/findicu"]
path = deps/findicu
url = https://github.com/julp/FindICU.cmake.git
[submodule "deps/meta-stlsoft"]
path = deps/meta-stlsoft
url = https://github.com/meta-toolkit/meta-stlsoft.git
[submodule "deps/bandit"]
path = deps/bandit
url = https://github.com/joakimkarlsson/bandit.git
108 changes: 71 additions & 37 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,49 +3,83 @@ notifications:

language: cpp

compiler:
- clang
- gcc
sudo: false

addons:
apt:
packages: &default-packages
- libjemalloc-dev
- zlib1g-dev

matrix:
include:
# Linux/GCC 4.8
- os: linux
env: COMPILER=gcc GCC_VERSION=4.8
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- *default-packages
- gcc-4.8
- g++-4.8

# Linux/GCC 4.9
- os: linux
env: COMPILER=gcc GCC_VERSION=4.9
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- *default-packages
- gcc-4.9
- g++-4.9

# Linux/GCC 5
- os: linux
env: COMPILER=gcc GCC_VERSION=5
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- *default-packages
- gcc-5
- g++-5

# Linux/Clang 3.6
- os: linux
env: COMPILER=clang CLANG_VERSION=3.6
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.6
packages:
- *default-packages
- clang-3.6
- llvm-3.6-dev

# OSX/Clang (XCode)
- os: osx
env: COMPILER=clang

# OSX/GCC 5
- os: osx
env: COMPILER=gcc

install:
- sudo apt-get update -qq
- sudo apt-get install libc6-i386
- wget http://www.cmake.org/files/v3.0/cmake-3.0.1-Linux-i386.sh
- sudo sh cmake-3.0.1-Linux-i386.sh --prefix=/usr/local --exclude-subdir
- sudo apt-get install libicu-dev
# credit: https://github.com/beark/ftl/
# install g++ 4.8, if tests are run with g++
- if [ "`echo $CXX`" == "g++" ]; then sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test; fi
- if [ "`echo $CXX`" == "g++" ]; then sudo apt-get update; fi
- if [ "`echo $CXX`" == "g++" ]; then sudo apt-get install -qq g++-4.8; fi
- if [ "`echo $CXX`" == "g++" ]; then sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 50; fi
# install libc++ if tests are run with clang++
- if [ "`echo $CXX`" == "clang++" ]; then cwd=$(pwd); fi
- if [ "`echo $CXX`" == "clang++" ]; then svn co --quiet http://llvm.org/svn/llvm-project/libcxx/trunk libcxx; fi
- if [ "`echo $CXX`" == "clang++" ]; then git clone https://github.com/pathscale/libcxxrt.git libcxxrt; fi
- if [ "`echo $CXX`" == "clang++" ]; then cd libcxxrt; fi
- if [ "`echo $CXX`" == "clang++" ]; then mkdir build; fi
- if [ "`echo $CXX`" == "clang++" ]; then cd build; fi
- if [ "`echo $CXX`" == "clang++" ]; then cmake -DCMAKE_BUILD_TYPE=Release ../; fi
- if [ "`echo $CXX`" == "clang++" ]; then make; fi
- if [ "`echo $CXX`" == "clang++" ]; then sudo cp lib/libcxxrt.so /usr/lib; fi
- if [ "`echo $CXX`" == "clang++" ]; then sudo ln -sf /usr/lib/libcxxrt.so /usr/lib/libcxxrt.so.1; fi
- if [ "`echo $CXX`" == "clang++" ]; then sudo ln -sf /usr/lib/libcxxrt.so /usr/lib/libcxxrt.so.1.0; fi
- if [ "`echo $CXX`" == "clang++" ]; then cd $cwd; fi
- if [ "`echo $CXX`" == "clang++" ]; then cd libcxx; fi
- if [ "`echo $CXX`" == "clang++" ]; then mkdir build; fi
- if [ "`echo $CXX`" == "clang++" ]; then cd build; fi
- if [ "`echo $CXX`" == "clang++" ]; then cmake -DLIBCXX_CXX_ABI=libcxxrt -DLIBCXX_CXX_ABI_INCLUDE_PATHS="../../libcxxrt/src" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr ..; fi
- if [ "`echo $CXX`" == "clang++" ]; then make; fi
- if [ "`echo $CXX`" == "clang++" ]; then sudo make install; fi
- if [ "`echo $CXX`" == "clang++" ]; then cd $cwd; fi
- if [ "$TRAVIS_OS_NAME" == "linux" ]; then source travis/install_linux.sh; fi
- if [ "$TRAVIS_OS_NAME" == "osx" ]; then source travis/install_osx.sh; fi

before_script:
- mkdir build
- cd build
- cp ../config.toml ./

script:
- cmake ../ -DCMAKE_BUILD_TYPE=Debug
- make
- ctest --output-on-failure
- git submodule update --init --recursive
- ../travis/cmake.sh Debug && make && make clean
- rm -rf CMake* && ../travis/cmake.sh Release && make && ctest --output-on-failure
127 changes: 125 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,126 @@
# [v2.0.0][2.0.0]
## New features and major changes

### Indexing
- Index format rewrite: both inverted and forward indices now use the same
compressed postings format, and intermediate chunks are now also
compressed on-the-fly. There is now a built-in tool to dump any forward
index to libsvm format (as this is not the on-disk format for that type
of index anymore).
- Metadata support: indices can now store arbitrary metadata associated
with individual documents with string, integer, unsigned integer, and
floating point values
- Corpus configuration is now stored within the corpus directory itself,
allowing for corpora to be distributed with their proper configurations
rather than having to bake this into the main configuration file
- RAM limits can be set for the indexing process via the configuration
file. These are **approximate** and based on heuristics, so you should
always set these to lower than available RAM.
- Forward indices can now be created directly instead of forcing the
creation of an inverted index first

### Tokenization and Analysis
- ICU will be built and statically linked if the system provided library is
too old on both OS X and Linux platforms. MeTA now will specify an
exact version of ICU that should be used per release for consistency.
That version is 56.1 as of this release.
- Analyzers have been modified to support both integral and floating point
values via the use of the `featurizer` object passed to `tokenize()`
- Documents no longer store any count information during the analysis
process

### Ranking
- Postings lists can now be read in a streaming fashion rather than all at
once via `postings_stream`
- Ranking is now performed using a document-at-a-time scheme
- Ranking functions now use fast approximate math from
[fastapprox][fastapprox]
- Rank correlation measures have been added to the evaluation library

### Language Model
- Rewrite of the language model library which can load models from the
[.arpa][arpa] format
- [SyntacticDiff][syndiff] implementation for comparative text mining, which may
include grammatical error correction, summarization, or feature generation

### Machine Learning
- A feature selection library for selecting features for machine learning
using chi square, information gain, correlation coefficient, and odds
ratio has been added
- The API for the machine learning algorithms has been changed to use
`dataset` classes; these are separate from the index classes and
represent data that is memory-resident
- Support for regression has been added (currently only via SGD)
- The SGD algorithm has been improved to use a normalized adaptive gradient
method which should make it less sensitive to feature scaling
- The SGD algorithm now supports (approximate) L1 regularization via a
cumulative penalty approach
- The libsvm modules are now also built using CMake

### Miscellaneous
- Packed binary I/O functions allow for writing integers/floating point
values in a compressed format that can be efficiently decoded. This
should be used for most binary I/O that needs to be performed in the
toolkit unless there is a specific reason not to.
- An interactive demo application has been added for the shift-reduce
constituency parser
- A `string_view` class is provided in the `meta::util` namespace to be
used for non-owning references to strings. This will use
`std::experimental::string_view` if available and our own
implementation if not
- `meta::util::optional` will resolve to `std::experimental::optional` if
it is available
- Support for jemalloc has been added to the build system. We **strongly**
recommend installing and linking against jemalloc for improved indexing
performance.
- A tool has been added to print out the top *k* terms in a corpus
- A new library for hashing has been added in namespace `meta::hashing`.
This includes a generic framework for writing hash functions that are
randomly keyed as well as (insertion only) probing-based hash sets/maps
with configurable resizing and probing strategies
- A utility class `fixed_heap` has been added for places where a fixed size
set of maximal/minimal values should be maintained in constant space
- The filesystem management routines have been converted to use STLsoft in
the event that the filesystem library in
`std::experimental::filesystem` is not available
- Building MeTA on Windows is now officially supported via MSYS2 and
MinGW-w64, and continuous integration now builds it on every commit in
this environment
- A small support library for things related to random number generation
has been added in `meta::random`
- Sparse vectors now support `operator+` and `operator-`
- An STL container compatible allocator `aligned_allocator<T, Alignment>`
has been added that can over-align data (useful for performance in some
situations)
- Bandit is now used for the unit tests, and these have been substantially
improved upon
- `io::parser` deprecated and removed; most uses simply converted to
`std::fstream`
- `binary_file_{reader,writer}` deprecated and removed;
`io::packed` or `io::{read,write}_binary` should be used instead

## Bug fixes
- knn classifier now only requests the top *k* when performing classification
- Fix an issue where uncompressed model files would not be found if using
a zlib-enabled build (#101)

## Enhancements
- Travis CI integration has been switched to their container
infrastructure, and it now builds on OS X with Clang in addition to
Linux with Clang and GCC
- AppVeyor CI has been added for Windows builds alongside Travis
- Indexing speeds are dramatically faster (thanks to many changes both in
the in-memory posting chunks as well as optimizations in the
tokenization process)
- If no build type is specified, MeTA will be built in Release mode
- The cpptoml dependency version has been bumped, allowing the use of
things like `value_or` for cleaner code
- The identifiers library has been dramatically simplified

[syndiff]: http://web.engr.illinois.edu/~massung1/files/bigdata-2015.pdf
[fastapprox]: https://code.google.com/p/fastapprox/
[arpa]: http://www.speech.sri.com/projects/srilm/manpages/ngram-format.5.html

# [v1.3.8][1.3.8]
## Bug fixes
- Fix issue with `confusion_matrix` where precision and recall values were
Expand All @@ -10,7 +133,6 @@
- Create a `predicted_label` opaque identifier to emphasize `class_labels` that
are output from some model (and thus shouldn't be interchangeable)


# [v1.3.7][1.3.7]
## Bug fixes
- Fix inconsistent behavior of `utf::segmenter` (and thus `icu_tokenizer`) for
Expand Down Expand Up @@ -164,7 +286,8 @@
# [v1.0][1.0]
- Initial release.

[unreleased]: https://github.com/meta-toolkit/meta/compare/v1.3.8...develop
[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.0.0...develop
[2.0.0]: https://github.com/meta-toolkit/meta/compare/v1.3.8...v2.0.0
[1.3.8]: https://github.com/meta-toolkit/meta/compare/v1.3.7...v1.3.8
[1.3.7]: https://github.com/meta-toolkit/meta/compare/v1.3.6...v1.3.7
[1.3.6]: https://github.com/meta-toolkit/meta/compare/v1.3.5...v1.3.6
Expand Down
Loading

0 comments on commit 6c062bf

Please sign in to comment.