Merge branch 'develop'

meta-toolkit · Mar 4, 2015 · a57a814 · a57a814
2 parents 3577fd3 + ff72026
commit a57a814
Show file tree

Hide file tree

Showing 263 changed files with 11,588 additions and 2,981 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,5 @@ features
 search
 tester
 .*
+data/ceeaus
+data/breast-cancer
diff --git a/.travis.yml b/.travis.yml
@@ -8,6 +8,7 @@ compiler:
     - gcc
 
 install:
+  - sudo apt-get update -qq
   - sudo apt-get install libc6-i386
   - wget http://www.cmake.org/files/v3.0/cmake-3.0.1-Linux-i386.sh
   - sudo sh cmake-3.0.1-Linux-i386.sh --prefix=/usr/local --exclude-subdir
@@ -40,19 +41,8 @@ install:
   - if [ "`echo $CXX`" == "clang++" ]; then cd $cwd; fi
 
 before_script:
-  - cd deps/libsvm-modules/liblinear
-  - make
-  - cd ../libsvm
-  - make
-  - cd ../../../
   - mkdir build
   - cd build
-  - wget http://web.engr.illinois.edu/~massung1/files/ceeaus.tar.gz
-  - wget http://web.engr.illinois.edu/~massung1/files/breast-cancer.tar.gz
-  - tar -xzvf ceeaus.tar.gz > /dev/null
-  - tar -xzvf breast-cancer.tar.gz > /dev/null
-  - mv ceeaus ../data/
-  - mv breast-cancer ../data/
   - cp ../config.toml ./
 
 script:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,96 @@
+# [Unreleased][unreleased]
+- nothing yet!
+
+# [v1.3][1.3]
+## New features
+- additions to the graph library:
+    * myopic search
+    * BFS
+    * preferential attachment graph generation model (supports node
+        attractiveness from different distributions)
+    * betweenness centrality
+    * eigenvector centrality
+- added a new natural language parsing library:
+    * parse tree library (visitor-based)
+    * shift-reduce constituency parser for generating phrase structure
+        trees
+    * reimplementation of evalb metrics for evaluating parsers
+    * new filter for Penn Treebank-style normalization
+- added a greedy averaged Perceptron-based tagger
+- demo application for various basic text processing (profile)
+- basic iostreams that support gzip compression (if compiled with ZLib
+    support)
+- added iteration method for `stats::multinomial` seen events
+- added expected value and entropy functions to `stats` namespace
+- added `linear_model`: a generic multiclass classifier storage class
+- added `gz_corpus`: a compressed version of `line_corpus`
+- added macros for generating type safe identifiers with user defined
+    literal suffixes
+- added a persistent stack data structure to `meta::util`
+
+## Enhancements
+- added operator== for `util::optional<T>`
+- better CMake support for building the libsvm modules
+- better CMake support for downloading unit-test data
+- improved setup guide in README (for OS X, Ubuntu, Arch, and EWS/ENGRIT)
+- tree analyzers refactored to use the new parser library (removes
+    dependency on outside toolkits for generating tree files)
+- analyzers that are not part of the "core" have been moved into their
+    respective folders (so `ngram_pos_analyzer` is in `src/sequence`,
+    `tree_analyzer` is in `src/parser`)
+- `make_index` now checks if the files exist before loading an index, and
+    if they are missing creates a new one (as opposed to just throwing an
+    exception on a nonexistent file)
+- cpptoml upgraded to support TOML v0.4.0
+- enable extra warnings (-Wextra) for clang++ and g++
+
+## Bug fixes
+- fix `sequence_analyzer::analyze() const` when applied to untagged
+    sequences (was throwing when it shouldn't)
+- ensure that the inverted index object is destroyed first before
+    uninverting occurs in the creation of a `forward_index`
+- fix bug where `icu_tokenizer` would output spaces as tokens
+- fix bugs where index objects were not destroyed before trying to delete
+    their files in the unit tests
+- fix bug in `sparse_vector::find()` where it would return a non-end
+    iterator when asked to find an element that does not exist
+
+# [v1.2][1.2]
+## New features
+- demo application for CRF-based POS tagging
+- `nearest_centroid` classifier
+- basic statistics library for representing relevant probability
+    distributions
+- `sparse_vector` utility class
+
+## Enhancements
+- `ngram_pos_analyzer` now uses the CRF internally (see issue #46)
+- `knn` classifier now supports weighted knn
+- `filesystem::copy_file()` no longer hangs without progress reporting with
+    large files
+- CMake build system now includes `INTERFACE` targets (better inclusion as
+    a subproject in external projects)
+- MeTA can now (optionally) be built with C++14 support
+
+## Bug fixes
+- `language_model_ranker` scoring function corrected (see issue #50)
+- `naive_bayes` classifier scoring corrected
+- several incorrect instances of `numeric_limits<double>::min()` replaced
+    with the intended `numeric_limits<double>::lowest()`
+- fix compilation with versions of ICU < 4.4
+
+# [v1.1][1.1]
+## Changes
+- sequence analyzer and CRF implementation
+- basic language model
+- basic directed and undirected graphs
+- restructure CMakeLists
+
+# [v1.0][1.0]
+- Initial release.
+
+[unreleased]: https://github.com/meta-toolkit/meta/compare/v1.3...develop
+[1.3]: https://github.com/meta-toolkit/meta/compare/v1.2...v1.3
+[1.2]: https://github.com/meta-toolkit/meta/compare/v1.1...v1.2
+[1.1]: https://github.com/meta-toolkit/meta/compare/v1.0...v1.1
+[1.0]: https://github.com/meta-toolkit/meta/compare/01aff7e0bddfaba997141d96ef7a371b3221e0ee...v1.0
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -9,7 +9,8 @@ include(CTest)
 include(CheckCXXCompilerFlag)
 include(CheckCXXSourceCompiles)
 include(CheckCXXSourceRuns)
-
+include(ExternalProject)
+include(FindZLIB)
 
 include_directories(include/)
 
@@ -19,8 +20,12 @@ find_package(Threads REQUIRED)
 find_package(ICU COMPONENTS data i18n uc REQUIRED)
 include_directories(SYSTEM ${ICU_INCLUDE_DIRS})
 
+if(ZLIB_FOUND)
+  include_directories(SYSTEM ${ZLIB_INCLUDE_DIRS})
+endif()
+
 if(UNIX)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic")
 
     # if we don't already set the standard for the compiler, detect the
     # best one available and use it
@@ -64,6 +69,9 @@ if(UNIX)
     endif()
 
     if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+        if(CMAKE_GENERATOR STREQUAL "Ninja")
+          set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcolor-diagnostics")
+        endif()
         if(USE_LIBCXX)
             message("-- Locating libc++...")
             find_library(LIBCXX_LIBRARY NAMES c++ cxx)
@@ -101,6 +109,12 @@ endif()
 add_library(meta-definitions INTERFACE)
 target_include_directories(meta-definitions INTERFACE ${PROJECT_SOURCE_DIR}/include)
 
+if(ZLIB_FOUND)
+  target_include_directories(meta-definitions INTERFACE
+                             ${ZLIB_INCLUDE_DIRS})
+  target_compile_definitions(meta-definitions INTERFACE -DMETA_HAS_ZLIB)
+endif()
+
 if(HAS_CXX14 OR HAS_CXX1Y)
     target_compile_definitions(meta-definitions INTERFACE -DMETA_HAS_CXX14=1)
 endif()

diff --git a/LICENSE.mit b/LICENSE.mit
@@ -1,4 +1,4 @@
-Copyright (c) 2014 Sean Massung, Chase Geigle
+Copyright (c) 2015 Sean Massung, Chase Geigle
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in

diff --git a/LICENSE.ncsa b/LICENSE.ncsa
@@ -1,4 +1,4 @@
-Copyright (c) 2014 Sean Massung, Chase Geigle
+Copyright (c) 2015 Sean Massung, Chase Geigle
 All rights reserved.
 
 Developed by:           MeTA Team
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,5 @@ features @@
     search
     tester
     .*
+    data/ceeaus
+    data/breast-cancer