diff --git a/.travis.yml b/.travis.yml index 81e6b704..281ffcaa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,16 +1,16 @@ language: go +dist: bionic go: - - "1.9" - - "1.10" - - "1.11" - - "1.12" + - "1.13" + - "1.14" +arch: + - amd64 + - arm64 env: - # Temporary workaround for go 1.6 - - GODEBUG=cgocheck=0 before_install: - sudo apt-get update -qq - sudo apt-get install -qq libatlas-base-dev - - cd /tmp && wget http://www.csie.ntu.edu.tw/~cjlin/liblinear/oldfiles/liblinear-1.94.tar.gz && tar xf liblinear-1.94.tar.gz && cd liblinear-1.94 && make lib && sudo install -vm644 linear.h /usr/include && sudo install -vm755 liblinear.so.1 /usr/lib && sudo ln -sfv liblinear.so.1 /usr/lib/liblinear.so + - cd /tmp && wget https://www.csie.ntu.edu.tw/~cjlin/liblinear/oldfiles/liblinear-1.94.tar.gz && tar xf liblinear-1.94.tar.gz && cd liblinear-1.94 && make lib && sudo install -vm644 linear.h /usr/include && sudo install -vm755 liblinear.so.1 /usr/lib && sudo ln -sfv liblinear.so.1 /usr/lib/liblinear.so - cd $TRAVIS_BUILD_DIR install: - go get github.com/smartystreets/goconvey/convey diff --git a/ensemble/multisvc_test.go b/ensemble/multisvc_test.go index 66f942e9..2871f0cb 100644 --- a/ensemble/multisvc_test.go +++ b/ensemble/multisvc_test.go @@ -1,11 +1,12 @@ package ensemble import ( + "io/ioutil" + "testing" + "github.com/sjwhitworth/golearn/base" "github.com/sjwhitworth/golearn/evaluation" . 
"github.com/smartystreets/goconvey/convey" - "io/ioutil" - "testing" ) func TestMultiSVMUnweighted(t *testing.T) { @@ -14,40 +15,43 @@ func TestMultiSVMUnweighted(t *testing.T) { So(err, ShouldBeNil) X, Y := base.InstancesTrainTestSplit(inst, 0.4) - m := NewMultiLinearSVC("l1", "l2", true, 1.0, 1e-4, nil) - m.Fit(X) - - Convey("Predictions should work...", func() { - predictions, err := m.Predict(Y) - So(err, ShouldEqual, nil) - cf, err := evaluation.GetConfusionMatrix(Y, predictions) - So(err, ShouldEqual, nil) - So(evaluation.GetAccuracy(cf), ShouldBeGreaterThan, 0.70) - }) - - Convey("Saving should work...", func() { - f, err := ioutil.TempFile("", "tree") - So(err, ShouldBeNil) - err = m.Save(f.Name()) + Convey("Fitting should work...", func() { + m := NewMultiLinearSVC("l1", "l2", true, 1.0, 1e-4, nil) + err := m.Fit(X) So(err, ShouldBeNil) - Convey("Loading should work...", func() { - mLoaded := NewMultiLinearSVC("l1", "l2", true, 1.00, 1e-8, nil) - err := mLoaded.Load(f.Name()) + Convey("Predictions should work...", func() { + predictions, err := m.Predict(Y) + So(err, ShouldEqual, nil) + cf, err := evaluation.GetConfusionMatrix(Y, predictions) + So(err, ShouldEqual, nil) + So(evaluation.GetAccuracy(cf), ShouldBeGreaterThan, 0.70) + }) + + Convey("Saving should work...", func() { + f, err := ioutil.TempFile("", "tree") + So(err, ShouldBeNil) + err = m.Save(f.Name()) So(err, ShouldBeNil) - Convey("Predictions should be the same...", func() { - originalPredictions, err := m.Predict(Y) - So(err, ShouldBeNil) - newPredictions, err := mLoaded.Predict(Y) + Convey("Loading should work...", func() { + mLoaded := NewMultiLinearSVC("l1", "l2", true, 1.00, 1e-8, nil) + err := mLoaded.Load(f.Name()) So(err, ShouldBeNil) - So(base.InstancesAreEqual(originalPredictions, newPredictions), ShouldBeTrue) + + Convey("Predictions should be the same...", func() { + originalPredictions, err := m.Predict(Y) + So(err, ShouldBeNil) + newPredictions, err := mLoaded.Predict(Y) + 
So(err, ShouldBeNil) + So(base.InstancesAreEqual(originalPredictions, newPredictions), ShouldBeTrue) + }) + }) }) }) - }) } diff --git a/linear_models/integration.cpp b/linear_models/integration.cpp new file mode 100644 index 00000000..1c89d824 --- /dev/null +++ b/linear_models/integration.cpp @@ -0,0 +1,93 @@ +/* + * This file contains functions related to creating + freeing + * objects on behalf of the go runtime + */ + +#include "linear.h" +#include <stdlib.h> + +extern "C" { + +/* NOTE: the Golang versions of the structures must call the corresponding + * Free functions via runtime.SetFinalize */ +/* CreateCProblem allocates a new struct problem outside of Golang's + * garbage collection. */ +struct problem *CreateCProblem() { + auto ret = new problem(); + *ret = {}; // < Clear all fields + return ret; +} + +/* CreateCModel allocates a new struct model outside of Golang's + * garbage collection. */ +struct model *CreateCModel() { + auto ret = new model(); + *ret = {}; // < Clear all fields + return ret; +} + +/* CreateCParameter allocates a new struct parameter outside of + * Golang's garbage collection.*/ +struct parameter *CreateCParameter() { + return reinterpret_cast<struct parameter *>(calloc(1, sizeof(struct parameter))); +} + +/* Free's a previously allocated problem and all its data */ +void FreeCProblem(struct problem *p) { + if (p->y != nullptr) { + free(p->y); + p->y = nullptr; + } + if (p->x != nullptr) { + free(p->x); + p->x = nullptr; + } + delete p; +} + +/* free's a model with libsvm's internal routines */ +void FreeCModel(struct model *m) { + free_model_content(m); + delete m; +} + +/* free's a parameter via libsvm */ +void FreeCParameter(struct parameter *p) { + if (p == nullptr) { + return; + } + free(p); +} + +/* Allocates a vector of doubles for storing target values + * outside of Go's garbage collection */ +int AllocateLabelsForProblem (struct problem *p, int numValues) { + p->y = reinterpret_cast<double *>(malloc(sizeof(double) * numValues)); + return p->y == nullptr; +} + +/* 
Utility method used to set the target value for a particular + * input row */ +void AssignLabelForProblem(struct problem *p, int i, double d) { + p->y[i] = d; +} + +/* Allocates a buffer of input rows and inserts the per-row values */ +int RiffleFeatures(struct problem *p, int num_offsets, int* row_offsets, struct feature_node *features) { + + // Allocate space for the feature node buffer. + p->x = reinterpret_cast<struct feature_node **>( + calloc(num_offsets, sizeof(struct feature_node *)) + ); + if (p->x == nullptr) { + return -1; + } + + for (int i = 0; i < num_offsets; i++) { + int offset = row_offsets[i]; + p->x[i] = features + offset; + } + return 0; +} + +} /* extern "C" */ diff --git a/linear_models/integration.h b/linear_models/integration.h new file mode 100644 index 00000000..16d748df --- /dev/null +++ b/linear_models/integration.h @@ -0,0 +1,19 @@ +#ifndef _H_INTEGRATION_ +#define _H_INTEGRATION_ + +#include "linear.h" + +struct problem *CreateCProblem(); +void FreeCProblem(struct problem*); +struct model *CreateCModel(); +void FreeCModel(struct model*); +struct parameter *CreateCParameter(); +void FreeCParameter(struct parameter*); +// Allocates memory outside of golang for describing feature +// vectors. 
+int RiffleFeatures(struct problem *p, int num_offsets, int* row_offsets, struct feature_node *features); +int AllocateLabelsForProblem(struct problem *, int); +void AssignLabelForProblem(struct problem *, int, double); +struct feature_node *GetFeatureNodeForIndex(struct problem *, int, int); + +#endif diff --git a/linear_models/liblinear.go b/linear_models/liblinear.go index 47e7456c..beb7e2af 100644 --- a/linear_models/liblinear.go +++ b/linear_models/liblinear.go @@ -1,22 +1,47 @@ package linear_models /* -#include "linear.h" +#include "integration.h" +#cgo CFLAGS: -O3 +#cgo CXXFLAGS: -std=c++11 -O3 */ import "C" -import "fmt" -import "unsafe" +import ( + "fmt" + "runtime" +) +// Problem wraps a libsvm problem struct which describes a classification/ +// regression problem. No externally-accessible fields. type Problem struct { - c_prob C.struct_problem + c_prob *C.struct_problem + featureNodes []C.struct_feature_node +} + +// Free releases resources associated with a libsvm problem. +func (p *Problem) Free() { + C.FreeCProblem(p.c_prob) } +// Parameter encasulates all the possible libsvm training options. +// TODO: make user control of these more extensive. type Parameter struct { - c_param C.struct_parameter + c_param *C.struct_parameter +} + +// Free releases resources associated with a Parameter. +func (p *Parameter) Free() { + C.FreeCParameter(p.c_param) } +// Model encapsulates a trained libsvm model. type Model struct { - c_model unsafe.Pointer + c_model *C.struct_model +} + +// Free releases resources associated with a trained libsvm model. +func (m *Model) Free() { + C.FreeCModel(m.c_model) } const ( @@ -30,8 +55,14 @@ const ( L2R_LR_DUAL = C.L2R_LR_DUAL ) +// NewParameter creates a libsvm parameter structure, which controls +// various aspects of libsvm training. 
+// For more information on what these parameters do, consult the +// "`train` usage" section of +// https://github.com/cjlin1/liblinear/blob/master/README func NewParameter(solver_type int, C float64, eps float64) *Parameter { - param := Parameter{} + param := &Parameter{C.CreateCParameter()} + runtime.SetFinalizer(param, (*Parameter).Free) param.c_param.solver_type = C.int(solver_type) param.c_param.eps = C.double(eps) param.c_param.C = C.double(C) @@ -39,30 +70,37 @@ func NewParameter(solver_type int, C float64, eps float64) *Parameter { param.c_param.weight_label = nil param.c_param.weight = nil - return &param + return param } +// NewProblem creates input to libsvm which describes a particular +// regression/classification problem. It requires an array of float values +// and an array of y values. func NewProblem(X [][]float64, y []float64, bias float64) *Problem { - prob := Problem{} + prob := &Problem{C.CreateCProblem(), nil} + runtime.SetFinalizer(prob, (*Problem).Free) prob.c_prob.l = C.int(len(X)) prob.c_prob.n = C.int(len(X[0]) + 1) - prob.c_prob.x = convert_features(X, bias) - c_y := make([]C.double, len(y)) + convert_features(prob, X, bias) + C.AllocateLabelsForProblem(prob.c_prob, C.int(len(y))) for i := 0; i < len(y); i++ { - c_y[i] = C.double(y[i]) + C.AssignLabelForProblem(prob.c_prob, C.int(i), C.double(y[i])) } - prob.c_prob.y = &c_y[0] + // Should not go out of scope until the Problem struct + // is cleaned up. prob.c_prob.bias = C.double(-1) - return &prob + return prob } +// Train invokes libsvm and returns a trained model. 
func Train(prob *Problem, param *Parameter) *Model { libLinearHookPrintFunc() // Sets up logging - tmpCProb := &prob.c_prob - tmpCParam := &param.c_param - return &Model{unsafe.Pointer(C.train(tmpCProb, tmpCParam))} + out := C.train(prob.c_prob, param.c_param) + m := &Model{out} + runtime.SetFinalizer(m, (*Model).Free) + return m } func Export(model *Model, filePath string) error { @@ -74,19 +112,25 @@ func Export(model *Model, filePath string) error { } func Load(model *Model, filePath string) error { - model.c_model = unsafe.Pointer(C.load_model(C.CString(filePath))) + model.c_model = C.load_model(C.CString(filePath)) if model.c_model == nil { return fmt.Errorf("Something went wrong") } return nil + } +// Predict takes a row of float values corresponding to a particular +// input and returns the regression result. func Predict(model *Model, x []float64) float64 { c_x := convert_vector(x, 0) c_y := C.predict((*C.struct_model)(model.c_model), c_x) y := float64(c_y) return y } + +// convert_vector is an internal function used for converting +// dense float64 vectors into the sparse input that libsvm accepts. func convert_vector(x []float64, bias float64) *C.struct_feature_node { n_ele := 0 for i := 0; i < len(x); i++ { @@ -113,43 +157,48 @@ func convert_vector(x []float64, bias float64) *C.struct_feature_node { c_x[j].index = C.int(-1) return &c_x[0] } -func convert_features(X [][]float64, bias float64) **C.struct_feature_node { - n_samples := len(X) - n_elements := 0 - for i := 0; i < n_samples; i++ { +// convert_features is an internal function used for converting +// dense 2D arrays of float values into the sparse format libsvm accepts. +func convert_features(prob *Problem, X [][]float64, bias float64) { + rowCount := len(X) + + // This structure remembers the start and end elements for each row. + // We push them back into a global list of C.struct_feature_nodes, then + // riffle it in C using their indices to form the **C.struct_feature_nodes + // input. 
Go retains ownership of struct_feature_nodes, C has ownership of + // the enclosing **C.struct_feature_nodes feature. + rowOffsets := make([]C.int, 0) + featureNodes := make([]C.struct_feature_node, 0) + + // First pass, just counting through each row and counting the number of elements we find. + for i := 0; i < rowCount; i++ { + rowOffsets = append(rowOffsets, C.int(len(featureNodes))) // Push back the starting element of this row + if bias != 0.0 { // Allocate space for a bias node + featureNodes = append(featureNodes, C.struct_feature_node{ + C.int(0), C.double(bias), + }) + } for j := 0; j < len(X[i]); j++ { if X[i][j] != 0.0 { - n_elements++ + // For every non-zero thing in the data grid, allocate a feature node. + featureNodes = append(featureNodes, C.struct_feature_node{ + C.int(j + 1), C.double(X[i][j]), + }) } - n_elements++ //for bias } + // Finally, add a terminating element which tells libsvm that there's nothing + // left on this row + featureNodes = append(featureNodes, C.struct_feature_node{ + C.int(-1), C.double(0), + }) } - x_space := make([]C.struct_feature_node, n_elements+n_samples) - - cursor := 0 - x := make([]*C.struct_feature_node, n_samples) - var c_x **C.struct_feature_node - - for i := 0; i < n_samples; i++ { - x[i] = &x_space[cursor] - - for j := 0; j < len(X[i]); j++ { - if X[i][j] != 0.0 { - x_space[cursor].index = C.int(j + 1) - x_space[cursor].value = C.double(X[i][j]) - cursor++ - } - if bias > 0 { - x_space[cursor].index = C.int(0) - x_space[cursor].value = C.double(bias) - cursor++ - } - } - x_space[cursor].index = C.int(-1) - cursor++ + // Transform [feature_node, feature_node, feature_node, ...] list into + // [*feature_node(1), *feature_node(m), ...] through the C integration bridge. + // C owns that particular memory. 
+ // int RiffleFeatures(struct problem *p, int num_offsets, int* row_offsets, struct feature_node *features) { + if C.RiffleFeatures(prob.c_prob, C.int(len(featureNodes)), &rowOffsets[0], &featureNodes[0]) != 0 { + panic("RiffledFeatures could not allocate memory") } - c_x = &x[0] - return c_x } diff --git a/linear_models/linearsvc.go b/linear_models/linearsvc.go index 6166198c..255b3ef1 100644 --- a/linear_models/linearsvc.go +++ b/linear_models/linearsvc.go @@ -5,10 +5,10 @@ import "C" import ( "encoding/json" "fmt" - "github.com/sjwhitworth/golearn/base" "io/ioutil" "os" - "unsafe" + + "github.com/sjwhitworth/golearn/base" ) // LinearSVCParams represnts all available LinearSVC options. @@ -153,6 +153,7 @@ func (lr *LinearSVC) Fit(X base.FixedDataGrid) error { var weightClasses []C.int // Creates the class weighting + fmt.Println("Generating class weights...") if lr.Param.ClassWeights == nil { if lr.Param.WeightClassesAutomatically { weightVec = generateClassWeightVectorFromDist(X) @@ -169,17 +170,20 @@ func (lr *LinearSVC) Fit(X base.FixedDataGrid) error { } // Convert the problem + fmt.Println("Converting instances...") problemVec := convertInstancesToProblemVec(X) labelVec := convertInstancesToLabelVec(X) // Train + fmt.Println("Training...") prob := NewProblem(problemVec, labelVec, 0) lr.param.c_param.nr_weight = C.int(len(weightVec)) lr.param.c_param.weight_label = &(weightClasses[0]) - lr.param.c_param.weight = (*C.double)(unsafe.Pointer(&weightVec[0])) + lr.param.c_param.weight = (*C.double)(&weightVec[0]) // lr.param.weights = (*C.double)unsafe.Pointer(&(weightVec[0])); lr.model = Train(prob, lr.param) + fmt.Println("Training completed") return nil } diff --git a/linear_models/tmp b/linear_models/tmp deleted file mode 100644 index 4560a48d..00000000 Binary files a/linear_models/tmp and /dev/null differ diff --git a/meta/one_v_all.go b/meta/one_v_all.go index e6a7e6fd..7f7e5bec 100644 --- a/meta/one_v_all.go +++ b/meta/one_v_all.go @@ -2,6 +2,7 @@ package 
meta import ( "fmt" + "github.com/sjwhitworth/golearn/base" ) @@ -46,7 +47,6 @@ func (m *OneVsAllModel) Fit(using base.FixedDataGrid) { } } attrs := m.generateAttributes(using) - // Find the highest stored value val := uint64(0) classVals := classAttr.GetValues() @@ -60,6 +60,7 @@ func (m *OneVsAllModel) Fit(using base.FixedDataGrid) { panic("Must have more than one class!") } m.maxClassVal = val + fmt.Println("Found maximum rows") // If we're reloading, we may just be fitting to the structure _, srcRows := using.Size() @@ -152,9 +153,6 @@ func (m *OneVsAllModel) LoadWithPrefix(reader *base.ClassifierDeserializer, pref return base.DescribeError("Can't load INSTANCE_STRUCTURE", err) } m.Fit(fitOn) - /*if err != nil { - base.DescribeError("Could not fit reloaded classifier to the structure", err) - }*/ // Reload the filters numFiltersU64, err := reader.GetU64ForKey(reader.Prefix(prefix, "FILTER_COUNT")) @@ -229,7 +227,7 @@ func (m *OneVsAllModel) LoadWithPrefix(reader *base.ClassifierDeserializer, pref for i, c := range classVals { cls := m.NewClassifierFunction(c) clsPrefix := pI("CLASSIFIERS", i) - + fmt.Println("Loading classifier...") err = cls.LoadWithPrefix(reader, clsPrefix) if err != nil { return base.FormatError(err, "Could not reload classifier at: %s", clsPrefix) @@ -264,7 +262,7 @@ func (m *OneVsAllModel) SaveWithPrefix(writer *base.ClassifierSerializer, prefix } // Save the instances - err := writer.WriteInstancesForKey(writer.Prefix(prefix, "INSTANCE_STRUCTURE"), m.fitOn, false) + err := writer.WriteInstancesForKey(writer.Prefix(prefix, "INSTANCE_STRUCTURE"), base.NewStructuralCopy(m.fitOn), false) if err != nil { return base.DescribeError("Unable to write INSTANCE_STRUCTURE", err) }