cleans up code and logs
ClimbsRocks committed Oct 7, 2015
1 parent 26611c7 commit 92efe24
Showing 9 changed files with 20 additions and 31 deletions.
neuralNet/controllerNN.js (1 change: 0 additions & 1 deletion)
@@ -23,7 +23,6 @@ module.exports = {
},
startTraining: function() {
utils.setGlobalVars();
console.log('dataFile:',argv.dataFile);

readAndFormatData(function() {
// nn.dataSummary just got set by readAndFormatData, asynchronously;
neuralNet/dataFormatting/readAndFormatData.js (5 changes: 1 addition & 4 deletions)
@@ -24,7 +24,6 @@ module.exports = function( callback) {
// NOTE: your data must be formatted using UTF-8. If you're getting weird errors and you're not sure how to do that, check out this blog post:
// TODO: add in info on how to make sure your data is formatted using UTF-8
var readStream = fs.createReadStream(path.join(global.rootDir, argv.dataFile), {encoding: 'utf8'});
console.log('we have created the write and read streams to format our data')


var tStream1 = formattingUtils.summarizeDataTransformStream();
@@ -36,7 +35,6 @@ module.exports = function( callback) {
readStream.pipe(tStream1).pipe(writeStream1);

writeStream1.on('finish', function() {
console.log('heard a finish event to writeSream');
// to deal with asynch issues, we are attaching the dataSummary object to tStream1 itself.

// set the average property on each dataSummary key
@@ -56,7 +54,6 @@

writeStream2.on('finish', function() {

console.log('finished the second transform!');
for(var column in nn.dataSummary) {
var columnObj = nn.dataSummary[column];

@@ -86,10 +83,10 @@ module.exports = function( callback) {
readStream3.pipe(tStream3).pipe(writeStream3);

writeStream3.on('finish', function() {
console.log('finished the third transform!');
var trainingTime = (Date.now() - t2Start) / 1000;
console.log('third transformStream took:',trainingTime);

// delete the intermediate files we have created
fs.unlink(path.join(nn.location,'/formattingData.txt'));
fs.unlink(path.join(nn.location,'/formattingData2.txt'));
if(argv.copyData) {
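readAndFormatData.js pipes the raw data file through a chain of read, transform, and write streams, waits for each write stream's 'finish' event before starting the next pass, and attaches the running dataSummary to the transform stream itself to sidestep the async ordering the comments mention. Here is a minimal sketch of that pattern; the stream and property names are illustrative, not the module's actual identifiers.

var fs = require('fs');
var path = require('path');
var Transform = require('stream').Transform;

// hypothetical transform that accumulates a per-column summary as rows pass through
var summarize = new Transform({ decodeStrings: false });
summarize.dataSummary = {};
summarize._transform = function(chunk, encoding, done) {
  var row = chunk.toString();
  // ...update this.dataSummary (counts, mins, maxes, etc.) from row here...
  this.push(row);
  done();
};

var readStream = fs.createReadStream(path.join(__dirname, 'data.csv'), { encoding: 'utf8' });
var writeStream = fs.createWriteStream(path.join(__dirname, 'formattingData.txt'));

readStream.pipe(summarize).pipe(writeStream);

writeStream.on('finish', function() {
  // the summary is only complete once the write stream finishes,
  // so this is the safe place to kick off the next formatting pass
  console.log(summarize.dataSummary);
});

Once the final pass finishes, the real module deletes its intermediate files (formattingData.txt, formattingData2.txt) with fs.unlink, as the hunk above shows.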
neuralNet/utils/makeExtendedTrainingObj.js (1 change: 0 additions & 1 deletion)
@@ -4,7 +4,6 @@ var argv = global.argv;

module.exports = function ( hlArray) {

console.log('nn.bestNetObj:',nn.bestNetObj);
// NOTE: these are the max training time parameters we can set. we will use other processes to decide when to kill off the net.
var trainingObj = {
errorThresh: 0.0, // error threshold to reach
ppLib.js (10 changes: 4 additions & 6 deletions)
@@ -1,9 +1,9 @@
// global.neuralNetwork = {};
global.neuralNetwork = {};
global.argv = require('minimist')(process.argv.slice(1));
var path = require('path');
global.rootDir = path.dirname(__filename);

// var controllerNN = require('./neuralNet/controllerNN.js');
var controllerNN = require('./neuralNet/controllerNN.js');
var controllerPython = require('./pySetup/controllerPython.js');
var controllerEnsemble = require('./ensembling/controller.js');
var dataFile = process.argv[2];
@@ -37,17 +37,15 @@ if (argv.devEnsemble) {
// Here is where we invoke the method with the path to the data
// we pass in a callback function that will make the dataSummary a global variable
// and invoke parallelNets once formatting the data is done.
// argv.numCPUs = argv.computerTotalCPUs/2;
// controllerNN.startTraining();
controllerNN.startTraining();
// **********************************************************************************
// argv.numCPUs = argv.computerTotalCPUs/2;
controllerPython.startTraining(argv);

controllerEnsemble.startListeners(2, argv);
}

var ppLibShutdown = function() {
// controllerNN.killAll();
controllerNN.killAll();
controllerPython.killAll();
};
// kills off all the child processes if the parent process faces an uncaught exception and crashes.
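The ppLibShutdown function above exists so the neural-net and Python child processes die with the parent; the comment in the hunk says it should run if the parent hits an uncaught exception. The diff is cut off before the event wiring, so the process hooks below are an assumption about how such a routine is typically registered, not a quote from the file.

var controllerNN = require('./neuralNet/controllerNN.js');
var controllerPython = require('./pySetup/controllerPython.js');

var ppLibShutdown = function() {
  controllerNN.killAll();       // stop the neural-net child processes
  controllerPython.killAll();   // stop the Python child processes
};

// assumed wiring: clean up the children on a crash or a Ctrl-C, then exit
process.on('uncaughtException', function(err) {
  console.error('uncaught exception, shutting down child processes:', err);
  ppLibShutdown();
  process.exit(1);
});

process.on('SIGINT', function() {
  ppLibShutdown();
  process.exit(0);
});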
pySetup/controllerPython.js (2 changes: 1 addition & 1 deletion)
@@ -29,7 +29,7 @@ module.exports = {
processes.kickOffForestTraining( function() {
// TODO: add in next step in chain here
module.exports.makePredictions();
});
}, 'clRandomForest');
});

// }
pySetup/makePredictions.py (4 changes: 0 additions & 4 deletions)
@@ -12,7 +12,6 @@

fileNames = json.loads(sys.argv[4])
classifierName = sys.argv[5]
obviousPrint('classifierName',classifierName)

y_file_name = fileNames['y_predict']
X_file_name = fileNames['X_predict']
@@ -71,14 +70,11 @@
rowID = int(float(inputRow[idIndex]))
try:
len(prediction)
# printParent('we are in the try block')
csvwriter.writerow([rowID,prediction[1]])
except:
csvwriter.writerow([rowID,prediction])
# printParent('we are in the exception block')


# write those predictions to a single, standalone, centralized file that ONLY holds the ID for that row, and then the predictions for each model.
# Nope. Each classifier writes its own predictions to its own file.
# we will keep an array in ppLib.js that has references to all the file names
# the files will all be in a predictions folder, that will hold nothing but these files holding the predictions from a single classifier
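The comments at the end of makePredictions.py describe the hand-off: each classifier writes its rowID/prediction pairs to its own file in a predictions folder, and ppLib.js keeps an array of those file names for the ensembler. A small sketch of that bookkeeping on the Node side follows; the variable name, folder layout, and helper are hypothetical, since the diff only states the idea.

var path = require('path');

global.rootDir = global.rootDir || __dirname;
// hypothetical registry of per-classifier prediction files for the ensembler to read
global.predictionFileNames = global.predictionFileNames || [];

function registerPredictionFile(classifierName) {
  var fileName = path.join(global.rootDir, 'predictions', classifierName + 'Predictions.csv');
  global.predictionFileNames.push(fileName);
  return fileName;
}

// e.g. registerPredictionFile('clRandomForest') once that classifier's predictions are written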
pySetup/parameterMakers/rfParamMaker.py (4 changes: 2 additions & 2 deletions)
@@ -9,8 +9,8 @@ def makeParams(X, y, globalArgs):


parameters_to_try = {
# 'max_features': max_features_to_try,
# 'min_samples_leaf':[1,2,5,25,50,100,150],
'max_features': max_features_to_try,
'min_samples_leaf':[1,2,5,25,50,100,150],
'criterion': ['gini','entropy']
}

pySetup/processes.js (11 changes: 8 additions & 3 deletions)
@@ -42,12 +42,17 @@ module.exports = {
module.exports.formatData( callback, 'train');
},

kickOffForestTraining: function( callback) {
kickOffForestTraining: function( callback, classifierName) {
// console.log('fileNames:',module.exports.fileNames);
var pythonOptions = utils.generatePythonOptions(argv.dataFile, [JSON.stringify(argv), JSON.stringify(module.exports.fileNames), 'clRandomForest']);
var pythonOptions = utils.generatePythonOptions(argv.dataFile, [JSON.stringify(argv), JSON.stringify(module.exports.fileNames), classifierName]);


utils.startPythonShell('training.py', callback, pythonOptions);
var pyShell = utils.startPythonShell('training.py', callback, pythonOptions);
pyShell.on('message', function(message) {
if(message.type === 'trainingResults') {
global.trainedAlgos[classifierName] = message.text;
}
});
},

makePredictions: function( callback, rfPickle) {
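This hunk is the commit's one functional change: kickOffForestTraining now takes a classifierName, forwards it in the Python options instead of the hard-coded 'clRandomForest' (the call site in controllerPython.js, earlier in this diff, supplies it), and listens on the Python shell for a trainingResults message, stashing the result in global.trainedAlgos. The sketch below assumes utils.startPythonShell wraps the python-shell npm package in JSON mode; the {type, text} message shape is taken from the diff, everything else is illustrative.

var PythonShell = require('python-shell');

global.trainedAlgos = global.trainedAlgos || {};

function kickOffTraining(classifierName, args, callback) {
  // JSON mode parses each line the Python script prints into an object
  var pyShell = new PythonShell('training.py', {
    mode: 'json',
    scriptPath: 'pySetup',
    args: args
  });

  pyShell.on('message', function(message) {
    if (message.type === 'trainingResults') {
      // remember which trained result belongs to which classifier
      global.trainedAlgos[classifierName] = message.text;
    }
  });

  // end() fires once the script exits, i.e. once training is done
  pyShell.end(function(err) {
    if (err) { throw err; }
    callback();
  });
}

Under this assumption, the Python side would signal completion by printing a single JSON line such as {"type": "trainingResults", "text": ...} to stdout.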
pySetup/training.py (13 changes: 4 additions & 9 deletions)
@@ -63,29 +63,25 @@
classifier = classifierCreater[classifierName]

# create features that are custom to the size of the input data.
# this will definitely have to be done individually.
# I don't see any harm in making each of these into their own file, because aside from the dev check, everything here will be custom to each classifier.
# Each individual paramaterMaker file sits in the paramaterMakers folder. If you want to modify what the parameters are, or submit a PR with a better combination of parameters to try, that is the place to start.
allParams = paramMakers.makeAll(X,y,globalArgs)
parameters_to_try = allParams[classifierName]


# here is where we start to do very similar things all over again. everything from here forwards can be generalized.
printParent('we are about to run a grid search over the following space:')
printParent(parameters_to_try)

gridSearch = GridSearchCV(classifier, parameters_to_try, cv=10, n_jobs=globalArgs['numCPUs'])

gridSearch.fit(X_train, y_train)

printParent('we have used grid search to explore the entire parameter space and find the best possible version of a random forest for your particular data set!')

printParent('\n')
printParent('*********************************************************************************************************')
printParent("this estimator's best prediction is:")
printParent(gridSearch.best_score_)
printParent('*********************************************************************************************************')
printParent("this estimator's best parameters are:")
printParent(gridSearch.best_params_)
printParent('now that we have figured this out, we are going to train a random forest with considerably more trees. more trees means a better fit, but they also take significantly longer to train, so we kept the number of trees relatively low while searching through the parameter space to make sure you were not stuck here until python6 comes out.')
printParent('\n')

if extendedTraining:
# create a dict with mappings from algo name ('clRandomForest') to a function that will return a newly instantiated version of that algo (with the proper n_estimators and other custom parameters for that classifier)
@@ -95,7 +91,6 @@

# note: we are testing grid search on 50% of the data (X_train and y_train), but fitting bigClassifier on the entire dataset (X,y)
bigClassifier.fit(X, y)
printParent('we have trained an even more powerful random forest!')

bigClassifierscore = bigClassifier.score(X, y)
printParent('the bigger randomForest has a score of')
@@ -104,4 +99,4 @@
joblib.dump(bigClassifier, 'pySetup/bestClassifiers/best' + classifierName + '/best' + classifierName + '.pkl')
else:
joblib.dump(gridSearch.best_estimator_, 'pySetup/bestClassifiers/best' + classifierName + '/best' + classifierName + '.pkl')
printParent('wrote the best estimator to a file')
printParent('we have written the best estimator to a file')
