cleans up code and logs
ClimbsRocks committed Oct 7, 2015
1 parent 26611c7 commit 92efe24
Showing 9 changed files with 20 additions and 31 deletions.
neuralNet/controllerNN.js (1 change: 0 additions & 1 deletion)
@@ -23,7 +23,6 @@ module.exports = {
},
startTraining: function() {
utils.setGlobalVars();
console.log('dataFile:',argv.dataFile);

readAndFormatData(function() {
// nn.dataSummary just got set by readAndFormatData, asynchronously;
neuralNet/dataFormatting/readAndFormatData.js (5 changes: 1 addition & 4 deletions)
@@ -24,7 +24,6 @@ module.exports = function( callback) {
// NOTE: your data must be formatted using UTF-8. If you're getting weird errors and you're not sure how to do that, check out this blog post:
// TODO: add in info on how to make sure your data is formatted using UTF-8
var readStream = fs.createReadStream(path.join(global.rootDir, argv.dataFile), {encoding: 'utf8'});
console.log('we have created the write and read streams to format our data')


var tStream1 = formattingUtils.summarizeDataTransformStream();
@@ -36,7 +35,6 @@ module.exports = function( callback) {
readStream.pipe(tStream1).pipe(writeStream1);

writeStream1.on('finish', function() {
console.log('heard a finish event to writeSream');
// to deal with asynch issues, we are attaching the dataSummary object to tStream1 itself.

// set the average property on each dataSummary key
@@ -56,7 +54,6 @@

writeStream2.on('finish', function() {

console.log('finished the second transform!');
for(var column in nn.dataSummary) {
var columnObj = nn.dataSummary[column];

@@ -86,10 +83,10 @@ module.exports = function( callback) {
readStream3.pipe(tStream3).pipe(writeStream3);

writeStream3.on('finish', function() {
console.log('finished the third transform!');
var trainingTime = (Date.now() - t2Start) / 1000;
console.log('third transformStream took:',trainingTime);

// delete the intermediate files we have created
fs.unlink(path.join(nn.location,'/formattingData.txt'));
fs.unlink(path.join(nn.location,'/formattingData2.txt'));
if(argv.copyData) {
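readAndFormatData.js pipes the raw data file through a chain of read, transform, and write streams, waits for each write stream's 'finish' event before starting the next pass, and attaches the running dataSummary to the transform stream itself to sidestep the async ordering the comments mention. Here is a minimal sketch of that pattern; the stream and property names are illustrative, not the module's actual identifiers.

var fs = require('fs');
var path = require('path');
var Transform = require('stream').Transform;

// hypothetical transform that accumulates a per-column summary as rows pass through
var summarize = new Transform({ decodeStrings: false });
summarize.dataSummary = {};
summarize._transform = function(chunk, encoding, done) {
  var row = chunk.toString();
  // ...update this.dataSummary (counts, mins, maxes, etc.) from row here...
  this.push(row);
  done();
};

var readStream = fs.createReadStream(path.join(__dirname, 'data.csv'), { encoding: 'utf8' });
var writeStream = fs.createWriteStream(path.join(__dirname, 'formattingData.txt'));

readStream.pipe(summarize).pipe(writeStream);

writeStream.on('finish', function() {
  // the summary is only complete once the write stream finishes,
  // so this is the safe place to kick off the next formatting pass
  console.log(summarize.dataSummary);
});

Once the final pass finishes, the real module deletes its intermediate files (formattingData.txt, formattingData2.txt) with fs.unlink, as the hunk above shows.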
neuralNet/utils/makeExtendedTrainingObj.js (1 change: 0 additions & 1 deletion)
@@ -4,7 +4,6 @@ var argv = global.argv;

module.exports = function ( hlArray) {

console.log('nn.bestNetObj:',nn.bestNetObj);
// NOTE: these are the max training time parameters we can set. we will use other processes to decide when to kill off the net.
var trainingObj = {
errorThresh: 0.0, // error threshold to reach
ppLib.js (10 changes: 4 additions & 6 deletions)
@@ -1,9 +1,9 @@
// global.neuralNetwork = {};
global.neuralNetwork = {};
global.argv = require('minimist')(process.argv.slice(1));
var path = require('path');
global.rootDir = path.dirname(__filename);

// var controllerNN = require('./neuralNet/controllerNN.js');
var controllerNN = require('./neuralNet/controllerNN.js');
var controllerPython = require('./pySetup/controllerPython.js');
var controllerEnsemble = require('./ensembling/controller.js');
var dataFile = process.argv[2];
@@ -37,17 +37,15 @@ if (argv.devEnsemble) {
// Here is where we invoke the method with the path to the data
// we pass in a callback function that will make the dataSummary a global variable
// and invoke parallelNets once formatting the data is done.
// argv.numCPUs = argv.computerTotalCPUs/2;
// controllerNN.startTraining();
controllerNN.startTraining();
// **********************************************************************************
// argv.numCPUs = argv.computerTotalCPUs/2;
controllerPython.startTraining(argv);

controllerEnsemble.startListeners(2, argv);
}

var ppLibShutdown = function() {
// controllerNN.killAll();
controllerNN.killAll();
controllerPython.killAll();
};
// kills off all the child processes if the parent process faces an uncaught exception and crashes.
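The ppLibShutdown function above exists so the neural-net and Python child processes die with the parent; the comment in the hunk says it should run if the parent hits an uncaught exception. The diff is cut off before the event wiring, so the process hooks below are an assumption about how such a routine is typically registered, not a quote from the file.

var controllerNN = require('./neuralNet/controllerNN.js');
var controllerPython = require('./pySetup/controllerPython.js');

var ppLibShutdown = function() {
  controllerNN.killAll();       // stop the neural-net child processes
  controllerPython.killAll();   // stop the Python child processes
};

// assumed wiring: clean up the children on a crash or a Ctrl-C, then exit
process.on('uncaughtException', function(err) {
  console.error('uncaught exception, shutting down child processes:', err);
  ppLibShutdown();
  process.exit(1);
});

process.on('SIGINT', function() {
  ppLibShutdown();
  process.exit(0);
});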
pySetup/controllerPython.js (2 changes: 1 addition & 1 deletion)
@@ -29,7 +29,7 @@ module.exports = {
processes.kickOffForestTraining( function() {
// TODO: add in next step in chain here
module.exports.makePredictions();
});
}, 'clRandomForest');
});

// }
pySetup/makePredictions.py (4 changes: 0 additions & 4 deletions)
@@ -12,7 +12,6 @@

fileNames = json.loads(sys.argv[4])
classifierName = sys.argv[5]
obviousPrint('classifierName',classifierName)

y_file_name = fileNames['y_predict']
X_file_name = fileNames['X_predict']
@@ -71,14 +70,11 @@
rowID = int(float(inputRow[idIndex]))
try:
len(prediction)
# printParent('we are in the try block')
csvwriter.writerow([rowID,prediction[1]])
except:
csvwriter.writerow([rowID,prediction])
# printParent('we are in the exception block')


# write those predictions to a single, standalone, centralized file that ONLY holds the ID for that row, and then the predictions for each model.
# Nope. Each classifier writes its own predictions to its own file.
# we will keep an array in ppLib.js that has references to all the file names
# the files will all be in a predictions folder, that will hold nothing but these files holding the predictions from a single classifier
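The comments at the end of makePredictions.py describe the hand-off: each classifier writes its rowID/prediction pairs to its own file in a predictions folder, and ppLib.js keeps an array of those file names for the ensembler. A small sketch of that bookkeeping on the Node side follows; the variable name, folder layout, and helper are hypothetical, since the diff only states the idea.

var path = require('path');

global.rootDir = global.rootDir || __dirname;
// hypothetical registry of per-classifier prediction files for the ensembler to read
global.predictionFileNames = global.predictionFileNames || [];

function registerPredictionFile(classifierName) {
  var fileName = path.join(global.rootDir, 'predictions', classifierName + 'Predictions.csv');
  global.predictionFileNames.push(fileName);
  return fileName;
}

// e.g. registerPredictionFile('clRandomForest') once that classifier's predictions are written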
pySetup/parameterMakers/rfParamMaker.py (4 changes: 2 additions & 2 deletions)
@@ -9,8 +9,8 @@ def makeParams(X, y, globalArgs):


parameters_to_try = {
# 'max_features': max_features_to_try,
# 'min_samples_leaf':[1,2,5,25,50,100,150],
'max_features': max_features_to_try,
'min_samples_leaf':[1,2,5,25,50,100,150],
'criterion': ['gini','entropy']
}

pySetup/processes.js (11 changes: 8 additions & 3 deletions)
@@ -42,12 +42,17 @@ module.exports = {
module.exports.formatData( callback, 'train');
},

kickOffForestTraining: function( callback) {
kickOffForestTraining: function( callback, classifierName) {
// console.log('fileNames:',module.exports.fileNames);
var pythonOptions = utils.generatePythonOptions(argv.dataFile, [JSON.stringify(argv), JSON.stringify(module.exports.fileNames), 'clRandomForest']);
var pythonOptions = utils.generatePythonOptions(argv.dataFile, [JSON.stringify(argv), JSON.stringify(module.exports.fileNames), classifierName]);


utils.startPythonShell('training.py', callback, pythonOptions);
var pyShell = utils.startPythonShell('training.py', callback, pythonOptions);
pyShell.on('message', function(message) {
if(message.type === 'trainingResults') {
global.trainedAlgos[classifierName] = message.text;
}
});
},

makePredictions: function( callback, rfPickle) {
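This hunk is the commit's one functional change: kickOffForestTraining now takes a classifierName, forwards it in the Python options instead of the hard-coded 'clRandomForest' (the call site in controllerPython.js, earlier in this diff, supplies it), and listens on the Python shell for a trainingResults message, stashing the result in global.trainedAlgos. The sketch below assumes utils.startPythonShell wraps the python-shell npm package in JSON mode; the {type, text} message shape is taken from the diff, everything else is illustrative.

var PythonShell = require('python-shell');

global.trainedAlgos = global.trainedAlgos || {};

function kickOffTraining(classifierName, args, callback) {
  // JSON mode parses each line the Python script prints into an object
  var pyShell = new PythonShell('training.py', {
    mode: 'json',
    scriptPath: 'pySetup',
    args: args
  });

  pyShell.on('message', function(message) {
    if (message.type === 'trainingResults') {
      // remember which trained result belongs to which classifier
      global.trainedAlgos[classifierName] = message.text;
    }
  });

  // end() fires once the script exits, i.e. once training is done
  pyShell.end(function(err) {
    if (err) { throw err; }
    callback();
  });
}

Under this assumption, the Python side would signal completion by printing a single JSON line such as {"type": "trainingResults", "text": ...} to stdout.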
pySetup/training.py (13 changes: 4 additions & 9 deletions)
@@ -63,29 +63,25 @@
classifier = classifierCreater[classifierName]

# create features that are custom to the size of the input data.
# this will definitely have to be done individually.
# I don't see any harm in making each of these into their own file, because aside from the dev check, everything here will be custom to each classifier.
# Each individual paramaterMaker file sits in the paramaterMakers folder. If you want to modify what the parameters are, or submit a PR with a better combination of parameters to try, that is the place to start.
allParams = paramMakers.makeAll(X,y,globalArgs)
parameters_to_try = allParams[classifierName]


# here is where we start to do very similar things all over again. everything from here forwards can be generalized.
printParent('we are about to run a grid search over the following space:')
printParent(parameters_to_try)

gridSearch = GridSearchCV(classifier, parameters_to_try, cv=10, n_jobs=globalArgs['numCPUs'])

gridSearch.fit(X_train, y_train)

printParent('we have used grid search to explore the entire parameter space and find the best possible version of a random forest for your particular data set!')

printParent('\n')
printParent('*********************************************************************************************************')
printParent("this estimator's best prediction is:")
printParent(gridSearch.best_score_)
printParent('*********************************************************************************************************')
printParent("this estimator's best parameters are:")
printParent(gridSearch.best_params_)
printParent('now that we have figured this out, we are going to train a random forest with considerably more trees. more trees means a better fit, but they also take significantly longer to train, so we kept the number of trees relatively low while searching through the parameter space to make sure you were not stuck here until python6 comes out.')
printParent('\n')

if extendedTraining:
# create a dict with mappings from algo name ('clRandomForest') to a function that will return a newly instantiated version of that algo (with the proper n_estimators and other custom parameters for that classifier)
@@ -95,7 +91,6 @@

# note: we are testing grid search on 50% of the data (X_train and y_train), but fitting bigClassifier on the entire dataset (X,y)
bigClassifier.fit(X, y)
printParent('we have trained an even more powerful random forest!')

bigClassifierscore = bigClassifier.score(X, y)
printParent('the bigger randomForest has a score of')
@@ -104,4 +99,4 @@
joblib.dump(bigClassifier, 'pySetup/bestClassifiers/best' + classifierName + '/best' + classifierName + '.pkl')
else:
joblib.dump(gridSearch.best_estimator_, 'pySetup/bestClassifiers/best' + classifierName + '/best' + classifierName + '.pkl')
printParent('wrote the best estimator to a file')
printParent('we have written the best estimator to a file')
