diff --git a/README.md b/README.md
index 27cd6528f..8865a4675 100644
--- a/README.md
+++ b/README.md
@@ -27,28 +27,60 @@ Documentation for the [internal APIs](docs/hillview-apis.pdf).

# Installing Hillview on a local machine

-These instructions are for Ubuntu or MacOS machines.
+## Ubuntu or MacOS machines

* Install Java 8.  At this point newer versions of Java will *not* work.
* clone this github repository
* run the script `bin/install-dependencies.sh`
-* Download the Hillview release [tar
-file](https://github.com/vmware/hillview/releases/download/v0.6-alpha/hillview-bin.tar.gz).
+* Download the Hillview release [zip
+file](https://github.com/vmware/hillview/releases/download/v0.7-alpha/hillview-bin.zip).
  Save it in the top directory of Hillview.
-* Untar the release `tax xfvz hillview-bin.tar.gz`
+* Unzip the release `unzip hillview-bin.zip`
+
+## Windows machines
+
+* Download and install Java 8.
+* Choose a directory for installing Hillview.
+* Enable execution of PowerShell scripts; this can be done, for example, by
+  running the following command in PowerShell as an administrator: `Set-ExecutionPolicy unrestricted`
+* Download the following script into the chosen directory: `bin/install-hillview.ps1`
+* Run the installation script using Windows PowerShell

# Running Hillview locally

+## Windows machines
+
+All Windows scripts are in the `bin` folder:
+
+```
+$: cd bin
+```
+
+* Start Hillview processes:
+
+```
+$: hillview-start.bat
+```
+
+* If needed, give the application permission to communicate through the Windows firewall
+* To stop Hillview:
+
+```
+$: hillview-stop.bat
+```
+
+## Ubuntu or MacOS machines
+
All the following scripts are in the `bin` folder.

```
-$ cd bin
+$: cd bin
```

* Start the back-end service which performs all the data processing:

```
-$ ./backend-start.sh &
+$: ./backend-start.sh &
```

* Start the web server
@@ -56,7 +88,7 @@ $ ./backend-start.sh &
  in `apache-tomcat-9.0.4/conf/server.xml`).

```
-$ ./frontend-start.sh
+$: ./frontend-start.sh
```

* start a web browser and open http://localhost:8080
@@ -74,8 +106,8 @@ machine.
  the Java SDK) download and prepare the sample data:

```
-$ ./rebuild.sh -a
-$ ./demo-data-cleaner.sh
+$: ./rebuild.sh -a
+$: ./demo-data-cleaner.sh
```

# Deploying the Hillview service on a cluster
@@ -146,7 +178,7 @@ two sample files are `bin/config.json`and `bin/config-local.json`.

## Deployment scripts

-All deployment scripts are writte in Python, and are in the `bin` folder.
+All deployment scripts are written in Python, and are in the `bin` folder.

```
$: cd bin
@@ -179,6 +211,11 @@ Query the status of the services:

$: ./status config.json
```

+## Data management
+
+We provide some crude data management scripts and tools for clusters.
+They are described [here](bin/README.md).
+
# Developing Hillview

## Software Dependencies
@@ -207,7 +244,7 @@ <path to>/jdk/jdk1.8.0_101).
To set your JAVA_HOME environment variable, add the following to your
~/.bashrc or ~/.zshrc.

```
-$ export JAVA_HOME="<path-to-jdk>"
+$: export JAVA_HOME="<path-to-jdk>"
```

(For MacOS you do not need to set up JAVA_HOME.)
@@ -219,14 +256,14 @@ for building and testing.

On MacOS you first need to install [Homebrew](https://brew.sh/).
One way to do that is to run

```
-/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
+$: /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
```

To install all other dependencies you can run:

```
-$ cd bin
-$ ./install-dependencies.sh
+$: cd bin
+$: ./install-dependencies.sh
```

For old versions of Ubuntu this may fail, so you may have to install the required
@@ -249,8 +286,8 @@ platform/pom.xml.
* Build the software:

```
-$ cd bin
-$ ./rebuild.sh -a
+$: cd bin
+$: ./rebuild.sh -a
```

### Build details
@@ -262,9 +299,9 @@ JAR file `platform/target/hillview-jar-with-dependencies.jar`.
This part can be built with:

```
-$ cd platform
-$ mvn clean install
-$ cd ..
+$: cd platform
+$: mvn clean install
+$: cd ..
```

* web: the web server, web client and web services; this project links
@@ -273,9 +310,9 @@ to the result produced by the `platform` project.  This produces a WAR
be built with:

```
-$ cd web
-$ mvn package
-$ cd ..
+$: cd web
+$: mvn package
+$: cd ..
```

## Contributing code
@@ -287,13 +324,13 @@ standard.
## Setup IntelliJ IDEA

Download and install Intellij IDEA: https://www.jetbrains.com/idea/.
-You can just untar the linux binary in a place of your choice and run
+You can just untar the Linux binary in a place of your choice and run
the shell script `ideaXXX/bin/idea.sh`.

The web projects uses capabilities only available in the paid version
of Intellij IDEA.  One solution is to load only the module that you
want to contribute to: move to the corresponding folder: `cd platform`
or `cd web` and start
-intellij there.
+IntelliJ there.

Alternatively, if you have IntelliJ Ultimate
you can create an empty project in the hillview folder, and then import
three modules (from File/Project structure/Modules,
diff --git a/bin/README.md b/bin/README.md
index bfb14a0da..fe4a316de 100644
--- a/bin/README.md
+++ b/bin/README.md
@@ -1,26 +1,137 @@
-# This folder contains various scripts for managing Hillview clusters
+# This folder contains various scripts and configuration files for managing Hillview clusters

-## Shell scripts for building and testing
+## Linux/MacOS shell scripts for building and testing

-* lib.sh: a small library of useful shell functions used by other scripts
-* install-dependencies.sh: Install all dependencies needed to build Hillview
-* rebuild.sh: build the Hillview front-end and back-end
-* backend-start.sh: start the Hillview back-end service on the local machine
-* frontend-start.sh: start the Hillview front-end service on the local machine
-* demo-data-cleaner.sh: Downloads test data and preprocesses it
-* redeploy.sh: stop services, rebuild the software, deploy it, and restart the service
-* force-gc.sh: a simple shell script which tries to force a Java process to execute GC
+* `backend-start.sh`: start the Hillview back-end service on the local machine
+* `demo-data-cleaner.sh`: downloads a small test dataset and preprocesses it
+* `force-gc.sh`: asks a Java process to execute GC
+* `forever.sh`: runs another command in a loop forever
+* `frontend-start.sh`: start the Hillview front-end service on the local machine
+* `install-dependencies.sh`: install all dependencies needed to build Hillview
+* `lib.sh`: a small library of useful shell functions used by other scripts
+* `package-binaries.sh`: builds an archive with executables and scripts which
+  is used for the code distribution
+* `rebuild.sh`: build the Hillview front-end and back-end
+* `redeploy.sh`: performs four
consecutive actions on a remote
+  Hillview installation: stops the services, rebuilds the software,
+  deploys it, and restarts the service
+* `upload-file.sh`: given a CSV file, guesses a schema for it, chops it
+  into small pieces, and uploads the pieces to a remote cluster.
+
+The following are templates used to generate actual shell scripts
+on a remote cluster when Hillview is installed:
+
+* `hillview-aggregator-manager-template.sh`: used to generate a file
+  called `hillview-aggregator-manager.sh` which can be used to start,
+  stop, or query a Hillview aggregation service. The generated file is
+  installed on each aggregator machine.
+
+* `hillview-webserver-manager-template.sh`: used to generate a file
+  called `hillview-webserver-manager.sh` which can be used to start,
+  stop, or query a Hillview web server. The generated file is installed
+  on the remote Hillview web server machine.
+
+* `hillview-worker-manager-template.sh`: used to generate a file
+  called `hillview-worker-manager.sh` which can be used to start,
+  stop, or query a Hillview worker. The generated file is installed on
+  each remote worker machine.
+
+## Windows scripts
+
+* `install-hillview.ps1`: a PowerShell script used to download and
+  install Hillview on a Windows machine.
+* `detect-java.bat`: a Windows batch library that
+  detects where Java is installed
+* `hillview-start.bat`: a Windows batch file which starts Hillview on the local machine
+* `hillview-stop.bat`: a Windows batch file which stops Hillview on the local machine

## Python scripts for deploying Hillview on a cluster and managing data

-* hillviewCommon.py: common library used by other python programs
-* upload-data.py: upload a set of files to a folder on a set of machines in a
-  round-robin fashion
-* download-data.py: downloads the files that match from all machines in the cluster
-* delete-data.py: delete a folder from all machines in a Hillview cluster
-* run-on-all.py: run a command on a set of remote machines
-* deploy.py: copy the binaries to all machines in a Hillview cluster
-* start.py: start the Hillview service on a remote cluster
-* stop.py: stop the Hillview service on a remote cluster
-* status.py: check the Hillview service on a remote cluster
-* hillviewConsoleLog.py: logging library used by other python programs
+* `delete-data.py`: delete a folder from all machines in a Hillview cluster
+* `deploy.py`: copy the Hillview binaries to all machines in a Hillview cluster
+* `download-data.py`: downloads the specified files from all machines in a cluster
+* `hillviewCommon.py`: common library used by other Python programs
+* `run-on-all.py`: run a command on all machines in a Hillview cluster
+* `start.py`: start the Hillview service on a remote cluster
+* `status.py`: check the Hillview service on a remote cluster
+* `stop.py`: stop the Hillview service on a remote cluster
+* `upload-data.py`: upload a set of files to all machines in a Hillview cluster in a
+  round-robin fashion
+
+## Configuration files
+
+* `config.json`: skeleton configuration file for a Hillview cluster
+* `config-local.json`: description of a Hillview cluster that consists
+  of just the local machine (used both as a web server and as a
+  worker)
+
+# Additional documentation
+
+## Managing a Hillview cluster
+
+* Copy the file `config.json` and modify it to describe your cluster.
Let's say you
+  saved it as `myconfig.json`.
+* To run Hillview on the local machine just use `config-local.json`
+* You can install Hillview on your cluster by running `deploy.py myconfig.json`
+* You can start the Hillview service on the cluster by running `start.py myconfig.json`
+* You can stop the Hillview service on the cluster by running `stop.py myconfig.json`
+* You can check the status of the Hillview service on the cluster by running `status.py myconfig.json`
+
+## Managing files on a Hillview cluster
+
+Several scripts can be used to manage data distributed as raw files on
+a Hillview cluster. The convention is that a dataset is stored in one
+directory; the same directory is used on all machines, and each
+machine holds a fragment of the entire dataset.
+
+Let's say we have a very large file x.csv that we want to upload to a
+cluster; we will chop it into pieces and install the pieces in the
+directory "data/x" on each machine (below the Hillview working
+directory). This is done with:
+
+```
+$: ./upload-file.sh -c myconfig.json -d data/x -h -f x.csv -o
+```
+
+The various flags have the following meaning:
+* `-c myconfig.json`: specifies the cluster where the data is uploaded
+* `-d data/x`: specifies the directory where the data is placed on each machine
+* `-h`: indicates that the file `x.csv` has a header row
+* `-f x.csv`: specifies the input file
+* `-o`: specifies that the output should be saved as ORC files (a fast columnar format)
+
+After uploading the file in this way, it can be loaded by selecting
+`Load / ORC files` and specifying:
+* File name pattern: data/x/x*.orc
+* Schema file: schema
+
+Alternatively, you can split the file locally and upload the pieces
+afterwards; the following splits the file into pieces in the `tmp`
+directory and then uploads these pieces to the cluster using the
+`upload-data.py` program:
+
+```
+$: ./upload-file.sh -d tmp -h -f x.csv -o
+$: ./upload-data.py -d data/x -s schema myconfig.json tmp/*.orc
+```
+
+To list the files on the cluster you can use the `run-on-all.py` script, e.g.:
+
+```
+$: ./run-on-all.py myconfig.json "ls -l data/x"
+```
+
+You can delete a directory from all machines of a cluster:
+
+```
+$: ./delete-data.py myconfig.json data/x
+```
+
+Finally, you can download back the data you have uploaded to the cluster:
+
+```
+$: ./download-data.py myconfig.json data/x
+```
+
+When downloading, this utility creates a folder for each machine in
+the cluster.
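The round-robin placement described above is easy to picture. Below is a minimal, self-contained Java sketch of the assignment rule that `upload-data.py` (and `DataUpload.java`, later in this patch) follow when distributing shards; the worker names, file names, and the `data/x` path are hypothetical, and this helper is illustrative, not part of Hillview's API.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class RoundRobinPlacement {
    // Assign chunk i to worker i % workers.size(), mirroring the
    // round-robin distribution the upload scripts perform.
    static List<String> assign(int chunks, List<String> workers) {
        List<String> placement = new ArrayList<String>();
        for (int chunk = 0; chunk < chunks; chunk++) {
            String host = workers.get(chunk % workers.size());
            placement.add("x" + chunk + ".orc -> " + host + ":data/x");
        }
        return placement;
    }

    public static void main(String[] args) {
        List<String> workers = Arrays.asList("worker1", "worker2", "worker3");
        for (String p : assign(5, workers))
            System.out.println(p); // x0 -> worker1, x1 -> worker2, x2 -> worker3, x3 -> worker1, ...
    }
}
```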
\ No newline at end of file diff --git a/bin/delete-data.py b/bin/delete-data.py index 9baf74097..2051552ea 100755 --- a/bin/delete-data.py +++ b/bin/delete-data.py @@ -5,8 +5,7 @@ import os.path from argparse import ArgumentParser -from hillviewCommon import ClusterConfiguration, get_config -from hillviewConsoleLog import get_logger +from hillviewCommon import ClusterConfiguration, get_config, get_logger logger = get_logger("delete-data") diff --git a/bin/deploy.py b/bin/deploy.py index 5cf9188c1..cdecb41b1 100755 --- a/bin/deploy.py +++ b/bin/deploy.py @@ -7,8 +7,7 @@ from argparse import ArgumentParser import tempfile import os.path -from hillviewCommon import ClusterConfiguration, get_config -from hillviewConsoleLog import get_logger +from hillviewCommon import ClusterConfiguration, get_config, get_logger logger = get_logger("deploy") diff --git a/bin/detect-java.bat b/bin/detect-java.bat new file mode 100644 index 000000000..606be697e --- /dev/null +++ b/bin/detect-java.bat @@ -0,0 +1,40 @@ +rem Script to detect Java installation path +rem Adapted from https://stackoverflow.com/questions/3930383/jre-installation-directory-in-windows + +@ECHO on + +SET KIT=JavaSoft\Java Runtime Environment +call:ReadRegValue VER "HKLM\Software\%KIT%" "CurrentVersion" +IF "%VER%" NEQ "" GOTO FoundJRE + +SET KIT=Wow6432Node\JavaSoft\Java Runtime Environment +call:ReadRegValue VER "HKLM\Software\%KIT%" "CurrentVersion" +IF "%VER%" NEQ "" GOTO FoundJRE + +SET KIT=JavaSoft\Java Development Kit +call:ReadRegValue VER "HKLM\Software\%KIT%" "CurrentVersion" +IF "%VER%" NEQ "" GOTO FoundJRE + +SET KIT=Wow6432Node\JavaSoft\Java Development Kit +call:ReadRegValue VER "HKLM\Software\%KIT%" "CurrentVersion" +IF "%VER%" NEQ "" GOTO FoundJRE + +ECHO Failed to find Java +GOTO :EOF + +:FoundJRE +call:ReadRegValue JAVAPATH "HKLM\Software\%KIT%\%VER%" "JavaHome" +ECHO %JAVAPATH% +SET JAVA_HOME=%JAVAPATH% +GOTO :EOF + +:ReadRegValue +SET key=%2% +SET name=%3% +SET "%~1=" +SET reg=reg +IF DEFINED ProgramFiles(x86) ( + IF EXIST %WINDIR%\sysnative\reg.exe SET reg=%WINDIR%\sysnative\reg.exe +) +FOR /F "usebackq tokens=3* skip=1" %%A IN (`%reg% QUERY %key% /v %name% 2^>NUL`) DO SET "%~1=%%A %%B" +GOTO :EOF diff --git a/bin/download-data.py b/bin/download-data.py index 584efc5cc..ac16ebbb4 100755 --- a/bin/download-data.py +++ b/bin/download-data.py @@ -1,16 +1,14 @@ #!/usr/bin/env python3 -"""This script takes a file pattern. -It downloads the files that match from all machines in the cluster. -The list of machines is provided in a Hillview configuration file.""" +"""This script takes a cluster configuration and a file pattern. 
+It downloads the files that match from all machines in the cluster."""
# pylint: disable=invalid-name

from argparse import ArgumentParser
import os.path
import os
import errno
-from hillviewCommon import execute_command, ClusterConfiguration, get_config
-from hillviewConsoleLog import get_logger
+from hillviewCommon import execute_command, ClusterConfiguration, get_config, get_logger

logger = get_logger("download-data")

diff --git a/bin/hillview-start.bat b/bin/hillview-start.bat
new file mode 100644
index 000000000..cd57f3ae4
--- /dev/null
+++ b/bin/hillview-start.bat
@@ -0,0 +1,6 @@
+REM a script which starts Hillview on a Windows machine
+
+call detect-java.bat
+start /B java -jar ..\platform\target\hillview-server-jar-with-dependencies.jar 127.0.0.1:3569
+start /B ..\apache-tomcat-9.0.4\bin\catalina.bat run
+start /B http://localhost:8080
diff --git a/bin/hillview-stop.bat b/bin/hillview-stop.bat
new file mode 100644
index 000000000..7e75221a6
--- /dev/null
+++ b/bin/hillview-stop.bat
@@ -0,0 +1,7 @@
+REM a script which stops Hillview on a Windows machine
+
+call detect-java.bat
+..\apache-tomcat-9.0.4\bin\catalina.bat stop
+set /p PID=
diff --git a/platform/pom.xml b/platform/pom.xml
--- a/platform/pom.xml
+++ b/platform/pom.xml
@@ ... @@
             <artifactId>grpc-stub</artifactId>
             <version>1.15.0</version>
@@ ... @@
             <groupId>org.apache.commons</groupId>
             <artifactId>commons-compress</artifactId>
-            <version>1.16</version>
+            <version>1.19</version>
diff --git a/platform/src/main/java/org/hillview/main/DataUpload.java b/platform/src/main/java/org/hillview/main/DataUpload.java
index a9b50dc40..7122cc18c 100644
--- a/platform/src/main/java/org/hillview/main/DataUpload.java
+++ b/platform/src/main/java/org/hillview/main/DataUpload.java
@@ -18,9 +18,16 @@ package org.hillview.main;

import javax.annotation.Nullable;
+
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.hillview.management.ClusterConfig;
import org.hillview.storage.CsvFileLoader;
@@ -56,15 +63,16 @@ private static class Params {
    final int defaultChunkSize = 100000; // number of lines in default file chunk
    final String defaultSchemaName = "schema";
    @Nullable
-    String schemaPath = null;
+    String inputSchemaName = null;
    String filename = ""; // the file to be sent
    @Nullable
    String directory;
    /*
    final ArrayList<String> fileList = new ArrayList<String>();
    */
-    String remoteFolder = ""; // the destination path where the files will be put
-    String cluster = ""; // the path to the cluster config json file
+    String destinationFolder = ""; // the destination path where the files will be put
+    @Nullable
+    String cluster = null; // the path to the cluster config json file
    boolean hasHeader; // true if file has a header row
    boolean orc; // true if saving as orc, otherwise save as csv
    int chunkSize = defaultChunkSize; // the number of lines in each shard.
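The hunks that follow replace `DataUpload`'s hand-rolled usage printer with commons-cli's `HelpFormatter` and rework the `Option` definitions it prints. For readers unfamiliar with that library, here is a minimal sketch of the same pattern; the two options mirror ones `DataUpload` registers, but the class is illustrative, and the choice of `DefaultParser` is an assumption (the patch does not show which parser the tool uses).

```java
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

public class CliDemo {
    public static void main(String[] args) {
        Options options = new Options();
        // same shape as DataUpload's required -f/--filename option
        Option filename = new Option("f", "filename", true, "file to distribute");
        filename.setRequired(true);
        options.addOption(filename);
        // and its optional -o/--orc flag
        options.addOption(new Option("o", "orc", false, "save file as orc"));
        try {
            CommandLine cmd = new DefaultParser().parse(options, args);
            System.out.println("file: " + cmd.getOptionValue('f')
                    + ", orc: " + cmd.hasOption('o'));
        } catch (ParseException e) {
            // HelpFormatter prints a usage line plus a formatted option table,
            // which is what the rewritten usage() below relies on
            new HelpFormatter().printHelp("CliDemo", options);
        }
    }
}
```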
@@ -74,8 +82,8 @@ private static class Params { private static void usage(Options options) { final String usageString = "Usage: DataUpload "; - System.out.print(usageString); - System.out.println(options.getOptions()); + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(usageString, options); } /** @@ -83,28 +91,28 @@ private static void usage(Options options) { */ private static Params parseCommand(String[] args) { Options options = new Options(); - Option o_filename = new Option("f", "filename", true, "path to file to distribute"); + Option o_filename = new Option("f", "filename", true, "file to distribute"); o_filename.setRequired(true); options.addOption(o_filename); - Option o_destination = new Option("d","destination",true, "relative path to remote folder"); + Option o_destination = new Option("d", "destination", true, "destination folder"); o_destination.setRequired(true); options.addOption(o_destination); - Option o_cluster = new Option("c","cluster",true, "path to cluster config json file"); - o_cluster.setRequired(true); + Option o_cluster = new Option("c", "cluster", true, "path to cluster config json file"); + o_cluster.setRequired(false); options.addOption(o_cluster); - Option o_linenumber = new Option("l","lines",true, "number of lines in each chunk"); + Option o_linenumber = new Option("l", "lines", true, "number of rows in each chunk"); o_linenumber.setRequired(false); options.addOption(o_linenumber); - Option o_format = new Option("o","orc",false, "indicates to save file as orc"); + Option o_format = new Option("o", "orc", false, "save file as orc"); o_format.setRequired(false); options.addOption(o_format); - Option o_schema = new Option("s","schema",true, "path to the schema file"); + Option o_schema = new Option("s", "schema", true, "input schema file"); o_schema.setRequired(false); options.addOption(o_schema); - Option o_header = new Option("h","header",false, "indicates file has header row"); + Option o_header = new Option("h", "header", false, "set if file has header row"); o_header.setRequired(false); options.addOption(o_header); - Option o_fewercolumns = new Option("w","fewer",false, "indicates if fewer columns are allowed"); + Option o_fewercolumns = new Option("w", "fewer", false, "set if rows with fewer columns are allowed"); o_fewercolumns.setRequired(false); options.addOption(o_fewercolumns); /* @@ -129,8 +137,10 @@ private static Params parseCommand(String[] args) { } Params parameters = new Params(); try{ + /* if (cmd.hasOption('f') == cmd.hasOption('D')) throw new RuntimeException("need either file or directory"); + */ if (cmd.hasOption('f')) { parameters.filename = cmd.getOptionValue('f'); // parameters.fileList.add(cmd.getOptionValue('f')); @@ -150,7 +160,7 @@ private static Params parseCommand(String[] args) { } catch (RuntimeException e) { error(e); } - parameters.remoteFolder = cmd.getOptionValue('d'); + parameters.destinationFolder = cmd.getOptionValue('d'); parameters.cluster = cmd.getOptionValue("cluster"); if (cmd.hasOption('l')) { try { @@ -162,7 +172,7 @@ private static Params parseCommand(String[] args) { } parameters.orc = cmd.hasOption('o'); if (cmd.hasOption('s')) - parameters.schemaPath = cmd.getOptionValue('s'); + parameters.inputSchemaName = cmd.getOptionValue('s'); parameters.hasHeader = cmd.hasOption('h'); parameters.allowFewerColumns = cmd.hasOption('w'); return parameters; @@ -171,16 +181,21 @@ private static Params parseCommand(String[] args) { public static void main(String[] args) { Params parameters = 
parseCommand(args); try { - ClusterConfig config = ClusterConfig.parse(parameters.cluster); + @Nullable + ClusterConfig config = null; + if (parameters.cluster != null) + config = ClusterConfig.parse(parameters.cluster); CsvFileLoader.Config parsConfig = new CsvFileLoader.Config(); parsConfig.hasHeaderRow = parameters.hasHeader; parsConfig.allowFewerColumns = parameters.allowFewerColumns; Schema mySchema; String localSchemaFile; - if (!Utilities.isNullOrEmpty(parameters.schemaPath)) { - localSchemaFile = parameters.schemaPath; - mySchema = Schema.readFromJsonFile(Paths.get(parameters.schemaPath)); + String outputSchemaFile; + if (!Utilities.isNullOrEmpty(parameters.inputSchemaName)) { + localSchemaFile = parameters.inputSchemaName; + outputSchemaFile = FilenameUtils.getBaseName(parameters.inputSchemaName); + mySchema = Schema.readFromJsonFile(Paths.get(parameters.inputSchemaName)); } else { int i = 1; localSchemaFile = parameters.defaultSchemaName; @@ -188,21 +203,26 @@ public static void main(String[] args) { localSchemaFile = parameters.defaultSchemaName + i; i++; } + outputSchemaFile = parameters.defaultSchemaName; mySchema = guessSchema(parameters.filename, parsConfig); + HillviewLogger.instance.info("Writing schema", "{0}", localSchemaFile); mySchema.writeToJsonFile(Paths.get(localSchemaFile)); } // Create directories and place the schema - if (config.workers != null) { + if (config != null && config.workers != null) { for (String host : config.workers) { - createDir(config.user, host, parameters.remoteFolder); - if (parameters.schemaPath != null) - sendFile(localSchemaFile, config.user, host, parameters.remoteFolder, parameters.schemaPath); + createDir(config.user, host, parameters.destinationFolder); + sendFile(localSchemaFile, config.user, host, parameters.destinationFolder, outputSchemaFile); } + if (Utilities.isNullOrEmpty(parameters.inputSchemaName)) + // We have created this schema file + Files.delete(Paths.get(localSchemaFile)); + } else { + if (parameters.inputSchemaName == null) + Files.move(Paths.get(localSchemaFile), + Paths.get(parameters.destinationFolder, outputSchemaFile)); } - if (!localSchemaFile.equals(parameters.schemaPath)) - Files.delete(Paths.get(localSchemaFile)); - int parts = chopFiles(parsConfig, config, mySchema, parameters); System.out.println("Done; created " + parts + " files"); } catch (Exception e) { @@ -300,7 +320,8 @@ private static Schema guessSchema(String filename, CsvFileLoader.Config config) * @param parameters the parameters taken from the command line, include file path and destination * @return the number of parts created */ - private static int chopFiles(CsvFileLoader.Config config, ClusterConfig clusterConfig, + private static int chopFiles(CsvFileLoader.Config config, + @Nullable ClusterConfig clusterConfig, Schema schema, Params parameters) { Reader file; IAppendableColumn[] columns; @@ -365,11 +386,15 @@ private static int chopFiles(CsvFileLoader.Config config, ClusterConfig clusterC break; } writeTable(table, chunkName, parameters.orc); - assert clusterConfig.workers != null; - String host = clusterConfig.workers[currentHost]; - sendFile(chunkName, clusterConfig.user, host, parameters.remoteFolder, chunkName); - Files.deleteIfExists(Paths.get(chunkName)); - currentHost = (currentHost + 1) % clusterConfig.workers.length; + if (clusterConfig != null) { + assert clusterConfig.workers != null; + String host = clusterConfig.workers[currentHost]; + sendFile(chunkName, clusterConfig.user, host, parameters.destinationFolder, chunkName); + 
currentHost = (currentHost + 1) % clusterConfig.workers.length; + Files.deleteIfExists(Paths.get(chunkName)); + } else { + Files.move(Paths.get(chunkName), Paths.get(parameters.destinationFolder, chunkName)); + } chunk++; columns = schema.createAppendableColumns(); } @@ -420,7 +445,7 @@ private static void sendFile(String filename, String user, String host, String r Process process = pb.start(); int err = process.waitFor(); if (err != 0) - throw new RuntimeException("Scp stopped with error code " + err); + throw new RuntimeException("scp stopped with error code " + err); } /** Writes the table in ORC or CSV format @@ -468,9 +493,23 @@ private static Reader getFileReader(String filename) { // The buffered input stream is needed by the CompressorStream // to detect the compression method at runtime. InputStream fis = new BufferedInputStream(inputStream); - if (Utilities.isCompressed(filename)) { - fis = new CompressorStreamFactory() - .createCompressorInputStream(fis); + String suffix = Utilities.isCompressed(filename); + if (suffix != null) { + if (suffix.equals("zip")) { + // TODO: For zip files we expect a single file in archive + ArchiveInputStream is = new ArchiveStreamFactory(). + createArchiveInputStream(ArchiveStreamFactory.ZIP, fis); + ArchiveEntry entry = is.getNextEntry(); + if (entry == null) + throw new RuntimeException("No files in zip archive"); + ZipArchiveEntry ze = (ZipArchiveEntry)entry; + if (ze.isDirectory()) + throw new RuntimeException("zip archive contains a directory"); + fis = is; + } else { + fis = new CompressorStreamFactory() + .createCompressorInputStream(fis); + } } BOMInputStream bomStream = new BOMInputStream(fis, ByteOrderMark.UTF_8, @@ -479,7 +518,7 @@ private static Reader getFileReader(String filename) { ByteOrderMark bom = bomStream.getBOM(); String charsetName = bom == null ? 
"UTF-8" : bom.getCharsetName(); return new InputStreamReader(bomStream, charsetName); - } catch (IOException|CompressorException e) { + } catch (IOException | CompressorException | ArchiveException e) { throw new RuntimeException(e); } } diff --git a/platform/src/main/java/org/hillview/management/ClusterConfig.java b/platform/src/main/java/org/hillview/management/ClusterConfig.java index f13776627..30e2a1df0 100644 --- a/platform/src/main/java/org/hillview/management/ClusterConfig.java +++ b/platform/src/main/java/org/hillview/management/ClusterConfig.java @@ -27,7 +27,6 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; -import java.util.Collections; import java.util.List; /** diff --git a/platform/src/main/java/org/hillview/storage/JdbcDatabase.java b/platform/src/main/java/org/hillview/storage/JdbcDatabase.java index 7a4a70a1c..ed4f3a2b4 100644 --- a/platform/src/main/java/org/hillview/storage/JdbcDatabase.java +++ b/platform/src/main/java/org/hillview/storage/JdbcDatabase.java @@ -304,7 +304,7 @@ private static List createColumns(ResultSetMetaData meta) thr List cols = new ArrayList(meta.getColumnCount()); for (int i = 0; i < meta.getColumnCount(); i++) { ColumnDescription cd = JdbcDatabase.getDescription(meta, i); - BaseListColumn col = BaseListColumn.create(cd); + IAppendableColumn col = BaseListColumn.create(cd); cols.add(col); } return cols; diff --git a/platform/src/main/java/org/hillview/storage/OrcFileWriter.java b/platform/src/main/java/org/hillview/storage/OrcFileWriter.java index 132ac1eb8..d8a92102b 100644 --- a/platform/src/main/java/org/hillview/storage/OrcFileWriter.java +++ b/platform/src/main/java/org/hillview/storage/OrcFileWriter.java @@ -49,10 +49,10 @@ private static TypeDescription getSchema(Schema schema) { ColumnDescription cd = schema.getDescription(col); TypeDescription current; switch (cd.kind) { - case None: default: throw new RuntimeException("Unexpected data type " + cd.kind); case String: + case None: case Json: current = TypeDescription.createString(); break; diff --git a/platform/src/main/java/org/hillview/storage/TextFileLoader.java b/platform/src/main/java/org/hillview/storage/TextFileLoader.java index e9466e63d..7b43b2d5d 100644 --- a/platform/src/main/java/org/hillview/storage/TextFileLoader.java +++ b/platform/src/main/java/org/hillview/storage/TextFileLoader.java @@ -17,6 +17,11 @@ package org.hillview.storage; +import org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.ArchiveException; +import org.apache.commons.compress.archivers.ArchiveInputStream; +import org.apache.commons.compress.archivers.ArchiveStreamFactory; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.commons.io.ByteOrderMark; @@ -73,10 +78,24 @@ Reader getFileReader() { // to detect the compression method at runtime. InputStream fis = this.bufferedInputStream; - if (Utilities.isCompressed(this.filename)) { - this.compressedStream = new CompressorStreamFactory() - .createCompressorInputStream(fis); - fis = this.compressedStream; + String suffix = Utilities.isCompressed(this.filename); + if (suffix != null) { + if (suffix.equals("zip")) { + // TODO: For zip files we expect a single file in archive + ArchiveInputStream is = new ArchiveStreamFactory(). 
+ createArchiveInputStream(ArchiveStreamFactory.ZIP, fis); + ArchiveEntry entry = is.getNextEntry(); + if (entry == null) + throw new RuntimeException("No files in zip archive"); + ZipArchiveEntry ze = (ZipArchiveEntry)entry; + if (ze.isDirectory()) + throw new RuntimeException("zip archive contains a directory"); + fis = is; + } else { + this.compressedStream = new CompressorStreamFactory() + .createCompressorInputStream(fis); + fis = this.compressedStream; + } } this.bomStream = new BOMInputStream(fis, ByteOrderMark.UTF_8, @@ -85,7 +104,7 @@ Reader getFileReader() { ByteOrderMark bom = this.bomStream.getBOM(); String charsetName = bom == null ? "UTF-8" : bom.getCharsetName(); return new InputStreamReader(this.bomStream, charsetName); - } catch (IOException|CompressorException e) { + } catch (IOException | CompressorException | ArchiveException e) { throw new RuntimeException(e); } } diff --git a/platform/src/main/java/org/hillview/table/Schema.java b/platform/src/main/java/org/hillview/table/Schema.java index 44e0a7c8b..400d120f4 100644 --- a/platform/src/main/java/org/hillview/table/Schema.java +++ b/platform/src/main/java/org/hillview/table/Schema.java @@ -292,7 +292,7 @@ public IAppendableColumn[] createAppendableColumns() { IAppendableColumn[] cols = new IAppendableColumn[this.getColumnCount()]; int index = 0; for (ColumnDescription cd: this.columns.values()) { - BaseListColumn col = BaseListColumn.create(cd); + IAppendableColumn col = BaseListColumn.create(cd); cols[index++] = col; } return cols; diff --git a/platform/src/main/java/org/hillview/table/columns/BaseListColumn.java b/platform/src/main/java/org/hillview/table/columns/BaseListColumn.java index a9196dfd7..d61c8d903 100644 --- a/platform/src/main/java/org/hillview/table/columns/BaseListColumn.java +++ b/platform/src/main/java/org/hillview/table/columns/BaseListColumn.java @@ -120,11 +120,13 @@ void growMissing() { this.missing.add(new BitSet(SegmentSize)); } - public static BaseListColumn create(ColumnDescription desc) { + public static IAppendableColumn create(ColumnDescription desc) { switch (desc.kind) { case String: case Json: return new StringListColumn(desc); + case None: + return new EmptyColumn(desc); case Date: return new DateListColumn(desc); case Integer: diff --git a/platform/src/main/java/org/hillview/table/columns/EmptyColumn.java b/platform/src/main/java/org/hillview/table/columns/EmptyColumn.java new file mode 100644 index 000000000..26c7a3af9 --- /dev/null +++ b/platform/src/main/java/org/hillview/table/columns/EmptyColumn.java @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2019 VMware Inc. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.hillview.table.columns; + +import net.openhft.hashing.LongHashFunction; +import org.hillview.table.ColumnDescription; +import org.hillview.table.api.*; + +import javax.annotation.Nullable; + +/** + * An empty column does not really store any data - all values are null. 
+ */
+public class EmptyColumn extends BaseColumn implements IAppendableColumn {
+    private int size;
+    private boolean sealed;
+
+    private EmptyColumn(String name, int size) {
+        super(new ColumnDescription(name, ContentsKind.None));
+        this.size = size;
+        this.sealed = true;
+    }
+
+    EmptyColumn(ColumnDescription desc) {
+        super(desc);
+        this.size = 0;
+        this.sealed = false;
+    }
+
+    @Override
+    public IColumn seal() {
+        this.sealed = true;
+        return this;
+    }
+
+    @Override
+    public void append(@Nullable Object value) {
+        if (value == null)
+            this.appendMissing();
+        else
+            throw new RuntimeException("Appending value to empty column: " + value);
+    }
+
+    @Override
+    public void appendMissing() {
+        if (this.sealed)
+            throw new RuntimeException("Appending to sealed column");
+        this.size++;
+    }
+
+    @Override
+    public boolean isMissing(int rowIndex) {
+        return true;
+    }
+
+    @Override
+    public void parseAndAppendString(@Nullable String s) {
+        if (s == null)
+            this.appendMissing();
+        else
+            throw new RuntimeException("Appending value to empty column: " + s);
+    }
+
+    @Override
+    public boolean isLoaded() {
+        return true;
+    }
+
+    @Override
+    public int sizeInRows() {
+        return this.size;
+    }
+
+    @Override
+    public double asDouble(int rowIndex) {
+        throw new RuntimeException("asDouble of value in empty column");
+    }
+
+    @Nullable
+    @Override
+    public String asString(int rowIndex) {
+        return null;
+    }
+
+    @Override
+    public IndexComparator getComparator() {
+        return new IndexComparator() {
+            @Override
+            public int compare(final int i, final int j) {
+                return 0;
+            }
+        };
+    }
+
+    @Override
+    public IColumn rename(String newName) {
+        return new EmptyColumn(newName, this.size);
+    }
+
+    @Override
+    public IColumn convertKind(ContentsKind kind, String newColName, IMembershipSet unused) {
+        return new EmptyColumn(new ColumnDescription(newColName, kind));
+    }
+
+    @Override
+    public long hashCode64(int rowIndex, LongHashFunction hash) {
+        return 0;
+    }
+}
diff --git a/platform/src/main/java/org/hillview/utils/Utilities.java b/platform/src/main/java/org/hillview/utils/Utilities.java
index a46de87a4..2fec1678d 100644
--- a/platform/src/main/java/org/hillview/utils/Utilities.java
+++ b/platform/src/main/java/org/hillview/utils/Utilities.java
@@ -162,14 +162,18 @@ public static String[] toArray(List<String> list) {
    }

    private static final List<String> compressionSuffixes = Arrays.asList(
-            "gz", "bz", "bzip2", "xz", "Z", "arj");
+            "gz", "bz", "bzip2", "xz", "z", "arj", "zip");

    /**
     * Detect whether a file is compressed based on the file name.
+     * Returns the file extension if the file is compressed, null otherwise.
*/ - public static boolean isCompressed(String filename) { + @Nullable + public static String isCompressed(String filename) { String suffix = FilenameUtils.getExtension(filename).toLowerCase(); - return compressionSuffixes.contains(suffix); + if (compressionSuffixes.contains(suffix)) + return suffix; + return null; } /** @@ -178,7 +182,7 @@ public static boolean isCompressed(String filename) { */ public static String getBasename(String filename) { String basename = FilenameUtils.getBaseName(filename); - if (isCompressed(basename)) + if (isCompressed(basename) != null) basename = FilenameUtils.removeExtension(basename); return FilenameUtils.removeExtension(basename); } diff --git a/web/src/main/webapp/dataViews/schemaView.ts b/web/src/main/webapp/dataViews/schemaView.ts index 04dfb7af9..938554630 100644 --- a/web/src/main/webapp/dataViews/schemaView.ts +++ b/web/src/main/webapp/dataViews/schemaView.ts @@ -102,7 +102,7 @@ export class SchemaView extends TSViewBase { /* Dialog box for selecting columns based on name */ const nameDialog = new Dialog("Select by name", "Allows selecting/deselecting columns by name using regular expressions"); - const name = nameDialog.addTextField("selected", "Name", FieldKind.String, "", + const name = nameDialog.addTextField("selected", "Name (regex)", FieldKind.String, "", "Names of columns to select (regular expressions allowed)"); name.required = true; const actions: string[] = ["Add", "Remove"]; @@ -182,7 +182,7 @@ export class SchemaView extends TSViewBase { action: () => this.heatmapSelected(), help: "Plot the data in the selected columns as a heatmap or as a Trellis plot of heatmaps. " + "Applies to two columns only.", - }, selectedCount >= 2); + }, selectedCount === 2); this.contextMenu.addItem({ text: "Trellis histograms", action: () => this.trellisSelected(false),
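Taken together, the `Utilities.isCompressed` change and the new archive branch in `TextFileLoader`/`DataUpload` implement a suffix-based dispatch: a `zip` file goes through commons-compress's archive API (and is expected to hold a single entry), while the other compression suffixes go through the compressor API. Below is a condensed, self-contained sketch of that dispatch; the suffix list and method name are illustrative, and unlike the patch it relies on `ArchiveEntry.isDirectory()` rather than casting to `ZipArchiveEntry`.

```java
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.compressors.CompressorStreamFactory;

public class CompressedReaderDemo {
    // Open filename, transparently unwrapping single-entry zip archives
    // and stream compressors (gz, bz2, xz, ...).
    static InputStream open(String filename) throws Exception {
        // Buffering is required: the factories use mark/reset to sniff formats.
        InputStream fis = new BufferedInputStream(new FileInputStream(filename));
        String suffix = filename.substring(filename.lastIndexOf('.') + 1).toLowerCase();
        if (suffix.equals("zip")) {
            ArchiveInputStream in = new ArchiveStreamFactory()
                    .createArchiveInputStream(ArchiveStreamFactory.ZIP, fis);
            ArchiveEntry entry = in.getNextEntry(); // position at the first entry
            if (entry == null || entry.isDirectory())
                throw new RuntimeException("Expected a single file in zip archive");
            return in; // now reads the bytes of that entry
        }
        if (suffix.equals("gz") || suffix.equals("bz2") || suffix.equals("xz"))
            return new CompressorStreamFactory().createCompressorInputStream(fis);
        return fis; // plain, uncompressed file
    }

    public static void main(String[] args) throws Exception {
        try (BufferedReader r = new BufferedReader(
                new InputStreamReader(open(args[0]), "UTF-8"))) {
            System.out.println(r.readLine()); // print the first line as a smoke test
        }
    }
}
```

One design note: returning the `ArchiveInputStream` itself works because, after `getNextEntry()`, reading from the stream yields exactly the bytes of that entry, which is why the patch can hand the stream straight to the BOM/charset detection layer.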