Skip to content

Commit

Permalink
gcp fix bulk (#590)
Browse files Browse the repository at this point in the history
* format example files as .ndjson

* reference compression in README

* support hierarchical rules

* example file variable in customer bulk connector examples

* fix compile

* workflow to CI build of actual deployment bundles

* fix action invocation

* add checkout again

* try to fix invocation

* lack of checkout also a problem?

* fix path problem?

* revert path

* inline the action

* Specify explicit distribution
  • Loading branch information
eschultink authored Dec 1, 2023
1 parent 0b2ac5b commit f6713b6
Show file tree
Hide file tree
Showing 24 changed files with 210 additions and 61 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build-java.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Check out code
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Setup Java
uses: actions/setup-java@v3
with:
java-version: ${{ inputs.java-version }}
# https://github.com/actions/setup-java#supported-distributions
distribution: zulu
distribution: ${{ inputs.java-distribution }}
- name: Clear our artifacts from Maven cache # q: does this work!?!?!
run: |
rm -rf ~/.m2/repository/co/worklytics/
Expand Down
39 changes: 39 additions & 0 deletions .github/workflows/ci-bundles.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
name: CI - bundles

# CI for our project as a GitHub action
# see https://help.github.com/en/actions/language-and-framework-guides/building-and-testing-java-with-maven

# refactor to use matrix?
# see https://github.com/actions/setup-java#testing-against-different-java-versions

on:
push: # mainline + rc's only
branches:
- 'main'
- 'rc-*'
- 's163-gcp-fix-bulk'

jobs:
bundle:
runs-on: ubuntu-latest
strategy:
matrix:
implementation: [ 'gcp', 'aws' ]
name: bundle ${{ matrix.implementation }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Setup Java
uses: actions/setup-java@v3
with:
java-version: 17
distribution: zulu
- name: Package Deployment artifact
working-directory: java/
# see tools/build.sh; this should be equivalent to that, w/o re-routing errors to log file
# or logic to skip build if artifact already exists
run: |
mvn clean -f pom.xml
mvn package install -f "gateway-core/pom.xml" -Dmaven.test.skip=true
mvn package install -f "core/pom.xml" -Dmaven.test.skip=true
mvn package -f "impl/${{ matrix.implementation }}/pom.xml"
1 change: 0 additions & 1 deletion .github/workflows/ci-java-all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ on:
branches:
- 'main'
- 'rc-*'
- 's162-prep-release'

jobs:
# Java 11 - Oracle support ended 30 Sept 2023 ... but still what ships with GCP cloud shell!!!
Expand Down
5 changes: 3 additions & 2 deletions docs/sources/slack/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Use this step if you want to install in the whole org, across multiple workspace
![](./img/slack-step-distribution.png)

3. Generate the following URL replacing the placeholder for *YOUR_CLIENT_ID* and save it for
later:


`https://api.slack.com/api/oauth.v2.access?client_id=YOUR_CLIENT_ID`

Expand Down Expand Up @@ -77,7 +77,8 @@ ingest the resulting sanitized data to Worklytics. Example data of this is givin
This data can be processing using custom multi-file type rules in the proxy, of which
[`discovery-bulk.yaml`](discovery-bulk.yaml) is an example.


For clarity, example files are NOT compressed, so don't have `.gz` extension; but rules expect
`.gz`.



30 changes: 30 additions & 0 deletions docs/sources/slack/discovery-bulk-hierarchical.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
fileRules:
/export-{week}/{file}:
fileRules:
users.ndjson.gz:
format: "NDJSON"
transforms:
- pseudonymize: "$.profile.email"
- redact: "$.name"
- redact: "$.real_name"
channels.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.name"
dms.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.text"
groups.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.name"
mpims.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.text"
messages-{id}.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.text"
- redact: "$.channel_name"
12 changes: 6 additions & 6 deletions docs/sources/slack/discovery-bulk.yaml
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
fileRules:
/export-{week}/users.json.gz:
/export-{week}/users.ndjson.gz:
format: "NDJSON"
transforms:
- pseudonymize: "$.profile.email"
- redact: "$.name"
- redact: "$.real_name"
/export-{week}/channels.json.gz:
/export-{week}/channels.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.name"
/export-{week}/dms.json.gz:
/export-{week}/dms.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.text"
/export-{week}/groups.json.gz:
/export-{week}/groups.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.name"
/export-{week}/mpims.json.gz:
/export-{week}/mpims.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.text"
/export-{week}/messages-{id}.json.gz:
/export-{week}/messages-{id}.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.text"
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
{"id":"D06TX2RP0","created":1435454909,"members":["U06FG9AKF","U06TX27FW"]}
{"id":"D06TX2RP1","created":1435454910,"members":["U06FG9AKF","U06TX27FW"]}
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
{"client_msg_id":"70299f8c-38ab-4911-835e-f2fe92fea6b6","type":"message","text":"","user":"U06FG9AKF","ts":"1620665890.036300","team":"T06FG94SV","reactions":[{"name":"100","users":["USER_ID"],"count":1}],"channel_id":"D01VATQPMUG","channel_name":"general"}
{"client_msg_id":"70299f8c-38ab-4911-835e-f2fe92fea6b4","type":"message","text":"","user":"U06FG9AAF","ts":"1620665890.036310","team":"T06FG94SV","reactions":[{"name":"100","users":["USER_ID"],"count":1}],"channel_id":"D01VATQPMUG","channel_name":"general"}
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
{"id":"U999999999","team_id":"TEAM_ID","profile":{"email":"[email protected]"}, "is_bot":false}
{"id":"U999999998","team_id":"TEAM_ID","profile":{"email":"[email protected]"}, "is_bot":false}
1 change: 0 additions & 1 deletion docs/sources/slack/example-bulk/sanitized/channels.json

This file was deleted.

2 changes: 2 additions & 0 deletions docs/sources/slack/example-bulk/sanitized/channels.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id":"D06TX2RP0","created":1435454909,"members":["U06FG9AKF","U06TX27FW"]}
{"id":"D06TX2RP1","created":1435454910,"members":["U06FG9AKF","U06TX27FW"]}
19 changes: 0 additions & 19 deletions docs/sources/slack/example-bulk/sanitized/messages-D06TX2RP0.json

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"client_msg_id":"70299f8c-38ab-4911-835e-f2fe92fea6b6","type":"message","text":null,"user":"U06FG9AKF","ts":"1620665890.036300","team":"T06FG94SV","reactions":[{"name":"100","users":["USER_ID"],"count":1}],"channel_id":"D01VATQPMUG","channel_name":null}
{"client_msg_id":"70299f8c-38ab-4911-835e-f2fe92fea6b4","type":"message","text":null,"user":"U06FG9AAF","ts":"1620665890.036310","team":"T06FG94SV","reactions":[{"name":"100","users":["USER_ID"],"count":1}],"channel_id":"D01VATQPMUG","channel_name":null}
8 changes: 0 additions & 8 deletions docs/sources/slack/example-bulk/sanitized/users.json

This file was deleted.

2 changes: 2 additions & 0 deletions docs/sources/slack/example-bulk/sanitized/users.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id":"U999999999","team_id":"TEAM_ID","profile":{"email":"[email protected]"},"is_bot":false}
{"id":"U999999998","team_id":"TEAM_ID","profile":{"email":"[email protected]"},"is_bot":false}
1 change: 1 addition & 0 deletions infra/examples-dev/aws-all/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ variable "custom_bulk_connectors" {
}))
memory_size_mb = optional(number, null)
settings_to_provide = optional(map(string), {})
example_file = optional(string)
}))
description = "specs of custom bulk connectors to create"

Expand Down
1 change: 1 addition & 0 deletions infra/examples-dev/gcp/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ variable "custom_bulk_connectors" {
}))
rules_file = optional(string)
settings_to_provide = optional(map(string), {})
example_file = optional(string)
}))
description = "specs of custom bulk connectors to create"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import com.avaulta.gateway.rules.BulkDataRules;
import com.avaulta.gateway.rules.MultiTypeBulkDataRules;
import com.avaulta.gateway.rules.PathTemplateUtils;
import com.avaulta.gateway.rules.PathTemplateUtils.Match;
import com.avaulta.gateway.rules.RuleSet;
import com.google.common.annotations.VisibleForTesting;
import lombok.*;
Expand Down Expand Up @@ -36,7 +37,7 @@ public class StorageHandler {
Pseudonymizer pseudonymizer;

@Inject
RuleSet defaultRuleSet;
BulkDataRules defaultRuleSet;

@Inject
RulesUtils rulesUtils;
Expand Down Expand Up @@ -185,22 +186,29 @@ public static class ObjectTransform implements Serializable {

@VisibleForTesting
ObjectTransform buildDefaultTransform() {
if (defaultRuleSet instanceof BulkDataRules) {
return ObjectTransform.builder()
return ObjectTransform.builder()
.destinationBucketName(config.getConfigPropertyOrError(BulkModeConfigProperty.OUTPUT_BUCKET))
.pathWithinBucket(config.getConfigPropertyAsOptional(BulkModeConfigProperty.OUTPUT_BASE_PATH).orElse(""))
.rules((BulkDataRules) defaultRuleSet)
.rules(defaultRuleSet)
.build();
} else {
throw new RuntimeException("Default rules are not BulkDataRules");
}
}


private BulkDataRules getApplicableRules(RuleSet rules, String sourceObjectPath) {
if (rules instanceof MultiTypeBulkDataRules) {
return pathTemplateUtils.match(((MultiTypeBulkDataRules) rules).getFileRules(), sourceObjectPath)
Match<BulkDataRules> match = pathTemplateUtils.matchVerbose(((MultiTypeBulkDataRules) rules).getFileRules(), sourceObjectPath)
.orElseThrow(() -> new IllegalArgumentException("No matching rules for path: " + sourceObjectPath));
if (match.getMatch() instanceof MultiTypeBulkDataRules) {
String subPath = match.getCapturedParams().get(match.getCapturedParams().size() - 1);
BulkDataRules nextMatch = pathTemplateUtils.match(((MultiTypeBulkDataRules) match.getMatch()).getFileRules(), subPath)
.orElseThrow(() -> new IllegalArgumentException("No matching rules for path: " + sourceObjectPath));
if (nextMatch instanceof MultiTypeBulkDataRules) {
throw new RuntimeException("MultiTypeBulkDataRules cannot be nested more than 1 level");
}
return nextMatch;
} else {
return match.getMatch();
}
} else if (rules instanceof BulkDataRules) {
return (BulkDataRules) rules;
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.junit.jupiter.params.provider.ValueSource;

import javax.inject.Inject;
Expand All @@ -33,6 +34,8 @@

public class SlackDiscoveryBulkTests {

static String rulesPath;


@Inject
RuleSet rules;
Expand All @@ -43,8 +46,10 @@ public class SlackDiscoveryBulkTests {
Writer writer;
ByteArrayOutputStream outputStream;

@BeforeEach
public void setUp() {
// to cover both rules versions, calling this inside of each test with different rules to set up
// with that rule set at run time
void setUp(String rulesPath) {
this.rulesPath = rulesPath;

Container container = DaggerSlackDiscoveryBulkTests_Container.create();
container.inject(this);
Expand Down Expand Up @@ -73,7 +78,7 @@ interface ForConfigService {
static ConfigService configService() {
ConfigService mock = MockModules.provideMock(ConfigService.class);
when(mock.getConfigPropertyAsOptional(eq(ProxyConfigProperty.RULES)))
.thenReturn(Optional.of(new String(TestUtils.getData("sources/slack/discovery-bulk.yaml"))));
.thenReturn(Optional.of(new String(TestUtils.getData(rulesPath))));
when(mock.getConfigPropertyAsOptional(eq(ProxyConfigProperty.PSOXY_SALT)))
.thenReturn(Optional.of("salt"));

Expand All @@ -88,8 +93,16 @@ static HostEnvironment hostEnvironment() {
}


@Test
public void rulesValid() {
@ValueSource(
strings = {
"sources/slack/discovery-bulk-hierarchical.yaml",
"sources/slack/discovery-bulk.yaml"
}

)
@ParameterizedTest
public void rulesValid(String rulesPath) {
setUp(rulesPath);
assertTrue(rules instanceof MultiTypeBulkDataRules);
}

Expand All @@ -109,13 +122,17 @@ StorageEventRequest request(
}

@SneakyThrows
@ValueSource(strings ={
"channels.json",
"messages-D06TX2RP0.json",
"users.json",
@CsvSource({
"sources/slack/discovery-bulk-hierarchical.yaml,channels.ndjson",
"sources/slack/discovery-bulk-hierarchical.yaml,messages-D06TX2RP0.ndjson",
"sources/slack/discovery-bulk-hierarchical.yaml,users.ndjson",
"sources/slack/discovery-bulk.yaml,channels.ndjson",
"sources/slack/discovery-bulk.yaml,messages-D06TX2RP0.ndjson",
"sources/slack/discovery-bulk.yaml,users.ndjson",
})
@ParameterizedTest
public void files(String file) {
public void files(String rulesPath, String file) {
setUp(rulesPath);
final String objectPath = "/export-20231128/" + file + ".gz";
final String pathToOriginal = "sources/slack/example-bulk/original/" + file;
final String pathToSanitized = "sources/slack/example-bulk/sanitized/" + file;
Expand All @@ -125,6 +142,6 @@ public void files(String file) {
writer.close();

String output = new String(outputStream.toByteArray());
TestUtils.assertJsonEquals(new String(TestUtils.getData(pathToSanitized)), output);
assertEquals(new String(TestUtils.getData(pathToSanitized)), output);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,16 @@ public interface Container {
public interface ForRules {
@Provides
@Singleton
static RuleSet rules() {
static BulkDataRules bulkRules() {
return MockModules.provideMock(BulkDataRules.class);
}

@Provides
@Singleton
static RuleSet rules(BulkDataRules rules) {
return rules;
}

}

InputStreamReader mockReader;
Expand Down
8 changes: 8 additions & 0 deletions java/core/src/test/java/co/worklytics/test/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Base64;
import java.util.function.Function;
import java.util.zip.GZIPOutputStream;

import static org.junit.jupiter.api.Assertions.assertEquals;
Expand Down Expand Up @@ -109,4 +110,11 @@ public static String prettyPrintJson(byte[] json) {
public static void assertJsonEquals(String expected, String actual) {
assertEquals(prettyPrintJson(expected), prettyPrintJson(actual));
}

public static void assertNdjsonEquals(String expected, String actual) {
Function<String, String> ndjsonToJson = (String s) -> "[" + s.replaceAll("\n", ",") + "]";
String expectedJson = expected.replaceAll("\n", ",");

assertEquals(expected, actual);
}
}
Loading

0 comments on commit f6713b6

Please sign in to comment.