From cb37cd8e3b1d669b6b705d778d598e3e0c8b0e57 Mon Sep 17 00:00:00 2001 From: Erik Schultink Date: Tue, 5 Dec 2023 14:15:18 -0800 Subject: [PATCH] revert expectation of explicit leading '/' in object paths/keys --- .../slack/discovery-bulk-hierarchical.yaml | 6 ++-- docs/sources/slack/discovery-bulk.yaml | 12 +++---- .../co/worklytics/psoxy/rules/Validator.java | 10 ------ .../psoxy/storage/StorageHandler.java | 34 ++++++++++++++++--- .../rules/slack/SlackDiscoveryBulkTests.java | 4 +-- .../gateway/rules/MultiTypeBulkDataRules.java | 17 ++++++---- 6 files changed, 51 insertions(+), 32 deletions(-) diff --git a/docs/sources/slack/discovery-bulk-hierarchical.yaml b/docs/sources/slack/discovery-bulk-hierarchical.yaml index 5036c2f62..1a5a7ceb9 100644 --- a/docs/sources/slack/discovery-bulk-hierarchical.yaml +++ b/docs/sources/slack/discovery-bulk-hierarchical.yaml @@ -1,5 +1,5 @@ fileRules: - /export-{week}/{file}: + export-{week}/{file}: fileRules: users.ndjson.gz: format: "NDJSON" @@ -24,7 +24,7 @@ fileRules: transforms: - redact: "$.text" messages-{id}.ndjson.gz: - format: "NDJSON" - transforms: + format: "NDJSON" + transforms: - redact: "$.text" - redact: "$.channel_name" diff --git a/docs/sources/slack/discovery-bulk.yaml b/docs/sources/slack/discovery-bulk.yaml index d24ca9563..7b035538c 100644 --- a/docs/sources/slack/discovery-bulk.yaml +++ b/docs/sources/slack/discovery-bulk.yaml @@ -1,27 +1,27 @@ fileRules: - /export-{week}/users.ndjson.gz: + export-{week}/users.ndjson.gz: format: "NDJSON" transforms: - pseudonymize: "$.profile.email" - redact: "$.name" - redact: "$.real_name" - /export-{week}/channels.ndjson.gz: + export-{week}/channels.ndjson.gz: format: "NDJSON" transforms: - redact: "$.name" - /export-{week}/dms.ndjson.gz: + export-{week}/dms.ndjson.gz: format: "NDJSON" transforms: - redact: "$.text" - /export-{week}/groups.ndjson.gz: + export-{week}/groups.ndjson.gz: format: "NDJSON" transforms: - redact: "$.name" - /export-{week}/mpims.ndjson.gz: + export-{week}/mpims.ndjson.gz: format: "NDJSON" transforms: - redact: "$.text" - /export-{week}/messages-{id}.ndjson.gz: + export-{week}/messages-{id}.ndjson.gz: format: "NDJSON" transforms: - redact: "$.text" diff --git a/java/core/src/main/java/co/worklytics/psoxy/rules/Validator.java b/java/core/src/main/java/co/worklytics/psoxy/rules/Validator.java index a4c2b5bad..b0d89de36 100644 --- a/java/core/src/main/java/co/worklytics/psoxy/rules/Validator.java +++ b/java/core/src/main/java/co/worklytics/psoxy/rules/Validator.java @@ -61,16 +61,6 @@ public void validate(@NonNull RecordRules rules) { public void validate(@NonNull MultiTypeBulkDataRules rules) { Preconditions.checkArgument(rules.getFileRules().size() > 0, "Must have at least one file rule"); - List templatesNotPrefixedWithSlash = rules.getFileRules().keySet().stream() - .filter(k -> !k.startsWith("/")) - .collect(Collectors.toList()); - - //not invalid per se, but likely to be a mistake - if (!templatesNotPrefixedWithSlash.isEmpty()) { - log.warning("The following path templates do not start with '/', so likely to be wrong:\n " + templatesNotPrefixedWithSlash.stream().collect(Collectors.joining("\n"))); - } - - rules.getFileRules().values().forEach(this::validate); } diff --git a/java/core/src/main/java/co/worklytics/psoxy/storage/StorageHandler.java b/java/core/src/main/java/co/worklytics/psoxy/storage/StorageHandler.java index 94d261e68..776ebaff3 100644 --- a/java/core/src/main/java/co/worklytics/psoxy/storage/StorageHandler.java +++ b/java/core/src/main/java/co/worklytics/psoxy/storage/StorageHandler.java @@ -9,6 +9,7 @@ import com.avaulta.gateway.rules.PathTemplateUtils.Match; import com.avaulta.gateway.rules.RuleSet; import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; import lombok.*; import lombok.extern.java.Log; import org.apache.commons.lang3.StringUtils; @@ -122,6 +123,15 @@ public StorageEventResponse handle(StorageEventRequest request, RuleSet rules) { + /** + * + * @param reader from source, may be null if not yet known + * @param writer to destination, may be null if not yet known + * @param sourceBucketName bucket name + * @param sourceObjectPath a path (object key) within the bucket; no leading `/` expected + * @param transform + * @return + */ public StorageEventRequest buildRequest(Reader reader, Writer writer, String sourceBucketName, @@ -129,8 +139,7 @@ public StorageEventRequest buildRequest(Reader reader, ObjectTransform transform) { String sourceObjectPathWithinBase = - config.getConfigPropertyAsOptional(BulkModeConfigProperty.INPUT_BASE_PATH) - .filter(inputBasePath -> StringUtils.isNotBlank(inputBasePath)) + inputBasePath() .map(inputBasePath -> sourceObjectPath.replace(inputBasePath, "")) .orElse(sourceObjectPath); @@ -168,7 +177,7 @@ public Map buildObjectMetadata(String sourceBucket, String sourc return Map.of( BulkMetaData.INSTANCE_ID.getMetaDataKey(), hostEnvironment.getInstanceId(), BulkMetaData.VERSION.getMetaDataKey(), config.getConfigPropertyAsOptional(ProxyConfigProperty.BUNDLE_FILENAME).orElse("unknown"), - BulkMetaData.ORIGINAL_OBJECT_KEY.getMetaDataKey(), sourceBucket + "/" + sourceKey + BulkMetaData.ORIGINAL_OBJECT_KEY.getMetaDataKey(), sourceBucket + sourceKey ); } @@ -223,7 +232,7 @@ public static class ObjectTransform implements Serializable { ObjectTransform buildDefaultTransform() { return ObjectTransform.builder() .destinationBucketName(config.getConfigPropertyOrError(BulkModeConfigProperty.OUTPUT_BUCKET)) - .pathWithinBucket(config.getConfigPropertyAsOptional(BulkModeConfigProperty.OUTPUT_BASE_PATH).orElse("")) + .pathWithinBucket(outputBasePath().orElse("")) .rules(defaultRuleSet) .build(); } @@ -267,4 +276,21 @@ public Optional getApplicableRules(RuleSet rules, String sourceOb } + /** + * helper to parse inputBasePath from config, if present; + * left here to support potential logic change around presumption of leading `/` + * @return + */ + Optional inputBasePath() { + return config.getConfigPropertyAsOptional(BulkModeConfigProperty.INPUT_BASE_PATH) + .filter(inputBasePath -> StringUtils.isNotBlank(inputBasePath)); + //.map(inputBasePath -> inputBasePath.startsWith("/") ? inputBasePath : "/" + inputBasePath); + } + + Optional outputBasePath() { + return config.getConfigPropertyAsOptional(BulkModeConfigProperty.OUTPUT_BASE_PATH) + .filter(outputBasePath -> StringUtils.isNotBlank(outputBasePath)); + //.map(outputBasePath -> outputBasePath.startsWith("/") ? outputBasePath : "/" + outputBasePath); + } + } diff --git a/java/core/src/test/java/co/worklytics/psoxy/rules/slack/SlackDiscoveryBulkTests.java b/java/core/src/test/java/co/worklytics/psoxy/rules/slack/SlackDiscoveryBulkTests.java index 912f6750b..b3e9b9682 100644 --- a/java/core/src/test/java/co/worklytics/psoxy/rules/slack/SlackDiscoveryBulkTests.java +++ b/java/core/src/test/java/co/worklytics/psoxy/rules/slack/SlackDiscoveryBulkTests.java @@ -115,9 +115,9 @@ StorageEventRequest request( .sourceBucketName("bucket") .sourceObjectPath(sourceObjectPath) .sourceReader(reader) + .destinationWriter(writer) .destinationBucketName("bucket") .destinationObjectPath(sourceObjectPath) - .destinationWriter(writer) .build(); } @@ -133,7 +133,7 @@ StorageEventRequest request( @ParameterizedTest public void files(String rulesPath, String file) { setUp(rulesPath); - final String objectPath = "/export-20231128/" + file + ".gz"; + final String objectPath = "export-20231128/" + file + ".gz"; final String pathToOriginal = "sources/slack/example-bulk/original/" + file; final String pathToSanitized = "sources/slack/example-bulk/sanitized/" + file; storageHandler.handle(request(objectPath, pathToOriginal), rules); diff --git a/java/gateway-core/src/main/java/com/avaulta/gateway/rules/MultiTypeBulkDataRules.java b/java/gateway-core/src/main/java/com/avaulta/gateway/rules/MultiTypeBulkDataRules.java index dc3873eba..956c1581e 100644 --- a/java/gateway-core/src/main/java/com/avaulta/gateway/rules/MultiTypeBulkDataRules.java +++ b/java/gateway-core/src/main/java/com/avaulta/gateway/rules/MultiTypeBulkDataRules.java @@ -28,28 +28,31 @@ public class MultiTypeBulkDataRules implements BulkDataRules { /** - * map of file path templates to rules for matching files, where "path template" has the same - * interpretation as in OpenAPI 3.0.0 + * map of file path (really object key) templates to rules for matching files (object), + * where "path template" has the same interpretation as in OpenAPI 3.0.0, but matching + * against the object key (file path) rather than the URL path * - * in particular, leading `/` is not optional; so in practice, all will begin with `/`, to - * conform with OpenAPI 3.0.0 semantics, which always explicitly include it. + * as S3/GCS/etc implement file system abstraction only as a convention, object key for matching + * will not begin with `/` * * see: https://swagger.io/specification/ , section "Path Templating" * * @see PathTemplateUtils for more details on interpretation * * - * eg, /export/{week}/data_{shard}.csv -> ColumnarRules + * eg, export/{week}/data_{shard}.csv -> ColumnarRules * - * if provided, has the effect of pathRegex = "^/export/[^/]+/data_[^/]+\.csv$" + * if provided, has the effect of pathRegex = "^export/[^/]+/data_[^/]+\.csv$" * * files that trigger proxy instance but match NONE of the templates will not be processed * - * NOTE: + * NOTE: INPUT_BASE_PATH, if any, is trimmed from the beginning of the path before matching + * see {@link BulkModeConfigProperty#INPUT_BASE_PATH} * * q: what if file matches multiple path templates? pick lexicographically first? * * q: flatten somehow, so no indentation at the top-level? + * */ Map fileRules;