Skip to content

Commit

Permalink
revert expectation of explicit leading '/' in object paths/keys
Browse files Browse the repository at this point in the history
  • Loading branch information
eschultink committed Dec 5, 2023
1 parent 25b7637 commit cb37cd8
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 32 deletions.
6 changes: 3 additions & 3 deletions docs/sources/slack/discovery-bulk-hierarchical.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
fileRules:
/export-{week}/{file}:
export-{week}/{file}:
fileRules:
users.ndjson.gz:
format: "NDJSON"
Expand All @@ -24,7 +24,7 @@ fileRules:
transforms:
- redact: "$.text"
messages-{id}.ndjson.gz:
format: "NDJSON"
transforms:
format: "NDJSON"
transforms:
- redact: "$.text"
- redact: "$.channel_name"
12 changes: 6 additions & 6 deletions docs/sources/slack/discovery-bulk.yaml
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
fileRules:
/export-{week}/users.ndjson.gz:
export-{week}/users.ndjson.gz:
format: "NDJSON"
transforms:
- pseudonymize: "$.profile.email"
- redact: "$.name"
- redact: "$.real_name"
/export-{week}/channels.ndjson.gz:
export-{week}/channels.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.name"
/export-{week}/dms.ndjson.gz:
export-{week}/dms.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.text"
/export-{week}/groups.ndjson.gz:
export-{week}/groups.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.name"
/export-{week}/mpims.ndjson.gz:
export-{week}/mpims.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.text"
/export-{week}/messages-{id}.ndjson.gz:
export-{week}/messages-{id}.ndjson.gz:
format: "NDJSON"
transforms:
- redact: "$.text"
Expand Down
10 changes: 0 additions & 10 deletions java/core/src/main/java/co/worklytics/psoxy/rules/Validator.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,16 +61,6 @@ public void validate(@NonNull RecordRules rules) {
public void validate(@NonNull MultiTypeBulkDataRules rules) {
Preconditions.checkArgument(rules.getFileRules().size() > 0, "Must have at least one file rule");

List<String> templatesNotPrefixedWithSlash = rules.getFileRules().keySet().stream()
.filter(k -> !k.startsWith("/"))
.collect(Collectors.toList());

//not invalid per se, but likely to be a mistake
if (!templatesNotPrefixedWithSlash.isEmpty()) {
log.warning("The following path templates do not start with '/', so likely to be wrong:\n " + templatesNotPrefixedWithSlash.stream().collect(Collectors.joining("\n")));
}


rules.getFileRules().values().forEach(this::validate);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import com.avaulta.gateway.rules.PathTemplateUtils.Match;
import com.avaulta.gateway.rules.RuleSet;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import lombok.*;
import lombok.extern.java.Log;
import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -122,15 +123,23 @@ public StorageEventResponse handle(StorageEventRequest request, RuleSet rules) {



/**
*
* @param reader from source, may be null if not yet known
* @param writer to destination, may be null if not yet known
* @param sourceBucketName bucket name
* @param sourceObjectPath a path (object key) within the bucket; no leading `/` expected
* @param transform
* @return
*/
public StorageEventRequest buildRequest(Reader reader,
Writer writer,
String sourceBucketName,
String sourceObjectPath,
ObjectTransform transform) {

String sourceObjectPathWithinBase =
config.getConfigPropertyAsOptional(BulkModeConfigProperty.INPUT_BASE_PATH)
.filter(inputBasePath -> StringUtils.isNotBlank(inputBasePath))
inputBasePath()
.map(inputBasePath -> sourceObjectPath.replace(inputBasePath, ""))
.orElse(sourceObjectPath);

Expand Down Expand Up @@ -168,7 +177,7 @@ public Map<String, String> buildObjectMetadata(String sourceBucket, String sourc
return Map.of(
BulkMetaData.INSTANCE_ID.getMetaDataKey(), hostEnvironment.getInstanceId(),
BulkMetaData.VERSION.getMetaDataKey(), config.getConfigPropertyAsOptional(ProxyConfigProperty.BUNDLE_FILENAME).orElse("unknown"),
BulkMetaData.ORIGINAL_OBJECT_KEY.getMetaDataKey(), sourceBucket + "/" + sourceKey
BulkMetaData.ORIGINAL_OBJECT_KEY.getMetaDataKey(), sourceBucket + sourceKey
);
}

Expand Down Expand Up @@ -223,7 +232,7 @@ public static class ObjectTransform implements Serializable {
ObjectTransform buildDefaultTransform() {
return ObjectTransform.builder()
.destinationBucketName(config.getConfigPropertyOrError(BulkModeConfigProperty.OUTPUT_BUCKET))
.pathWithinBucket(config.getConfigPropertyAsOptional(BulkModeConfigProperty.OUTPUT_BASE_PATH).orElse(""))
.pathWithinBucket(outputBasePath().orElse(""))
.rules(defaultRuleSet)
.build();
}
Expand Down Expand Up @@ -267,4 +276,21 @@ public Optional<BulkDataRules> getApplicableRules(RuleSet rules, String sourceOb
}


/**
* helper to parse inputBasePath from config, if present;
* left here to support potential logic change around presumption of leading `/`
* @return
*/
Optional<String> inputBasePath() {
return config.getConfigPropertyAsOptional(BulkModeConfigProperty.INPUT_BASE_PATH)
.filter(inputBasePath -> StringUtils.isNotBlank(inputBasePath));
//.map(inputBasePath -> inputBasePath.startsWith("/") ? inputBasePath : "/" + inputBasePath);
}

Optional<String> outputBasePath() {
return config.getConfigPropertyAsOptional(BulkModeConfigProperty.OUTPUT_BASE_PATH)
.filter(outputBasePath -> StringUtils.isNotBlank(outputBasePath));
//.map(outputBasePath -> outputBasePath.startsWith("/") ? outputBasePath : "/" + outputBasePath);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,9 @@ StorageEventRequest request(
.sourceBucketName("bucket")
.sourceObjectPath(sourceObjectPath)
.sourceReader(reader)
.destinationWriter(writer)
.destinationBucketName("bucket")
.destinationObjectPath(sourceObjectPath)
.destinationWriter(writer)
.build();
}

Expand All @@ -133,7 +133,7 @@ StorageEventRequest request(
@ParameterizedTest
public void files(String rulesPath, String file) {
setUp(rulesPath);
final String objectPath = "/export-20231128/" + file + ".gz";
final String objectPath = "export-20231128/" + file + ".gz";
final String pathToOriginal = "sources/slack/example-bulk/original/" + file;
final String pathToSanitized = "sources/slack/example-bulk/sanitized/" + file;
storageHandler.handle(request(objectPath, pathToOriginal), rules);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,28 +28,31 @@
public class MultiTypeBulkDataRules implements BulkDataRules {

/**
* map of file path templates to rules for matching files, where "path template" has the same
* interpretation as in OpenAPI 3.0.0
* map of file path (really object key) templates to rules for matching files (object),
* where "path template" has the same interpretation as in OpenAPI 3.0.0, but matching
* against the object key (file path) rather than the URL path
*
* in particular, leading `/` is not optional; so in practice, all will begin with `/`, to
* conform with OpenAPI 3.0.0 semantics, which always explicitly include it.
* as S3/GCS/etc implement file system abstraction only as a convention, object key for matching
* will not begin with `/`
*
* see: https://swagger.io/specification/ , section "Path Templating"
*
* @see PathTemplateUtils for more details on interpretation
*
*
* eg, /export/{week}/data_{shard}.csv -> ColumnarRules
* eg, export/{week}/data_{shard}.csv -> ColumnarRules
*
* if provided, has the effect of pathRegex = "^/export/[^/]+/data_[^/]+\.csv$"
* if provided, has the effect of pathRegex = "^export/[^/]+/data_[^/]+\.csv$"
*
* files that trigger proxy instance but match NONE of the templates will not be processed
*
* NOTE:
* NOTE: INPUT_BASE_PATH, if any, is trimmed from the beginning of the path before matching
* see {@link BulkModeConfigProperty#INPUT_BASE_PATH}
*
* q: what if file matches multiple path templates? pick lexicographically first?
*
* q: flatten somehow, so no indentation at the top-level?
*
*/
Map<String, BulkDataRules> fileRules;

Expand Down

0 comments on commit cb37cd8

Please sign in to comment.