From 7ecc7095f923145a9fc931a595aaddbdbc177440 Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Thu, 10 Aug 2023 12:17:45 -0400 Subject: [PATCH] GH-37096: [MATLAB] Add utility which makes valid MATLAB table variable names from an arbitrary list of strings (#37098) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change To make it possible to safely convert Arrow Schema field names to corresponding MATLAB `table` variable names, it would be helpful to add a utility which can take an arbitrary list of strings and return a set of valid MATLAB `table` variable names, which are (1) unique, (2) non-empty, and (3) do not conflict with the "reserved" variable names "Properties", "VariableNames", "RowNames", and ":". An additional restriction is that variable names must have 63 or fewer characters. ### What changes are included in this PR? 1. Added a new function called `arrow.tabular.internal.makeValidVariableNames` that accepts an arbitrary list of strings and returns valid MATLAB `table` variable names. ```matlab >> originalVarNames = ["", "Properties", ":", "ValidVar", "ValidVar"]; >> validVarNames = arrow.tabular.internal.makeValidVariableNames(originalVarNames) validVarNames = 1×5 string array "Var1" "Properties_1" ":_1" "ValidVar" "ValidVar_1" ``` 2. Added a new function called `arrow.tabular.internal.makeValidDimensionNames` that returns valid table dimension names with respect to a list of valid variable names. In MATLAB, the default `table` dimension names are `"Row"` and `"Variables"`, but they must not conflict with any variable names. In other words, they must be unique with respect to the variable names. 
```matlab >> validVarNames = ["Row" "Test" "Variables"]; >> validDimNames = arrow.tabular.internal.makeValidDimensionNames(validVarNames) validDimNames = 1×2 string array "Row_1" "Variables_1" ``` To summarize, MATLAB `table`s cannot have arbitrary variable names. For example, `"Properties"`, `"RowNames"`, `"VariableNames"`, and `":"` are all disallowed. Variable names must also be between 1 and 63 characters in length and must be unique with respect to each other. ### Are these changes tested? Yes. Added the following new test classes: 1. `tMakeValidVariableNames.m` 2. `tMakeValidDimensionNames.m` ### Are there any user-facing changes? No. ### Future Directions 1. In a follow-up PR, we will integrate `makeValidVariableNames` and `makeValidDimensionNames` into the `table()` and `toMATLAB()` methods of `arrow.tabular.RecordBatch`. ### Notes Thanks to @ kevingurney for help writing the test cases! * Closes: #37096 Lead-authored-by: Sarah Gilmore Co-authored-by: Kevin Gurney Signed-off-by: Kevin Gurney --- .../+internal/makeValidDimensionNames.m | 28 ++ .../+internal/makeValidVariableNames.m | 36 +++ .../arrow/tabular/tMakeValidDimensionNames.m | 81 ++++++ .../arrow/tabular/tMakeValidVariableNames.m | 240 ++++++++++++++++++ 4 files changed, 385 insertions(+) create mode 100644 matlab/src/matlab/+arrow/+tabular/+internal/makeValidDimensionNames.m create mode 100644 matlab/src/matlab/+arrow/+tabular/+internal/makeValidVariableNames.m create mode 100644 matlab/test/arrow/tabular/tMakeValidDimensionNames.m create mode 100644 matlab/test/arrow/tabular/tMakeValidVariableNames.m diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/makeValidDimensionNames.m b/matlab/src/matlab/+arrow/+tabular/+internal/makeValidDimensionNames.m new file mode 100644 index 0000000000000..88f7b10806212 --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/+internal/makeValidDimensionNames.m @@ -0,0 +1,28 @@ +%MAKEVALIDDIMENSIONNAMES Makes valid table dimension names 
with +% respect to the variable names. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +function dimnames = makeValidDimensionNames(varnames) % Returns a 1x2 string array of dimension names that do not collide with VARNAMES + + dimnames = ["Row" "Variables"]; % default dimension names, used as-is when no variable has the same name + + numvars = numel(varnames); + indicesToUniqify = [numvars + 1 numvars + 2]; % positions of the two dimension names once appended after VARNAMES + + strs = matlab.lang.makeUniqueStrings([varnames dimnames], indicesToUniqify); % restrict renaming to the appended dimension names + dimnames = strs(indicesToUniqify); % keep only the (possibly suffixed) dimension names +end + diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/makeValidVariableNames.m b/matlab/src/matlab/+arrow/+tabular/+internal/makeValidVariableNames.m new file mode 100644 index 0000000000000..b2abdc55aee8b --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/+internal/makeValidVariableNames.m @@ -0,0 +1,36 @@ +%MAKEVALIDVARIABLENAMES Makes valid table variable names. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +function [varnames, modified] = makeValidVariableNames(varnames) % MODIFIED is true if any name had to be changed + arguments + varnames(1, :) string % validated as a string row vector + end + + reservedNames = ["Properties", "VariableNames", "RowNames", ":"]; % not allowed as table variable names + + [varnames, replacedVars] = replaceEmptyVariableNames(varnames); % "" becomes "Var<position>" + [varnames, madeUnique] = matlab.lang.makeUniqueStrings(varnames, reservedNames, 63); % avoid reserved and duplicate names; 63 is the maximum name length + + modified = replacedVars || any(madeUnique); +end + +function [varnames, modified] = replaceEmptyVariableNames(varnames) % Replaces each empty name with "Var<i>", i being its position in VARNAMES + emptyIndices = find(varnames == ""); + modified = any(emptyIndices); % indices are nonzero, so true whenever an empty name was found + if modified + varnames(emptyIndices) = compose("Var%d", emptyIndices); + end +end diff --git a/matlab/test/arrow/tabular/tMakeValidDimensionNames.m b/matlab/test/arrow/tabular/tMakeValidDimensionNames.m new file mode 100644 index 0000000000000..3e80fec91bc8d --- /dev/null +++ b/matlab/test/arrow/tabular/tMakeValidDimensionNames.m @@ -0,0 +1,81 @@ +%TMAKEVALIDDIMENSIONNAMES Unit tests for +% arrow.tabular.internal.makeValidDimensionNames. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tMakeValidDimensionNames < matlab.unittest.TestCase + + methods(Test) + + function VariableNamedRow(testCase) + % Verify the default dimension name "Row" is replaced with "Row_1" + % if one of the variables is named "Row". + import arrow.tabular.internal.* + + varnames = ["Row" "Var2"]; + dimnames = makeValidDimensionNames(varnames); + testCase.verifyEqual(dimnames, ["Row_1", "Variables"]); + end + + function VariableNamedVariables(testCase) + % Verify the default dimension name "Variables" is replaced with + % "Variables_1" if one of the variables is named "Variables". + import arrow.tabular.internal.* + + varnames = ["Var1" "Variables"]; + dimnames = makeValidDimensionNames(varnames); + testCase.verifyEqual(dimnames, ["Row", "Variables_1"]); + end + + function VariablesWithConflictingNumericSuffix(testCase) + % Verify that conflicting numeric suffixes (e.g. "Variables" + % and "Variables_1") are resolved with an unused suffix ("Variables_2"). + + import arrow.tabular.internal.* + + varnames = ["A" "Variables_1" "Variables"]; + dimnames = makeValidDimensionNames(varnames); + testCase.verifyEqual(dimnames, ["Row", "Variables_2"]); + end + + function RowWithConflictingNumericSuffix(testCase) + % Verify that conflicting numeric suffixes (e.g. "Row" + % and "Row_1") are resolved with an unused suffix ("Row_2"). 
+ + import arrow.tabular.internal.* + + varnames = ["Row_1" "Row" "Row_3" "Test"]; + dimnames = makeValidDimensionNames(varnames); + testCase.verifyEqual(dimnames, ["Row_2", "Variables"]); + end + + function DefaultDimensionNamesOk(testCase) + % Verify the dimension names are set to the default values + % ("Row" and "Variables") if they are not one of the variable + % names. The lowercase names "row" and "variables" do not conflict. + + import arrow.tabular.internal.* + + varnames = ["row" "variables"]; + dimnames = makeValidDimensionNames(varnames); + testCase.verifyEqual(dimnames, ["Row", "Variables"]); + + varnames = ["A" "B" "C"]; + dimnames = makeValidDimensionNames(varnames); + testCase.verifyEqual(dimnames, ["Row", "Variables"]); + end + end +end \ No newline at end of file diff --git a/matlab/test/arrow/tabular/tMakeValidVariableNames.m b/matlab/test/arrow/tabular/tMakeValidVariableNames.m new file mode 100644 index 0000000000000..53b4ea67fb4b1 --- /dev/null +++ b/matlab/test/arrow/tabular/tMakeValidVariableNames.m @@ -0,0 +1,240 @@ +%TMAKEVALIDVARIABLENAMES Unit tests for +% arrow.tabular.internal.makeValidVariableNames. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+classdef tMakeValidVariableNames < matlab.unittest.TestCase + + methods(Test) + + function Colon(testCase) + % Verify that ":" becomes ":_1". + import arrow.tabular.internal.* + + original = ":"; + expected = ":_1"; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + end + + function RowNames(testCase) + % Verify that "RowNames" becomes "RowNames_1". + import arrow.tabular.internal.* + + original = "RowNames"; + expected = "RowNames_1"; + [actual, modified] = makeValidVariableNames(original); + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + end + + function Properties(testCase) + % Verify that "Properties" becomes "Properties_1". + import arrow.tabular.internal.* + + original = "Properties"; + expected = "Properties_1"; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + end + + function VariableNames(testCase) + % Verify that "VariableNames" becomes "VariableNames_1". + import arrow.tabular.internal.* + + original = "VariableNames"; + expected = "VariableNames_1"; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + end + + function ValidVariableNames(testCase) + % Verify that when all of the input strings + % are valid table variable names, that none of them + % are modified. 
+ import arrow.tabular.internal.* + + original = ["A", "B", "C"]; + expected = original; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyFalse(modified); + end + + function ValidVariableNamesUnicode(testCase) + % Verify that when all of the input strings are valid Unicode + % table variable names, that none of them are modified. 
+ import arrow.tabular.internal.* + + smiley = "😀"; + tree = "🌲"; + mango = "🥭"; + + original = [smiley, tree, mango]; + expected = original; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyFalse(modified); + end + + function PropertiesWithConflictingNumericSuffix(testCase) + % Verify that conflicting numeric suffixes (e.g. "Properties" + % and "Properties_1") are resolved as expected. + import arrow.tabular.internal.* + + original = ["Properties", "Properties_1"]; + expected = ["Properties_2", "Properties_1"]; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + + original = ["Properties_1", "Properties", "Properties_4"]; + expected = ["Properties_1", "Properties_2", "Properties_4"]; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + end + + function RowNamesWithConflictingNumericSuffix(testCase) + % Verify that conflicting numeric suffixes (e.g. "RowNames" + % and "RowNames_1") are resolved as expected. + import arrow.tabular.internal.* + + original = ["RowNames", "RowNames_1"]; + expected = ["RowNames_2", "RowNames_1"]; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + + original = ["RowNames_1", "RowNames", "RowNames_4"]; + expected = ["RowNames_1", "RowNames_2", "RowNames_4"]; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + end + + function VariableNamesWithConflictingNumericSuffix(testCase) + % Verify that conflicting numeric suffixes (e.g. "VariableNames" + % and "VariableNames_1") are resolved as expected. 
+ import arrow.tabular.internal.* + + original = ["VariableNames", "VariableNames_1"]; + expected = ["VariableNames_2", "VariableNames_1"]; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + + original = ["VariableNames_1", "VariableNames", "VariableNames_4"]; + expected = ["VariableNames_1", "VariableNames_2", "VariableNames_4"]; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + end + + function ColonWithConflictingSuffix(testCase) + % Verify that conflicting suffixes (e.g. ":" + % and ":_1") are resolved as expected. + import arrow.tabular.internal.* + + original = [":", ":_1"]; + expected = [":_2", ":_1"]; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + + original = [":_1", ":", ":_4"]; + expected = [":_1", ":_2", ":_4"]; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + end + + function EmptyStrings(testCase) + % Verify that empty strings are mapped to Var1, ..., Vari, ..., + % VarN as expected and that conflicting names are resolved as + % expected. 
+ import arrow.tabular.internal.* + + original = ""; + expected = "Var1"; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + + original = ["", "Var1", ""]; + expected = ["Var1", "Var1_1", "Var3"]; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + + original = ["", "Var1", "Var1_1"]; + expected = ["Var1", "Var1_2", "Var1_1"]; + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + end + + function VariableNameLengthMax(testCase) + % Verify strings whose character length exceeds 63 + % are truncated to the max variable name length (63). + import arrow.tabular.internal.* + + original = string(repmat('a', [1 64])); + expected = extractBefore(original, 64); + + [actual, modified] = makeValidVariableNames(original); + + testCase.verifyEqual(actual, expected); + testCase.verifyTrue(modified); + end + + end + +end \ No newline at end of file