Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix dhall-to-nix encoding of symbols with special keys #2426

Merged
164 changes: 140 additions & 24 deletions dhall-nix/src/Dhall/Nix.hs
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,13 @@ import Data.Fix (Fix (..))
import Data.Foldable (toList)
import Data.List.NonEmpty (NonEmpty(..))
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Traversable (for)
import Data.Typeable (Typeable)
import Data.Void (Void, absurd)
import Lens.Family (toListOf)
import Numeric (showHex)
import Data.Char (ord, isDigit, isAsciiLower, isAsciiUpper)

import Dhall.Core
( Binding (..)
Expand Down Expand Up @@ -226,7 +229,7 @@ Nix
$_ERROR: Cannot project by type

The ❰dhall-to-nix❱ compiler does not support projecting out a subset of a record
by the expected type (i.e. ❰someRecord.(someType)❱
by the expected type (i.e. ❰someRecord.(someType)❱
|]

show CannotShowConstructor =
Expand Down Expand Up @@ -255,7 +258,7 @@ instance Exception CompileError
{-| Convert a Dhall expression to the equivalent Nix expression

>>> :set -XOverloadedStrings
>>> dhallToNix (Lam "x" Natural (Lam "y" Natural (NaturalPlus "x" "y")))
>>> dhallToNix (Lam "x" Natural (Lam "y" Natural (NaturalPlus "x" "y")))
Right (NAbs (Param "x") (NAbs (Param "y") (NBinary NPlus (NSym "x") (NSym "y"))))
>>> fmap Nix.Pretty.prettyNix it
Right x: y: x + y
Expand Down Expand Up @@ -336,21 +339,17 @@ dhallToNix e =
Dhall.Optics.rewriteOf Dhall.Core.subExpressions renameShadowed

loop (Const _) = return untranslatable
loop (Var (V a 0)) = return (Nix.mkSym a)
loop (Var (V a 0)) = return (Nix.mkSym (zEncodeSymbol a))
loop (Var a ) = Left (CannotReferenceShadowedVariable a)
loop (Lam _ FunctionBinding { functionBindingVariable = a } c) = do
c' <- loop c
return (Param (VarName a) ==> c')
return (Param (VarName $ zEncodeSymbol a) ==> c')
loop (Pi _ _ _ _) = return untranslatable
loop (App None _) =
return Nix.mkNull
loop (App (Field (Union kts) (Dhall.Core.fieldSelectionLabel -> k)) v) = do
loop (App (Field (Union _kts) (Dhall.Core.fieldSelectionLabel -> k)) v) = do
v' <- loop v
let e0 = do
k' <- Dhall.Map.keys kts
return (k', Nothing)
let e2 = Nix.mkSym k @@ v'
return (Nix.mkParamset e0 False ==> e2)
return (unionChoice (VarName k) (Just v'))
loop (App a b) = do
a' <- loop a
b' <- loop b
Expand All @@ -359,7 +358,7 @@ dhallToNix e =
let MultiLet bindings b = Dhall.Core.multiLet a0 b0
bindings' <- for bindings $ \Binding{ variable, value } -> do
value' <- loop value
pure (variable, value')
pure (zEncodeSymbol variable, value')
b' <- loop b
return (Nix.letsE (toList bindings') b')
loop (Annot a _) = loop a
Expand Down Expand Up @@ -626,7 +625,7 @@ dhallToNix e =
-- see https://github.com/dhall-lang/dhall-haskell/issues/2414
nixAttrs pairs =
Fix $ NSet NonRecursive $
(\(key, val) -> NamedVar (DynamicKey (Plain (DoubleQuoted [Plain key])) :| []) val Nix.nullPos)
(\(key, val) -> NamedVar ((mkDoubleQuotedIfNecessary (VarName key)) :| []) val Nix.nullPos)
<$> pairs
loop (Union _) = return untranslatable
loop (Combine _ _ a b) = do
Expand Down Expand Up @@ -715,20 +714,11 @@ dhallToNix e =
-- (here "x").
--
-- This translates `< Foo : T >.Foo` to `x: { Foo }: Foo x`
Just (Just _) -> do
let e0 = do
k' <- Dhall.Map.keys kts
return (k', Nothing)
return ("x" ==> Nix.mkParamset e0 False ==> (Nix.mkSym k @@ "x"))

_ -> do
let e0 = do
k' <- Dhall.Map.keys kts
return (k', Nothing)
return (Nix.mkParamset e0 False ==> Nix.mkSym k)
Just (Just _) -> return ("x" ==> (unionChoice (VarName k) (Just "x")))
_ -> return (unionChoice (VarName k) Nothing)
loop (Field a (Dhall.Core.fieldSelectionLabel -> b)) = do
a' <- loop a
return (a' @. b)
return (Fix (Nix.NSelect Nothing a' (mkDoubleQuotedIfNecessary (VarName b) :| [])))
loop (Project a (Left b)) = do
a' <- loop a
return (Nix.mkNonRecSet [ Nix.inheritFrom a' (fmap VarName b) ])
Expand Down Expand Up @@ -759,3 +749,129 @@ dhallToNix e =
loop (ImportAlt a _) = loop a
loop (Note _ b) = loop b
loop (Embed x) = absurd x

-- | Previously we turned @<Foo | Bar>.Foo@ into @{ Foo, Bar }: Foo@,
-- but this would not work with <Frob/Baz>.Frob/Baz (cause the slash is not a valid symbol char in nix)
-- so we generate @union: union."Frob/Baz"@ instead.
--
-- If passArgument is @Just@, pass the argument to the union selector.
unionChoice :: VarName -> Maybe NExpr -> NExpr
unionChoice chosenKey passArgument =
let selector = Fix (Nix.NSelect Nothing (Nix.mkSym "u") (mkDoubleQuotedIfNecessary chosenKey :| []))
in Nix.Param "u" ==>
case passArgument of
Nothing -> selector
Just arg -> selector @@ arg


-- | Double-quote a field name (record or union). This makes sure it’s recognized as a valid name by nix, e.g. in
--
-- @{ "foo/bar" = 42; }."foo/bar" }@
--
-- where
--
-- @{ foo/bar = 42; }.foo/bar@ is not syntactically valid nix.
--
-- This is only done if necessary (where “necessary” is not super defined right now).
mkDoubleQuotedIfNecessary :: VarName -> NKeyName r
mkDoubleQuotedIfNecessary key@(VarName keyName) =
if Text.all simpleChar keyName
then StaticKey key
else DynamicKey (Plain (DoubleQuoted [Plain keyName]))
where
simpleChar c = isAsciiLower c || isAsciiUpper c
Copy link
Collaborator

@Gabriella439 Gabriella439 Dec 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor suggestion: I think this makes the intention more clear

Suggested change
simpleChar c = isAsciiLower c || isAsciiUpper c
simpleChar c = isAscii c && isLetter c

This would also require a matching change to the imports

Edit: Fixed to use && instead of ||

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I … don’t think all ascii values are a valid nix symbol. It includes stuff like 0x04 (Ctrl+c); and I’d have to verify if they are allowed to start with a digit

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example

nix-repl> { 1 = "foo"; }
error: syntax error, unexpected INT

       at «string»:1:3:

            1| { 1 = "foo"; }
             |   ^
            2|

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops, I meant to say isAscii c && isLetter c

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But isAscii still includes all control characters for example

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i.e.

-- | Selects the first 128 characters of the Unicode character set,
-- corresponding to the ASCII character set.
isAscii :: Char -> Bool
isAscii c = c < '\x80'

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please read my responses more closely

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, using && … at least to me filtering out by boolean and is not more intuitive than set logic.

Copy link
Collaborator

@Gabriella439 Gabriella439 Dec 28, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main reason I suggested using isLetter is that it seems more direct than specifying that something is a letter by saying that it is uppercase or lowercase. What you've written reads to me like:

simpleChar c = (isAscii c && isUpper c) || (isAscii c && isLower c)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Anyway, coming back to this, apart from the aesthetics, isLetter needs support for unicode ICU (it’s not just a simple ord check):

isLetter :: Char -> Bool
isLetter c = case generalCategory c of
 GeneralCategory
 UppercaseLetter -> True
 LowercaseLetter -> True
 TitlecaseLetter -> True
 ModifierLetter -> True
 OtherLetter -> True
 _ -> False

generalCategory :: Char -> GeneralCategory
generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c

foreign import ccall unsafe "u_gencat"
 wgencat :: Int -> Int

vs

isAsciiLower :: Char -> Bool
isAsciiLower c = c >= 'a' && c <= 'z'



-- | Nix does not support symbols like @foo/bar@, but they are allowed in dhall.
-- So if they happen, we need to encode them with an ASCII escaping scheme.
--
-- This is copied/inspired by the Z-Encoding scheme from GHC, see
-- https://hackage.haskell.org/package/zenc-0.1.2/docs/Text-Encoding-Z.html
--
-- Original Source is BSD-3-Clause, Copyright (c)2011, Jason Dagit
zEncodeSymbol :: Text -> Text
zEncodeSymbol = zEncodeString

-- | The basic encoding scheme is this:

-- * Alphabetic characters (upper and lower) and digits
-- all translate to themselves;
-- except 'Z', which translates to 'ZZ'
-- and 'z', which translates to 'zz'
--
-- * Most other printable characters translate to 'zx' or 'Zx' for some
-- alphabetic character x
--
-- * The others translate as 'znnnU' where 'nnn' is the decimal number
-- of the character
--
-- @
-- Before After
-- --------------------------
-- Trak Trak
-- foo-wib foozmwib
-- \> zg
-- \>1 zg1
-- foo\# foozh
-- foo\#\# foozhzh
-- foo\#\#1 foozhzh1
-- fooZ fooZZ
-- :+ ZCzp
-- @
Comment on lines +808 to +820
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you begin each line with > then you don't need to escape special characters like \#

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the change I'm referring to:

Suggested change
-- @
-- Before After
-- --------------------------
-- Trak Trak
-- foo-wib foozmwib
-- \> zg
-- \>1 zg1
-- foo\# foozh
-- foo\#\# foozhzh
-- foo\#\#1 foozhzh1
-- fooZ fooZZ
-- :+ ZCzp
-- @
-- > Before After
-- > --------------------------
-- > Trak Trak
-- > foo-wib foozmwib
-- > > zg
-- > >1 zg1
-- > foo# foozh
-- > foo## foozhzh
-- > foo##1 foozhzh1
-- > fooZ fooZZ
-- > :+ ZCzp

zEncodeString :: Text -> Text
zEncodeString cs = case Text.uncons cs of
Nothing -> Text.empty
Just (c, cs') ->
encodeDigitChar c
<> Text.concatMap encodeChar cs'

-- | Whether the given characters needs to be z-encoded.
needsEncoding :: Char -> Bool
needsEncoding 'Z' = True
needsEncoding 'z' = True
needsEncoding c = not
( isAsciiLower c
|| isAsciiUpper c
Comment on lines +833 to +834
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps reuse the simpleChar function here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, I’d have to think about whether they are semantically the same.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, thinking about it they are not the same, since we either double quote where necessary (using the simpleChar filter), or z-encode where necessary (internal symbols that cannot be quoted). In particular, z-encoded values should never land inside a struct for example.

I think this made me notice a place where I’d forgotten the encoding, namely Project.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I mean is that right now simpleChar c is defined as isAsciiLower c || isAsciiUpper c, so it seems like you should be able to replace isAsciiLower c || isAsciiUpper c right here with simpleChar c

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still feel like you can reuse the simpleChar function here:

needsEncoding c = not (simpleChar c)

|| isDigit c )

-- If a digit is at the start of a symbol then we need to encode it.
encodeDigitChar :: Char -> Text
encodeDigitChar c | isDigit c = encodeAsUnicodeChar c
encodeDigitChar c = encodeChar c

encodeChar :: Char -> Text
encodeChar c | not (needsEncoding c) = [c] -- Common case first

encodeChar '(' = "ZL"
encodeChar ')' = "ZR"
encodeChar '[' = "ZM"
encodeChar ']' = "ZN"
encodeChar ':' = "ZC"
encodeChar 'Z' = "ZZ"
encodeChar 'z' = "zz"
encodeChar '&' = "za"
encodeChar '|' = "zb"
encodeChar '^' = "zc"
encodeChar '$' = "zd"
encodeChar '=' = "ze"
encodeChar '>' = "zg"
encodeChar '#' = "zh"
encodeChar '.' = "zi"
encodeChar '<' = "zl"
-- we can’t allow @-@, because it is not valid at the start of a symbol
encodeChar '-' = "zm"
encodeChar '!' = "zn"
encodeChar '+' = "zp"
encodeChar '\'' = "zq"
encodeChar '\\' = "zr"
encodeChar '/' = "zs"
encodeChar '*' = "zt"
-- We can allow @_@ because it can appear anywhere in a symbol
-- encodeChar '_' = "zu"
encodeChar '%' = "zv"
encodeChar c = encodeAsUnicodeChar c

encodeAsUnicodeChar :: Char -> Text
encodeAsUnicodeChar c = 'z' `Text.cons` if isDigit (Text.head hex_str) then hex_str
else '0' `Text.cons` hex_str
where hex_str = Text.pack $ showHex (ord c) "U"
Loading