Port Duckling's Document offset tables from UTF-16 to UTF-8 (text >= 2).

* Duckling/Types.hs: add an Eq instance for Dimension, defined via hash
  (presumably needed because newer hashable makes Eq a superclass of
  Hashable; confirm against the hashable changelog).
* Duckling/Types/Document.hs: replace tDropToUtf16Drop with tDropToUtf8Drop,
  drop utf16CharWidth, and slice rawInput with takeWord8/dropWord8.
* duckling.cabal: allow hashable < 1.5 and require text >= 2 && < 2.2.

NOTE(review): context-line indentation below was reconstructed; verify it
against the target tree before applying.

diff --git a/Duckling/Types.hs b/Duckling/Types.hs
index 5300acd4e..00a183bfd 100644
--- a/Duckling/Types.hs
+++ b/Duckling/Types.hs
@@ -131,6 +131,12 @@ data Dimension a where
   Volume :: Dimension VolumeData
   CustomDimension :: CustomDimension a => a -> Dimension (DimensionData a)
 
+-- Eq
+-- NOTE(review): compares hashes only, so two distinct dimensions whose
+-- hashes collide would compare equal; confirm this is acceptable.
+instance Eq (Dimension a) where
+  (==) a b = hash a == hash b
+
 -- Show
 instance Show (Dimension a) where
   show RegexMatch = "RegexMatch"
diff --git a/Duckling/Types/Document.hs b/Duckling/Types/Document.hs
index 5d223549f..50303883f 100644
--- a/Duckling/Types/Document.hs
+++ b/Duckling/Types/Document.hs
@@ -56,8 +56,8 @@ data Document = Document
     -- for "żółty" :: Document
     -- tDropToBSDrop = [0,2,4,6,7,8]
     -- bsDropToTDrop = [0,1,1,2,2,3,3,4,5]
-    -- tDropToUtf16Drop = [0,1,2,3,4,5]
-  , tDropToUtf16Drop :: UArray Int Int
-    -- translate Text.drop to Data.Text.Unsafe.dropWord16
+    -- tDropToUtf8Drop = [0,2,4,6,7,8]
+  , tDropToUtf8Drop :: UArray Int Int
+    -- translate Text.drop to Data.Text.Unsafe.dropWord8
   } deriving (Show)
 
@@ -97,8 +97,8 @@ fromText rawInput = Document{..}
     | otherwise = (ix, ix:acc)
   tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
   tDropToBSDrop = Array.listArray (0, rawInputLength) tDropToBSDropList
-  tDropToUtf16Drop = Array.listArray (0, rawInputLength) $
-    scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
+  tDropToUtf8Drop = Array.listArray (0, rawInputLength) $
+    scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
   bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded) $
     reverse $ snd $ foldl' fun (-1, []) $ zip [0..] tDropToBSDropList
   fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem - lastPos) ix ++ acc)
@@ -109,11 +109,6 @@ fromText rawInput = Document{..}
     | otherwise = 4
     where
       w = UText.ord c
-  utf16CharWidth c
-    | w < 0x10000 = 1
-    | otherwise = 2
-    where
-      w = UText.ord c
 
 data CharClass
   = Alpha
@@ -262,7 +257,7 @@ byteStringFromPos
            , utf8Encoded = utf8Encoded
            , tDropToBSDrop = tDropToBSDrop
            , bsDropToTDrop = bsDropToTDrop
-           , tDropToUtf16Drop = tDropToUtf16Drop
+           , tDropToUtf8Drop = tDropToUtf8Drop
            }
   position = (substring, rangeToText, translateRange)
   where
@@ -271,16 +266,15 @@ byteStringFromPos
   utf8Position = tDropToBSDrop Array.! position
   substring :: ByteString
   substring = BS.drop utf8Position utf8Encoded
-  -- get a subrange of Text reusing the underlying buffer using
-  -- utf16 start and end positions
+  -- get a subrange of Text reusing the underlying buffer
   rangeToText :: (Int, Int) -> Text
   rangeToText (-1, _) = ""
   -- this is what regexec from Text.Regex.PCRE.ByteString does
-  rangeToText r = UText.takeWord16 (end16Pos - start16Pos) $
-    UText.dropWord16 start16Pos rawInput
+  rangeToText r = UText.takeWord8 (end8Pos - start8Pos) $
+    UText.dropWord8 start8Pos rawInput
     where
-    start16Pos = tDropToUtf16Drop Array.! startPos
-    end16Pos = tDropToUtf16Drop Array.! endPos
+    start8Pos = tDropToUtf8Drop Array.! startPos
+    end8Pos = tDropToUtf8Drop Array.! endPos
     (startPos, endPos) = uncurry translateRange r
   -- from utf8 offset and length to Text character start and end position
   translateRange :: Int -> Int -> (Int, Int)
diff --git a/duckling.cabal b/duckling.cabal
index ca1f25b30..825745a89 100644
--- a/duckling.cabal
+++ b/duckling.cabal
@@ -885,9 +885,9 @@ library
                      , deepseq               >= 1.4.1.1  && < 1.5
                      , dependent-sum         >= 0.3.2.2  && < 0.8
                      , extra                 >= 1.4.10   && < 1.8
-                     , hashable              >= 1.2.4.0  && < 1.4
+                     , hashable              >= 1.2.4.0  && < 1.5
                      , regex-base            >= 0.93.2   && < 0.95
-                     , text                  >= 1.2.2.1  && < 1.3
+                     , text                  >= 2        && < 2.2
                      , text-show             >= 2.1.2    && < 3.10
                      , time                  >= 1.5.0.1  && < 2
                      , timezone-series       >= 0.1.5.1  && < 0.2